コード例 #1
0
ファイル: findHost.py プロジェクト: MJKIM700/CS337_P1
def tweetData(year):
    """Return the ``tweets`` attribute of ``pull_text.Data(year)``."""
    return pull_text.Data(year).tweets
コード例 #2
0
def get_nominees(year):
    """Guess the nominees of every official award from the tweets of ``year``.

    The function works in three passes:

    1. Keep only tweets whose text fuzzily matches a nomination key word,
       then attach each such tweet (via ``handle_high_score``) to the
       award(s) whose name scores highest against the tweet, provided that
       score is above 50.
    2. For every award that collected tweets, build a bucket of candidate
       strings with ``make_string_combinations`` from the tokens to the
       left and/or right of the nomination key word. Awards that collected
       nothing are paired with the most similarly named other award.
    3. Rank each bucket by frequency and keep the most common candidates.

    Args:
        year: Year handed to ``pull_text.Data``; 2013 and 2015 select
            ``OFFICIAL_AWARDS_1``, any other value ``OFFICIAL_AWARDS_2``.

    Returns:
        dict mapping every award name to a list of candidate nominee
        strings (possibly empty).
    """
    key_words = [
        'nominated', 'nominate', 'nominee', 'nominates', 'nominating',
        'nominations', 'nomination', 'nom', 'noms', 'nommed'
    ]
    tweets = pull_text.Data(year).tweets
    if year in (2013, 2015):
        awards = OFFICIAL_AWARDS_1.copy()
    else:
        awards = OFFICIAL_AWARDS_2.copy()
    award_candidate_sents = dict()

    # Pass 1: route nomination-related tweets to their best matching award(s).
    for tweet in tweets:
        try:
            mentions_nomination = any(
                fuzz.partial_token_sort_ratio(word, tweet.text) > 90
                for word in key_words)
        except Exception:
            # Best effort: a tweet whose text cannot be scored is skipped.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            continue
        if not mentions_nomination:
            continue

        scores = [
            fuzz.token_set_ratio(tweet.text_unchanged, award)
            for award in awards
        ]
        if not scores:
            continue
        best_score = max(scores)
        if best_score <= 50:
            continue
        # Every award tied for the best score receives the tweet, in the
        # order the awards are listed.
        for award, score in zip(awards, scores):
            if score == best_score:
                award_candidate_sents = handle_high_score(
                    award_candidate_sents, award, tweet.text_unchanged,
                    best_score)

    # Pass 2: turn the collected tweets into candidate strings per award.
    final_nominees = dict()
    awards_no_candidates = dict()
    for award in awards:
        award_tokens = word_tokenize(award)
        person = ('actor' in award_tokens or 'actress' in award_tokens
                  or 'director' in award_tokens)

        if award not in award_candidate_sents:
            # No tweet matched this award: remember the most similarly named
            # other award so its candidates can be borrowed in pass 3.
            similar_award_candidates = awards.copy()
            similar_award_candidates.remove(award)
            most_similar = ''
            high_score = 0
            for similar in similar_award_candidates:
                score = fuzz.ratio(award, similar)
                if score > high_score:
                    high_score = score
                    most_similar = similar
            awards_no_candidates[award] = most_similar
            continue

        bucket = []
        for candidate in award_candidate_sents[award]:
            tokenized = word_tokenize(candidate)
            # Position of the first key word (in key_words order) that occurs
            # as an exact token; -1 when none does.
            nom_idx = -1
            for word in key_words:
                if word in tokenized:
                    nom_idx = tokenized.index(word)
                    break

            if nom_idx == 0 or nom_idx == -1:
                bucket.extend(
                    make_string_combinations(tokenized, 'right', person))
            elif nom_idx == len(tokenized) - 1:
                bucket.extend(
                    make_string_combinations(tokenized, 'left', person))
            else:
                # Key word sits mid-sentence: mine both sides of it.
                bucket.extend(
                    make_string_combinations(tokenized[0:nom_idx], 'left',
                                             person))
                bucket.extend(
                    make_string_combinations(tokenized[nom_idx + 1:], 'right',
                                             person))
        final_nominees[award] = bucket

    # Pass 3: rank candidates by frequency.
    complete_noms = dict()
    for award in awards:
        if award in awards_no_candidates:
            # The borrowed award may itself have no bucket (or be ''), so
            # fall back to an empty bucket instead of raising KeyError.
            borrowed = final_nominees.get(awards_no_candidates[award], [])
            # NOTE(review): the [6:] slice keeps ranks 7-10 only, presumably
            # to avoid repeating the top picks of the award the bucket was
            # borrowed from -- confirm whether [5:] was intended.
            ranked = FreqDist(borrowed).most_common(10)[6:]
        else:
            ranked = FreqDist(final_nominees[award]).most_common(5)
        # Assign unconditionally so every award is present in the result;
        # the previous code indexed complete_noms[award] before creating it.
        complete_noms[award] = [name for name, _count in ranked]
    return complete_noms
コード例 #3
0
ファイル: awards.py プロジェクト: MJKIM700/CS337_P1
def get_tweets(year):
    """Return a list with the ``text`` of every tweet in ``pull_text.Data(year)``."""
    return [entry.text for entry in pull_text.Data(year).tweets]