Code Example #1
import numpy as np


def get_relevant_tweets(user, i=0.8):
    """Keep only the tweets whose average word-embedding distance to the
    fact's keywords is below the threshold `i`.

    Relies on module-level globals: `fact_to_words` (fact -> keywords),
    `word_vectors` (gensim KeyedVectors) and `gt` (tokenizer helper).
    """
    relevant_tweets = []
    if user.features is None: user.features = {}
    if user.fact not in fact_to_words: return user
    if user.tweets is None: return user
    # Only keep fact keywords that are in the embedding vocabulary.
    user_fact_words = [
        fw for fw in fact_to_words[user.fact] if fw in word_vectors.vocab
    ]
    for tweet in user.tweets:
        distance_to_topic = []
        tokens = gt.get_tokenize_text(tweet['text'])
        for token in tokens:
            if token not in word_vectors.vocab: continue
            # Average cosine distance from this token to all fact keywords.
            increment = np.average(
                word_vectors.distances(token, other_words=user_fact_words))
            distance_to_topic.append(increment)
        if not distance_to_topic:
            continue  # none of the tokens are in the embedding vocabulary
        if np.average(np.asarray(distance_to_topic)) < i:
            relevant_tweets.append(tweet)
    user.features['relevant_tweets'] = relevant_tweets
    return user
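
A minimal usage sketch for the filter above, assuming `users` is an iterable of user objects with `fact`, `tweets` and `features` attributes, and that the globals `fact_to_words`, `word_vectors` and `gt` are already loaded; the driver names here are illustrative, not part of the original code.

# Hypothetical driver: keep only topic-relevant tweets for every user and
# report how many tweets survive the default distance threshold of 0.8.
users = [get_relevant_tweets(u) for u in users]
kept = sum(len(u.features.get('relevant_tweets', [])) for u in users)
print('relevant tweets kept:', kept)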
Code Example #2
import re
import datetime

from dateutil import parser


def get_features(fact, transactions, users):
    # Aggregates tweet- and user-level statistics for one fact.
    # Relies on module-level helpers: `gt` (tokenizer) and `sid`
    # (NLTK SentimentIntensityAnalyzer).
    if fact['true'] == 'unknown':
        print(fact)
        return
    this_transactions = transactions[transactions['fact'] == fact['hash']]
    if this_transactions.shape[0] == 0:
        print(fact['hash'])
        return

    avg_friends = []
    avg_followers = []
    avg_status_cnt = []
    avg_reg_age = []
    avg_links = []
    avg_sent_pos = []
    avg_sent_neg = []
    avg_emoticons = []
    avg_questionM = []
    avg_mentions = []
    avg_personal_pronoun_first = []
    fr_has_url = []
    avg_sentiment = []
    share_most_freq_author = []
    lvl_size = ''
    matched_users = 0

    avg_len = []
    avg_words = []
    all_characters = []
    avg_unique_char = []
    retweets = []
    tweets_that_are_retweet = []
    avg_special_symbol = []
    tweets_that_are_reply = []
    avg_hashtags = []
    avg_exlamationM = []
    avg_multiQueExlM = []
    avg_upperCase = []
    avg_count_distinct_hashtags = []
    most_common_weekday = []
    most_common_hour = []
    avg_count_distinct_words = []
    avg_verified = []
    avg_time_retweet = []
    avg_len_description = []
    avg_len_name = []

    # ASCII smileys plus the emoji blocks, expressed as full Unicode
    # code-point ranges so they match Python 3 strings.
    emoji_pattern = re.compile(
        u"(?::\\(|:\\))"
        u"|[\U0001F600-\U0001F64F]"  # emoticons
        u"|[\U0001F300-\U0001F5FF]"  # symbols & pictographs
        u"|[\U0001F680-\U0001F6FF]"  # transport & map symbols
        u"|[\U0001F1E0-\U0001F1FF]",  # flags (iOS)
        flags=re.UNICODE)

    link_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    pronouns = [
        'I', 'you', 'he', 'she', 'it', 'they', 'we', 'me', 'him', 'her', 'its',
        'our', 'us', 'them', 'my', 'your', 'his', 'hers', 'yours', 'theirs',
        'mine', 'ours'
    ]

    for idx, tr in this_transactions.iterrows():
        tokenized_text = gt.get_tokenize_text(tr['text'])
        if len(tokenized_text) == 0: tokenized_text = [0]
        chars = [c for c in tr['text']]
        avg_questionM.append(1 if '?' in tr['text'] else 0)
        avg_personal_pronoun_first.append(1 if tokenized_text[0] in
                                          pronouns else 0)
        sent_score = sid.polarity_scores(tr['text'])['compound']
        avg_sentiment.append(sent_score)
        avg_sent_pos.append(1 if sent_score > 0.5 else 0)
        # Strongly negative compound score (mirrors the positive cut-off).
        avg_sent_neg.append(1 if sent_score < -0.5 else 0)
        avg_emoticons.append(len(re.findall(emoji_pattern, tr['text'])))
        avg_links.extend(re.findall(link_pattern, tr['text']))
        fr_has_url.append(
            1 if len(re.findall(link_pattern, tr['text'])) != 0 else 0)
        avg_mentions.append(len([c for c in chars if c == '@']))
        lvl_size = idx
        avg_len.append(len(tr['text']))
        avg_words.append(len(tokenized_text))
        avg_count_distinct_words.extend(tokenized_text)
        all_characters.extend(chars)
        avg_unique_char.append(len(set(chars)))
        avg_hashtags.append(len([c for c in chars if c == '#']))
        avg_special_symbol.append(len(re.findall('[^0-9a-zA-Z *]',
                                                 tr['text'])))
        avg_exlamationM.append(1 if '!' in tr['text'] else 0)
        # Flag tweets with more than one '?' or '!' in total.
        avg_multiQueExlM.append(
            1 if len(re.findall(r'[?]', tr['text'])) +
            len(re.findall(r'[!]', tr['text'])) > 1 else 0)
        avg_upperCase.append(len(re.findall(r'[A-Z]', tr['text'])))
        avg_count_distinct_hashtags.append(
            len(re.findall(r'#', tr['text'])))
        most_common_weekday.append(tr['timestamp'].weekday())  # 0 = Monday
        most_common_hour.append(tr['timestamp'].hour)

        # if tr['user_id'] in users['user_id'].values: print(tr['user_id'])
        user = [u for u in users if int(u.user_id) == int(tr['user_id'])]
        if len(user) < 1:
            print(tr['user_id'])
            continue
        user = user[0]
        if user.features is None:
            print(user.user_id)
            continue

        matched_users += 1
        avg_friends.append(
            int(user.features['friends']) if 'friends' in user.features else 0)
        avg_followers.append(
            int(user.features['followers']) if 'followers' in
            user.features else 0)
        avg_status_cnt.append(
            int(user.features['statuses_count']) if 'statuses_count' in
            user.features else 0)
        avg_reg_age.append(
            int((datetime.datetime.now().replace(tzinfo=None) -
                 parser.parse(user.features['created_at']).replace(tzinfo=None)
                 ).days if 'created_at' in user.features else 0))

        avg_verified.append(1 if 'verified' in user.features
                            and user.features['verified'] == 'true' else 0)
        avg_time_retweet.append(int(user.avg_time_to_retweet))
        avg_len_description.append(
            len(user.features['description']) if 'description' in
            user.features else 0)
        avg_len_name.append(
            len(user.features['name']) if 'name' in user.features else 0)

    if matched_users == 0: matched_users = 1  # avoid division by zero below
    num_transactions = this_transactions.shape[0]

    avg_emoticons = 1.0 * sum(avg_emoticons) / (num_transactions)
    avg_mentions = sum(avg_mentions) / (num_transactions)
    avg_links = 1.0 * len(set(avg_links)) / (num_transactions)
    avg_questionM = 1.0 * sum(avg_questionM) / (num_transactions)
    avg_sent_pos = 1.0 * sum(avg_sent_pos) / (num_transactions)
    avg_sent_neg = 1.0 * sum(avg_sent_neg) / (num_transactions)
    avg_sentiment = 1.0 * sum(avg_sentiment) / (num_transactions)
    avg_personal_pronoun_first = sum(avg_personal_pronoun_first) * 1.0 / (
        num_transactions)
    fr_has_url = 1.0 * sum(fr_has_url) / (num_transactions)
    # Placeholder: 1/n is the share the most frequent author would have if
    # every author posted exactly once.
    share_most_freq_author = 1.0 / num_transactions

    avg_friends = 1.0 * sum(avg_friends) / matched_users
    avg_reg_age = 1.0 * sum(avg_reg_age) / matched_users
    avg_followers = 1.0 * sum(avg_followers) / matched_users
    avg_status_cnt = 1.0 * sum(avg_status_cnt) / matched_users

    avg_len = 1.0 * sum(avg_len) / num_transactions
    avg_words = 1.0 * sum(avg_words) / num_transactions
    avg_unique_char = 1.0 * sum(avg_unique_char) / num_transactions
    avg_hashtags = sum(avg_hashtags) / num_transactions
    # `retweets` is never populated above, so this feature is always 0.0.
    avg_retweets = 1.0 * sum(retweets) / num_transactions
    avg_special_symbol = sum(avg_special_symbol) / num_transactions
    avg_exlamationM = 1.0 * sum(avg_exlamationM) / num_transactions
    avg_multiQueExlM = 1.0 * sum(avg_multiQueExlM) / num_transactions
    avg_upperCase = 1.0 * sum(avg_upperCase) / num_transactions
    avg_count_distinct_hashtags = 1.0 * sum(
        avg_count_distinct_hashtags) / num_transactions
    most_common_weekday = 1.0 * sum(most_common_weekday) / num_transactions
    most_common_hour = 1.0 * sum(most_common_hour) / num_transactions
    avg_count_distinct_words = 1.0 * len(
        set(avg_count_distinct_words)) / num_transactions
    avg_verified = 1.0 * sum(avg_verified) / num_transactions
    avg_time_retweet = 1.0 * sum(avg_time_retweet) / num_transactions
    avg_len_description = 1.0 * sum(avg_len_description) / num_transactions
    avg_len_name = 1.0 * sum(avg_len_name) / num_transactions

    return {
        'hash': fact['hash'],
        'topic': fact['topic'],
        'source_tweet': fact['source_tweet'],
        'text': fact['text'],
        'y': int(fact['true']),
        'avg_mentions': avg_mentions,
        'avg_emoticons': avg_emoticons,
        'avg_links': avg_links,
        'avg_questionM': avg_questionM,
        'avg_personal_pronoun_first': avg_personal_pronoun_first,
        'avg_sent_pos': avg_sent_pos,
        'avg_sent_neg': avg_sent_neg,
        'avg_sentiment': avg_sentiment,
        'fr_has_url': fr_has_url,
        'share_most_freq_author': share_most_freq_author,
        'lvl_size': lvl_size,
        'avg_followers': avg_followers,
        'avg_friends': avg_friends,
        'avg_status_cnt': avg_status_cnt,
        'avg_reg_age': avg_reg_age,
        'avg_len': avg_len,
        'avg_words': avg_words,
        'avg_unique_char': avg_unique_char,
        'avg_hashtags': avg_hashtags,
        'avg_retweets': avg_retweets,
        'avg_special_symbol': avg_special_symbol,
        'avg_exlamationM': avg_exlamationM,
        'avg_multiQueExlM': avg_multiQueExlM,
        'avg_upperCase': avg_upperCase,
        'avg_count_distinct_hashtags': avg_count_distinct_hashtags,
        'most_common_weekday': most_common_weekday,
        'most_common_hour': most_common_hour,
        'avg_count_distinct_words': avg_count_distinct_words,
        'avg_verified': avg_verified,
        'avg_time_retweet': avg_time_retweet,
        'avg_len_description': avg_len_description,
        'avg_len_name': avg_len_name
    }
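
A usage sketch for the feature extractor, assuming `facts` and `transactions` are pandas DataFrames and `users` is the list of user objects expected above; `facts`, `feature_rows` and `features_df` are illustrative names, not part of the original code.

import pandas as pd

# Hypothetical driver: build one feature row per labelled fact and collect
# them into a DataFrame for training.
feature_rows = []
for _, fact in facts.iterrows():
    row = get_features(fact, transactions, users)
    if row is not None:  # skipped: label 'unknown' or no matching transactions
        feature_rows.append(row)
features_df = pd.DataFrame(feature_rows)
print(features_df.shape)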
Code Example #3
from keras.preprocessing import sequence  # pad_sequences

def get_credibility(text):
    # Map the text to word indices, pad to the model's input length and
    # return class probabilities. Relies on globals `gt`, `word_to_idx` and
    # `model` (predict_proba exists on older Keras Sequential models).
    text = gt.get_tokenize_text(text)
    text = [word_to_idx[w] for w in text if w in word_to_idx]
    text = sequence.pad_sequences([text], maxlen=12)
    probs = model.predict_proba(text)
    return probs
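
A usage sketch, assuming `word_to_idx`, `model` and `gt` have been loaded beforehand; the sample text is illustrative.

# Hypothetical call: score one tweet; the result is an array of class
# probabilities, one row per input sequence.
probs = get_credibility("BREAKING: officials confirm the earlier report")
print(probs[0])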
Code Example #4
def get_features(fact, transactions, users):
    # Relies on module-level helpers: `gt` (tokenizer) and `sid`
    # (NLTK SentimentIntensityAnalyzer); imports as in Code Example #2.
    if fact['true'] == 'unknown':
        print(fact)
        return
    this_transactions = transactions[transactions['fact'] == fact['hash']]
    if this_transactions.shape[0] == 0:
        print(fact['hash'])
        return

    avg_friends = []
    avg_followers = []
    avg_status_cnt = []
    avg_reg_age = []
    avg_links = []
    avg_sent_pos = []
    avg_sent_neg = []
    avg_emoticons = []
    avg_questionM = []
    avg_mentions = []
    avg_personal_pronoun_first = []
    fr_has_url = []
    avg_sentiment = []
    share_most_freq_author = []
    lvl_size = ''
    matched_users = 0

    # ASCII smileys plus the emoji blocks, expressed as full Unicode
    # code-point ranges so they match Python 3 strings.
    emoji_pattern = re.compile(
        u"(?::\\(|:\\))"
        u"|[\U0001F600-\U0001F64F]"  # emoticons
        u"|[\U0001F300-\U0001F5FF]"  # symbols & pictographs
        u"|[\U0001F680-\U0001F6FF]"  # transport & map symbols
        u"|[\U0001F1E0-\U0001F1FF]",  # flags (iOS)
        flags=re.UNICODE)

    link_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    pronouns = [
        'I', 'you', 'he', 'she', 'it', 'they', 'we', 'me', 'him', 'her', 'its',
        'our', 'us', 'them', 'my', 'your', 'his', 'hers', 'yours', 'theirs',
        'mine', 'ours'
    ]

    for idx, tr in this_transactions.iterrows():
        tokenized_text = gt.get_tokenize_text(tr['text'])
        if len(tokenized_text) == 0: tokenized_text = [0]
        chars = [c for c in tr['text']]
        avg_questionM.append(1 if '?' in tr['text'] else 0)
        avg_personal_pronoun_first.append(1 if tokenized_text[0] in
                                          pronouns else 0)
        sent_score = sid.polarity_scores(tr['text'])['compound']
        avg_sentiment.append(sent_score)
        avg_sent_pos.append(1 if sent_score > 0.5 else 0)
        # Strongly negative compound score (mirrors the positive cut-off).
        avg_sent_neg.append(1 if sent_score < -0.5 else 0)
        avg_emoticons.append(len(re.findall(emoji_pattern, tr['text'])))
        avg_links.extend(re.findall(link_pattern, tr['text']))
        fr_has_url.append(
            1 if len(re.findall(link_pattern, tr['text'])) != 0 else 0)
        avg_mentions.append(len([c for c in chars if c == '@']))
        lvl_size = idx

        # if tr['user_id'] in users['user_id'].values: print(tr['user_id'])
        user = [u for u in users if int(u.user_id) == int(tr['user_id'])]
        if len(user) < 1:
            print(tr['user_id'])
            continue
        user = user[0]
        if user.features is None:
            print(user.user_id)
            continue

        matched_users += 1
        avg_friends.append(
            int(user.features['friends']) if 'friends' in user.features else 0)
        avg_followers.append(
            int(user.features['followers']) if 'followers' in
            user.features else 0)
        avg_status_cnt.append(
            int(user.features['statuses_count']) if 'statuses_count' in
            user.features else 0)
        avg_reg_age.append(
            int((datetime.datetime.now().replace(tzinfo=None) -
                 parser.parse(user.features['created_at']).replace(tzinfo=None)
                 ).days if 'created_at' in user.features else 0))

    if matched_users == 0: matched_users = 1  # avoid division by zero below
    num_transactions = this_transactions.shape[0]

    avg_emoticons = 1.0 * sum(avg_emoticons) / (num_transactions)
    avg_mentions = sum(avg_mentions) / (num_transactions)
    avg_links = 1.0 * len(set(avg_links)) / (num_transactions)
    avg_questionM = 1.0 * sum(avg_questionM) / (num_transactions)
    avg_sent_pos = 1.0 * sum(avg_sent_pos) / (num_transactions)
    avg_sent_neg = 1.0 * sum(avg_sent_neg) / (num_transactions)
    avg_sentiment = 1.0 * sum(avg_sentiment) / (num_transactions)
    avg_personal_pronoun_first = sum(avg_personal_pronoun_first) * 1.0 / (
        num_transactions)
    fr_has_url = 1.0 * sum(fr_has_url) / (num_transactions)
    # Placeholder: 1/n is the share the most frequent author would have if
    # every author posted exactly once.
    share_most_freq_author = 1.0 / num_transactions

    avg_friends = 1.0 * sum(avg_friends) / matched_users
    avg_reg_age = 1.0 * sum(avg_reg_age) / matched_users
    avg_followers = 1.0 * sum(avg_followers) / matched_users
    avg_status_cnt = 1.0 * sum(avg_status_cnt) / matched_users

    return {
        'hash': fact['hash'],
        'topic': fact['topic'],
        'source_tweet': fact['source_tweet'],
        'text': fact['text'],
        'y': int(fact['true']),
        'avg_mentions': avg_mentions,
        'avg_emoticons': avg_emoticons,
        'avg_links': avg_links,
        'avg_questionM': avg_questionM,
        'avg_personal_pronoun_first': avg_personal_pronoun_first,
        'avg_sent_pos': avg_sent_pos,
        'avg_sent_neg': avg_sent_neg,
        'avg_sentiment': avg_sentiment,
        'fr_has_url': fr_has_url,
        'share_most_freq_author': share_most_freq_author,
        'lvl_size': lvl_size,
        'avg_followers': avg_followers,
        'avg_friends': avg_friends,
        'avg_status_cnt': avg_status_cnt,
        'avg_reg_age': avg_reg_age
    }
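
This variant keeps only the reaction-level aggregates (mentions, emoticons, links, question marks, first-person pronouns, sentiment, URL share) and the user-account aggregates (followers, friends, status count, registration age), and drops the per-tweet text statistics of Code Example #2. Both versions return one feature row per fact keyed by `hash`, so either can be plugged into the same driver loop shown after Code Example #2.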