Example #1
def get_article_pairs(graph_file, articles_file, pairs_out_file, set_type):
    # Build (article, paper) pairs from the graph and write them as a TSV.
    # Test set: keep every outgoing edge of articles citing more than one
    # paper. Train set: pair each single-citation article with its true paper
    # and with one randomly sampled unrelated paper as a negative example.
    G = read_graph(graph_file)
    articles = pd.read_csv(articles_file, sep='\t')['url'].tolist()

    if set_type == 'test':
        pairs = []
        for a in articles:
            if G.out_degree(a) > 1:
                for p in G.successors(a):
                    pairs.append([a, p])

        df = pd.DataFrame(pairs, columns=['article', 'paper'])
        df.to_csv(pairs_out_file, sep='\t', index=False)

    elif set_type == 'train':
        true_pairs = []
        for a in articles:
            if G.out_degree(a) == 1:
                true_pairs.append([a, next(iter(G.successors(a))), True])

        # Negative sampling: pair each single-citation article with a randomly
        # drawn paper that differs from the one it actually cites.
        false_pairs = []
        for a in articles:
            if G.out_degree(a) == 1:
                true_successor = next(iter(G.successors(a)))
                while True:
                    index = randint(0, len(true_pairs) - 1)
                    if true_pairs[index][1] != true_successor:
                        false_pairs.append([a, true_pairs[index][1], False])
                        break

        df = pd.DataFrame(true_pairs + false_pairs,
                          columns=['article', 'paper', 'related'])
        df.to_csv(pairs_out_file, sep='\t', index=False)
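All of the examples here call a read_graph helper that is not shown. A minimal sketch of what it could look like, assuming the graph file is a two-column TSV edge list; the column names 'source' and 'target' are guesses, not the repository's actual schema:

import networkx as nx
import pandas as pd

def read_graph(graph_file):
    # Hypothetical reconstruction: build a directed graph from a TSV edge
    # list with assumed 'source' and 'target' columns.
    edges = pd.read_csv(graph_file, sep='\t')
    return nx.from_pandas_edgelist(edges, source='source', target='target',
                                   create_using=nx.DiGraph)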
Example #2
def aggregate_all_features(graph_file, articles_file, papers_file, tweets_file,
                           model_folder, final_file):

    # Start from the article table enriched with tweet-level social features.
    df = attach_social_media_details(graph_file, tweets_file, articles_file)

    # Article-paper similarity: keep each article's best score across papers.
    predict_similarity(graph_file, articles_file, papers_file, model_folder)
    sim = pd.read_csv(model_folder + '/predict_pairs.tsv', sep='\t').drop(
        'paper', axis=1).groupby('article').max().reset_index()
    df = df.merge(sim, left_on='url',
                  right_on='article').drop('article', axis=1)

    # Text-level features: readability of the body, title sentiment, and a
    # clickbait score for the title.
    df['readability'] = df['full_text'].apply(
        lambda x: textstat.flesch_reading_ease(x))
    df['title_subjectivity'] = df['title'].apply(
        lambda x: TextBlob(x).subjectivity)
    df['title_polarity'] = df['title'].apply(lambda x: TextBlob(x).polarity)
    df['title_clickbaitness'] = df['title'].apply(is_clickbait)

    # Expand the serialized quote-indicator dict into separate columns.
    quote_indicators = df['quote_indicators'].apply(
        lambda x: pd.Series(eval(x)))
    df = pd.concat([df.drop('quote_indicators', axis=1), quote_indicators],
                   axis=1)

    df['has_author'] = df['authors'].apply(lambda x: len(eval(x)) != 0)

    # Graph-based centrality features; PageRank is run on the reversed graph.
    G = read_graph(graph_file)
    pagerank = nx.pagerank(G.reverse())
    betweenness_centrality = nx.betweenness_centrality(G)
    degree_centrality = nx.degree_centrality(G)
    in_degree_centrality = nx.in_degree_centrality(G)
    out_degree_centrality = nx.out_degree_centrality(G)

    df['pageRank'] = df['url'].apply(lambda x: pagerank[x])
    df['betweenness_centrality'] = df['url'].apply(
        lambda x: betweenness_centrality[x])
    df['degree_centrality'] = df['url'].apply(lambda x: degree_centrality[x])
    df['in_degree_centrality'] = df['url'].apply(
        lambda x: in_degree_centrality[x])
    df['out_degree_centrality'] = df['url'].apply(
        lambda x: out_degree_centrality[x])

    df['word_count'] = df['full_text'].apply(
        lambda x: len(re.findall(r'\w+', x)))

    # Look up each site's Alexa reach rank via the data.alexa.com XML API.
    df['alexa_rank'] = df['url'].apply(lambda x: bs4.BeautifulSoup(
        urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" +
                               str(x)).read(), "xml").find("REACH")['RANK'])

    df.url = df.url.apply(lambda x: analyze_url(x)[0])

    # Keep only the final feature columns, then rename them to their
    # human-readable names.
    df = df[[
        'url', 'likes', 'replies_count', 'title_clickbaitness',
        'betweenness_centrality', 'degree_centrality', 'in_degree_centrality',
        'out_degree_centrality', 'replies_mean_polarity',
        'replies_mean_subjectivity', 'retweets', 'stance',
        'tweets_mean_polarity', 'tweets_mean_subjectivity',
        'tweets_time_delta', 'users_countries', 'users_median_followers',
        'users_median_friends', 'users_median_tweets', 'related',
        'readability', 'title_subjectivity', 'title_polarity',
        'count_all_quotes', 'count_PER_quotes', 'count_ORG_quotes',
        'count_unnamed_quotes', 'has_author', 'pageRank', 'word_count',
        'alexa_rank'
    ]]

    df.columns = [
        'url', '#Likes', '#Replies', 'Title Clickbaitness',
        'Betweenness Centrality', 'Degree Centrality', 'In Degree Centrality',
        'Out Degree Centrality', 'Replies Polarity', 'Replies Subjectivity',
        '#Retweets', 'Replies Stance', 'Tweets Polarity',
        'Tweets Subjectivity', 'Tweets Shelf Life', '#Users Countries',
        '#Followers', '#Users Friends', '#Users Tweets', 'STS', 'Readability',
        'Title Subjectivity', 'Title Polarity', '#Quotes', '#Person Quotes',
        '#Scientific Mentions', '#Weasel Quotes', 'Author Signature',
        'Personalized PageRank', 'Article Word Count', 'Alexa Rank'
    ]

    df.to_csv(final_file, sep='\t', index=False)
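A hedged invocation sketch for the full feature-aggregation step; every path below is a placeholder, and predict_similarity is expected to write predict_pairs.tsv into model_folder before it is read back:

# Placeholder paths for illustration only.
aggregate_all_features(graph_file='data/graph.tsv',
                       articles_file='data/articles.tsv',
                       papers_file='data/papers.tsv',
                       tweets_file='data/tweets.tsv',
                       model_folder='models/similarity',
                       final_file='data/features.tsv')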
Example #3
def aggregate_tweet_details(graph_file, tweet_file, article_in_file,
                            article_out_file):
    G = read_graph(graph_file)
    tweet_details = pd.read_csv(tweet_file, sep='\t').fillna(0)
    article_details = pd.read_csv(article_in_file, sep='\t')

    def func(url, tweet_details):
        # Aggregate all tweets that point to one article into a single feature
        # row: shelf life, audience spread, engagement, sentiment and stance.
        tweet_details = tweet_details.copy()
        if len(tweet_details['publish_date']) in [0, 1]:
            delta_in_hours = 0
        else:
            # Shelf life: hours between the 5th and 95th percentile of the
            # tweet timestamps; with more than two tweets, drop the temporal
            # outliers outside that window before aggregating.
            tweet_details['publish_date'] = pd.to_datetime(
                tweet_details['publish_date']).astype('int64') // 1e9
            dmin, dmax = np.percentile(
                tweet_details['publish_date'].tolist(), [5, 95])
            delta_in_hours = (dmax - dmin) // 3600
            if len(tweet_details['publish_date']) != 2:
                tweet_details = tweet_details[
                    (dmin < tweet_details['publish_date'])
                    & (tweet_details['publish_date'] < dmax)]

        # Collect the aggregates in a fixed order, then label them by name.
        agg = [url]
        agg.append(delta_in_hours)
        agg.append(
            len(tweet_details['user_country'].dropna().unique().tolist()))
        agg = agg + tweet_details[['RTs', 'replies_num', 'likes'
                                   ]].sum(axis=0).tolist()
        agg = agg + tweet_details[[
            'tweet_polarity', 'tweet_subjectivity', 'replies_mean_polarity',
            'replies_mean_subjectivity'
        ]].mean(axis=0).tolist()
        agg = agg + tweet_details[[
            'user_followers_count', 'user_tweet_count', 'user_friends_count'
        ]].median(axis=0).tolist()
        agg.append(tweet_details['stance'].mean(axis=0))

        agg = {
            'url': agg[0],
            'tweets_time_delta': agg[1],
            'users_countries': agg[2],
            'retweets': agg[3],
            'replies_count': agg[4],
            'likes': agg[5],
            'tweets_mean_polarity': agg[6],
            'tweets_mean_subjectivity': agg[7],
            'replies_mean_polarity': agg[8],
            'replies_mean_subjectivity': agg[9],
            'users_median_followers': agg[10],
            'users_median_tweets': agg[11],
            'users_median_friends': agg[12],
            'stance': agg[13]
        }
        return agg

    # For each article, aggregate the tweets that link to it in the graph and
    # merge the resulting features back into the article table.
    tweet_features = pd.DataFrame(article_details['url'].apply(
        lambda x: func(x, tweet_details[tweet_details['url'].isin(
            G.predecessors(x))])).tolist())
    article_details = article_details.merge(tweet_features, on='url')
    article_details.to_csv(article_out_file, sep='\t', index=False)
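A possible call, again with placeholder paths; tweet_file is assumed to already contain the per-tweet sentiment, user, and stance columns that the aggregation reads:

# Placeholder paths; the real pipeline supplies its own file locations.
aggregate_tweet_details(graph_file='data/graph.tsv',
                        tweet_file='data/tweet_details.tsv',
                        article_in_file='data/articles.tsv',
                        article_out_file='data/articles_with_tweets.tsv')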
Example #4
def attach_social_media_details(graph_file, tweet_file, article_file):
    G = read_graph(graph_file)

    # Per-tweet features: likes derived from popularity minus retweets, plus
    # sentiment of the tweet text and of its replies.
    tweet_details = pd.read_csv(tweet_file, sep='\t')
    tweet_details['likes'] = tweet_details['popularity'] - tweet_details['RTs']
    tweet_details['tweet_polarity'] = tweet_details['full_text'].apply(
        lambda x: TextBlob(x).sentiment.polarity)
    tweet_details['tweet_subjectivity'] = tweet_details['full_text'].apply(
        lambda x: TextBlob(x).sentiment.subjectivity)
    tweet_details['replies_mean_polarity'] = tweet_details['replies'].apply(
        lambda x: np.mean([TextBlob(r).sentiment.polarity for r in eval(x)]))
    tweet_details['replies_mean_subjectivity'] = tweet_details[
        'replies'].apply(lambda x: np.mean(
            [TextBlob(r).sentiment.subjectivity for r in eval(x)]))
    tweet_details['user'] = tweet_details['url'].apply(
        lambda x: x.split('/')[3])
    # Join user profile details (twitter_users_file is defined at module
    # level) and normalize missing values.
    tweet_details = tweet_details.merge(
        pd.read_csv(twitter_users_file, sep='\t'),
        left_on='user', right_on='screen_name', how='left')
    tweet_details = tweet_details.drop(['popularity', 'user', 'screen_name'],
                                       axis=1)
    tweet_details = tweet_details.replace('\\N', np.NaN)

    # Predict the stance of each tweet's replies with a pre-trained
    # classifier; tweets without replies get a stance of 0.
    classifier = pickle.load(open(stance_classifier, 'rb'))
    X = np.array(stance_features_extraction(
        tweet_details[['replies']].rename(columns={'replies': 'Tweet'})).drop(
            'Tweet', axis=1).values, dtype=np.float32)
    tweet_details['stance'] = classifier.predict_proba(X)[:, 1]
    tweet_details['stance'] = tweet_details.apply(
        lambda x: 0 if x.replies_num == 0 else x.stance, axis=1)
    tweet_details = tweet_details.fillna(0)

    article_details = pd.read_csv(article_file, sep='\t')

    def func(url, tweet_details):
        # Same aggregation helper as in aggregate_tweet_details above:
        # collapse the tweets pointing to one article into a single feature
        # row.
        tweet_details = tweet_details.copy()
        if len(tweet_details['publish_date']) in [0, 1]:
            delta_in_hours = 0
        else:
            tweet_details['publish_date'] = pd.to_datetime(
                tweet_details['publish_date']).astype('int64') // 1e9
            dmin, dmax = np.percentile(
                tweet_details['publish_date'].tolist(), [5, 95])
            delta_in_hours = (dmax - dmin) // 3600
            if len(tweet_details['publish_date']) != 2:
                tweet_details = tweet_details[
                    (dmin < tweet_details['publish_date'])
                    & (tweet_details['publish_date'] < dmax)]

        agg = [url]
        agg.append(delta_in_hours)
        agg.append(
            len(tweet_details['user_country'].dropna().unique().tolist()))
        agg = agg + tweet_details[['RTs', 'replies_num', 'likes'
                                   ]].sum(axis=0).tolist()
        agg = agg + tweet_details[[
            'tweet_polarity', 'tweet_subjectivity', 'replies_mean_polarity',
            'replies_mean_subjectivity'
        ]].mean(axis=0).tolist()
        agg = agg + tweet_details[[
            'user_followers_count', 'user_tweet_count', 'user_friends_count'
        ]].median(axis=0).tolist()
        agg.append(tweet_details['stance'].mean(axis=0))

        agg = {
            'url': agg[0],
            'tweets_time_delta': agg[1],
            'users_countries': agg[2],
            'retweets': agg[3],
            'replies_count': agg[4],
            'likes': agg[5],
            'tweets_mean_polarity': agg[6],
            'tweets_mean_subjectivity': agg[7],
            'replies_mean_polarity': agg[8],
            'replies_mean_subjectivity': agg[9],
            'users_median_followers': agg[10],
            'users_median_tweets': agg[11],
            'users_median_friends': agg[12],
            'stance': agg[13]
        }
        return agg

    # For each article, aggregate the tweets that link to it in the graph and
    # merge the resulting features back into the article table.
    tweet_features = pd.DataFrame(article_details['url'].apply(
        lambda x: func(x, tweet_details[tweet_details['url'].isin(
            G.predecessors(x))])).tolist())
    article_details = article_details.merge(tweet_features, on='url')
    return article_details
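Unlike the previous helpers, this one returns the enriched DataFrame instead of writing a file, so a caller would persist it explicitly. The paths below are placeholders; twitter_users_file and stance_classifier must also be defined at module level before calling it:

# Placeholder paths for illustration only.
df = attach_social_media_details('data/graph.tsv', 'data/tweets.tsv',
                                 'data/articles.tsv')
df.to_csv('data/articles_social.tsv', sep='\t', index=False)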