import pickle
import re
import urllib.request
from random import randint

import bs4
import networkx as nx
import numpy as np
import pandas as pd
import textstat
from textblob import TextBlob

# read_graph, analyze_url, is_clickbait, predict_similarity,
# stance_features_extraction, twitter_users_file and stance_classifier are
# assumed to be defined elsewhere in this module/package.


def get_article_pairs(graph_file, articles_file, pairs_out_file, set_type):
    G = read_graph(graph_file)
    articles = pd.read_csv(articles_file, sep='\t')['url'].tolist()

    if set_type == 'test':
        # For the test set, keep every (article, paper) candidate pair for
        # articles that link to more than one paper.
        pairs = []
        for a in articles:
            if G.out_degree(a) > 1:
                for p in G.successors(a):
                    pairs.append([a, p])
        df = pd.DataFrame(pairs, columns=['article', 'paper'])
        df.to_csv(pairs_out_file, sep='\t', index=False)

    elif set_type == 'train':
        # Articles pointing to exactly one paper give the positive pairs.
        true_pairs = []
        for a in articles:
            if G.out_degree(a) == 1:
                true_pairs.append([a, next(iter(G.successors(a))), True])

        # For each such article, sample one negative pair by drawing a random
        # paper from the positive pairs that is not the article's own paper.
        false_pairs = []
        for a in articles:
            if G.out_degree(a) == 1:
                true_successor = next(iter(G.successors(a)))
                while True:
                    index = randint(0, len(true_pairs) - 1)
                    if true_pairs[index][1] != true_successor:
                        false_pairs.append([a, true_pairs[index][1], False])
                        break

        df = pd.DataFrame(true_pairs + false_pairs,
                          columns=['article', 'paper', 'related'])
        df.to_csv(pairs_out_file, sep='\t', index=False)
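# Illustrative usage (the file paths below are placeholders, not part of the
# original module): 'train' writes labelled (article, paper, related) pairs
# with one sampled negative per article, while 'test' writes every candidate
# pair for articles linking to more than one paper.
#
#   get_article_pairs('graph.tsv', 'train_articles.tsv', 'train_pairs.tsv', 'train')
#   get_article_pairs('graph.tsv', 'test_articles.tsv', 'test_pairs.tsv', 'test')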
def aggregate_all_features(graph_file, articles_file, papers_file, tweets_file,
                           model_folder, final_file):
    # Start from the article table enriched with aggregated social-media
    # signals (tweets, replies, user statistics, stance).
    df = attach_social_media_details(graph_file, tweets_file, articles_file)

    # Article-paper semantic similarity: score every candidate pair, then
    # keep the maximum score per article.
    predict_similarity(graph_file, articles_file, papers_file, model_folder)
    sim = pd.read_csv(model_folder + '/predict_pairs.tsv', sep='\t').drop(
        'paper', axis=1).groupby('article').max().reset_index()
    df = df.merge(sim, left_on='url', right_on='article').drop('article',
                                                               axis=1)

    # Text-based features.
    df['readability'] = df['full_text'].apply(
        lambda x: textstat.flesch_reading_ease(x))
    df['title_subjectivity'] = df['title'].apply(
        lambda x: TextBlob(x).subjectivity)
    df['title_polarity'] = df['title'].apply(lambda x: TextBlob(x).polarity)
    df['title_clickbaitness'] = df['title'].apply(is_clickbait)

    # Expand the serialized quote-indicator dict into separate columns.
    df = pd.concat([
        df.drop(['quote_indicators'], axis=1),
        df['quote_indicators'].apply(lambda x: pd.Series(eval(x)))
    ], axis=1)
    df['has_author'] = df['authors'].apply(lambda x: len(eval(x)) != 0)

    # Graph-based features computed on the diffusion graph.
    G = read_graph(graph_file)
    pagerank = nx.pagerank(G.reverse())
    betweenness_centrality = nx.betweenness_centrality(G)
    degree_centrality = nx.degree_centrality(G)
    in_degree_centrality = nx.in_degree_centrality(G)
    out_degree_centrality = nx.out_degree_centrality(G)
    df['pageRank'] = df['url'].apply(lambda x: pagerank[x])
    df['betweenness_centrality'] = df['url'].apply(
        lambda x: betweenness_centrality[x])
    df['degree_centrality'] = df['url'].apply(lambda x: degree_centrality[x])
    df['in_degree_centrality'] = df['url'].apply(
        lambda x: in_degree_centrality[x])
    df['out_degree_centrality'] = df['url'].apply(
        lambda x: out_degree_centrality[x])

    df['word_count'] = df['full_text'].apply(
        lambda x: len(re.findall(r'\w+', x)))

    # Website popularity via the Alexa data API (one network call per URL).
    df['alexa_rank'] = df['url'].apply(lambda x: bs4.BeautifulSoup(
        urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" +
                               str(x)).read(), "xml").find("REACH")['RANK'])

    df.url = df.url.apply(lambda x: analyze_url(x)[0])

    # Keep the final feature set and give the columns presentation names.
    df = df[[
        'url', 'likes', 'replies_count', 'title_clickbaitness',
        'betweenness_centrality', 'degree_centrality', 'in_degree_centrality',
        'out_degree_centrality', 'replies_mean_polarity',
        'replies_mean_subjectivity', 'retweets', 'stance',
        'tweets_mean_polarity', 'tweets_mean_subjectivity',
        'tweets_time_delta', 'users_countries', 'users_median_followers',
        'users_median_friends', 'users_median_tweets', 'related',
        'readability', 'title_subjectivity', 'title_polarity',
        'count_all_quotes', 'count_PER_quotes', 'count_ORG_quotes',
        'count_unnamed_quotes', 'has_author', 'pageRank', 'word_count',
        'alexa_rank'
    ]]
    df.columns = [
        'url', '#Likes', '#Replies', 'Title Clickbaitness',
        'Betweenness Centrality', 'Degree Centrality', 'In Degree Centrality',
        'Out Degree Centrality', 'Replies Polarity', 'Replies Subjectivity',
        '#Retweets', 'Replies Stance', 'Tweets Polarity',
        'Tweets Subjectivity', 'Tweets Shelf Life', '#Users Countries',
        '#Followers', '#Users Friends', '#Users Tweets', 'STS', 'Readability',
        'Title Subjectivity', 'Title Polarity', '#Quotes', '#Person Quotes',
        '#Scientific Mentions', '#Weasel Quotes', 'Author Signature',
        'Personalized PageRank', 'Article Word Count', 'Alexa Rank'
    ]
    df.to_csv(final_file, sep='\t', index=False)
def aggregate_tweet_details(graph_file, tweet_file, article_in_file,
                            article_out_file):
    G = read_graph(graph_file)
    tweet_details = pd.read_csv(tweet_file, sep='\t').fillna(0)
    article_details = pd.read_csv(article_in_file, sep='\t')

    def func(url, tweet_details):
        tweet_details = tweet_details.copy()

        # Shelf life of the article on Twitter: hours between the 5th and
        # 95th percentile of the tweets' publication timestamps.
        if len(tweet_details['publish_date']) in [0, 1]:
            delta_in_hours = 0
        else:
            tweet_details['publish_date'] = pd.to_datetime(
                tweet_details['publish_date']).astype('int64') // 1e9
            [dmin, dmax] = np.percentile(
                tweet_details['publish_date'].tolist(), [5, 95])
            delta_in_hours = (dmax - dmin) // 3600
            if len(tweet_details['publish_date']) != 2:
                tweet_details = tweet_details[
                    (dmin < tweet_details['publish_date'])
                    & (tweet_details['publish_date'] < dmax)]

        # Aggregate per-tweet signals: sums for counts, means for sentiment
        # and stance, medians for user statistics.
        agg = [url]
        agg.append(delta_in_hours)
        agg.append(
            len(tweet_details['user_country'].dropna().unique().tolist()))
        agg = agg + tweet_details[['RTs', 'replies_num',
                                   'likes']].sum(axis=0).tolist()
        agg = agg + tweet_details[[
            'tweet_polarity', 'tweet_subjectivity', 'replies_mean_polarity',
            'replies_mean_subjectivity'
        ]].mean(axis=0).tolist()
        agg = agg + tweet_details[[
            'user_followers_count', 'user_tweet_count', 'user_friends_count'
        ]].median(axis=0).tolist()
        agg.append(tweet_details['stance'].mean(axis=0))

        agg = {
            'url': agg[0],
            'tweets_time_delta': agg[1],
            'users_countries': agg[2],
            'retweets': agg[3],
            'replies_count': agg[4],
            'likes': agg[5],
            'tweets_mean_polarity': agg[6],
            'tweets_mean_subjectivity': agg[7],
            'replies_mean_polarity': agg[8],
            'replies_mean_subjectivity': agg[9],
            'users_median_followers': agg[10],
            'users_median_tweets': agg[11],
            'users_median_friends': agg[12],
            'stance': agg[13]
        }
        return agg

    # For every article, aggregate the details of the tweets that point to it
    # in the diffusion graph.
    article_details = article_details.merge(
        pd.DataFrame(article_details['url'].apply(lambda x: func(
            x, tweet_details[tweet_details['url'].isin(G.predecessors(x))])).
                     tolist()),
        on='url')
    article_details.to_csv(article_out_file, sep='\t', index=False)
def attach_social_media_details(graph_file, tweet_file, article_file):
    G = read_graph(graph_file)
    tweet_details = pd.read_csv(tweet_file, sep='\t')

    # Per-tweet engagement and sentiment features.
    tweet_details['likes'] = tweet_details['popularity'] - tweet_details['RTs']
    tweet_details['tweet_polarity'] = tweet_details['full_text'].apply(
        lambda x: TextBlob(x).sentiment.polarity)
    tweet_details['tweet_subjectivity'] = tweet_details['full_text'].apply(
        lambda x: TextBlob(x).sentiment.subjectivity)
    tweet_details['replies_mean_polarity'] = tweet_details['replies'].apply(
        lambda x: np.mean([TextBlob(r).sentiment.polarity for r in eval(x)]))
    tweet_details['replies_mean_subjectivity'] = tweet_details[
        'replies'].apply(lambda x: np.mean(
            [TextBlob(r).sentiment.subjectivity for r in eval(x)]))

    # Join each tweet with its author's profile via the screen name embedded
    # in the tweet URL.
    tweet_details['user'] = tweet_details['url'].apply(
        lambda x: x.split('/')[3])
    tweet_details = tweet_details.merge(pd.read_csv(twitter_users_file,
                                                    sep='\t'),
                                        left_on='user',
                                        right_on='screen_name',
                                        how='left')
    tweet_details = tweet_details.drop(['popularity', 'user', 'screen_name'],
                                       axis=1)
    tweet_details = tweet_details.replace('\\N', np.nan)

    # Stance of the replies towards each tweet, predicted by a pre-trained
    # classifier; tweets without replies get a neutral score of 0.
    classifier = pickle.load(open(stance_classifier, 'rb'))
    X = np.array(stance_features_extraction(tweet_details[[
        'replies'
    ]].rename(columns={'replies': 'Tweet'})).drop('Tweet', axis=1).values,
                 dtype=np.float32)
    tweet_details['stance'] = classifier.predict_proba(X)[:, 1]
    tweet_details['stance'] = tweet_details.apply(
        lambda x: 0 if x.replies_num == 0 else x.stance, axis=1)
    tweet_details = tweet_details.fillna(0)

    article_details = pd.read_csv(article_file, sep='\t')

    def func(url, tweet_details):
        tweet_details = tweet_details.copy()

        # Shelf life of the article on Twitter: hours between the 5th and
        # 95th percentile of the tweets' publication timestamps.
        if len(tweet_details['publish_date']) in [0, 1]:
            delta_in_hours = 0
        else:
            tweet_details['publish_date'] = pd.to_datetime(
                tweet_details['publish_date']).astype('int64') // 1e9
            [dmin, dmax] = np.percentile(
                tweet_details['publish_date'].tolist(), [5, 95])
            delta_in_hours = (dmax - dmin) // 3600
            if len(tweet_details['publish_date']) != 2:
                tweet_details = tweet_details[
                    (dmin < tweet_details['publish_date'])
                    & (tweet_details['publish_date'] < dmax)]

        # Aggregate per-tweet signals: sums for counts, means for sentiment
        # and stance, medians for user statistics.
        agg = [url]
        agg.append(delta_in_hours)
        agg.append(
            len(tweet_details['user_country'].dropna().unique().tolist()))
        agg = agg + tweet_details[['RTs', 'replies_num',
                                   'likes']].sum(axis=0).tolist()
        agg = agg + tweet_details[[
            'tweet_polarity', 'tweet_subjectivity', 'replies_mean_polarity',
            'replies_mean_subjectivity'
        ]].mean(axis=0).tolist()
        agg = agg + tweet_details[[
            'user_followers_count', 'user_tweet_count', 'user_friends_count'
        ]].median(axis=0).tolist()
        agg.append(tweet_details['stance'].mean(axis=0))

        agg = {
            'url': agg[0],
            'tweets_time_delta': agg[1],
            'users_countries': agg[2],
            'retweets': agg[3],
            'replies_count': agg[4],
            'likes': agg[5],
            'tweets_mean_polarity': agg[6],
            'tweets_mean_subjectivity': agg[7],
            'replies_mean_polarity': agg[8],
            'replies_mean_subjectivity': agg[9],
            'users_median_followers': agg[10],
            'users_median_tweets': agg[11],
            'users_median_friends': agg[12],
            'stance': agg[13]
        }
        return agg

    # For every article, aggregate the details of the tweets that point to it
    # in the diffusion graph.
    article_details = article_details.merge(
        pd.DataFrame(article_details['url'].apply(lambda x: func(
            x, tweet_details[tweet_details['url'].isin(G.predecessors(x))])).
                     tolist()),
        on='url')
    return article_details
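# Minimal end-to-end sketch (assumed file layout; the paths and folder names
# below are placeholders, not taken from the original code): build the
# training pairs, then aggregate every article-level feature into a single
# table. predict_similarity() is expected to find a trained similarity model
# under model_folder.
if __name__ == '__main__':
    get_article_pairs('data/graph.tsv', 'data/train_articles.tsv',
                      'data/train_pairs.tsv', set_type='train')
    aggregate_all_features('data/graph.tsv', 'data/train_articles.tsv',
                           'data/papers.tsv', 'data/tweets.tsv',
                           'models', 'data/features.tsv')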