def get_dataset(tweets_path: str, labeled_data_paths: List[str]) -> Tuple[np.ndarray, np.ndarray, CountVectorizer]:
    # Create the tweet vectorizer using the full dataset
    tweets = [t.cleaned for t in get_tweets(tweets_path)]
    vectorizer, _ = count_vectorize(tweets, min_df=0.01)

    # Lists to hold inputs and outputs
    X: List[np.ndarray] = []
    y: List[int] = []

    # Fetch labeled tweets
    label_counter: Counter = Counter()
    labeled_tweets: Iterable[Dict[str, Any]] = chain(*(read_as_json_gz(path) for path in labeled_data_paths))

    for tweet_dict in labeled_tweets:
        cleaned_tweet: CleanedTweet = clean_tweet(tweet_dict['tweet'])
        input_features = vectorizer.transform([cleaned_tweet.text]).toarray()[0]
        label = int(tweet_dict['label'])

        label_counter[label] += 1

        X.append(input_features)
        y.append(label)

    print('Count distribution: 0 -> {0}, 1 -> {1}, 2 -> {2}'.format(label_counter[0], label_counter[1], label_counter[2]))

    return np.array(X), np.array(y), vectorizer
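
# Illustrative only: a minimal sketch of how get_dataset might feed a downstream
# classifier. The file paths and the choice of LogisticRegression are assumptions
# for illustration, not part of the original pipeline.
def _example_train_classifier():
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    # Hypothetical paths; replace with the real tweet dump and labeled files
    X, y, vectorizer = get_dataset('tweets.json.gz', ['labels_batch_0.json.gz'])

    # Hold out a test split and fit a simple linear classifier on the count features
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    print('Test accuracy: {0:.3f}'.format(model.score(X_test, y_test)))
    return model, vectorizer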
def analyze_text(tweets_file: str, output_file: str):
    # Load data and extract classes
    tweets = read_as_json_gz(tweets_file)
    anti_tweets = [clean_tweet(t['tweet'], should_remove_stopwords=False).text for t in tweets if t['label'] == 1]
    pro_tweets = [clean_tweet(t['tweet'], should_remove_stopwords=False).text for t in tweets if t['label'] == 0]

    anti_sentiments = analyze_sentiments(anti_tweets)
    pro_sentiments = analyze_sentiments(pro_tweets)

    # Compute the average, std dev, and median of each metric, as well as pairwise statistical tests
    anti_avg_results: Dict[str, Dict[str, float]] = dict()
    pro_avg_results: Dict[str, Dict[str, float]] = dict()
    stat_tests: Dict[str, Dict[str, float]] = dict()

    for key in anti_sentiments.keys():
        anti_avg_results[key] = dict(avg=np.average(anti_sentiments[key]),
                                     std=np.std(anti_sentiments[key]),
                                     median=np.median(anti_sentiments[key]))
        pro_avg_results[key] = dict(avg=np.average(pro_sentiments[key]),
                                    std=np.std(pro_sentiments[key]),
                                    median=np.median(pro_sentiments[key]))

        t_stat, p_value = stats.ttest_ind(anti_sentiments[key], pro_sentiments[key], equal_var=False)
        stat_tests[key] = dict(t_stat=t_stat, p_value=p_value)

    results = {
        'anti': anti_avg_results,
        'pro': pro_avg_results,
        't_tests': stat_tests
    }

    write_as_json_gz(results, output_file)
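
# The analyze_sentiments helper is defined elsewhere. As a rough sketch of the
# interface analyze_text relies on (a dict mapping each sentiment metric to a
# list of per-tweet scores), here is an assumed version using NLTK's VADER
# purely as an illustrative stand-in for whatever scorer the real helper uses.
def _example_analyze_sentiments(texts: List[str]) -> Dict[str, List[float]]:
    from collections import defaultdict
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    scores: Dict[str, List[float]] = defaultdict(list)

    # polarity_scores returns 'neg', 'neu', 'pos', and 'compound' components per text
    for text in texts:
        for metric, value in analyzer.polarity_scores(text).items():
            scores[metric].append(value)

    return dict(scores)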
def compare_engagement(label_path: str, engagement_dict: Dict[str, Engagement], output_file: str):
    tweets = read_as_json_gz(label_path)

    anti_urls = [t['url'] for t in tweets if t['label'] == 1]
    pro_urls = [t['url'] for t in tweets if t['label'] == 0]

    anti_engagement = [engagement_dict[url] for url in anti_urls]
    pro_engagement = [engagement_dict[url] for url in pro_urls]

    results: Dict[str, Dict[str, float]] = dict()
    for field in Engagement._fields:
        anti_values = [e._asdict()[field] for e in anti_engagement]
        pro_values = [e._asdict()[field] for e in pro_engagement]

        anti_results = {
            'average': float(np.average(anti_values)),
            'median': float(np.median(anti_values)),
            'std': float(np.std(anti_values)),
            'max': float(np.max(anti_values)),
            'min': float(np.min(anti_values))
        }

        pro_results = {
            'average': float(np.average(pro_values)),
            'median': float(np.median(pro_values)),
            'std': float(np.std(pro_values)),
            'max': float(np.max(pro_values)),
            'min': float(np.min(pro_values))
        }

        stat, p_value = stats.ttest_ind(anti_values, pro_values, equal_var=False)

        results[field] = dict(anti=anti_results, pro=pro_results, test=dict(stat=stat, p_value=p_value))

    write_as_json_gz(results, output_file)

    # Print out the results in a LaTeX-like format for convenience
    labels = ['min', 'median', 'max', 'average', 'std']
    for field, result_dicts in results.items():
        print(field)

        anti_results = result_dicts['anti']
        row = ['{0:.3f}'.format(anti_results[label]) for label in labels]
        print('Anti: ')
        print(' & '.join(row))

        pro_results = result_dicts['pro']
        row = ['{0:.3f}'.format(pro_results[label]) for label in labels]
        print('Pro: ')
        print(' & '.join(row))

        print('==========')
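
# compare_engagement only relies on Engagement behaving like a namedtuple
# (it uses _fields and _asdict()). The real fields are defined elsewhere;
# the ones below are hypothetical placeholders showing the assumed shape.
from collections import namedtuple
ExampleEngagement = namedtuple('ExampleEngagement', ['likes', 'retweets', 'replies'])  # hypothetical fields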
def get_topics(data_file: str, n_components: int, n_words: int, trials: int):
    # Load data and fit vectorizer
    tweets = read_as_json_gz(data_file)
    cleaned_tweets = [clean_tweet(t['tweet'], should_remove_stopwords=True).text for t in tweets]
    vectorizer, features = count_vectorize(cleaned_tweets)
    vocab = vectorizer.get_feature_names()

    # Separate tweets by class
    anti_tweets = [vec.toarray().reshape(-1) for vec, tweet in zip(features, tweets) if tweet['label'] == 1]
    pro_tweets = [vec.toarray().reshape(-1) for vec, tweet in zip(features, tweets) if tweet['label'] == 0]

    # Fit a topic model for each class
    print('===== Anti 5G - COVID Topics =====')
    print('Number of Tweets: {0}'.format(len(anti_tweets)))
    fit_topic_model(anti_tweets, n_components, n_words, vocab=vocab, trials=trials)
    print()

    print('===== Pro 5G - COVID Topics =====')
    print('Number of Tweets: {0}'.format(len(pro_tweets)))
    fit_topic_model(pro_tweets, n_components, n_words, vocab=vocab, trials=trials)
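
# The fit_topic_model helper is defined elsewhere. As a rough sketch of the
# interface used above (dense count vectors in, top words per topic printed),
# here is an assumed version built on sklearn's LatentDirichletAllocation;
# treating 'trials' as random restarts scored by log-likelihood is a guess.
def _example_fit_topic_model(vectors: List[np.ndarray], n_components: int, n_words: int,
                             vocab: List[str], trials: int):
    from sklearn.decomposition import LatentDirichletAllocation

    X = np.vstack(vectors)

    # Keep the restart with the highest log-likelihood on the training data
    best_model, best_score = None, -np.inf
    for seed in range(trials):
        lda = LatentDirichletAllocation(n_components=n_components, random_state=seed)
        lda.fit(X)
        score = lda.score(X)
        if score > best_score:
            best_model, best_score = lda, score

    # Print the top words for each topic
    for topic_idx, topic in enumerate(best_model.components_):
        top_words = [vocab[i] for i in topic.argsort()[::-1][:n_words]]
        print('Topic {0}: {1}'.format(topic_idx, ', '.join(top_words)))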
def find_top_ngrams(data_file: str, n: int, top: int):
    # Load data
    tweets = read_as_json_gz(data_file)

    # Separate tweets by class (label 1 is anti, label 0 is pro)
    anti_tweets = list(set(clean_tweet(t['tweet'], should_remove_stopwords=True).text
                           for t in tweets if t['label'] == 1))
    pro_tweets = list(set(clean_tweet(t['tweet'], should_remove_stopwords=True).text
                          for t in tweets if t['label'] == 0))

    # Find the top n-grams for each class
    print('===== Anti 5G - COVID N-grams =====')
    find_ngrams(anti_tweets, n=n, top=top)
    print()

    print('===== Pro 5G - COVID N-grams =====')
    find_ngrams(pro_tweets, n=n, top=top)
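
# The find_ngrams helper is defined elsewhere. A minimal sketch of the behaviour
# assumed above: count word n-grams across the cleaned tweets and print the 'top'
# most frequent ones. The whitespace tokenization is an assumption.
def _example_find_ngrams(texts: List[str], n: int, top: int):
    ngram_counter: Counter = Counter()

    for text in texts:
        tokens = text.split()
        # Slide a window of length n over the token sequence
        for i in range(len(tokens) - n + 1):
            ngram_counter[tuple(tokens[i:i + n])] += 1

    for ngram, count in ngram_counter.most_common(top):
        print('{0}: {1}'.format(' '.join(ngram), count))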