Example #1
# Shared imports for these examples. Helpers such as get_tweets, clean_tweet,
# count_vectorize, read_as_json_gz, write_as_json_gz, analyze_sentiments,
# fit_topic_model, find_ngrams, and the CleanedTweet and Engagement types are
# project-specific utilities defined elsewhere in the source module.
from collections import Counter
from itertools import chain
from typing import Any, Dict, Iterable, List, Tuple

import numpy as np
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer


def get_dataset(
    tweets_path: str, labeled_data_paths: List[str]
) -> Tuple[np.ndarray, np.ndarray, CountVectorizer]:
    # Create the tweet vectorizer using the full dataset
    tweets = [t.cleaned for t in get_tweets(tweets_path)]
    vectorizer, _ = count_vectorize(tweets, min_df=0.01)

    # Lists to hold inputs and outputs
    X: List[np.ndarray] = []
    y: List[int] = []

    # Fetch labeled tweets
    label_counter: Counter = Counter()
    labeled_tweets: Iterable[Dict[str, Any]] = chain(
        *(read_as_json_gz(path) for path in labeled_data_paths))

    for tweet_dict in labeled_tweets:
        cleaned_tweet: CleanedTweet = clean_tweet(tweet_dict['tweet'])
        input_features = vectorizer.transform([cleaned_tweet.text]).toarray()[0]

        label = int(tweet_dict['label'])
        label_counter[label] += 1

        X.append(input_features)
        y.append(label)

    print('Count distribution: 0 -> {0}, 1 -> {1}, 2 -> {2}'.format(
        label_counter[0], label_counter[1], label_counter[2]))

    return np.array(X), np.array(y), vectorizer
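count_vectorize is one of the helpers not shown here. Judging from its call sites (it returns a fitted vectorizer together with a feature matrix), a minimal sketch built on scikit-learn's CountVectorizer might look like the following; the exact signature is an assumption:

from typing import List, Tuple

from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer

def count_vectorize(texts: List[str], min_df=1) -> Tuple[CountVectorizer, csr_matrix]:
    # Hypothetical sketch: fit a bag-of-words vectorizer on the corpus and
    # return it together with the sparse document-term matrix.
    vectorizer = CountVectorizer(min_df=min_df)
    features = vectorizer.fit_transform(texts)
    return vectorizer, features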
Example #2
def analyze_text(tweets_file: str, output_file: str):
    # Load data and extract classes
    tweets = read_as_json_gz(tweets_file)

    anti_tweets = [clean_tweet(t['tweet'], should_remove_stopwords=False).text for t in tweets if t['label'] == 1]
    pro_tweets = [clean_tweet(t['tweet'], should_remove_stopwords=False).text for t in tweets if t['label'] == 0]

    anti_sentiments = analyze_sentiments(anti_tweets)
    pro_sentiments = analyze_sentiments(pro_tweets)

    # Compute the average, std dev, and median of each metric, plus a Welch's t-test between classes
    anti_avg_results: Dict[str, Dict[str, float]] = dict()
    pro_avg_results: Dict[str, Dict[str, float]] = dict()
    stat_tests: Dict[str, Dict[str, float]] = dict()

    for key in anti_sentiments:
        anti_avg_results[key] = dict(avg=np.average(anti_sentiments[key]),
                                     std=np.std(anti_sentiments[key]),
                                     median=np.median(anti_sentiments[key]))
        pro_avg_results[key] = dict(avg=np.average(pro_sentiments[key]),
                                    std=np.std(pro_sentiments[key]),
                                    median=np.median(pro_sentiments[key]))

        t_stat, p_value = stats.ttest_ind(anti_sentiments[key], pro_sentiments[key], equal_var=False)
        stat_tests[key] = dict(t_stat=t_stat, p_value=p_value)

    results = {
        'anti': anti_avg_results,
        'pro': pro_avg_results,
        't_tests': stat_tests
    }
    write_as_json_gz(results, output_file)
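analyze_sentiments is another helper that is not shown. Given that analyze_text treats its result as a mapping from metric names to lists of per-tweet scores, one plausible implementation, sketched here with NLTK's VADER analyzer purely as an assumption, is:

from collections import defaultdict
from typing import Dict, List

# Requires a one-time nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def analyze_sentiments(texts: List[str]) -> Dict[str, List[float]]:
    # Collect per-tweet VADER scores, keyed by metric
    # ('neg', 'neu', 'pos', 'compound').
    analyzer = SentimentIntensityAnalyzer()
    results: Dict[str, List[float]] = defaultdict(list)
    for text in texts:
        for metric, score in analyzer.polarity_scores(text).items():
            results[metric].append(score)
    return results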
Example #3
def compare_engagement(label_path: str, engagement_dict: Dict[str, Engagement],
                       output_file: str):
    tweets = read_as_json_gz(label_path)

    anti_urls = [t['url'] for t in tweets if t['label'] == 1]
    pro_urls = [t['url'] for t in tweets if t['label'] == 0]

    anti_engagement = [engagement_dict[url] for url in anti_urls]
    pro_engagement = [engagement_dict[url] for url in pro_urls]

    results: Dict[str, Dict[str, float]] = dict()
    for field in Engagement._fields:
        anti_values = [e._asdict()[field] for e in anti_engagement]
        pro_values = [e._asdict()[field] for e in pro_engagement]

        anti_results = {
            'average': float(np.average(anti_values)),
            'median': float(np.median(anti_values)),
            'std': float(np.std(anti_values)),
            'max': float(np.max(anti_values)),
            'min': float(np.min(anti_values))
        }

        pro_results = {
            'average': float(np.average(pro_values)),
            'median': float(np.median(pro_values)),
            'std': float(np.std(pro_values)),
            'max': float(np.max(pro_values)),
            'min': float(np.min(pro_values))
        }

        stat, p_value = stats.ttest_ind(anti_values,
                                        pro_values,
                                        equal_var=False)
        results[field] = dict(anti=anti_results,
                              pro=pro_results,
                              test=dict(stat=stat, p_value=p_value))

    write_as_json_gz(results, output_file)

    # Print the results as LaTeX-style table rows for convenience
    labels = ['min', 'median', 'max', 'average', 'std']
    for field, result_dicts in results.items():
        print(field)

        anti_results = result_dicts['anti']
        row = ['{0:.3f}'.format(anti_results[label]) for label in labels]
        print('Anti: ')
        print(' & '.join(row))

        pro_results = result_dicts['pro']
        row = ['{0:.3f}'.format(pro_results[label]) for label in labels]
        print('Pro: ')
        print(' & '.join(row))

        print('==========')
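compare_engagement assumes Engagement is a NamedTuple, since it reads Engagement._fields and calls _asdict() on each value. A hypothetical sketch of the type, with field names invented for illustration:

from typing import NamedTuple

class Engagement(NamedTuple):
    # Field names are assumptions; the real type defines whichever
    # engagement metrics are compared above.
    likes: int
    retweets: int
    replies: int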
Example #4
def get_topics(data_file: str, n_components: int, n_words: int, trials: int):
    # Load data and fit vectorizer
    tweets = read_as_json_gz(data_file)
    cleaned_tweets = [clean_tweet(t['tweet'], should_remove_stopwords=True).text for t in tweets]
    vectorizer, features = count_vectorize(cleaned_tweets)
    vocab = vectorizer.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0

    # Separate tweets by class
    anti_tweets = [vec.toarray().reshape(-1) for vec, tweet in zip(features, tweets) if tweet['label'] == 1]
    pro_tweets = [vec.toarray().reshape(-1) for vec, tweet in zip(features, tweets) if tweet['label'] == 0]

    # Fit a topic model for each class
    print('===== Anti 5G - COVID Topics =====')
    print('Number of Tweets: {0}'.format(len(anti_tweets)))
    fit_topic_model(anti_tweets, n_components, n_words, vocab=vocab, trials=trials)

    print()
    print('===== Pro 5G - COVID Topics =====')
    print('Number of Tweets: {0}'.format(len(pro_tweets)))
    fit_topic_model(pro_tweets, n_components, n_words, vocab=vocab, trials=trials)
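fit_topic_model is not shown either. A minimal sketch under the assumption that it fits one scikit-learn LDA model per trial and prints the top words of each topic:

from typing import List

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

def fit_topic_model(features: List[np.ndarray], n_components: int,
                    n_words: int, vocab, trials: int):
    # Assumed sketch: fit LDA on the count features once per trial and
    # print the n_words highest-weight terms of every topic.
    X = np.array(features)
    for trial in range(trials):
        lda = LatentDirichletAllocation(n_components=n_components,
                                        random_state=trial)
        lda.fit(X)
        print('--- Trial {0} ---'.format(trial))
        for topic_idx, component in enumerate(lda.components_):
            top_indices = np.argsort(component)[::-1][:n_words]
            print('Topic {0}: {1}'.format(
                topic_idx, ', '.join(vocab[i] for i in top_indices)))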
Example #5
def find_top_ngrams(data_file: str, n: int, top: int):
    # Load data
    tweets = read_as_json_gz(data_file)

    # Separate tweets by class (label 1 = anti, 0 = pro, as in the other examples)
    anti_tweets = list(
        set(
            clean_tweet(t['tweet'], should_remove_stopwords=True).text
            for t in tweets if t['label'] == 1))
    pro_tweets = list(
        set(
            clean_tweet(t['tweet'], should_remove_stopwords=True).text
            for t in tweets if t['label'] == 0))

    # Find the top n-grams for each class
    print('===== Anti 5G - COVID N-Grams =====')
    find_ngrams(anti_tweets, n=n, top=top)

    print()
    print('===== Pro 5G - COVID N-Grams =====')
    find_ngrams(pro_tweets, n=n, top=top)
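find_ngrams is likewise a project helper. A minimal sketch, assuming it counts whitespace-delimited n-grams and prints the most frequent ones:

from collections import Counter
from typing import List

def find_ngrams(texts: List[str], n: int, top: int):
    # Assumed sketch: count every n-token window and print the `top`
    # most common n-grams with their counts.
    counter: Counter = Counter()
    for text in texts:
        tokens = text.split()
        counter.update(tuple(tokens[i:i + n])
                       for i in range(len(tokens) - n + 1))
    for ngram, count in counter.most_common(top):
        print('{0}: {1}'.format(' '.join(ngram), count))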