Example #1
def label_data(tweets_file: str, output_file: str, num_samples: int,
               start_index: int):
    """Interactively label tweets and write the samples as gzipped JSON."""

    dataset: List[Dict[str, Any]] = []

    for tweet_id, tweet in enumerate(get_tweets(tweets_file)):
        if tweet_id < start_index:
            continue

        if len(dataset) >= num_samples:
            break

        print('Tweet {0}: {1}'.format(tweet_id, tweet.original))

        # Re-prompt until the user enters a valid integer
        while True:
            try:
                label = int(input('Enter a label: ').strip())
                break
            except ValueError:
                pass

        # Keep only labels 0, 1, and 2; any other integer skips the tweet
        if label in (0, 1, 2):
            dataset.append(
                dict(tweet=tweet.original, label=label, url=tweet.url))
            print('Dataset Size: {0}'.format(len(dataset)))

        print('==========')

    write_as_json_gz(dataset, output_file)
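These examples rely on a few shared helpers that are not shown on this page. Below is a minimal sketch of what they might look like, assuming get_tweets yields a record with original, cleaned, and url fields and that the JSON helpers wrap gzip; the field names and exact serialization are guesses.

import gzip
import json
from collections import namedtuple
from typing import Any, List

# Hypothetical record type; the real project may use a class instead.
Tweet = namedtuple('Tweet', ['original', 'cleaned', 'url'])

def write_as_json_gz(data: List[Any], path: str):
    # Serialize to JSON and gzip-compress in one pass.
    with gzip.open(path, 'wt', encoding='utf-8') as f:
        json.dump(data, f)

def read_as_json_gz(path: str) -> Any:
    # Inverse of write_as_json_gz.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        return json.load(f)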
Example #2
def analyze_sentiments(input_file: str):
    """Report sentiment statistics and cluster tweets by sentiment."""

    sample_dict: Dict[int, SentimentTuple] = dict()
    polarities: List[float] = []
    subjectivities: List[float] = []

    for index, tweet in enumerate(get_tweets(input_file)):
        # get_tweets yields tweet records, while TextBlob expects a plain
        # string, so analyze the cleaned text of each tweet.
        text = TextBlob(tweet.cleaned)
        sentiment = text.sentiment

        sample_dict[index] = SentimentTuple(
            text=tweet.cleaned,
            polarity=sentiment.polarity,
            subjectivity=sentiment.subjectivity)
        polarities.append(sentiment.polarity)
        subjectivities.append(sentiment.subjectivity)

    print('==== Sentiment Statistics ====')
    print('Average polarity: {0}'.format(np.average(polarities)))
    print('Std polarity: {0}'.format(np.std(polarities)))
    print('Average subjectivity: {0}'.format(np.average(subjectivities)))
    print('Std subjectivity: {0}'.format(np.std(subjectivities)))
    print('==============================')

    print('==== Clustering Based On Polarity ====')
    cluster(sample_dict, mode='polarity')
    print('======================================')

    print('==== Clustering Based on Subjectivity ====')
    cluster(sample_dict, mode='subjectivity')
    print('==========================================')
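SentimentTuple and cluster are also project helpers that this page does not define. A plausible sketch, assuming SentimentTuple is a NamedTuple and cluster runs a simple k-means over the chosen sentiment score; the real implementation may differ.

from typing import Dict, NamedTuple

import numpy as np
from sklearn.cluster import KMeans

class SentimentTuple(NamedTuple):
    text: str
    polarity: float
    subjectivity: float

def cluster(sample_dict: Dict[int, SentimentTuple], mode: str, k: int = 3):
    # Cluster the samples on a single sentiment dimension with k-means.
    values = np.array([getattr(s, mode) for s in sample_dict.values()])
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(
        values.reshape(-1, 1))
    for cluster_id in range(k):
        size = int(np.sum(labels == cluster_id))
        print('Cluster {0}: {1} samples'.format(cluster_id, size))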
Example #3
def get_dataset(
    tweets_path: str, labeled_data_paths: List[str]
) -> Tuple[np.ndarray, np.ndarray, CountVectorizer]:
    # Create the tweet vectorizer using the full dataset
    tweets = [t.cleaned for t in get_tweets(tweets_path)]
    vectorizer, _ = count_vectorize(tweets, min_df=0.01)

    # Lists to hold inputs and outputs
    X: List[np.ndarray] = []
    y: List[int] = []

    # Fetch labeled tweets
    label_counter: Counter = Counter()
    labeled_tweets: Iterable[Dict[str, Any]] = chain.from_iterable(
        read_as_json_gz(path) for path in labeled_data_paths)

    for tweet_dict in labeled_tweets:
        cleaned_tweet: CleanedTweet = clean_tweet(tweet_dict['tweet'])
        input_features = vectorizer.transform(
            [cleaned_tweet.text]).toarray()[0]

        label = int(tweet_dict['label'])
        label_counter[label] += 1

        X.append(input_features)
        y.append(label)

    print('Label distribution: 0 -> {0}, 1 -> {1}, 2 -> {2}'.format(
        label_counter[0], label_counter[1], label_counter[2]))

    return np.array(X), np.array(y), vectorizer
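count_vectorize and clean_tweet are again project-specific. A minimal sketch, assuming count_vectorize wraps scikit-learn's CountVectorizer (with the min_df keyword used above) and clean_tweet returns a record holding the normalized text; both names and fields are assumptions.

import re
from collections import namedtuple
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical result record for clean_tweet.
CleanedTweet = namedtuple('CleanedTweet', ['text'])

def clean_tweet(raw: str) -> CleanedTweet:
    # Strip URLs and @mentions, collapse whitespace, lowercase.
    text = re.sub(r'https?://\S+|@\w+', ' ', raw)
    return CleanedTweet(text=' '.join(text.lower().split()))

def count_vectorize(texts, min_df=1):
    # Fit a bag-of-words vectorizer; min_df drops rare terms.
    vectorizer = CountVectorizer(min_df=min_df)
    features = vectorizer.fit_transform(texts)
    return vectorizer, features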
Example #4
def label_dataset(tweets_path: str, model: Any, vectorizer: CountVectorizer,
                  output_file: str):

    labeled_dataset: List[Dict[str, Any]] = []
    for tweet in get_tweets(tweets_path):
        features = vectorizer.transform([tweet.cleaned]).toarray()
        label = model.predict(features)[0]

        labeled_dataset.append(
            dict(tweet=tweet.original, label=int(label), url=tweet.url))

    write_as_json_gz(labeled_dataset, output_file)
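A possible end-to-end use of get_dataset and label_dataset, with a scikit-learn classifier standing in for whatever model the project actually trains; the file paths are placeholders.

from sklearn.linear_model import LogisticRegression

# Hypothetical paths for illustration only.
X, y, vectorizer = get_dataset('tweets.txt', ['labels_0.json.gz'])

model = LogisticRegression(max_iter=1000)
model.fit(X, y)

label_dataset('tweets.txt', model, vectorizer, 'auto_labeled.json.gz')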
Example #5
def topic_model(input_path: str, num_topics: int, num_words: int):
    """Fit an LDA topic model and print the top words per topic."""

    # count_vectorize expects strings, so pass each tweet's cleaned text
    # rather than the tweet records themselves.
    tweets = [t.cleaned for t in get_tweets(input_path)]
    vectorizer, features = count_vectorize(tweets)

    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the current replacement.
    vocab = vectorizer.get_feature_names_out()

    # Fit the topic model
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
    lda.fit(features)

    for index, component in enumerate(lda.components_):
        # Highest-weight vocabulary indices for this topic
        top_indices = np.argsort(component)[::-1][:num_words]

        topic_tokens = [vocab[i] for i in top_indices]

        print('Topic {0}: {1}'.format(index, ' '.join(topic_tokens)))
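An invocation might look like the following; the path and parameters are placeholders.

# Hypothetical call: print the top 10 words for each of 5 topics.
topic_model('tweets.txt', num_topics=5, num_words=10)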
Example #6
"""
CRON JOB 2
"""

from tweet_utils import get_tweets

topic = "Full_Stack"
keyword_list = ["frontend", "backend", "fullstack"]
limit = 15

get_tweets(topic, keyword_list, limit)
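This get_tweets comes from a separate tweet_utils module and takes a different signature than the helper in the earlier examples. A rough sketch of what it might do, assuming it filters an in-memory stream of tweets by keyword and caps the result at limit; the real version presumably talks to the Twitter API.

from typing import Iterable, List

def get_tweets(topic: str, keywords: List[str], limit: int,
               source: Iterable[str] = ()) -> List[str]:
    # Collect up to `limit` tweets that mention any of the keywords.
    matches: List[str] = []
    for text in source:
        lowered = text.lower()
        if any(k.lower() in lowered for k in keywords):
            matches.append(text)
            if len(matches) >= limit:
                break
    print('{0}: collected {1} tweets'.format(topic, len(matches)))
    return matches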