Code Example #1
def get_corpus(user_data):
    corpus = dict()
    corpus['user_handle'] = [user_data[0]['user_handle']]
    user_handle = user_data[0]['user_handle']
    corpus['tweet'] = []
    corpus['hashtags'] = []
    corpus['related_content'] = []
    tweet = clean_tweet(user_data[0]['text'])
    hashtags = get_hashtag_tokenize(user_data[0]['hashtags'])
    related_content = get_related_content(user_data[0])
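    # user_data is assumed to be ordered by user_handle; when the handle changes, the
    # accumulated tweet / hashtag / related-content strings are flushed into the corpus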
    for user in tqdm.tqdm(user_data[1:]):
        if user['user_handle'] != user_handle:
            user_handle = user['user_handle']
            corpus['user_handle'].append(user['user_handle'])
            hashtags = ' '.join(word for word in hashtags.split())
            related_content = ' '.join(word
                                       for word in related_content.split())
            corpus['tweet'].append(tweet)
            corpus['hashtags'].append(hashtags)
            corpus['related_content'].append(related_content)
            tweet = clean_tweet(user['text'])
            hashtags = get_hashtag_tokenize(user['hashtags'])
            related_content = get_related_content(user)
        else:
            tweet = tweet + ' ' + clean_tweet(user['text'])
            hashtags = hashtags + ' ' + get_hashtag_tokenize(user['hashtags'])
            related_content = related_content + ' ' + get_related_content(user)

    hashtags = ' '.join(word for word in hashtags.split())
    related_content = ' '.join(word for word in related_content.split())
    corpus['tweet'].append(tweet)
    corpus['hashtags'].append(hashtags)
    corpus['related_content'].append(related_content)
    return corpus
Code Example #2
def get_user_descriptions(length=500000):
    """build corpus: list of tweets from twitter account


    :param length: To avoid memory explosion
    :return:
    """
    sql = MySqlUtils()
    users = sql.get_data(user_query)
    users_list = [user['user_handle'] for user in users[:length]]
    print('Query count {}'.format(len(users)))
    query = 'SELECT description, user_handle FROM user where user_handle IN (' + ','.join(
        ("'{}'".format(user) for user in users_list)) + ')'

    descriptions = sql.get_data(query)
    corpus = dict()
    print("Descriptions", len(descriptions))

    for row in descriptions:
        text = row['description']
        user_handle = row['user_handle']
        if text:  # skip empty or NULL descriptions
            tokens = clean_tweet(text, stem=False, lemmatize=False, as_string=False)
            corpus[user_handle] = ' '.join(tokens)
    return corpus
Code Example #3
def run_lf_lda():
    base_lda_model = joblib.load(
        '/Users/shashankwadhwa/Desktop/Work/stealth/analytics/src/NLP/junk/testing_on_brand24/data/ldamodel.pkl'
    )
    eta = base_lda_model.get_topics()
    eta_exp = np.exp(eta)
    eta_softmax = eta_exp / eta_exp.sum(axis=1)[:, None]
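    # the softmax-normalized topic-word matrix of the base model is passed below as the
    # eta prior, so the new model starts from the base model's topic-word distributions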

    tweets = get_b24_tweets()
    doc_clean = [clean_tweet(tweet, as_string=False) for tweet in tweets]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

    ldamodel = Lda(doc_term_matrix,
                   num_topics=10,
                   id2word=dictionary,
                   passes=10,
                   eta=eta_softmax)
    joblib.dump(
        ldamodel,
        '/Users/shashankwadhwa/Desktop/Work/stealth/analytics/src/NLP/junk/testing_on_brand24/data/lf_ldamodel.pkl'
    )
    topics = ldamodel.print_topics(num_topics=10, num_words=20)

    for topic in topics:
        print(topic)
        print('*' * 80)
Code Example #4
def get_accuracy_preprocessing():
    b24_tweets_labeled = joblib.load('src/NLP/junk/testing_on_brand24/data/b24_tweets_labeled.pkl')
    topic_words_count = {}
    for topic, tweets in b24_tweets_labeled.items():
        words = clean_tweet(' '.join(tweets)).split()
        topic_words_count[topic] = dict(Counter(words))

    joblib.dump(topic_words_count, 'src/NLP/junk/testing_on_brand24/data/clustop_topic_words_count.pkl')
Code Example #5
def train_fasttext():
    tweets = get_data(length=1500)
    with open('fasttext_train.txt', 'w') as f:
        for tweet in tweets:
            f.write(clean_tweet(tweet['text']))
            f.write('\n')
Code Example #6
def cluster_tweets():
    word_vectors = joblib.load('w2v_word_vectors.pkl')

    tweets = get_data(length=2000)
    # vectorizer = TfidfVectorizer(strip_accents='ascii', stop_words=stopwords, max_df=0.02, preprocessor=clean_tweet)
    # vectorizer.fit([t['text'] for t in tweets])
    # features = vectorizer.get_feature_names()

    clean_tweets = list(filter(None, [clean_tweet(t['text']) for t in tweets]))
    print(len(clean_tweets))
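    # represent each tweet as the mean of its in-vocabulary word vectors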
    tweet_vectors = {}
    for tweet in clean_tweets:
        tweet_word_vectors = []
        # tf_idf_vector = vectorizer.transform([tweet]).todense().tolist()[0]
        tweet_words = tweet.split()
        for word in tweet_words:
            if word in word_vectors:
                word_vector = word_vectors[word]
                # try:
                #     tf_idf_score = tf_idf_vector[features.index(word)]
                # except ValueError:
                #     tf_idf_score = 0
                # word_vector *= tf_idf_score
                tweet_word_vectors.append(list(word_vector))

        if len(tweet_word_vectors) > 0:
            tweet_vector = np.array(tweet_word_vectors).mean(axis=0)
            tweet_vectors[tweet] = tweet_vector

    tweet_vectors_values = list(tweet_vectors.values())
    final_tweets = list(tweet_vectors.keys())

    n_clusters = 30
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100)
    k_means.fit(tweet_vectors_values)
    labels = list(k_means.labels_)

    for i in range(n_clusters):
        print('Cluster %s' % i)
        # column i of k_means.transform() is each tweet's distance to cluster center i
        distance_from_center = k_means.transform(tweet_vectors_values)[:, i]
        nearest_points = np.argsort(distance_from_center)[:100]
        nearest_tweets = [final_tweets[point] for point in nearest_points]

        vectorizer = TfidfVectorizer(strip_accents='ascii',
                                     stop_words=stopwords,
                                     min_df=3,
                                     preprocessor=clean_tweet)
        vector_matrix = vectorizer.fit_transform(nearest_tweets)
        tf_idf_words = np.array(vectorizer.get_feature_names())
        # rank words by their total tf-idf weight across the cluster's nearest tweets
        tf_idf_scores = np.asarray(vector_matrix.sum(axis=0)).ravel()
        tf_idf_sorting = np.argsort(tf_idf_scores)[::-1]
        top_words = tf_idf_words[tf_idf_sorting][:20]

        print(labels.count(i))
        print(nearest_tweets)
        print(top_words)
        print('\n')
Code Example #7
def get_accuracy_preprocessing():
    reviews_dict = joblib.load('src/NLP/junk/test_lsi/data/reviews_dict.pkl')
    topic_words_count = {}
    for topic, reviews in reviews_dict.items():
        words = clean_tweet(' '.join(reviews)).split()
        topic_words_count[topic] = dict(Counter(words))

    joblib.dump(topic_words_count,
                'src/NLP/junk/test_lsi/data/topic_words_count.pkl')
Code Example #8
def cluster_tweets():
    word_vectors = joblib.load(
        'src/NLP/junk/testing_on_brand24/data/word_vectors.pkl')
    all_tweets = get_b24_tweets_sample()
    tweets_categories = get_tweets_categories()

    tweet_vectors = {}
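    # each tweet is represented by the mean of its in-vocabulary word vectors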
    for t in all_tweets:
        tweet_category = tweets_categories[t]
        tweet_word_vectors = []
        tweet_words = clean_tweet(clean_b24_tweet(t, tweet_category)).split()
        for word in tweet_words:
            if word in word_vectors:
                word_vector = word_vectors[word]
                tweet_word_vectors.append(list(word_vector))

        if len(tweet_word_vectors) > 0:
            tweet_vector = np.array(tweet_word_vectors).mean(axis=0)
            tweet_vectors[t] = tweet_vector

    joblib.dump(tweet_vectors,
                'src/NLP/junk/testing_on_brand24/data/tweet_vectors.pkl')

    # tweet_vectors = joblib.load('src/NLP/junk/testing_on_brand24/data/tweet_vectors.pkl')
    tweet_vectors_values = list(tweet_vectors.values())
    # b24_tweets_labeled = joblib.load('src/NLP/junk/testing_on_brand24/data/b24_tweets_labeled.pkl')
    # centroids = []
    # for category, tweets in b24_tweets_labeled.items():
    #     category_tweet_vectors = []
    #     for t in tweets:
    #         try:
    #             category_tweet_vectors.append(list(tweet_vectors[t]))
    #         except Exception as e:
    #             pass

    #     category_centroid = np.array(category_tweet_vectors).mean(axis=0)
    #     centroids.append(category_centroid)

    # centroids_array = np.array(centroids)
    # pca = PCA(n_components=2).fit(tweet_vectors_values)
    # pca_2d = pca.transform(tweet_vectors_values)
    # joblib.dump(pca_2d, 'src/NLP/junk/testing_on_brand24/data/pca_2d.pkl')

    n_clusters = 10
    spectral = SpectralClustering(n_clusters=n_clusters)
    spectral.fit(tweet_vectors_values)
    labels = list(spectral.labels_)

    clusters = {}
    for ctr, label in enumerate(labels):
        if label in clusters:
            clusters[label].append(all_tweets[ctr])
        else:
            clusters[label] = [all_tweets[ctr]]

    joblib.dump(clusters, 'src/NLP/junk/testing_on_brand24/data/clusters.pkl')
Code Example #9
def get_related_content(user):
    if user['related_content']:
        user['related_content'] = clean_tweet(user['related_content'], as_string=False)
        try:
            related_content = lda_model(user['related_content'])
        except Exception as e:
            print(e)
            return ''
    else:
        related_content = ''
    return related_content
Code Example #10
def run_w2v():
    tweets = get_data(length=1500)
    clean_tweets = [
        clean_tweet(tweet['text'], as_string=False) for tweet in tweets
    ]
    #bigram_transformer = gensim.models.Phrases(clean_tweets)
    model = Word2Vec(sentences=clean_tweets,
                     sg=1,
                     size=200,
                     window=5,
                     min_count=10)
    return model
Code Example #11
def run_d2v():
    tweets = get_data(length=2000)
    clean_tweets = list(
        filter(None,
               [clean_tweet(t['text'], as_string=False) for t in tweets]))
    print('Clean tweets:', len(clean_tweets))
    documents = get_documents(clean_tweets)
    model = Doc2Vec(documents=documents,
                    dm=1,
                    size=200,
                    window=5,
                    min_count=10)
    return (model, clean_tweets)
Code Example #12
def get_data(length=500000, offset=0):
    """build corpus: list of tweets from twitter account


    :param length: To avoid memory explosion
    :return:
    """
    sql = MySqlUtils()
    users = sql.get_data(user_query)
    users_list = [user['user_handle'] for user in users[offset:(offset + length)]]
    print('Query count {}'.format(len(users)))
    query = 'SELECT text, user_handle, retweets, retweets_permalink FROM tweet where user_handle IN (' + ','.join(
        ("'{}'".format(user) for user in users_list)) + ')'

    tweets = sql.get_data(query)
    corpus = dict()
    users_retweets_count = dict()
    print("Tweets", len(tweets))
    retweets = []
    user_handle = ''
    all_retweet_count = []

    for tweet in tweets:
        if tweet['user_handle'] not in corpus:
            if user_handle:
                users_retweets_count[user_handle] = np.sum(retweets)
                all_retweet_count.extend(retweets)
            retweets = []
            user_handle = tweet['user_handle']
            corpus[tweet['user_handle']] = tweet['text']
            if tweet['retweets_permalink']:
                retweets.append(0)
            else:
                retweets.append(tweet['retweets'])
        else:
            corpus[tweet['user_handle']] = corpus[tweet['user_handle']] + '. ' + tweet['text']
            if tweet['retweets_permalink']:
                retweets.append(0)
            else:
                retweets.append(tweet['retweets'])
    # for last user_handle
    all_retweet_count.extend(retweets)
    users_retweets_count[user_handle] = np.sum(retweets)
    for user_handle, text in corpus.items():
        # TODO: Too Slow. Speed this up
        tokens = clean_tweet(text, stem=False, lemmatize=False, as_string=False)

        corpus[user_handle] = ' '.join(tokens)
    return corpus, users_retweets_count, np.sum(all_retweet_count)
Code Example #13
def visualize_clusters():
    """
    http://www.dummies.com/programming/big-data/data-science/how-to-visualize-the-clusters-in-a-k-means-unsupervised-learning-model/
    """

    print('1')
    word_vectors = joblib.load('w2v_word_vectors.pkl')
    print('2')

    tweets = get_data(length=2000)
    print('3')

    clean_tweets = list(filter(None, [clean_tweet(t['text']) for t in tweets]))
    print(len(clean_tweets))
    print('4')

    tweet_vectors = {}
    for tweet in clean_tweets:
        tweet_word_vectors = []
        tweet_words = tweet.split()
        for word in tweet_words:
            if word in word_vectors:
                word_vector = word_vectors[word]
                tweet_word_vectors.append(list(word_vector))

        if len(tweet_word_vectors) > 0:
            tweet_vector = np.array(tweet_word_vectors).mean(axis=0)
            tweet_vectors[tweet] = tweet_vector

    print('5')

    tweet_vectors_values = list(tweet_vectors.values())
    final_tweets = list(tweet_vectors.keys())

    print('6')

    pca = PCA(n_components=2).fit(tweet_vectors_values)
    pca_2d = pca.transform(tweet_vectors_values)

    print('7')

    n_clusters = 30
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100)
    k_means.fit(tweet_vectors_values)

    plt.figure('K-means with 30 clusters')
    plt.scatter(pca_2d[:, 0], pca_2d[:, 1], c=k_means.labels_)
    plt.show()
Code Example #14
def run_lda():
    tweets = get_b24_tweets()
    doc_clean = [clean_tweet(tweet, as_string=False) for tweet in tweets]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

    ldamodel = Lda(doc_term_matrix,
                   num_topics=10,
                   id2word=dictionary,
                   passes=50)
    joblib.dump(ldamodel, 'src/NLP/junk/testing_on_brand24/data/ldamodel.pkl')
    topics = ldamodel.print_topics(num_topics=10, num_words=20)

    for topic in topics:
        print(topic)
        print('*' * 80)
Code Example #15
def train_w2v():
    b24_all_tweets = get_b24_tweets()
    db_tweets = [t['text'] for t in get_db_tweets(length=2000)]
    all_tweets = b24_all_tweets + db_tweets
    clean_tweets = [clean_tweet(t, as_string=False) for t in all_tweets]
    model = Word2Vec(sentences=clean_tweets,
                     sg=1,
                     size=200,
                     window=10,
                     min_count=10)
    word_vectors = {}
    for word in model.wv.vocab:
        word_vectors[word] = model[word]

    joblib.dump(model, 'src/NLP/junk/testing_on_brand24/data/model.pkl')
    joblib.dump(word_vectors,
                'src/NLP/junk/testing_on_brand24/data/word_vectors.pkl')
Code Example #16
def cluster_tweets():
    model = joblib.load('src/NLP/junk/testing_on_brand24/data/ldamodel.pkl')
    tweets = get_b24_tweets()
    doc_clean = [clean_tweet(tweet, as_string=False) for tweet in tweets]
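    # NOTE: token ids only line up with the saved model because the dictionary is rebuilt
    # from the same cleaned corpus the model was trained on (see run_lda)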
    dictionary = corpora.Dictionary(doc_clean)
    clusters = {}
    for ctr, d in enumerate(doc_clean):
        bow = dictionary.doc2bow(d)
        topics = model.get_document_topics(bow)
        sorted_topics = sorted(topics, key=lambda x: x[1], reverse=True)
        label = sorted_topics[0][0]
        if label in clusters:
            clusters[label].append(tweets[ctr])
        else:
            clusters[label] = [tweets[ctr]]

    joblib.dump(clusters,
                'src/NLP/junk/testing_on_brand24/data/lda_clusters.pkl')
Code Example #17
def train_d2v():
    all_reviews = get_data()
    clean_reviews = [
        clean_tweet(review, as_string=False) for review in all_reviews
    ]
    documents = get_documents(clean_reviews)
    model = Doc2Vec(documents=documents,
                    dm=1,
                    size=200,
                    window=5,
                    min_count=10)
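    # assumes get_documents tags documents 0..N-1 in the same order as all_reviews,
    # so model.docvecs[ctr] corresponds to all_reviews[ctr]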
    review_vectors = {}
    for ctr, r in enumerate(all_reviews):
        review_vectors[r] = model.docvecs[ctr]

    joblib.dump(
        review_vectors,
        'src/NLP/junk/testing_on_snap/data/d2v_outputs/d2v_review_vectors.pkl')
Code Example #18
def get_vector(data):
    tweet_data = get_combined_tweet_data(data)
    tweet_data = [clean_tweet(remove_hash(tweet)) for tweet in tweet_data]
    tokenized_tweets = [tweet.split() for tweet in tweet_data]
    model = Word2Vec(min_count=1)
    model.build_vocab(tokenized_tweets)
    model.train(tokenized_tweets, total_examples=model.corpus_count, epochs=model.iter)
    final_matrix = []
    count = []

    for index, tokens in tqdm.tqdm(enumerate(tokenized_tweets)):
        if tokens:
            # average the word vectors of the tweet's tokens
            matrix = np.array([model[token] for token in tokens if token])
            final_matrix.append(np.mean(matrix, axis=0))
        else:
            count.append(index)
    return np.array(final_matrix), count, tweet_data
Code Example #19
def train_w2v():
    all_reviews = get_data()
    clean_reviews = [
        clean_tweet(review, as_string=False) for review in all_reviews
    ]
    model = Word2Vec(sentences=clean_reviews,
                     sg=1,
                     size=200,
                     window=5,
                     min_count=10)
    word_vectors = {}
    for word in model.wv.vocab:
        word_vectors[word] = model[word]

    joblib.dump(
        word_vectors,
        'src/NLP/junk/testing_on_snap/data/w2v_outputs/w2v_word_vectors_only_nouns.pkl'
    )

    return model
Code Example #20
def run_lda_on_all_tweets():
    tweets = get_data(length=1500)
    doc_clean = [
        clean_tweet(tweet['text'],
                    stem=False,
                    lemmatize=False,
                    as_string=False) for tweet in tweets
    ]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

    ldamodel = Lda(doc_term_matrix,
                   num_topics=15,
                   id2word=dictionary,
                   passes=50)
    topics = ldamodel.print_topics(num_topics=15, num_words=20)

    print(topics)
    for topic in topics:
        print(topic)
        print('*' * 80)
Code Example #21
def cluster_reviews():
    word_vectors = joblib.load(
        'src/NLP/junk/testing_on_snap/data/w2v_outputs/w2v_word_vectors_only_nouns.pkl'
    )

    all_reviews = get_data()

    review_vectors = {}
    for review in all_reviews:
        review_word_vectors = []
        review_words = clean_tweet(review).split()
        for word in review_words:
            if word in word_vectors:
                word_vector = word_vectors[word]
                review_word_vectors.append(list(word_vector))

        if len(review_word_vectors) > 0:
            review_vector = np.array(review_word_vectors).mean(axis=0)
            review_vectors[review] = review_vector

    review_vectors_values = list(review_vectors.values())

    n_clusters = 16
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100)
    k_means.fit(review_vectors_values)
    labels = list(k_means.labels_)

    clusters = {}
    for ctr, label in enumerate(labels):
        if label in clusters:
            clusters[label].append(all_reviews[ctr])
        else:
            clusters[label] = [all_reviews[ctr]]

    joblib.dump(
        clusters,
        'src/NLP/junk/testing_on_snap/data/w2v_outputs/w2v_clusters_only_nouns.pkl'
    )
Code Example #22
def run_lda_on_all_tweets():
    tweets = get_data(length=200, offset=500)
    doc_clean = [
        clean_tweet(tweet['text'],
                    stem=False,
                    lemmatize=False,
                    as_string=False) for tweet in tweets
    ]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
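    # here Lda is assumed to be gensim's LdaMallet wrapper, which takes the path to the
    # mallet binary as its first positional argument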

    ldamodel = Lda(
        '/Users/shashankwadhwa/Desktop/Work/stealth/mallet-2.0.8/bin/mallet',
        doc_term_matrix,
        num_topics=5,
        id2word=dictionary,
        iterations=50)
    topics = ldamodel.print_topics(num_topics=5, num_words=10)

    print(topics)
    for topic in topics:
        print(topic)
        print('*' * 80)