def get_corpus(user_data):
    """Aggregate tweets, hashtags and related content per user handle.

    Assumes ``user_data`` is ordered by ``user_handle`` so that rows belonging
    to the same user are contiguous.
    """
    corpus = dict()
    corpus['user_handle'] = [user_data[0]['user_handle']]
    user_handle = user_data[0]['user_handle']
    corpus['tweet'] = []
    corpus['hashtags'] = []
    corpus['related_content'] = []

    tweet = clean_tweet(user_data[0]['text'])
    hashtags = get_hashtag_tokenize(user_data[0]['hashtags'])
    related_content = get_related_content(user_data[0])

    for user in tqdm.tqdm(user_data[1:]):
        if user['user_handle'] != user_handle:
            # New user: flush the buffers accumulated for the previous user.
            user_handle = user['user_handle']
            corpus['user_handle'].append(user['user_handle'])
            hashtags = ' '.join(hashtags.split())
            related_content = ' '.join(related_content.split())
            corpus['tweet'].append(tweet)
            corpus['hashtags'].append(hashtags)
            corpus['related_content'].append(related_content)
            tweet = clean_tweet(user['text'])
            hashtags = get_hashtag_tokenize(user['hashtags'])
            related_content = get_related_content(user)
        else:
            # Same user: keep appending to the running buffers.
            tweet = tweet + ' ' + clean_tweet(user['text'])
            hashtags = hashtags + ' ' + get_hashtag_tokenize(user['hashtags'])
            related_content = related_content + ' ' + get_related_content(user)

    # Flush the last user.
    hashtags = ' '.join(hashtags.split())
    related_content = ' '.join(related_content.split())
    corpus['tweet'].append(tweet)
    corpus['hashtags'].append(hashtags)
    corpus['related_content'].append(related_content)

    corpus_list.append(corpus)  # assumes a module-level `corpus_list` accumulator
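# Illustrative sketch (an assumption inferred from the field accesses in get_corpus,
# not part of the original code): the expected input is a list of tweet rows sorted
# by user_handle, so that rows for one user are contiguous.
def example_get_corpus_input():
    return [
        {'user_handle': 'alice', 'text': 'first tweet #nlp', 'hashtags': '#nlp', 'related_content': ''},
        {'user_handle': 'alice', 'text': 'second tweet', 'hashtags': '', 'related_content': ''},
        {'user_handle': 'bob', 'text': 'hello world', 'hashtags': '', 'related_content': ''},
    ]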
def get_user_descriptions(length=500000):
    """Build a corpus of user descriptions keyed by user handle.

    :param length: cap on the number of users, to avoid memory explosion
    :return: dict mapping user_handle to its cleaned description
    """
    sql = MySqlUtils()
    users = sql.get_data(user_query)
    users_list = [user['user_handle'] for user in users[:length]]
    print('Query count {}'.format(len(users)))

    query = (
        'SELECT description, user_handle FROM user WHERE user_handle IN (' +
        ','.join("'{}'".format(user) for user in users_list) + ')'
    )
    descriptions = sql.get_data(query)
    corpus = dict()
    print("Descriptions", len(descriptions))
    for row in descriptions:
        text = row['description']
        user_handle = row['user_handle']
        if len(text) > 0:
            tokens = clean_tweet(text, stem=False, lemmatize=False, as_string=False)
            corpus[user_handle] = ' '.join(tokens)
    return corpus
def run_lf_lda():
    # Warm-start LDA: use the topic-word distribution of a previously trained model,
    # softmax-normalised row-wise, as the eta prior for a new model.
    base_lda_model = joblib.load(
        '/Users/shashankwadhwa/Desktop/Work/stealth/analytics/src/NLP/junk/testing_on_brand24/data/ldamodel.pkl'
    )
    eta = base_lda_model.get_topics()
    eta_exp = np.exp(eta)
    eta_softmax = eta_exp / eta_exp.sum(axis=1)[:, None]

    tweets = get_b24_tweets()
    doc_clean = [clean_tweet(tweet, as_string=False) for tweet in tweets]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    ldamodel = Lda(doc_term_matrix, num_topics=10, id2word=dictionary, passes=10, eta=eta_softmax)
    joblib.dump(
        ldamodel,
        '/Users/shashankwadhwa/Desktop/Work/stealth/analytics/src/NLP/junk/testing_on_brand24/data/lf_ldamodel.pkl'
    )

    topics = ldamodel.print_topics(num_topics=10, num_words=20)
    for topic in topics:
        print(topic)
        print('*' * 80)
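# Illustrative sketch (not part of the original pipeline): the eta prior built in
# run_lf_lda is a row-wise softmax of the base model's topic-word matrix, i.e. each
# topic row becomes a probability distribution over the vocabulary. The toy matrix
# below is made up purely for illustration.
def softmax_rows_example():
    toy_eta = np.array([[0.2, 1.5, 0.3],
                        [2.0, 0.1, 0.4]])      # 2 topics x 3 vocabulary terms
    exp = np.exp(toy_eta)
    softmax = exp / exp.sum(axis=1)[:, None]   # same normalisation as in run_lf_lda
    assert np.allclose(softmax.sum(axis=1), 1.0)  # every topic row sums to 1
    return softmax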
def get_accuracy_preprocessing():
    b24_tweets_labeled = joblib.load('src/NLP/junk/testing_on_brand24/data/b24_tweets_labeled.pkl')
    topic_words_count = {}
    for topic, tweets in b24_tweets_labeled.items():
        words = clean_tweet(' '.join(tweets)).split()
        topic_words_count[topic] = dict(Counter(words))
    joblib.dump(topic_words_count, 'src/NLP/junk/testing_on_brand24/data/clustop_topic_words_count.pkl')
def train_fasttext():
    # Write one cleaned tweet per line; this file is the training corpus for fastText.
    tweets = get_data(length=1500)
    with open('fasttext_train.txt', 'w') as f:
        for tweet in tweets:
            f.write(clean_tweet(tweet['text']))
            f.write('\n')
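# Possible follow-up step (an assumption, not present in the original source): train a
# skip-gram fastText model on the file written by train_fasttext(), using the `fasttext`
# Python bindings. The output path is hypothetical.
def train_fasttext_model_example():
    import fasttext
    model = fasttext.train_unsupervised('fasttext_train.txt', model='skipgram')
    model.save_model('fasttext_model.bin')  # hypothetical output path
    return model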
def cluster_tweets():
    word_vectors = joblib.load('w2v_word_vectors.pkl')
    tweets = get_data(length=2000)
    # vectorizer = TfidfVectorizer(strip_accents='ascii', stop_words=stopwords, max_df=0.02, preprocessor=clean_tweet)
    # vectorizer.fit([t['text'] for t in tweets])
    # features = vectorizer.get_feature_names()
    clean_tweets = list(filter(None, [clean_tweet(t['text']) for t in tweets]))
    print(len(clean_tweets))

    # Represent each tweet as the mean of its word vectors.
    tweet_vectors = {}
    for tweet in clean_tweets:
        tweet_word_vectors = []
        # tf_idf_vector = vectorizer.transform([tweet]).todense().tolist()[0]
        tweet_words = tweet.split()
        for word in tweet_words:
            if word in word_vectors:
                word_vector = word_vectors[word]
                # try:
                #     tf_idf_score = tf_idf_vector[features.index(word)]
                # except ValueError:
                #     tf_idf_score = 0
                # word_vector *= tf_idf_score
                tweet_word_vectors.append(list(word_vector))
        if len(tweet_word_vectors) > 0:
            tweet_vector = np.array(tweet_word_vectors).mean(axis=0)
            tweet_vectors[tweet] = tweet_vector

    tweet_vectors_values = list(tweet_vectors.values())
    final_tweets = list(tweet_vectors.keys())

    n_clusters = 30
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100)
    k_means.fit(tweet_vectors_values)
    labels = list(k_means.labels_)

    for i in range(n_clusters):
        print('Cluster %s' % i)
        # Column i holds each point's distance to cluster centre i.
        distance_from_center = k_means.transform(tweet_vectors_values)[:, i]
        nearest_points = np.argsort(distance_from_center)[:100]
        nearest_tweets = [final_tweets[point] for point in nearest_points]

        # Characterise the cluster by the top tf-idf words among its nearest tweets.
        vectorizer = TfidfVectorizer(strip_accents='ascii', stop_words=stopwords,
                                     min_df=3, preprocessor=clean_tweet)
        vector_matrix = vectorizer.fit_transform(nearest_tweets)
        tf_idf_words = np.array(vectorizer.get_feature_names())
        tf_idf_sorting = np.argsort(vector_matrix.toarray()).flatten()[::-1]
        top_words = tf_idf_words[tf_idf_sorting][:20]

        print(labels.count(i))
        print(nearest_tweets)
        print(top_words)
        print('\n')
def get_accuracy_preprocessing():
    reviews_dict = joblib.load('src/NLP/junk/test_lsi/data/reviews_dict.pkl')
    topic_words_count = {}
    for topic, reviews in reviews_dict.items():
        words = clean_tweet(' '.join(reviews)).split()
        topic_words_count[topic] = dict(Counter(words))
    joblib.dump(topic_words_count, 'src/NLP/junk/test_lsi/data/topic_words_count.pkl')
def cluster_tweets():
    word_vectors = joblib.load('src/NLP/junk/testing_on_brand24/data/word_vectors.pkl')
    all_tweets = get_b24_tweets_sample()
    tweets_categories = get_tweets_categories()

    # Represent each tweet as the mean of its word vectors.
    tweet_vectors = {}
    for t in all_tweets:
        tweet_category = tweets_categories[t]
        tweet_word_vectors = []
        tweet_words = clean_tweet(clean_b24_tweet(t, tweet_category)).split()
        for word in tweet_words:
            if word in word_vectors:
                word_vector = word_vectors[word]
                tweet_word_vectors.append(list(word_vector))
        if len(tweet_word_vectors) > 0:
            tweet_vector = np.array(tweet_word_vectors).mean(axis=0)
            tweet_vectors[t] = tweet_vector
    joblib.dump(tweet_vectors, 'src/NLP/junk/testing_on_brand24/data/tweet_vectors.pkl')
    # tweet_vectors = joblib.load('src/NLP/junk/testing_on_brand24/data/tweet_vectors.pkl')

    tweet_vectors_values = list(tweet_vectors.values())
    # Labels align with tweet_vectors (tweets with no in-vocabulary words were skipped),
    # so index its keys rather than all_tweets.
    clustered_tweets = list(tweet_vectors.keys())

    # b24_tweets_labeled = joblib.load('src/NLP/junk/testing_on_brand24/data/b24_tweets_labeled.pkl')
    # centroids = []
    # for category, tweets in b24_tweets_labeled.items():
    #     category_tweet_vectors = []
    #     for t in tweets:
    #         try:
    #             category_tweet_vectors.append(list(tweet_vectors[t]))
    #         except Exception as e:
    #             pass
    #     category_centroid = np.array(category_tweet_vectors).mean(axis=0)
    #     centroids.append(category_centroid)
    # centroids_array = np.array(centroids)

    # pca = PCA(n_components=2).fit(tweet_vectors_values)
    # pca_2d = pca.transform(tweet_vectors_values)
    # joblib.dump(pca_2d, 'src/NLP/junk/testing_on_brand24/data/pca_2d.pkl')

    n_clusters = 10
    spectral = SpectralClustering(n_clusters=n_clusters)
    spectral.fit(tweet_vectors_values)
    labels = list(spectral.labels_)

    clusters = {}
    for ctr, label in enumerate(labels):
        if label in clusters:
            clusters[label].append(clustered_tweets[ctr])
        else:
            clusters[label] = [clustered_tweets[ctr]]
    joblib.dump(clusters, 'src/NLP/junk/testing_on_brand24/data/clusters.pkl')
def get_related_content(user):
    if user['related_content']:
        # Tokenise before handing off to the LDA model (clean_tweet returns a string
        # by default, so request tokens explicitly rather than list() over the string).
        user['related_content'] = clean_tweet(user['related_content'], as_string=False)
        try:
            related_content = lda_model(user['related_content'])
        except Exception as e:
            print(e)
            return ''
    else:
        related_content = ''
    return related_content
def run_w2v():
    tweets = get_data(length=1500)
    clean_tweets = [clean_tweet(tweet['text'], as_string=False) for tweet in tweets]
    # bigram_transformer = gensim.models.Phrases(clean_tweets)
    model = Word2Vec(sentences=clean_tweets, sg=1, size=200, window=5, min_count=10)
    return model
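# Usage sketch (an assumption, not in the original source): inspect the skip-gram model
# returned by run_w2v by querying nearest neighbours of a vocabulary word. 'marketing'
# is an arbitrary example word; the call assumes the older gensim API already used above
# (size=, model.wv.vocab).
def inspect_w2v_example():
    model = run_w2v()
    if 'marketing' in model.wv.vocab:
        print(model.wv.most_similar('marketing', topn=10))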
def run_d2v():
    tweets = get_data(length=2000)
    clean_tweets = list(filter(None, [clean_tweet(t['text'], as_string=False) for t in tweets]))
    print('Clean tweets:', len(clean_tweets))
    documents = get_documents(clean_tweets)
    model = Doc2Vec(documents=documents, dm=1, size=200, window=5, min_count=10)
    return (model, clean_tweets)
def get_data(length=500000, offset=0):
    """Build corpus: one concatenated text per twitter account.

    :param length: cap on the number of users, to avoid memory explosion
    :param offset: offset into the user list
    :return: (corpus dict keyed by user_handle, per-user retweet counts, total retweet count)
    """
    sql = MySqlUtils()
    users = sql.get_data(user_query)
    users_list = [user['user_handle'] for user in users[offset:(offset + length)]]
    print('Query count {}'.format(len(users)))

    query = (
        'SELECT text, user_handle, retweets, retweets_permalink FROM tweet WHERE user_handle IN (' +
        ','.join("'{}'".format(user) for user in users_list) + ')'
    )
    tweets = sql.get_data(query)

    corpus = dict()
    users_retweets_count = dict()
    print("Tweets", len(tweets))

    retweets = []
    user_handle = ''
    all_retweet_count = []
    for tweet in tweets:
        if tweet['user_handle'] not in corpus:
            # New user: flush the retweet counts accumulated for the previous one.
            if user_handle:
                users_retweets_count[user_handle] = np.sum(retweets)
                all_retweet_count.extend(retweets)
                retweets = []
            user_handle = tweet['user_handle']
            corpus[tweet['user_handle']] = tweet['text']
        else:
            corpus[tweet['user_handle']] = corpus[tweet['user_handle']] + '. ' + tweet['text']
        # Tweets with a retweets_permalink contribute 0 to the user's own retweet tally.
        if tweet['retweets_permalink']:
            retweets.append(0)
        else:
            retweets.append(tweet['retweets'])

    # Flush the last user.
    all_retweet_count.extend(retweets)
    users_retweets_count[user_handle] = np.sum(retweets)

    for user_handle, text in corpus.items():
        # TODO: Too slow. Speed this up.
        tokens = clean_tweet(text, stem=False, lemmatize=False, as_string=False)
        corpus[user_handle] = ' '.join(tokens)

    return corpus, users_retweets_count, np.sum(all_retweet_count)
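# Usage sketch (an assumption, not in the original source): get_data returns a triple,
# so callers would typically unpack it like this.
def get_data_usage_example():
    corpus, users_retweets_count, total_retweets = get_data(length=1000)
    print('Users in corpus:', len(corpus))
    print('Total retweets across all users:', total_retweets)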
def visualize_clusters():
    """
    http://www.dummies.com/programming/big-data/data-science/how-to-visualize-the-clusters-in-a-k-means-unsupervised-learning-model/
    """
    print('1')
    word_vectors = joblib.load('w2v_word_vectors.pkl')
    print('2')
    tweets = get_data(length=2000)
    print('3')
    clean_tweets = list(filter(None, [clean_tweet(t['text']) for t in tweets]))
    print(len(clean_tweets))
    print('4')

    # Represent each tweet as the mean of its word vectors.
    tweet_vectors = {}
    for tweet in clean_tweets:
        tweet_word_vectors = []
        tweet_words = tweet.split()
        for word in tweet_words:
            if word in word_vectors:
                word_vector = word_vectors[word]
                tweet_word_vectors.append(list(word_vector))
        if len(tweet_word_vectors) > 0:
            tweet_vector = np.array(tweet_word_vectors).mean(axis=0)
            tweet_vectors[tweet] = tweet_vector
    print('5')

    tweet_vectors_values = list(tweet_vectors.values())
    final_tweets = list(tweet_vectors.keys())
    print('6')

    # Project to 2D for plotting.
    pca = PCA(n_components=2).fit(tweet_vectors_values)
    pca_2d = pca.transform(tweet_vectors_values)
    print('7')

    n_clusters = 30
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100)
    k_means.fit(tweet_vectors_values)

    plt.figure('K-means with 30 clusters')
    plt.scatter(pca_2d[:, 0], pca_2d[:, 1], c=k_means.labels_)
    plt.show()
def run_lda():
    tweets = get_b24_tweets()
    doc_clean = [clean_tweet(tweet, as_string=False) for tweet in tweets]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    ldamodel = Lda(doc_term_matrix, num_topics=10, id2word=dictionary, passes=50)
    joblib.dump(ldamodel, 'src/NLP/junk/testing_on_brand24/data/ldamodel.pkl')

    topics = ldamodel.print_topics(num_topics=10, num_words=20)
    for topic in topics:
        print(topic)
        print('*' * 80)
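# Usage sketch (an assumption, not in the original source): reload the persisted LDA
# model and infer the topic mixture of one new tweet. Only the model is saved by
# run_lda, so the dictionary has to be rebuilt exactly as it was built there for the
# word ids to line up (the same pattern cluster_tweets below relies on).
def infer_topic_example(new_tweet):
    ldamodel = joblib.load('src/NLP/junk/testing_on_brand24/data/ldamodel.pkl')
    dictionary = corpora.Dictionary([clean_tweet(t, as_string=False) for t in get_b24_tweets()])
    bow = dictionary.doc2bow(clean_tweet(new_tweet, as_string=False))
    return ldamodel.get_document_topics(bow)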
def train_w2v():
    b24_all_tweets = get_b24_tweets()
    db_tweets = [t['text'] for t in get_db_tweets(length=2000)]
    all_tweets = b24_all_tweets + db_tweets
    clean_tweets = [clean_tweet(t, as_string=False) for t in all_tweets]

    model = Word2Vec(sentences=clean_tweets, sg=1, size=200, window=10, min_count=10)
    word_vectors = {}
    for word in model.wv.vocab:
        word_vectors[word] = model.wv[word]

    joblib.dump(model, 'src/NLP/junk/testing_on_brand24/data/model.pkl')
    joblib.dump(word_vectors, 'src/NLP/junk/testing_on_brand24/data/word_vectors.pkl')
def cluster_tweets():
    model = joblib.load('src/NLP/junk/testing_on_brand24/data/ldamodel.pkl')
    tweets = get_b24_tweets()
    doc_clean = [clean_tweet(tweet, as_string=False) for tweet in tweets]
    dictionary = corpora.Dictionary(doc_clean)

    # Assign each tweet to its highest-probability topic.
    clusters = {}
    for ctr, d in enumerate(doc_clean):
        bow = dictionary.doc2bow(d)
        topics = model.get_document_topics(bow)
        sorted_topics = sorted(topics, key=lambda x: x[1], reverse=True)
        label = sorted_topics[0][0]
        if label in clusters:
            clusters[label].append(tweets[ctr])
        else:
            clusters[label] = [tweets[ctr]]
    joblib.dump(clusters, 'src/NLP/junk/testing_on_brand24/data/lda_clusters.pkl')
def train_d2v():
    all_reviews = get_data()
    clean_reviews = [clean_tweet(review, as_string=False) for review in all_reviews]
    documents = get_documents(clean_reviews)
    model = Doc2Vec(documents=documents, dm=1, size=200, window=5, min_count=10)

    review_vectors = {}
    for ctr, r in enumerate(all_reviews):
        review_vectors[r] = model.docvecs[ctr]
    joblib.dump(review_vectors,
                'src/NLP/junk/testing_on_snap/data/d2v_outputs/d2v_review_vectors.pkl')
def get_vector(data):
    tweet_data = get_combined_tweet_data(data)
    tweet_data = [clean_tweet(remove_hash(tweet)) for tweet in tweet_data]
    tokenized_tweets = [tweet.split() for tweet in tweet_data]

    model = Word2Vec(min_count=1)
    model.build_vocab(tokenized_tweets)
    model.train(tokenized_tweets, total_examples=model.corpus_count, epochs=model.iter)

    final_matrix = []
    count = []  # indices of tweets that ended up with no tokens
    for index, tweet in tqdm.tqdm(enumerate(tokenized_tweets)):
        if tweet:
            # Represent the tweet as the mean of its word vectors.
            matrix = np.array([model.wv[token] for token in tweet if token])
            final_matrix.append(np.mean(matrix, axis=0))
        else:
            count.append(index)
    return np.array(final_matrix), count, tweet_data
def train_w2v():
    all_reviews = get_data()
    clean_reviews = [clean_tweet(review, as_string=False) for review in all_reviews]
    model = Word2Vec(sentences=clean_reviews, sg=1, size=200, window=5, min_count=10)

    word_vectors = {}
    for word in model.wv.vocab:
        word_vectors[word] = model.wv[word]
    joblib.dump(word_vectors,
                'src/NLP/junk/testing_on_snap/data/w2v_outputs/w2v_word_vectors_only_nouns.pkl')
    return model
def run_lda_on_all_tweets():
    tweets = get_data(length=1500)
    doc_clean = [
        clean_tweet(tweet['text'], stem=False, lemmatize=False, as_string=False)
        for tweet in tweets
    ]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    ldamodel = Lda(doc_term_matrix, num_topics=15, id2word=dictionary, passes=50)

    topics = ldamodel.print_topics(num_topics=15, num_words=20)
    print(topics)
    for topic in topics:
        print(topic)
        print('*' * 80)
def cluster_reviews():
    word_vectors = joblib.load(
        'src/NLP/junk/testing_on_snap/data/w2v_outputs/w2v_word_vectors_only_nouns.pkl'
    )
    all_reviews = get_data()

    # Represent each review as the mean of its word vectors.
    review_vectors = {}
    for review in all_reviews:
        review_word_vectors = []
        review_words = clean_tweet(review).split()
        for word in review_words:
            if word in word_vectors:
                word_vector = word_vectors[word]
                review_word_vectors.append(list(word_vector))
        if len(review_word_vectors) > 0:
            review_vector = np.array(review_word_vectors).mean(axis=0)
            review_vectors[review] = review_vector

    review_vectors_values = list(review_vectors.values())
    # Labels align with review_vectors (reviews with no in-vocabulary words were
    # skipped), so index its keys rather than all_reviews.
    clustered_reviews = list(review_vectors.keys())

    n_clusters = 16
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100)
    k_means.fit(review_vectors_values)
    labels = list(k_means.labels_)

    clusters = {}
    for ctr, label in enumerate(labels):
        if label in clusters:
            clusters[label].append(clustered_reviews[ctr])
        else:
            clusters[label] = [clustered_reviews[ctr]]
    joblib.dump(clusters,
                'src/NLP/junk/testing_on_snap/data/w2v_outputs/w2v_clusters_only_nouns.pkl')
def run_lda_on_all_tweets():
    tweets = get_data(length=200, offset=500)
    doc_clean = [
        clean_tweet(tweet['text'], stem=False, lemmatize=False, as_string=False)
        for tweet in tweets
    ]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # Lda here is expected to be the Mallet wrapper: the first argument is the path
    # to the mallet binary.
    ldamodel = Lda(
        '/Users/shashankwadhwa/Desktop/Work/stealth/mallet-2.0.8/bin/mallet',
        doc_term_matrix,
        num_topics=5,
        id2word=dictionary,
        iterations=50)

    topics = ldamodel.print_topics(num_topics=5, num_words=10)
    print(topics)
    for topic in topics:
        print(topic)
        print('*' * 80)