def train_tsne(training_size=2000, metric='cosine', n_components=3, perplexity=100,
               angle=.12):  # adjust angle downward to see if it affects accuracy
    """ Fit a t-SNE projection to the LSA topic vectors of the labeled (bot/not-bot) tweets """
    tweets = read_csv(os.path.join(BIGDATA_PATH, 'tweets.csv.gz'))
    tweets = tweets[tweets.isbot >= 0]
    gc.collect()  # reclaim RAM released by discarding the unlabeled tweets above

    # labels3 = tweets.isbot.apply(lambda x: int(x * 3))
    labels = tweets.isbot.apply(lambda x: int(x * 2))

    lsa = LsiModel.load(os.path.join(BIGDATA_PATH, 'lsa_tweets_5589798_2003588x200.pkl'))
    tfidf = TfidfModel(id2word=lsa.id2word, dictionary=lsa.id2word)
    bows = np.array([lsa.id2word.doc2bow(txt.split()) for txt in tweets.text])
    # tfidfs = tfidf[bows]

    X = pd.DataFrame([pd.Series(dict(v)) for v in tqdm(lsa[tfidf[bows]], total=len(bows))],
                     index=tweets.index)
    mask = ~X.isnull().any(axis=1)
    mask.index = tweets.index
    # >>> sum(~mask)
    # 99
    # >>> tweets.loc[mask.argmin()]
    # isbot                 0.17
    # strict                  13
    # user      b'CrisParanoid:'
    # text          b'#sad again'
    # Name: 571, dtype: object
    X = X[mask]
    y = tweets.isbot[mask]
    labels = labels[mask]

    test_size = 1.0 - training_size if training_size < 1 else float(len(X) - training_size) / len(X)
    Xindex, Xindex_test, yindex, yindex_test = train_test_split(
        X.index.values, y.index.values, test_size=test_size)
    X, Xtest, y, ytest = X.loc[Xindex], X.loc[Xindex_test], y.loc[yindex], y.loc[yindex_test]
    # labels_test = labels.loc[yindex_test]
    labels = labels.loc[yindex]

    tsne = TSNE(metric='precomputed', n_components=n_components, angle=angle, perplexity=perplexity)
    tsne = tsne.fit(positive_distances(X.values, metric=metric))

    return tsne, X, Xtest, y, ytest
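
# Minimal usage sketch for train_tsne (assumptions: the BIGDATA_PATH artifacts it loads
# are present locally, and `plot_tsne_demo` is a hypothetical helper that is not part of
# the original module).
def plot_tsne_demo(training_size=2000):
    """ Fit the t-SNE projection on a small training sample and return its coordinates with labels """
    tsne, X, Xtest, y, ytest = train_tsne(training_size=training_size, metric='cosine')
    coords = pd.DataFrame(tsne.embedding_, index=X.index)  # one row of t-SNE coordinates per training tweet
    coords['isbot'] = y
    return coords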
def lsa_twitter(cased_tokens):
    """ Latent Semantic Analysis on a random sampling of Twitter search results for the words in cased_tokens """
    # Only 5 of these tokens are saved for a no_below=2 filter:
    #   PyCons NLPS #PyCon2016 #NaturalLanguageProcessing #naturallanguageprocessing
    if cased_tokens is None:
        cased_tokens = ('PyConOpenSpaces PyCon PyCon2017 PyCon2018 PyCon2016 PyCon2015 OpenSpace PyconTutorial ' +
                        'NLP NaturalLanguageProcessing NLPInAction NaturalLanguageProcessingInAction NLPIA Twote Twip'
                        ).split()
        cased_tokens += [s + 's' for s in cased_tokens]
        cased_tokens += 'TotalGood TotalGoods HobsonLane Hob Hobs TotalGood.com ' \
                        'www.TotalGood.com http://www.TotalGood.com https://www.TotalGood.com'.split()
    allcase_tokens = cased_tokens + [s.lower() for s in cased_tokens]
    allcase_tokens += [s.title() for s in cased_tokens]
    allcase_tokens += [s.upper() for s in cased_tokens]
    KEEP_TOKENS = allcase_tokens + ['#' + s for s in allcase_tokens]

    # the tokenized tweets are needed below (for the BOW vectors) even when the vocab is cached,
    # so load them up front
    tweets_path = os.path.join(BIGDATA_PATH, 'tweets.csv.gz')
    print('Loading tweets: {} ...'.format(tweets_path))
    tweets = read_csv(tweets_path)
    tweets = np.array(tweets.text.str.split())

    # takes 15 minutes and 10GB of RAM for 500k tweets if you keep all 20M unique tokens/names/URLs
    vocab_path = os.path.join(BIGDATA_PATH, 'vocab939370.pkl')
    if os.path.isfile(vocab_path):
        print('Loading vocab: {} ...'.format(vocab_path))
        vocab = Dictionary.load(vocab_path)
        print(' len(vocab) loaded: {}'.format(len(vocab.dfs)))
    else:
        with gzip.open(os.path.join(BIGDATA_PATH, 'tweets.txt.gz'), 'w') as f:
            for tokens in tweets:
                f.write((' '.join(tokens) + '\n').encode('utf-8'))
        # tweets['text'] = tweets.text.apply(lambda s: eval(s).decode('utf-8'))
        # tweets['user'] = tweets.user.apply(lambda s: eval(s).decode('utf-8'))
        # tweets.to_csv('tweets.csv.gz', compression='gzip')
        print('Computing vocab from {} tweets...'.format(len(tweets)))
        vocab = Dictionary(tweets)  # Dictionary() takes no filter kwargs; filtering happens below
        vocab.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N,
                              keep_tokens=set(KEEP_TOKENS))
        print(' len(vocab) after filtering: {}'.format(len(vocab.dfs)))

    # no time at all, just a bookkeeping step, doesn't actually compute anything
    tfidf = TfidfModel(id2word=vocab, dictionary=vocab)
    tfidf.save(os.path.join(BIGDATA_PATH, 'tfidf{}.pkl'.format(len(vocab.dfs))))

    tweets = [vocab.doc2bow(tw) for tw in tweets]
    json.dump(tweets, gzip.open(os.path.join(BIGDATA_PATH, 'tweet_bows.json.gz'), 'wt'))
    gc.collect()

    # LSA is a more useful name than LSI
    lsa = LsiModel(tfidf[tweets], num_topics=200, id2word=vocab, extra_samples=100, power_iters=2)

    return lsa
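
# Minimal usage sketch for lsa_twitter (assumption: `show_lsa_topics` is a hypothetical
# helper, not part of the original module; lsa_twitter(None) falls back to the
# PyCon/NLPIA token list defined above).
def show_lsa_topics(num_topics=10, num_words=8):
    """ Train (or load) the tweet LSA model and print its strongest topics """
    lsa = lsa_twitter(cased_tokens=None)
    for topic_id, topic in lsa.show_topics(num_topics=num_topics, num_words=num_words):
        print(topic_id, topic)
    return lsa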
def clean_df(df, header=None, **read_csv_kwargs):
    """ Convert UTF-8 characters in a CSV file or DataFrame into ASCII

    Args:
        df (DataFrame or str): DataFrame or path or url to CSV
    """
    df = read_csv(df, header=header, **read_csv_kwargs)
    df = df.fillna(' ')
    for col in df.columns:
        df[col] = df[col].apply(unicode2ascii)
    return df
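
# Minimal usage sketch for clean_df (assumptions: the read_csv wrapper it calls accepts a
# DataFrame as documented in its docstring, and the example rows below are synthetic,
# not from the original data).
def clean_df_example():
    """ Return an ASCII-only copy of a tiny DataFrame containing UTF-8 text """
    df = pd.DataFrame({'user': ['naïve_user'], 'text': ['café tweet']})
    return clean_df(df)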
def train_lda(training_size=2000, metric='cosine'):
    """ Train a Linear Discriminant Analysis (not Latent Dirichlet Allocation) bot classifier on LSA tweet vectors """
    # NOTE: `metric` is unused here; it is kept for API symmetry with train_tsne
    tweets = read_csv(os.path.join(BIGDATA_PATH, 'tweets.csv.gz'))
    tweets = tweets[tweets.isbot >= 0]

    # labels3 = tweets.isbot.apply(lambda x: int(x * 3))
    labels = tweets.isbot.apply(lambda x: int(x * 2))

    lsa = LsiModel.load(os.path.join(BIGDATA_PATH, 'lsa_tweets_5589798_2003588x200.pkl'))
    tfidf = TfidfModel(id2word=lsa.id2word, dictionary=lsa.id2word)
    bows = np.array([lsa.id2word.doc2bow(txt.split()) for txt in tweets.text])
    # tfidfs = tfidf[bows]

    X = pd.DataFrame([pd.Series(dict(v)) for v in tqdm(lsa[tfidf[bows]], total=len(bows))],
                     index=tweets.index)
    mask = ~X.isnull().any(axis=1)
    mask.index = tweets.index
    X = X[mask]
    y = tweets.isbot[mask]
    labels = labels[mask]
    # labels3 = labels3[mask]

    test_size = 1.0 - training_size if training_size < 1 else float(len(X) - training_size) / len(X)
    Xindex, Xindex_test, yindex, yindex_test = train_test_split(
        X.index.values, y.index.values, test_size=test_size)
    X, Xtest, y, ytest = X.loc[Xindex], X.loc[Xindex_test], y.loc[yindex], y.loc[yindex_test]
    labels_test = labels.loc[yindex_test]
    labels = labels.loc[yindex]

    lda = LDA(solver='lsqr', shrinkage='auto', n_components=3)
    print(cross_val_score(lda, Xtest, labels_test, cv=7))

    lda = LDA(solver='lsqr', shrinkage='auto', n_components=3)
    lda = lda.fit(X.values, labels.values)
    y_lda = lda.predict(Xtest)
    print(mean_squared_error(y_lda, ytest))

    df_test = pd.DataFrame(lda.predict(Xtest), index=Xtest.index, columns=['predict'])
    df_test['truth'] = labels_test
    return lda, df_test
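
# Minimal usage sketch for train_lda (assumption: `report_lda_accuracy` is a hypothetical
# helper, not part of the original module; it only relies on the `predict`/`truth`
# columns of the df_test returned above).
def report_lda_accuracy(training_size=2000):
    """ Fit the LDA bot classifier and print its accuracy on the held-out tweets """
    lda, df_test = train_lda(training_size=training_size)
    accuracy = (df_test.predict == df_test.truth).mean()
    print('LDA test-set accuracy: {:.3f}'.format(accuracy))
    return lda, accuracy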