from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer


def plotMostFrequentWords(base):
    # note: scikit-learn >= 1.2 renames get_feature_names() to
    # get_feature_names_out(), and yellowbrick >= 1.0 renames poof() to show();
    # the older API is kept here, matching the rest of these snippets
    vectorizer = CountVectorizer(lowercase=False)
    docs = vectorizer.fit_transform(base['Tweet'])
    features = vectorizer.get_feature_names()
    visualizer = FreqDistVisualizer(features=features)
    visualizer.fit(docs)
    visualizer.poof()
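# Usage sketch for plotMostFrequentWords, assuming a pandas DataFrame with a
# 'Tweet' column; the toy tweets below are hypothetical.
import pandas as pd

base = pd.DataFrame({'Tweet': ['yellowbrick makes plots',
                               'plots of token counts',
                               'token counts per Tweet']})
plotMostFrequentWords(base)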
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer


def tf(self, df, no_features=1000):
    # no_features was an undefined global in the original snippet; it is
    # exposed here as a keyword argument with an assumed default
    tf_vectorizer = CountVectorizer(min_df=0.01, max_df=0.85,
                                    max_features=no_features,
                                    ngram_range=(2, 3))
    dtm_tf = tf_vectorizer.fit_transform(df['descriptions'])
    print("dtm:", dtm_tf.shape)
    df = pd.DataFrame(dtm_tf.toarray(),
                      columns=tf_vectorizer.get_feature_names())
    print(df.head())

    # Show top tokens: column sums of the document-term matrix give
    # per-token corpus frequencies
    sum_words = dtm_tf.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in tf_vectorizer.vocabulary_.items()]
    # Sort by frequency, descending
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    # Display the top 20
    print(words_freq[:20])

    # Visualize the frequency of the top 25 tokens
    plt.figure(figsize=(5, 8))
    visualizer = FreqDistVisualizer(
        features=tf_vectorizer.get_feature_names(), n=25)
    visualizer.fit(dtm_tf)
    visualizer.poof()
    return (df, sum_words, words_freq)
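# Minimal standalone sketch of the column-sum trick used in tf() above:
# summing the DTM along axis 0 yields corpus-wide token counts, and
# vocabulary_ maps each token string to its column index. The toy corpus
# is hypothetical.
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["data science with python", "python for data analysis"]
vec = CountVectorizer()
dtm = vec.fit_transform(corpus)
counts = dtm.sum(axis=0)  # 1 x n_features matrix of column sums
freqs = sorted(((tok, counts[0, idx]) for tok, idx in vec.vocabulary_.items()),
               key=lambda x: x[1], reverse=True)
print(freqs[:5])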
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from yellowbrick.text import FreqDistVisualizer


def countvect_test_simple(X_train, X_test, y_train, y_test, token_izer):
    if token_izer == '1':
        countvect = CountVectorizer(tokenizer=tokenizer_preproccessor)
    elif token_izer == '2':
        countvect = CountVectorizer(tokenizer=tokenizer_preproccessor_imdb)
    else:
        countvect = CountVectorizer()

    # fit the CountVectorizer on the training text
    countvect.fit(X_train)
    # convert to sparse document-term matrices
    X_train_dtm = countvect.transform(X_train)
    X_test_dtm = countvect.transform(X_test)

    # build a Multinomial Naive Bayes model and score it with
    # 10-fold cross-validation on the training DTM
    nb = MultinomialNB()
    cvec_score = cross_val_score(nb, X_train_dtm, y_train, cv=10)

    feature_names = countvect.get_feature_names()
    print("Number of features: {}".format(len(feature_names)))
    print("CountVectorizer with NB accuracy: {}".format(cvec_score.mean()))

    visualizer = FreqDistVisualizer(features=feature_names, orient='h')
    visualizer.fit(X_train_dtm)
    visualizer.poof()
    return cvec_score.mean()
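# Hedged usage sketch for countvect_test_simple. The balanced toy corpus is
# hypothetical, sized so the 10-fold stratified CV inside the function has at
# least 10 training samples per class; '0' selects the default tokenizer branch.
from sklearn.model_selection import train_test_split

texts = (["great film, really enjoyed it %d" % i for i in range(15)]
         + ["terrible movie, total bore %d" % i for i in range(15)])
labels = [1] * 15 + [0] * 15
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=0)
mean_acc = countvect_test_simple(X_train, X_test, y_train, y_test, '0')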
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer


def token_frequency_plot(corpus, n_features):
    """Plot the most common tokens in the corpus."""
    corpus_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                        stop_words='english',
                                        token_pattern=r"\b[a-z][a-z]+\b")
    doc_matrix = corpus_vectorizer.fit_transform(corpus)
    features = corpus_vectorizer.get_feature_names()
    viz = FreqDistVisualizer(features=features, n=n_features)
    viz.fit(doc_matrix)  # fit() is the documented call; fit_transform() is not needed
    viz.poof()
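# Possible invocation of token_frequency_plot on a small hypothetical corpus,
# plotting the 10 most frequent unigrams and bigrams:
sample_corpus = ["the quick brown fox",
                 "the lazy brown dog",
                 "quick brown foxes jump"]
token_frequency_plot(sample_corpus, 10)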
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from yellowbrick.text import FreqDistVisualizer


def tfidf_test_simple(X_train, X_test, y_train, y_test, token_izer):
    if token_izer == '1':
        tfvect = TfidfVectorizer(tokenizer=tokenizer_preproccessor)
    elif token_izer == '2':
        tfvect = TfidfVectorizer(tokenizer=tokenizer_preproccessor_imdb)
    else:
        tfvect = TfidfVectorizer()

    tfidf_train = tfvect.fit_transform(X_train)
    tfidf_test = tfvect.transform(X_test)

    # build a Multinomial Naive Bayes model and score it with
    # 4-fold cross-validation on the training matrix
    nb = MultinomialNB()
    cvec_score = cross_val_score(nb, tfidf_train, y_train, cv=4)

    feature_names = tfvect.get_feature_names()
    print("Number of features: {}".format(len(feature_names)))
    print("TFIDF accuracy: {}".format(cvec_score.mean()))

    # orient must be 'h' or 'v'; the original passed '10', which is invalid
    visualizer = FreqDistVisualizer(features=feature_names, orient='h')
    visualizer.fit(tfidf_train)
    visualizer.poof()
    return cvec_score.mean()
        # (tail of LoadArquivo: the start of the function was not part of this snippet)
        if len(valor) > 1:
            # print(len(valor))
            colecao.append(valor)
    infile.close()
    return colecao


def SaveColecao(colecao, file):
    with open(file, encoding='utf-8', mode="w+") as file:
        # writer = csv.writer(file, delimiter="")
        for i in colecao:
            # linha = (filename, paginas, ano, titulo, label, pA, pB)
            # print(linha)
            file.writelines(i + '\n')
            # writer.writerow(i)
            # doc.clear()
    # no explicit file.close() needed: the with-block closes the file


# SaveColecao(all_documents, 'usiel.txt')
all_documents = LoadArquivo('usiel.txt')

from yellowbrick.text import FreqDistVisualizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(all_documents)
features = vectorizer.get_feature_names()
visualizer = FreqDistVisualizer(features=features)
visualizer.fit(docs)
visualizer.poof()
from yellowbrick.text import FreqDistVisualizer


def visualised(a, b):
    # a: fitted document-term matrix; b: the vectorizer that produced it
    visualizer = FreqDistVisualizer(features=b.get_feature_names(), n=10)
    visualizer.fit(a)
    visualizer.poof()
from yellowbrick.text import FreqDistVisualizer


def doVisualizer(featNames, vector, numTerms=10):
    # plot the numTerms most frequent tokens in the document-term matrix
    visualizer = FreqDistVisualizer(features=featNames, n=numTerms)
    visualizer.fit(vector)
    visualizer.poof()
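# doVisualizer takes the feature names and the fitted document-term matrix
# separately; a minimal sketch with a hypothetical two-document corpus:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
dtm = vect.fit_transform(["frequency plots", "plots of token frequency"])
doVisualizer(vect.get_feature_names(), dtm, numTerms=5)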