def plotMostFrequentWords(base):
    vectorizer = CountVectorizer(lowercase=False)
    docs       = vectorizer.fit_transform(base['Tweet'])
    features   = vectorizer.get_feature_names()
    visualizer = FreqDistVisualizer(features=features)
    visualizer.fit(docs)
    visualizer.poof()
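A minimal call sketch for plotMostFrequentWords, assuming a pandas DataFrame with a 'Tweet' column; the sample tweets and the imports below are illustrative, not part of the original snippet:

# Illustrative usage; the DataFrame contents are made up
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer

base = pd.DataFrame({'Tweet': ['Great phone', 'Terrible battery', 'Great battery life']})
plotMostFrequentWords(base)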
Example No. 2
    def tf(self, df, no_features=1000):
        # no_features caps the vectorizer's vocabulary size; bigrams and trigrams only
        tf_vectorizer = CountVectorizer(min_df=0.01,
                                        max_df=0.85,
                                        max_features=no_features,
                                        ngram_range=(2, 3))
        dtm_tf = tf_vectorizer.fit_transform(df['descriptions'])
        print("dtm:", dtm_tf.shape)

        df = pd.DataFrame(dtm_tf.toarray(),
                          columns=tf_vectorizer.get_feature_names())
        print(df.head())

        # Show top tokens
        # Calculate column sums from the DTM
        sum_words = dtm_tf.sum(axis=0)

        words_freq = [(word, sum_words[0, idx])
                      for word, idx in tf_vectorizer.vocabulary_.items()]

        # Now, sort them
        words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

        # Display the top 20
        print(words_freq[:20])

        # Visualize the frequency of the top 25 tokens
        plt.figure(figsize=(5, 8))
        visualizer = FreqDistVisualizer(
            features=tf_vectorizer.get_feature_names(), n=25)
        visualizer.fit(dtm_tf)
        visualizer.poof()
        return (df, sum_words, words_freq)
def countvect_test_simple(X_train, X_test, y_train, y_test, token_izer):
    if (token_izer == '1'):
        countvect = CountVectorizer(tokenizer=tokenizer_preproccessor)
    elif (token_izer == '2'):
        countvect = CountVectorizer(tokenizer=tokenizer_preproccessor_imdb)
    else:
        countvect = CountVectorizer()
    # Fit the CountVectorizer on the training texts
    countvect.fit(X_train)
    # Convert the texts into sparse document-term matrices
    X_train_dtm = countvect.transform(X_train)
    X_test_dtm = countvect.transform(X_test)
    # Build a Multinomial Naive Bayes model
    nb = MultinomialNB()

    # 10-fold cross-validation accuracy on the training document-term matrix
    cvec_score = cross_val_score(nb, X_train_dtm, y_train, cv=10)

    feature_names = countvect.get_feature_names()
    print("Number of features: {}".format(len(feature_names)))
    print("to accuracy tou CountVectorizer me NB einai: {}".format(
        cvec_score.mean()))

    visualizer = FreqDistVisualizer(features=feature_names, orient='h')
    visualizer.fit(X_train_dtm)
    visualizer.poof()

    return cvec_score.mean()
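A hedged usage sketch for countvect_test_simple; the texts, labels, and split below are illustrative, token_izer='0' falls through to the plain CountVectorizer branch, and the snippet's own imports (CountVectorizer, MultinomialNB, cross_val_score) are assumed to be in place:

# Illustrative data and split; '0' selects the default CountVectorizer branch
from sklearn.model_selection import train_test_split

texts = ['good movie', 'bad movie', 'great acting', 'awful plot'] * 25
labels = [1, 0, 1, 0] * 25
X_train, X_test, y_train, y_test = train_test_split(texts, labels, random_state=42)
mean_acc = countvect_test_simple(X_train, X_test, y_train, y_test, token_izer='0')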
Example No. 4
def token_frequency_plot(corpus, n_features):
    """Generates plot of most common tokens"""
    corpus_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                        stop_words='english',
                                        token_pattern=r"\b[a-z][a-z]+\b")
    doc_matrix = corpus_vectorizer.fit_transform(corpus)
    features = corpus_vectorizer.get_feature_names()
    viz = FreqDistVisualizer(features=features, n=n_features)
    viz.fit(doc_matrix)
    viz.poof()
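# Illustrative call of token_frequency_plot; the toy corpus below is an
# assumption, any iterable of raw text documents works
corpus = ['the quick brown fox jumps', 'the lazy dog sleeps all day', 'quick brown dogs run fast']
token_frequency_plot(corpus, n_features=10)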
def tfidf_test_simple(X_train, X_test, y_train, y_test, token_izer):
    if (token_izer == '1'):
        tfvect = TfidfVectorizer(tokenizer=tokenizer_preproccessor)
    elif (token_izer == '2'):
        tfvect = TfidfVectorizer(tokenizer=tokenizer_preproccessor_imdb)
    else:
        tfvect = TfidfVectorizer()
    tfidf_train = tfvect.fit_transform(X_train)
    tfidf_test = tfvect.transform(X_test)
    nb = MultinomialNB()
    # 4-fold cross-validation accuracy on the training TF-IDF matrix
    cvec_score = cross_val_score(nb, tfidf_train, y_train, cv=4)
    feature_names = tfvect.get_feature_names()
    print("Number of features: {}".format(len(feature_names)))
    print("to accuracy tou TFIDF einai {}".format(cvec_score.mean()))

    visualizer = FreqDistVisualizer(features=feature_names, orient='h')
    visualizer.fit(tfidf_train)
    visualizer.poof()

    return cvec_score.mean()
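The TF-IDF variant is called the same way; a sketch with illustrative data, where '0' selects the default TfidfVectorizer and the snippet's own imports are assumed:

# Illustrative data and split, mirroring the CountVectorizer test above
from sklearn.model_selection import train_test_split

texts = ['good movie', 'bad movie', 'great acting', 'awful plot'] * 25
labels = [1, 0, 1, 0] * 25
X_train, X_test, y_train, y_test = train_test_split(texts, labels, random_state=42)
mean_acc_tfidf = tfidf_test_simple(X_train, X_test, y_train, y_test, token_izer='0')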
Example No. 6
            if (len(valor) > 1):
                #print(len(valor))
                colecao.append(valor)
    infile.close()
    return colecao


def SaveColecao(colecao, file):
    # Write one document per line; the with-block closes the file automatically
    with open(file, encoding='utf-8', mode="w+") as outfile:
        for i in colecao:
            outfile.write(i + '\n')


#SaveColecao(all_documents, 'usiel.txt')
all_documents = LoadArquivo('usiel.txt')
from yellowbrick.text import FreqDistVisualizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(all_documents)
features = vectorizer.get_feature_names()
visualizer = FreqDistVisualizer(features=features)
visualizer.fit(docs)
visualizer.poof()
def visualised(docs, vectorizer):
    """Plot the 10 most frequent tokens from a fitted vectorizer's document-term matrix."""
    visualizer = FreqDistVisualizer(features=vectorizer.get_feature_names(), n=10)
    visualizer.fit(docs)
    visualizer.poof()
Example No. 8
def doVisualizer(featNames, vector, numTerms=10):
    # Plot the numTerms most frequent tokens in the document-term matrix
    visualizer = FreqDistVisualizer(features=featNames, n=numTerms)
    visualizer.fit(vector)
    visualizer.poof()
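A call sketch for doVisualizer, assuming a fitted CountVectorizer supplies both the feature names and the document-term matrix; the toy documents are illustrative:

# Illustrative vectorizer and documents
from sklearn.feature_extraction.text import CountVectorizer

docs = ['data science is fun', 'science of data', 'fun with data']
vec = CountVectorizer()
dtm = vec.fit_transform(docs)
doVisualizer(vec.get_feature_names(), dtm, numTerms=5)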