# Vectorize the training documents into token counts, reweight with TF-IDF,
# train a Multinomial Naive Bayes classifier, and classify a few new docs.
# NOTE(review): relies on `twenty_train` and `CountVectorizer` being defined
# earlier in the file (outside this chunk) — confirm before running standalone.
count_vect = CountVectorizer()
print(type(twenty_train.data))
X_train_counts = count_vect.fit_transform(twenty_train.data)
print("x_train_counts.shape: " + str(X_train_counts.shape))
try:
    print("attempting to print column 0")
    print(X_train_counts.getcol(0))
    # BUG FIX: `vocabulary` is the constructor argument (usually None);
    # the learned term -> column-index mapping is the fitted `vocabulary_`,
    # which the very next line already uses.
    print(count_vect.vocabulary_)
    print(count_vect.vocabulary_.get("theta"))
except Exception as exc:
    # Keep the best-effort behavior, but report what actually failed
    # instead of silently discarding the exception.
    print("didn't work: " + repr(exc))

from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
print("\n printing param names")
# Use the public estimator API instead of the private _get_param_names();
# sorted(get_params()) yields the same parameter names.
print(sorted(tfidf_transformer.get_params()))

# classifier
print("twenty_train.target: ")
print(twenty_train.target[:10])
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

# Classify test documents: transform (NOT fit_transform) with the
# vectorizer/transformer already fitted on the training data.
docs_new = [
    "God is love",
    "OpenGL on the GPU is fast",
    "god I hate computers and GPU",
]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
# Duplicate of the pipeline above: count-vectorize the 20-newsgroups training
# data, apply TF-IDF, train MultinomialNB, and classify new documents.
# NOTE(review): `twenty_train` and `CountVectorizer` come from earlier in the
# file (outside this chunk).
count_vect = CountVectorizer()
print(type(twenty_train.data))
X_train_counts = count_vect.fit_transform(twenty_train.data)
print("x_train_counts.shape: "+str(X_train_counts.shape))
try:
    print("attempting to print column 0")
    print(X_train_counts.getcol(0))
    # NOTE(review): `vocabulary` is the constructor argument (typically None);
    # the fitted term->index mapping is `vocabulary_` (used on the next line)
    # — this looks like a typo, confirm intent.
    print(count_vect.vocabulary)
    print(count_vect.vocabulary_.get("theta"))
except Exception:
    # Broad best-effort guard; the exception itself is discarded.
    print("didn't work")
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
print("\n printing param names")
# NOTE(review): _get_param_names() is a private sklearn API; the public
# equivalent is get_params().
print(tfidf_transformer._get_param_names())
# classifier
print("twenty_train.target: ")
print(twenty_train.target[:10])
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
#classify test documents.
docs_new = ["God is love", "OpenGL on the GPU is fast", "god I hate computers and GPU"]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
# NOTE(review): the body of this loop continues beyond this chunk of the file.
for doc, category in zip(docs_new, predicted):
# Visualizando os dados txtAlex_train.shape txtAlex_train.data txtAlex_train[1] # Vocabulário txtAlex_vect.vocabulary_ ########## # TF-IDF # ########## from sklearn.feature_extraction.text import TfidfTransformer txtAlex_tfidf = TfidfTransformer() # Ver parametros txtAlex_tfidf._get_param_names() txtAlex_train_tfidf = txtAlex_tfidf.fit_transform(txtAlex_train) txtAlex_train_tfidf.shape txtAlex_train_tfidf.data txtAlex_train_tfidf[0].data # ver dados do primeiro "documento" # Visualizar o Array txtAlex_vect.vocabulary_ txtAlex_train_tfidf.toarray() ############ # Word2Vec # ############