from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# twenty_train: the 20 newsgroups training split (assumed, as in the scikit-learn text tutorial)
twenty_train = fetch_20newsgroups(subset="train", shuffle=True, random_state=42)

count_vect = CountVectorizer()
print(type(twenty_train.data))
X_train_counts = count_vect.fit_transform(twenty_train.data)
print("X_train_counts.shape: " + str(X_train_counts.shape))
try:
    print("attempting to print column 0")
    print(X_train_counts.getcol(0))
    print(len(count_vect.vocabulary_))          # size of the fitted vocabulary
    print(count_vect.vocabulary_.get("theta"))  # column index of the token "theta"
except Exception as exc:
    print("didn't work: " + str(exc))
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
print("\n printing param names")
print(tfidf_transformer._get_param_names())
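
# Equivalent shortcut (sketch): with default settings, TfidfVectorizer combines
# CountVectorizer and TfidfTransformer in a single step. The names tfidf_vect
# and X_train_tfidf_alt are illustrative.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
X_train_tfidf_alt = tfidf_vect.fit_transform(twenty_train.data)
print("X_train_tfidf_alt.shape: " + str(X_train_tfidf_alt.shape))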

# classifier
print("twenty_train.target: ")
print(twenty_train.target[:10])
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

# classify new test documents
docs_new = [
    "God is love", "OpenGL on the GPU is fast", "god I hate computers and GPU"
]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print("%r => %s" % (doc, twenty_train.target_names[category]))
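
# Sketch of a possible follow-up: the same three steps wrapped in a Pipeline and
# scored on the 20 newsgroups test split. The names text_clf, twenty_test and
# predicted_test are illustrative, not part of the example above.
import numpy as np
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf", MultinomialNB()),
])
text_clf.fit(twenty_train.data, twenty_train.target)

twenty_test = fetch_20newsgroups(subset="test", shuffle=True, random_state=42)
predicted_test = text_clf.predict(twenty_test.data)
print("test accuracy: " + str(np.mean(predicted_test == twenty_test.target)))
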
Example #3
# Inspect the data
txtAlex_train.shape
txtAlex_train.data
txtAlex_train[1]

# Vocabulary
txtAlex_vect.vocabulary_
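
# Sketch: vocabulary_ maps each token to its column index; the reverse mapping
# (column index -> token) comes from get_feature_names_out(), assuming a recent
# scikit-learn (>= 1.0).
txtAlex_vect.get_feature_names_out()[:10]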

##########
# TF-IDF #
##########

from sklearn.feature_extraction.text import TfidfTransformer
txtAlex_tfidf = TfidfTransformer()
# View the parameter names
txtAlex_tfidf._get_param_names()

txtAlex_train_tfidf = txtAlex_tfidf.fit_transform(txtAlex_train)
txtAlex_train_tfidf.shape
txtAlex_train_tfidf.data 

txtAlex_train_tfidf[0].data  # view the data of the first "document"

# View the array
txtAlex_vect.vocabulary_
txtAlex_train_tfidf.toarray()

############
# Word2Vec #
############
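
# Minimal sketch, assuming gensim is installed: train a small Word2Vec model on
# a placeholder corpus. raw_texts, tokenized and w2v are illustrative names; in
# practice the corpus would be the same documents used above.
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

raw_texts = ["God is love", "OpenGL on the GPU is fast"]   # placeholder corpus
tokenized = [simple_preprocess(doc) for doc in raw_texts]  # list of token lists

w2v = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=1, workers=4)
w2v.wv["gpu"]                       # dense vector for a token
w2v.wv.most_similar("gpu", topn=3)  # nearest tokens in the embedding space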