from sklearn.feature_extraction.text import TfidfVectorizer

def tfidfvec():
    c1, c2, c3 = cuted()
    # print(c1, c2, c3)
    # TfidfVectorizer accepts raw strings and exposes get_feature_names();
    # TfidfTransformer expects a count matrix, and .shape is an attribute,
    # not a method
    tf = TfidfVectorizer()

    data = tf.fit_transform([c1, c2, c3])

    print(tf.get_feature_names())
    print(data.toarray())
    return None
def tfidfvec():
    """
    Chinese text feature extraction
    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    source = [c1, c2, c3]
    # TfidfVectorizer works directly on the space-joined strings
    tf = TfidfVectorizer()
    data = tf.fit_transform(source)
    print(tf.get_feature_names())
    print(data.toarray())
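
# A minimal sketch of the cutword() helper these examples assume: segment
# three Chinese sentences with the jieba tokenizer and join the tokens with
# spaces so the vectorizer can split them. The sample sentences below are
# placeholders, not the original corpus.
import jieba

def cutword():
    con1 = jieba.cut('今天很残酷，明天更残酷')
    con2 = jieba.cut('我们看到的光是几百万年前发出的')
    con3 = jieba.cut('了解事物的含义取决于如何将其联系起来')
    # jieba.cut returns a generator of tokens
    return ' '.join(con1), ' '.join(con2), ' '.join(con3)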
def tf_idf_trans_feature_vector():
    token_array = text_processed()
    training_token_array, test_token_array = split_string_2_data_array(
        token_array, 0.8)
    print(token_array)
    # stop_words, analyzer, and build_analyzer() belong to TfidfVectorizer,
    # not TfidfTransformer
    vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
    # tokenize and build vocab
    X = vectorizer.fit_transform(token_array)
    analyze = vectorizer.build_analyzer()
    print(analyze("subject is not the case"))
    # summarize the learned vocabulary
    print(vectorizer.get_feature_names())
    # summarize encoded vector
    print(X.toarray())
    return X
            dealEmail = parseEmail.replace('germani', '')
            ### append the text to word_data
            word_data.append(dealEmail)
            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
            if name == 'sara':
                from_data.append(0)
            else:
                from_data.append(1)
            email.close()
    print('word_data[152]', word_data[152])
print("emails processed")
from_sara.close()
from_chris.close()

# binary mode is required for pickle
pickle.dump(word_data, open("your_word_data.pkl", "wb"))
pickle.dump(from_data, open("your_email_authors.pkl", "wb"))

### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
sw = stopwords.words('english')
# str.replace returns a new string, so the original stop-word loop discarded
# its result (and the decode('gbk') call did nothing useful); let the
# vectorizer filter stop words itself. TfidfVectorizer also provides
# get_feature_names(), which TfidfTransformer lacks.
transformer = TfidfVectorizer(stop_words=sw, smooth_idf=False)
print('word_data', word_data)
transformer.fit_transform(word_data)

num = transformer.get_feature_names()
print('num', len(num))
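
# Hedged follow-up sketch (not in the original): once fitted, the
# vectorizer's vocabulary_ and idf_ attributes let you look up a token's
# column index and idf weight. The token 'sara' is just an illustrative pick.
vocab = transformer.vocabulary_
if 'sara' in vocab:
    idx = vocab['sara']
    print('sara -> column', idx, 'idf:', transformer.idf_[idx])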
def tfidf(allsentences):
    from sklearn.feature_extraction.text import TfidfVectorizer
    # fit() returns the fitted estimator, not a matrix, so use fit_transform;
    # TfidfVectorizer handles raw sentences and exposes get_feature_names()
    tf_transformer = TfidfVectorizer(use_idf=False)
    X = tf_transformer.fit_transform(allsentences)
    print(tf_transformer.get_feature_names())
    return X.toarray()
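
# Quick usage sketch on a made-up two-sentence corpus:
print(tfidf(['the cat sat on the mat', 'the dog sat on the log']))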
# of times each word occurs in each document and finally assign each word an integer id.
# Extract features and run the NB algorithm at once
##############################
#Extract features from text file
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape


from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape
# the vocabulary lives on the CountVectorizer; TfidfTransformer has no
# get_feature_names()
count_vect.get_feature_names()


## Running the ML algorithm: multinomial naive Bayes
from sklearn.naive_bayes import MultinomialNB
# fit on the integer labels (twenty_train.target), not the category names
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
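
# Hedged sanity check (not in the original excerpt): new documents must pass
# through the same count and tf-idf transforms before predicting.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
for doc, category in zip(docs_new, clf.predict(X_new_tfidf)):
    print('%r => %s' % (doc, twenty_train.target_names[category]))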
##############################################
# Building a pipeline: we can write less code and do all of the above by
# building a pipeline that extracts features and runs NB at once:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),