def tfidfvec():
    """Fit a TF-IDF model on three pre-tokenized documents and print it.

    Bugs fixed against the original:
      * ``fit_transform(...).shape()`` — ``shape`` is an attribute, not a
        method, so the call raised ``TypeError`` and ``data`` was never a
        matrix (``data.toarray()`` would then fail too).
      * ``TfidfTransformer`` expects a term-count matrix, not raw strings;
        ``TfidfVectorizer`` is the raw-document front end.

    :return: None
    """
    # TfidfVectorizer == CountVectorizer + TfidfTransformer in one step.
    from sklearn.feature_extraction.text import TfidfVectorizer

    c1, c2, c3 = cuted()  # three whitespace-joined token strings
    # print(c1, c2, c3)
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])  # sparse (3, n_features) matrix
    # NOTE(review): sklearn >= 1.0 renames this to get_feature_names_out().
    print(tf.get_feature_names())
    print(data.toarray())
    return None
def tfidfvec():
    """Convert segmented Chinese text to TF-IDF feature values.

    Bug fixed: the original passed raw strings straight to
    ``TfidfTransformer``, which only accepts a term-count matrix;
    ``TfidfVectorizer`` tokenizes raw documents itself.

    :return: None
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    c1, c2, c3 = cutword()  # pre-segmented (space-joined) documents
    print(c1, c2, c3)
    source = [c1, c2, c3]
    tf = TfidfVectorizer()
    data = tf.fit_transform(source)  # sparse (3, n_features) matrix
    # NOTE(review): sklearn >= 1.0 renames this to get_feature_names_out().
    print(tf.get_feature_names())
    print(data.toarray())
def tf_idf_trans_feature_vector():
    """Vectorize pre-processed tokens with TF-IDF and print diagnostics.

    Bugs fixed against the original:
      * ``TfidfTransformer`` accepts neither ``stop_words`` nor
        ``analyzer`` keyword arguments (``TypeError``) — those belong to
        ``TfidfVectorizer``/``CountVectorizer``.
      * ``TfidfTransformer`` has no ``build_analyzer()`` and cannot
        consume raw text; ``TfidfVectorizer`` provides both.

    :return: sparse TF-IDF matrix for ``token_array``
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    token_array = text_processed()
    # NOTE(review): the split result is unused below — kept only so any
    # side effects of split_string_2_data_array() still occur.
    training_token_array, test_token_array = split_string_2_data_array(
        token_array, 0.8)
    print(token_array)
    vectorizer = TfidfVectorizer(stop_words='english', analyzer="word")
    # tokenize and build vocab
    X = vectorizer.fit_transform(token_array)
    analyze = vectorizer.build_analyzer()
    print(analyze("subject is not the case"))
    # summarize the learned vocabulary
    # NOTE(review): sklearn >= 1.0 renames this to get_feature_names_out().
    print(vectorizer.get_feature_names())
    # summarize encoded vectors
    print(X.toarray())
    return X
# NOTE(review): Python 2 script fragment, collapsed onto one line; the
# loop structure that originally enclosed the first statements is not
# visible here, so the code is kept byte-identical. Defects to confirm
# and fix upstream:
#  * ``word.replace(stopword, '')`` and ``word.decode('gbk')`` discard
#    their return values — Python strings are immutable, so both lines
#    are no-ops as written.
#  * ``TfidfTransformer.fit_transform`` expects a term-count matrix, not
#    a list of raw strings, and the class has no ``get_feature_names()``;
#    ``TfidfVectorizer`` is presumably what was intended — TODO confirm.
#  * ``pickle.dump`` targets are opened in text mode ("w"); binary mode
#    ("wb") is required for protocol > 0 and on Python 3.
#  * As a single physical line, everything after the first ``###`` is a
#    comment, so only the first assignment actually executes here.
dealEmail = parseEmail.replace('germani', '') ### append the text to word_data word_data.append(dealEmail) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == 'sara': from_data.append(0) else: from_data.append(1) email.close() print 'word_data[152]', word_data[152] print "emails processed" from_sara.close() from_chris.close() pickle.dump(word_data, open("your_word_data.pkl", "w")) pickle.dump(from_data, open("your_email_authors.pkl", "w")) ### in Part 4, do TfIdf vectorization here from sklearn.feature_extraction.text import TfidfTransformer from nltk.corpus import stopwords sw = stopwords.words('english') transformer = TfidfTransformer(smooth_idf=False) print 'word_data', word_data for word in word_data: for stopword in sw: word.replace(stopword, '') word.decode('gbk') transformer.fit_transform(word_data) num = transformer.get_feature_names() print 'num', len(num)
def tfidf(allsentences):
    """Return the term-frequency matrix (IDF disabled) for raw sentences.

    Bugs fixed against the original:
      * ``TfidfTransformer.fit`` expects a term-count matrix, not raw
        strings — ``TfidfVectorizer`` handles raw documents.
      * ``fit()`` returns the fitted estimator, not a matrix, so the
        original's ``X.toarray()`` raised ``AttributeError``; use
        ``fit_transform`` to actually produce the matrix.

    :param allsentences: iterable of raw text documents
    :return: dense ``(n_docs, n_features)`` TF array
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    # use_idf=False keeps the original intent: plain (normalized) TF only.
    tf_vectorizer = TfidfVectorizer(use_idf=False)
    X = tf_vectorizer.fit_transform(allsentences)
    # NOTE(review): sklearn >= 1.0 renames this to get_feature_names_out().
    print(tf_vectorizer.get_feature_names())
    return X.toarray()
# NOTE(review): script fragment, collapsed onto one physical line (as
# given it begins with '#', so the whole line parses as a comment). The
# final ``Pipeline([...`` statement is truncated at the end of this
# view, so the code is kept byte-identical. Defects to confirm and fix
# upstream:
#  * ``MultinomialNB().fit(X_train_tfidf, twenty_train.target_names)``
#    passes the list of class-name strings as labels; the per-sample
#    label vector is ``twenty_train.target`` — TODO confirm against the
#    fetch_20newsgroups Bunch.
#  * ``tfidf_transformer.get_feature_names()`` — ``TfidfTransformer``
#    has no such method; feature names come from ``count_vect``
#    (``get_feature_names``/``get_feature_names_out``).
# of times each word occurs in each document and finally assign each word an integer id. #Exxtract and Runn9ng algorithm for NB at once ############################## #Extract features from text file from sklearn.feature_extraction.text import CountVectorizer count_vect = CountVectorizer() X_train_counts = count_vect.fit_transform(twenty_train.data) X_train_counts.shape from sklearn.feature_extraction.text import TfidfTransformer tfidf_transformer = TfidfTransformer() X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) X_train_tfidf.shape tfidf_transformer.get_feature_names() ##Running ML Algorithm for naive bayes from sklearn.naive_bayes import MultinomialNB clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target_names) ############################################## #Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows: #Exxtract and Runn9ng algorithm for NB at once from sklearn.pipeline import Pipeline from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfTransformer from sklearn.naive_bayes import MultinomialNB text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),