def preprocess():
    """Vectorize raw documents and return their term-frequency matrix.

    Loads (data, target) via load_file(), builds a binary bag-of-words
    count matrix, then rescales it with TfidfTransformer(use_idf=False),
    i.e. normalized term frequencies with no IDF weighting.

    Returns the sparse tf matrix. NOTE(review): `target` is loaded but
    not returned — presumably consumed elsewhere; confirm callers.
    """
    data, target = load_file()
    # binary=True: the original passed the string 'true', which only
    # worked because any non-empty string is truthy; the boolean states
    # the intent explicitly.
    count_vectorizer = CountVectorizer(binary=True)
    data = count_vectorizer.fit_transform(data)
    # use_idf=False -> plain normalized term frequency.
    tfidf_data = TfidfTransformer(use_idf=False).fit_transform(data)
    # Dropped the original dead `tfidf_data.toarray()` call: its result
    # was discarded, so it only wasted memory densifying the matrix.
    return tfidf_data
# NOTE(review): the two appends below are the tail of a loop whose
# header (producing `label` and `doc`) lies outside this chunk — the
# original indentation is unrecoverable; confirm against the full file.
label_inds.append(label)
corpus.append(' '.join(doc))

label_inds = np.array(label_inds)

# Bag-of-words counts, then tf-idf weighting.
# (The original created two throwaway TfidfTransformer instances — one
# assigned to `tfidf` and immediately overwritten, one as `transformer`;
# a single fit_transform is sufficient.)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
tfidf = TfidfTransformer().fit_transform(X)

# Random ~80/20 train/test split over documents.
test_split_mask = np.random.rand(len(corpus)) < 0.8
# Densify once: the original called tfidf.toarray() separately for the
# train and test slices, doubling peak memory for the same result.
dense = tfidf.toarray()
train_X = dense[test_split_mask]
train_Y = label_inds[test_split_mask]
test_X = dense[~test_split_mask]
test_Y = label_inds[~test_split_mask]

# KNN
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(train_X, train_Y)
print(clf.score(test_X, test_Y))  # parenthesized print: valid py2 and py3

# SVM
clf = SVC()
clf.fit(train_X, train_Y)
print(clf.score(test_X, test_Y))
soup = BeautifulSoup(text,"lxml") text = soup.get_text() text = text.lower() text = text.translate(trantab) text = tokenize(text) corpus.append(text) df[colname] = corpus return df posts_clean = clean_text(posts) title_tfidf = TfidfTransformer().fit_transform( CountVectorizer().fit_transform(posts_clean['title'])) content_tfidf = TfidfTransformer().fit_transform( CountVectorizer().fit_transform(posts_clean['content'])) X = np.concatenate((title_tfidf.toarray(),content_tfidf.toarray()),axis=1) X = scipy.sparse.csr_matrix(X) y = multi_labels #%% classifiers from sklearn.neural_network import MLPClassifier from sklearn.svm import LinearSVC from sklearn.ensemble import \ RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,\ GradientBoostingClassifier,VotingClassifier from sklearn.linear_model import SGDClassifier, LogisticRegression,\ RidgeClassifier, PassiveAggressiveClassifier from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier from sklearn.neighbors import KNeighborsClassifier
# Read every document's raw text.
content = []
for f in metadata.id:
    # `with` closes each handle promptly — the original used a bare
    # open().read() inside a list comprehension, leaking file handles
    # until garbage collection.
    with open(os.path.join(PATH, 'text', str(f))) as fh:
        content.append(fh.read())

# Go from text to document-term matrix ('vectorize' the data).
vectorizer = CountVectorizer(decode_error='ignore')
X = vectorizer.fit_transform(content)

# Select the k=50 best features by chi-squared score, fitting the
# selector on the training rows only. (The original comment claimed
# "100 best features" while the code uses k=50 — corrected to match.)
select = SelectKBest(chi2, k=50)
select.fit(X[train, :], y[train])
Xs = select.transform(X)  # matrix restricted to the selected features

# Tf-idf weight the selected counts, then densify for Orange.
Xw = TfidfTransformer().fit_transform(Xs)
Xw = Xw.toarray()

# Names of the surviving features.
feature_names = np.array(vectorizer.get_feature_names())[select.get_support()]

# Build an Orange data table — this is not really easy.
features = [Orange.feature.Continuous(x)
            for x in [str(b) for b in list(feature_names)]]
classes = Orange.feature.Discrete("class", values=['0', '1'])
# NOTE(review): `id` shadows the builtin here and, at this point, still
# refers to whatever `id` was bound to before this chunk — the
# reassignment below happens *after* this call. Verify the intended
# ordering against the full file.
domain = Orange.data.Domain(features, classes, id)
feat_list = []
for i in range(Xw.shape[0]):
    # Keep the class label only when y < 3; anything else becomes
    # missing (None).
    feat_list.append(list(Xw[i, :]) + [int(y[i]) if y[i] < 3 else None])
out_data = Orange.data.Table(domain, feat_list)
id = Orange.feature.Descriptor.new_meta_id()
text = soup.get_text() text = text.lower() text = text.translate(trantab) text = tokenize(text) corpus.append(text) df[colname] = corpus return df posts_clean = clean_text(posts) title_tfidf = TfidfTransformer().fit_transform(CountVectorizer().fit_transform( posts_clean['title'])) content_tfidf = TfidfTransformer().fit_transform( CountVectorizer().fit_transform(posts_clean['content'])) X = np.concatenate((title_tfidf.toarray(), content_tfidf.toarray()), axis=1) X = scipy.sparse.csr_matrix(X) y = multi_labels #%% classifiers from sklearn.neural_network import MLPClassifier from sklearn.svm import LinearSVC from sklearn.ensemble import \ RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,\ GradientBoostingClassifier,VotingClassifier from sklearn.linear_model import SGDClassifier, LogisticRegression,\ RidgeClassifier, PassiveAggressiveClassifier from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier from sklearn.neighbors import KNeighborsClassifier
def get_tfidf_dictionary():
    """Build the raw tf-idf matrix over train['title_spilt'] and return it dense.

    norm=None keeps un-normalized tf-idf weights.

    NOTE(review): the original computed `tfidf.toarray()` and discarded
    the result, implicitly returning None despite the getter-style
    name; the dense matrix is now returned. Confirm no caller relied
    on the None return.
    """
    tfidf = TfidfTransformer(norm=None).fit_transform(
        CountVectorizer().fit_transform(train['title_spilt']))
    return tfidf.toarray()
# Load stop words into a set for O(1) membership tests (the original
# accumulated a list, making every `in stop` lookup O(len(stop))).
stop = set()
with open('stop_words.txt', 'r', encoding='utf-8') as f:
    for line in f:
        stop.update(line.split())


def get_data(x):
    """Segment text x with jieba; keep tokens of length >= 2 that are
    not stop words; return them space-joined."""
    kept = [tok for tok in jieba.cut(x) if len(tok) >= 2 and tok not in stop]
    return ' '.join(kept)


textitor = map(get_data, content)  # text preprocessing (lazy)

# Text -> tf-idf matrix.
vectorize = CountVectorizer()
vt = vectorize.fit_transform(textitor)
# NOTE(review): despite the name, `transformer` holds the transformed
# sparse matrix, not the TfidfTransformer instance (kept for
# compatibility with any later use of the name).
transformer = TfidfTransformer().fit_transform(vt)
word = vectorize.get_feature_names()
weight = transformer.toarray()
print(DataFrame(weight, columns=word))
'''
x = sorted([ (n,i.sum()) for n, i in enumerate(weight.T)],key = lambda x : x[1], reverse = True)
for i in range(5):
    print(word[x[i][0]],x[i][1])
'''
print('Result demo array is {}'.format(res_demo.toarray()))
# Result is a 2-d document-term matrix.
# Notice that in the second row there is a 2, and 'bad' is repeated
# twice in that sentence — so the 2 corresponds to the word 'bad'.
print('Feature list: {}'.format(cv_demo.get_feature_names()))

print('The data type of bow matrix {}'.format(type(cv_matrix)))
# BUG FIX: get_shape is a method — the original formatted the bound
# method object instead of the actual shape; it must be called.
print('Shape of the matrix {}'.format(cv_matrix.get_shape()))
print('Size of the matrix is: {}'.format(sys.getsizeof(cv_matrix)))
print(cv.get_feature_names())
print(cv_matrix.toarray())

# Tf-idf-normalize the raw counts.
normal_matrix = TfidfTransformer().fit_transform(cv_matrix)
print(normal_matrix.toarray())
# BUG FIX: same missing-call defect — `.toarray` printed a method repr.
print(normal_matrix.T.toarray())

# Similarity graph: dot products between tf-idf rows (A @ A.T).
res_graph = normal_matrix * normal_matrix.T
# plt.spy(res_graph)
nx_graph = nx.from_scipy_sparse_matrix(res_graph)
nx.draw_circular(nx_graph)
print('Number of edges {}'.format(nx_graph.number_of_edges()))
print('Number of vertices {}'.format(nx_graph.number_of_nodes()))
# plt.show()
print('The memory used by the graph in Bytes is: {}'.format(
    sys.getsizeof(nx_graph)))

# PageRank over the similarity graph (TextRank-style ranking).
ranks = nx.pagerank(nx_graph)
# In[13]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
bow = cv.fit_transform(train['Message'])

# In[14]:
# converting bow to tfidf

# In[15]:
from sklearn.feature_extraction.text import TfidfTransformer

bow = TfidfTransformer().fit_transform(bow)
bow = bow.toarray()
bow = bow.tolist()

# In[16]:
# concatenating length to bow

# In[62]:
# Append each message's Length as one extra feature per row.
# (The original hard-coded range(5572) — the dataset's row count;
# len(bow) keeps this correct for any input size.)
for i in range(len(bow)):
    bow[i].append(train['Length'].iloc[i])

# In[18]:
# splitting train_test
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Toy corpus: four short documents.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# Turn the documents into a term-count matrix (rows = documents,
# columns = vocabulary words, cells = occurrence counts)...
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# ...then rescale the counts into tf-idf weights.
tfidf = TfidfTransformer().fit_transform(X)

# Vocabulary learned from the corpus, in column order.
word = vectorizer.get_feature_names()
print(word)
# Dense views of the raw counts and the tf-idf weights.
print(X.toarray())
print(tfidf.toarray())