def get_trainset():
    train = pd.read_csv('labeledTrainData.tsv', delimiter="\t", quoting=3)
    y_train = train['sentiment'][:10000]
    train_data = []
    # for i in xrange(0, len(train['review'])):
    for i in xrange(0, 10000):
        train_data.append(" ".join(review_to_wordlist(train['review'][i])))
    f = open('train_data.pickle', 'wb')
    f.write(pickle.dumps(train_data))
    f.close()
    print "TrainSet Starting!\n"
    from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
    tfv = TFIV(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
               token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf=1, smooth_idf=1,
               sublinear_tf=1, stop_words='english')
    X = tfv.fit_transform(train_data).todense()
    voc = tfv.vocabulary_
    # f = open('X.pickle', 'wb')
    # f.write(pickle.dumps(X))
    # f.close()
    # print 'TrainSet End!\n'
    del train_data
    return X, y_train, voc
def feature_tfidf(train_words, test_words):
    """
    Build TF-IDF feature vectors.
    :param train_words: list of tokenized training documents
    :param test_words: list of tokenized test documents
    :return: (X, X_test) sparse TF-IDF matrices
    """
    # Initialize the TFIV object: drop English stop words, add a 2-gram language model
    tfv = TFIV(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
               token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf=1, smooth_idf=1,
               sublinear_tf=1, stop_words='english')
    # Concatenate the training and test sets so TF-IDF vectorization is done once
    X_all = train_words + test_words
    len_train = len(train_words)
    # This step is a bit slow: fit the vectorizer
    tfv.fit(X_all)
    X_all = tfv.transform(X_all)
    # Split back into training and test parts
    X = X_all[:len_train]
    X_test = X_all[len_train:]
    return X, X_test
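# A minimal usage sketch for feature_tfidf above: the returned sparse matrices
# feed directly into a linear classifier. The LogisticRegression choice and the
# argument names here are illustrative assumptions, not part of the snippet.
from sklearn.linear_model import LogisticRegression

def classify_with_tfidf(train_words, test_words, y_train):
    # Vectorize once over train+test, then fit a simple linear model on the training part.
    X, X_test = feature_tfidf(train_words, test_words)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X, y_train)
    return clf.predict(X_test)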
def kaggle_21feb_svm_max_df_filtering():
    tfv = TFIV(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
               token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf=1, smooth_idf=1,
               sublinear_tf=False, max_df=0.9)
    X_train, X_test, y_train, y_test = train_test_split(raw['x_raw'], raw['y'],
                                                        test_size=0.2, random_state=42)
    lsvc = LinearSVC(max_iter=100000)
    cp1 = Pipeline([('tfidf', tfv), ('lsvc', lsvc)])
    label = 'svm'
    scores = cross_val_score(cp1, X_train, y_train, cv=5, scoring='f1_micro')
    cp1.fit(X_train, y_train)
    mean_f1 = f1_score(y_test, cp1.predict(X_test), average='micro')
    print("Accuracy: %0.4f (+/- %0.4f) [%s] | Test: %0.4f"
          % (scores.mean(), scores.std(), label, mean_f1))
    write_submission('21feb-svm-max_df_filtering-sub.csv', raw['X_Ids'], cp1.predict(raw['X_raw']))
def pred(test_idx, test_doc, model, voc):
    test_data = []
    for i in xrange(len(test_doc)):
        test_data.append(" ".join(review_to_wordlist(test_doc[i])))
    # Reuse the training vocabulary so the test matrix has the same columns as training
    tfv = TFIV(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
               token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf=1, smooth_idf=1,
               sublinear_tf=1, stop_words='english', vocabulary=voc)
    X_test = tfv.fit_transform(test_data).todense()
    X_test = np.array(X_test)
    pred = model.predict(X_test)
    test_idx = np.array(test_idx)
    # Indices of documents predicted positive (1) and negative (0)
    like_idx = list(np.array(range(len(pred)))[pred == 1])
    hate_idx = list(np.array(range(len(pred)))[pred == 0])
    return like_idx, hate_idx
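# A hedged sketch of an alternative to re-fitting with vocabulary=voc: keep the
# fitted vectorizer itself so the training idf weights are reused at prediction
# time. The pickle path 'tfv.pickle' is hypothetical; get_trainset above only
# stores vocabulary_, not the fitted TFIV object.
import pickle

def pred_with_saved_vectorizer(test_doc, model, vectorizer_path='tfv.pickle'):
    with open(vectorizer_path, 'rb') as f:
        tfv = pickle.load(f)  # fitted TfidfVectorizer saved during training (hypothetical)
    test_data = [" ".join(review_to_wordlist(d)) for d in test_doc]
    X_test = tfv.transform(test_data)  # transform only; idf comes from the training corpus
    return model.predict(X_test)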
def train_2(labelList=[], maxLen=None, kFold=0):
    # Read the dataset with pandas
    all_data = pd.read_csv('../data/data_five_labels.csv')
    all_data.columns = ["disease", "position", "label", "title", "abstract", "text"]
    # Convert the data into word lists
    x_data = []
    y_data = []
    # Label counts: {'症状': 24697, '病因': 8472, '检查': 4191, '鉴别': 1105, '治疗': 25620, '饮食护理': 10522, '预防': 7246, '并发症': 2200}
    for i in range(0, len(all_data['title'])):
        if all_data["label"][i] not in labelList:  # skip the labels listed in labelList (those with too little data)
            x_data.append(all_data["title"][i])
            y_data.append(all_data["label"][i])
    stopwords = []
    # Build the stop-word list
    with open("../data/stopwords.txt", "r", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    # Initialize the TFIV object: remove stop words, use 1- to 3-gram features
    # (max_df is given as a float so it is treated as a proportion of documents)
    tfv = TFIV(min_df=0, max_df=1.0, max_features=maxLen, strip_accents='unicode', analyzer='word',
               token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', stop_words=stopwords, ngram_range=(1, 3))
    x_all = []
    y_all = []
    jieba.load_userdict("../data/diseaseDic.txt")
    jieba.load_userdict("../data/symptomDic.txt")
    print("Segmenting words and removing stop words...")
    for i, text in enumerate(x_data):
        print("Processing article %d..." % (i + 1))
        try:
            x_all.append(" ".join(jieba.cut(seg_sentence(text))))
            y_all.append(y_data[i])
        except Exception:
            pass
    print("Initial number of labels: %d" % len(y_data))
    print("Number of labels after segmentation and stop-word removal: %d" % len(y_all))
    print("Extracting feature terms...")
    tfv.fit(x_all)
    feature_list = tfv.get_feature_names()
    # Write the feature term list to a txt file
    f = open("../data/feature_names_3.txt", "w", encoding="utf-8")
    for x in feature_list:
        f.write(x + "\n")
    f.close()
    print("Mapping documents to vectors...")
    X_all = tfv.transform(x_all)
    clf = svm.SVC(kernel='linear', C=1)
    print("Training...")
    scores = cross_val_score(clf, X_all, y_all, cv=kFold)
    return scores
def train_1():
    # Read the dataset with pandas
    all_data = pd.read_csv('../data/articles.csv')
    all_data.columns = ["disease", "position", "label", "title", "abstract", "text"]
    # Get the label of every document
    y_data = all_data['label']
    print("Initial number of labels: %d" % len(y_data))
    # Convert the data into word lists
    x_data = []
    for i in range(0, len(all_data['title'])):
        x_data.append(all_data["title"][i])
    stopwords = []
    # Build the stop-word list
    with open("../data/stopwords.txt", "r", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    # Initialize the TFIV object: remove stop words, use 1- to 3-gram features
    # (max_df is given as a float so it is treated as a proportion of documents)
    tfv = TFIV(min_df=0, max_df=1.0, max_features=30000, strip_accents='unicode', analyzer='word',
               token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', stop_words=stopwords, ngram_range=(1, 3))
    x_all = []
    y_all = []
    jieba.load_userdict("../data/diseaseDic.txt")
    jieba.load_userdict("../data/symptomDic.txt")
    print("Segmenting words and removing stop words...")
    for i, text in enumerate(x_data):
        print("Processing article %d..." % (i + 1))
        try:
            x_all.append(" ".join(jieba.cut(seg_sentence(text))))
            y_all.append(y_data[i])
        except Exception:
            pass
    print("Number of labels after segmentation and stop-word removal: %d" % len(y_all))
    print("Extracting feature terms...")
    tfv.fit(x_all)
    feature_list = tfv.get_feature_names()
    # Write the feature term list to a txt file
    f = open("../data/feature_names_3.txt", "w", encoding="utf-8")
    for x in feature_list:
        f.write(x + "\n")
    f.close()
    print("Mapping documents to vectors...")
    X_all = tfv.transform(x_all)
    clf = svm.SVC(kernel='linear', C=1)
    print("Training...")
    scores = cross_val_score(clf, X_all, y_all, cv=5)
    print(scores)
def data_tfidf2(x):
    tfv = TFIV(
        min_df=3, max_df=0.5, max_features=None,
        strip_accents='unicode', analyzer='word',
        token_pattern=r'\w{1,}', ngram_range=(1, 2),
        use_idf=1, smooth_idf=1, sublinear_tf=1,
    )
    tfv.fit(x)
    x = tfv.transform(x)
    return x
def main(star_rating=1):
    # start = datetime.strptime(start_time, '%m/%d/%Y')
    # end = datetime.strptime(end_time, '%m/%d/%Y')
    review = data.review_body[data.star_rating == star_rating]  # filter reviews with the requested star rating
    stop_words = load_stopword()
    # Remove symbols, digits, and stop words
    texts = [[word for word in line.strip().lower().split() if word not in stop_words]
             for line in review]
    temp = []
    for i in texts:
        temp.append(' '.join(i))
    print('Word preprocessing finished!')

    '''Vectorize the data with TF-IDF'''
    n_features = 1000  # limit the number of features
    tf_vectorizer = TFIV(strip_accents='unicode', max_features=n_features,
                         stop_words='english', max_df=0.5, min_df=10)  # build the TF-IDF vectorizer
    tf = tf_vectorizer.fit_transform(temp)  # vectorize every document with the model

    '''LDA'''
    from sklearn.decomposition import LatentDirichletAllocation  # import LDA
    n_topics = 5  # number of topics
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)  # build the LDA model
    lda.fit(tf)  # fit the data

    def print_top_words(model, feature_names, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))
            print()

    # Show the 20 highest-weighted words for each topic
    n_top_words = 20
    tf_feature_names = tf_vectorizer.get_feature_names()  # get the names of those 1,000 features
    print_top_words(lda, tf_feature_names, n_top_words)
def kaggle_21feb_ensemble_sub():
    tfv = TFIV(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
               token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf=1, smooth_idf=1,
               sublinear_tf=False, max_df=0.9)
    X_train, X_test, y_train, y_test = train_test_split(raw['x_raw'], raw['y'],
                                                        test_size=0.2, random_state=42)
    lr = LogisticRegression(solver='lbfgs')
    lsvc = LinearSVC()
    nb = MultinomialNB()
    cp1 = Pipeline([('tfidf', tfv), ('lsvc', lsvc)])
    cp2 = Pipeline([('tfidf', tfv), ('lr', lr)])
    cp3 = Pipeline([('tfidf', tfv), ('nb', nb)])
    # Hard voting: majority rules, Soft voting: weighted average probabilities
    eclf = VotingClassifier(estimators=[('lsvc', cp1), ('lr', cp2), ('nb', cp3)], voting='hard')
    label = 'ensemble'
    scores = cross_val_score(eclf, X_train, y_train, cv=5, scoring='f1_micro')
    eclf.fit(X_train, y_train)
    mean_f1 = f1_score(y_test, eclf.predict(X_test), average='micro')
    print("Accuracy: %0.4f (+/- %0.4f) [%s] | Test: %0.4f"
          % (scores.mean(), scores.std(), label, mean_f1))
    write_submission('21feb-ensemble-sub.csv', raw['X_Ids'], eclf.predict(raw['X_raw']))
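# A hedged sketch of the soft-voting variant mentioned in the comment inside
# kaggle_21feb_ensemble_sub above. voting='soft' averages predict_proba, which
# LinearSVC does not provide, so the SVM is wrapped in CalibratedClassifierCV;
# that wrapper and the parameters below are assumptions, not part of the
# original code. The sketch reuses the module-level imports used by the snippet above.
from sklearn.calibration import CalibratedClassifierCV

def kaggle_soft_voting_sketch():
    tfv_soft = TFIV(min_df=3, strip_accents='unicode', analyzer='word',
                    token_pattern=r'\w{1,}', ngram_range=(1, 2), max_df=0.9)
    calibrated_svc = CalibratedClassifierCV(LinearSVC(max_iter=100000))
    eclf_soft = VotingClassifier(
        estimators=[
            ('lsvc', Pipeline([('tfidf', tfv_soft), ('clf', calibrated_svc)])),
            ('lr', Pipeline([('tfidf', tfv_soft), ('clf', LogisticRegression(solver='lbfgs'))])),
            ('nb', Pipeline([('tfidf', tfv_soft), ('clf', MultinomialNB())])),
        ],
        voting='soft')
    return eclf_soft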
def tfidf(self, ngram=2):
    tfv = TFIV(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
               token_pattern=r'\w{1,}', ngram_range=(1, ngram), use_idf=1, smooth_idf=1,
               sublinear_tf=1, stop_words='english')
    X_all = self.traindata + self.testdata
    lentrain = len(self.traindata)
    tfv.fit(X_all)
    X_all = tfv.transform(X_all)
    self.X = X_all[:lentrain]
    self.X_test = X_all[lentrain:]
    print("vectorization data size: ", self.X.shape)
    return self.X, self.y_train, self.X_test, self.y_test
def tfidf(self, ngram=2):
    tfv = TFIV(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
               token_pattern=r'\w{1,}', ngram_range=(1, ngram), use_idf=1, smooth_idf=1,
               sublinear_tf=1, stop_words='english')
    # Combine both to fit the TFIDF vectorization.
    X_all = self.traindata + self.testdata
    lentrain = len(self.traindata)
    tfv.fit(X_all)  # This is the slow part!
    X_all = tfv.transform(X_all)
    self.X = X_all[:lentrain]  # Separate back into training and test sets.
    self.X_test = X_all[lentrain:]
    print("vectorization data size: ", self.X.shape)
    return self.X, self.y_train, self.X_test
def train(maxFeature=None, minDf=0.0, maxDf=1.0, ngram=None, classifier=svm.SVC(kernel='linear')):
    # Read the training and test csv files with pandas
    train = pd.read_csv('../data/train_0.8.csv')
    test = pd.read_csv('../data/test_0.2.csv')
    train.columns = ["disease", "position", "label", "title", "abstract", "text"]
    test.columns = ["disease", "position", "label", "title", "abstract", "text"]
    # Get the label of every document
    y_train = train['label']
    y_test = test['label']
    # Convert the training and test data into word lists
    train_data = []
    for i in range(0, len(train['title'])):
        train_data.append(train["title"][i] + str(train["abstract"][i]) + str(train["text"][i]))
        # train_data.append(train["abstract"][i]+train['text'][i])
    test_data = []
    for i in range(0, len(test['title'])):
        test_data.append(test["title"][i] + str(test["abstract"][i]) + str(test["text"][i]))
        # test_data.append(train["abstract"][i]+test['text'][i])
    stopwords = []
    with open("../data/stopwords.txt", "r", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    # with open("../data/diseaseDic.txt","r",encoding="utf-8") as f:
    #     lines = f.readlines()
    #     for line in lines:
    #         stopwords.append(line.strip())
    # with open("../data/symptomDic.txt","r",encoding="utf-8") as f:
    #     lines = f.readlines()
    #     for line in lines:
    #         stopwords.append(line.strip())
    # Initialize the TFIV object: remove stop words; the n-gram range is configurable
    tfv = TFIV(min_df=minDf, max_df=maxDf, max_features=maxFeature, strip_accents='unicode',
               analyzer='word', token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', ngram_range=ngram,
               stop_words=stopwords)
    # Concatenate the training and test sets so TF-IDF vectorization is done once
    X_all = train_data + test_data
    tokenized_corpus = []
    jieba.load_userdict("../data/diseaseDic.txt")
    jieba.load_userdict("../data/symptomDic.txt")
    print("Segmenting words and removing stop words...")
    for text in X_all:
        tokenized_corpus.append(" ".join(jieba.cut(seg_sentence(text))))
    len_train = len(train_data)
    # Vectorize all the data; this step is fairly slow
    print("Extracting feature terms...")
    tfv.fit(tokenized_corpus)
    feature_list = tfv.get_feature_names()
    f = open("../data/feature_names_3.txt", "w", encoding="utf-8")
    for x in feature_list:
        f.write(x + "\n")
    f.close()
    print("Mapping documents to vectors...")
    X_all = tfv.transform(tokenized_corpus)
    with open("../data/vectors.txt", "w", encoding="utf-8") as file:
        file.write(str(X_all))
    # Split back into training and test parts
    X = X_all[:len_train]
    X_test = X_all[len_train:]

    # Results function: report precision, recall, F1, and accuracy in turn
    def calculate_result(actual, pred):
        m_precision = metrics.precision_score(actual, pred, average=None)
        m_recall = metrics.recall_score(actual, pred, average=None)
        accuracy = metrics.accuracy_score(pred, actual)
        print('SVM accuracy on the test set:', accuracy)
        print('SVM precision on the test set:', m_precision)
        print('SVM recall on the test set:', m_recall)
        print('SVM F1 score on the test set:', metrics.f1_score(actual, pred, average=None))
        return accuracy

    # Train the SVM classifier
    print("Training the classifier...")
    svclf = classifier
    svclf.fit(X, y_train)
    # # Save the model
    # joblib.dump(svclf, "P_svm_train_model.m")
    print("Predicting...")
    pred = svclf.predict(X_test)
    acc = calculate_result(y_test, pred)
    return acc
print(len(labels))
print("File Reading Finished")

# In[105]:

print("Defining TFIDF Vectorizer")
tfIdfVec = TFIV(
    min_df=3,                  # When building the vocabulary, ignore terms with a document frequency strictly lower than this threshold
    max_features=10000,        # If not None, build a vocabulary of only the top max_features terms ordered by term frequency across the corpus
    strip_accents='unicode',   # Remove accents during preprocessing. 'ascii' is fast but only handles characters with a direct ASCII mapping;
                               # 'unicode' is slightly slower but works on any characters
    analyzer='word',           # Whether features are word or character n-grams; can also be a callable
    token_pattern=r'\w{1,}',   # Regular expression defining what constitutes a "token"; only used if analyzer == 'word'
    ngram_range=(1, 5),        # Lower and upper boundary of the range of n-values for the n-grams to be extracted
    use_idf=1,                 # Enable inverse-document-frequency reweighting
    smooth_idf=1,              # Smooth idf weights by adding one to document frequencies
    sublinear_tf=1,            # Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)
    stop_words='english'       # 'english' is currently the only supported string value
)

# In[106]:

print("Fitting")
tfIdfVec.fit(clean_data)  # Learn vocabulary and idf from the training set.
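# A hedged continuation sketch: the notebook cell above stops right after fit(),
# so this transform step is an assumption about what comes next, reusing the
# same clean_data that was just fitted.
print("Transforming")
tfIdfFeatures = tfIdfVec.transform(clean_data)  # document-term matrix built from the learned vocabulary and idf
print("TF-IDF matrix shape:", tfIdfFeatures.shape)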
    trainSetLabel.append(content[i])

for i in os.listdir('./test/'):
    words = []
    with open('./test/%s' % i, encoding='gb18030', errors='ignore') as file:
        mail = file.read()
        words = ' '.join(analyse.extract_tags(mail, topK=20))
    testList.append(words)
    i = sub(r'\.txt$', '', i)  # strip the .txt extension to get the mail id
    idList.append(i)

allData = trainList + testList
lenTrain = len(trainList)
ft = TFIV()
featureVector = ft.fit_transform(allData).toarray()
# Separate back into training and dev sets.
trainSet = featureVector[:lenTrain]
testSet = featureVector[lenTrain:]
# Train naive Bayes model.
mnb = MultinomialNB()
MNBResult = mnb.fit(trainSet, trainSetLabel).predict(testSet)
outPut = pd.DataFrame(data={
    "classification": MNBResult,
    "id": idList,
    "content": testList
})
    seg_list = jieba.cut(stri, use_paddle=True)
    unlabeled_text.append(' '.join(list(seg_list)))
for stri in test_origin_text:
    seg_list = jieba.cut(stri, use_paddle=True)
    test_text.append(' '.join(list(seg_list)))

train_label = []
for i in range(0, 1000):
    train_label.append(" ".join(review_to_wordlist(train_origin['label'][i])))

# min_df=3 drops low-frequency terms; the unit of analysis is the word; ngram_range is enabled;
# use_idf turns on IDF reweighting, smooth_idf=1 smooths the idf weights,
# and sublinear_tf=1 replaces tf with 1 + log(tf)
tfv = TFIV(min_df=3, strip_accents='unicode', analyzer='word', ngram_range=(1, 2),
           use_idf=1, smooth_idf=1, sublinear_tf=1)
# Note: the vectorizer is fitted on the training set only
tfv.fit(train_text)
X_all = train_text + unlabeled_text + test_text
len_train = len(train_text)
len_unlabeled = len(unlabeled_text)
X_all = tfv.transform(X_all)
# Split back into training and test parts
# (slices are half-open: [start, end))
train_X = X_all[:len_train]
unlabeled_X = X_all[len_train:len_train + len_unlabeled]
def train(maxFeature=None, minDf=0.0, maxDf=1.0, ngram=None, classifier=svm.SVC(kernel='linear')):
    # Read the training and test csv files with pandas
    train = pd.read_csv('../data/train.csv')
    test = pd.read_csv('../data/test.csv')
    train.columns = ["disease", "position", "label", "title", "abstract", "text"]
    test.columns = ["disease", "position", "label", "title", "abstract", "text"]
    word_vec = {}
    with open("../data/word_vector.csv", "r", encoding="utf-8") as f:
        for line in csv.reader(f):
            word_vec.update({line[0]: eval(line[1])})
    # Get the label of every document
    y_train = train['label']
    y_test = test['label']
    # Convert the training and test data into word lists
    train_data = []
    for i in range(0, len(train['title'])):
        train_data.append(train["title"][i] + str(train["abstract"][i]) + str(train["text"][i]))
        # train_data.append(train["abstract"][i]+train['text'][i])
    test_data = []
    for i in range(0, len(test['title'])):
        test_data.append(test["title"][i] + str(test["abstract"][i]) + str(test["text"][i]))
        # test_data.append(train["abstract"][i]+test['text'][i])
    stopwords = []
    with open("../data/stopwords.txt", "r", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    # Initialize the TFIV object: remove stop words; the n-gram range is configurable
    tfv = TFIV(min_df=minDf, max_df=maxDf, max_features=maxFeature, strip_accents='unicode',
               analyzer='word', token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', ngram_range=ngram,
               stop_words=stopwords)
    # Concatenate the training and test sets so TF-IDF vectorization is done once
    X_all = train_data + test_data
    tokenized_corpus = []
    # jieba.load_userdict("../data/diseaseDic.txt")
    # jieba.load_userdict("../data/symptomDic.txt")
    for text in X_all:
        tokenized_corpus.append(" ".join(jieba.cut(seg_sentence(text))))
    len_train = len(train_data)
    # Vectorize all the data; this step is fairly slow
    tfv.fit(tokenized_corpus)
    feature_list = tfv.get_feature_names()
    f = open("../data/feature_names_2.txt", "w", encoding="utf-8")
    for x in feature_list:
        f.write(x + "\n")
    f.close()
    # Pre-trained word vectors (loaded here; the word_vec dict above is what is actually used below)
    model = gensim.models.KeyedVectors.load_word2vec_format("../data/sgns.wiki.bigram-char", encoding="utf-8")
    X_all = tfv.transform(tokenized_corpus)
    all = []
    for sentence in X_all:
        sentenceWeight = np.array([0] * 300)
        num = 0
        sentence = sentence.toarray().tolist()[0]
        for i, weight in enumerate(sentence):
            if weight != 0:  # weight the word vector by its tf-idf value
                num += 1
                word = feature_list[i]
                if word in word_vec.keys():
                    wordMatrix = np.array(word_vec[word])
                else:
                    wordMatrix = np.array([0] * 300)
                sentenceWeight = np.add(sentenceWeight, weight * wordMatrix)
        if num != 0:
            all.append((sentenceWeight / num).tolist())
        else:
            all.append([0] * 300)
    with open("../data/vectors.txt", "w", encoding="utf-8") as file:
        file.write(str(X_all))
    # Split back into training and test parts
    X = np.array(all[:len_train])
    X_test = np.array(all[len_train:])

    # Results function: report precision, recall, F1, and accuracy in turn
    def calculate_result(actual, pred):
        m_precision = metrics.precision_score(actual, pred, average=None)
        m_recall = metrics.recall_score(actual, pred, average=None)
        accuracy = metrics.accuracy_score(pred, actual)
        with open("../data/record.txt", "a", encoding="utf-8") as f:
            f.write('SVM accuracy on the test set: ' + str(accuracy) + '\n')
            f.write('SVM precision on the test set: ' + str(m_precision) + '\n')
            f.write('SVM recall on the test set: ' + str(m_recall) + '\n')
            f.write('SVM F1 score on the test set: ' + str(metrics.f1_score(actual, pred, average=None)) + '\n')
        return accuracy

    # Train the SVM classifier
    svclf = classifier
    svclf.fit(X, y_train)
    # # Save the model
    # joblib.dump(svclf,"P_svm_train_model.m")
    pred = svclf.predict(X_test)
    acc = calculate_result(y_test, pred)
    return acc
def main():
    # Load labeled training data
    train = pd.read_csv('../../data/labeledtrainData.tsv', header=0, delimiter="\t", quoting=3)
    # The column on which we will predict
    predCol = train['sentiment']
    pickle.dump(predCol, open("../../picks/predCol", "wb"))
    # List for storing cleaned-up training data
    trainData = []
    # Loop counter
    numRevs = len(train['review'])
    for i in range(0, numRevs):
        if ((i + 1) % 2000 == 0):
            print("Train Review %d of %d\n" % (i + 1, numRevs))
        # Clean each review. Please look at the definition of the sentimentToWordlist function in the preproc.py script
        trainData.append(" ".join(preProc.sentimentToWordlist(train['review'][i])))
    # Load test data
    test = pd.read_csv('../../data/testData.tsv', header=0, delimiter="\t", quoting=3)
    # List for storing cleaned-up test data
    testdata = []
    # Loop counter
    numRevs = len(test['review'])
    for i in range(0, numRevs):
        if ((i + 1) % 2000 == 0):
            print("Test Review %d of %d\n" % (i + 1, numRevs))
        # Clean each review. Please look at the definition of the sentimentToWordlist function in the preproc.py script
        testdata.append(" ".join(preProc.sentimentToWordlist(test['review'][i])))
    # Define/build TfidfVectorizer
    print("Defining TFIDF Vectorizer")
    tfIdfVec = TFIV(
        min_df=3,                  # When building the vocabulary, ignore terms with a document frequency strictly lower than this threshold
        max_features=3000,         # If not None, build a vocabulary of only the top max_features terms ordered by term frequency across the corpus
        strip_accents='unicode',   # Remove accents during preprocessing. 'ascii' is fast but only handles characters with a direct ASCII mapping;
                                   # 'unicode' is slightly slower but works on any characters
        analyzer='word',           # Whether features are word or character n-grams; can also be a callable
        token_pattern=r'\w{1,}',   # Regular expression defining what constitutes a "token"; only used if analyzer == 'word'
        ngram_range=(1, 5),        # Lower and upper boundary of the range of n-values for the n-grams to be extracted
        use_idf=1,                 # Enable inverse-document-frequency reweighting
        smooth_idf=1,              # Smooth idf weights by adding one to document frequencies
        sublinear_tf=1,            # Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)
        stop_words='english'       # 'english' is currently the only supported string value
    )
    pickle.dump(tfIdfVec, open("../../picks/vectorizer.pkl", "wb"))
    combineData = trainData + testdata  # Combine both to fit the TFIDF vectorization.
    trainLen = len(trainData)
    print("Fitting")
    tfIdfVec.fit(combineData)  # Learn vocabulary and idf from the training set.
    print("Transforming")
    # Transform documents to a document-term matrix, using the vocabulary and document frequencies (df) learned by fit (or fit_transform).
    combineData = tfIdfVec.transform(combineData)
    pickle.dump(combineData, open("../../picks/transformedData.pkl", "wb"))
    print("Fitting and transforming done")
    trainAfterFit = combineData[:trainLen]  # Separate back into training and test sets.
    pickle.dump(trainAfterFit, open("../../picks/fittedTrainData.pkl", "wb"))
    testAfterFit = combineData[trainLen:]
    pickle.dump(testAfterFit, open("../../picks/fittedTestData.pkl", "wb"))
# coding=utf-8
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
from codes.new_model import seg_sentence
import jieba
# import scipy.sparse.csr.csr_matrix

tfv = TFIV(strip_accents='unicode', analyzer='word',
           token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', stop_words=None)
tfv.fit([
    " ".join(jieba.cut("SVM分类模型在测试集上的准确率")),
    " ".join(jieba.cut("本文为博主原创文章,未经博主允许不得转载")),
    " ".join(jieba.cut("【word2vec实例2】加载模型"))
])
print(tfv.get_feature_names())
# print(tfv.transform([" ".join(jieba.cut("SVM分类模型在测试集上的准确率")), " ".join(jieba.cut("本文为博主原创文章,未经博主允许不得转载")), " ".join(jieba.cut("【word2vec实例2】加载模型"))]))

all = tfv.transform([
    " ".join(jieba.cut("SVM分类模型在测试集上的准确率")),
    " ".join(jieba.cut("本文为博主原创文章,未经博主允许不得转载")),
    " ".join(jieba.cut("【word2vec实例2】加载模型")),
    "",
    "测试"
])
print(all)
for x in all:
    x = x.toarray().tolist()[0]
    for xx in x:
        if xx != 0:
            print(x.index(xx))
# print(all[0])
# print(all.toarray().tolist())
# for x in all:
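# A small hedged alternative to the dense .index() loop above: sparse rows expose
# their non-zero column indices directly, which avoids the dense conversion and
# avoids .index() returning the wrong position when two columns share a weight.
for row in all:
    nz_cols = row.nonzero()[1]  # column indices of the non-zero tf-idf entries in this row
    print(nz_cols.tolist())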
test_data = []
for i in range(0, len(test['review'])):
    test_data.append(" ".join(review_to_wordlist(test['review'][i])))
print(train_data[0])  # with all this stuff... a full 2,000 bytes for the first review

##################################################################
## Feature engineering: now that the data is in hand, we need to extract discriminative features from it.
# For example, the word2vec approach suggested on the Kaggle competition page is one way to map text into a numeric feature space.
# Here we use TF-IDF (term frequency - inverse document frequency) vectors: each movie review is converted into one TF-IDF vector.
# TF-IDF is a statistic that measures how important a word (or n-gram) is to one document in a collection or corpus.
# A term's weight grows with how often it appears in the document, but shrinks with how often it appears across the corpus.
# On top of single words we also extend to a 2-gram language model (a later post will introduce this for anyone unfamiliar with it).
# You could add 3-gram and 4-gram models as well; a single machine runs out of memory, so we make do with 2-grams for now.
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
# Initialize the TFIV object and add a 2-gram language model
tfv = TFIV(min_df=3, token_pattern=r'\w{1,}', ngram_range=(1, 2), sublinear_tf=1)  # (still not entirely sure what token_pattern does here)
X_all, len_train = train_data + test_data, len(train_data)
print(len(X_all))  # 50000; concatenate the training and test sets for TF-IDF vectorization
X_all = tfv.fit_transform(X_all)  # This step is a bit slow, go take a break...; ./l2.py starts directly from the step after this one
X, x_test = X_all[:len_train], X_all[len_train:]  # split back into training and test parts

##################################################################
## Multinomial naive Bayes vs. logistic regression
# Now that we have the features, it is time to build models. Let's build both a naive Bayes and a logistic regression classifier so we can compare them.
# "talk is cheap, I'll show you the code"
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.model_selection import cross_val_score
import numpy as np

print("Multinomial naive Bayes 20-fold cross-validation score: ",
      np.mean(cross_val_score(MNB(), X, y_train, cv=20)))
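# A hedged sketch of the logistic-regression side of the comparison promised in
# the comments above; the original snippet is cut off after the naive Bayes
# cross-validation, so the C value and solver here are assumptions.
from sklearn.linear_model import LogisticRegression as LR

print("Logistic regression 20-fold cross-validation score: ",
      np.mean(cross_val_score(LR(C=1.0, solver='liblinear'), X, y_train, cv=20)))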
def train_3(labelList=[], maxLen=None):
    # Read the dataset with pandas
    all_data = pd.read_csv('../data/data_five_labels.csv')
    all_data.columns = ["disease", "position", "label", "title", "abstract", "text"]
    # Convert the data into word lists
    x_data = []
    y_data = []
    # Label counts: {'症状': 24697, '病因': 8472, '检查': 4191, '鉴别': 1105, '治疗': 25620, '饮食护理': 10522, '预防': 7246, '并发症': 2200}
    for i in range(0, len(all_data['title'])):
        if all_data["label"][i] in labelList:  # keep only the labels in labelList (dropping those with too little data)
            x_data.append(all_data["title"][i])
            y_data.append(all_data["label"][i])
    stopwords = []
    # Build the stop-word list
    with open("../data/stopwords.txt", "r", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    # Initialize the TFIV object: remove stop words, use 1- to 3-gram features
    # (max_df is given as a float so it is treated as a proportion of documents)
    tfv = TFIV(min_df=0, max_df=1.0, max_features=maxLen, strip_accents='unicode', analyzer='word',
               token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', stop_words=stopwords, ngram_range=(1, 3))
    x_all = []
    y_all = []
    jieba.load_userdict("../data/diseaseDic.txt")
    jieba.load_userdict("../data/symptomDic.txt")
    print("Segmenting words and removing stop words...")
    for i, text in enumerate(x_data):
        print("Processing article %d..." % (i + 1))
        try:
            x_all.append(" ".join(jieba.cut(seg_sentence(text))))
            y_all.append(y_data[i])
        except Exception:
            pass
    print("Initial number of labels: %d" % len(y_data))
    print("Number of labels after segmentation and stop-word removal: %d" % len(y_all))
    print("Extracting feature terms...")
    tfv.fit(x_all)
    feature_list = tfv.get_feature_names()
    # Write the feature term list to a txt file
    f = open("../data/feature_names_3.txt", "w", encoding="utf-8")
    for x in feature_list:
        f.write(x + "\n")
    f.close()
    print("Mapping documents to vectors...")
    X_all = tfv.transform(x_all)
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=0)
    print("Training...")
    clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)

    # Results function: report precision, recall, F1, and accuracy in turn
    def calculate_result(actual, pred):
        m_precision = metrics.precision_score(actual, pred, average=None)
        m_recall = metrics.recall_score(actual, pred, average=None)
        accuracy = metrics.accuracy_score(pred, actual)
        print('SVM accuracy on the test set:', accuracy)
        print('SVM precision on the test set:', m_precision)
        print('SVM recall on the test set:', m_recall)
        print('SVM F1 score on the test set:', metrics.f1_score(actual, pred, average=None))
        return accuracy

    pred = clf.predict(X_test)
    acc = calculate_result(y_test, pred)
    return acc
    # .join() joins the strings inside the parentheses with a space
    print i

test_data = []
for i in xrange(0, len(test['review'])):
    print i, len(test['review'])
    test_data.append(" ".join(review_to_wordlist(test['review'][i])))
endTime = time.clock()
print train_data[0]
print "%d s" % (endTime - startTime)

from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
tfv = TFIV(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
           token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf=1, smooth_idf=1,
           sublinear_tf=1, stop_words='english')
X_all = train_data + test_data
len_train = len(train_data)
tfv.fit(X_all)
X_all = tfv.transform(X_all)
# Split back into training and test sets
X = X_all[:len_train]
X_test = X_all[len_train:]

from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.cross_validation import cross_val_score
model_NB = MNB()
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV

from processing import data_no_features

raw = data_no_features()

tfv = TFIV(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
           token_pattern=r'\w{1,}', ngram_range=(1, 2), sublinear_tf=False, max_df=0.9)

X_train, X_test, y_train, y_test = train_test_split(raw['x_raw'], raw['y'],
                                                    test_size=0.2, random_state=42)

lr = LogisticRegression(solver='lbfgs')
lsvc = LinearSVC()
nb = MultinomialNB()
dt = DecisionTreeClassifier()

cp1 = Pipeline([('tfidf', tfv), ('lsvc', lsvc)])
def train(maxFeature=None, minDf=0.0, maxDf=1.0, ngram=None, classifier=svm.SVC(kernel='linear'), index=None):
    # Read the training and test csv files with pandas
    train = pd.read_csv('../data/train.csv')
    test = pd.read_csv('../data/test.csv')
    train.columns = ["disease", "position", "label", "title", "abstract", "text"]
    test.columns = ["disease", "position", "label", "title", "abstract", "text"]
    word_vec = {}
    with open("../data/word_vector.csv", "r", encoding="utf-8") as f:
        for line in csv.reader(f):
            word_vec.update({line[0]: eval(line[1])})
    # Get the label of every document
    y_train = train['label']
    y_test = test['label']
    # Convert the training and test data into word lists
    train_data = []
    for i in range(0, len(train['title'])):
        train_data.append(train["title"][i] + str(train["abstract"][i]) + str(train["text"][i]))
        # train_data.append(train["abstract"][i]+train['text'][i])
    test_data = []
    for i in range(0, len(test['title'])):
        test_data.append(test["title"][i] + str(test["abstract"][i]) + str(test["text"][i]))
        # test_data.append(train["abstract"][i]+test['text'][i])
    stopwords = []
    with open("../data/stopwords.txt", "r", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    # Initialize the TFIV object: remove stop words; the n-gram range is configurable
    tfv = TFIV(min_df=minDf, max_df=maxDf, max_features=maxFeature, strip_accents='unicode',
               analyzer='word', token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', ngram_range=ngram,
               stop_words=stopwords)
    # Concatenate the training and test sets so TF-IDF vectorization is done once
    X_all = train_data + test_data
    tokenized_corpus = []
    # jieba.load_userdict("../data/diseaseDic.txt")
    # jieba.load_userdict("../data/symptomDic.txt")
    print("Segmenting words and removing stop words...")
    for text in X_all:
        tokenized_corpus.append(" ".join(jieba.cut(seg_sentence(text))))
    len_train = len(train_data)
    # Vectorize all the data; this step is fairly slow
    print("Extracting feature terms...")
    tfv.fit(tokenized_corpus)
    feature_list = tfv.get_feature_names()
    f = open("../data/feature_names_2.txt", "w", encoding="utf-8")
    for x in feature_list:
        f.write(x + "\n")
    f.close()
    print("Mapping documents to vectors...")
    # Pre-trained word vectors (loaded here; the word_vec dict above is what is actually used below)
    model = gensim.models.KeyedVectors.load_word2vec_format("../data/sgns.wiki.bigram-char", encoding="utf-8")
    X_all = tfv.transform(tokenized_corpus)
    ii = 0        # article counter
    all = []
    maxLen = 0
    # f_weight = open("../data/file_weight_" + index + ".txt", "w", encoding="utf-8")
    for sentence in X_all:
        j = 0     # number of feature words in this article
        ii += 1
        print("article num is %d" % ii)
        sentenceWeight = np.array([])
        sentence = sentence.toarray().tolist()[0]
        for i, weight in enumerate(sentence):
            if weight != 0:
                word = feature_list[i]
                if word in word_vec.keys():
                    j += 1
                    wordMatrix = np.array(word_vec[word])
                    # weight the word vector by its tf-idf value and append it to the article vector
                    sentenceWeight = np.array(sentenceWeight.tolist() + (weight * wordMatrix).tolist())
                # else:
                #     wordMatrix = np.array([0]*300)
        # Each article becomes a concatenation of weighted word vectors; articles have different
        # numbers of feature words, so the shorter ones are zero-padded below up to the maximum length.
        all.append(sentenceWeight.tolist())
        if maxLen < j:
            maxLen = j
    print("max len:", maxLen)
    for i in range(0, len(all)):
        if len(all[i]) < maxLen * 300:
            # pad with zeros up to maxLen feature words * 300 dimensions
            all[i] = np.pad(np.array(all[i]), (0, maxLen * 300 - len(all[i])),
                            mode='constant', constant_values=(0, 0)).tolist()
    print("Training the PCA dimensionality-reduction model...")
    pca = PCA(n_components=500)
    pca.fit(np.array(all))
    print("Reducing the dimensionality of the document matrix...")
    all_data = pca.transform(np.array(all)).tolist()
    with open("../data/vectors.txt", "w", encoding="utf-8") as file:
        file.write(str(X_all))
    # Split back into training and test parts
    X = np.array(all_data[:len_train])
    X_test = np.array(all_data[len_train:])

    # Results function: report precision, recall, F1, and accuracy in turn
    def calculate_result(actual, pred):
        m_precision = metrics.precision_score(actual, pred, average=None)
        m_recall = metrics.recall_score(actual, pred, average=None)
        accuracy = metrics.accuracy_score(pred, actual)
        with open("../data/record_pca.txt", "a", encoding="utf-8") as f:
            f.write('SVM accuracy on the test set: ' + str(accuracy) + '\n')
            f.write('SVM precision on the test set: ' + str(m_precision) + '\n')
            f.write('SVM recall on the test set: ' + str(m_recall) + '\n')
            f.write('SVM F1 score on the test set: ' + str(metrics.f1_score(actual, pred, average=None)) + '\n')
        return accuracy

    # Train the SVM classifier
    print("Training the classifier...")
    svclf = classifier
    svclf.fit(X, y_train)
    # Save the model
    joblib.dump(svclf, "P_svm_train_model.m")
    print("Predicting...")
    pred = svclf.predict(X_test)
    acc = calculate_result(y_test, pred)
    return acc
    df = df.append([[txt, labels[l]]], ignore_index=True)
    pbar.update()

df.columns = ['article', 'author']
feature_set = df.loc[:, :].values

### Step 2: Data split
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    df['article'], df['author'], test_size=0.2, random_state=42)

### Step 3: tf-idf (term frequency - inverse document frequency) processing
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
get_stop_words = TFIV(max_features=None, strip_accents='unicode', analyzer='word',
                      token_pattern=r'\w{1,}', use_idf=1, smooth_idf=1, sublinear_tf=1,
                      stop_words='english')
my_stop_words = set(get_stop_words.get_stop_words())
my_stop_words.add("link")

tfv = TFIV(max_features=None, strip_accents='unicode', analyzer='word',
           token_pattern=r'\w{1,}', use_idf=1, smooth_idf=1, sublinear_tf=1,
           stop_words=my_stop_words)
my_stop_words = tfv.get_stop_words()
tfid_train = tfv.fit_transform(features_train)
tfid_test = tfv.transform(features_test)
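# A hedged sketch of the step this split sets up: fitting a simple classifier on
# the TF-IDF features. The choice of LinearSVC is an assumption; the original
# section stops after vectorization.
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

author_clf = LinearSVC()
author_clf.fit(tfid_train, labels_train)
print("Held-out accuracy:", accuracy_score(labels_test, author_clf.predict(tfid_test)))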