def train():
    """Train a CountVectorizer -> TF-IDF -> Perceptron text classifier.

    Loads cached training data from 'train.pkl' when present, otherwise
    builds it from mydataset.getAllTrainTextList() and caches it.  Fits the
    pipeline, pickles the fitted pipeline to 'pipeline.pkl', then scores the
    model on its own training data (positive label is u'1').

    Returns:
        tuple: (F, pipeline) — the training-set F1 score and the fitted
        sklearn Pipeline.

    Raises:
        ZeroDivisionError: if no positive predictions/labels occur
        (TP+FP == 0, TP+FN == 0, or P+R == 0), as in the original code.
    """
    trainFileName = 'train.pkl'
    pipelineFileName = 'pipeline.pkl'
    # BUG FIX: pickle streams must be opened in binary mode ('rb'/'wb');
    # the original used text mode ('r'/'w'), which fails on Python 3.
    if os.path.exists(trainFileName):
        with open(trainFileName, 'rb') as fin:
            trainData = pickle.load(fin)
            trainClass = pickle.load(fin)
    else:
        trainText = mydataset.getAllTrainTextList()
        N = len(trainText)
        trainData = []
        trainClass = []
        for i, (tag, text) in enumerate(trainText, 1):
            if i % 5000 == 0:
                # progress indicator for the (slow) jieba tokenization pass
                print('i=%08d finished %5.5f%% using jieba to cut the text\n' % (i, i * 100.0 / N))
            trainData.append(text)
            trainClass.append(tag)
        with open(trainFileName, 'wb') as fout:
            pickle.dump(trainData, fout)
            pickle.dump(trainClass, fout)
    # Pipeline-cache loading was disabled in the original via `if(False)`;
    # the dead branch is removed and the pipeline is always rebuilt.
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', Perceptron()),
    ])
    pipeline.set_params(vect__max_df=0.6, tfidf__norm='l1',
                        tfidf__use_idf=True, vect__ngram_range=(1, 2))
    pipeline.fit(trainData, trainClass)
    with open(pipelineFileName, 'wb') as fout:
        pickle.dump(pipeline, fout)
    # ---- evaluate on the training data itself ----
    prec = pipeline.predict(trainData)
    expected = trainClass
    TP = TN = FP = FN = 0.0
    for p, e in zip(prec, expected):
        if p == e:
            if p == u'1':
                TP += 1
            else:
                TN += 1
        else:
            if p == u'1':
                FP += 1
            else:
                FN += 1
    P = TP / (TP + FP)
    R = TP / (TP + FN)
    F = 2 * P * R / (P + R)
    return F, pipeline
#clf=Perceptron()
# Select the cached-training-data file according to whether the corpus was
# tokenized with jieba's full-cut mode.
cutModel = True
if cutModel:
    trainFileName = 'pipelineTrainCutAll.pkl'
else:
    trainFileName = 'pipelineTrain.pkl'
if os.path.exists(trainFileName):
    # BUG FIX: pickle must be read in binary mode ('rb'); the original
    # used text mode ('r'), which fails on Python 3.
    fin = open(trainFileName, 'rb')
    trainData = pickle.load(fin)
    trainClass = pickle.load(fin)
    fin.close()
else:
    trainText = mydataset.getAllTrainTextList(cutModel)
    N = len(trainText)
    trainData = []
    trainClass = []
    for i, (tag, text) in enumerate(trainText, 1):
        if i % 5000 == 0:
            # progress indicator for the slow tokenization pass
            print('i=%08d finished %5.5f%% using jieba to cut the text\n' % (i, i * 100.0 / N))
        trainData.append(text)
        trainClass.append(tag)
    # BUG FIX: write in binary mode ('wb').
    # NOTE(review): fout is intentionally left open — this chunk appears
    # truncated and the continuation presumably dumps trainClass and closes
    # the file; confirm against the full source.
    fout = open(trainFileName, 'wb')
    pickle.dump(trainData, fout)
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
import os
import pickle  # BUG FIX: pickle was used below but never imported
import mydataset

# Load the cached training corpus if it exists; otherwise build it from the
# raw dataset (jieba-tokenized) and cache it for subsequent runs.
trainFileName = 'pipelineTrain.pkl'
if os.path.exists(trainFileName):
    # BUG FIX: pickle must be read in binary mode ('rb'), not text mode.
    fin = open(trainFileName, 'rb')
    trainData = pickle.load(fin)
    trainClass = pickle.load(fin)
    fin.close()
else:
    trainText = mydataset.getAllTrainTextList()
    N = len(trainText)
    trainData = []
    trainClass = []
    for i, (tag, text) in enumerate(trainText, 1):
        if i % 5000 == 0:
            # progress indicator for the slow tokenization pass
            print('i=%08d finished %5.5f%% using jieba to cut the text\n' % (i, i * 100.0 / N))
        trainData.append(text)
        trainClass.append(tag)
    # BUG FIX: write in binary mode ('wb').
    # NOTE(review): fout is intentionally left open — this chunk appears
    # truncated; the continuation presumably closes the file. Confirm.
    fout = open(trainFileName, 'wb')
    pickle.dump(trainData, fout)
    pickle.dump(trainClass, fout)
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
import os
import pickle  # BUG FIX: pickle was used below but never imported
import mydataset

# Load the cached (full-cut tokenized) training corpus if it exists;
# otherwise build it from the raw dataset and cache it for subsequent runs.
cutModel = True
trainFileName = 'pipelineTrainCutAll.pkl'
if os.path.exists(trainFileName):
    # BUG FIX: pickle must be read in binary mode ('rb'), not text mode.
    fin = open(trainFileName, 'rb')
    trainData = pickle.load(fin)
    trainClass = pickle.load(fin)
    fin.close()
else:
    trainText = mydataset.getAllTrainTextList(cutModel)
    N = len(trainText)
    trainData = []
    trainClass = []
    for i, (tag, text) in enumerate(trainText, 1):
        if i % 5000 == 0:
            # progress indicator for the slow tokenization pass
            print('i=%08d finished %5.5f%% using jieba to cut the text\n' % (i, i * 100.0 / N))
        trainData.append(text)
        trainClass.append(tag)
    # BUG FIX: write in binary mode ('wb').
    # NOTE(review): fout is intentionally left open — this chunk appears
    # truncated; the continuation presumably closes the file. Confirm.
    fout = open(trainFileName, 'wb')
    pickle.dump(trainData, fout)
    pickle.dump(trainClass, fout)
def train(clf=SGDClassifier(class_weight='balanced')):
    """Train a CountVectorizer -> TF-IDF -> `clf` text classifier.

    Loads cached training data from 'train.pkl' when present, otherwise
    builds it from mydataset.getAllTrainTextList() and caches it.  Fits the
    pipeline, pickles the fitted pipeline to 'pipeline.pkl', then scores the
    model on its own training data (positive label is u'1').

    Args:
        clf: sklearn estimator used as the final pipeline step.  Defaults
            to a class-balanced SGDClassifier.  NOTE: a mutable default
            estimator is shared across calls, as in the original API.

    Returns:
        tuple: (F, pipeline) — the training-set F1 score and the fitted
        sklearn Pipeline.

    Raises:
        ZeroDivisionError: if no positive predictions/labels occur
        (TP+FP == 0, TP+FN == 0, or P+R == 0), as in the original code.
    """
    trainFileName = 'train.pkl'
    pipelineFileName = 'pipeline.pkl'
    # BUG FIX: pickle streams must be opened in binary mode ('rb'/'wb');
    # the original used text mode ('r'/'w'), which fails on Python 3.
    if os.path.exists(trainFileName):
        with open(trainFileName, 'rb') as fin:
            trainData = pickle.load(fin)
            trainClass = pickle.load(fin)
    else:
        trainText = mydataset.getAllTrainTextList()
        N = len(trainText)
        trainData = []
        trainClass = []
        for i, (tag, text) in enumerate(trainText, 1):
            if i % 5000 == 0:
                # progress indicator for the (slow) jieba tokenization pass
                print('i=%08d finished %5.5f%% using jieba to cut the text\n' % (i, i * 100.0 / N))
            trainData.append(text)
            trainClass.append(tag)
        with open(trainFileName, 'wb') as fout:
            pickle.dump(trainData, fout)
            pickle.dump(trainClass, fout)
    # Pipeline-cache loading was disabled in the original via `if(False)`;
    # the dead branch is removed and the pipeline is always rebuilt.
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf),
    ])
    pipeline.set_params(vect__max_df=0.6, tfidf__norm='l1',
                        tfidf__use_idf=True, vect__ngram_range=(1, 2))
    pipeline.fit(trainData, trainClass)
    with open(pipelineFileName, 'wb') as fout:
        pickle.dump(pipeline, fout)
    # ---- evaluate on the training data itself ----
    prec = pipeline.predict(trainData)
    expected = trainClass
    TP = TN = FP = FN = 0.0
    for p, e in zip(prec, expected):
        if p == e:
            if p == u'1':
                TP += 1
            else:
                TN += 1
        else:
            if p == u'1':
                FP += 1
            else:
                FN += 1
    P = TP / (TP + FP)
    R = TP / (TP + FN)
    F = 2 * P * R / (P + R)
    return F, pipeline