def predic_unscore_api(inputtext):
    """Tag every line of *inputtext* with the CRF model and return the tags.

    The text is split with ``util.text_to_lines``, each line is turned into
    a feature sequence, and the sequence is labelled by the pycrfsuite
    model stored in ``datalunyu5001.m``.

    Earlier revisions also built a precision/recall/F1 report here and then
    threw it away.  Building it raised ``ZeroDivisionError`` whenever the
    text contained no "S" tag (and ``ValueError`` on empty input), so the
    report has been removed; only the value that was actually returned is
    computed now.

    Args:
        inputtext: Raw text to tag.

    Returns:
        A list with one entry per line produced by ``util.text_to_lines``;
        each entry is the label sequence returned by ``tagger.tag`` for
        that line.  Empty when the input yields no lines.
    """
    charstop = True  # True means label attributes to previous char
    features = 3  # 1=discrete; 2=vectors; 3=both
    dictfile = 'vector/24scbow50.txt'
    modelname = 'datalunyu5001.m'
    vdict = util.readvec(dictfile)

    li = util.text_to_lines(inputtext)
    print(li)

    # Build one feature sequence per input line.  The reference labels that
    # util.line_toseq also returns are not needed by this endpoint.
    data = []
    for line in li:
        x, _ = util.line_toseq(line, charstop)
        print(x)
        if features == 1:
            xseq = crf.x_seq_to_features_discrete(x, charstop)
        elif features == 2:
            xseq = crf.x_seq_to_features_vector(x, vdict, charstop)
        elif features == 3:
            xseq = crf.x_seq_to_features_both(x, vdict, charstop)
        else:
            raise ValueError("unknown feature mode: %r" % (features,))
        data.append(xseq)

    tagger = pycrfsuite.Tagger()
    tagger.open(modelname)
    print("Start testing...")

    lines = []
    Spp = []  # marginal probability of the "S" tag, one per character
    Npp = []  # marginal probability of the "N" tag, one per character
    out = []
    for index, xseq in enumerate(data):
        # Number of sequences still to be processed (this one included).
        print(len(data) - index)
        yout = tagger.tag(xseq)
        for i in range(len(yout)):
            s_prob = tagger.marginal('S', i)
            Spp.append(s_prob)
            print(s_prob)
            n_prob = tagger.marginal('N', i)
            Npp.append(n_prob)
            print(n_prob)
        # NOTE(review): Spp/Npp accumulate over ALL lines processed so far,
        # not just the current one; confirm util.seq_to_line expects that.
        lines.append(
            util.seq_to_line([x['gs0'] for x in xseq], yout, charstop, Spp,
                             Npp))
        out.append(yout)

    # Debug trace of the rendered lines; not part of the return value.
    print(lines)
    for line in lines:
        print("output:")
        print(line.encode('utf8'))
        print(line)
    return out
def predic():
    """Tag the submitted text and return an HTML report.

    Reads ``input_text`` from ``request.form``, tags every line with the
    pycrfsuite model ``datalunyu5001.m`` and returns a ``<br>``-separated
    string holding the evaluation counts, precision, recall, F1, the
    character count, the "block uncertain rate" and finally the rendered
    lines.

    Ratios whose denominator is zero (no "S" predicted, no "S" in the
    reference, or no characters at all) are reported as ``0.0`` instead of
    raising ``ZeroDivisionError``.

    Returns:
        The report as a single string.
    """
    charstop = True  # True means label attributes to previous char
    features = 3  # 1=discrete; 2=vectors; 3=both
    dictfile = 'vector/24scbow50.txt'
    modelname = 'datalunyu5001.m'
    vdict = util.readvec(dictfile)
    inputtext = request.form.get('input_text', '')

    li = util.text_to_lines(inputtext)
    print(li)
    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        print(x)
        if features == 1:
            d = crf.x_seq_to_features_discrete(x, charstop), y
        elif features == 2:
            d = crf.x_seq_to_features_vector(x, vdict, charstop), y
        elif features == 3:
            d = crf.x_seq_to_features_both(x, vdict, charstop), y
        else:
            raise ValueError("unknown feature mode: %r" % (features,))
        data.append(d)

    tagger = pycrfsuite.Tagger()
    tagger.open(modelname)
    print("Start testing...")

    results = []
    lines = []
    Spp = []  # marginal probability of the "S" tag, one per character
    Npp = []  # marginal probability of the "N" tag, one per character
    for index, (xseq, yref) in enumerate(data):
        # Number of sequences still to be processed (this one included).
        print(len(data) - index)
        yout = tagger.tag(xseq)
        for i in range(len(yout)):
            s_prob = tagger.marginal('S', i)
            Spp.append(s_prob)
            print(s_prob)
            n_prob = tagger.marginal('N', i)
            Npp.append(n_prob)
            print(n_prob)
        results.append(util.eval(yref, yout, "S"))
        # NOTE(review): Spp/Npp accumulate over ALL lines processed so far,
        # not just the current one; confirm util.seq_to_line expects that.
        lines.append(
            util.seq_to_line([x['gs0'] for x in xseq], yout, charstop, Spp,
                             Npp))

    # Per character: take the larger of the two marginals and map it from
    # [0.5, 1] onto [0, 5]; the report shows the mean of these values.
    U_score = 0
    for s_prob, n_prob in zip(Spp, Npp):
        top = s_prob if s_prob > n_prob else n_prob
        U_score = U_score + (top - 0.5) * 10

    if results:
        tp, fp, fn, tn = (sum(column) for column in zip(*results))
    else:
        # Nothing was tagged: zip(*results) would have nothing to unpack.
        tp = fp = fn = tn = 0
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    uncertain_rate = U_score / len(Spp) if Spp else 0.0

    report = [
        "Total tokens in Test Set:" + repr(tp + fp + fn + tn),
        "Total S in REF:" + repr(tp + fn),
        "Total S in OUT:" + repr(tp + fp),
        "Presicion:" + repr(p),
        "Recall:" + repr(r),
        "*******************F1-score:" + repr(f1),
        "=======================",
        "character count:" + str(len(Spp)),
        "block uncertain rate:" + str(uncertain_rate),
    ]
    score = ''.join('<br>' + entry for entry in report)

    # NOTE(review): the rendered lines derive from user-submitted text and
    # are placed into the HTML response without escaping in this function.
    # Confirm that util.seq_to_line or the caller escapes them.
    output = ''.join('<br>' + line for line in lines)
    return score + '<br>' + output
xseq, yref = testdata.pop() #print(xseq) yout = tagger.tag(xseq) all_len += len(yout) #print(len(xseq),len(yout),all_len) sp = 0 np = 0 for i in range(len(yout)): sp = tagger.marginal('S',i) Spp.append(sp) #S標記的機率 #print(sp) np = tagger.marginal('N',i) Npp.append(np)#N標記的機率 #print(np) results.append(util.eval(yref, yout, "S")) lines.append(util.seq_to_line([x['gs0'] for x in xseq],yout,charstop,Spp,Npp)) #print(util.seq_to_line([x['gs0'] for x in xseq], (str(sp) +'/'+ str(np)),charstop)) score_array = [] All_u_score = 0 for i in range(len(Spp)): _s = 0 if Spp[i] > Npp[i]: _s = Spp[i] else :_s = Npp[i] #_s = (_s - 0.5) * 10 _s = (1 - _s) #U_score = U_score + _s score_array.append(_s) for i in range(len(testidx)): U_score = 0 #文本區塊的不確定值 text_count = 0 #字數
elif features == 3:
    # Feature mode 3: build the sequence with crf.x_seq_to_features_both.
    d = crf.x_seq_to_features_both(x, vdict, charstop), y
# NOTE(review): the start of this if-chain and any enclosing loop are not
# visible in this fragment, so the original nesting depth of the line below
# is unknown.
data.append(d)

# Load the trained model and tag every sequence collected in `data`.
tagger = pycrfsuite.Tagger()
tagger.open(modelname)
print datetime.datetime.now()
print "Start testing..."
results = []
lines = []
# data.pop() takes items from the end, so sequences are processed
# last-to-first and `lines` ends up in that same reversed order.
while data:
    xseq, yref = data.pop()
    yout = tagger.tag(xseq)
    results.append(util.eval(yref, yout, "S"))
    lines.append(util.seq_to_line([x['gs0'] for x in xseq],yout,charstop))

# Sum the per-sequence result tuples column by column.
tp, fp, fn, tn = zip(*results)
tp, fp, fn, tn = sum(tp), sum(fp), sum(fn), sum(tn)
# NOTE(review): this is Python 2 syntax; if the counts are ints and true
# division is not enabled at the top of the file, `/` truncates here.
# Either denominator being zero raises ZeroDivisionError.
p, r = tp/(tp+fp), tp/(tp+fn)
print "Total tokens in Test Set:", tp+fp+fn+tn
print "Total S in REF:", tp+fn
print "Total S in OUT:", tp+fp
print "Presicion:", p
print "Recall:", r
print "*******************F1-score:", 2*p*r/(p+r)
for line in lines:
    print line.encode('utf8')
stt = datetime.datetime.now() print "Start training...", stt hmmtagger = nt.hmm.HiddenMarkovModelTagger.train(traindata) print "################# Training took:", datetime.datetime.now()-stt results = [] lines = [] testdata = [line for line in util.file_to_lines(glob.glob(hu))] for line in testdata: x, yref = util.line_toseq(line, charstop) out = hmmtagger.tag(x) _, yout = zip(*out) yout = list(yout) results.append(util.eval(yref, yout, "S")) lines.append(util.seq_to_line(x,yout,charstop)) tp, fp, fn, tn = zip(*results) tp, fp, fn, tn = sum(tp), sum(fp), sum(fn), sum(tn) p, r = tp/(tp+fp), tp/(tp+fn) print "Total tokens in Test Set:", tp+fp+fn+tn print "Total S in REF:", tp+fn print "Total S in OUT:", tp+fp print "Presicion:", p print "Recall:", r print "F1-score:", 2*p*r/(p+r) while lines: print lines.pop().encode('utf8')
# Train an HMM tagger on `traindata`, tag every line of the files matched by
# the glob pattern `hu`, and print counts, precision, recall and F1 for the
# "S" label followed by the tagged lines.
stt = datetime.datetime.now()
print(traindata[0])
print("Start training...", stt)
hmmtagger = nt.hmm.HiddenMarkovModelTagger.train(traindata)
print("################# Training took:", datetime.datetime.now() - stt)

results = []
lines = []
testdata = list(util.file_to_lines(glob.glob(hu)))
for line in testdata:
    x, yref = util.line_toseq(line, charstop)
    out = hmmtagger.tag(x)
    # The tagger returns pairs; keep only the second element of each pair.
    _, yout = zip(*out)
    yout = list(yout)
    results.append(util.eval(yref, yout, "S"))
    lines.append(util.seq_to_line(x, yout, charstop))

if results:
    tp, fp, fn, tn = zip(*results)
    tp, fp, fn, tn = sum(tp), sum(fp), sum(fn), sum(tn)
else:
    # Empty test set: zip(*results) would have nothing to unpack.
    tp = fp = fn = tn = 0
# A zero denominator (no "S" predicted / no "S" in the reference) is
# reported as 0.0 instead of raising ZeroDivisionError.
p = tp / (tp + fp) if (tp + fp) else 0.0
r = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * p * r / (p + r) if (p + r) else 0.0
print("Total tokens in Test Set:", tp + fp + fn + tn)
print("Total S in REF:", tp + fn)
print("Total S in OUT:", tp + fp)
print("Presicion:", p)
print("Recall:", r)
print("F1-score:", f1)
# pop() prints the lines last-to-first and leaves `lines` empty.  The text
# is printed as str: encoding to UTF-8 first would make Python 3 print the
# bytes repr (b'\xe5...') instead of the characters.
while lines:
    print(lines.pop())