def dataary(rowdata):
    """Convert raw text lines into a single [[xs], [ys]] dataset entry.

    Joins all lines of ``rowdata``, strips CJK punctuation, then converts
    the joined text into one (x, y) sequence pair via ``util.line_toseq``.

    rowdata -- iterable of raw text lines

    Relies on module-level ``util`` and ``charstop`` (not visible here).
    """
    data_x = []
    data_y = []
    data = []
    # BUG FIX: the original iterated an undefined name ``li`` instead of the
    # ``rowdata`` parameter, and then called ``.pop()`` on a str (strings have
    # no pop; AttributeError). Join the parameter's lines and pass the joined
    # string directly, matching the sibling dataary(li, gram, ...) variant.
    lineary = ''.join(rowdata)
    # Strip punctuation marks that should not produce labels.
    for ch in (":", "、", "!", ".", "?", "》", "《"):
        lineary = lineary.replace(ch, "")
    x, y = util.line_toseq(lineary, charstop)
    data_x.append(x)
    data_y.append(y)
    data.append([data_x, data_y])
    return data
def dataary(li, gram, features, vdict):
    """Build one (feature-sequence, label-sequence) pair from the whole text.

    li       -- iterable of raw text lines (joined into one sequence)
    gram     -- n-gram span handed to the discrete/both feature builders
    features -- 1=discrete, 2=vectors, 3=both
    vdict    -- char -> embedding-vector dict (used when features is 2 or 3)

    Relies on module-level ``util``, ``crf`` and ``charstop``.
    """
    data = []
    # PERF FIX: join once instead of quadratic ``+=`` concatenation in a loop.
    lineary = ''.join(li)
    # Strip punctuation marks that should not produce labels.
    for ch in (":", "、", "!", ".", "?", "》", "《"):
        lineary = lineary.replace(ch, "")
    x, y = util.line_toseq(lineary, charstop)
    # Shift labels left by one position: the label attaches to the previous
    # character, so drop the first label and pad with a trailing 'N'.
    del y[0]
    y = y + ['N']
    # Build the gram features for the text.
    if features == 1:
        d = crf.x_seq_to_features_discrete(x, charstop, gram), y
    elif features == 2:
        d = crf.x_seq_to_features_vector(x, vdict, charstop), y
    elif features == 3:
        d = crf.x_seq_to_features_both(x, vdict, charstop, gram), y
    else:
        # BUG FIX: previously an unknown ``features`` value fell through with
        # ``d`` unbound, raising a confusing NameError at data.append(d).
        raise ValueError("features must be 1, 2 or 3, got %r" % (features,))
    data.append(d)
    return data
def dataary(rowdata):
    """Drain ``rowdata`` into a single [[xs], [ys]] dataset entry.

    Each popped line is converted to an (x, y) sequence pair with
    ``util.line_toseq``; lines are consumed from the end of the list
    (``rowdata`` is emptied as a side effect, as in the original).

    Relies on module-level ``util`` and ``charstop``.
    """
    seq_xs, seq_ys = [], []
    while rowdata:
        seq_x, seq_y = util.line_toseq(rowdata.pop(), charstop)
        seq_xs.append(seq_x)
        seq_ys.append(seq_y)
    return [[seq_xs, seq_ys]]
def dataary(li, gram):
    """Convert each line to a (discrete-feature-sequence, labels) pair.

    li   -- iterable of raw text lines
    gram -- n-gram span for the discrete feature builder

    Relies on module-level ``util``, ``crf`` and ``charstop``.
    """
    pairs = (util.line_toseq(raw_line, charstop) for raw_line in li)
    return [
        (crf.x_seq_to_features_discrete(seq_x, charstop, gram), seq_y)
        for seq_x, seq_y in pairs
    ]
def dataary(li, gram, features, vdict):
    """Convert each line to a (feature-sequence, label-sequence) pair.

    li       -- iterable of raw text lines
    gram     -- n-gram span handed to the discrete/both feature builders
    features -- 1=discrete, 2=vectors, 3=both
    vdict    -- char -> embedding-vector dict (used when features is 2 or 3)

    Relies on module-level ``util``, ``crf`` and ``charstop``.
    """
    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        # Build the gram features for this line.
        if features == 1:
            d = crf.x_seq_to_features_discrete(x, charstop, gram), y
        elif features == 2:
            d = crf.x_seq_to_features_vector(x, vdict, charstop), y
        elif features == 3:
            d = crf.x_seq_to_features_both(x, vdict, charstop, gram), y
        else:
            # BUG FIX: previously an unknown ``features`` value left ``d``
            # unbound on the first line (NameError) or silently reused the
            # previous line's value on later iterations.
            raise ValueError("features must be 1, 2 or 3, got %r" % (features,))
        data.append(d)
    return data
"python cpr.py 'qualitative/allover-sjw-gold.txt' 'qualitative/allover-sjw-me.txt' 1" args = sys.argv if len(args) > 1: material1 = args[1] material2 = args[2] charstop = int(args[3]) # Prepare li: list of random lines print "Reading from files..." gold = [line for line in util.file_to_lines(glob.glob(material1))] out = [line for line in util.file_to_lines(glob.glob(material2))] golddata = [] for line in gold: golddata.append(util.line_toseq(line, charstop)) outdata = [] for line in out: outdata.append(util.line_toseq(line, charstop)) # testdata shape: [([x1, x2, ...],[y1,y2,...]),([],[])] results = [] assert len(golddata) == len(outdata) for i in range(len(golddata)): try: yref = golddata[i][1] yout = outdata[i][1] results.append(util.eval(yref, yout, "S")) except AssertionError:
# Sub-sample the shuffled corpus and build train/validate/test splits
# (Python 2 script fragment — uses print statements).
# NOTE(review): expects li, size, dense, dictfile, cut1, cut2, charstop
# and util to be defined earlier in the script (not visible in this chunk).
random.shuffle(li)
li = li[:size]
print "Preparing dictionaries..."
if dense:
    # dense mode: load pretrained embedding vectors
    vdict = util.lstmvec(dictfile)
else:
    # sparse mode: build a character set; the meaning of the constant 7 is
    # not visible here — presumably a frequency cutoff, TODO confirm
    charset = util.make_charset(li,7)
print "Preparing datasets..."
dataset_train = li[:cut1]
dataset_validate = li[cut1:cut2]
dataset_test = li[cut2:]
dataset = []
# Convert each raw training line into a (dense or sparse) vector sequence.
# pop() consumes the list, so lines are processed in reverse order and
# dataset_train is emptied before being rebound below.
while dataset_train:
    x, y = util.line_toseq(dataset_train.pop(), charstop)
    if dense:
        dataset.append(util.seq_to_densevec(x, y, vdict))
    else:
        dataset.append(util.seq_to_sparsevec(x,y,charset))
    # progress report every 1000 lines
    if not len(dataset_train)%1000:
        print "len(dataset_train)", len(dataset_train)
dataset_train = dataset
dataset = []
# Same conversion for the validation split.
while dataset_validate:
    x, y = util.line_toseq(dataset_validate.pop(), charstop)
    if dense:
        dataset.append(util.seq_to_densevec(x, y, vdict))
    else:
        dataset.append(util.seq_to_sparsevec(x,y,charset))
    if not len(dataset_validate)%1000:
        print "len(dataset_validate)", len(dataset_validate)
dataset_validate = dataset
#sys.exit()
# Cross-validation fold: assemble train/test data and train an HMM tagger.
# NOTE(review): trainidx, testidx, rowdata, charstop, util, numpy, hmm,
# datetime, f and log_text must be defined earlier in the script
# (not visible in this chunk).
print('train:', trainidx)
print('test:', testidx)
traindataary = []
testdata = []
for i in trainidx:
    # concatenate every training block's lines into one flat array
    traindataary = numpy.hstack((traindataary, rowdata[i]))
testdataary = []
for i in testidx:
    # keep each test block separate so it can be scored per block
    testdataary = rowdata[i]
    testdata.append(testdataary)
#print(testdata)
traindata = []
# data preprocessing: each line becomes a [(char, label), ...] sequence
for line in traindataary:
    x, y = util.line_toseq(line, charstop)
    # NOTE(review): under Python 3 zip() is a lazy iterator; verify the
    # HMM trainer accepts it or whether this script targets Python 2.
    traindata.append(zip(x, y))
trainer = hmm.HiddenMarkovModelTrainer()
# build the trained model
tagger = trainer.train_supervised(traindata)
# start testing
print(datetime.datetime.now())
print("Start closed testing...")
results = []
f.write(str(log_text))
for j in range(len(testdata)):  # block j
    blocktext = j + 1
    # (loop body continues beyond this chunk)
# Sub-sample the corpus and build train/validate/test splits (Python 3
# variant of the preprocessing fragment above in this file).
# NOTE(review): li, size, dense, dictfile, cut1, cut2, charstop and util
# must be defined earlier in the script (not visible in this chunk).
li = li[:size]
print("Preparing dictionaries...")
if dense:
    # dense mode: load pretrained embedding vectors
    vdict = util.lstmvec(dictfile)
else:
    # sparse mode: build a character set; the meaning of the constant 7 is
    # not visible here — presumably a frequency cutoff, TODO confirm
    charset = util.make_charset(li, 7)
print("Preparing datasets...")
dataset_train = li[:cut1]
dataset_validate = li[cut1:cut2]
dataset_test = li[cut2:]
dataset = []
print(type(dataset_train))
# Convert each raw training line into a (dense or sparse) vector sequence.
# pop() consumes the list, so lines are processed in reverse order and
# dataset_train is emptied before being rebound below.
while dataset_train:
    x, y = util.line_toseq(dataset_train.pop(), charstop)
    if dense:
        dataset.append(util.seq_to_densevec(x, y, vdict))
    else:
        dataset.append(util.seq_to_sparsevec(x, y, charset))
    # progress report every 1000 lines
    if not len(dataset_train) % 1000:
        print("len(dataset_train)", len(dataset_train))
dataset_train = dataset
dataset = []
# Same conversion for the validation split.
while dataset_validate:
    x, y = util.line_toseq(dataset_validate.pop(), charstop)
    if dense:
        dataset.append(util.seq_to_densevec(x, y, vdict))
    else:
        dataset.append(util.seq_to_sparsevec(x, y, charset))
    if not len(dataset_validate) % 1000:
        print("len(dataset_validate)", len(dataset_validate))
dataset_validate = dataset
def predic_unscore_api(inputtext): charstop = True # True means label attributes to previous char features = 3 # 1=discrete; 2=vectors; 3=both dictfile = 'vector/24scbow50.txt' modelname = 'datalunyu5001.m' vdict = util.readvec(dictfile) inputtext = inputtext #li = [line for line in util.text_to_lines(inputtext)] li = util.text_to_lines(inputtext) print(li) data = [] for line in li: x, y = util.line_toseq(line, charstop) print(x) if features == 1: d = crf.x_seq_to_features_discrete(x, charstop), y elif features == 2: d = crf.x_seq_to_features_vector(x, vdict, charstop), y elif features == 3: d = crf.x_seq_to_features_both(x, vdict, charstop), y data.append(d) tagger = pycrfsuite.Tagger() tagger.open(modelname) print("Start testing...") results = [] lines = [] Spp = [] Npp = [] out = [] #while data: for index in range(len(data)): print(len(data)) xseq, yref = data.pop(0) yout = tagger.tag(xseq) sp = 0 np = 0 for i in range(len(yout)): sp = tagger.marginal('S', i) Spp.append(sp) #S標記的機率 print(sp) np = tagger.marginal('N', i) Npp.append(np) #Nㄅ標記的機率 print(np) results.append(util.eval(yref, yout, "S")) lines.append( util.seq_to_line([x['gs0'] for x in xseq], yout, charstop, Spp, Npp)) #print(util.seq_to_line([x['gs0'] for x in xseq], (str(sp) +'/'+ str(np)),charstop)) out.append(yout) tp, fp, fn, tn = zip(*results) tp, fp, fn, tn = sum(tp), sum(fp), sum(fn), sum(tn) p, r = tp / (tp + fp), tp / (tp + fn) score = '' score = score + '<br>' + "Total tokens in Test Set:" + repr(tp + fp + fn + tn) score = score + '<br>' + "Total S in REF:" + repr(tp + fn) score = score + '<br>' + "Total S in OUT:" + repr(tp + fp) score = score + '<br>' + "Presicion:" + repr(p) score = score + '<br>' + "Recall:" + repr(r) score = score + '<br>' + "*******************F1-score:" + repr(2 * p * r / (p + r)) output = '' print(lines) for line in lines: #line = unquote(line) print("output:") print(line.encode('utf8')) #output = output + '<br>' + line output += line print(line) output = score + 
'<br>' + output #output = jsonify({'str': output}) return (out)
def predic():
    """Flask view: tag the POSTed 'input_text' with a pretrained CRF model.

    Reads the text from ``request.form``, builds feature sequences, tags
    them with model file ``datalunyu5001.m``, collects per-character
    'S'/'N' marginals, and returns an HTML string containing the score
    report followed by the tagged lines.

    Relies on module-level ``util``, ``crf``, ``pycrfsuite`` and the Flask
    ``request`` object.
    """
    charstop = True  # True means label attributes to previous char
    features = 3  # 1=discrete; 2=vectors; 3=both
    dictfile = 'vector/24scbow50.txt'
    modelname = 'datalunyu5001.m'
    vdict = util.readvec(dictfile)
    inputtext = request.form.get('input_text', '')
    #li = [line for line in util.text_to_lines(inputtext)]
    li = util.text_to_lines(inputtext)
    print(li)
    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        print(x)
        # build features for this line according to the configured mode
        if features == 1:
            d = crf.x_seq_to_features_discrete(x, charstop), y
        elif features == 2:
            d = crf.x_seq_to_features_vector(x, vdict, charstop), y
        elif features == 3:
            d = crf.x_seq_to_features_both(x, vdict, charstop), y
        data.append(d)
    tagger = pycrfsuite.Tagger()
    tagger.open(modelname)
    print("Start testing...")
    results = []
    lines = []
    Spp = []
    Npp = []
    #while data:
    for index in range(len(data)):
        print(len(data))
        # pop(0) consumes data front-to-back while the range is fixed
        xseq, yref = data.pop(0)
        yout = tagger.tag(xseq)
        sp = 0
        np = 0
        for i in range(len(yout)):
            sp = tagger.marginal('S', i)
            Spp.append(sp)  # marginal probability of the 'S' label
            print(sp)
            np = tagger.marginal('N', i)
            Npp.append(np)  # marginal probability of the 'N' label
            print(np)
        results.append(util.eval(yref, yout, "S"))
        # Spp/Npp accumulate across lines; util.seq_to_line receives the
        # full accumulated lists each time — TODO confirm that is intended.
        lines.append(
            util.seq_to_line([x['gs0'] for x in xseq], yout, charstop, Spp,
                             Npp))
        #print(util.seq_to_line([x['gs0'] for x in xseq], (str(sp) +'/'+ str(np)),charstop))
    # Per-character statistics: for each position take the larger of the
    # two marginals and scale its margin above 0.5 by 10, accumulating
    # into U_score (reported below as "block uncertain rate").
    U_score = 0
    p_Scount = 0
    p_Ncount = 0
    for i in range(len(Spp)):
        _s = 0
        if Spp[i] > Npp[i]:
            _s = Spp[i]
        else:
            _s = Npp[i]
        _s = (_s - 0.5) * 10
        U_score = U_score + _s
        p_Scount = p_Scount + Spp[i]
        p_Ncount = p_Ncount + Npp[i]
    # aggregate confusion counts over all lines
    tp, fp, fn, tn = zip(*results)
    tp, fp, fn, tn = sum(tp), sum(fp), sum(fn), sum(tn)
    # NOTE(review): ZeroDivisionError when tp+fp or tp+fn is 0
    p, r = tp / (tp + fp), tp / (tp + fn)
    # build the HTML score report
    score = ''
    score = score + '<br>' + "Total tokens in Test Set:" + repr(tp + fp + fn + tn)
    score = score + '<br>' + "Total S in REF:" + repr(tp + fn)
    score = score + '<br>' + "Total S in OUT:" + repr(tp + fp)
    score = score + '<br>' + "Presicion:" + repr(p)
    score = score + '<br>' + "Recall:" + repr(r)
    score = score + '<br>' + "*******************F1-score:" + repr(2 * p * r / (p + r))
    score = score + '<br>' + "======================="
    score = score + '<br>' + "character count:" + str(len(Spp))
    score = score + '<br>' + "block uncertain rate:" + str(
        (U_score / len(Spp)))
    output = ''
    key = 0
    for line in lines:
        #print (line.encode('utf8'))
        output = output + '<br>' + line
        #print (line)
        key = key + 1
    #for index_m in ypp:
    #    output = output + '<br>' + line
    output = score + '<br>' + output
    return (output)
def buildCrf(inputtext):
    """Train a pycrfsuite CRF model on the corpus glob ``inputtext``.

    Reads lines from the files matching ``inputtext``, builds feature
    sequences, trains with the l2sgd method, runs a closed test on the
    training portion, and returns the trained model's filename.

    Relies on module-level ``util``, ``crf``, ``pycrfsuite``, ``random``,
    ``glob``, ``sys`` and ``datetime``.
    """
    material = inputtext
    #material = 'data/24s/*'
    #material = "data/sjw/A05*"
    filename = 'model'
    size = 80
    trainportion = 0.9
    dictfile = 'data/vector/24scbow300.txt'
    crfmethod = "l2sgd"  # {‘lbfgs’, ‘l2sgd’, ‘ap’, ‘pa’, ‘arow’}
    charstop = True  # True means label attributes to previous char
    features = 1  # 1=discrete; 2=vectors; 3=both
    random.seed(101)
    # command-line usage reminder (bare string, has no runtime effect)
    "python runcrf.py 'data/sjw/*' 80 data/vector/vectors300.txt 1 1"
    args = sys.argv
    '''
    if len(args)>1:
        material = args[1]
        size = int(args[2])
        dictfile = args[3]
        features = int(args[4])
        charstop = int(args[5])
    '''
    cut = int(size * trainportion)
    # trained model filename, e.g. "model80True.m"
    modelname = filename.replace('/', '').replace(
        '*', '') + str(size) + str(charstop) + ".m"
    print(modelname)
    print("Material:", material)
    print("Size:", size, "entries,", trainportion, "as training")
    print(datetime.datetime.now())
    # Prepare li: list of random lines
    if features > 1:
        # load the embedding vectors first (only vector/both modes need them)
        vdict = util.readvec(dictfile)
        print("Dict:", dictfile)
    li = [line for line in util.file_to_lines(glob.glob(material))]  # split into a list of lines
    random.shuffle(li)  # random sampling
    li = li[:size]
    # Prepare data: list of x(char), y(label) sequences
    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        #print(x)
        #print(y[:5])
        # build features for this line according to the configured mode
        if features == 1:
            d = crf.x_seq_to_features_discrete(x, charstop), y
        elif features == 2:
            d = crf.x_seq_to_features_vector(x, vdict, charstop), y
        elif features == 3:
            d = crf.x_seq_to_features_both(x, vdict, charstop), y
        data.append(d)
    traindata = data[:cut]
    testdata = data[cut:]
    #print(traindata)
    trainer = pycrfsuite.Trainer()
    #print trainer.params()
    #print(traindata[0])
    for t in traindata:
        x, y = t
        trainer.append(x, y)
    trainer.select(crfmethod)  # choose the training algorithm
    trainer.set('max_iterations', 10)  # kept low for test runs
    #trainer.set('delta',0)
    #print ("!!!!before train", datetime.datetime.now())
    trainer.train(modelname)
    #print ("!!!!after train", datetime.datetime.now())
    tagger = pycrfsuite.Tagger()  # open the trained model file
    tagger.open(modelname)
    tagger.dump(modelname + ".txt")
    print(datetime.datetime.now())
    print("Start closed testing...")
    results = []
    print(traindata)
    # Closed test: tag the training data itself (pop() empties traindata).
    while traindata:
        x, yref = traindata.pop()
        yout = tagger.tag(x)
        pr = tagger.marginal('S', 0)
        pp = tagger.probability(yout)
        results.append(util.eval(yref, yout, "S"))
    tp, fp, fn, tn = zip(*results)
    tp, fp, fn, tn = sum(tp), sum(fp), sum(fn), sum(tn)
    # NOTE(review): ZeroDivisionError when tp+fp or tp+fn is 0
    p, r = tp / (tp + fp), tp / (tp + fn)
    print("Total tokens in Train Set:", tp + fp + fn + tn)
    print("Total S in REF:", tp + fn)
    print("Total S in OUT:", tp + fp)
    print("Presicion:", p)
    print("Recall:", r)
    print("*******************F1-score:", 2 * p * r / (p + r))
    # NOTE(review): pr/pp/yout below come from the LAST popped line only,
    # and are unbound (NameError) if traindata was empty.
    print("*******************:", pr)
    print("*******************:", pp)
    print("*******************:", yout)
    print(datetime.datetime.now())
    return (modelname)
# Prepare li: list of random lines print "Reading from files..." li = [line for line in util.file_to_lines(glob.glob(material))] random.shuffle(li) li = li[:size] # Prepare data: list of x(char), y(label) sequences print "Prepare list of sequences..." closetestdata = li[:cut] testdata = li[cut:] traindata = [] for line in closetestdata: x, y = util.line_toseq(line, charstop) traindata.append(zip(x,y)) # traindata shape: [[(x,y),(x,y), ...],[],[],...] # testdata shape: [([x1, x2, ...],[y1,y2,...]),([],[])] stt = datetime.datetime.now() print "Start training...", stt hmmtagger = nt.hmm.HiddenMarkovModelTagger.train(traindata) print "################# Training took:", datetime.datetime.now()-stt results = [] for line in testdata: x, yref = util.line_toseq(line, charstop) out = hmmtagger.tag(x)
"python cpr.py 'qualitative/allover-sjw-gold.txt' 'qualitative/allover-sjw-me.txt' 1" args = sys.argv if len(args)>1: material1 = args[1] material2 = args[2] charstop = int(args[3]) # Prepare li: list of random lines print "Reading from files..." gold = [line for line in util.file_to_lines(glob.glob(material1))] out = [line for line in util.file_to_lines(glob.glob(material2))] golddata = [] for line in gold: golddata.append(util.line_toseq(line, charstop)) outdata = [] for line in out: outdata.append(util.line_toseq(line, charstop)) # testdata shape: [([x1, x2, ...],[y1,y2,...]),([],[])] results = [] assert len(golddata)==len(outdata) for i in range(len(golddata)): try: yref = golddata[i][1] yout = outdata[i][1] results.append(util.eval(yref, yout, "S")) except AssertionError: