import util " PARAMETERS " charstop = False # True means label attributes to previous char " END OF PARAMETERS " "python cpr.py 'qualitative/allover-sjw-gold.txt' 'qualitative/allover-sjw-me.txt' 1" args = sys.argv if len(args) > 1: material1 = args[1] material2 = args[2] charstop = int(args[3]) # Prepare li: list of random lines print "Reading from files..." gold = [line for line in util.file_to_lines(glob.glob(material1))] out = [line for line in util.file_to_lines(glob.glob(material2))] golddata = [] for line in gold: golddata.append(util.line_toseq(line, charstop)) outdata = [] for line in out: outdata.append(util.line_toseq(line, charstop)) # testdata shape: [([x1, x2, ...],[y1,y2,...]),([],[])] results = [] assert len(golddata) == len(outdata) for i in range(len(golddata)):
# -*- coding: utf8 -*-
import util
import sys
import glob

# Glob pattern selecting the corpus files to dump.
material = '../data/24s/*'
# Alternative location when run from the project root:
# material = 'data/24s/*'

# Counter kept for the (currently disabled) line-count debugging output.
i = 0

# Print every line of the corpus as the space-joined pieces returned by
# util.line_toraw.
for line in util.file_to_lines(glob.glob(material)):
    raw_pieces = util.line_toraw(line)
    print(" ".join(raw_pieces))
modelname = material.replace('/','').replace('*','')+str(size)+"glove50" validate_interval = 10000 hidden_size = 50 learning_rate = 0.001 random.seed(101) print "Material:", material print "Size:", size, "entries,", trainportion, "as training", validateportion, "as validation" print "Dense:", dense print "charstop:", charstop starttime = datetime.datetime.now() print "Starting Time:",starttime print "Preparing text..." li = [line for line in util.file_to_lines(glob.glob(material))] random.shuffle(li) li = li[:size] print "Preparing dictionaries..." if dense: vdict = util.lstmvec(dictfile) else: charset = util.make_charset(li,7) print "Preparing datasets..." dataset_train = li[:cut1] dataset_validate = li[cut1:cut2] dataset_test = li[cut2:] dataset = [] while dataset_train:
modelname = material.replace('/', '').replace('*', '') + str(size) + "glove50" validate_interval = 10000 hidden_size = 50 learning_rate = 0.001 random.seed(101) print "Material:", material print "Size:", size, "entries,", trainportion, "as training", validateportion, "as validation" print "Dense:", dense print "charstop:", charstop starttime = datetime.datetime.now() print "Starting Time:", starttime print "Preparing text..." li = [line for line in util.file_to_lines(glob.glob(material))] random.shuffle(li) li = li[:size] print "Preparing dictionaries..." if dense: vdict = util.lstmvec(dictfile) else: charset = util.make_charset(li, 7) print "Preparing datasets..." dataset_train = li[:cut1] dataset_validate = li[cut1:cut2] dataset_test = li[cut2:] dataset = [] while dataset_train:
def buildCrf(inputtext, filename='model', size=80, trainportion=0.9,
             dictfile='data/vector/24scbow300.txt', crfmethod="l2sgd",
             charstop=True, features=1, max_iterations=10):
    """Train a CRF model on the files matching *inputtext* and closed-test it.

    The lines read from the matching files are shuffled with a fixed seed,
    truncated to ``size`` entries, converted to (features, labels) sequences
    and the first ``size * trainportion`` of them are used for training.
    The trained model is written to disk, dumped as text next to it, and then
    evaluated on the training sequences themselves (closed test) for the
    label ``"S"``.

    Args:
        inputtext: Glob pattern of the training files.
        filename: Base name of the model file ('/' and '*' are stripped).
        size: Maximum number of shuffled lines to use.
        trainportion: Fraction of ``size`` used as training data.
        dictfile: Vector file, only read when ``features`` is 2 or 3.
        crfmethod: Training algorithm passed to ``trainer.select``
            (one of 'lbfgs', 'l2sgd', 'ap', 'pa', 'arow').
        charstop: True means label attributes to previous char.
        features: 1 = discrete, 2 = vectors, 3 = both.
        max_iterations: Value for the trainer's ``max_iterations`` setting.

    Returns:
        The file name of the trained model.

    Raises:
        ValueError: If ``features`` is not 1, 2 or 3, or if no training
            sequence could be built from ``inputtext``.

    Note:
        Re-seeds the global ``random`` generator with 101 so that the sample
        is reproducible.
    """
    if features not in (1, 2, 3):
        raise ValueError("features must be 1, 2 or 3, got %r" % (features,))

    material = inputtext
    # Fixed seed: the shuffle below must pick the same lines on every run.
    random.seed(101)

    cut = int(size * trainportion)

    # Name of the trained model file.
    modelname = (filename.replace('/', '').replace('*', '')
                 + str(size) + str(charstop) + ".m")
    print(modelname)
    print("Material:", material)
    print("Size:", size, "entries,", trainportion, "as training")
    print(datetime.datetime.now())

    # The vector dictionary is only needed for vector-based features.
    vdict = None
    if features > 1:
        vdict = util.readvec(dictfile)
        print("Dict:", dictfile)

    # Read all lines, then draw a random sample of at most `size` lines.
    li = list(util.file_to_lines(glob.glob(material)))
    random.shuffle(li)
    li = li[:size]

    # Prepare data: list of (feature sequence, label sequence) pairs.
    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        if features == 1:
            feats = crf.x_seq_to_features_discrete(x, charstop)
        elif features == 2:
            feats = crf.x_seq_to_features_vector(x, vdict, charstop)
        else:
            feats = crf.x_seq_to_features_both(x, vdict, charstop)
        data.append((feats, y))

    traindata = data[:cut]
    if not traindata:
        # Without this check the failure would surface much later as an
        # opaque unpacking error in the evaluation step.
        raise ValueError("no training sequences found for %r" % (material,))

    # Train the model.
    trainer = pycrfsuite.Trainer()
    for x, y in traindata:
        trainer.append(x, y)
    trainer.select(crfmethod)
    trainer.set('max_iterations', max_iterations)
    trainer.train(modelname)

    # Load the trained model and write a text dump next to it.
    tagger = pycrfsuite.Tagger()
    tagger.open(modelname)
    try:
        tagger.dump(modelname + ".txt")

        print(datetime.datetime.now())
        print("Start closed testing...")
        results = []
        # NOTE(review): this prints every training feature sequence and can
        # be very large; kept so the function's output is unchanged.
        print(traindata)
        # Walk the training data back to front, matching the order in which
        # the sequences were previously popped; pr/pp/yout printed below
        # therefore still belong to the first training sequence.
        for x, yref in reversed(traindata):
            yout = tagger.tag(x)
            pr = tagger.marginal('S', 0)
            pp = tagger.probability(yout)
            results.append(util.eval(yref, yout, "S"))
    finally:
        tagger.close()

    # Each result is a (tp, fp, fn, tn) tuple; sum them column-wise.
    tp, fp, fn, tn = (sum(column) for column in zip(*results))
    # Guard the ratios so a model that never predicts (or never sees) "S"
    # reports 0 instead of raising ZeroDivisionError.
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0

    print("Total tokens in Train Set:", tp + fp + fn + tn)
    print("Total S in REF:", tp + fn)
    print("Total S in OUT:", tp + fp)
    print("Presicion:", p)
    print("Recall:", r)
    print("*******************F1-score:", f1)
    print("*******************:", pr)
    print("*******************:", pp)
    print("*******************:", yout)
    print(datetime.datetime.now())
    return modelname
cut = int(size * trainportion) #訓練模型名稱 modelname = material.replace('/', '').replace( '*', '') + str(size) + str(charstop) + ".m" print("Material:", material) print("Size:", size, "entries,", trainportion, "as training") print(datetime.datetime.now()) # Prepare li: list of random lines if features > 1: vdict = util.readvec(dictfile) #先處理文本 print("Dict:", dictfile) li = [line for line in util.file_to_lines(glob.glob(material))] #已經切成陣列 random.shuffle(li) #做亂數取樣 print(len(li)) #li = li[:size] # Prepare data: list of x(char), y(label) sequences data = [] for line in li: x, y = util.line_toseq(line, charstop) #print(x) #print(y[:5]) #這邊在做文本做gram if features == 1: d = crf.x_seq_to_features_discrete(x, charstop, 1), y
"python runhmm.py 'data/sjw/*' 80 1" args = sys.argv if len(args)>1: material = args[1] size = int(args[2]) charstop = int(args[3]) cut = int(size*trainportion) print "Material:", material print "Size:", size, "entries,", trainportion, "as training" print "Starting Time:",datetime.datetime.now() # Prepare li: list of random lines print "Reading from files..." li = [line for line in util.file_to_lines(glob.glob(material))] random.shuffle(li) li = li[:size] # Prepare data: list of x(char), y(label) sequences print "Prepare list of sequences..." closetestdata = li[:cut] testdata = li[cut:] traindata = [] for line in closetestdata: x, y = util.line_toseq(line, charstop) traindata.append(zip(x,y)) # traindata shape: [[(x,y),(x,y), ...],[],[],...]
import util " PARAMETERS " charstop = False # True means label attributes to previous char " END OF PARAMETERS " "python cpr.py 'qualitative/allover-sjw-gold.txt' 'qualitative/allover-sjw-me.txt' 1" args = sys.argv if len(args)>1: material1 = args[1] material2 = args[2] charstop = int(args[3]) # Prepare li: list of random lines print "Reading from files..." gold = [line for line in util.file_to_lines(glob.glob(material1))] out = [line for line in util.file_to_lines(glob.glob(material2))] golddata = [] for line in gold: golddata.append(util.line_toseq(line, charstop)) outdata = [] for line in out: outdata.append(util.line_toseq(line, charstop)) # testdata shape: [([x1, x2, ...],[y1,y2,...]),([],[])] results = [] assert len(golddata)==len(outdata)
args = sys.argv if len(args)>1: material = args[1] size = int(args[2]) charstop = int(args[3]) hu = args[4] cut = int(size*trainportion) print "Material:", material print "Size:", size, "entries,", trainportion, "as training" print "Starting Time:",datetime.datetime.now() # Prepare li: list of random lines print "Reading from files..." li = [line for line in util.file_to_lines(glob.glob(material))] random.shuffle(li) li = li[:size] # Prepare data: list of x(char), y(label) sequences print "Prepare list of sequences..." closetestdata = li[:cut] traindata = [] for line in closetestdata: x, y = util.line_toseq(line, charstop) traindata.append(zip(x,y)) # traindata shape: [[(x,y),(x,y), ...],[],[],...] # testdata shape: [([x1, x2, ...],[y1,y2,...]),([],[])]
# Command-line overrides: material glob, sample size, charstop flag, hu.
args = sys.argv
if len(args) > 1:
    material = args[1]
    size = int(args[2])
    charstop = int(args[3])
    hu = args[4]

# Number of sampled lines that go into the training part.
cut = int(size * trainportion)

print("Material:", material)
print("Size:", size, "entries,", trainportion, "as training")
print("Starting Time:", datetime.datetime.now())

# Build li: a shuffled sample of at most `size` lines.
print("Reading from files...")
li = list(util.file_to_lines(glob.glob(material)))
random.shuffle(li)
li = li[:size]

# Turn the training part of the sample into (x, y) pair sequences.
print("Prepare list of sequences...")
closetestdata = li[:cut]
traindata = []
for line in closetestdata:
    x, y = util.line_toseq(line, charstop)
    # zip() is a one-shot iterator on Python 3; materialise it so every
    # entry really is a list of (x, y) tuples that can be traversed more
    # than once, as the shape note below promises.
    traindata.append(list(zip(x, y)))
# Shape of traindata: [[(x, y), (x, y), ...], [], [], ...]