def Base(eta, l2, morepara, emb, batchsize):
    """Fill in the shared ``params`` object and train the word-level CRF tagger.

    Sets the experiment configuration on the module-level ``params``,
    loads the embedding file and the three BIOES CoNLL splits, then
    builds ``CRF_model`` and calls its ``train`` method.

    Args:
        eta: Stored as ``params.eta``; labelled "LearningRate" in the
            output name.
        l2: Stored as ``params.L2``; appended to the output name.
        morepara: Stored as ``params.morepara``; appended to the output
            name directly after ``l2`` with no separator.
        emb: Stored as ``params.emb``; labelled "emb" in the output name.
        batchsize: Stored as ``params.batchsize``; labelled "Batchsize"
            in the output name.
    """
    params.outfile = 'NER_CRF_lstm_Viterti_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'
    params.batchsize = batchsize
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.num_labels = 17
    params.morepara = morepara

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    # Map 'UUUNKKK' (presumably the unknown-word token) to index 0 and
    # prepend an all-zero row so that index has a matching vector.
    words.update({'UUUNKKK': 0})
    We = [[0] * len(We[0])] + We
    We = np.asarray(We).astype('float32')
    print(We.shape)

    tagger = getTagger('../ner_data/ner_bioes')
    print(tagger)
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')

    # The output name records the hyper-parameters of this run.
    params.outfile = (
        params.outfile
        + '.Batchsize_' + str(params.batchsize)
        + '_dropout_' + str(params.dropout)
        + '_LearningRate_' + str(params.eta)
        + '_' + str(l2) + str(morepara)
        + '_emb_' + str(emb)
    )

    # Original author's note: the examples are shuffled data.
    train_x, train_y, _, _ = Get_Ner_bioes(params.dataf, words, tagger)

    # The raw tokens and the fourth return value of the dev/test loads
    # are kept on params (devrawx/devpos, testrawx/testpos).
    dev_x, dev_y, params.devrawx, params.devpos = Get_Ner_bioes(
        params.dev, words, tagger)
    print(dev_y[:10])
    # Single-argument print calls produce the same text under Python 2
    # as the original comma-separated print statements.
    print('dev set ' + str(len(dev_x)))

    test_x, test_y, params.testrawx, params.testpos = Get_Ner_bioes(
        params.test, words, tagger)
    print('test set ' + str(len(test_x)))

    print("Using Training Data" + params.dataf)
    print("Using Word Embeddings with Dimension " + str(params.embedsize))
    print("Saving models to: " + params.outfile)

    tm = CRF_model(We, params)
    tm.train(train_x, train_y, dev_x, dev_y, test_x, test_y, params)
def Base(eta, l2, num_filters, emb, hidden):
    """Fill in the shared ``params`` object and train the word+character CRF tagger.

    Sets the experiment configuration on the module-level ``params``,
    loads the embedding file, the tag and character dictionaries and the
    three BIOES CoNLL splits (word ids, character ids, labels), then
    builds ``CRF_model`` and calls its ``train`` method.

    Args:
        eta: Stored as ``params.eta``; labelled "LearningRate" in the
            output name.
        l2: Stored as ``params.L2``; appended to the output name.
        num_filters: Stored as ``params.num_filters``; appended to the
            output name.
        emb: Stored as ``params.emb``.
        hidden: Stored as ``params.hidden``; labelled "hidden" in the
            output name.
    """
    params.outfile = 'NER_BiLSTM_CNN_CRF_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'
    params.batchsize = 10
    params.hidden = hidden
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.L2 = l2
    params.dropout = 1
    params.num_labels = 17
    params.char_embedd_dim = 30
    params.num_filters = num_filters

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    # Map 'UUUNKKK' (presumably the unknown-word token) to index 0 and
    # prepend an all-zero row so that index has a matching vector.
    words.update({'UUUNKKK': 0})
    We = np.asarray([[0] * len(We[0])] + We).astype('float32')
    print(We.shape)

    tagger = getTagger('../ner_data/ner_bioes')
    print(tagger)

    char_dic = getTagger('../ner_data/char_dic')
    params.char_dic = char_dic
    # Character embeddings are drawn uniformly from [-bound, bound) with
    # bound = sqrt(3 / char_embedd_dim).
    bound = np.sqrt(3.0 / params.char_embedd_dim)
    table_shape = [len(char_dic), params.char_embedd_dim]
    char_embedd_table = np.random.uniform(
        -bound, bound, table_shape).astype(theano.config.floatX)

    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')

    # The output name records the hyper-parameters of this run.
    name_parts = [
        params.outfile,
        '.Batchsize_', str(params.batchsize),
        '_dropout_', str(params.dropout),
        '_LearningRate_', str(params.eta),
        '_', str(l2),
        '_', str(num_filters),
        '_hidden_', str(hidden),
    ]
    params.outfile = ''.join(name_parts)

    train_x, train_chars, train_y, _, _ = Get_Ner_bioes_and_Char(
        params.dataf, words, tagger, char_dic)
    train = (train_x, train_y, train_chars)

    dev_x, dev_chars, dev_y, dev_raw, dev_pos = Get_Ner_bioes_and_Char(
        params.dev, words, tagger, char_dic)
    params.devrawx = dev_raw
    params.devpos = dev_pos
    dev = (dev_x, dev_y, dev_chars)
    print(dev_y[:10])
    # Single-argument print calls produce the same text under Python 2
    # as the original comma-separated print statements.
    print('dev set ' + str(len(dev_x)))

    test_x, test_chars, test_y, test_raw, test_pos = Get_Ner_bioes_and_Char(
        params.test, words, tagger, char_dic)
    params.testrawx = test_raw
    params.testpos = test_pos
    test = (test_x, test_y, test_chars)
    print('test set ' + str(len(test_x)))

    print("Using Training Data" + params.dataf)
    print("Using Word Embeddings with Dimension " + str(params.embedsize))
    print("Saving models to: " + params.outfile)

    tm = CRF_model(We, char_embedd_table, params)
    tm.train(train, dev, test, params)
def Base(eta, l2, num_filters, inf, hidden_size):
    """Fill in the shared ``params`` object and train the model selected by ``inf``.

    Sets the experiment configuration on the module-level ``params``,
    loads the embedding file, the tag and character dictionaries and the
    three BIOES CoNLL splits, then imports and trains one model:

    * ``inf`` 0 or 1 -> ``base_ner_model_selection.base_model``
    * ``inf`` 2      -> ``seq2seq_att_ner.Seq2Seq``
    * ``inf`` 3      -> ``self_att_ner.Transformer``

    Any other ``inf`` value loads the data and returns without training.

    Args:
        eta: Stored as ``params.eta``; labelled "LearningRate" in the
            output name.
        l2: Stored as ``params.L2``; appended to the output name.
        num_filters: Stored as ``params.num_filters``; appended to the
            output name.
        inf: Model selector, stored as ``params.inf``.
        hidden_size: Stored as ``params.hidden``, ``params.en_hidden_size``
            and ``params.de_hidden_size``.
    """
    params.outfile = 'base_ner_inf_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'
    params.batchsize = 10
    params.hidden = hidden_size
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 1
    params.emb = 1
    params.inf = inf
    params.char_embedd_dim = 30
    params.num_filters = num_filters
    params.en_hidden_size = hidden_size
    params.de_hidden_size = hidden_size
    params.lstm_layers_num = 1
    params.num_labels = 17
    params.layers_num = 3

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    # Map 'UUUNKKK' (presumably the unknown-word token) to index 0 and
    # prepend an all-zero row so that index has a matching vector.
    words.update({'UUUNKKK': 0})
    We = [[0] * len(We[0])] + We
    We = np.asarray(We).astype('float32')
    print(We.shape)

    tagger = getTagger('../ner_data/ner_bioes')
    # Loaded once; the original repeated this identical call further down.
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')
    print(tagger)

    char_dic = getTagger('../ner_data/char_dic')
    params.char_dic = char_dic
    # Character embeddings are drawn uniformly from [-scale, scale) with
    # scale = sqrt(3 / char_embedd_dim).
    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    # The output name records the hyper-parameters of this run.
    params.outfile = (
        params.outfile
        + '_dropout_' + str(params.dropout)
        + '_LearningRate_' + str(params.eta)
        + '_inf_' + str(inf)
        + '_' + str(l2)
        + '_' + str(num_filters)
        + '_hidden_' + str(hidden_size)
    )

    train_x, train_chars, train_y, _, _ = Get_Ner_bioes_and_Char(
        params.dataf, words, tagger, char_dic)
    train = train_x, train_y, train_chars

    dev_x, dev_chars, dev_y, params.devrawx, params.devpos = Get_Ner_bioes_and_Char(
        params.dev, words, tagger, char_dic)
    dev = dev_x, dev_y, dev_chars
    print(dev_y[:10])
    # Single-argument print produces the same text under Python 2 as the
    # original comma-separated print statement.
    print('dev set ' + str(len(dev_x)))

    test_x, test_chars, test_y, params.testrawx, params.testpos = Get_Ner_bioes_and_Char(
        params.test, words, tagger, char_dic)
    test = test_x, test_y, test_chars

    # Imports stay local so only the selected model module is loaded.
    if inf == 0 or inf == 1:
        from base_ner_model_selection import base_model as model_cls
    elif inf == 2:
        from seq2seq_att_ner import Seq2Seq as model_cls
    elif inf == 3:
        from self_att_ner import Transformer as model_cls
    else:
        # NOTE(review): unsupported selectors are silently ignored, as in
        # the original; consider failing fast before the data is loaded.
        return

    tm = model_cls(We, char_embedd_table, params)
    tm.train(train, dev, test, params)
def Base(eta, l2, inf, hidden_size):
    """Fill in the shared ``params`` object and train the word-level model selected by ``inf``.

    Sets the experiment configuration on the module-level ``params``,
    loads the embedding file and the three BIOES CoNLL splits, then
    trains one model:

    * ``inf`` 0 or 1 -> ``base_model`` (expected to be in module scope)
    * ``inf`` 2      -> ``seq2seq_att_ner_h.Seq2Seq``
    * ``inf`` 3      -> ``seq2seq_att_ner_h_beamsearch.Seq2Seq``
    * ``inf`` 4      -> ``seq2seq_local_att_ner.Seq2Seq``; this branch
      also reads ``sys.argv[5]`` (window, converted to int) and
      ``sys.argv[6]`` (used only in the output name).

    Any other ``inf`` value loads the data and returns without training.

    Args:
        eta: Stored as ``params.eta``; labelled "LearningRate" in the
            output name.
        l2: Stored as ``params.L2``; appended to the output name.
        inf: Model selector, stored as ``params.inf``.
        hidden_size: Stored as ``params.hidden``, ``params.en_hidden_size``
            and ``params.de_hidden_size``; appended to the output name.
    """
    params.outfile = 'h_base_ner_inf_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'
    params.batchsize = 10
    params.hidden = hidden_size
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.emb = 0
    params.inf = inf
    params.en_hidden_size = hidden_size
    params.de_hidden_size = hidden_size
    params.lstm_layers_num = 1
    params.num_labels = 17

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    # Map 'UUUNKKK' (presumably the unknown-word token) to index 0 and
    # prepend an all-zero row so that index has a matching vector.
    words.update({'UUUNKKK': 0})
    We = np.asarray([[0] * len(We[0])] + We).astype('float32')
    print(We.shape)

    tagger = getTagger('../ner_data/ner_bioes')
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')
    print(tagger)

    # The output name records the hyper-parameters of this run.
    params.outfile = ''.join([
        params.outfile,
        '.Batchsize_', str(params.batchsize),
        '_LearningRate_', str(params.eta),
        '_inf_', str(inf),
        '_', str(l2),
        '_', str(hidden_size),
    ])

    # Original author's note: the examples are shuffled data.
    train_x, train_y, _, _ = Get_Ner_bioes(params.dataf, words, tagger)
    traindata = (train_x, train_y)

    dev_x, dev_y, dev_raw, dev_pos = Get_Ner_bioes(params.dev, words, tagger)
    params.devrawx = dev_raw
    params.devpos = dev_pos
    devdata = (dev_x, dev_y)
    print(dev_y[:10])
    # Single-argument print calls produce the same text under Python 2
    # as the original comma-separated print statements.
    print('dev set ' + str(len(dev_x)))

    test_x, test_y, test_raw, test_pos = Get_Ner_bioes(
        params.test, words, tagger)
    params.testrawx = test_raw
    params.testpos = test_pos
    testdata = (test_x, test_y)
    print('test set ' + str(len(test_x)))

    print("Using Training Data" + params.dataf)
    print("Using Word Embeddings with Dimension " + str(params.embedsize))
    print("Saving models to: " + params.outfile)

    # Pick the model class; imports stay local so only the selected
    # module is loaded.
    if inf == 0 or inf == 1:
        model_cls = base_model
    elif inf == 2:
        from seq2seq_att_ner_h import Seq2Seq as model_cls
    elif inf == 3:
        from seq2seq_att_ner_h_beamsearch import Seq2Seq as model_cls
    elif inf == 4:
        from seq2seq_local_att_ner import Seq2Seq as model_cls
        # Extra settings come straight from the command line. The name
        # prefix is added after "Saving models to" has been printed.
        params.window = int(sys.argv[5])
        params.outfile = ('local_att_window_' + str(params.window)
                          + '_attweight_' + sys.argv[6] + params.outfile)
    else:
        # Unsupported selectors fall through without training.
        return

    tm = model_cls(We, params)
    tm.train(traindata, devdata, testdata, params)
def Base(eta, l3, emb, num_filters, inf, hidden_inf):
    """Fill in the shared ``params`` object and train the inference model selected by ``inf``.

    Sets the experiment configuration on the module-level ``params``,
    loads the embedding file, the tag and character dictionaries and the
    three BIOES CoNLL splits, then imports and trains one model:

    * ``inf`` 0 or 1 -> ``model_selection_NER_inference.CRF_model``
    * ``inf`` 2      -> ``model_selection_inference_NER_seq2seq.CRF_seq2seq_model``
    * otherwise      -> ``model_selection_inference_NER_seq2seq_beamsearch.CRF_seq2seq_model``

    Both seq2seq variants also set ``params.de_hidden_size`` to
    ``hidden_inf``.

    Args:
        eta: Stored as ``params.eta``; labelled "LearningRate" in the
            output name.
        l3: Stored as ``params.L3``; appended to the output name.
        emb: Stored as ``params.emb``; labelled "emb" in the output name.
        num_filters: Stored as ``params.num_filters``; labelled
            "num_filters" in the output name.
        inf: Model selector, stored as ``params.inf``.
        hidden_inf: Stored as ``params.hidden_inf``; labelled "hidden" in
            the output name.
    """
    params.outfile = 'CRF_Inf_NER_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'
    params.batchsize = 10
    params.hidden = 200
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 1
    params.hidden_inf = hidden_inf
    params.char_embedd_dim = 30
    params.num_filters = num_filters
    params.inf = inf
    params.regutype = 0
    params.annealing = 1
    params.L3 = l3

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    # Map 'UUUNKKK' (presumably the unknown-word token) to index 0 and
    # prepend an all-zero row so that index has a matching vector.
    words.update({'UUUNKKK': 0})
    We = np.asarray([[0] * len(We[0])] + We).astype('float32')

    tagger = getTagger('../ner_data/ner_bioes')
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')

    char_dic = getTagger('../ner_data/char_dic')
    params.char_dic = char_dic
    # Character embeddings are drawn uniformly from [-bound, bound) with
    # bound = sqrt(3 / char_embedd_dim).
    bound = np.sqrt(3.0 / params.char_embedd_dim)
    table_shape = [len(char_dic), params.char_embedd_dim]
    char_embedd_table = np.random.uniform(
        -bound, bound, table_shape).astype(theano.config.floatX)

    params.words = words
    params.tagger = tagger

    # The output name records the hyper-parameters of this run.
    params.outfile = ''.join([
        params.outfile,
        '.num_filters_', str(num_filters),
        '_dropout_', str(params.dropout),
        '_LearningRate_', str(params.eta),
        '_', str(l3),
        '_emb_', str(emb),
        '_inf_', str(params.inf),
        '_hidden_', str(params.hidden_inf),
        '_annealing_', str(params.annealing),
    ])

    train_x, train_chars, train_y, _, _ = Get_Ner_bioes_and_Char(
        params.dataf, words, tagger, char_dic)
    train = (train_x, train_y, train_chars)

    dev_x, dev_chars, dev_y, dev_raw, dev_pos = Get_Ner_bioes_and_Char(
        params.dev, words, tagger, char_dic)
    params.devrawx = dev_raw
    params.devpos = dev_pos
    dev = (dev_x, dev_y, dev_chars)

    test_x, test_chars, test_y, test_raw, test_pos = Get_Ner_bioes_and_Char(
        params.test, words, tagger, char_dic)
    params.testrawx = test_raw
    params.testpos = test_pos
    test = (test_x, test_y, test_chars)

    # Imports stay local so only the selected model module is loaded.
    if inf == 0 or inf == 1:
        from model_selection_NER_inference import CRF_model as model_cls
    else:
        if inf == 2:
            from model_selection_inference_NER_seq2seq import (
                CRF_seq2seq_model as model_cls)
        else:
            from model_selection_inference_NER_seq2seq_beamsearch import (
                CRF_seq2seq_model as model_cls)
        # Set after the import, in the same order as the original branches.
        params.de_hidden_size = hidden_inf

    tm = model_cls(We, char_embedd_table, params)
    tm.train(train, dev, test, params)
def Base(eta, l3, epoches, warmstart):
    """Fill in the shared ``params`` object and train the SGD-inference CRF model.

    Sets the experiment configuration on the module-level ``params``
    (with ``emb`` fixed to 1 and ``num_filters`` fixed to 50), loads the
    embedding file, the tag and character dictionaries and the three
    BIOES CoNLL splits, then trains
    ``model_selection_NER_sgd_inference.CRF_model``.

    Args:
        eta: Stored as ``params.eta``; labelled "LearningRate" in the
            output name.
        l3: Stored as ``params.L3``; appended to the output name.
        epoches: Stored as ``params.epoches``.
        warmstart: Stored as ``params.WarmStart``.
    """
    emb = 1
    num_filters = 50

    params.outfile = 'CRF_Inf_NER_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'
    params.batchsize = 10
    params.hidden = 200
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 1
    params.char_embedd_dim = 30
    params.num_filters = num_filters
    params.epoches = epoches
    params.regutype = 0
    params.annealing = 0
    params.L3 = l3
    params.hidden_inf = 200
    params.WarmStart = warmstart

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    # Map 'UUUNKKK' (presumably the unknown-word token) to index 0 and
    # prepend an all-zero row so that index has a matching vector.
    words.update({'UUUNKKK': 0})
    We = np.asarray([[0] * len(We[0])] + We).astype('float32')

    tagger = getTagger('../ner_data/ner_bioes')
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')

    char_dic = getTagger('../ner_data/char_dic')
    params.char_dic = char_dic
    # Character embeddings are drawn uniformly from [-bound, bound) with
    # bound = sqrt(3 / char_embedd_dim).
    bound = np.sqrt(3.0 / params.char_embedd_dim)
    table_shape = [len(char_dic), params.char_embedd_dim]
    char_embedd_table = np.random.uniform(
        -bound, bound, table_shape).astype(theano.config.floatX)

    params.words = words
    params.tagger = tagger

    # The output name records the hyper-parameters of this run.
    params.outfile = ''.join([
        params.outfile,
        '.num_filters_', str(num_filters),
        '_LearningRate_', str(params.eta),
        '_', str(l3),
        '_emb_', str(emb),
    ])
    print(params.outfile)

    train_x, train_chars, train_y, _, _ = Get_Ner_bioes_and_Char(
        params.dataf, words, tagger, char_dic)
    train = (train_x, train_y, train_chars)

    dev_x, dev_chars, dev_y, dev_raw, dev_pos = Get_Ner_bioes_and_Char(
        params.dev, words, tagger, char_dic)
    params.devrawx = dev_raw
    params.devpos = dev_pos
    dev = (dev_x, dev_y, dev_chars)

    test_x, test_chars, test_y, test_raw, test_pos = Get_Ner_bioes_and_Char(
        params.test, words, tagger, char_dic)
    params.testrawx = test_raw
    params.testpos = test_pos
    test = (test_x, test_y, test_chars)

    # Imported here, after the data is loaded, as in the original.
    from model_selection_NER_sgd_inference import CRF_model
    tm = CRF_model(We, char_embedd_table, params)
    tm.train(train, dev, test, params)
def Base(eta, l3, emb, batchsize, inf, hidden_inf):
    """Fill in the shared ``params`` object and train the word-level inference model selected by ``inf``.

    Sets the experiment configuration on the module-level ``params``,
    loads the embedding file and the three BIOES CoNLL splits, then
    imports and trains one model:

    * ``inf`` 0 or 1 -> ``model_selection_NER_inference.CRF_model``
    * ``inf`` 2      -> ``model_selection_inference_NER_seq2seq_h.CRF_seq2seq_model``
    * otherwise      -> ``model_selection_inference_NER_seq2seq_h_beamsearch.CRF_seq2seq_model``

    Both seq2seq variants also set ``params.de_hidden_size`` to
    ``hidden_inf`` and prefix the output name with
    ``'h_de_hidden_<de_hidden_size>_'``.

    Args:
        eta: Stored as ``params.eta``; labelled "LearningRate" in the
            output name.
        l3: Stored as ``params.L3``; appended to the output name.
        emb: Stored as ``params.emb``; labelled "emb" in the output name.
        batchsize: Stored as ``params.batchsize``; labelled "Batchsize"
            in the output name.
        inf: Model selector, stored as ``params.inf``.
        hidden_inf: Stored as ``params.hidden_inf``; labelled "hidden" in
            the output name.
    """
    params.outfile = 'h_CRF_Inf_NER_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'
    params.batchsize = batchsize
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 0
    params.hidden_inf = hidden_inf
    params.inf = inf
    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    # Map 'UUUNKKK' (presumably the unknown-word token) to index 0 and
    # prepend an all-zero row so that index has a matching vector.
    words.update({'UUUNKKK': 0})
    We = [[0] * len(We[0])] + We
    We = np.asarray(We).astype('float32')

    tagger = getTagger('../ner_data/ner_bioes')
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')
    params.words = words
    params.tagger = tagger

    # The output name records the hyper-parameters of this run.
    params.outfile = (
        params.outfile
        + '.Batchsize_' + str(params.batchsize)
        + '_dropout_' + str(params.dropout)
        + '_LearningRate_' + str(params.eta)
        + '_' + str(l3)
        + '_emb_' + str(emb)
        + '_inf_' + str(params.inf)
        + '_hidden_' + str(params.hidden_inf)
    )

    train_x, train_y, _, _ = Get_Ner_bioes(params.dataf, words, tagger)

    dev_x, dev_y, params.devrawx, params.devpos = Get_Ner_bioes(
        params.dev, words, tagger)
    print(dev_y[:10])
    # Single-argument print produces the same text under Python 2 as the
    # original comma-separated print statement.
    print('dev set ' + str(len(dev_x)))

    test_x, test_y, params.testrawx, params.testpos = Get_Ner_bioes(
        params.test, words, tagger)

    print("Using Training Data" + params.dataf)
    print("Using Word Embeddings with Dimension " + str(params.embedsize))
    print("Saving models to: " + params.outfile)

    # Imports stay local so only the selected model module is loaded.
    if inf == 0 or inf == 1:
        from model_selection_NER_inference import CRF_model as model_cls
    else:
        if inf == 2:
            from model_selection_inference_NER_seq2seq_h import (
                CRF_seq2seq_model as model_cls)
        else:
            from model_selection_inference_NER_seq2seq_h_beamsearch import (
                CRF_seq2seq_model as model_cls)
        # The name prefix is added after "Saving models to" has been
        # printed, as in the original.
        params.de_hidden_size = hidden_inf
        params.outfile = ('h_de_hidden_' + str(params.de_hidden_size)
                          + '_' + params.outfile)

    tm = model_cls(We, params)
    tm.train(train_x, train_y, dev_x, dev_y, test_x, test_y, params)