np.random.seed(0)

# train hyperparameters
feature_length = conf.ner_feature_length_2
batch_size = conf.word_batch_size
nb_epoch = conf.word_nb_epoch

model_name = os.path.basename(__file__)[:-3]
folder_path = 'model/%s' % model_name
if not os.path.isdir(folder_path):
    os.makedirs(folder_path)

# the data, shuffled and split between train and dev sets
train_data = load_data.load_ner(dataset='eng.train')
dev_data = load_data.load_ner(dataset='eng.testa')

train_samples = len(train_data)
dev_samples = len(dev_data)
print('train shape:', train_samples)
print('dev shape:', dev_samples)
print()

word_train_data = []
word_dev_data = []

# flatten the sentence-level samples into token-level lists,
# combining every token of the train and dev sets
for each in train_data:
    word_train_data.extend(each[0])
for each in dev_data:
    word_dev_data.extend(each[0])

word_train_samples = len(word_train_data)
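# Minimal illustration of the flattening above, on hypothetical samples
# (each sample's first element is assumed to be its token sequence, as the
# indexing each[0] suggests; _demo and _flat are illustrative names only):
_demo = [(['EU', 'rejects'],), (['German', 'call'],)]
_flat = []
for _each in _demo:
    _flat.extend(_each[0])
assert _flat == ['EU', 'rejects', 'German', 'call']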
import os
import sys

# add path
sys.path.append('../')
sys.path.append('../tools')

from tools import conf
from tools import load_data
from tools import prepare

# input sentence dimensions
step_length = conf.ner_step_length
pos_length = conf.ner_pos_length
chunk_length = conf.ner_chunk_length
IOB = conf.ner_BIOES_decode

test_data = load_data.load_ner(dataset='eng.testb')

best_epoch = sys.argv[1]

model_name = os.path.basename(__file__)[9:-3]
folder_path = './model/%s' % model_name
model_path = '%s/model_epoch_%s.h5' % (folder_path, best_epoch)

result = open('%s/predict.txt' % folder_path, 'w')


def convert(chunktags):
    # convert BIOES tags back to BIO in place:
    # E-X becomes I-X, S-X becomes B-X; B-X, I-X and O are unchanged
    for p, q in enumerate(chunktags):
        if q.startswith("E-"):
            chunktags[p] = "I-" + q[2:]
        elif q.startswith("S-"):
            chunktags[p] = "B-" + q[2:]
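# Quick sanity check of convert() on illustrative tags (not corpus data;
# _demo_tags is a hypothetical name):
_demo_tags = ['B-ORG', 'E-ORG', 'O', 'S-PER']
convert(_demo_tags)
assert _demo_tags == ['B-ORG', 'I-ORG', 'O', 'B-PER']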
from tools import prepare

# input sentence dimensions
step_length = conf.ner_step_length
pos_length = conf.ner_pos_length
chunk_length = conf.ner_chunk_length
gazetteer_length = conf.gazetteer_length
IOB = conf.ner_BIOES_decode

data = sys.argv[1]
best_epoch = sys.argv[2]

if data == "dev":
    test_data = load_data.load_ner(dataset='eng.testa', form='BIOES')
elif data == "test":
    test_data = load_data.load_ner(dataset='eng.testb', form='BIOES')

tokens = [len(x[0]) for x in test_data]
print(sum(tokens))
print('%s shape:' % data, len(test_data))

model_name = os.path.basename(__file__)[9:-3]
folder_path = './model/%s' % model_name
model_path = '%s/model_epoch_%s.h5' % (folder_path, best_epoch)

result = open('%s/predict.txt' % folder_path, 'w')


def convert(chunktags):
    # convert BIOES tags back to BIO in place:
    # E-X becomes I-X, S-X becomes B-X; B-X, I-X and O are unchanged
    for p, q in enumerate(chunktags):
        if q.startswith("E-"):
            chunktags[p] = "I-" + q[2:]
        elif q.startswith("S-"):
            chunktags[p] = "B-" + q[2:]
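# Example invocation (the epoch number 12 is hypothetical; substitute this
# script's actual filename):
#
#   python <this_script>.py dev 12    # score the eng.testa (dev) split
#   python <this_script>.py test 12   # score the eng.testb (test) split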
hash_vocab = conf.ner_hash_vocab
hash_length = conf.ner_hash_length
output_length = conf.ner_BIOES_length
batch_size = conf.batch_size
nb_epoch = 50  # conf.nb_epoch

model_name = os.path.basename(__file__)[:-3]
folder_path = 'model/%s' % model_name
if not os.path.isdir(folder_path):
    os.makedirs(folder_path)

# the data, shuffled and split between train and dev sets
train_data = load_data.load_ner(dataset='eng.train', form='BIOES')
dev_data = load_data.load_ner(dataset='eng.testa', form='BIOES')

train_samples = len(train_data)
dev_samples = len(dev_data)
print('train shape:', train_samples)
print('dev shape:', dev_samples)
print()

# load the pretrained SENNA embeddings, then prepend an all-zero row and
# append a row of uniform random values
word_embedding = pd.read_csv('../preprocessing/senna/embeddings.txt',
                             delimiter=' ', header=None)
word_embedding = word_embedding.values
word_embedding = np.concatenate([
    np.zeros((1, emb_length)),
    word_embedding,
    np.random.uniform(-1, 1, (1, emb_length)),
])
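# Resulting lookup-table layout (an assumption consistent with the
# concatenation above; the index convention is not shown in this snippet):
#   row 0            -> all-zero vector, conventionally the padding entry
#   rows 1 .. V      -> the V pretrained SENNA embeddings
#   row V + 1 (last) -> random vector, conventionally the unknown-word entry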