def main(_):
    np.random.seed(1337)
    random.seed(1337)

    if FLAGS.only_test or FLAGS.train_steps == 0:
        FLAGS.train_steps = 0
        test(FLAGS)
        return

    print "#" * 67
    print "# Loading data from:"
    print "#" * 67
    print "Train:", FLAGS.train_data
    print "Valid:", FLAGS.valid_data
    print "Test: ", FLAGS.test_data
    print "Feature threshold:", FLAGS.feat_thresh

    # Load feature templates
    template = Template(FLAGS.template)

    # Pretreatment process: read, split and create vocabularies
    train_set, valid_set, test_set, dicts, max_len = pretreatment(
        FLAGS.train_data, FLAGS.valid_data, FLAGS.test_data,
        threshold=FLAGS.feat_thresh, template=template)

    # Reset the maximum sentence length
    # max_len = max(MAX_LEN, max_len)
    FLAGS.max_len = max_len

    # Unfold the corpora
    train_corpus, train_lens = train_set
    valid_corpus, valid_lens = valid_set
    test_corpus, test_lens = test_set
    train_sentcs, train_featvs, train_labels = unfold_corpus(train_corpus)
    valid_sentcs, valid_featvs, valid_labels = unfold_corpus(valid_corpus)
    test_sentcs, test_featvs, test_labels = unfold_corpus(test_corpus)

    # Vocabularies
    feats2idx = dicts['feats2idx']
    words2idx = dicts['words2idx']
    label2idx = dicts['label2idx']
    FLAGS.label2idx = label2idx
    FLAGS.words2idx = words2idx
    FLAGS.feats2idx = feats2idx
    FLAGS.feat_size = len(feats2idx)

    print "Lexical word size: %d" % len(words2idx)
    print "Label size: %d" % len(label2idx)
    print "Features size: %d" % len(feats2idx)
    print "-------------------------------------------------------------------"
    print "Training data size: %d" % len(train_corpus)
    print "Validation data size: %d" % len(valid_corpus)
    print "Test data size: %d" % len(test_corpus)
    print "Maximum sentence len: %d" % FLAGS.max_len

    del train_corpus
    del valid_corpus
    # del test_corpus

    # The network's output_dim (the +1 accounts for the padding class)
    nb_classes = len(label2idx)
    FLAGS.nb_classes = nb_classes + 1

    idx2label = dict((k, v) for v, k in FLAGS.label2idx.iteritems())
    # idx2words = dict((k, v) for v, k in FLAGS.words2idx.iteritems())

    # Convert the corpora from string sequences to numeric id sequences,
    # post-padded with 0 up to max_len
    print "Preparing training, validation and testing data."
    train_X, train_F, train_Y = conv_corpus(
        train_sentcs, train_featvs, train_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)
    valid_X, valid_F, valid_Y = conv_corpus(
        valid_sentcs, valid_featvs, valid_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)
    test_X, test_F, test_Y = conv_corpus(
        test_sentcs, test_featvs, test_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)

    # Release memory; the CRF only consumes the feature ids (F) and labels (Y)
    del train_sentcs, train_featvs, train_labels
    del valid_sentcs, valid_featvs, valid_labels
    # del test_sentcs, test_featvs, test_labels
    del train_X, valid_X, test_X

    print "#" * 67
    print "Training arguments"
    print "#" * 67
    print "L2 regular: %f" % FLAGS.l2_reg
    print "nb_classes: %d" % FLAGS.nb_classes
    print "Batch size: %d" % FLAGS.batch_size
    # print "Hidden layer: %d" % FLAGS.hidden_dim
    print "Train epochs: %d" % FLAGS.train_steps
    print "Learning rate: %f" % FLAGS.lr

    print "#" * 67
    print "Training process start."
    print "#" * 67

    model = linear_chain_CRF(
        FLAGS.feat_size, FLAGS.nb_classes, FLAGS.max_len,
        FLAGS.batch_size, len(template.template), FLAGS.l2_reg)

    pred_test, test_loss, test_acc = model.run(
        train_F, train_Y, train_lens,
        valid_F, valid_Y, valid_lens,
        test_F, test_Y, test_lens,
        FLAGS)

    print "Test loss: %f, accuracy: %f" % (test_loss, test_acc)

    # Truncate padded predictions back to the true sentence lengths
    pred_test = [pred_test[i][:test_lens[i]] for i in xrange(len(pred_test))]
    pred_test_label = convert_id_to_word(pred_test, idx2label)

    if FLAGS.eval_test:
        res_test, pred_test_label = evaluate(pred_test_label, test_labels)
        print "Test F1: %f, P: %f, R: %f" % (
            res_test['f1'], res_test['p'], res_test['r'])

    original_text = [[item['w'] for item in sent] for sent in test_corpus]
    write_prediction(FLAGS.output_dir + 'prediction.utf8',
                     original_text, pred_test_label)

    print "Saving feature dicts..."
    save_dicts(FLAGS.output_dir, FLAGS.feats2idx,
               FLAGS.words2idx, FLAGS.label2idx)
def main(_):
    np.random.seed(1337)
    random.seed(1337)

    if FLAGS.only_test or FLAGS.train_steps == 0:
        FLAGS.train_steps = 0
        test(FLAGS)
        return

    print "#" * 67
    print "# Loading data from:"
    print "#" * 67
    print "Train:", FLAGS.train_data
    print "Valid:", FLAGS.valid_data
    print "Test: ", FLAGS.test_data

    # Map the window size to a (left, right) pair of context offsets
    if FLAGS.window == 1:
        win = (0, 0)
    elif FLAGS.window == 3:
        win = (-1, 1)
    elif FLAGS.window == 5:
        win = (-2, 2)
    else:
        raise ValueError('Unsupported window size %d.' % FLAGS.window)

    # Choose field templates & feature templates
    template = HybridTemplate(FLAGS.template, win)

    # Pretreatment process: read, split and create vocabularies
    train_set, valid_set, test_set, dicts, max_len = pretreatment(
        FLAGS.train_data, FLAGS.valid_data, FLAGS.test_data,
        threshold=0, template=template)

    # Reset the maximum sentence length
    # max_len = max(MAX_LEN, max_len)
    FLAGS.max_len = max_len

    # Unfold the corpora
    train_corpus, train_lens = train_set
    valid_corpus, valid_lens = valid_set
    test_corpus, test_lens = test_set
    train_sentcs, train_featvs, train_labels = unfold_corpus(train_corpus)
    valid_sentcs, valid_featvs, valid_labels = unfold_corpus(valid_corpus)
    test_sentcs, test_featvs, test_labels = unfold_corpus(test_corpus)

    # Vocabularies
    feats2idx = dicts['feats2idx']
    words2idx = dicts['words2idx']
    label2idx = dicts['label2idx']
    FLAGS.label2idx = label2idx
    FLAGS.words2idx = words2idx
    FLAGS.feats2idx = feats2idx
    FLAGS.feat_size = len(feats2idx)

    print "Lexical word size: %d" % len(words2idx)
    print "Label size: %d" % len(label2idx)
    print "Features size: %d" % len(feats2idx)
    print "-------------------------------------------------------------------"
    print "Training data size: %d" % len(train_corpus)
    print "Validation data size: %d" % len(valid_corpus)
    print "Test data size: %d" % len(test_corpus)
    print "Maximum sentence len: %d" % FLAGS.max_len

    del train_corpus
    del valid_corpus
    # del test_corpus

    # The network's output_dim (the +1 accounts for the padding class)
    nb_classes = len(label2idx)
    FLAGS.nb_classes = nb_classes + 1

    # The embedding layer's input_dim (the +1 accounts for the padding index)
    nb_words = len(words2idx)
    FLAGS.nb_words = nb_words
    FLAGS.in_dim = FLAGS.nb_words + 1

    # Load pre-trained embeddings from file
    print "#" * 67
    print "# Reading embeddings from file: %s" % (FLAGS.emb_file)
    emb_mat, idx_map = read_emb_from_file(FLAGS.emb_file, words2idx)
    FLAGS.emb_dim = max(emb_mat.shape[1], FLAGS.emb_dim)
    print "embeddings' size:", emb_mat.shape
    if FLAGS.fine_tuning:
        print "The embeddings will be fine-tuned!"

    idx2label = dict((k, v) for v, k in FLAGS.label2idx.iteritems())
    # idx2words = dict((k, v) for v, k in FLAGS.words2idx.iteritems())

    # Convert the corpora from string sequences to index sequences,
    # post-padded with 0 up to max_len
    print "Preparing training, validation and testing data."
    train_X, train_F, train_Y = conv_corpus(
        train_sentcs, train_featvs, train_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)
    valid_X, valid_F, valid_Y = conv_corpus(
        valid_sentcs, valid_featvs, valid_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)
    test_X, test_F, test_Y = conv_corpus(
        test_sentcs, test_featvs, test_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)

    # Release memory
    del train_sentcs, train_featvs, train_labels
    del valid_sentcs, valid_featvs, valid_labels
    # del test_sentcs, test_featvs, test_labels

    print "#" * 67
    print "Training arguments"
    print "#" * 67
    print "L2 regular: %f" % FLAGS.l2_reg
    print "nb_classes: %d" % FLAGS.nb_classes
    print "Batch size: %d" % FLAGS.batch_size
    print "Hidden layer: %d" % FLAGS.hidden_dim
    print "Train epochs: %d" % FLAGS.train_steps
    print "Learning rate: %f" % FLAGS.lr

    print "#" * 67
    print "Training process start."
    print "#" * 67

    # if FLAGS.model == 'LSTM':
    #     Model_type = tagger.LSTM_NER
    # elif FLAGS.model == 'BLSTM':
    #     Model_type = tagger.Bi_LSTM_NER
    # elif FLAGS.model == 'CNNBLSTM':
    #     Model_type = tagger.CNN_Bi_LSTM_NER
    # else:
    #     raise TypeError("Unknown model type %s" % FLAGS.model)

    model = Hybrid_LSTM_tagger(
        nb_words, FLAGS.emb_dim, emb_mat, FLAGS.feat_size,
        FLAGS.hidden_dim, FLAGS.nb_classes, FLAGS.max_len,
        FLAGS.fine_tuning, FLAGS.dropout, FLAGS.batch_size,
        len(template.template), FLAGS.window, FLAGS.l2_reg)

    pred_test, test_loss, test_acc = model.run(
        train_X, train_F, train_Y, train_lens,
        valid_X, valid_F, valid_Y, valid_lens,
        test_X, test_F, test_Y, test_lens,
        FLAGS)

    print "Test loss: %f, accuracy: %f" % (test_loss, test_acc)

    # pred_test = [pred_test[i][:test_lens[i]] for i in xrange(len(pred_test))]
    pred_test_label = convert_id_to_word(pred_test, idx2label)

    if FLAGS.eval_test:
        res_test, pred_test_label = evaluate(pred_test_label, test_labels)
        print "Test F1: %f, P: %f, R: %f" % (
            res_test['f1'], res_test['p'], res_test['r'])

    original_text = [[item['w'] for item in sent] for sent in test_corpus]
    write_prediction(FLAGS.output_dir + 'prediction.utf8',
                     original_text, pred_test_label)

    print "Saving feature dicts..."
    save_dicts(FLAGS.output_dir, FLAGS.feats2idx,
               FLAGS.words2idx, FLAGS.label2idx)
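

# ---------------------------------------------------------------------------
# Illustration (not part of the original script): the window flag above maps
# 1/3/5 to the context offsets (0, 0)/(-1, 1)/(-2, 2) handed to
# HybridTemplate. The hypothetical helper below sketches how such an offset
# pair expands a sentence of word ids into per-token context windows, using
# 0 as the id for positions outside the sentence; the real template code may
# pad or index differently.
def context_windows(word_ids, win, pad_id=0):
    """Return, for each position i, the ids inside [i + win[0], i + win[1]]."""
    left, right = win
    windows = []
    for i in range(len(word_ids)):
        row = []
        for off in range(left, right + 1):
            j = i + off
            row.append(word_ids[j] if 0 <= j < len(word_ids) else pad_id)
        windows.append(row)
    return windows

# Example with win = (-1, 1), i.e. FLAGS.window == 3:
# context_windows([4, 7, 9], (-1, 1)) -> [[0, 4, 7], [4, 7, 9], [7, 9, 0]]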
def main(_):
    np.random.seed(1337)
    random.seed(1337)

    if FLAGS.only_test or FLAGS.train_steps == 0:
        FLAGS.train_steps = 0
        test(FLAGS)
        return

    print "#" * 67
    print "# Loading data from:"
    print "#" * 67
    print "Train:", FLAGS.train_data
    print "Valid:", FLAGS.valid_data
    print "Test: ", FLAGS.test_data

    # Choose field templates & feature templates
    template = Template(FLAGS.template, prefix=False)

    # Pretreatment process: read, split and create vocabularies
    train_set, valid_set, test_set, dicts, max_len = pretreatment(
        FLAGS.train_data, FLAGS.valid_data, FLAGS.test_data,
        threshold=0, template=template)

    # Reset the maximum sentence length
    # max_len = max(MAX_LEN, max_len)
    FLAGS.max_len = max_len

    # Unfold the corpora
    train_corpus, train_lens = train_set
    valid_corpus, valid_lens = valid_set
    test_corpus, test_lens = test_set
    train_sentcs, train_featvs, train_labels = unfold_corpus(train_corpus)
    valid_sentcs, valid_featvs, valid_labels = unfold_corpus(valid_corpus)
    test_sentcs, test_featvs, test_labels = unfold_corpus(test_corpus)

    # Vocabularies
    feats2idx = dicts['feats2idx']
    words2idx = dicts['words2idx']
    label2idx = dicts['label2idx']
    FLAGS.label2idx = label2idx
    FLAGS.words2idx = words2idx
    FLAGS.feats2idx = feats2idx

    print "Lexical word size: %d" % len(feats2idx)
    print "Label size: %d" % len(label2idx)
    print "-------------------------------------------------------------------"
    print "Training data size: %d" % len(train_corpus)
    print "Validation data size: %d" % len(valid_corpus)
    print "Test data size: %d" % len(test_corpus)
    print "Maximum sentence len: %d" % FLAGS.max_len

    del train_corpus
    del valid_corpus
    # del test_corpus

    # The network's output_dim (the +1 accounts for the padding class)
    nb_classes = len(label2idx)
    FLAGS.nb_classes = nb_classes + 1

    # The embedding layer's input_dim (the +1 accounts for the padding index)
    nb_words = len(words2idx)
    FLAGS.nb_words = nb_words
    FLAGS.in_dim = FLAGS.nb_words + 1

    # Load pre-trained embeddings from file
    print "#" * 67
    print "# Reading embeddings from file: %s" % (FLAGS.emb_file)
    emb_mat, idx_map = read_emb_from_file(FLAGS.emb_file, feats2idx)
    FLAGS.emb_dim = max(emb_mat.shape[1], FLAGS.emb_dim)
    print "embeddings' size:", emb_mat.shape
    if FLAGS.fine_tuning:
        print "The embeddings will be fine-tuned!"

    idx2label = dict((k, v) for v, k in FLAGS.label2idx.iteritems())
    # idx2words = dict((k, v) for v, k in FLAGS.words2idx.iteritems())

    # Convert the corpora from string sequences to index sequences,
    # post-padded with 0 up to max_len
    print "Preparing training, validation and testing data."
    train_X, train_F, train_Y = conv_corpus(
        train_sentcs, train_featvs, train_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)
    valid_X, valid_F, valid_Y = conv_corpus(
        valid_sentcs, valid_featvs, valid_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)
    test_X, test_F, test_Y = conv_corpus(
        test_sentcs, test_featvs, test_labels,
        words2idx, feats2idx, label2idx, max_len=max_len)

    # Release memory; these models only consume the feature ids (F) and labels (Y)
    del train_sentcs, train_featvs, train_labels
    del valid_sentcs, valid_featvs, valid_labels
    # del test_sentcs, test_featvs, test_labels
    del train_X, valid_X, test_X

    print "#" * 67
    print "Training arguments"
    print "#" * 67
    print "L2 regular: %f" % FLAGS.l2_reg
    print "nb_classes: %d" % FLAGS.nb_classes
    print "Batch size: %d" % FLAGS.batch_size
    print "Hidden layer: %d" % FLAGS.hidden_dim
    print "Train epochs: %d" % FLAGS.train_steps
    print "Learning rate: %f" % FLAGS.lr

    print "#" * 67
    print "Training process start."
    print "#" * 67

    # Select the tagger architecture
    if FLAGS.model == 'LSTM':
        Model_type = tagger.LSTM_NER
    elif FLAGS.model == 'BLSTM':
        Model_type = tagger.Bi_LSTM_NER
    elif FLAGS.model == 'CNNBLSTM':
        Model_type = tagger.CNN_Bi_LSTM_NER
    else:
        raise TypeError("Unknown model type %s" % FLAGS.model)

    model = Model_type(
        nb_words, FLAGS.emb_dim, emb_mat, FLAGS.hidden_dim,
        FLAGS.nb_classes, FLAGS.dropout, FLAGS.batch_size,
        FLAGS.max_len, len(template.template), FLAGS.l2_reg,
        FLAGS.fine_tuning)

    pred_test, test_loss, test_acc = model.run(
        train_F, train_Y, train_lens,
        valid_F, valid_Y, valid_lens,
        test_F, test_Y, test_lens,
        FLAGS)

    print "Test loss: %f, accuracy: %f" % (test_loss, test_acc)

    # Truncate padded predictions back to the true sentence lengths
    pred_test = [pred_test[i][:test_lens[i]] for i in xrange(len(pred_test))]
    pred_test_label = convert_id_to_word(pred_test, idx2label)

    if FLAGS.eval_test:
        res_test, pred_test_label = evaluate(pred_test_label, test_labels)
        print "Test F1: %f, P: %f, R: %f" % (
            res_test['f1'], res_test['p'], res_test['r'])

    original_text = [[item['w'] for item in sent] for sent in test_corpus]
    write_prediction(FLAGS.output_dir + 'prediction.utf8',
                     original_text, pred_test_label)

    print "Saving feature dicts..."
    save_dicts(FLAGS.output_dir, FLAGS.feats2idx,
               FLAGS.words2idx, FLAGS.label2idx)