def get_embed_vocab(embed_file):
    """Build a vocabulary and embedding matrix from a text embedding file.

    Each usable line of ``embed_file`` is ``<token> <v1> <v2> ...`` separated
    by single spaces; lines with fewer than 10 space-separated fields (such
    as a header line) are ignored.  The file is read twice: once to collect
    the tokens and the vector width, once to fill the matrix.

    Args:
        embed_file: Path to the UTF-8 encoded embedding file.

    Returns:
        The populated ``Vocab`` with the normalised weight matrix stored in
        its ``embeddings`` attribute.
    """
    assert os.path.exists(embed_file)

    def _iter_entries():
        # Yield (token, vector fields) for every line that is long enough.
        with open(embed_file, 'r', encoding='utf-8') as handle:
            for raw in handle:
                fields = raw.strip().split(' ')
                if len(fields) >= 10:
                    yield fields[0], fields[1:]

    embed_vocab = Vocab(bos=None, eos=None)

    # First pass: register tokens; the first usable line fixes the width.
    vec_dim = 0
    for word, values in _iter_entries():
        embed_vocab.add(word)
        if vec_dim == 0:
            vec_dim = len(values)

    # Rows that never get overwritten below keep this small random init.
    low = -0.5 / vec_dim
    high = 0.5 / vec_dim
    matrix_shape = (len(embed_vocab), vec_dim)
    embed_weights = np.random.uniform(low, high, matrix_shape)

    # Second pass: copy every vector into the row of its token.
    for word, values in _iter_entries():
        row = embed_vocab.inst2idx(word)
        embed_weights[row] = np.asarray(values, dtype=np.float32)

    # Zero the padding row, then scale the whole matrix by its std.
    embed_weights[embed_vocab.pad_idx] = 0.
    embed_weights /= np.std(embed_weights)

    embed_vocab.embeddings = embed_weights
    return embed_vocab
def prepare(args):
    """Check the data files, create output directories and save the vocab.

    The vocabulary is built from the words of the training split, tokens
    seen fewer than twice are filtered out, embeddings are initialised
    randomly with ``args.embed_size`` dimensions, and the result is pickled
    to ``<args.vocab_dir>/vocab.data``.

    Args:
        args: Parsed command-line namespace providing the file lists,
            directory paths and dataset/embedding sizes read below.
    """
    log = logging.getLogger("brc")

    log.info('Checking the data files...')
    every_file = args.train_files + args.dev_files + args.test_files
    for path in every_file:
        assert os.path.exists(path), '{} file does not exist.'.format(path)

    log.info('Preparing the directories...')
    wanted_dirs = (args.vocab_dir, args.model_dir, args.result_dir,
                   args.summary_dir)
    for directory in wanted_dirs:
        if os.path.exists(directory):
            continue
        os.makedirs(directory)

    log.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.gpus, args.batch_size, args.train_files,
                          args.dev_files, args.test_files)
    vocab = Vocab(init_random=False, trainable_oov_cnt_threshold=2)
    for token in brc_data.word_iter('train'):
        vocab.add(token)

    size_before_filter = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = size_before_filter - vocab.size()
    summary = 'After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size())
    log.info(summary)

    log.info('Assigning embeddings...')
    # Pre-trained loading (vocab.build_embedding_matrix) is disabled here;
    # embeddings are randomly initialised instead.
    vocab.randomly_init_embeddings(args.embed_size)

    log.info('Saving vocab...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'wb') as fout:
        pickle.dump(vocab, fout)

    log.info('Done with preparing!')
def pdtb_prepare(args):
    """Load the PDTB splits, build word/tag vocabularies and save them.

    Writes ``word_vocab.obj``, ``tag_vocab.obj`` and ``dataset.obj`` into
    ``PathConfig.experiment_data_dir`` (created when missing).

    Args:
        args: Namespace whose ``task`` string selects the label level:
            level 2 when it starts with ``'fine'``, level 1 otherwise.
    """

    def _section_dirs(section_numbers):
        # Section directories are named with zero-padded two-digit numbers.
        return [
            os.path.join(PathConfig.json_data_dir, '{:02}'.format(number))
            for number in section_numbers
        ]

    print('Loading dataset...')
    train_sections = _section_dirs(PathConfig.train_sections)
    dev_sections = _section_dirs(PathConfig.dev_sections)
    test_sections = _section_dirs(PathConfig.test_sections)

    if args.task.startswith('fine'):
        label_level = 2
    else:
        label_level = 1
    dataset = PDTBDataSet(train_sections, dev_sections, test_sections,
                          level=label_level)
    split_sizes = (len(dataset.train_set), len(dataset.dev_set),
                   len(dataset.test_set))
    print('Size of train: {}, dev: {}, test: {}'.format(*split_sizes))

    print('Creating word vocab...')
    if not os.path.exists(PathConfig.experiment_data_dir):
        os.mkdir(PathConfig.experiment_data_dir)

    # Word vocabulary: special symbols first, then every dataset word.
    special_words = [PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD]
    word_vocab = Vocab(mannual_add=special_words)
    for token in dataset.get_all_words():
        word_vocab.add(token)
    word_vocab.load_pretrained_emb(PathConfig.embedding_path)
    print('Size of word vocab: {}'.format(word_vocab.size()))
    word_vocab_file = os.path.join(PathConfig.experiment_data_dir,
                                   'word_vocab.obj')
    torch.save(word_vocab, word_vocab_file)

    # Tag vocabulary with freshly initialised embeddings.
    tag_vocab = Vocab()
    for label in dataset.get_all_tags():
        tag_vocab.add(label)
    print('Size of tag vocab: {}'.format(tag_vocab.size()))
    tag_vocab.init_embed(ModelConfig.tag_embed_dim)
    tag_vocab_file = os.path.join(PathConfig.experiment_data_dir,
                                  'tag_vocab.obj')
    torch.save(tag_vocab, tag_vocab_file)

    print('Formatting the dataset to torch variables...')
    dataset.format_instances_to_torch_var(word_vocab, tag_vocab)
    dataset_file = os.path.join(PathConfig.experiment_data_dir, 'dataset.obj')
    torch.save(dataset, dataset_file)
def prepare_data():
    """Load the PDTB splits, save them, then build and save the vocabulary.

    Reads its configuration from the module-level ``paths`` and ``args``
    objects.  The train split is loaded with ``multiple_labels=False``; the
    dev and test splits with ``multiple_labels=True``.  A label mismatch
    between the splits is only reported on stdout, it does not abort.
    """

    def _section_dirs(section_numbers):
        # Section directories are named with zero-padded two-digit numbers.
        return [
            os.path.join(paths.json_data_dir, '{:02}'.format(number))
            for number in section_numbers
        ]

    # Load the dataset.
    train_sections = _section_dirs(paths.train_sections)
    dev_sections = _section_dirs(paths.dev_sections)
    test_sections = _section_dirs(paths.test_sections)
    train_dataset = PDTBDataSet(train_sections,
                                tree_type=args.tree_type,
                                level=args.level,
                                multiple_labels=False)
    dev_dataset = PDTBDataSet(dev_sections,
                              tree_type=args.tree_type,
                              level=args.level,
                              multiple_labels=True)
    test_dataset = PDTBDataSet(test_sections,
                               tree_type=args.tree_type,
                               level=args.level,
                               multiple_labels=True)

    # Report (but tolerate) differing label sets across the splits.
    if (not train_dataset.consistent_with(dev_dataset)
            or not dev_dataset.consistent_with(test_dataset)):
        print('Dataset labels are not consistent.')
        print('Train: {}'.format(sorted(train_dataset.label_map.keys())))
        print('Dev: {}'.format(sorted(dev_dataset.label_map.keys())))
        print('Test: {}'.format(sorted(test_dataset.label_map.keys())))

    size_report = 'Size of train set: {}, dev set: {}, test set: {}'.format(
        len(train_dataset), len(dev_dataset), len(test_dataset))
    print(size_report)

    # Save the dataset.
    torch.save(train_dataset,
               os.path.join(paths.experiment_data_dir, 'train.data'))
    torch.save(dev_dataset,
               os.path.join(paths.experiment_data_dir, 'dev.data'))
    torch.save(test_dataset,
               os.path.join(paths.experiment_data_dir, 'test.data'))

    # Build the vocab from the words of all three splits.
    special_words = [PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD]
    vocab = Vocab(mannual_add=special_words)
    train_words = train_dataset.get_all_words()
    dev_words = dev_dataset.get_all_words()
    test_words = test_dataset.get_all_words()
    all_words = train_words + dev_words + test_words
    for token in all_words:
        vocab.add(token)

    # Load and initialize the embeddings.
    vocab.load_pretrained_emb(paths.embedding_path)
    print('Size of PDTB vocabulary: {}'.format(vocab.size()))

    # Save the vocab.
    torch.save(vocab, paths.vocab_path)
def create_vocab(data_path):
    """Build word and label vocabularies from a ``label ||| text`` file.

    Every non-blank line of ``data_path`` must contain exactly one ``|||``
    separator: the part before it is the label, the part after it is a
    space-separated sentence.  Blank lines (for instance a trailing empty
    line) are skipped instead of aborting the whole load.

    Args:
        data_path: Path to the UTF-8 encoded data file.

    Returns:
        A ``MultiVocab`` holding the ``'word'`` and ``'label'`` vocabularies.

    Raises:
        AssertionError: If ``data_path`` does not exist.
        ValueError: If a non-blank line does not split into exactly two
            fields on ``|||``; the message names the offending line.
    """
    wd_vocab = Vocab(min_count=3, bos=None, eos=None)
    lbl_vocab = Vocab(pad=None, unk=None, bos=None, eos=None)
    assert os.path.exists(data_path), \
        '{} does not exist.'.format(data_path)

    with open(data_path, 'r', encoding='utf-8') as fin:
        for line_no, raw_line in enumerate(fin, start=1):
            record = raw_line.strip()
            if not record:
                # An empty line carries no example; unpacking it would fail.
                continue

            fields = record.split('|||')
            if len(fields) != 2:
                # Same exception type as the former tuple-unpacking failure,
                # but with enough context to locate the broken record.
                raise ValueError(
                    '{}:{}: expected "label|||text", got {} field(s)'.format(
                        data_path, line_no, len(fields)))

            lbl, data_item = fields
            wds = data_item.strip().split(' ')
            wd_vocab.add(wds)
            lbl_vocab.add(lbl.strip())

    return MultiVocab({'word': wd_vocab, 'label': lbl_vocab})
def create_vocab(datasets, embed_file=None, bert_vocab_path=None, min_count=2):
    """Build word, char, POS-tag and NER vocabularies from the datasets.

    Args:
        datasets: Iterable of instance collections; every instance exposes
            ``word``, ``pos_tag`` and ``ner_tag``.
        embed_file: Passed to ``load_embeddings`` of the word vocabulary.
        bert_vocab_path: When not ``None``, a ``BERTVocab`` is built from it.
        min_count: First positional argument of the word ``Vocab``.

    Returns:
        A ``MultiVocab`` with the keys ``word``, ``char``, ``tag``, ``ner``
        and ``bert`` (the last one is ``None`` without ``bert_vocab_path``).
    """
    wd_vocab = Vocab(min_count, bos=None, eos=None)
    char_vocab = Vocab(bos=None, eos=None)
    tag_vocab = Vocab(bos=None, eos=None)
    ner_vocab = Vocab(bos=None, eos=None)

    every_instance = (item for group in datasets for item in group)
    for item in every_instance:
        wd_vocab.add(item.word)
        char_vocab.add(list(item.word))
        tag_vocab.add(item.pos_tag)

        label = item.ner_tag
        if label == 'O':
            continue
        # Keep only the entity type after the dash
        # (including PER ORG LOC MISC and UNK).
        entity_type = label.split('-')[1]
        ner_vocab.add(entity_type)

    embed_count = wd_vocab.load_embeddings(embed_file)
    print("%d word pre-trained embeddings loaded..." % embed_count)

    if bert_vocab_path is not None:
        bert_vocab = BERTVocab(bert_vocab_path)
    else:
        bert_vocab = None

    vocab_table = {
        'word': wd_vocab,
        'char': char_vocab,
        'tag': tag_vocab,
        'ner': ner_vocab,
        'bert': bert_vocab,
    }
    return MultiVocab(vocab_table)
def create_vocab(data_path, min_count=3):
    """Build word, char, POS-tag and relation vocabularies from a treebank.

    Dependencies are read with ``read_deps``.  The relation of the first
    token whose head is 0 is remembered as the root relation; any later
    head-0 token carrying a different relation is reported on stdout and
    its relation is not added to the relation vocabulary.

    Args:
        data_path: Path to the UTF-8 encoded dependency file.
        min_count: First positional argument of the word and char ``Vocab``.

    Returns:
        A ``MultiVocab`` with the keys ``word``, ``char``, ``tag``, ``rel``.
    """
    wd_vocab = Vocab(min_count, eos=None)
    char_vocab = Vocab(min_count, eos=None)
    tag_vocab = Vocab(eos=None)
    rel_vocab = Vocab(bos=None, eos=None)

    root_rel = None
    with open(data_path, 'r', encoding='utf-8') as fr:
        for sentence in read_deps(fr):
            for node in sentence:
                wd_vocab.add(node.form)
                char_vocab.add(list(node.form))
                tag_vocab.add(node.pos_tag)

                # Ordinary (non-root) arc: always record its relation.
                if node.head != 0:
                    rel_vocab.add(node.dep_rel)
                    continue

                # First root arc seen: remember and record its relation.
                if root_rel is None:
                    root_rel = node.dep_rel
                    rel_vocab.add(node.dep_rel)
                    continue

                # Later root arc with a different relation: report only.
                if root_rel != node.dep_rel:
                    print('root = ' + root_rel + ', rel for root = ' +
                          node.dep_rel)

    vocab_table = {
        'word': wd_vocab,
        'char': char_vocab,
        'tag': tag_vocab,
        'rel': rel_vocab,
    }
    return MultiVocab(vocab_table)
def main():
    """Command-line entry point for training with an auxiliary corpus.

    Parses the command-line options, builds or loads the input, character
    and two output vocabularies, reads the primary train/dev/test corpora
    and the auxiliary train corpus with ``readCoNLL``, and trains a
    ``BiCNNLSTMTranstion`` network with SGD, interleaving one primary and
    one auxiliary sentence per step.  After every epoch the dev loss decides
    whether the model is saved (and the test set scored) or the previous
    checkpoint is reloaded with a reduced learning rate.

    NOTE(review): the block nesting below was reconstructed from a source
    whose indentation was lost; spots where more than one nesting is
    plausible are marked and should be confirmed against the original file.
    """
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional LSTM-CNN')
    parser.add_argument('--num_epochs', type=int, default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=1,
                        help='Number of sentences in each batch')
    parser.add_argument('--hidden_size', type=int, default=200,
                        help='Number of hidden units in RNN')
    parser.add_argument('--num_filters', type=int, default=35,
                        help='Number of filters in CNN')
    # The help text of the two width options repeats the --num_filters text;
    # the values are reported further down as minimum/maximum ngrams.
    parser.add_argument('--min_filter_width', type=int, default=3,
                        help='Number of filters in CNN')
    parser.add_argument('--max_filter_width', type=int, default=7,
                        help='Number of filters in CNN')
    parser.add_argument('--embedDimension', type=int, default=300,
                        help='embedding dimension')
    parser.add_argument('--learning_rate', type=float, default=0.4,
                        help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--gamma', type=float, default=0.0,
                        help='weight for regularization')
    parser.add_argument('--schedule', type=int, default=1,
                        help='schedule for learning rate decay')
    parser.add_argument('--embedding_vectors', help='path for embedding dict')
    parser.add_argument('--train')
    parser.add_argument('--trainAux')
    parser.add_argument('--dev')
    parser.add_argument('--test')
    parser.add_argument('--ner_tag_field_l1', type=int, default=1,
                        help='ner tag field')
    parser.add_argument('--ner_tag_field_l2', type=int, default=1,
                        help='ner tag field')
    parser.add_argument('--use_gpu', type=int, default=0, help='use gpu')
    parser.add_argument('--save_dir')
    parser.add_argument('--vocabChar')
    parser.add_argument('--vocabOutput')
    parser.add_argument('--vocabOutputAux')
    parser.add_argument('--vocabInput')
    args = parser.parse_args()

    # Copy the options into locals.
    use_gpu = args.use_gpu
    train_path = args.train
    train_path_aux = args.trainAux
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    min_filter_width = args.min_filter_width
    max_filter_width = args.max_filter_width
    learning_rate = args.learning_rate
    # Momentum is derived from the initial learning rate and never updated.
    momentum = 0.01 * learning_rate
    # decay_rate and schedule are only echoed in the per-epoch header below;
    # the learning-rate reduction itself uses a fixed factor.
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule
    save_dir = args.save_dir
    # Create the output folder if it does not exist.
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    embedding_path = args.embedding_vectors

    inputVocabulary = Vocab()
    charVocabulary = CharVocab()
    targetVocabulary = Vocab()
    targetVocabularyAux = Vocab()

    # Restore previously saved vocabularies: the JSON file replaces the
    # object's whole attribute dict, then the vocabulary is frozen.
    if args.vocabChar:
        with open(args.vocabChar, "r") as f:
            charVocabulary.__dict__ = json.load(f)
        charVocabulary.set_freeze()
        charVocabulary.process()
    if args.vocabOutput:
        with open(args.vocabOutput, "r") as f:
            targetVocabulary.__dict__ = json.load(f)
        targetVocabulary.set_freeze()
        targetVocabulary.process()
    if args.vocabOutputAux:
        with open(args.vocabOutputAux, "r") as f:
            targetVocabularyAux.__dict__ = json.load(f)
        targetVocabularyAux.set_freeze()
        targetVocabularyAux.process()

    # The input vocabulary comes from the pre-trained embeddings when given,
    # otherwise from --vocabInput; with neither the program stops.
    embedding_vocab = None
    if args.embedding_vectors:
        print(args.embedding_vectors)
        embedd_dict, embedding_vocab, reverse_word_vocab, vocabularySize, embeddingDimension = load_embeddings(
            embedding_path)
        print("Read Word Embedding of dimension " + str(embeddingDimension) +
              " for " + str(vocabularySize) + " number of words")
        for everyWord in embedding_vocab:
            inputVocabulary.add(everyWord)
        inputVocabulary.set_freeze()
        inputVocabulary.process()
    else:
        if args.vocabInput:
            with open(args.vocabInput, "r") as f:
                inputVocabulary.__dict__ = json.load(f)
            inputVocabulary.set_freeze()
            inputVocabulary.process()
        else:
            print(
                "Neither pre-trained word embeddings nor input vocabulary is specified"
            )
            exit()

    # An empty character vocabulary is seeded with the two boundary markers.
    if charVocabulary.__is_empty__():
        charVocabulary.add("<S>")
        charVocabulary.add("</S>")

    # Read the corpora.  embedding_vocab is None when no embedding file was
    # given.  The primary splits use the tag column --ner_tag_field_l1 and
    # targetVocabulary; the auxiliary train split uses --ner_tag_field_l2
    # and targetVocabularyAux.
    trainCorpus, trainLabelsRaw, maxTrainLength = readCoNLL(
        train_path, charVocabulary, targetVocabulary, args.ner_tag_field_l1,
        embedding_vocab)
    print("Train Corpus contains " + str(len(trainCorpus)) +
          " sentences and maximum sentence length is " + str(maxTrainLength))
    # Despite the names, these are plain aliases; nothing is sorted here.
    trainCorpusRawSorted = trainCorpus
    trainLabelsRawSorted = trainLabelsRaw
    trainCorpusAux, trainLabelsRawAux, maxTrainLengthAux = readCoNLL(
        train_path_aux, charVocabulary, targetVocabularyAux,
        args.ner_tag_field_l2, embedding_vocab)
    print("Auxiliary Train Corpus contains " + str(len(trainCorpusAux)) +
          " sentences and maximum sentence length is " +
          str(maxTrainLengthAux))
    trainCorpusRawSortedAux = trainCorpusAux
    trainLabelsRawSortedAux = trainLabelsRawAux
    devCorpus, devLabelsRaw, maxDevLength = readCoNLL(dev_path, charVocabulary,
                                                      targetVocabulary,
                                                      args.ner_tag_field_l1,
                                                      embedding_vocab)
    print("Dev Corpus contains " + str(len(devCorpus)) +
          " sentences and maximum sentence length is " + str(maxDevLength))
    testCorpus, testLabelsRaw, maxTestLength = readCoNLL(
        test_path, charVocabulary, targetVocabulary, args.ner_tag_field_l1,
        embedding_vocab)
    print("Test Corpus contains " + str(len(testCorpus)) +
          " sentences and maximum sentence length is " + str(maxTestLength))

    # Vocabularies that were not loaded from disk (still unfrozen) are
    # written to save_dir as JSON and frozen.
    if not targetVocabulary.get_freeze():
        print(targetVocabulary._tok_to_ind)
        tmp_filename = '%s/output.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(targetVocabulary.__dict__, f)
        targetVocabulary.set_freeze()
    if not targetVocabularyAux.get_freeze():
        print(targetVocabularyAux._tok_to_ind)
        tmp_filename = '%s/output_aux.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(targetVocabularyAux.__dict__, f)
        targetVocabularyAux.set_freeze()
    if not charVocabulary.get_freeze():
        tmp_filename = '%s/char.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(charVocabulary.__dict__, f)
        charVocabulary.set_freeze()

    # Word embeddings start uniformly random in [-0.1, 0.1); rows of words
    # found in the pre-trained file are then overwritten.  The width comes
    # from --embedDimension, replacing the value reported by load_embeddings.
    embeddingDimension = args.embedDimension
    word_embedding = np.random.uniform(
        -0.1, 0.1, (inputVocabulary.__len__(), embeddingDimension))
    if args.embedding_vectors:
        for everyWord in inputVocabulary._tok_to_ind:
            if everyWord in embedding_vocab:
                word_embedding[inputVocabulary.__get_word__(
                    everyWord)] = embedd_dict[embedding_vocab[everyWord]]
        # NOTE(review): the save and the `del` statements are placed inside
        # this branch because embedd_dict, reverse_word_vocab and
        # vocabularySize only exist when an embedding file was loaded —
        # confirm against the original indentation.
        tmp_filename = '%s/input.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(inputVocabulary.__dict__, f)
        inputVocabulary.set_freeze()
        del embedd_dict
        del reverse_word_vocab
        del vocabularySize
        del embedding_vocab

    # Echo the vocabulary sizes and the hyper-parameters.
    print("Read " + str(targetVocabulary.__len__()) +
          " number of target words")
    print("Read " + str(targetVocabularyAux.__len__()) +
          " number of target words")
    print("Read " + str(inputVocabulary.__len__()) + " number of input words")
    print("Read " + str(charVocabulary.__len__()) + " number of characters")
    print("Number of epochs = " + str(num_epochs))
    print("Mini-Batch size = " + str(batch_size))
    print("Bi-LSTM Hidden size = " + str(hidden_size))
    print("Features per CNN filter = " + str(num_filters))
    print("Minimum ngrams for CNN filter = " + str(min_filter_width))
    print("Maximum ngrams for CNN filter = " + str(max_filter_width))
    print("Initial Learning Rate = " + str(learning_rate))

    network = BiCNNLSTMTranstion(inputVocabulary.__len__(), embeddingDimension,
                                 min_filter_width, max_filter_width,
                                 charVocabulary.__len__(), num_filters,
                                 hidden_size, targetVocabulary.__len__(),
                                 word_embedding, targetVocabularyAux.__len__())
    lr = learning_rate
    # lr_aux is computed but not used anywhere below.
    lr_aux = learning_rate * 0.1
    optim = SGD(network.parameters(),
                lr=lr,
                momentum=momentum,
                weight_decay=gamma,
                nesterov=True)
    # Only used for the progress estimate and the per-epoch train report.
    num_batches = len(trainCorpus) / batch_size + 1
    # The following score holders are initialised but never updated below.
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    print("Training....")
    if use_gpu == 1:
        network.cuda()
    # Lowest dev loss accepted so far; the large start value means the first
    # epoch is accepted unless its dev loss exceeds it.
    prev_error = 100000.0
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d ( learning rate=%.4f, decay rate=%.4f (schedule=%d)): '
            % (epoch, lr, decay_rate, schedule))
        train_err = 0.
        train_corr = 0.
        train_total = 0.
        start_time = time.time()
        num_back = 0
        network.train()
        count = 0
        count_batch = 0
        l1_indices = list(range(len(trainCorpusRawSorted)))
        l2_indices = list(range(len(trainCorpusRawSortedAux)))
        with tqdm(total=(len(trainCorpusRawSorted) +
                         len(trainCorpusRawSortedAux))) as pbar:
            # Walk both corpora in lockstep, one sentence at a time (the
            # --batch_size option is not used to form batches here);
            # zip_longest yields None for the shorter corpus once exhausted.
            for l1, l2 in zip_longest(l1_indices, l2_indices):
                if l1 is not None:
                    # Primary step: the literal 0 passed to network.loss
                    # presumably selects the primary output — confirm
                    # against BiCNNLSTMTranstion.loss.
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        [trainCorpusRawSorted[l1]], [trainLabelsRawSorted[l1]],
                        inputVocabulary, targetVocabulary, charVocabulary,
                        max_filter_width, args.use_gpu)
                    optim.zero_grad()
                    loss, _ = network.loss(x_input, batch_length,
                                           current_batch_size,
                                           current_max_sequence_length,
                                           y_output, mask, y_prev, 0, use_gpu)
                    loss.backward()
                    optim.step()
                    train_err += loss.item()
                    train_total += batch_length.data.sum()
                    count = count + current_batch_size
                    count_batch = count_batch + 1
                if l2 is not None:
                    # Auxiliary step (literal 1, auxiliary target
                    # vocabulary); its loss is not added to train_err.
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        [trainCorpusRawSortedAux[l2]],
                        [trainLabelsRawSortedAux[l2]], inputVocabulary,
                        targetVocabularyAux, charVocabulary, max_filter_width,
                        args.use_gpu)
                    optim.zero_grad()
                    loss, _ = network.loss(x_input, batch_length,
                                           current_batch_size,
                                           current_max_sequence_length,
                                           y_output, mask, y_prev, 1, use_gpu)
                    loss.backward()
                    optim.step()
                # NOTE(review): placed at loop level; `count` only grows in
                # the primary branch, so this division assumes the primary
                # corpus is not empty — confirm nesting and intent.
                time_ave = (time.time() - start_time) / count
                time_left = (num_batches - count_batch) * time_ave
                pbar.update(2)
        print('train: %d loss: %.4f, time: %.2fs' %
              (num_batches, train_err / count, time.time() - start_time))

        # Score the dev set: accumulate the loss and write one
        # "token gold predicted" line per token, with a blank line after
        # each sentence, to <save_dir>/_dev<epoch>.
        network.eval()
        tmp_filename = '%s/_dev%d' % (save_dir, epoch)
        current_epoch_loss = 0.0
        with codecs.open(tmp_filename, "w", encoding="utf8",
                         errors="ignore") as writer:
            for inputs, labels in batch(devCorpus, devLabelsRaw, 1):
                x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                    inputs, labels, inputVocabulary, targetVocabulary,
                    charVocabulary, max_filter_width, args.use_gpu)
                loss, _ = network.loss(x_input, batch_length,
                                       current_batch_size,
                                       current_max_sequence_length, y_output,
                                       mask, y_prev, 0, use_gpu)
                current_epoch_loss = current_epoch_loss + loss.item()
                loss, preds = network.forward(x_input, batch_length,
                                              current_batch_size,
                                              current_max_sequence_length,
                                              y_output, mask, y_prev, 0,
                                              use_gpu)
                count = 0
                for i in range(len(inputs)):
                    for j in range(len(inputs[i])):
                        writer.write(inputs[i][j] + " " + labels[i][j] + " " +
                                     targetVocabulary.__get_index__(
                                         preds[i][j].item()).upper())
                        writer.write("\n")
                    writer.write("\n")
            # Explicit close in addition to the context manager.
            writer.close()
        acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
        print(
            'dev loss: %.2f, dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
            % (current_epoch_loss, acc, precision, recall, f1))

        # NOTE(review): the `else` is paired with the dev-loss comparison
        # (worse -> roll back, otherwise -> checkpoint); the collapsed
        # source would also allow pairing it with `if lr < 0.002` — confirm.
        if current_epoch_loss > prev_error:
            # Dev loss got worse: shrink the learning rate by the fixed
            # factor 0.7, rebuild the optimiser and reload the last
            # checkpoint from <save_dir>/model.
            lr = lr * 0.7
            optim = SGD(network.parameters(),
                        lr=lr,
                        momentum=momentum,
                        weight_decay=gamma,
                        nesterov=True)
            network.load_state_dict(torch.load(save_dir + "/model"))
            network.eval()
            if lr < 0.002:
                # Learning rate exhausted: score the test set once with the
                # reloaded checkpoint and terminate the program.
                network.eval()
                tmp_filename = '%s/_test%d' % (save_dir, epoch)
                with codecs.open(tmp_filename,
                                 "w",
                                 encoding="utf8",
                                 errors="ignore") as writer:
                    for inputs, labels in batch(testCorpus, testLabelsRaw, 1):
                        x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                            inputs, labels, inputVocabulary, targetVocabulary,
                            charVocabulary, max_filter_width, args.use_gpu)
                        loss, preds = network.forward(
                            x_input, batch_length, current_batch_size,
                            current_max_sequence_length, y_output, mask,
                            y_prev, 0, use_gpu)
                        count = 0
                        for i in range(len(inputs)):
                            for j in range(len(inputs[i])):
                                writer.write(inputs[i][j] + " " +
                                             labels[i][j] + " " +
                                             targetVocabulary.__get_index__(
                                                 preds[i][j].item()).upper())
                                writer.write("\n")
                            writer.write("\n")
                    writer.close()
                acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
                print(
                    'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                    % (acc, precision, recall, f1))
                exit()
        else:
            # Dev loss did not get worse: remember it, checkpoint the model
            # and score the test set into <save_dir>/_test<epoch>.
            prev_error = current_epoch_loss
            torch.save(network.state_dict(), save_dir + "/model")
            network.eval()
            tmp_filename = '%s/_test%d' % (save_dir, epoch)
            with codecs.open(tmp_filename, "w", encoding="utf8",
                             errors="ignore") as writer:
                for inputs, labels in batch(testCorpus, testLabelsRaw, 1):
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        inputs, labels, inputVocabulary, targetVocabulary,
                        charVocabulary, max_filter_width, args.use_gpu)
                    loss, preds = network.forward(x_input, batch_length,
                                                  current_batch_size,
                                                  current_max_sequence_length,
                                                  y_output, mask, y_prev, 0,
                                                  use_gpu)
                    count = 0
                    for i in range(len(inputs)):
                        for j in range(len(inputs[i])):
                            writer.write(inputs[i][j] + " " + labels[i][j] +
                                         " " + targetVocabulary.__get_index__(
                                             preds[i][j].item()).upper())
                            writer.write("\n")
                        writer.write("\n")
                writer.close()
            acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
            print(
                'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                % (acc, precision, recall, f1))
            # NOTE(review): this second save may instead belong at loop or
            # function level in the original file — confirm.
            torch.save(network.state_dict(), save_dir + "/model")
def main(): parser = argparse.ArgumentParser( description='Training a Sequence Labeler with bi-directional LSTM-CNN') parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs') parser.add_argument('--batch_size', type=int, default=5, help='Number of sentences in each batch') parser.add_argument('--hidden_size', type=int, default=200, help='Number of hidden units in RNN') parser.add_argument('--num_filters', type=int, default=35, help='Number of filters in CNN') parser.add_argument('--min_filter_width', type=int, default=3, help='Number of filters in CNN') parser.add_argument('--max_filter_width', type=int, default=7, help='Number of filters in CNN') parser.add_argument('--embedDimension', type=int, default=300, help='embedding dimension') parser.add_argument('--learning_rate', type=float, default=0.4, help='Learning rate') parser.add_argument('--gamma', type=float, default=0.0, help='weight for regularization') parser.add_argument('--embedding_vectors', help='path for embedding dict') parser.add_argument('--embedding_dict_new', help='path for embedding dict') parser.add_argument('--train') parser.add_argument('--dev') parser.add_argument('--test') parser.add_argument('--vocabChar') parser.add_argument('--vocabOutput') parser.add_argument('--vocabInput') parser.add_argument('--ner_tag_field', type=int, default=1, help='ner tag field') parser.add_argument('--use_gpu', type=int, default=1, help='use gpu') parser.add_argument('--fineTune', type=bool, default=False, help='fineTune pretrained word embeddings') parser.add_argument('--save-dir') parser.add_argument('--perform_evaluation', type=bool, default=False, help='perform evaluation only') parser.add_argument('--deploy', type=bool, default=False, help='deploy') parser.add_argument('--train_from', type=str, default="") args = parser.parse_args() train_path = args.train dev_path = args.dev test_path = args.test num_epochs = args.num_epochs batch_size = args.batch_size hidden_size = 
args.hidden_size num_filters = args.num_filters min_filter_width = args.min_filter_width max_filter_width = args.max_filter_width learning_rate = args.learning_rate momentum = 0.01 * learning_rate gamma = args.gamma embedding_path = args.embedding_vectors save_dir = args.save_dir # create the output folder if does not exist if not os.path.exists(save_dir): os.makedirs(save_dir) evaluation = args.perform_evaluation inputVocabulary = Vocab() charVocabulary = CharVocab() targetVocabulary = Vocab() # Read Character vocabulary if vocabChar argument is given if args.vocabChar: with open(args.vocabChar, "r") as f: charVocabulary.__dict__ = json.load(f) charVocabulary.set_freeze() charVocabulary.process() # Read Output vocabulary if vocabChar argument is given if args.vocabOutput: with open(args.vocabOutput, "r") as f: targetVocabulary.__dict__ = json.load(f) targetVocabulary.set_freeze() targetVocabulary.process() embedding_vocab = None # If the path to pre-trained embeddings are given if args.embedding_vectors: print(args.embedding_vectors) # load the pre-trained embeddings embedd_dict, embedding_vocab, reverse_word_vocab, vocabularySize, embeddingDimension = load_embeddings( embedding_path) print("Read Word Embedding of dimension " + str(embeddingDimension) + " for " + str(vocabularySize) + " number of words") # add the words to the Input vocabulary which is a dictionary of words for everyWord in embedding_vocab: inputVocabulary.add(everyWord) inputVocabulary.set_freeze() inputVocabulary.process() else: # Read Input vocabulary if vocabChar argument is given if args.vocabInput: with open(args.vocabInput, "r") as f: inputVocabulary.__dict__ = json.load(f) inputVocabulary.set_freeze() inputVocabulary.process() else: print( "Neither pre-trained word embeddings nor input vocabulary is specified" ) exit() # if character vocabulary is initialize with beginning and end markers if charVocabulary.__is_empty__(): charVocabulary.add("<S>") charVocabulary.add("</S>") if evaluation: 
# if we are performing evaluation, we do not require train and dev splits if not args.deploy: # if we are not deploying the model, then we are interesting in testing the model testCorpus, testLabelsRaw, maxTestLength = readCoNLL( test_path, charVocabulary, targetVocabulary, args.ner_tag_field, inputVocabulary) print("Test Corpus contains " + str(len(testCorpus)) + " sentences and maximum sentence length is " + str(maxTestLength)) print("Read " + str(len(charVocabulary)) + " number of characters") else: # if we are deploying the model, we are trying to get predictions on a plain corpus testCorpus, maxTestLength = readUnlabeledData(test_path) print("Test Corpus contains " + str(len(testCorpus)) + " sentences and maximum sentence length is " + str(maxTestLength)) else: # read train split trainCorpus, trainLabelsRaw, maxTrainLength = readCoNLL( train_path, charVocabulary, targetVocabulary, args.ner_tag_field, embedding_vocab) print("Train Corpus contains " + str(len(trainCorpus)) + " sentences and maximum sentence length is " + str(maxTrainLength)) trainCorpusRawSorted = trainCorpus trainLabelsRawSorted = trainLabelsRaw # read dev split devCorpus, devLabelsRaw, maxDevLength = readCoNLL( dev_path, charVocabulary, targetVocabulary, args.ner_tag_field, embedding_vocab) print("Dev Corpus contains " + str(len(devCorpus)) + " sentences and maximum sentence length is " + str(maxDevLength)) # read test split testCorpus, testLabelsRaw, maxTestLength = readCoNLL( test_path, charVocabulary, targetVocabulary, args.ner_tag_field, embedding_vocab) print("Test Corpus contains " + str(len(testCorpus)) + " sentences and maximum sentence length is " + str(maxTestLength)) if not targetVocabulary.get_freeze(): # save the output vocabulary print(targetVocabulary._tok_to_ind) tmp_filename = '%s/output.vocab' % (save_dir) with open(tmp_filename, "w") as f: json.dump(targetVocabulary.__dict__, f) targetVocabulary.set_freeze() if not charVocabulary.get_freeze(): # save the character vocabulary 
tmp_filename = '%s/char.vocab' % (save_dir) with open(tmp_filename, "w") as f: json.dump(charVocabulary.__dict__, f) charVocabulary.set_freeze() # initialize word embeddings randomly embeddingDimension = args.embedDimension word_embedding = np.random.uniform( -0.1, 0.1, (inputVocabulary.__len__(), embeddingDimension)) if args.embedding_vectors: # pre-trained word embeddings are given, update the word_embedding variable with pre-trained embeddings for everyWord in inputVocabulary._tok_to_ind: if everyWord in embedding_vocab: word_embedding[inputVocabulary.__get_word__( everyWord)] = embedd_dict[embedding_vocab[everyWord]] # save the input vocabulary tmp_filename = '%s/input.vocab' % (save_dir) with open(tmp_filename, "w") as f: json.dump(inputVocabulary.__dict__, f) inputVocabulary.set_freeze() del embedd_dict del reverse_word_vocab del vocabularySize del embedding_vocab print("Read " + str(targetVocabulary.__len__()) + " number of target words") print("Read " + str(inputVocabulary.__len__()) + " number of input words") print("Read " + str(charVocabulary.__len__()) + " number of characters") print("Number of epochs = " + str(num_epochs)) print("Mini-Batch size = " + str(batch_size)) print("Bi-LSTM Hidden size = " + str(hidden_size)) print("Features per CNN filter = " + str(num_filters)) print("Minimum ngrams for CNN filter = " + str(min_filter_width)) print("Maximum ngrams for CNN filter = " + str(max_filter_width)) print("Initial Learning Rate = " + str(learning_rate)) use_gpu = args.use_gpu # create a Bi-LSTM network network = BiCNNLSTMTranstion(inputVocabulary.__len__(), embeddingDimension, min_filter_width, max_filter_width, charVocabulary.__len__(), num_filters, hidden_size, targetVocabulary.__len__(), word_embedding, args.fineTune) print(network) lr = learning_rate optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True) if not evaluation: num_batches = len(trainCorpus) / batch_size + 1 dev_f1 = 0.0 dev_acc = 0.0 
dev_precision = 0.0 dev_recall = 0.0 test_f1 = 0.0 test_acc = 0.0 test_precision = 0.0 test_recall = 0.0 best_epoch = 0 if evaluation: # if we are performing evaluation, load the trained model network.load_state_dict(torch.load(save_dir + "/model")) # save output vocabulary as a plain file tmp_filename = '%s/output.vocab.plain' % (save_dir) with open(tmp_filename, "w") as f: for index in range(len(targetVocabulary._ind_to_tok)): f.write(targetVocabulary._ind_to_tok[index]) f.write("\n") f.close() # save input vocabulary as a plain file tmp_filename = '%s/input.vocab.plain' % (save_dir) with open(tmp_filename, "w") as f: for index in range(len(inputVocabulary._ind_to_tok)): f.write(inputVocabulary._ind_to_tok[index]) f.write("\n") f.close() # save character vocabulary as a plain file tmp_filename = '%s/char.vocab.plain' % (save_dir) with open(tmp_filename, "w") as f: for index in range(len(charVocabulary._ind_to_tok)): f.write(charVocabulary._ind_to_tok[index]) f.write("\n") f.close() print("Performing Evaluation") if args.use_gpu == 0: network.cpu() network.eval() tmp_filename = '%s/_test_new' % (save_dir) if args.use_gpu == 1: print("Using GPU....") network.cuda() with codecs.open(tmp_filename, "w", encoding="utf8", errors="ignore") as writer: for inputs, labels in batch(testCorpus, testLabelsRaw, 1): # for every sentence in the test data, convert the input to tensor x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch( inputs, labels, inputVocabulary, targetVocabulary, charVocabulary, max_filter_width, args.use_gpu) # get predictions by calling the forward() function of the model loss, preds, probs = network.forward( x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev, args.use_gpu) count = 0 # get the labels and write the output to the file for i in range(len(inputs)): for j in range(len(inputs[i])): writer.write(inputs[i][j]) for k in 
range(probs[i][j].size()[0]): writer.write(" " + str(probs[i][j][k].item())) writer.write(" " + inputs[i][j] + " " + labels[i][j] + " " + targetVocabulary.__get_index__( preds[i][j].item()).upper()) writer.write("\n") writer.write("\n") writer.close() # Calulate the F-Score on the predicted output acc, precision, recall, f1 = evaluate(tmp_filename, save_dir) print( 'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1)) else: if args.use_gpu == 1: print("Using GPU....") network.cuda() if args.train_from: print("Loading pre-trained model from " + args.train_from) network.load_state_dict(torch.load(args.train_from)) print("Training....") prev_error = 1000.0 network.eval() tmp_filename = '%s/_dev' % (save_dir) current_epoch_loss = 0.0 with codecs.open(tmp_filename, "w", encoding="utf8", errors="ignore") as writer: for inputs, labels in batch(devCorpus, devLabelsRaw, 1): # for every sentence in the test data, convert the input to tensor x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch( inputs, labels, inputVocabulary, targetVocabulary, charVocabulary, max_filter_width, args.use_gpu) # get the loss by calling the forward() function of the model loss, _ = network.loss(x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev, args.use_gpu) current_epoch_loss = current_epoch_loss + loss.item() # get the predictions by calling the forward() function of the model loss, preds, probs = network.forward( x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev, args.use_gpu) count = 0 # get the labels and write the output to the file for i in range(len(inputs)): for j in range(len(inputs[i])): writer.write(inputs[i][j] + " " + labels[i][j] + " " + targetVocabulary.__get_index__( preds[i][j].item()).upper()) writer.write("\n") writer.write("\n") writer.close() # Calulate the F-Score on the predicted 
output acc, precision, recall, f1 = evaluate(tmp_filename, save_dir) print( 'dev loss: %.2f, dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (current_epoch_loss, acc, precision, recall, f1)) for epoch in range(1, num_epochs + 1): print('Epoch %d ( learning rate=%.4f ): ' % (epoch, lr)) train_err = 0. train_corr = 0. train_total = 0. start_time = time.time() num_back = 0 network.train() count = 0 count_batch = 0 with tqdm(total=(len(trainCorpusRawSorted))) as pbar: for inputs, labels in batch(trainCorpusRawSorted, trainLabelsRawSorted, batch_size): # for every sentence in the test data, convert the input to tensor x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch( inputs, labels, inputVocabulary, targetVocabulary, charVocabulary, max_filter_width, args.use_gpu) optim.zero_grad() # get the loss by calling the forward() function of the model loss, _ = network.loss(x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev, args.use_gpu) # calculate gradients by calling backward() function and call optim.step() to perform gradient updation loss.backward() optim.step() train_err += loss.item() train_total += batch_length.data.sum() count = count + current_batch_size count_batch = count_batch + 1 time_ave = (time.time() - start_time) / count time_left = (num_batches - count_batch) * time_ave pbar.update(1) print('train: %d loss: %.4f, time: %.2fs' % (num_batches, train_err / count, time.time() - start_time)) network.eval() tmp_filename = '%s/_dev%d' % (save_dir, epoch) current_epoch_loss = 0.0 with codecs.open(tmp_filename, "w", encoding="utf8", errors="ignore") as writer: for inputs, labels in batch(devCorpus, devLabelsRaw, 1): # for every sentence in the test data, convert the input to tensor x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch( inputs, labels, inputVocabulary, targetVocabulary, 
charVocabulary, max_filter_width, args.use_gpu) # get the loss by calling the forward() function of the model loss, _ = network.loss(x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev, args.use_gpu) current_epoch_loss = current_epoch_loss + loss.item() # get the predictions by calling the forward() function of the model loss, preds, probs = network.forward( x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev, args.use_gpu) count = 0 # get the labels and write the output to the file for i in range(len(inputs)): for j in range(len(inputs[i])): writer.write(inputs[i][j] + " " + labels[i][j] + " " + targetVocabulary.__get_index__( preds[i][j].item()).upper()) writer.write("\n") writer.write("\n") writer.close() # Calulate the F-Score on the predicted output acc, precision, recall, f1 = evaluate(tmp_filename, save_dir) print( 'dev loss: %.2f, dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (current_epoch_loss, acc, precision, recall, f1)) if epoch > 1: if current_epoch_loss > prev_error: # if the validation loss has increased, load the previous epoch model and reduce the learning rate lr = lr * 0.7 optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True) network.load_state_dict(torch.load(save_dir + "/model")) network.eval() if lr < 0.002: # if the learning rate is less than 0.002, stop the training network.eval() tmp_filename = '%s/_test%d' % (save_dir, epoch) with codecs.open(tmp_filename, "w", encoding="utf8", errors="ignore") as writer: for inputs, labels in batch( testCorpus, testLabelsRaw, 1): # for every sentence in the test data, convert the input to tensor x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch( inputs, labels, inputVocabulary, targetVocabulary, charVocabulary, max_filter_width, args.use_gpu) # get the predictions by calling the forward() function 
of the model loss, preds, probs = network.forward( x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev, args.use_gpu) count = 0 # get the labels and write the output to the file for i in range(len(inputs)): for j in range(len(inputs[i])): writer.write(inputs[i][j]) for k in range(probs[i][j].size()[0]): writer.write( " " + str(probs[i][j][k].item())) writer.write( " " + inputs[i][j] + " " + labels[i][j] + " " + targetVocabulary.__get_index__( preds[i][j].item()).upper()) writer.write("\n") writer.write("\n") writer.close() # Calulate the F-Score on the predicted output acc, precision, recall, f1 = evaluate( tmp_filename, save_dir) print( 'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1)) exit() else: prev_error = current_epoch_loss torch.save(network.state_dict(), save_dir + "/model") network.eval() tmp_filename = '%s/_test%d' % (save_dir, epoch) with codecs.open(tmp_filename, "w", encoding="utf8", errors="ignore") as writer: for inputs, labels in batch(testCorpus, testLabelsRaw, 1): # for every sentence in the test data, convert the input to tensor x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch( inputs, labels, inputVocabulary, targetVocabulary, charVocabulary, max_filter_width, args.use_gpu) # get the predictions by calling the forward() function of the model loss, preds, probs = network.forward( x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev, args.use_gpu) count = 0 # get the labels and write the output to the file for i in range(len(inputs)): for j in range(len(inputs[i])): writer.write( inputs[i][j] + " " + labels[i][j] + " " + targetVocabulary.__get_index__( preds[i][j].item()).upper()) writer.write("\n") writer.write("\n") writer.close() # Calulate the F-Score on the predicted output acc, precision, recall, f1 = evaluate( tmp_filename, save_dir) print( 'test 
acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1)) else: prev_error = current_epoch_loss torch.save(network.state_dict(), save_dir + "/model")
def create_vocab(all_data, min_count=3):
    """Collect the ``data`` of every instance into a single word vocabulary.

    Args:
        all_data: Iterable of task datasets; each dataset is itself an
            iterable of instances exposing a ``data`` attribute.
        min_count: Forwarded unchanged to ``Vocab`` as ``min_count``
            (presumably a minimum-frequency threshold -- confirm against
            the ``Vocab`` implementation).

    Returns:
        The ``Vocab`` instance (constructed with ``bos=None`` and
        ``eos=None``) after ``add`` has been called once per instance, in
        dataset order.
    """
    vocab = Vocab(min_count=min_count, bos=None, eos=None)
    # Flatten the task -> instance nesting lazily so iteration order matches
    # a plain nested loop over datasets and their instances.
    every_instance = (example for dataset in all_data for example in dataset)
    for example in every_instance:
        vocab.add(example.data)
    return vocab