Example #1
def get_embed_vocab(embed_file):
    assert os.path.exists(embed_file)
    embed_vocab = Vocab(bos=None, eos=None)
    vec_dim = 0
    # first pass: collect the vocabulary and the embedding dimension
    with open(embed_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.strip().split(' ')
            if len(tokens) < 10:
                # skip lines too short to be an embedding entry (e.g. a header line)
                continue
            embed_vocab.add(tokens[0])
            if vec_dim == 0:
                vec_dim = len(tokens[1:])

    # randomly initialize the full matrix, then overwrite rows that have a pre-trained vector
    embed_weights = np.random.uniform(-0.5 / vec_dim, 0.5 / vec_dim,
                                      (len(embed_vocab), vec_dim))
    with open(embed_file, 'r', encoding='utf-8') as fin:
        # second pass: copy each pre-trained vector into its row
        for line in fin:
            tokens = line.strip().split(' ')
            if len(tokens) < 10:
                continue
            idx = embed_vocab.inst2idx(tokens[0])
            embed_weights[idx] = np.asarray(tokens[1:], dtype=np.float32)
    embed_weights[embed_vocab.pad_idx] = 0.  # padding token keeps an all-zero vector
    embed_weights /= np.std(embed_weights)  # rescale to unit standard deviation
    embed_vocab.embeddings = embed_weights
    return embed_vocab
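The same initialization recipe can be tried in isolation. Below is a minimal, self-contained sketch (not part of the example above): it substitutes a plain dict for the Vocab class and uses an invented two-word embedding table, but it follows the same steps of a random-uniform fill, a per-row overwrite with pre-trained vectors, a zeroed padding row, and rescaling to unit standard deviation.

import numpy as np

# hypothetical vocabulary: index 0 is the padding token
token_to_idx = {'<pad>': 0, 'hello': 1, 'world': 2}
pretrained = {'hello': [0.1, 0.2, 0.3], 'world': [0.3, 0.2, 0.1]}  # toy vectors
vec_dim = 3

weights = np.random.uniform(-0.5 / vec_dim, 0.5 / vec_dim,
                            (len(token_to_idx), vec_dim))
for word, vec in pretrained.items():
    weights[token_to_idx[word]] = np.asarray(vec, dtype=np.float32)
weights[token_to_idx['<pad>']] = 0.  # zero out the padding row
weights /= np.std(weights)           # rescale to unit standard deviation
print(weights.shape)                 # (3, 3)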
Example #2
def prepare(args):
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)
    logger.info('Preparing the directories...')
    for dir_path in [
            args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir
    ]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.gpus, args.batch_size, args.train_files,
                          args.dev_files, args.test_files)
    vocab = Vocab(init_random=False, trainable_oov_cnt_threshold=2)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    #     vocab.build_embedding_matrix(args.pretrained_word_path)
    vocab.randomly_init_embeddings(args.embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')
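For completeness, here is a minimal sketch of the loading side, assuming the same vocab_dir layout and file name used above (the helper name load_vocab is hypothetical):

import os
import pickle

def load_vocab(vocab_dir):
    # restore the Vocab object that prepare() pickled to vocab.data
    with open(os.path.join(vocab_dir, 'vocab.data'), 'rb') as fin:
        return pickle.load(fin)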
Example #3
def pdtb_prepare(args):
    print('Loading dataset...')
    train_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num)) for section_num in
                      PathConfig.train_sections]
    dev_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num)) for section_num in
                    PathConfig.dev_sections]
    test_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num)) for section_num in
                     PathConfig.test_sections]
    dataset = PDTBDataSet(train_sections, dev_sections, test_sections, level=2 if args.task.startswith('fine') else 1)
    print('Size of train: {}, dev: {}, test: {}'.format(len(dataset.train_set), len(dataset.dev_set),
                                                        len(dataset.test_set)))
    print('Creating word vocab...')
    if not os.path.exists(PathConfig.experiment_data_dir):
        os.mkdir(PathConfig.experiment_data_dir)
    word_vocab = Vocab(mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    for word in dataset.get_all_words():
        word_vocab.add(word)
    word_vocab.load_pretrained_emb(PathConfig.embedding_path)
    print('Size of word vocab: {}'.format(word_vocab.size()))
    torch.save(word_vocab, os.path.join(PathConfig.experiment_data_dir, 'word_vocab.obj'))
    tag_vocab = Vocab()
    for tag in dataset.get_all_tags():
        tag_vocab.add(tag)
    print('Size of tag vocab: {}'.format(tag_vocab.size()))
    tag_vocab.init_embed(ModelConfig.tag_embed_dim)
    torch.save(tag_vocab, os.path.join(PathConfig.experiment_data_dir, 'tag_vocab.obj'))
    print('Formatting the dataset to torch variables...')
    dataset.format_instances_to_torch_var(word_vocab, tag_vocab)
    torch.save(dataset, os.path.join(PathConfig.experiment_data_dir, 'dataset.obj'))
Example #4
def prepare_data():
    # load the dataset
    train_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.train_sections
    ]
    dev_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.dev_sections
    ]
    test_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.test_sections
    ]
    train_dataset = PDTBDataSet(train_sections,
                                tree_type=args.tree_type,
                                level=args.level,
                                multiple_labels=False)
    dev_dataset = PDTBDataSet(dev_sections,
                              tree_type=args.tree_type,
                              level=args.level,
                              multiple_labels=True)
    test_dataset = PDTBDataSet(test_sections,
                               tree_type=args.tree_type,
                               level=args.level,
                               multiple_labels=True)
    if not (train_dataset.consistent_with(dev_dataset)
            and dev_dataset.consistent_with(test_dataset)):
        print('Dataset labels are not consistent.')
        print('Train: {}'.format(sorted(train_dataset.label_map.keys())))
        print('Dev: {}'.format(sorted(dev_dataset.label_map.keys())))
        print('Test: {}'.format(sorted(test_dataset.label_map.keys())))
    print('Size of train set: {}, dev set: {}, test set: {}'.format(
        len(train_dataset), len(dev_dataset), len(test_dataset)))
    # save the dataset
    torch.save(train_dataset,
               os.path.join(paths.experiment_data_dir, 'train.data'))
    torch.save(dev_dataset, os.path.join(paths.experiment_data_dir,
                                         'dev.data'))
    torch.save(test_dataset,
               os.path.join(paths.experiment_data_dir, 'test.data'))
    # build the vocab
    vocab = Vocab(
        mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    all_words = (train_dataset.get_all_words() + dev_dataset.get_all_words() +
                 test_dataset.get_all_words())
    # all_words = train_dataset.get_all_words()
    for word in all_words:
        vocab.add(word)
    # load and initialize the embeddings
    vocab.load_pretrained_emb(paths.embedding_path)
    print('Size of PDTB vocabulary: {}'.format(vocab.size()))
    # save the vocab
    torch.save(vocab, paths.vocab_path)
Example #5
def create_vocab(data_path):
    wd_vocab = Vocab(min_count=3, bos=None, eos=None)
    lbl_vocab = Vocab(pad=None, unk=None, bos=None, eos=None)
    assert os.path.exists(data_path)
    with open(data_path, 'r', encoding='utf-8') as fin:
        loader = map(lambda x: x.strip().split('|||'), fin)
        for lbl, data_item in loader:
            wds = data_item.strip().split(' ')
            wd_vocab.add(wds)
            lbl_vocab.add(lbl.strip())
    return MultiVocab({'word': wd_vocab, 'label': lbl_vocab})
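The input format consumed above is not shown in the example. The following is a small, self-contained sketch of a line it would parse, assuming a "label ||| whitespace-tokenized text" layout (the sample content is invented):

# one data line: a label, a '|||' separator, then the tokenized text
sample_line = "positive ||| this movie was great"
lbl, data_item = sample_line.strip().split('|||')
wds = data_item.strip().split(' ')
print(lbl.strip(), wds)  # positive ['this', 'movie', 'was', 'great']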
Example #6
def create_vocab(datasets, embed_file=None, bert_vocab_path=None, min_count=2):
    wd_vocab = Vocab(min_count, bos=None, eos=None)
    char_vocab = Vocab(bos=None, eos=None)
    tag_vocab = Vocab(bos=None, eos=None)
    ner_vocab = Vocab(bos=None, eos=None)
    for insts in datasets:
        for inst in insts:
            wd_vocab.add(inst.word)
            char_vocab.add(list(inst.word))
            tag_vocab.add(inst.pos_tag)

            if inst.ner_tag != 'O':
                # including PER ORG LOC MISC and UNK
                ner_tag = inst.ner_tag.split('-')[1]
                ner_vocab.add(ner_tag)

    embed_count = wd_vocab.load_embeddings(embed_file)
    print("%d word pre-trained embeddings loaded..." % embed_count)

    bert_vocab = BERTVocab(
        bert_vocab_path) if bert_vocab_path is not None else None

    return MultiVocab(
        dict(word=wd_vocab,
             char=char_vocab,
             tag=tag_vocab,
             ner=ner_vocab,
             bert=bert_vocab))
Example #7
def create_vocab(data_path, min_count=3):
    root_rel = None
    wd_vocab = Vocab(min_count, eos=None)
    char_vocab = Vocab(min_count, eos=None)
    tag_vocab = Vocab(eos=None)
    rel_vocab = Vocab(bos=None, eos=None)
    with open(data_path, 'r', encoding='utf-8') as fr:
        for deps in read_deps(fr):
            for dep in deps:
                wd_vocab.add(dep.form)
                char_vocab.add(list(dep.form))
                tag_vocab.add(dep.pos_tag)

                if dep.head != 0:
                    rel_vocab.add(dep.dep_rel)
                elif root_rel is None:
                    root_rel = dep.dep_rel
                    rel_vocab.add(dep.dep_rel)
                elif root_rel != dep.dep_rel:
                    print('root = ' + root_rel + ', rel for root = ' +
                          dep.dep_rel)

    return MultiVocab(
        dict(word=wd_vocab, char=char_vocab, tag=tag_vocab, rel=rel_vocab))
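The root-relation consistency check above can be illustrated in isolation. This is a minimal sketch that uses a namedtuple in place of the real dependency objects returned by read_deps (the sample sentence and field values are invented):

from collections import namedtuple

Dep = namedtuple('Dep', 'form pos_tag head dep_rel')
deps = [Dep('He', 'PRP', 2, 'nsubj'),
        Dep('runs', 'VBZ', 0, 'root'),   # head == 0 marks the root token
        Dep('fast', 'RB', 2, 'advmod')]

root_rel = None
for dep in deps:
    if dep.head != 0:
        continue                          # only root tokens are checked here
    if root_rel is None:
        root_rel = dep.dep_rel            # remember the first root relation seen
    elif root_rel != dep.dep_rel:
        print('root = ' + root_rel + ', rel for root = ' + dep.dep_rel)
print(root_rel)                           # root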
Example #8
def main():
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional LSTM-CNN')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=1,
                        help='Number of sentences in each batch')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=200,
                        help='Number of hidden units in RNN')
    parser.add_argument('--num_filters',
                        type=int,
                        default=35,
                        help='Number of filters in CNN')
    parser.add_argument('--min_filter_width',
                        type=int,
                        default=3,
                        help='Minimum filter width in CNN')
    parser.add_argument('--max_filter_width',
                        type=int,
                        default=7,
                        help='Maximum filter width in CNN')
    parser.add_argument('--embedDimension',
                        type=int,
                        default=300,
                        help='embedding dimension')

    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.4,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weight for regularization')
    parser.add_argument('--schedule',
                        type=int,
                        default=1,
                        help='schedule for learning rate decay')
    parser.add_argument('--embedding_vectors', help='path for embedding dict')
    parser.add_argument('--train')
    parser.add_argument('--trainAux')
    parser.add_argument('--dev')
    parser.add_argument('--test')

    parser.add_argument('--ner_tag_field_l1',
                        type=int,
                        default=1,
                        help='ner tag field')
    parser.add_argument('--ner_tag_field_l2',
                        type=int,
                        default=1,
                        help='ner tag field')
    parser.add_argument('--use_gpu', type=int, default=0, help='use gpu')

    parser.add_argument('--save_dir')

    parser.add_argument('--vocabChar')
    parser.add_argument('--vocabOutput')
    parser.add_argument('--vocabOutputAux')
    parser.add_argument('--vocabInput')

    args = parser.parse_args()

    use_gpu = args.use_gpu

    train_path = args.train
    train_path_aux = args.trainAux
    dev_path = args.dev
    test_path = args.test

    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size

    num_filters = args.num_filters
    min_filter_width = args.min_filter_width
    max_filter_width = args.max_filter_width

    learning_rate = args.learning_rate
    momentum = 0.01 * learning_rate
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule

    save_dir = args.save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    embedding_path = args.embedding_vectors

    inputVocabulary = Vocab()
    charVocabulary = CharVocab()
    targetVocabulary = Vocab()
    targetVocabularyAux = Vocab()

    if args.vocabChar:
        with open(args.vocabChar, "r") as f:
            charVocabulary.__dict__ = json.load(f)
        charVocabulary.set_freeze()
        charVocabulary.process()

    if args.vocabOutput:
        with open(args.vocabOutput, "r") as f:
            targetVocabulary.__dict__ = json.load(f)
        targetVocabulary.set_freeze()
        targetVocabulary.process()

    if args.vocabOutputAux:
        with open(args.vocabOutputAux, "r") as f:
            targetVocabularyAux.__dict__ = json.load(f)
        targetVocabularyAux.set_freeze()
        targetVocabularyAux.process()

    embedding_vocab = None

    if args.embedding_vectors:
        print(args.embedding_vectors)
        embedd_dict, embedding_vocab, reverse_word_vocab, vocabularySize, embeddingDimension = load_embeddings(
            embedding_path)
        print("Read Word Embedding of dimension " + str(embeddingDimension) +
              " for " + str(vocabularySize) + " number of words")

        for everyWord in embedding_vocab:
            inputVocabulary.add(everyWord)
        inputVocabulary.set_freeze()
        inputVocabulary.process()
    else:
        if args.vocabInput:
            with open(args.vocabInput, "r") as f:
                inputVocabulary.__dict__ = json.load(f)
            inputVocabulary.set_freeze()
            inputVocabulary.process()
        else:
            print(
                "Neither pre-trained word embeddings nor input vocabulary is specified"
            )
            exit()

    if charVocabulary.__is_empty__():
        charVocabulary.add("<S>")
        charVocabulary.add("</S>")

    trainCorpus, trainLabelsRaw, maxTrainLength = readCoNLL(
        train_path, charVocabulary, targetVocabulary, args.ner_tag_field_l1,
        embedding_vocab)
    print("Train Corpus contains " + str(len(trainCorpus)) +
          " sentences and maximum sentence length is " + str(maxTrainLength))

    trainCorpusRawSorted = trainCorpus
    trainLabelsRawSorted = trainLabelsRaw

    trainCorpusAux, trainLabelsRawAux, maxTrainLengthAux = readCoNLL(
        train_path_aux, charVocabulary, targetVocabularyAux,
        args.ner_tag_field_l2, embedding_vocab)
    print("Auxiliary Train Corpus contains " + str(len(trainCorpusAux)) +
          " sentences and maximum sentence length is " +
          str(maxTrainLengthAux))

    trainCorpusRawSortedAux = trainCorpusAux
    trainLabelsRawSortedAux = trainLabelsRawAux

    devCorpus, devLabelsRaw, maxDevLength = readCoNLL(dev_path, charVocabulary,
                                                      targetVocabulary,
                                                      args.ner_tag_field_l1,
                                                      embedding_vocab)
    print("Dev Corpus contains " + str(len(devCorpus)) +
          " sentences and maximum sentence length is " + str(maxDevLength))

    testCorpus, testLabelsRaw, maxTestLength = readCoNLL(
        test_path, charVocabulary, targetVocabulary, args.ner_tag_field_l1,
        embedding_vocab)
    print("Test Corpus contains " + str(len(testCorpus)) +
          " sentences and maximum sentence length is " + str(maxTestLength))

    if not targetVocabulary.get_freeze():
        print(targetVocabulary._tok_to_ind)

        tmp_filename = '%s/output.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(targetVocabulary.__dict__, f)
        targetVocabulary.set_freeze()

    if not targetVocabularyAux.get_freeze():
        print(targetVocabularyAux._tok_to_ind)

        tmp_filename = '%s/output_aux.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(targetVocabularyAux.__dict__, f)
        targetVocabularyAux.set_freeze()

    if not charVocabulary.get_freeze():
        tmp_filename = '%s/char.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(charVocabulary.__dict__, f)
        charVocabulary.set_freeze()

    embeddingDimension = args.embedDimension
    word_embedding = np.random.uniform(
        -0.1, 0.1, (inputVocabulary.__len__(), embeddingDimension))
    if args.embedding_vectors:
        for everyWord in inputVocabulary._tok_to_ind:
            if everyWord in embedding_vocab:
                word_embedding[inputVocabulary.__get_word__(
                    everyWord)] = embedd_dict[embedding_vocab[everyWord]]

        tmp_filename = '%s/input.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(inputVocabulary.__dict__, f)
        inputVocabulary.set_freeze()

        del embedd_dict
        del reverse_word_vocab
        del vocabularySize
        del embedding_vocab

    print("Read " + str(targetVocabulary.__len__()) +
          " number of target words")
    print("Read " + str(targetVocabularyAux.__len__()) +
          " number of target words")
    print("Read " + str(inputVocabulary.__len__()) + " number of input words")
    print("Read " + str(charVocabulary.__len__()) + " number of characters")

    print("Number of epochs = " + str(num_epochs))
    print("Mini-Batch size = " + str(batch_size))
    print("Bi-LSTM Hidden size = " + str(hidden_size))
    print("Features per CNN filter = " + str(num_filters))
    print("Minimum ngrams for CNN filter = " + str(min_filter_width))
    print("Maximum ngrams for CNN filter = " + str(max_filter_width))
    print("Initial Learning Rate = " + str(learning_rate))

    network = BiCNNLSTMTranstion(inputVocabulary.__len__(), embeddingDimension,
                                 min_filter_width, max_filter_width,
                                 charVocabulary.__len__(),
                                 num_filters, hidden_size,
                                 targetVocabulary.__len__(), word_embedding,
                                 targetVocabularyAux.__len__())

    lr = learning_rate
    lr_aux = learning_rate * 0.1

    optim = SGD(network.parameters(),
                lr=lr,
                momentum=momentum,
                weight_decay=gamma,
                nesterov=True)

    num_batches = len(trainCorpus) // batch_size + 1

    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0

    print("Training....")

    if use_gpu == 1:
        network.cuda()

    prev_error = 100000.0

    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d ( learning rate=%.4f, decay rate=%.4f (schedule=%d)): ' %
            (epoch, lr, decay_rate, schedule))

        train_err = 0.
        train_corr = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()

        count = 0
        count_batch = 0

        l1_indices = list(range(len(trainCorpusRawSorted)))
        l2_indices = list(range(len(trainCorpusRawSortedAux)))

        with tqdm(total=(len(trainCorpusRawSorted) +
                         len(trainCorpusRawSortedAux))) as pbar:

            for l1, l2 in zip_longest(l1_indices, l2_indices):
                if l1 is not None:
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        [trainCorpusRawSorted[l1]], [trainLabelsRawSorted[l1]],
                        inputVocabulary, targetVocabulary, charVocabulary,
                        max_filter_width, args.use_gpu)

                    optim.zero_grad()
                    loss, _ = network.loss(x_input, batch_length,
                                           current_batch_size,
                                           current_max_sequence_length,
                                           y_output, mask, y_prev, 0, use_gpu)

                    loss.backward()
                    optim.step()

                    train_err += loss.item()
                    train_total += batch_length.data.sum()

                    count = count + current_batch_size
                    count_batch = count_batch + 1

                if l2 is not None:
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        [trainCorpusRawSortedAux[l2]],
                        [trainLabelsRawSortedAux[l2]], inputVocabulary,
                        targetVocabularyAux, charVocabulary, max_filter_width,
                        args.use_gpu)

                    optim.zero_grad()
                    loss, _ = network.loss(x_input, batch_length,
                                           current_batch_size,
                                           current_max_sequence_length,
                                           y_output, mask, y_prev, 1, use_gpu)

                    loss.backward()
                    optim.step()

                time_ave = (time.time() - start_time) / count
                time_left = (num_batches - count_batch) * time_ave
                pbar.update(2)

        print('train: %d loss: %.4f, time: %.2fs' %
              (num_batches, train_err / count, time.time() - start_time))

        network.eval()
        tmp_filename = '%s/_dev%d' % (save_dir, epoch)

        current_epoch_loss = 0.0

        with codecs.open(tmp_filename, "w", encoding="utf8",
                         errors="ignore") as writer:
            for inputs, labels in batch(devCorpus, devLabelsRaw, 1):
                x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                    inputs, labels, inputVocabulary, targetVocabulary,
                    charVocabulary, max_filter_width, args.use_gpu)

                loss, _ = network.loss(x_input, batch_length,
                                       current_batch_size,
                                       current_max_sequence_length, y_output,
                                       mask, y_prev, 0, use_gpu)
                current_epoch_loss = current_epoch_loss + loss.item()

                loss, preds = network.forward(x_input, batch_length,
                                              current_batch_size,
                                              current_max_sequence_length,
                                              y_output, mask, y_prev, 0,
                                              use_gpu)

                count = 0

                for i in range(len(inputs)):
                    for j in range(len(inputs[i])):
                        writer.write(inputs[i][j] + " " + labels[i][j] + " " +
                                     targetVocabulary.__get_index__(
                                         preds[i][j].item()).upper())
                        writer.write("\n")
                    writer.write("\n")

            writer.close()

        acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
        print(
            'dev loss: %.2f, dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
            % (current_epoch_loss, acc, precision, recall, f1))

        if current_epoch_loss > prev_error:
            lr = lr * 0.7
            optim = SGD(network.parameters(),
                        lr=lr,
                        momentum=momentum,
                        weight_decay=gamma,
                        nesterov=True)

            network.load_state_dict(torch.load(save_dir + "/model"))
            network.eval()

            if lr < 0.002:
                network.eval()
                tmp_filename = '%s/_test%d' % (save_dir, epoch)

                with codecs.open(tmp_filename,
                                 "w",
                                 encoding="utf8",
                                 errors="ignore") as writer:
                    for inputs, labels in batch(testCorpus, testLabelsRaw, 1):
                        x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                            inputs, labels, inputVocabulary, targetVocabulary,
                            charVocabulary, max_filter_width, args.use_gpu)

                        loss, preds = network.forward(
                            x_input, batch_length, current_batch_size,
                            current_max_sequence_length, y_output, mask,
                            y_prev, 0, use_gpu)

                        count = 0

                        for i in range(len(inputs)):
                            for j in range(len(inputs[i])):
                                writer.write(inputs[i][j] + " " +
                                             labels[i][j] + " " +
                                             targetVocabulary.__get_index__(
                                                 preds[i][j].item()).upper())
                                writer.write("\n")
                            writer.write("\n")

                    writer.close()

                acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
                print(
                    'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                    % (acc, precision, recall, f1))
                exit()
        else:
            prev_error = current_epoch_loss
            torch.save(network.state_dict(), save_dir + "/model")

            network.eval()
            tmp_filename = '%s/_test%d' % (save_dir, epoch)

            with codecs.open(tmp_filename,
                             "w",
                             encoding="utf8",
                             errors="ignore") as writer:
                for inputs, labels in batch(testCorpus, testLabelsRaw, 1):
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        inputs, labels, inputVocabulary, targetVocabulary,
                        charVocabulary, max_filter_width, args.use_gpu)

                    loss, preds = network.forward(x_input, batch_length,
                                                  current_batch_size,
                                                  current_max_sequence_length,
                                                  y_output, mask, y_prev, 0,
                                                  use_gpu)

                    count = 0

                    for i in range(len(inputs)):
                        for j in range(len(inputs[i])):
                            writer.write(inputs[i][j] + " " + labels[i][j] +
                                         " " + targetVocabulary.__get_index__(
                                             preds[i][j].item()).upper())
                            writer.write("\n")
                        writer.write("\n")

                writer.close()

            acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
            print(
                'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                % (acc, precision, recall, f1))

            torch.save(network.state_dict(), save_dir + "/model")
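The plateau handling in the training loop above (reload the best checkpoint and shrink the learning rate by 0.7 whenever the dev loss goes up, stop once the rate falls below 0.002) can be shown on its own. The sketch below is a self-contained illustration under assumed values: the tiny linear model, the fake dev losses, and the checkpoint path are all invented for demonstration.

import torch
from torch import nn
from torch.optim import SGD

model = nn.Linear(4, 2)
lr = 0.4
optim = SGD(model.parameters(), lr=lr, momentum=0.01 * lr, nesterov=True)

best_path = 'best_model.pt'
prev_error = float('inf')
for dev_loss in [3.0, 2.5, 2.7, 2.6, 2.8]:   # stand-in for real dev losses
    if dev_loss > prev_error:
        lr *= 0.7                             # decay the learning rate
        optim = SGD(model.parameters(), lr=lr, momentum=0.01 * lr,
                    nesterov=True)
        model.load_state_dict(torch.load(best_path))  # roll back to best model
        if lr < 0.002:
            break                             # training would stop here
    else:
        prev_error = dev_loss                 # new best: remember and checkpoint
        torch.save(model.state_dict(), best_path)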
Example #9
def main():

    parser = argparse.ArgumentParser(
        description='Training a Sequence Labeler with bi-directional LSTM-CNN')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=5,
                        help='Number of sentences in each batch')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=200,
                        help='Number of hidden units in RNN')
    parser.add_argument('--num_filters',
                        type=int,
                        default=35,
                        help='Number of filters in CNN')
    parser.add_argument('--min_filter_width',
                        type=int,
                        default=3,
                        help='Minimum filter width in CNN')
    parser.add_argument('--max_filter_width',
                        type=int,
                        default=7,
                        help='Maximum filter width in CNN')
    parser.add_argument('--embedDimension',
                        type=int,
                        default=300,
                        help='embedding dimension')

    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.4,
                        help='Learning rate')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weight for regularization')
    parser.add_argument('--embedding_vectors', help='path for embedding dict')
    parser.add_argument('--embedding_dict_new', help='path for embedding dict')
    parser.add_argument('--train')
    parser.add_argument('--dev')
    parser.add_argument('--test')

    parser.add_argument('--vocabChar')
    parser.add_argument('--vocabOutput')
    parser.add_argument('--vocabInput')

    parser.add_argument('--ner_tag_field',
                        type=int,
                        default=1,
                        help='ner tag field')
    parser.add_argument('--use_gpu', type=int, default=1, help='use gpu')

    parser.add_argument('--fineTune',
                        type=bool,
                        default=False,
                        help='fineTune pretrained word embeddings')

    parser.add_argument('--save-dir')
    parser.add_argument('--perform_evaluation',
                        type=bool,
                        default=False,
                        help='perform evaluation only')
    parser.add_argument('--deploy', type=bool, default=False, help='deploy')

    parser.add_argument('--train_from', type=str, default="")

    args = parser.parse_args()

    train_path = args.train
    dev_path = args.dev
    test_path = args.test

    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size

    num_filters = args.num_filters
    min_filter_width = args.min_filter_width
    max_filter_width = args.max_filter_width

    learning_rate = args.learning_rate
    momentum = 0.01 * learning_rate
    gamma = args.gamma

    embedding_path = args.embedding_vectors

    save_dir = args.save_dir

    # create the output folder if it does not exist
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    evaluation = args.perform_evaluation

    inputVocabulary = Vocab()
    charVocabulary = CharVocab()
    targetVocabulary = Vocab()

    # Read Character vocabulary if vocabChar argument is given
    if args.vocabChar:
        with open(args.vocabChar, "r") as f:
            charVocabulary.__dict__ = json.load(f)
        charVocabulary.set_freeze()
        charVocabulary.process()

    # Read Output vocabulary if the vocabOutput argument is given
    if args.vocabOutput:
        with open(args.vocabOutput, "r") as f:
            targetVocabulary.__dict__ = json.load(f)
        targetVocabulary.set_freeze()
        targetVocabulary.process()

    embedding_vocab = None

    # If the path to pre-trained embeddings is given
    if args.embedding_vectors:
        print(args.embedding_vectors)

        # load the pre-trained embeddings
        embedd_dict, embedding_vocab, reverse_word_vocab, vocabularySize, embeddingDimension = load_embeddings(
            embedding_path)
        print("Read Word Embedding of dimension " + str(embeddingDimension) +
              " for " + str(vocabularySize) + " number of words")

        # add the words to the Input vocabulary which is a dictionary of words
        for everyWord in embedding_vocab:
            inputVocabulary.add(everyWord)
        inputVocabulary.set_freeze()
        inputVocabulary.process()
    else:
        # Read Input vocabulary if the vocabInput argument is given
        if args.vocabInput:
            with open(args.vocabInput, "r") as f:
                inputVocabulary.__dict__ = json.load(f)
            inputVocabulary.set_freeze()
            inputVocabulary.process()
        else:
            print(
                "Neither pre-trained word embeddings nor input vocabulary is specified"
            )
            exit()

    # if the character vocabulary is empty, initialize it with beginning and end markers
    if charVocabulary.__is_empty__():
        charVocabulary.add("<S>")
        charVocabulary.add("</S>")

    if evaluation:
        # if we are performing evaluation, we do not require train and dev splits
        if not args.deploy:
            # if we are not deploying the model, then we are interested in testing the model
            testCorpus, testLabelsRaw, maxTestLength = readCoNLL(
                test_path, charVocabulary, targetVocabulary,
                args.ner_tag_field, inputVocabulary)
            print("Test Corpus contains " + str(len(testCorpus)) +
                  " sentences and maximum sentence length is " +
                  str(maxTestLength))
            print("Read " + str(len(charVocabulary)) + " number of characters")

        else:
            # if we are deploying the model, we are trying to get predictions on a plain corpus
            testCorpus, maxTestLength = readUnlabeledData(test_path)
            print("Test Corpus contains " + str(len(testCorpus)) +
                  " sentences and maximum sentence length is " +
                  str(maxTestLength))
    else:
        # read train split
        trainCorpus, trainLabelsRaw, maxTrainLength = readCoNLL(
            train_path, charVocabulary, targetVocabulary, args.ner_tag_field,
            embedding_vocab)
        print("Train Corpus contains " + str(len(trainCorpus)) +
              " sentences and maximum sentence length is " +
              str(maxTrainLength))

        trainCorpusRawSorted = trainCorpus
        trainLabelsRawSorted = trainLabelsRaw

        # read dev split
        devCorpus, devLabelsRaw, maxDevLength = readCoNLL(
            dev_path, charVocabulary, targetVocabulary, args.ner_tag_field,
            embedding_vocab)
        print("Dev Corpus contains " + str(len(devCorpus)) +
              " sentences and maximum sentence length is " + str(maxDevLength))

        # read test split
        testCorpus, testLabelsRaw, maxTestLength = readCoNLL(
            test_path, charVocabulary, targetVocabulary, args.ner_tag_field,
            embedding_vocab)
        print("Test Corpus contains " + str(len(testCorpus)) +
              " sentences and maximum sentence length is " +
              str(maxTestLength))

    if not targetVocabulary.get_freeze():
        # save the output vocabulary
        print(targetVocabulary._tok_to_ind)

        tmp_filename = '%s/output.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(targetVocabulary.__dict__, f)
        targetVocabulary.set_freeze()

    if not charVocabulary.get_freeze():
        # save the character vocabulary
        tmp_filename = '%s/char.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(charVocabulary.__dict__, f)
        charVocabulary.set_freeze()

    # initialize word embeddings randomly
    embeddingDimension = args.embedDimension
    word_embedding = np.random.uniform(
        -0.1, 0.1, (inputVocabulary.__len__(), embeddingDimension))

    if args.embedding_vectors:
        # pre-trained word embeddings are given, update the word_embedding variable with pre-trained embeddings
        for everyWord in inputVocabulary._tok_to_ind:
            if everyWord in embedding_vocab:
                word_embedding[inputVocabulary.__get_word__(
                    everyWord)] = embedd_dict[embedding_vocab[everyWord]]

        # save the input vocabulary
        tmp_filename = '%s/input.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(inputVocabulary.__dict__, f)
        inputVocabulary.set_freeze()

        del embedd_dict
        del reverse_word_vocab
        del vocabularySize
        del embedding_vocab

    print("Read " + str(targetVocabulary.__len__()) +
          " number of target words")
    print("Read " + str(inputVocabulary.__len__()) + " number of input words")
    print("Read " + str(charVocabulary.__len__()) + " number of characters")

    print("Number of epochs = " + str(num_epochs))
    print("Mini-Batch size = " + str(batch_size))
    print("Bi-LSTM Hidden size = " + str(hidden_size))
    print("Features per CNN filter = " + str(num_filters))
    print("Minimum ngrams for CNN filter = " + str(min_filter_width))
    print("Maximum ngrams for CNN filter = " + str(max_filter_width))
    print("Initial Learning Rate = " + str(learning_rate))

    use_gpu = args.use_gpu

    # create a Bi-LSTM network
    network = BiCNNLSTMTranstion(inputVocabulary.__len__(), embeddingDimension,
                                 min_filter_width, max_filter_width,
                                 charVocabulary.__len__(),
                                 num_filters, hidden_size,
                                 targetVocabulary.__len__(), word_embedding,
                                 args.fineTune)
    print(network)

    lr = learning_rate

    optim = SGD(network.parameters(),
                lr=lr,
                momentum=momentum,
                weight_decay=gamma,
                nesterov=True)

    if not evaluation:
        num_batches = len(trainCorpus) // batch_size + 1

    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0

    if evaluation:
        # if we are performing evaluation, load the trained model
        network.load_state_dict(torch.load(save_dir + "/model"))

        # save output vocabulary as a plain file
        tmp_filename = '%s/output.vocab.plain' % (save_dir)
        with open(tmp_filename, "w") as f:
            for index in range(len(targetVocabulary._ind_to_tok)):
                f.write(targetVocabulary._ind_to_tok[index])
                f.write("\n")
            f.close()

        # save input vocabulary as a plain file
        tmp_filename = '%s/input.vocab.plain' % (save_dir)
        with open(tmp_filename, "w") as f:
            for index in range(len(inputVocabulary._ind_to_tok)):
                f.write(inputVocabulary._ind_to_tok[index])
                f.write("\n")
            f.close()

        # save character vocabulary as a plain file
        tmp_filename = '%s/char.vocab.plain' % (save_dir)
        with open(tmp_filename, "w") as f:
            for index in range(len(charVocabulary._ind_to_tok)):
                f.write(charVocabulary._ind_to_tok[index])
                f.write("\n")
            f.close()

        print("Performing Evaluation")
        if args.use_gpu == 0:
            network.cpu()

        network.eval()
        tmp_filename = '%s/_test_new' % (save_dir)

        if args.use_gpu == 1:
            print("Using GPU....")
            network.cuda()

        with codecs.open(tmp_filename, "w", encoding="utf8",
                         errors="ignore") as writer:
            for inputs, labels in batch(testCorpus, testLabelsRaw, 1):
                # for every sentence in the test data, convert the input to tensor
                x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                    inputs, labels, inputVocabulary, targetVocabulary,
                    charVocabulary, max_filter_width, args.use_gpu)

                # get predictions by calling the forward() function of the model
                loss, preds, probs = network.forward(
                    x_input, batch_length, current_batch_size,
                    current_max_sequence_length, y_output, mask, y_prev,
                    args.use_gpu)

                count = 0

                # get the labels and write the output to the file
                for i in range(len(inputs)):
                    for j in range(len(inputs[i])):
                        writer.write(inputs[i][j])

                        for k in range(probs[i][j].size()[0]):
                            writer.write(" " + str(probs[i][j][k].item()))
                        writer.write(" " + inputs[i][j] + " " + labels[i][j] +
                                     " " + targetVocabulary.__get_index__(
                                         preds[i][j].item()).upper())
                        writer.write("\n")
                    writer.write("\n")

            writer.close()

        # Calculate the F-Score on the predicted output
        acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
        print(
            'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' %
            (acc, precision, recall, f1))
    else:
        if args.use_gpu == 1:
            print("Using GPU....")
            network.cuda()

        if args.train_from:
            print("Loading pre-trained model from " + args.train_from)
            network.load_state_dict(torch.load(args.train_from))

        print("Training....")
        prev_error = 1000.0

        network.eval()
        tmp_filename = '%s/_dev' % (save_dir)
        current_epoch_loss = 0.0

        with codecs.open(tmp_filename, "w", encoding="utf8",
                         errors="ignore") as writer:
            for inputs, labels in batch(devCorpus, devLabelsRaw, 1):
                # for every sentence in the dev data, convert the input to tensor
                x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                    inputs, labels, inputVocabulary, targetVocabulary,
                    charVocabulary, max_filter_width, args.use_gpu)

                # get the loss by calling the forward() function of the model
                loss, _ = network.loss(x_input, batch_length,
                                       current_batch_size,
                                       current_max_sequence_length, y_output,
                                       mask, y_prev, args.use_gpu)
                current_epoch_loss = current_epoch_loss + loss.item()

                # get the predictions by calling the forward() function of the model
                loss, preds, probs = network.forward(
                    x_input, batch_length, current_batch_size,
                    current_max_sequence_length, y_output, mask, y_prev,
                    args.use_gpu)

                count = 0

                # get the labels and write the output to the file
                for i in range(len(inputs)):
                    for j in range(len(inputs[i])):
                        writer.write(inputs[i][j] + " " + labels[i][j] + " " +
                                     targetVocabulary.__get_index__(
                                         preds[i][j].item()).upper())
                        writer.write("\n")
                    writer.write("\n")

            writer.close()

        # Calculate the F-Score on the predicted output
        acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
        print(
            'dev loss: %.2f, dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
            % (current_epoch_loss, acc, precision, recall, f1))

        for epoch in range(1, num_epochs + 1):
            print('Epoch %d ( learning rate=%.4f ): ' % (epoch, lr))

            train_err = 0.
            train_corr = 0.
            train_total = 0.

            start_time = time.time()
            num_back = 0
            network.train()

            count = 0
            count_batch = 0

            with tqdm(total=(len(trainCorpusRawSorted))) as pbar:
                for inputs, labels in batch(trainCorpusRawSorted,
                                            trainLabelsRawSorted, batch_size):
                    # for every sentence in the training data, convert the input to tensor
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        inputs, labels, inputVocabulary, targetVocabulary,
                        charVocabulary, max_filter_width, args.use_gpu)

                    optim.zero_grad()

                    # get the loss by calling the forward() function of the model
                    loss, _ = network.loss(x_input, batch_length,
                                           current_batch_size,
                                           current_max_sequence_length,
                                           y_output, mask, y_prev,
                                           args.use_gpu)

                    # calculate gradients with backward() and call optim.step() to apply the gradient update
                    loss.backward()
                    optim.step()

                    train_err += loss.item()
                    train_total += batch_length.data.sum()

                    count = count + current_batch_size
                    count_batch = count_batch + 1

                    time_ave = (time.time() - start_time) / count
                    time_left = (num_batches - count_batch) * time_ave
                    pbar.update(1)

            print('train: %d loss: %.4f, time: %.2fs' %
                  (num_batches, train_err / count, time.time() - start_time))

            network.eval()
            tmp_filename = '%s/_dev%d' % (save_dir, epoch)
            current_epoch_loss = 0.0

            with codecs.open(tmp_filename,
                             "w",
                             encoding="utf8",
                             errors="ignore") as writer:
                for inputs, labels in batch(devCorpus, devLabelsRaw, 1):
                    # for every sentence in the dev data, convert the input to tensor
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        inputs, labels, inputVocabulary, targetVocabulary,
                        charVocabulary, max_filter_width, args.use_gpu)

                    # get the loss by calling the forward() function of the model
                    loss, _ = network.loss(x_input, batch_length,
                                           current_batch_size,
                                           current_max_sequence_length,
                                           y_output, mask, y_prev,
                                           args.use_gpu)
                    current_epoch_loss = current_epoch_loss + loss.item()

                    # get the predictions by calling the forward() function of the model
                    loss, preds, probs = network.forward(
                        x_input, batch_length, current_batch_size,
                        current_max_sequence_length, y_output, mask, y_prev,
                        args.use_gpu)

                    count = 0

                    # get the labels and write the output to the file
                    for i in range(len(inputs)):
                        for j in range(len(inputs[i])):
                            writer.write(inputs[i][j] + " " + labels[i][j] +
                                         " " + targetVocabulary.__get_index__(
                                             preds[i][j].item()).upper())
                            writer.write("\n")
                        writer.write("\n")

                writer.close()

            # Calculate the F-Score on the predicted output
            acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
            print(
                'dev loss: %.2f, dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                % (current_epoch_loss, acc, precision, recall, f1))

            if epoch > 1:
                if current_epoch_loss > prev_error:
                    # if the validation loss has increased, load the previous epoch model and reduce the learning rate
                    lr = lr * 0.7
                    optim = SGD(network.parameters(),
                                lr=lr,
                                momentum=momentum,
                                weight_decay=gamma,
                                nesterov=True)

                    network.load_state_dict(torch.load(save_dir + "/model"))
                    network.eval()

                    if lr < 0.002:
                        # if the learning rate is less than 0.002, stop the training
                        network.eval()
                        tmp_filename = '%s/_test%d' % (save_dir, epoch)

                        with codecs.open(tmp_filename,
                                         "w",
                                         encoding="utf8",
                                         errors="ignore") as writer:
                            for inputs, labels in batch(
                                    testCorpus, testLabelsRaw, 1):
                                # for every sentence in the test data, convert the input to tensor
                                x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                                    inputs, labels, inputVocabulary,
                                    targetVocabulary, charVocabulary,
                                    max_filter_width, args.use_gpu)

                                # get the predictions by calling the forward() function of the model
                                loss, preds, probs = network.forward(
                                    x_input, batch_length, current_batch_size,
                                    current_max_sequence_length, y_output,
                                    mask, y_prev, args.use_gpu)

                                count = 0

                                # get the labels and write the output to the file
                                for i in range(len(inputs)):
                                    for j in range(len(inputs[i])):
                                        writer.write(inputs[i][j])

                                        for k in range(probs[i][j].size()[0]):
                                            writer.write(
                                                " " +
                                                str(probs[i][j][k].item()))
                                        writer.write(
                                            " " + inputs[i][j] + " " +
                                            labels[i][j] + " " +
                                            targetVocabulary.__get_index__(
                                                preds[i][j].item()).upper())
                                        writer.write("\n")
                                    writer.write("\n")

                            writer.close()

                        # Calculate the F-Score on the predicted output
                        acc, precision, recall, f1 = evaluate(
                            tmp_filename, save_dir)
                        print(
                            'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                            % (acc, precision, recall, f1))

                        exit()
                else:
                    prev_error = current_epoch_loss
                    torch.save(network.state_dict(), save_dir + "/model")

                    network.eval()
                    tmp_filename = '%s/_test%d' % (save_dir, epoch)

                    with codecs.open(tmp_filename,
                                     "w",
                                     encoding="utf8",
                                     errors="ignore") as writer:
                        for inputs, labels in batch(testCorpus, testLabelsRaw,
                                                    1):
                            # for every sentence in the test data, convert the input to tensor
                            x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                                inputs, labels, inputVocabulary,
                                targetVocabulary, charVocabulary,
                                max_filter_width, args.use_gpu)

                            # get the predictions by calling the forward() function of the model
                            loss, preds, probs = network.forward(
                                x_input, batch_length, current_batch_size,
                                current_max_sequence_length, y_output, mask,
                                y_prev, args.use_gpu)

                            count = 0

                            # get the labels and write the output to the file
                            for i in range(len(inputs)):
                                for j in range(len(inputs[i])):
                                    writer.write(
                                        inputs[i][j] + " " + labels[i][j] +
                                        " " + targetVocabulary.__get_index__(
                                            preds[i][j].item()).upper())
                                    writer.write("\n")
                                writer.write("\n")

                        writer.close()

                    # Calculate the F-Score on the predicted output
                    acc, precision, recall, f1 = evaluate(
                        tmp_filename, save_dir)
                    print(
                        'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                        % (acc, precision, recall, f1))
            else:
                prev_error = current_epoch_loss
                torch.save(network.state_dict(), save_dir + "/model")
Example #10
def create_vocab(all_data, min_count=3):
    wd_vocab = Vocab(min_count=min_count, bos=None, eos=None)
    for task_data in all_data:
        for inst in task_data:
            wd_vocab.add(inst.data)
    return wd_vocab