Example #1
def get_embed_vocab(embed_file):
    assert os.path.exists(embed_file)
    embed_vocab = Vocab(bos=None, eos=None)
    vec_dim = 0
    # first pass: collect the vocabulary and the embedding dimension
    with open(embed_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.strip().split(' ')
            if len(tokens) < 10:
                # skip lines too short to be an embedding entry (e.g. a header line)
                continue
            embed_vocab.add(tokens[0])
            if vec_dim == 0:
                vec_dim = len(tokens[1:])

    # randomly initialize the full matrix, then overwrite rows that have a pre-trained vector
    embed_weights = np.random.uniform(-0.5 / vec_dim, 0.5 / vec_dim,
                                      (len(embed_vocab), vec_dim))
    with open(embed_file, 'r', encoding='utf-8') as fin:
        # second pass: copy each pre-trained vector into its row
        for line in fin:
            tokens = line.strip().split(' ')
            if len(tokens) < 10:
                continue
            idx = embed_vocab.inst2idx(tokens[0])
            embed_weights[idx] = np.asarray(tokens[1:], dtype=np.float32)
    embed_weights[embed_vocab.pad_idx] = 0.  # padding token keeps an all-zero vector
    embed_weights /= np.std(embed_weights)  # rescale to unit standard deviation
    embed_vocab.embeddings = embed_weights
    return embed_vocab
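The same initialization recipe can be tried in isolation. Below is a minimal, self-contained sketch (not part of the example above): it substitutes a plain dict for the Vocab class and uses an invented two-word embedding table, but it follows the same steps of a random-uniform fill, a per-row overwrite with pre-trained vectors, a zeroed padding row, and rescaling to unit standard deviation.

import numpy as np

# hypothetical vocabulary: index 0 is the padding token
token_to_idx = {'<pad>': 0, 'hello': 1, 'world': 2}
pretrained = {'hello': [0.1, 0.2, 0.3], 'world': [0.3, 0.2, 0.1]}  # toy vectors
vec_dim = 3

weights = np.random.uniform(-0.5 / vec_dim, 0.5 / vec_dim,
                            (len(token_to_idx), vec_dim))
for word, vec in pretrained.items():
    weights[token_to_idx[word]] = np.asarray(vec, dtype=np.float32)
weights[token_to_idx['<pad>']] = 0.  # zero out the padding row
weights /= np.std(weights)           # rescale to unit standard deviation
print(weights.shape)                 # (3, 3)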
Example #2
def prepare(args):
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)
    logger.info('Preparing the directories...')
    for dir_path in [
            args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir
    ]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.gpus, args.batch_size, args.train_files,
                          args.dev_files, args.test_files)
    vocab = Vocab(init_random=False, trainable_oov_cnt_threshold=2)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    #     vocab.build_embedding_matrix(args.pretrained_word_path)
    vocab.randomly_init_embeddings(args.embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')
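For completeness, here is a minimal sketch of the loading side, assuming the same vocab_dir layout and file name used above (the helper name load_vocab is hypothetical):

import os
import pickle

def load_vocab(vocab_dir):
    # restore the Vocab object that prepare() pickled to vocab.data
    with open(os.path.join(vocab_dir, 'vocab.data'), 'rb') as fin:
        return pickle.load(fin)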
Example #3
def pdtb_prepare(args):
    print('Loading dataset...')
    train_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num)) for section_num in
                      PathConfig.train_sections]
    dev_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num)) for section_num in
                    PathConfig.dev_sections]
    test_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num)) for section_num in
                     PathConfig.test_sections]
    dataset = PDTBDataSet(train_sections, dev_sections, test_sections, level=2 if args.task.startswith('fine') else 1)
    print('Size of train: {}, dev: {}, test: {}'.format(len(dataset.train_set), len(dataset.dev_set),
                                                        len(dataset.test_set)))
    print('Creating word vocab...')
    if not os.path.exists(PathConfig.experiment_data_dir):
        os.mkdir(PathConfig.experiment_data_dir)
    word_vocab = Vocab(mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    for word in dataset.get_all_words():
        word_vocab.add(word)
    word_vocab.load_pretrained_emb(PathConfig.embedding_path)
    print('Size of word vocab: {}'.format(word_vocab.size()))
    torch.save(word_vocab, os.path.join(PathConfig.experiment_data_dir, 'word_vocab.obj'))
    tag_vocab = Vocab()
    for tag in dataset.get_all_tags():
        tag_vocab.add(tag)
    print('Size of tag vocab: {}'.format(tag_vocab.size()))
    tag_vocab.init_embed(ModelConfig.tag_embed_dim)
    torch.save(tag_vocab, os.path.join(PathConfig.experiment_data_dir, 'tag_vocab.obj'))
    print('Formatting the dataset to torch variables...')
    dataset.format_instances_to_torch_var(word_vocab, tag_vocab)
    torch.save(dataset, os.path.join(PathConfig.experiment_data_dir, 'dataset.obj'))
Example #4
def prepare_data():
    # load the dataset
    train_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.train_sections
    ]
    dev_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.dev_sections
    ]
    test_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.test_sections
    ]
    train_dataset = PDTBDataSet(train_sections,
                                tree_type=args.tree_type,
                                level=args.level,
                                multiple_labels=False)
    dev_dataset = PDTBDataSet(dev_sections,
                              tree_type=args.tree_type,
                              level=args.level,
                              multiple_labels=True)
    test_dataset = PDTBDataSet(test_sections,
                               tree_type=args.tree_type,
                               level=args.level,
                               multiple_labels=True)
    if not (train_dataset.consistent_with(dev_dataset)
            and dev_dataset.consistent_with(test_dataset)):
        print('Dataset labels are not consistent.')
        print('Train: {}'.format(sorted(train_dataset.label_map.keys())))
        print('Dev: {}'.format(sorted(dev_dataset.label_map.keys())))
        print('Test: {}'.format(sorted(test_dataset.label_map.keys())))
    print('Size of train set: {}, dev set: {}, test set: {}'.format(
        len(train_dataset), len(dev_dataset), len(test_dataset)))
    # save the dataset
    torch.save(train_dataset,
               os.path.join(paths.experiment_data_dir, 'train.data'))
    torch.save(dev_dataset, os.path.join(paths.experiment_data_dir,
                                         'dev.data'))
    torch.save(test_dataset,
               os.path.join(paths.experiment_data_dir, 'test.data'))
    # build the vocab
    vocab = Vocab(
        mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    all_words = (train_dataset.get_all_words() + dev_dataset.get_all_words() +
                 test_dataset.get_all_words())
    # all_words = train_dataset.get_all_words()
    for word in all_words:
        vocab.add(word)
    # load and initialize the embeddings
    vocab.load_pretrained_emb(paths.embedding_path)
    print('Size of PDTB vocabulary: {}'.format(vocab.size()))
    # save the vocab
    torch.save(vocab, paths.vocab_path)
Example #5
def create_vocab(data_path):
    wd_vocab = Vocab(min_count=3, bos=None, eos=None)
    lbl_vocab = Vocab(pad=None, unk=None, bos=None, eos=None)
    assert os.path.exists(data_path)
    with open(data_path, 'r', encoding='utf-8') as fin:
        loader = map(lambda x: x.strip().split('|||'), fin)
        for lbl, data_item in loader:
            wds = data_item.strip().split(' ')
            wd_vocab.add(wds)
            lbl_vocab.add(lbl.strip())
    return MultiVocab({'word': wd_vocab, 'label': lbl_vocab})
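The input format consumed above is not shown in the example. The following is a small, self-contained sketch of a line it would parse, assuming a "label ||| whitespace-tokenized text" layout (the sample content is invented):

# one data line: a label, a '|||' separator, then the tokenized text
sample_line = "positive ||| this movie was great"
lbl, data_item = sample_line.strip().split('|||')
wds = data_item.strip().split(' ')
print(lbl.strip(), wds)  # positive ['this', 'movie', 'was', 'great']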
Example #6
def create_vocab(datasets, embed_file=None, bert_vocab_path=None, min_count=2):
    wd_vocab = Vocab(min_count, bos=None, eos=None)
    char_vocab = Vocab(bos=None, eos=None)
    tag_vocab = Vocab(bos=None, eos=None)
    ner_vocab = Vocab(bos=None, eos=None)
    for insts in datasets:
        for inst in insts:
            wd_vocab.add(inst.word)
            char_vocab.add(list(inst.word))
            tag_vocab.add(inst.pos_tag)

            if inst.ner_tag != 'O':
                # including PER ORG LOC MISC and UNK
                ner_tag = inst.ner_tag.split('-')[1]
                ner_vocab.add(ner_tag)

    embed_count = wd_vocab.load_embeddings(embed_file)
    print("%d word pre-trained embeddings loaded..." % embed_count)

    bert_vocab = BERTVocab(
        bert_vocab_path) if bert_vocab_path is not None else None

    return MultiVocab(
        dict(word=wd_vocab,
             char=char_vocab,
             tag=tag_vocab,
             ner=ner_vocab,
             bert=bert_vocab))
Example #7
def create_vocab(data_path, min_count=3):
    root_rel = None
    wd_vocab = Vocab(min_count, eos=None)
    char_vocab = Vocab(min_count, eos=None)
    tag_vocab = Vocab(eos=None)
    rel_vocab = Vocab(bos=None, eos=None)
    with open(data_path, 'r', encoding='utf-8') as fr:
        for deps in read_deps(fr):
            for dep in deps:
                wd_vocab.add(dep.form)
                char_vocab.add(list(dep.form))
                tag_vocab.add(dep.pos_tag)

                if dep.head != 0:
                    rel_vocab.add(dep.dep_rel)
                elif root_rel is None:
                    root_rel = dep.dep_rel
                    rel_vocab.add(dep.dep_rel)
                elif root_rel != dep.dep_rel:
                    print('root = ' + root_rel + ', rel for root = ' +
                          dep.dep_rel)

    return MultiVocab(
        dict(word=wd_vocab, char=char_vocab, tag=tag_vocab, rel=rel_vocab))
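The root-relation consistency check above can be illustrated in isolation. This is a minimal sketch that uses a namedtuple in place of the real dependency objects returned by read_deps (the sample sentence and field values are invented):

from collections import namedtuple

Dep = namedtuple('Dep', 'form pos_tag head dep_rel')
deps = [Dep('He', 'PRP', 2, 'nsubj'),
        Dep('runs', 'VBZ', 0, 'root'),   # head == 0 marks the root token
        Dep('fast', 'RB', 2, 'advmod')]

root_rel = None
for dep in deps:
    if dep.head != 0:
        continue                          # only root tokens are checked here
    if root_rel is None:
        root_rel = dep.dep_rel            # remember the first root relation seen
    elif root_rel != dep.dep_rel:
        print('root = ' + root_rel + ', rel for root = ' + dep.dep_rel)
print(root_rel)                           # root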
Example #8
def main():
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional LSTM-CNN')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=1,
                        help='Number of sentences in each batch')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=200,
                        help='Number of hidden units in RNN')
    parser.add_argument('--num_filters',
                        type=int,
                        default=35,
                        help='Number of filters in CNN')
    parser.add_argument('--min_filter_width',
                        type=int,
                        default=3,
                        help='Minimum filter width in CNN')
    parser.add_argument('--max_filter_width',
                        type=int,
                        default=7,
                        help='Maximum filter width in CNN')
    parser.add_argument('--embedDimension',
                        type=int,
                        default=300,
                        help='embedding dimension')

    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.4,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weight for regularization')
    parser.add_argument('--schedule',
                        type=int,
                        default=1,
                        help='schedule for learning rate decay')
    parser.add_argument('--embedding_vectors', help='path for embedding dict')
    parser.add_argument('--train')
    parser.add_argument('--trainAux')
    parser.add_argument('--dev')
    parser.add_argument('--test')

    parser.add_argument('--ner_tag_field_l1',
                        type=int,
                        default=1,
                        help='ner tag field')
    parser.add_argument('--ner_tag_field_l2',
                        type=int,
                        default=1,
                        help='ner tag field')
    parser.add_argument('--use_gpu', type=int, default=0, help='use gpu')

    parser.add_argument('--save_dir')

    parser.add_argument('--vocabChar')
    parser.add_argument('--vocabOutput')
    parser.add_argument('--vocabOutputAux')
    parser.add_argument('--vocabInput')

    args = parser.parse_args()

    use_gpu = args.use_gpu

    train_path = args.train
    train_path_aux = args.trainAux
    dev_path = args.dev
    test_path = args.test

    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size

    num_filters = args.num_filters
    min_filter_width = args.min_filter_width
    max_filter_width = args.max_filter_width

    learning_rate = args.learning_rate
    momentum = 0.01 * learning_rate
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule

    save_dir = args.save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    embedding_path = args.embedding_vectors

    inputVocabulary = Vocab()
    charVocabulary = CharVocab()
    targetVocabulary = Vocab()
    targetVocabularyAux = Vocab()

    if args.vocabChar:
        with open(args.vocabChar, "r") as f:
            charVocabulary.__dict__ = json.load(f)
        charVocabulary.set_freeze()
        charVocabulary.process()

    if args.vocabOutput:
        with open(args.vocabOutput, "r") as f:
            targetVocabulary.__dict__ = json.load(f)
        targetVocabulary.set_freeze()
        targetVocabulary.process()

    if args.vocabOutputAux:
        with open(args.vocabOutputAux, "r") as f:
            targetVocabularyAux.__dict__ = json.load(f)
        targetVocabularyAux.set_freeze()
        targetVocabularyAux.process()

    embedding_vocab = None

    if args.embedding_vectors:
        print(args.embedding_vectors)
        embedd_dict, embedding_vocab, reverse_word_vocab, vocabularySize, embeddingDimension = load_embeddings(
            embedding_path)
        print("Read Word Embedding of dimension " + str(embeddingDimension) +
              " for " + str(vocabularySize) + " number of words")

        for everyWord in embedding_vocab:
            inputVocabulary.add(everyWord)
        inputVocabulary.set_freeze()
        inputVocabulary.process()
    else:
        if args.vocabInput:
            with open(args.vocabInput, "r") as f:
                inputVocabulary.__dict__ = json.load(f)
            inputVocabulary.set_freeze()
            inputVocabulary.process()
        else:
            print(
                "Neither pre-trained word embeddings nor input vocabulary is specified"
            )
            exit()

    if charVocabulary.__is_empty__():
        charVocabulary.add("<S>")
        charVocabulary.add("</S>")

    trainCorpus, trainLabelsRaw, maxTrainLength = readCoNLL(
        train_path, charVocabulary, targetVocabulary, args.ner_tag_field_l1,
        embedding_vocab)
    print("Train Corpus contains " + str(len(trainCorpus)) +
          " sentences and maximum sentence length is " + str(maxTrainLength))

    trainCorpusRawSorted = trainCorpus
    trainLabelsRawSorted = trainLabelsRaw

    trainCorpusAux, trainLabelsRawAux, maxTrainLengthAux = readCoNLL(
        train_path_aux, charVocabulary, targetVocabularyAux,
        args.ner_tag_field_l2, embedding_vocab)
    print("Auxiliary Train Corpus contains " + str(len(trainCorpusAux)) +
          " sentences and maximum sentence length is " +
          str(maxTrainLengthAux))

    trainCorpusRawSortedAux = trainCorpusAux
    trainLabelsRawSortedAux = trainLabelsRawAux

    devCorpus, devLabelsRaw, maxDevLength = readCoNLL(dev_path, charVocabulary,
                                                      targetVocabulary,
                                                      args.ner_tag_field_l1,
                                                      embedding_vocab)
    print("Dev Corpus contains " + str(len(devCorpus)) +
          " sentences and maximum sentence length is " + str(maxDevLength))

    testCorpus, testLabelsRaw, maxTestLength = readCoNLL(
        test_path, charVocabulary, targetVocabulary, args.ner_tag_field_l1,
        embedding_vocab)
    print("Test Corpus contains " + str(len(testCorpus)) +
          " sentences and maximum sentence length is " + str(maxTestLength))

    if not targetVocabulary.get_freeze():
        print(targetVocabulary._tok_to_ind)

        tmp_filename = '%s/output.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(targetVocabulary.__dict__, f)
        targetVocabulary.set_freeze()

    if not targetVocabularyAux.get_freeze():
        print(targetVocabularyAux._tok_to_ind)

        tmp_filename = '%s/output_aux.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(targetVocabularyAux.__dict__, f)
        targetVocabularyAux.set_freeze()

    if not charVocabulary.get_freeze():
        tmp_filename = '%s/char.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(charVocabulary.__dict__, f)
        charVocabulary.set_freeze()

    embeddingDimension = args.embedDimension
    word_embedding = np.random.uniform(
        -0.1, 0.1, (inputVocabulary.__len__(), embeddingDimension))
    if args.embedding_vectors:
        for everyWord in inputVocabulary._tok_to_ind:
            if everyWord in embedding_vocab:
                word_embedding[inputVocabulary.__get_word__(
                    everyWord)] = embedd_dict[embedding_vocab[everyWord]]

        tmp_filename = '%s/input.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(inputVocabulary.__dict__, f)
        inputVocabulary.set_freeze()

        del embedd_dict
        del reverse_word_vocab
        del vocabularySize
        del embedding_vocab

    print("Read " + str(targetVocabulary.__len__()) +
          " number of target words")
    print("Read " + str(targetVocabularyAux.__len__()) +
          " number of target words")
    print("Read " + str(inputVocabulary.__len__()) + " number of input words")
    print("Read " + str(charVocabulary.__len__()) + " number of characters")

    print("Number of epochs = " + str(num_epochs))
    print("Mini-Batch size = " + str(batch_size))
    print("Bi-LSTM Hidden size = " + str(hidden_size))
    print("Features per CNN filter = " + str(num_filters))
    print("Minimum ngrams for CNN filter = " + str(min_filter_width))
    print("Maximum ngrams for CNN filter = " + str(max_filter_width))
    print("Initial Learning Rate = " + str(learning_rate))

    network = BiCNNLSTMTranstion(inputVocabulary.__len__(), embeddingDimension,
                                 min_filter_width, max_filter_width,
                                 charVocabulary.__len__(),
                                 num_filters, hidden_size,
                                 targetVocabulary.__len__(), word_embedding,
                                 targetVocabularyAux.__len__())

    lr = learning_rate
    lr_aux = learning_rate * 0.1

    optim = SGD(network.parameters(),
                lr=lr,
                momentum=momentum,
                weight_decay=gamma,
                nesterov=True)

    num_batches = len(trainCorpus) // batch_size + 1

    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0

    print("Training....")

    if use_gpu == 1:
        network.cuda()

    prev_error = 100000.0

    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d ( learning rate=%.4f, decay rate=%.4f (schedule=%d)): ' %
            (epoch, lr, decay_rate, schedule))

        train_err = 0.
        train_corr = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()

        count = 0
        count_batch = 0

        l1_indices = list(range(len(trainCorpusRawSorted)))
        l2_indices = list(range(len(trainCorpusRawSortedAux)))

        with tqdm(total=(len(trainCorpusRawSorted) +
                         len(trainCorpusRawSortedAux))) as pbar:

            for l1, l2 in zip_longest(l1_indices, l2_indices):
                if l1 is not None:
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        [trainCorpusRawSorted[l1]], [trainLabelsRawSorted[l1]],
                        inputVocabulary, targetVocabulary, charVocabulary,
                        max_filter_width, args.use_gpu)

                    optim.zero_grad()
                    loss, _ = network.loss(x_input, batch_length,
                                           current_batch_size,
                                           current_max_sequence_length,
                                           y_output, mask, y_prev, 0, use_gpu)

                    loss.backward()
                    optim.step()

                    train_err += loss.item()
                    train_total += batch_length.data.sum()

                    count = count + current_batch_size
                    count_batch = count_batch + 1

                if l2 is not None:
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        [trainCorpusRawSortedAux[l2]],
                        [trainLabelsRawSortedAux[l2]], inputVocabulary,
                        targetVocabularyAux, charVocabulary, max_filter_width,
                        args.use_gpu)

                    optim.zero_grad()
                    loss, _ = network.loss(x_input, batch_length,
                                           current_batch_size,
                                           current_max_sequence_length,
                                           y_output, mask, y_prev, 1, use_gpu)

                    loss.backward()
                    optim.step()

                time_ave = (time.time() - start_time) / count
                time_left = (num_batches - count_batch) * time_ave
                pbar.update(2)

        print('train: %d loss: %.4f, time: %.2fs' %
              (num_batches, train_err / count, time.time() - start_time))

        network.eval()
        tmp_filename = '%s/_dev%d' % (save_dir, epoch)

        current_epoch_loss = 0.0

        with codecs.open(tmp_filename, "w", encoding="utf8",
                         errors="ignore") as writer:
            for inputs, labels in batch(devCorpus, devLabelsRaw, 1):
                x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                    inputs, labels, inputVocabulary, targetVocabulary,
                    charVocabulary, max_filter_width, args.use_gpu)

                loss, _ = network.loss(x_input, batch_length,
                                       current_batch_size,
                                       current_max_sequence_length, y_output,
                                       mask, y_prev, 0, use_gpu)
                current_epoch_loss = current_epoch_loss + loss.item()

                loss, preds = network.forward(x_input, batch_length,
                                              current_batch_size,
                                              current_max_sequence_length,
                                              y_output, mask, y_prev, 0,
                                              use_gpu)

                count = 0

                for i in range(len(inputs)):
                    for j in range(len(inputs[i])):
                        writer.write(inputs[i][j] + " " + labels[i][j] + " " +
                                     targetVocabulary.__get_index__(
                                         preds[i][j].item()).upper())
                        writer.write("\n")
                    writer.write("\n")

            writer.close()

        acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
        print(
            'dev loss: %.2f, dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
            % (current_epoch_loss, acc, precision, recall, f1))

        if current_epoch_loss > prev_error:
            lr = lr * 0.7
            optim = SGD(network.parameters(),
                        lr=lr,
                        momentum=momentum,
                        weight_decay=gamma,
                        nesterov=True)

            network.load_state_dict(torch.load(save_dir + "/model"))
            network.eval()

            if lr < 0.002:
                network.eval()
                tmp_filename = '%s/_test%d' % (save_dir, epoch)

                with codecs.open(tmp_filename,
                                 "w",
                                 encoding="utf8",
                                 errors="ignore") as writer:
                    for inputs, labels in batch(testCorpus, testLabelsRaw, 1):
                        x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                            inputs, labels, inputVocabulary, targetVocabulary,
                            charVocabulary, max_filter_width, args.use_gpu)

                        loss, preds = network.forward(
                            x_input, batch_length, current_batch_size,
                            current_max_sequence_length, y_output, mask,
                            y_prev, 0, use_gpu)

                        count = 0

                        for i in range(len(inputs)):
                            for j in range(len(inputs[i])):
                                writer.write(inputs[i][j] + " " +
                                             labels[i][j] + " " +
                                             targetVocabulary.__get_index__(
                                                 preds[i][j].item()).upper())
                                writer.write("\n")
                            writer.write("\n")

                    writer.close()

                acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
                print(
                    'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                    % (acc, precision, recall, f1))
                exit()
        else:
            prev_error = current_epoch_loss
            torch.save(network.state_dict(), save_dir + "/model")

            network.eval()
            tmp_filename = '%s/_test%d' % (save_dir, epoch)

            with codecs.open(tmp_filename,
                             "w",
                             encoding="utf8",
                             errors="ignore") as writer:
                for inputs, labels in batch(testCorpus, testLabelsRaw, 1):
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        inputs, labels, inputVocabulary, targetVocabulary,
                        charVocabulary, max_filter_width, args.use_gpu)

                    loss, preds = network.forward(x_input, batch_length,
                                                  current_batch_size,
                                                  current_max_sequence_length,
                                                  y_output, mask, y_prev, 0,
                                                  use_gpu)

                    count = 0

                    for i in range(len(inputs)):
                        for j in range(len(inputs[i])):
                            writer.write(inputs[i][j] + " " + labels[i][j] +
                                         " " + targetVocabulary.__get_index__(
                                             preds[i][j].item()).upper())
                            writer.write("\n")
                        writer.write("\n")

                writer.close()

            acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
            print(
                'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                % (acc, precision, recall, f1))

            torch.save(network.state_dict(), save_dir + "/model")
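The plateau handling in the training loop above (reload the best checkpoint and shrink the learning rate by 0.7 whenever the dev loss goes up, stop once the rate falls below 0.002) can be shown on its own. The sketch below is a self-contained illustration under assumed values: the tiny linear model, the fake dev losses, and the checkpoint path are all invented for demonstration.

import torch
from torch import nn
from torch.optim import SGD

model = nn.Linear(4, 2)
lr = 0.4
optim = SGD(model.parameters(), lr=lr, momentum=0.01 * lr, nesterov=True)

best_path = 'best_model.pt'
prev_error = float('inf')
for dev_loss in [3.0, 2.5, 2.7, 2.6, 2.8]:   # stand-in for real dev losses
    if dev_loss > prev_error:
        lr *= 0.7                             # decay the learning rate
        optim = SGD(model.parameters(), lr=lr, momentum=0.01 * lr,
                    nesterov=True)
        model.load_state_dict(torch.load(best_path))  # roll back to best model
        if lr < 0.002:
            break                             # training would stop here
    else:
        prev_error = dev_loss                 # new best: remember and checkpoint
        torch.save(model.state_dict(), best_path)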
Example #9
def main():

    parser = argparse.ArgumentParser(
        description='Training a Sequence Labeler with bi-directional LSTM-CNN')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=5,
                        help='Number of sentences in each batch')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=200,
                        help='Number of hidden units in RNN')
    parser.add_argument('--num_filters',
                        type=int,
                        default=35,
                        help='Number of filters in CNN')
    parser.add_argument('--min_filter_width',
                        type=int,
                        default=3,
                        help='Minimum filter width in CNN')
    parser.add_argument('--max_filter_width',
                        type=int,
                        default=7,
                        help='Maximum filter width in CNN')
    parser.add_argument('--embedDimension',
                        type=int,
                        default=300,
                        help='embedding dimension')

    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.4,
                        help='Learning rate')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weight for regularization')
    parser.add_argument('--embedding_vectors', help='path for embedding dict')
    parser.add_argument('--embedding_dict_new', help='path for embedding dict')
    parser.add_argument('--train')
    parser.add_argument('--dev')
    parser.add_argument('--test')

    parser.add_argument('--vocabChar')
    parser.add_argument('--vocabOutput')
    parser.add_argument('--vocabInput')

    parser.add_argument('--ner_tag_field',
                        type=int,
                        default=1,
                        help='ner tag field')
    parser.add_argument('--use_gpu', type=int, default=1, help='use gpu')

    parser.add_argument('--fineTune',
                        type=bool,
                        default=False,
                        help='fineTune pretrained word embeddings')

    parser.add_argument('--save-dir')
    parser.add_argument('--perform_evaluation',
                        type=bool,
                        default=False,
                        help='perform evaluation only')
    parser.add_argument('--deploy', type=bool, default=False, help='deploy')

    parser.add_argument('--train_from', type=str, default="")

    args = parser.parse_args()

    train_path = args.train
    dev_path = args.dev
    test_path = args.test

    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size

    num_filters = args.num_filters
    min_filter_width = args.min_filter_width
    max_filter_width = args.max_filter_width

    learning_rate = args.learning_rate
    momentum = 0.01 * learning_rate
    gamma = args.gamma

    embedding_path = args.embedding_vectors

    save_dir = args.save_dir

    # create the output folder if it does not exist
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    evaluation = args.perform_evaluation

    inputVocabulary = Vocab()
    charVocabulary = CharVocab()
    targetVocabulary = Vocab()

    # Read Character vocabulary if vocabChar argument is given
    if args.vocabChar:
        with open(args.vocabChar, "r") as f:
            charVocabulary.__dict__ = json.load(f)
        charVocabulary.set_freeze()
        charVocabulary.process()

    # Read Output vocabulary if the vocabOutput argument is given
    if args.vocabOutput:
        with open(args.vocabOutput, "r") as f:
            targetVocabulary.__dict__ = json.load(f)
        targetVocabulary.set_freeze()
        targetVocabulary.process()

    embedding_vocab = None

    # If the path to pre-trained embeddings is given
    if args.embedding_vectors:
        print(args.embedding_vectors)

        # load the pre-trained embeddings
        embedd_dict, embedding_vocab, reverse_word_vocab, vocabularySize, embeddingDimension = load_embeddings(
            embedding_path)
        print("Read Word Embedding of dimension " + str(embeddingDimension) +
              " for " + str(vocabularySize) + " number of words")

        # add the words to the Input vocabulary which is a dictionary of words
        for everyWord in embedding_vocab:
            inputVocabulary.add(everyWord)
        inputVocabulary.set_freeze()
        inputVocabulary.process()
    else:
        # Read Input vocabulary if the vocabInput argument is given
        if args.vocabInput:
            with open(args.vocabInput, "r") as f:
                inputVocabulary.__dict__ = json.load(f)
            inputVocabulary.set_freeze()
            inputVocabulary.process()
        else:
            print(
                "Neither pre-trained word embeddings nor input vocabulary is specified"
            )
            exit()

    # if the character vocabulary is empty, initialize it with beginning and end markers
    if charVocabulary.__is_empty__():
        charVocabulary.add("<S>")
        charVocabulary.add("</S>")

    if evaluation:
        # if we are performing evaluation, we do not require train and dev splits
        if not args.deploy:
            # if we are not deploying the model, then we are interested in testing the model
            testCorpus, testLabelsRaw, maxTestLength = readCoNLL(
                test_path, charVocabulary, targetVocabulary,
                args.ner_tag_field, inputVocabulary)
            print("Test Corpus contains " + str(len(testCorpus)) +
                  " sentences and maximum sentence length is " +
                  str(maxTestLength))
            print("Read " + str(len(charVocabulary)) + " number of characters")

        else:
            # if we are deploying the model, we are trying to get predictions on a plain corpus
            testCorpus, maxTestLength = readUnlabeledData(test_path)
            print("Test Corpus contains " + str(len(testCorpus)) +
                  " sentences and maximum sentence length is " +
                  str(maxTestLength))
    else:
        # read train split
        trainCorpus, trainLabelsRaw, maxTrainLength = readCoNLL(
            train_path, charVocabulary, targetVocabulary, args.ner_tag_field,
            embedding_vocab)
        print("Train Corpus contains " + str(len(trainCorpus)) +
              " sentences and maximum sentence length is " +
              str(maxTrainLength))

        trainCorpusRawSorted = trainCorpus
        trainLabelsRawSorted = trainLabelsRaw

        # read dev split
        devCorpus, devLabelsRaw, maxDevLength = readCoNLL(
            dev_path, charVocabulary, targetVocabulary, args.ner_tag_field,
            embedding_vocab)
        print("Dev Corpus contains " + str(len(devCorpus)) +
              " sentences and maximum sentence length is " + str(maxDevLength))

        # read test split
        testCorpus, testLabelsRaw, maxTestLength = readCoNLL(
            test_path, charVocabulary, targetVocabulary, args.ner_tag_field,
            embedding_vocab)
        print("Test Corpus contains " + str(len(testCorpus)) +
              " sentences and maximum sentence length is " +
              str(maxTestLength))

    if not targetVocabulary.get_freeze():
        # save the output vocabulary
        print(targetVocabulary._tok_to_ind)

        tmp_filename = '%s/output.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(targetVocabulary.__dict__, f)
        targetVocabulary.set_freeze()

    if not charVocabulary.get_freeze():
        # save the character vocabulary
        tmp_filename = '%s/char.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(charVocabulary.__dict__, f)
        charVocabulary.set_freeze()

    # initialize word embeddings randomly
    embeddingDimension = args.embedDimension
    word_embedding = np.random.uniform(
        -0.1, 0.1, (inputVocabulary.__len__(), embeddingDimension))

    if args.embedding_vectors:
        # pre-trained word embeddings are given, update the word_embedding variable with pre-trained embeddings
        for everyWord in inputVocabulary._tok_to_ind:
            if everyWord in embedding_vocab:
                word_embedding[inputVocabulary.__get_word__(
                    everyWord)] = embedd_dict[embedding_vocab[everyWord]]

        # save the input vocabulary
        tmp_filename = '%s/input.vocab' % (save_dir)
        with open(tmp_filename, "w") as f:
            json.dump(inputVocabulary.__dict__, f)
        inputVocabulary.set_freeze()

        del embedd_dict
        del reverse_word_vocab
        del vocabularySize
        del embedding_vocab

    print("Read " + str(targetVocabulary.__len__()) +
          " number of target words")
    print("Read " + str(inputVocabulary.__len__()) + " number of input words")
    print("Read " + str(charVocabulary.__len__()) + " number of characters")

    print("Number of epochs = " + str(num_epochs))
    print("Mini-Batch size = " + str(batch_size))
    print("Bi-LSTM Hidden size = " + str(hidden_size))
    print("Features per CNN filter = " + str(num_filters))
    print("Minimum ngrams for CNN filter = " + str(min_filter_width))
    print("Maximum ngrams for CNN filter = " + str(max_filter_width))
    print("Initial Learning Rate = " + str(learning_rate))

    use_gpu = args.use_gpu

    # create a Bi-LSTM network
    network = BiCNNLSTMTranstion(inputVocabulary.__len__(), embeddingDimension,
                                 min_filter_width, max_filter_width,
                                 charVocabulary.__len__(),
                                 num_filters, hidden_size,
                                 targetVocabulary.__len__(), word_embedding,
                                 args.fineTune)
    print(network)

    lr = learning_rate

    optim = SGD(network.parameters(),
                lr=lr,
                momentum=momentum,
                weight_decay=gamma,
                nesterov=True)

    if not evaluation:
        num_batches = len(trainCorpus) // batch_size + 1

    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0

    if evaluation:
        # if we are performing evaluation, load the trained model
        network.load_state_dict(torch.load(save_dir + "/model"))

        # save output vocabulary as a plain file
        tmp_filename = '%s/output.vocab.plain' % (save_dir)
        with open(tmp_filename, "w") as f:
            for index in range(len(targetVocabulary._ind_to_tok)):
                f.write(targetVocabulary._ind_to_tok[index])
                f.write("\n")
            f.close()

        # save input vocabulary as a plain file
        tmp_filename = '%s/input.vocab.plain' % (save_dir)
        with open(tmp_filename, "w") as f:
            for index in range(len(inputVocabulary._ind_to_tok)):
                f.write(inputVocabulary._ind_to_tok[index])
                f.write("\n")
            f.close()

        # save character vocabulary as a plain file
        tmp_filename = '%s/char.vocab.plain' % (save_dir)
        with open(tmp_filename, "w") as f:
            for index in range(len(charVocabulary._ind_to_tok)):
                f.write(charVocabulary._ind_to_tok[index])
                f.write("\n")
            f.close()

        print("Performing Evaluation")
        if args.use_gpu == 0:
            network.cpu()

        network.eval()
        tmp_filename = '%s/_test_new' % (save_dir)

        if args.use_gpu == 1:
            print("Using GPU....")
            network.cuda()

        with codecs.open(tmp_filename, "w", encoding="utf8",
                         errors="ignore") as writer:
            for inputs, labels in batch(testCorpus, testLabelsRaw, 1):
                # for every sentence in the test data, convert the input to tensor
                x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                    inputs, labels, inputVocabulary, targetVocabulary,
                    charVocabulary, max_filter_width, args.use_gpu)

                # get predictions by calling the forward() function of the model
                loss, preds, probs = network.forward(
                    x_input, batch_length, current_batch_size,
                    current_max_sequence_length, y_output, mask, y_prev,
                    args.use_gpu)

                count = 0

                # get the labels and write the output to the file
                for i in range(len(inputs)):
                    for j in range(len(inputs[i])):
                        writer.write(inputs[i][j])

                        for k in range(probs[i][j].size()[0]):
                            writer.write(" " + str(probs[i][j][k].item()))
                        writer.write(" " + inputs[i][j] + " " + labels[i][j] +
                                     " " + targetVocabulary.__get_index__(
                                         preds[i][j].item()).upper())
                        writer.write("\n")
                    writer.write("\n")

            writer.close()

        # Calculate the F-Score on the predicted output
        acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
        print(
            'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' %
            (acc, precision, recall, f1))
    else:
        if args.use_gpu == 1:
            print("Using GPU....")
            network.cuda()

        if args.train_from:
            print("Loading pre-trained model from " + args.train_from)
            network.load_state_dict(torch.load(args.train_from))

        print("Training....")
        prev_error = 1000.0

        network.eval()
        tmp_filename = '%s/_dev' % (save_dir)
        current_epoch_loss = 0.0

        with codecs.open(tmp_filename, "w", encoding="utf8",
                         errors="ignore") as writer:
            for inputs, labels in batch(devCorpus, devLabelsRaw, 1):
                # for every sentence in the dev data, convert the input to tensor
                x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                    inputs, labels, inputVocabulary, targetVocabulary,
                    charVocabulary, max_filter_width, args.use_gpu)

                # get the loss by calling the forward() function of the model
                loss, _ = network.loss(x_input, batch_length,
                                       current_batch_size,
                                       current_max_sequence_length, y_output,
                                       mask, y_prev, args.use_gpu)
                current_epoch_loss = current_epoch_loss + loss.item()

                # get the predictions by calling the forward() function of the model
                loss, preds, probs = network.forward(
                    x_input, batch_length, current_batch_size,
                    current_max_sequence_length, y_output, mask, y_prev,
                    args.use_gpu)

                count = 0

                # get the labels and write the output to the file
                for i in range(len(inputs)):
                    for j in range(len(inputs[i])):
                        writer.write(inputs[i][j] + " " + labels[i][j] + " " +
                                     targetVocabulary.__get_index__(
                                         preds[i][j].item()).upper())
                        writer.write("\n")
                    writer.write("\n")

            writer.close()

        # Calculate the F-Score on the predicted output
        acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
        print(
            'dev loss: %.2f, dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
            % (current_epoch_loss, acc, precision, recall, f1))

        for epoch in range(1, num_epochs + 1):
            print('Epoch %d ( learning rate=%.4f ): ' % (epoch, lr))

            train_err = 0.
            train_corr = 0.
            train_total = 0.

            start_time = time.time()
            num_back = 0
            network.train()

            count = 0
            count_batch = 0

            with tqdm(total=(len(trainCorpusRawSorted))) as pbar:
                for inputs, labels in batch(trainCorpusRawSorted,
                                            trainLabelsRawSorted, batch_size):
                    # for every sentence in the training data, convert the input to tensor
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        inputs, labels, inputVocabulary, targetVocabulary,
                        charVocabulary, max_filter_width, args.use_gpu)

                    optim.zero_grad()

                    # get the loss by calling the forward() function of the model
                    loss, _ = network.loss(x_input, batch_length,
                                           current_batch_size,
                                           current_max_sequence_length,
                                           y_output, mask, y_prev,
                                           args.use_gpu)

                    # calculate gradients with backward() and call optim.step() to apply the gradient update
                    loss.backward()
                    optim.step()

                    train_err += loss.item()
                    train_total += batch_length.data.sum()

                    count = count + current_batch_size
                    count_batch = count_batch + 1

                    time_ave = (time.time() - start_time) / count
                    time_left = (num_batches - count_batch) * time_ave
                    pbar.update(1)

            print('train: %d loss: %.4f, time: %.2fs' %
                  (num_batches, train_err / count, time.time() - start_time))

            network.eval()
            tmp_filename = '%s/_dev%d' % (save_dir, epoch)
            current_epoch_loss = 0.0

            with codecs.open(tmp_filename,
                             "w",
                             encoding="utf8",
                             errors="ignore") as writer:
                for inputs, labels in batch(devCorpus, devLabelsRaw, 1):
                    # for every sentence in the dev data, convert the input to tensor
                    x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                        inputs, labels, inputVocabulary, targetVocabulary,
                        charVocabulary, max_filter_width, args.use_gpu)

                    # get the loss by calling the forward() function of the model
                    loss, _ = network.loss(x_input, batch_length,
                                           current_batch_size,
                                           current_max_sequence_length,
                                           y_output, mask, y_prev,
                                           args.use_gpu)
                    current_epoch_loss = current_epoch_loss + loss.item()

                    # get the predictions by calling the forward() function of the model
                    loss, preds, probs = network.forward(
                        x_input, batch_length, current_batch_size,
                        current_max_sequence_length, y_output, mask, y_prev,
                        args.use_gpu)

                    count = 0

                    # get the labels and write the output to the file
                    for i in range(len(inputs)):
                        for j in range(len(inputs[i])):
                            writer.write(inputs[i][j] + " " + labels[i][j] +
                                         " " + targetVocabulary.__get_index__(
                                             preds[i][j].item()).upper())
                            writer.write("\n")
                        writer.write("\n")

                writer.close()

            # Calculate the F-Score on the predicted output
            acc, precision, recall, f1 = evaluate(tmp_filename, save_dir)
            print(
                'dev loss: %.2f, dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                % (current_epoch_loss, acc, precision, recall, f1))

            if epoch > 1:
                if current_epoch_loss > prev_error:
                    # if the validation loss has increased, load the previous epoch model and reduce the learning rate
                    lr = lr * 0.7
                    optim = SGD(network.parameters(),
                                lr=lr,
                                momentum=momentum,
                                weight_decay=gamma,
                                nesterov=True)

                    network.load_state_dict(torch.load(save_dir + "/model"))
                    network.eval()

                    if lr < 0.002:
                        # if the learning rate is less than 0.002, stop the training
                        network.eval()
                        tmp_filename = '%s/_test%d' % (save_dir, epoch)

                        with codecs.open(tmp_filename,
                                         "w",
                                         encoding="utf8",
                                         errors="ignore") as writer:
                            for inputs, labels in batch(
                                    testCorpus, testLabelsRaw, 1):
                                # for every sentence in the test data, convert the input to tensor
                                x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                                    inputs, labels, inputVocabulary,
                                    targetVocabulary, charVocabulary,
                                    max_filter_width, args.use_gpu)

                                # get the predictions by calling the forward() function of the model
                                loss, preds, probs = network.forward(
                                    x_input, batch_length, current_batch_size,
                                    current_max_sequence_length, y_output,
                                    mask, y_prev, args.use_gpu)

                                count = 0

                                # get the labels and write the output to the file
                                for i in range(len(inputs)):
                                    for j in range(len(inputs[i])):
                                        writer.write(inputs[i][j])

                                        for k in range(probs[i][j].size()[0]):
                                            writer.write(
                                                " " +
                                                str(probs[i][j][k].item()))
                                        writer.write(
                                            " " + inputs[i][j] + " " +
                                            labels[i][j] + " " +
                                            targetVocabulary.__get_index__(
                                                preds[i][j].item()).upper())
                                        writer.write("\n")
                                    writer.write("\n")

                            writer.close()

                        # Calculate the F-Score on the predicted output
                        acc, precision, recall, f1 = evaluate(
                            tmp_filename, save_dir)
                        print(
                            'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                            % (acc, precision, recall, f1))

                        exit()
                else:
                    prev_error = current_epoch_loss
                    torch.save(network.state_dict(), save_dir + "/model")

                    network.eval()
                    tmp_filename = '%s/_test%d' % (save_dir, epoch)

                    with codecs.open(tmp_filename,
                                     "w",
                                     encoding="utf8",
                                     errors="ignore") as writer:
                        for inputs, labels in batch(testCorpus, testLabelsRaw,
                                                    1):
                            # for every sentence in the test data, convert the input to tensor
                            x_input, batch_length, current_batch_size, current_max_sequence_length, y_output, mask, y_prev = constructBatch(
                                inputs, labels, inputVocabulary,
                                targetVocabulary, charVocabulary,
                                max_filter_width, args.use_gpu)

                            # get the predictions by calling the forward() function of the model
                            loss, preds, probs = network.forward(
                                x_input, batch_length, current_batch_size,
                                current_max_sequence_length, y_output, mask,
                                y_prev, args.use_gpu)

                            count = 0

                            # get the labels and write the output to the file
                            for i in range(len(inputs)):
                                for j in range(len(inputs[i])):
                                    writer.write(
                                        inputs[i][j] + " " + labels[i][j] +
                                        " " + targetVocabulary.__get_index__(
                                            preds[i][j].item()).upper())
                                    writer.write("\n")
                                writer.write("\n")

                        writer.close()

                    # Calculate the F-Score on the predicted output
                    acc, precision, recall, f1 = evaluate(
                        tmp_filename, save_dir)
                    print(
                        'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                        % (acc, precision, recall, f1))
            else:
                prev_error = current_epoch_loss
                torch.save(network.state_dict(), save_dir + "/model")
Example #10
def create_vocab(all_data, min_count=3):
    wd_vocab = Vocab(min_count=min_count, bos=None, eos=None)
    for task_data in all_data:
        for inst in task_data:
            wd_vocab.add(inst.data)
    return wd_vocab