Exemple #1
0
def train(data):
    print("Training model...")
    model = SeqModel(data)
    if data.use_cuda:
        model.cuda()
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    if data.optimizer == "SGD":
        optimizer = optim.SGD(parameters,
                              lr=data.lr,
                              momentum=data.momentum,
                              weight_decay=data.weight_decay)
    elif data.optimizer == "Adagrad":
        optimizer = optim.Adagrad(parameters,
                                  lr=data.lr,
                                  weight_decay=data.weight_decay)
    elif data.optimizer == "Adam":
        optimizer = optim.Adam(parameters,
                               lr=data.lr,
                               weight_decay=data.weight_decay)
    else:
        print("Optimizer Error: {} optimizer is not support.".format(
            data.optimizer))
    for idx in range(data.iter):
        epoch_start = temp_start = time.time()
        train_num = len(data.train_text)
        if train_num % data.batch_size == 0:
            batch_block = train_num // data.batch_size
        else:
            batch_block = train_num // data.batch_size + 1
        correct = total = total_loss = 0
        random.shuffle(data.train_idx)
        for block in range(batch_block):
            left = block * data.batch_size
            right = left + data.batch_size
            if right > train_num:
                right = train_num
            instance = data.train_idx[left:right]
            batch_word, batch_word_len, word_recover, batch_char, batch_char_len, char_recover, batch_label, mask = generate_batch(
                instance, data.use_cuda)
            loss, seq = model.forward(batch_word, batch_word_len, batch_char,
                                      batch_char_len, char_recover,
                                      batch_label, mask)
            right_token, total_token = predict_check(seq, batch_label, mask)
            correct += right_token
            total += total_token
            total_loss += loss.data[0]
            loss.backward()
            optimizer.step()
            model.zero_grad()
        epoch_end = time.time()
        print("Epoch:{}. Time:{}. Loss:{}. acc:{}".format(
            idx, epoch_end - epoch_start, total_loss, correct / total))
        evaluate(data, model, "dev", idx)
        evaluate(data, model, "test", idx)
        print("Finish.")
Exemple #2
0
def train(data, opt, fold_idx):

    model = SeqModel(data, opt)

    optimizer = optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.l2)

    if opt.tune_wordemb == False:
        my_utils.freeze_net(model.word_hidden.wordrep.word_embedding)

    best_dev_f = -10
    best_dev_p = -10
    best_dev_r = -10

    bad_counter = 0

    for idx in range(opt.iter):
        epoch_start = time.time()

        if opt.elmo:
            my_utils.shuffle(data.train_texts, data.train_Ids)
        else:
            random.shuffle(data.train_Ids)

        model.train()
        model.zero_grad()
        batch_size = opt.batch_size
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1

        for batch_id in range(total_batch):

            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if opt.elmo:
                instance_text = data.train_texts[start:end]
            else:
                instance_text = None
            if not instance:
                continue

            batch_word, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask, batch_features, batch_text = batchify_with_label(
                data, instance, instance_text, opt.gpu)

            loss, tag_seq = model.neg_log_likelihood_loss(
                batch_word, batch_wordlen, batch_char, batch_charlen,
                batch_charrecover, batch_label, mask, batch_features,
                batch_text)

            loss.backward()

            if opt.gradient_clip > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               opt.gradient_clip)
            optimizer.step()
            model.zero_grad()

        epoch_finish = time.time()
        logging.info("epoch: %s training finished. Time: %.2fs" %
                     (idx, epoch_finish - epoch_start))

        if opt.dev_file:
            _, _, p, r, f, _, _ = evaluate(data, opt, model, "dev", True)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            f = best_dev_f

        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f" %
                         (best_dev_f))

            if fold_idx is None:
                torch.save(model.state_dict(),
                           os.path.join(opt.output, "model.pkl"))
            else:
                torch.save(
                    model.state_dict(),
                    os.path.join(opt.output,
                                 "model_{}.pkl".format(fold_idx + 1)))

            best_dev_f = f
            best_dev_p = p
            best_dev_r = r

            # if opt.test_file:
            #     _, _, p, r, f, _, _ = evaluate(data, opt, model, "test", True, opt.nbest)
            #     logging.info("Test: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))

            bad_counter = 0
        else:
            bad_counter += 1

        if len(opt.dev_file) != 0 and bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")

    if len(opt.dev_file) == 0:
        torch.save(model.state_dict(), os.path.join(opt.output, "model.pkl"))

    return best_dev_p, best_dev_r, best_dev_f
Exemple #3
0
def test(data, opt):

    corpus_dir = opt.test_file

    if opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
    else:
        raise RuntimeError("invalid nlp tool")

    corpus_files = [f for f in os.listdir(corpus_dir) if f.find('.xml') != -1]

    model = SeqModel(data, opt)
    if opt.test_in_cpu:
        model.load_state_dict(
            torch.load(os.path.join(opt.output, 'model.pkl'), map_location='cpu'))
    else:
        model.load_state_dict(torch.load(os.path.join(opt.output, 'model.pkl')))

    meddra_dict = load_meddra_dict(data)

    # initialize norm models
    if opt.norm_rule and opt.norm_vsm and opt.norm_neural: # ensemble
        logging.info("use ensemble normer")
        multi_sieve.init(opt, None, data, meddra_dict, None, True)
        if opt.ensemble == 'learn':
            if opt.test_in_cpu:
                ensemble_model = torch.load(os.path.join(opt.output, 'ensemble.pkl'), map_location='cpu')
            else:
                ensemble_model = torch.load(os.path.join(opt.output, 'ensemble.pkl'))
            ensemble_model.eval()
        else:
            if opt.test_in_cpu:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'), map_location='cpu')
                neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
            else:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
                neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))

            vsm_model.eval()
            neural_model.eval()

    elif opt.norm_rule:
        logging.info("use rule-based normer")
        multi_sieve.init(opt, None, data, meddra_dict)

    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'), map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()

    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
        else:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    else:
        logging.info("no normalization is performed.")


    makedir_and_clear(opt.predict)

    ct_success = 0
    ct_error = 0

    for fileName in corpus_files:
        try:
            start = time.time()
            document, annotation_file = processOneFile_fda(fileName, corpus_dir, nlp_tool, False, opt.types, opt.type_filter, True, False)
            pred_entities = []

            for section in document:

                data.test_texts = []
                data.test_Ids = []
                read_instance_from_one_document(section, data.word_alphabet, data.char_alphabet, data.label_alphabet,
                                                data.test_texts, data.test_Ids, data)

                _, _, _, _, _, pred_results, _ = evaluate(data, opt, model, 'test', False, opt.nbest)

                entities = translateResultsintoEntities(section.sentences, pred_results)

                # remove the entity in the ignore_region and fill section_id
                section_id = section.name[section.name.rfind('_')+1: ]
                entities = remove_entity_in_the_ignore_region(annotation_file.ignore_regions, entities, section_id)


                if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
                    if opt.ensemble == 'learn':
                        ensemble_model.process_one_doc(section, entities, meddra_dict, None, True)
                    else:
                        pred_entities1 = copy.deepcopy(entities)
                        pred_entities2 = copy.deepcopy(entities)
                        pred_entities3 = copy.deepcopy(entities)
                        multi_sieve.runMultiPassSieve(section, pred_entities1, meddra_dict, True)
                        vsm_model.process_one_doc(section, pred_entities2, meddra_dict, None, True)
                        neural_model.process_one_doc(section, pred_entities3, meddra_dict, None, True)

                        # merge pred_entities1, pred_entities2, pred_entities3 into entities
                        ensemble.merge_result(pred_entities1, pred_entities2, pred_entities3, entities, meddra_dict, True, vsm_model.dict_alphabet, data)

                elif opt.norm_rule:
                    multi_sieve.runMultiPassSieve(section, entities, meddra_dict, True)
                elif opt.norm_vsm:
                    vsm_model.process_one_doc(section, entities, meddra_dict, None, True)
                elif opt.norm_neural:
                    neural_model.process_one_doc(section, entities, meddra_dict, None, True)


                for entity in entities:
                    if len(entity.norm_ids)!=0: # if a mention can't be normed, not output it
                        pred_entities.append(entity)


            dump_results(fileName, pred_entities, opt, annotation_file)

            end = time.time()
            logging.info("process %s complete with %.2fs" % (fileName, end - start))

            ct_success += 1
        except Exception as e:
            logging.error("process file {} error: {}".format(fileName, e))
            ct_error += 1

    if opt.norm_rule:
        multi_sieve.finalize(True)

    logging.info("test finished, total {}, error {}".format(ct_success + ct_error, ct_error))
Exemple #4
0
def predict(opt, data):

    seq_model = SeqModel(data)
    if opt.test_in_cpu:
        seq_model.load_state_dict(torch.load(os.path.join(opt.output, 'ner_model.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        seq_model.load_state_dict(torch.load(os.path.join(opt.output, 'ner_model.pkl'), map_location={cuda_src:cuda_dst}))


    seq_wordseq = WordSequence(data, False, True, True, True)
    if opt.test_in_cpu:
        seq_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 'ner_wordseq.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        seq_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 'ner_wordseq.pkl'), map_location={cuda_src:cuda_dst}))

    classify_model = ClassifyModel(data)
    if opt.test_in_cpu:
        classify_model.load_state_dict(torch.load(os.path.join(opt.output, 're_model.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        classify_model.load_state_dict(torch.load(os.path.join(opt.output, 're_model.pkl'), map_location={cuda_src:cuda_dst}))

    classify_wordseq = WordSequence(data, True, False, True, False)
    if opt.test_in_cpu:
        classify_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 're_wordseq.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        classify_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 're_wordseq.pkl'), map_location={cuda_src:cuda_dst}))

    input_files = [f for f in listdir(opt.input) if isfile(join(opt.input,f)) and f[0]!='.']


    # for idx in tqdm(range(len(input_files))):
    for idx in range(len(input_files)):

        start = time.time()
        fileName = join(opt.input,input_files[idx])
        doc_name = input_files[idx]

        doc_token = processOneFile(fileName)

        doc = generateDataForOneFile(doc_token)

        raw_texts, raw_Ids = read_instance(doc, data.word_alphabet, data.char_alphabet,
                                                                   data.feature_alphabets, data.label_alphabet,
                                                                   data.number_normalized,
                                                                   data.MAX_SENTENCE_LENGTH)

        decode_results = evaluateWhenTest(data, seq_wordseq, seq_model, raw_Ids)


        entities = ner.translateNCRFPPintoEntities(doc_token, decode_results, doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start, entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text


        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(doc_token, entities, doc_name, data)

        relations = relation_extraction.evaluateWhenTest(classify_wordseq, classify_model, test_X, data, test_other, data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type

            node1 = bioc.BioCNode(relation.node1.id, 'argument 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'argument 2')
            bioc_relation.add_node(node2)


        with open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)

        end = time.time()
        logging.info("process %s complete with %.2fs" % (input_files[idx], end-start))



    logging.info("test finished")
def joint_train(data, old_data, opt):

    if not os.path.exists(opt.output):
        os.makedirs(opt.output)

    if opt.pretrained_model_dir != 'None':
        seq_model = SeqModel(data)
        if opt.test_in_cpu:
            seq_model.load_state_dict(
                torch.load(os.path.join(opt.pretrained_model_dir,
                                        'ner_model.pkl'),
                           map_location='cpu'))
        else:
            seq_model.load_state_dict(
                torch.load(
                    os.path.join(opt.pretrained_model_dir, 'ner_model.pkl')))

        if (data.label_alphabet_size != seq_model.crf.tagset_size)\
                or (data.HP_hidden_dim != seq_model.hidden2tag.weight.size(1)):
            raise RuntimeError("ner_model not compatible")

        seq_wordseq = WordSequence(data, False, True, True, True)

        if ((data.word_emb_dim != seq_wordseq.wordrep.word_embedding.embedding_dim)\
            or (data.char_emb_dim != seq_wordseq.wordrep.char_feature.char_embeddings.embedding_dim)\
            or (data.feature_emb_dims[0] != seq_wordseq.wordrep.feature_embedding_dims[0])\
            or (data.feature_emb_dims[1] != seq_wordseq.wordrep.feature_embedding_dims[1])):
            raise RuntimeError("ner_wordseq not compatible")

        old_seq_wordseq = WordSequence(old_data, False, True, True, True)
        if opt.test_in_cpu:
            old_seq_wordseq.load_state_dict(
                torch.load(os.path.join(opt.pretrained_model_dir,
                                        'ner_wordseq.pkl'),
                           map_location='cpu'))
        else:
            old_seq_wordseq.load_state_dict(
                torch.load(
                    os.path.join(opt.pretrained_model_dir, 'ner_wordseq.pkl')))

        # sd = old_seq_wordseq.lstm.state_dict()

        for old, new in zip(old_seq_wordseq.lstm.parameters(),
                            seq_wordseq.lstm.parameters()):
            new.data.copy_(old)

        vocab_size = old_seq_wordseq.wordrep.word_embedding.num_embeddings
        seq_wordseq.wordrep.word_embedding.weight.data[
            0:
            vocab_size, :] = old_seq_wordseq.wordrep.word_embedding.weight.data[
                0:vocab_size, :]

        vocab_size = old_seq_wordseq.wordrep.char_feature.char_embeddings.num_embeddings
        seq_wordseq.wordrep.char_feature.char_embeddings.weight.data[
            0:
            vocab_size, :] = old_seq_wordseq.wordrep.char_feature.char_embeddings.weight.data[
                0:vocab_size, :]

        for i, feature_embedding in enumerate(
                old_seq_wordseq.wordrep.feature_embeddings):
            vocab_size = feature_embedding.num_embeddings
            seq_wordseq.wordrep.feature_embeddings[i].weight.data[
                0:vocab_size, :] = feature_embedding.weight.data[
                    0:vocab_size, :]

        # for word in data.word_alphabet.iteritems():
        #
        #     old_seq_wordseq.wordrep.word_embedding.weight.data data.word_alphabet.get_index(word)
        #

        classify_wordseq = WordSequence(data, True, False, True, False)

        if ((data.word_emb_dim != classify_wordseq.wordrep.word_embedding.embedding_dim)\
            or (data.re_feature_emb_dims[data.re_feature_name2id['[POSITION]']] != classify_wordseq.wordrep.position1_emb.embedding_dim)\
            or (data.feature_emb_dims[1] != classify_wordseq.wordrep.feature_embedding_dims[0])):
            raise RuntimeError("re_wordseq not compatible")

        old_classify_wordseq = WordSequence(old_data, True, False, True, False)
        if opt.test_in_cpu:
            old_classify_wordseq.load_state_dict(
                torch.load(os.path.join(opt.pretrained_model_dir,
                                        're_wordseq.pkl'),
                           map_location='cpu'))
        else:
            old_classify_wordseq.load_state_dict(
                torch.load(
                    os.path.join(opt.pretrained_model_dir, 're_wordseq.pkl')))

        for old, new in zip(old_classify_wordseq.lstm.parameters(),
                            classify_wordseq.lstm.parameters()):
            new.data.copy_(old)

        vocab_size = old_classify_wordseq.wordrep.word_embedding.num_embeddings
        classify_wordseq.wordrep.word_embedding.weight.data[
            0:
            vocab_size, :] = old_classify_wordseq.wordrep.word_embedding.weight.data[
                0:vocab_size, :]

        vocab_size = old_classify_wordseq.wordrep.position1_emb.num_embeddings
        classify_wordseq.wordrep.position1_emb.weight.data[
            0:
            vocab_size, :] = old_classify_wordseq.wordrep.position1_emb.weight.data[
                0:vocab_size, :]

        vocab_size = old_classify_wordseq.wordrep.position2_emb.num_embeddings
        classify_wordseq.wordrep.position2_emb.weight.data[
            0:
            vocab_size, :] = old_classify_wordseq.wordrep.position2_emb.weight.data[
                0:vocab_size, :]

        vocab_size = old_classify_wordseq.wordrep.feature_embeddings[
            0].num_embeddings
        classify_wordseq.wordrep.feature_embeddings[0].weight.data[
            0:vocab_size, :] = old_classify_wordseq.wordrep.feature_embeddings[
                0].weight.data[0:vocab_size, :]

        classify_model = ClassifyModel(data)

        old_classify_model = ClassifyModel(old_data)
        if opt.test_in_cpu:
            old_classify_model.load_state_dict(
                torch.load(os.path.join(opt.pretrained_model_dir,
                                        're_model.pkl'),
                           map_location='cpu'))
        else:
            old_classify_model.load_state_dict(
                torch.load(
                    os.path.join(opt.pretrained_model_dir, 're_model.pkl')))

        if (data.re_feature_alphabet_sizes[data.re_feature_name2id['[RELATION]']] != old_classify_model.linear.weight.size(0)
            or (data.re_feature_emb_dims[data.re_feature_name2id['[ENTITY_TYPE]']] != old_classify_model.entity_type_emb.embedding_dim) \
                or (data.re_feature_emb_dims[
                        data.re_feature_name2id['[ENTITY]']] != old_classify_model.entity_emb.embedding_dim)
                or (data.re_feature_emb_dims[
                        data.re_feature_name2id['[TOKEN_NUM]']] != old_classify_model.tok_num_betw_emb.embedding_dim) \
                or (data.re_feature_emb_dims[
                        data.re_feature_name2id['[ENTITY_NUM]']] != old_classify_model.et_num_emb.embedding_dim) \
                ):
            raise RuntimeError("re_model not compatible")

        vocab_size = old_classify_model.entity_type_emb.num_embeddings
        classify_model.entity_type_emb.weight.data[
            0:vocab_size, :] = old_classify_model.entity_type_emb.weight.data[
                0:vocab_size, :]

        vocab_size = old_classify_model.entity_emb.num_embeddings
        classify_model.entity_emb.weight.data[
            0:vocab_size, :] = old_classify_model.entity_emb.weight.data[
                0:vocab_size, :]

        vocab_size = old_classify_model.tok_num_betw_emb.num_embeddings
        classify_model.tok_num_betw_emb.weight.data[
            0:vocab_size, :] = old_classify_model.tok_num_betw_emb.weight.data[
                0:vocab_size, :]

        vocab_size = old_classify_model.et_num_emb.num_embeddings
        classify_model.et_num_emb.weight.data[
            0:vocab_size, :] = old_classify_model.et_num_emb.weight.data[
                0:vocab_size, :]

    else:
        seq_model = SeqModel(data)
        seq_wordseq = WordSequence(data, False, True, True, True)

        classify_wordseq = WordSequence(data, True, False, True, False)
        classify_model = ClassifyModel(data)

    iter_parameter = itertools.chain(
        *map(list, [seq_wordseq.parameters(),
                    seq_model.parameters()]))
    seq_optimizer = optim.Adam(iter_parameter,
                               lr=data.HP_lr,
                               weight_decay=data.HP_l2)
    iter_parameter = itertools.chain(*map(
        list, [classify_wordseq.parameters(),
               classify_model.parameters()]))
    classify_optimizer = optim.Adam(iter_parameter,
                                    lr=data.HP_lr,
                                    weight_decay=data.HP_l2)

    if data.tune_wordemb == False:
        my_utils.freeze_net(seq_wordseq.wordrep.word_embedding)
        my_utils.freeze_net(classify_wordseq.wordrep.word_embedding)

    re_X_positive = []
    re_Y_positive = []
    re_X_negative = []
    re_Y_negative = []
    relation_vocab = data.re_feature_alphabets[
        data.re_feature_name2id['[RELATION]']]
    my_collate = my_utils.sorted_collate1
    for i in range(len(data.re_train_X)):
        x = data.re_train_X[i]
        y = data.re_train_Y[i]

        if y != relation_vocab.get_index("</unk>"):
            re_X_positive.append(x)
            re_Y_positive.append(y)
        else:
            re_X_negative.append(x)
            re_Y_negative.append(y)

    re_dev_loader = DataLoader(my_utils.RelationDataset(
        data.re_dev_X, data.re_dev_Y),
                               data.HP_batch_size,
                               shuffle=False,
                               collate_fn=my_collate)
    # re_test_loader = DataLoader(my_utils.RelationDataset(data.re_test_X, data.re_test_Y), data.HP_batch_size, shuffle=False, collate_fn=my_collate)

    best_ner_score = -1
    best_re_score = -1
    count_performance_not_grow = 0

    for idx in range(data.HP_iteration):
        epoch_start = time.time()

        seq_wordseq.train()
        seq_wordseq.zero_grad()
        seq_model.train()
        seq_model.zero_grad()

        classify_wordseq.train()
        classify_wordseq.zero_grad()
        classify_model.train()
        classify_model.zero_grad()

        batch_size = data.HP_batch_size

        random.shuffle(data.train_Ids)
        ner_train_num = len(data.train_Ids)
        ner_total_batch = ner_train_num // batch_size + 1

        re_train_loader, re_train_iter = makeRelationDataset(
            re_X_positive, re_Y_positive, re_X_negative, re_Y_negative,
            data.unk_ratio, True, my_collate, data.HP_batch_size)
        re_total_batch = len(re_train_loader)

        total_batch = max(ner_total_batch, re_total_batch)
        min_batch = min(ner_total_batch, re_total_batch)

        for batch_id in range(total_batch):

            if batch_id < ner_total_batch:
                start = batch_id * batch_size
                end = (batch_id + 1) * batch_size
                if end > ner_train_num:
                    end = ner_train_num
                instance = data.train_Ids[start:end]
                batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask, \
                    batch_permute_label = batchify_with_label(instance, data.HP_gpu)

                hidden = seq_wordseq.forward(batch_word, batch_features,
                                             batch_wordlen, batch_char,
                                             batch_charlen, batch_charrecover,
                                             None, None)
                hidden_adv = None
                loss, tag_seq = seq_model.neg_log_likelihood_loss(
                    hidden, hidden_adv, batch_label, mask)
                loss.backward()
                seq_optimizer.step()
                seq_wordseq.zero_grad()
                seq_model.zero_grad()

            if batch_id < re_total_batch:
                [batch_word, batch_features, batch_wordlen, batch_wordrecover, \
                 batch_char, batch_charlen, batch_charrecover, \
                 position1_seq_tensor, position2_seq_tensor, e1_token, e1_length, e2_token, e2_length, e1_type, e2_type, \
                 tok_num_betw, et_num], [targets, targets_permute] = my_utils.endless_get_next_batch_without_rebatch1(
                    re_train_loader, re_train_iter)

                hidden = classify_wordseq.forward(batch_word, batch_features,
                                                  batch_wordlen, batch_char,
                                                  batch_charlen,
                                                  batch_charrecover,
                                                  position1_seq_tensor,
                                                  position2_seq_tensor)
                hidden_adv = None
                loss, pred = classify_model.neg_log_likelihood_loss(
                    hidden, hidden_adv, batch_wordlen, e1_token, e1_length,
                    e2_token, e2_length, e1_type, e2_type, tok_num_betw,
                    et_num, targets)
                loss.backward()
                classify_optimizer.step()
                classify_wordseq.zero_grad()
                classify_model.zero_grad()

        epoch_finish = time.time()
        logging.info("epoch: %s training finished. Time: %.2fs" %
                     (idx, epoch_finish - epoch_start))

        _, _, _, _, ner_score, _, _ = ner.evaluate(data, seq_wordseq,
                                                   seq_model, "dev")
        logging.info("ner evaluate: f: %.4f" % (ner_score))
        if ner_score > best_ner_score:
            logging.info("new best score: ner: %.4f" % (ner_score))
            best_ner_score = ner_score

            torch.save(seq_wordseq.state_dict(),
                       os.path.join(opt.output, 'ner_wordseq.pkl'))
            torch.save(seq_model.state_dict(),
                       os.path.join(opt.output, 'ner_model.pkl'))

            count_performance_not_grow = 0

            # _, _, _, _, test_ner_score, _, _ = ner.evaluate(data, seq_wordseq, seq_model, "test")
            # logging.info("ner evaluate on test: f: %.4f" % (test_ner_score))

        else:
            count_performance_not_grow += 1

        re_score = relation_extraction.evaluate(classify_wordseq,
                                                classify_model, re_dev_loader)
        logging.info("re evaluate: f: %.4f" % (re_score))
        if re_score > best_re_score:
            logging.info("new best score: re: %.4f" % (re_score))
            best_re_score = re_score

            torch.save(classify_wordseq.state_dict(),
                       os.path.join(opt.output, 're_wordseq.pkl'))
            torch.save(classify_model.state_dict(),
                       os.path.join(opt.output, 're_model.pkl'))

            count_performance_not_grow = 0

            # test_re_score = relation_extraction.evaluate(classify_wordseq, classify_model, re_test_loader)
            # logging.info("re evaluate on test: f: %.4f" % (test_re_score))
        else:
            count_performance_not_grow += 1

        if count_performance_not_grow > 2 * data.patience:
            logging.info("early stop")
            break

    logging.info("train finished")
Exemple #6
0
def test(data, opt):
    # corpus_dir = join(opt.test_file, 'corpus')
    # corpus_dir = join(opt.test_file, 'txt')
    corpus_dir = opt.test_file

    if opt.nlp_tool == "spacy":
        nlp_tool = spacy.load('en')
    elif opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
    elif opt.nlp_tool == "stanford":
        nlp_tool = StanfordCoreNLP('http://localhost:{0}'.format(9000))
    else:
        raise RuntimeError("invalid nlp tool")

    corpus_files = [
        f for f in listdir(corpus_dir) if isfile(join(corpus_dir, f))
    ]

    model = SeqModel(data, opt)
    if opt.test_in_cpu:
        model.load_state_dict(
            torch.load(os.path.join(opt.output, 'model.pkl'),
                       map_location='cpu'))
    else:
        model.load_state_dict(torch.load(os.path.join(opt.output,
                                                      'model.pkl')))

    dictionary, dictionary_reverse = umls.load_umls_MRCONSO(
        data.config['norm_dict'])
    isMeddra_dict = False

    # initialize norm models
    if opt.norm_rule and opt.norm_vsm and opt.norm_neural:  # ensemble
        logging.info("use ensemble normer")
        multi_sieve.init(opt, None, data, dictionary, dictionary_reverse,
                         False)
        if opt.ensemble == 'learn':
            if opt.test_in_cpu:
                ensemble_model = torch.load(os.path.join(
                    opt.output, 'ensemble.pkl'),
                                            map_location='cpu')
            else:
                ensemble_model = torch.load(
                    os.path.join(opt.output, 'ensemble.pkl'))
            ensemble_model.eval()
        else:
            if opt.test_in_cpu:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'),
                                       map_location='cpu')
                neural_model = torch.load(os.path.join(opt.output,
                                                       'norm_neural.pkl'),
                                          map_location='cpu')
            else:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
                neural_model = torch.load(
                    os.path.join(opt.output, 'norm_neural.pkl'))

            vsm_model.eval()
            neural_model.eval()

    elif opt.norm_rule:
        logging.info("use rule-based normer")
        multi_sieve.init(opt, None, data, dictionary, dictionary_reverse,
                         False)

    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'),
                                   map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()

    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output,
                                                   'norm_neural.pkl'),
                                      map_location='cpu')
        else:
            neural_model = torch.load(
                os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    else:
        logging.info("no normalization is performed.")

    makedir_and_clear(opt.predict)

    ct_success = 0
    ct_error = 0

    for fileName in corpus_files:
        try:
            start = time.time()
            document, _, _, _ = processOneFile(fileName, None, corpus_dir,
                                               nlp_tool, False, opt.types,
                                               opt.type_filter)

            data.test_texts = []
            data.test_Ids = []
            read_instance_from_one_document(document, data.word_alphabet,
                                            data.char_alphabet,
                                            data.label_alphabet,
                                            data.test_texts, data.test_Ids,
                                            data)

            _, _, _, _, _, pred_results, _ = evaluate(data, opt, model, 'test',
                                                      False, opt.nbest)

            entities = translateResultsintoEntities(document.sentences,
                                                    pred_results)

            if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
                if opt.ensemble == 'learn':
                    ensemble_model.process_one_doc(document, entities,
                                                   dictionary,
                                                   dictionary_reverse,
                                                   isMeddra_dict)
                else:
                    pred_entities1 = copy.deepcopy(entities)
                    pred_entities2 = copy.deepcopy(entities)
                    pred_entities3 = copy.deepcopy(entities)
                    multi_sieve.runMultiPassSieve(document, pred_entities1,
                                                  dictionary, isMeddra_dict)
                    vsm_model.process_one_doc(document, pred_entities2,
                                              dictionary, dictionary_reverse,
                                              isMeddra_dict)
                    neural_model.process_one_doc(document, pred_entities3,
                                                 dictionary,
                                                 dictionary_reverse,
                                                 isMeddra_dict)

                    # merge pred_entities1, pred_entities2, pred_entities3 into entities
                    ensemble.merge_result(pred_entities1, pred_entities2,
                                          pred_entities3, entities, dictionary,
                                          isMeddra_dict,
                                          vsm_model.dict_alphabet, data)

            elif opt.norm_rule:
                multi_sieve.runMultiPassSieve(document, entities, dictionary,
                                              isMeddra_dict)
            elif opt.norm_vsm:
                vsm_model.process_one_doc(document, entities, dictionary,
                                          dictionary_reverse, isMeddra_dict)
            elif opt.norm_neural:
                neural_model.process_one_doc(document, entities, dictionary,
                                             dictionary_reverse, isMeddra_dict)

            dump_results(fileName, entities, opt)

            end = time.time()
            logging.info("process %s complete with %.2fs" %
                         (fileName, end - start))

            ct_success += 1
        except Exception as e:
            logging.error("process file {} error: {}".format(fileName, e))
            ct_error += 1

    logging.info("test finished, total {}, error {}".format(
        ct_success + ct_error, ct_error))
Exemple #7
0
def train(data):
    print("Training model...")
    data.show_data_summary()
    model = SeqModel(data)
    if data.use_cuda:
        model.cuda()
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    if data.optimizer == "SGD":
        optimizer = optim.SGD(parameters,
                              lr=data.lr,
                              momentum=data.momentum,
                              weight_decay=data.weight_decay)
    elif data.optimizer == "Adagrad":
        optimizer = optim.Adagrad(parameters,
                                  lr=data.lr,
                                  weight_decay=data.weight_decay)
    elif data.optimizer == "Adam":
        optimizer = optim.Adam(parameters,
                               lr=data.lr,
                               weight_decay=data.weight_decay)
    else:
        print("Optimizer Error: {} optimizer is not support.".format(
            data.optimizer))
    # if data.out_dict:
    #     external_dict(data.out_dict, data.train_file)
    #     external_dict(data.out_dict, data.dev_file)
    #     external = external_dict(data.out_dict, data.test_file)
    #     external_dict(data.out_dict,data.oov_file)
    #     #print(len(external))
    #     #with open('../data/ali_7k/external_dict','w',encoding='utf-8') as fout:
    #     #    for e in external:
    #     #        fout.write(e+'\n')
    for idx in range(data.iter):
        # data.mask_entity(data.iter, idx)
        epoch_start = temp_start = time.time()
        # if data.optimizer == "SGD":
        #    optimizer = lr_decay(optimizer,idx,data.lr_decay,data.lr)
        train_num = len(data.train_text)
        if train_num % data.batch_size == 0:
            batch_block = train_num // data.batch_size
        else:
            batch_block = train_num // data.batch_size + 1
        correct = total = total_loss = 0
        random.shuffle(data.train_idx)
        model.train()
        model.zero_grad()
        for block in range(batch_block):
            left = block * data.batch_size
            right = left + data.batch_size
            if right > train_num:
                right = train_num
            instance = data.train_idx[left:right]
            batch_word, batch_feat, batch_word_len, word_recover, batch_char, batch_char_len, char_recover, batch_label, mask, batch_bert \
                = generate_batch(instance, data.use_cuda, data.use_bert)
            batch_dict = None
            if data.out_dict:
                batch_dict = generate_dict_feature(instance, data,
                                                   data.use_cuda)
            loss, seq = model.neg_log_likehood(batch_word, batch_feat,
                                               batch_word_len, batch_char,
                                               batch_char_len, char_recover,
                                               batch_label, mask, batch_dict,
                                               batch_bert)
            right_token, total_token = predict_check(seq, batch_label, mask)
            correct += right_token
            total += total_token
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            model.zero_grad()
        epoch_end = time.time()
        print("Epoch:{}. Time:{}. Loss:{}. acc:{}".format(
            idx, epoch_end - epoch_start, total_loss, correct / total))
        # torch.save(model,data.model_save_dir+'/model'+str(idx)+'.pkl')
        evaluate(data, model, "dev", idx)
        evaluate(data, model, "test", idx)
        if data.oov_file is not None:
            evaluate(data, model, "oov", idx)
        print("Finish.")
Exemple #8
0
    def train_step(self):
        print("loading the dataset...")
        config = Config()
        eval_config = Config()
        eval_config.keep_prob = 1.0

        train_data, valid_data, test_data = self.utils.load_data(
            FLAGS.max_len, batch_size=config.batch_size)

        print 'h:', len(train_data[0]), len(valid_data[0]), len(test_data[0])

        print("begin training")
        with tf.Graph().as_default(), tf.Session() as session:
            initializer = tf.random_uniform_initializer(
                -1 * FLAGS.init_scale, 1 * FLAGS.init_scale)
            with tf.variable_scope('model',
                                   reuse=None,
                                   initializer=initializer):
                train_model = SeqModel(config=config, is_training=True)

            with tf.variable_scope('model',
                                   reuse=True,
                                   initializer=initializer):
                valid_model = SeqModel(config=eval_config, is_training=False)
                test_model = SeqModel(config=eval_config, is_training=False)

            # add summary
            train_summary_dir = os.path.join(config.out_dir, "summaries",
                                             "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, session.graph)

            dev_summary_dir = os.path.join(eval_config.out_dir, "summaries",
                                           "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       session.graph)

            #add checkpoint
            checkpoint_dir = os.path.abspath(
                os.path.join(config.out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            # saver=tf.train.Saver(tf.all_variables)
            saver = tf.train.Saver(tf.global_variables())

            tf.initialize_all_variables().run()
            global_steps = 1
            begin_time = int(time.time())

            for i in range(config.num_epoch):
                print("the %d epoch training..." % (i + 1))
                lr_decay = config.lr_decay**max(i - config.max_decay_epoch,
                                                0.0)
                train_model.assign_new_lr(session, config.lr * lr_decay)
                global_steps = self.run_epoch(train_model, session, train_data,
                                              global_steps, valid_model,
                                              valid_data, train_summary_writer,
                                              dev_summary_writer)
                if i % config.checkpoint_every == 0:
                    path = saver.save(session, checkpoint_prefix, global_steps)
                    print("Saved model chechpoint to{}\n".format(path))
            print("the train is finished")
            end_time = int(time.time())
            print("training takes %d seconds already\n" %
                  (end_time - begin_time))
            test_accuracy = self.evaluate(test_model, session, test_data)
            print("the test data accuracy is %f" % test_accuracy)
            print("program end!")