Ejemplo n.º 1
0
def train(train_data, dev_data, test_data, d, dictionary, dictionary_reverse,
          opt, fold_idx, isMeddra_dict):
    """Train the VSM-based entity-normalization model.

    Pipeline: optionally merge external corpora into ``train_data``, build the
    word and dictionary alphabets, initialize word embeddings (pretrained or
    random), generate training instances, then run a mini-batch training loop
    with optional dev-set evaluation, best-model checkpointing, and early
    stopping.

    Args:
        train_data: list of training documents (each exposing ``.entities``).
        dev_data: dev documents; evaluated each epoch only if ``opt.dev_file``.
        test_data: test documents; only their words are folded into the
            alphabet (when ``opt.test_file`` is set), they are not evaluated.
        d: data/config holder exposing ``.config`` and ``.word_emb_dim``.
        dictionary: concept dictionary used for the dict alphabet and vectors.
        dictionary_reverse: reverse concept mapping (used on the non-MedDRA,
            EHR-style instance-generation path).
        opt: options namespace (lr, l2, batch_size, iter, patience, output,
            gradient_clip, tune_wordemb, dev_file, test_file, word_emb_dim).
        fold_idx: cross-validation fold index, or None for a single run
            (controls the checkpoint filename).
        isMeddra_dict: True selects ``generate_instances``; False selects
            ``generate_instances_ehr`` with ``dictionary_reverse``.

    Returns:
        Tuple ``(best_dev_p, best_dev_r, best_dev_f)`` — best dev
        precision/recall/F1 observed (the -10 sentinels if no dev evaluation
        ever ran).
    """
    logging.info("train the vsm-based normalization model ...")

    # Optionally extend the training set with configured external corpora.
    # Only the 'tac' (FDA) corpus format is supported.
    external_train_data = []
    if d.config.get('norm_ext_corpus') is not None:
        for k, v in d.config['norm_ext_corpus'].items():
            if k == 'tac':
                # NOTE(review): v.get('types') is passed twice — looks like two
                # distinct parameters of load_data_fda receive the same value;
                # confirm against load_data_fda's signature.
                external_train_data.extend(
                    load_data_fda(v['path'], True, v.get('types'),
                                  v.get('types'), False, True))
            else:
                raise RuntimeError("not support external corpus")
    if len(external_train_data) != 0:
        train_data.extend(external_train_data)

    # Build the word alphabet from the dictionary plus every dataset split
    # that will be seen at train/eval time, then freeze it.
    logging.info("build alphabet ...")
    word_alphabet = Alphabet('word')
    norm_utils.build_alphabet_from_dict(word_alphabet, dictionary,
                                        isMeddra_dict)
    norm_utils.build_alphabet(word_alphabet, train_data)
    if opt.dev_file:
        norm_utils.build_alphabet(word_alphabet, dev_data)
    if opt.test_file:
        norm_utils.build_alphabet(word_alphabet, test_data)
    norm_utils.fix_alphabet(word_alphabet)
    logging.info("alphabet size {}".format(word_alphabet.size()))

    # Word embeddings: load pretrained vectors when 'norm_emb' is configured,
    # otherwise random-initialize at the configured dimensionality.
    if d.config.get('norm_emb') is not None:
        logging.info("load pretrained word embedding ...")
        pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(
            d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim, False)
        word_embedding = nn.Embedding(word_alphabet.size(),
                                      word_emb_dim,
                                      padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(pretrain_word_embedding))
        embedding_dim = word_emb_dim
    else:
        logging.info("randomly initialize word embedding ...")
        word_embedding = nn.Embedding(word_alphabet.size(),
                                      d.word_emb_dim,
                                      padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(
                random_embedding(word_alphabet.size(), d.word_emb_dim)))
        embedding_dim = d.word_emb_dim

    # Dictionary (concept) alphabet: one entry per normalization target.
    dict_alphabet = Alphabet('dict')
    norm_utils.init_dict_alphabet(dict_alphabet, dictionary)
    norm_utils.fix_alphabet(dict_alphabet)

    # Precompute dictionary-side vectors used by the VSM scorer.
    logging.info("init_vector_for_dict")
    poses, poses_lengths = init_vector_for_dict(word_alphabet, dict_alphabet,
                                                dictionary, isMeddra_dict)

    vsm_model = VsmNormer(word_alphabet, word_embedding, embedding_dim,
                          dict_alphabet, poses, poses_lengths)

    logging.info("generate instances for training ...")
    train_X = []
    train_Y = []

    # MedDRA-style vs EHR-style data need different instance builders.
    for doc in train_data:
        if isMeddra_dict:
            temp_X, temp_Y = generate_instances(doc.entities, word_alphabet,
                                                dict_alphabet)
        else:
            temp_X, temp_Y = generate_instances_ehr(doc.entities,
                                                    word_alphabet,
                                                    dict_alphabet,
                                                    dictionary_reverse)
        train_X.extend(temp_X)
        train_Y.extend(temp_Y)

    train_loader = DataLoader(MyDataset(train_X, train_Y),
                              opt.batch_size,
                              shuffle=True,
                              collate_fn=my_collate)

    optimizer = optim.Adam(vsm_model.parameters(),
                           lr=opt.lr,
                           weight_decay=opt.l2)

    # Freeze the embedding table unless fine-tuning is requested.
    if opt.tune_wordemb == False:
        freeze_net(vsm_model.word_embedding)

    # Optional dictionary-based pretraining pass before the main loop.
    if d.config['norm_vsm_pretrain'] == '1':
        dict_pretrain(dictionary, dictionary_reverse, d, isMeddra_dict,
                      optimizer, vsm_model)

    # -10 sentinels: any real F1 will exceed them on the first dev eval.
    best_dev_f = -10
    best_dev_p = -10
    best_dev_r = -10

    bad_counter = 0

    logging.info("start training ...")

    for idx in range(opt.iter):
        epoch_start = time.time()

        vsm_model.train()

        train_iter = iter(train_loader)
        num_iter = len(train_loader)

        sum_loss = 0

        correct, total = 0, 0

        for i in range(num_iter):

            x, lengths, y = next(train_iter)

            l, y_pred = vsm_model.forward_train(x, lengths, y)

            sum_loss += l.item()

            l.backward()

            # Clip gradients (by global norm) before stepping, if enabled.
            if opt.gradient_clip > 0:
                torch.nn.utils.clip_grad_norm_(vsm_model.parameters(),
                                               opt.gradient_clip)
            optimizer.step()
            vsm_model.zero_grad()

            # Track training accuracy from the argmax of the predictions.
            total += y.size(0)
            _, pred = torch.max(y_pred, 1)
            correct += (pred == y).sum().item()

        epoch_finish = time.time()
        accuracy = 100.0 * correct / total
        logging.info(
            "epoch: %s training finished. Time: %.2fs. loss: %.4f Accuracy %.2f"
            % (idx, epoch_finish - epoch_start, sum_loss / num_iter, accuracy))

        if opt.dev_file:
            p, r, f = norm_utils.evaluate(dev_data, dictionary,
                                          dictionary_reverse, vsm_model, None,
                                          None, d, isMeddra_dict)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            # No dev set: keep f equal to the current best so the save-on-best
            # branch below never triggers; the model is saved once after the
            # loop instead.
            f = best_dev_f

        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f" %
                         (best_dev_f))

            # Checkpoint the whole model; filename encodes the CV fold if any.
            if fold_idx is None:
                torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
            else:
                torch.save(
                    vsm_model,
                    os.path.join(opt.output,
                                 "vsm_{}.pkl".format(fold_idx + 1)))

            # p and r are only defined when opt.dev_file is set, but this
            # branch is unreachable otherwise (f stays equal to best_dev_f).
            best_dev_f = f
            best_dev_p = p
            best_dev_r = r

            bad_counter = 0
        else:
            bad_counter += 1

        # Early stop after `patience` consecutive epochs without improvement.
        if len(opt.dev_file) != 0 and bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")

    # Without a dev set there is no best checkpoint; save the final model.
    if len(opt.dev_file) == 0:
        torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))

    return best_dev_p, best_dev_r, best_dev_f
Ejemplo n.º 2
0
            logging.info("build alphabet ...")
            d.build_alphabet(d.train_data)
            d.build_alphabet(d.dev_data)
            d.fix_alphabet()

            logging.info("generate instance ...")
            d.train_texts, d.train_Ids = data.read_instance(
                d.train_data, d.word_alphabet, d.char_alphabet,
                d.label_alphabet, d)
            d.dev_texts, d.dev_Ids = data.read_instance(
                d.dev_data, d.word_alphabet, d.char_alphabet, d.label_alphabet,
                d)

            logging.info("load pretrained word embedding ...")
            d.pretrain_word_embedding, d.word_emb_dim = data.build_pretrain_embedding(
                opt.word_emb_file, d.word_alphabet, opt.word_emb_dim, False)

            p, r, f = train.train(d, opt, fold_idx)

            d.clear()
            d.save(os.path.join(opt.output,
                                "data_{}.pkl".format(fold_idx + 1)))

            macro_p += p
            macro_r += r
            macro_f += f

        logging.info("the macro averaged p r f are %.4f, %.4f, %.4f" %
                     (macro_p * 1.0 / fold_num, macro_r * 1.0 / fold_num,
                      macro_f * 1.0 / fold_num))
Ejemplo n.º 3
0
def train(train_data, dev_data, d, meddra_dict, opt, fold_idx):
    """Build and evaluate a VSM-based normalization model (no training loop).

    This variant constructs the model's alphabets, embeddings, and dictionary
    vectors, then — despite the name — performs a single dev evaluation and
    saves the model. There is no iterative parameter optimization here.

    Args:
        train_data: training documents; words feed the alphabet and may be
            extended with configured external corpora.
        dev_data: dev documents, evaluated once if ``opt.dev_file`` is set.
        d: data/config holder exposing ``.config`` and ``.word_emb_dim``.
        meddra_dict: MedDRA concept dictionary for alphabet/vector building.
        opt: options namespace (dev_file, output, word_emb_dim, ...).
        fold_idx: cross-validation fold index, or None (controls filename).

    Returns:
        Tuple ``(best_dev_p, best_dev_r, best_dev_f)`` — the single dev
        evaluation result, or the -10 sentinels if no dev set was given.
    """
    logging.info("train the vsm-based normalization model ...")

    # Optionally merge configured external corpora; only 'tac' is supported.
    external_train_data = []
    if d.config.get('norm_ext_corpus') is not None:
        for k, v in d.config['norm_ext_corpus'].items():
            if k == 'tac':
                # NOTE(review): v.get('types') passed twice — confirm against
                # load_data_fda's signature.
                external_train_data.extend(
                    load_data_fda(v['path'], True, v.get('types'),
                                  v.get('types'), False, True))
            else:
                raise RuntimeError("not support external corpus")
    if len(external_train_data) != 0:
        train_data.extend(external_train_data)

    # This VsmNormer variant owns its alphabets/embeddings as attributes.
    vsm_model = VsmNormer()

    logging.info("build alphabet ...")
    norm_utils.build_alphabet(vsm_model.word_alphabet, train_data)
    if opt.dev_file:
        norm_utils.build_alphabet(vsm_model.word_alphabet, dev_data)

    norm_utils.build_alphabet_from_dict(vsm_model.word_alphabet, meddra_dict)
    norm_utils.fix_alphabet(vsm_model.word_alphabet)

    # Word embeddings: pretrained when 'norm_emb' is configured, else random.
    if d.config.get('norm_emb') is not None:
        logging.info("load pretrained word embedding ...")
        pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(
            d.config.get('norm_emb'), vsm_model.word_alphabet,
            opt.word_emb_dim, False)
        vsm_model.word_embedding = nn.Embedding(vsm_model.word_alphabet.size(),
                                                word_emb_dim)
        vsm_model.word_embedding.weight.data.copy_(
            torch.from_numpy(pretrain_word_embedding))
        vsm_model.embedding_dim = word_emb_dim
    else:
        logging.info("randomly initialize word embedding ...")
        vsm_model.word_embedding = nn.Embedding(vsm_model.word_alphabet.size(),
                                                d.word_emb_dim)
        vsm_model.word_embedding.weight.data.copy_(
            torch.from_numpy(
                random_embedding(vsm_model.word_alphabet.size(),
                                 d.word_emb_dim)))
        vsm_model.embedding_dim = d.word_emb_dim

    # Move only the embedding table to the GPU the model was configured for.
    if torch.cuda.is_available():
        vsm_model.word_embedding = vsm_model.word_embedding.cuda(vsm_model.gpu)

    # Precompute dictionary-side vectors, then freeze the dict alphabet.
    logging.info("init_vector_for_dict")
    vsm_model.init_vector_for_dict(meddra_dict)
    norm_utils.fix_alphabet(vsm_model.dict_alphabet)

    vsm_model.train()

    # -10 sentinels: any real F1 beats them.
    best_dev_f = -10
    best_dev_p = -10
    best_dev_r = -10

    if opt.dev_file:
        p, r, f = norm_utils.evaluate(dev_data, meddra_dict, vsm_model)
        logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
    else:
        # No dev set: keep f at the sentinel so the save-on-best branch below
        # is skipped; the model is saved unconditionally at the end instead.
        f = best_dev_f

    if f > best_dev_f:
        logging.info("Exceed previous best f score on dev: %.4f" %
                     (best_dev_f))

        # Save the model; filename encodes the CV fold if any.
        if fold_idx is None:
            logging.info("save model to {}".format(
                os.path.join(opt.output, "vsm.pkl")))
            torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
        else:
            logging.info("save model to {}".format(
                os.path.join(opt.output, "vsm_{}.pkl".format(fold_idx + 1))))
            torch.save(
                vsm_model,
                os.path.join(opt.output, "vsm_{}.pkl".format(fold_idx + 1)))

        best_dev_f = f
        best_dev_p = p
        best_dev_r = r

    logging.info("train finished")

    if len(opt.dev_file) == 0:
        logging.info("save model to {}".format(
            os.path.join(opt.output, "vsm.pkl")))
        torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))

    return best_dev_p, best_dev_r, best_dev_f
Ejemplo n.º 4
0
def train(train_data, dev_data, test_data, d, dictionary, dictionary_reverse,
          opt, fold_idx, isMeddra_dict):
    """Train the ensemble normalization model.

    Depending on ``opt.ensemble``:
      * ``'learn'`` — train a single learned Ensemble model that combines
        rule/vsm/neural signals, optionally warm-started from pretrained
        component models.
      * otherwise — train a VSM model and a neural model side by side (the
        rule-based sieve is initialized in both modes), and combine them at
        evaluation time.

    Args:
        train_data: training documents (each exposing ``.entities``).
        dev_data: dev documents; evaluated each epoch only if ``opt.dev_file``.
        test_data: only its words are added to the alphabet (if
            ``opt.test_file``); not evaluated here.
        d: data/config holder exposing ``.config`` and ``.word_emb_dim``.
        dictionary / dictionary_reverse: concept dictionary and its reverse
            mapping (reverse used on the non-MedDRA path).
        opt: options namespace (ensemble, lr, l2, batch_size, iter, patience,
            output, gradient_clip, tune_wordemb, cross_validation, ...).
        fold_idx: cross-validation fold index, or None for a single run.
        isMeddra_dict: selects MedDRA vs EHR instance-generation paths.

    Returns:
        Tuple ``(best_dev_p, best_dev_r, best_dev_f)`` — best dev
        precision/recall/F1 observed (-10 sentinels if never evaluated).
    """
    logging.info("train the ensemble normalization model ...")

    # Optionally merge configured external corpora; only 'tac' is supported.
    external_train_data = []
    if d.config.get('norm_ext_corpus') is not None:
        for k, v in d.config['norm_ext_corpus'].items():
            if k == 'tac':
                # NOTE(review): v.get('types') passed twice — confirm against
                # load_data_fda's signature.
                external_train_data.extend(
                    load_data_fda(v['path'], True, v.get('types'),
                                  v.get('types'), False, True))
            else:
                raise RuntimeError("not support external corpus")
    if len(external_train_data) != 0:
        train_data.extend(external_train_data)

    # Build the shared word alphabet from the dictionary plus all splits.
    logging.info("build alphabet ...")
    word_alphabet = Alphabet('word')
    norm_utils.build_alphabet_from_dict(word_alphabet, dictionary,
                                        isMeddra_dict)
    norm_utils.build_alphabet(word_alphabet, train_data)
    if opt.dev_file:
        norm_utils.build_alphabet(word_alphabet, dev_data)
    if opt.test_file:
        norm_utils.build_alphabet(word_alphabet, test_data)
    norm_utils.fix_alphabet(word_alphabet)

    # Word embeddings: pretrained when 'norm_emb' is configured, else random.
    if d.config.get('norm_emb') is not None:
        logging.info("load pretrained word embedding ...")
        pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(
            d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim, False)
        word_embedding = nn.Embedding(word_alphabet.size(),
                                      word_emb_dim,
                                      padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(pretrain_word_embedding))
        embedding_dim = word_emb_dim
    else:
        logging.info("randomly initialize word embedding ...")
        word_embedding = nn.Embedding(word_alphabet.size(),
                                      d.word_emb_dim,
                                      padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(
                random_embedding(word_alphabet.size(), d.word_emb_dim)))
        embedding_dim = d.word_emb_dim

    # Dictionary (concept) alphabet shared by all component models.
    dict_alphabet = Alphabet('dict')
    norm_utils.init_dict_alphabet(dict_alphabet, dictionary)
    norm_utils.fix_alphabet(dict_alphabet)

    # rule
    # The rule-based multi-sieve normer is initialized in both modes.
    logging.info("init rule-based normer")
    multi_sieve.init(opt, train_data, d, dictionary, dictionary_reverse,
                     isMeddra_dict)

    if opt.ensemble == 'learn':
        logging.info("init ensemble normer")
        poses = vsm.init_vector_for_dict(word_alphabet, dict_alphabet,
                                         dictionary, isMeddra_dict)
        ensemble_model = Ensemble(word_alphabet, word_embedding, embedding_dim,
                                  dict_alphabet, poses)
        # NOTE(review): pretrain_neural_model / pretrain_vsm_model are not
        # defined in this function — presumably module-level globals set by a
        # caller; verify they exist before this path runs.
        if pretrain_neural_model is not None:
            ensemble_model.neural_linear.weight.data.copy_(
                pretrain_neural_model.linear.weight.data)
        if pretrain_vsm_model is not None:
            ensemble_model.vsm_linear.weight.data.copy_(
                pretrain_vsm_model.linear.weight.data)
        ensemble_train_X = []
        ensemble_train_Y = []
        for doc in train_data:
            temp_X, temp_Y = generate_instances(doc, word_alphabet,
                                                dict_alphabet, dictionary,
                                                dictionary_reverse,
                                                isMeddra_dict)

            ensemble_train_X.extend(temp_X)
            ensemble_train_Y.extend(temp_Y)
        ensemble_train_loader = DataLoader(MyDataset(ensemble_train_X,
                                                     ensemble_train_Y),
                                           opt.batch_size,
                                           shuffle=True,
                                           collate_fn=my_collate)
        ensemble_optimizer = optim.Adam(ensemble_model.parameters(),
                                        lr=opt.lr,
                                        weight_decay=opt.l2)
        # Freeze embeddings unless fine-tuning is requested.
        if opt.tune_wordemb == False:
            freeze_net(ensemble_model.word_embedding)
    else:

        # vsm
        logging.info("init vsm-based normer")
        poses = vsm.init_vector_for_dict(word_alphabet, dict_alphabet,
                                         dictionary, isMeddra_dict)
        # alphabet can share between vsm and neural since they don't change
        # but word_embedding cannot
        vsm_model = vsm.VsmNormer(word_alphabet, copy.deepcopy(word_embedding),
                                  embedding_dim, dict_alphabet, poses)
        vsm_train_X = []
        vsm_train_Y = []
        for doc in train_data:
            if isMeddra_dict:
                temp_X, temp_Y = vsm.generate_instances(
                    doc.entities, word_alphabet, dict_alphabet)
            else:
                temp_X, temp_Y = vsm.generate_instances_ehr(
                    doc.entities, word_alphabet, dict_alphabet,
                    dictionary_reverse)

            vsm_train_X.extend(temp_X)
            vsm_train_Y.extend(temp_Y)
        vsm_train_loader = DataLoader(vsm.MyDataset(vsm_train_X, vsm_train_Y),
                                      opt.batch_size,
                                      shuffle=True,
                                      collate_fn=vsm.my_collate)
        vsm_optimizer = optim.Adam(vsm_model.parameters(),
                                   lr=opt.lr,
                                   weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(vsm_model.word_embedding)

        # NOTE(review): isMeddra_dict is hard-coded to True here, unlike the
        # parameterized uses elsewhere — confirm this is intentional.
        if d.config['norm_vsm_pretrain'] == '1':
            vsm.dict_pretrain(dictionary, dictionary_reverse, d, True,
                              vsm_optimizer, vsm_model)

        # neural
        logging.info("init neural-based normer")
        neural_model = norm_neural.NeuralNormer(word_alphabet,
                                                copy.deepcopy(word_embedding),
                                                embedding_dim, dict_alphabet)

        neural_train_X = []
        neural_train_Y = []
        for doc in train_data:
            if isMeddra_dict:
                temp_X, temp_Y = norm_neural.generate_instances(
                    doc.entities, word_alphabet, dict_alphabet)
            else:
                temp_X, temp_Y = norm_neural.generate_instances_ehr(
                    doc.entities, word_alphabet, dict_alphabet,
                    dictionary_reverse)

            neural_train_X.extend(temp_X)
            neural_train_Y.extend(temp_Y)
        neural_train_loader = DataLoader(norm_neural.MyDataset(
            neural_train_X, neural_train_Y),
                                         opt.batch_size,
                                         shuffle=True,
                                         collate_fn=norm_neural.my_collate)
        neural_optimizer = optim.Adam(neural_model.parameters(),
                                      lr=opt.lr,
                                      weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(neural_model.word_embedding)

        # NOTE(review): dict_pretrain is invoked as a bound method but
        # neural_model is also passed as the last argument (mirroring the
        # module-level vsm.dict_pretrain signature) — verify the method's
        # parameter list; also isMeddra_dict is hard-coded to True.
        if d.config['norm_neural_pretrain'] == '1':
            neural_model.dict_pretrain(dictionary, dictionary_reverse, d, True,
                                       neural_optimizer, neural_model)

    # -10 sentinels: any real F1 beats them on the first dev eval.
    best_dev_f = -10
    best_dev_p = -10
    best_dev_r = -10

    bad_counter = 0

    logging.info("start training ...")
    for idx in range(opt.iter):
        epoch_start = time.time()

        if opt.ensemble == 'learn':

            # Train the single learned ensemble model.
            ensemble_model.train()
            ensemble_train_iter = iter(ensemble_train_loader)
            ensemble_num_iter = len(ensemble_train_loader)

            for i in range(ensemble_num_iter):
                x, rules, lengths, y = next(ensemble_train_iter)

                y_pred = ensemble_model.forward(x, rules, lengths)

                l = ensemble_model.loss(y_pred, y)

                l.backward()

                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(ensemble_model.parameters(),
                                                   opt.gradient_clip)
                ensemble_optimizer.step()
                ensemble_model.zero_grad()

        else:

            # Train the VSM and neural models independently, one after the
            # other, each with its own loader and optimizer.
            vsm_model.train()
            vsm_train_iter = iter(vsm_train_loader)
            vsm_num_iter = len(vsm_train_loader)

            for i in range(vsm_num_iter):
                x, lengths, y = next(vsm_train_iter)

                l, _ = vsm_model.forward_train(x, lengths, y)

                l.backward()

                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(vsm_model.parameters(),
                                                   opt.gradient_clip)
                vsm_optimizer.step()
                vsm_model.zero_grad()

            neural_model.train()
            neural_train_iter = iter(neural_train_loader)
            neural_num_iter = len(neural_train_loader)

            for i in range(neural_num_iter):

                x, lengths, y = next(neural_train_iter)

                y_pred = neural_model.forward(x, lengths)

                l = neural_model.loss(y_pred, y)

                l.backward()

                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(neural_model.parameters(),
                                                   opt.gradient_clip)
                neural_optimizer.step()
                neural_model.zero_grad()

        epoch_finish = time.time()
        logging.info("epoch: %s training finished. Time: %.2fs" %
                     (idx, epoch_finish - epoch_start))

        if opt.dev_file:
            if opt.ensemble == 'learn':
                # logging.info("weight w1: %.4f, w2: %.4f, w3: %.4f" % (ensemble_model.w1.data.item(), ensemble_model.w2.data.item(), ensemble_model.w3.data.item()))
                p, r, f = norm_utils.evaluate(dev_data, dictionary,
                                              dictionary_reverse, None, None,
                                              ensemble_model, d, isMeddra_dict)
            else:
                p, r, f = norm_utils.evaluate(dev_data, dictionary,
                                              dictionary_reverse, vsm_model,
                                              neural_model, None, d,
                                              isMeddra_dict)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            # No dev set: keep f at the current best so the save-on-best
            # branch below never fires; models are saved after the loop.
            f = best_dev_f

        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f" %
                         (best_dev_f))

            # Checkpoint whichever model(s) this mode trains; filenames
            # encode the CV fold if any.
            if opt.ensemble == 'learn':
                if fold_idx is None:
                    torch.save(ensemble_model,
                               os.path.join(opt.output, "ensemble.pkl"))
                else:
                    torch.save(
                        ensemble_model,
                        os.path.join(opt.output,
                                     "ensemble_{}.pkl".format(fold_idx + 1)))
            else:
                if fold_idx is None:
                    torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
                    torch.save(neural_model,
                               os.path.join(opt.output, "norm_neural.pkl"))
                else:
                    torch.save(
                        vsm_model,
                        os.path.join(opt.output,
                                     "vsm_{}.pkl".format(fold_idx + 1)))
                    torch.save(
                        neural_model,
                        os.path.join(opt.output,
                                     "norm_neural_{}.pkl".format(fold_idx +
                                                                 1)))

            # p and r are only defined when opt.dev_file is set; this branch
            # is unreachable otherwise (f stays equal to best_dev_f).
            best_dev_f = f
            best_dev_p = p
            best_dev_r = r

            bad_counter = 0
        else:
            bad_counter += 1

        # Early stop after `patience` epochs without dev improvement.
        if len(opt.dev_file) != 0 and bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")

    # Release sieve resources; pass True only on the last fold (or single run)
    # so final cleanup happens exactly once across cross-validation.
    if fold_idx is None:
        multi_sieve.finalize(True)
    else:
        if fold_idx == opt.cross_validation - 1:
            multi_sieve.finalize(True)
        else:
            multi_sieve.finalize(False)

    # Without a dev set there was no best checkpoint; save the final model(s).
    if len(opt.dev_file) == 0:
        if opt.ensemble == 'learn':
            torch.save(ensemble_model, os.path.join(opt.output,
                                                    "ensemble.pkl"))
        else:
            torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
            torch.save(neural_model, os.path.join(opt.output,
                                                  "norm_neural.pkl"))

    return best_dev_p, best_dev_r, best_dev_f
Ejemplo n.º 5
0
        datapoints_train = load_dataponts(opt.train_file)
        datapoints_test = load_dataponts(opt.test_file)
        # datapoints_train = load_dataponts('training_instances_debug.txt')
        # datapoints_test = load_dataponts('test_instances_debug.txt')

        word_alphabet = Alphabet('word')

        build_alphabet(word_alphabet, datapoints_train)
        build_alphabet(word_alphabet, datapoints_test)
        word_alphabet.close()

        if d.config.get('norm_emb') is not None:
            logging.info("load pretrained word embedding ...")
            pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(
                d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim,
                False)
            word_embedding = nn.Embedding(word_alphabet.size(),
                                          word_emb_dim,
                                          padding_idx=0)
            word_embedding.weight.data.copy_(
                torch.from_numpy(pretrain_word_embedding))
            embedding_dim = word_emb_dim
        else:
            logging.info("randomly initialize word embedding ...")
            word_embedding = nn.Embedding(word_alphabet.size(),
                                          d.word_emb_dim,
                                          padding_idx=0)
            word_embedding.weight.data.copy_(
                torch.from_numpy(
                    random_embedding(word_alphabet.size(), d.word_emb_dim)))