def __init__(self, data, input_size):
    """Build the relation-classification head.

    Combines an attention-pooled context feature (size ``input_size``) with
    instance-level features (entity types, entity token embeddings, token
    distance, entity distance) and classifies into the [RELATION] label set.

    Args:
        data: project-wide configuration/alphabet holder (re_feature_* tables,
            HP_gpu, pad_idx, average_batch_loss).
        input_size: dimensionality of the context feature fed to the attention
            layer and concatenated into the final linear input.
    """
    super(ClassifyModel, self).__init__()
    self.gpu = data.HP_gpu
    self.average_batch = data.average_batch_loss

    def _random_init_embedding(alphabet_id):
        # Embedding over the given RE feature alphabet, padded at
        # data.pad_idx, with weights drawn from my_utils.random_embedding.
        vocab_size = data.re_feature_alphabets[alphabet_id].size()
        emb_dim = data.re_feature_emb_dims[alphabet_id]
        emb = nn.Embedding(vocab_size, emb_dim, data.pad_idx)
        emb.weight.data.copy_(
            torch.from_numpy(my_utils.random_embedding(vocab_size, emb_dim)))
        return emb

    relation_alphabet_id = data.re_feature_name2id['[RELATION]']
    label_size = data.re_feature_alphabet_sizes[relation_alphabet_id]
    # Attention pooling over the context feature sequence.
    self.attn = DotAttentionLayer(input_size, self.gpu)

    # instance-level feature
    entity_type_alphabet_id = data.re_feature_name2id['[ENTITY_TYPE]']
    self.entity_type_emb = _random_init_embedding(entity_type_alphabet_id)

    entity_alphabet_id = data.re_feature_name2id['[ENTITY]']
    self.entity_emb = _random_init_embedding(entity_alphabet_id)
    # Attention pooling over the (multi-token) entity embeddings.
    self.dot_att = DotAttentionLayer(data.re_feature_emb_dims[entity_alphabet_id], data.HP_gpu)

    tok_num_alphabet_id = data.re_feature_name2id['[TOKEN_NUM]']
    self.tok_num_betw_emb = _random_init_embedding(tok_num_alphabet_id)

    et_num_alphabet_id = data.re_feature_name2id['[ENTITY_NUM]']
    self.et_num_emb = _random_init_embedding(et_num_alphabet_id)

    # Final feature = context + 2 entity-type embs + 2 entity embs
    #               + token-distance emb + entity-distance emb.
    self.input_size = (input_size
                       + 2 * data.re_feature_emb_dims[entity_type_alphabet_id]
                       + 2 * data.re_feature_emb_dims[entity_alphabet_id]
                       + data.re_feature_emb_dims[tok_num_alphabet_id]
                       + data.re_feature_emb_dims[et_num_alphabet_id])
    self.linear = nn.Linear(self.input_size, label_size, bias=False)
    # NOTE(review): size_average is deprecated in modern torch (use
    # reduction=); kept as-is for compatibility with the torch version
    # this project targets.
    self.loss_function = nn.NLLLoss(size_average=self.average_batch)
    self.frozen = False

    if torch.cuda.is_available():
        # presumably data.HP_gpu is the CUDA device index — TODO confirm
        self.attn = self.attn.cuda(data.HP_gpu)
        self.entity_type_emb = self.entity_type_emb.cuda(data.HP_gpu)
        self.entity_emb = self.entity_emb.cuda(data.HP_gpu)
        self.dot_att = self.dot_att.cuda(data.HP_gpu)
        self.tok_num_betw_emb = self.tok_num_betw_emb.cuda(data.HP_gpu)
        self.et_num_emb = self.et_num_emb.cuda(data.HP_gpu)
        self.linear = self.linear.cuda(data.HP_gpu)
def __init__(self, data, num_layers, hidden_size, dropout, gpu):
    """Build a BiLSTM sequence feature extractor over word + POS + two
    position embeddings, followed by dot attention.

    Args:
        data: project-wide configuration/alphabet holder.
        num_layers: number of stacked LSTM layers.
        hidden_size: total output size; each direction gets hidden_size // 2.
        dropout: inter-layer LSTM dropout probability.
        gpu: device flag/index passed through to DotAttentionLayer.
    """
    super(LSTMFeatureExtractor, self).__init__()
    self.num_layers = num_layers
    # Per-direction size; bidirectional concat restores ~hidden_size
    # (NOTE(review): an odd hidden_size loses one unit to floor division).
    self.hidden_size = hidden_size // 2
    # num_layers * 2 directions — presumably used to size h0/c0 elsewhere.
    self.n_cells = self.num_layers * 2

    # Word embeddings: pretrained if available, otherwise random init.
    self.word_emb = nn.Embedding(data.word_alphabet.size(), data.word_emb_dim, data.pad_idx)
    if data.pretrain_word_embedding is not None:
        self.word_emb.weight.data.copy_(torch.from_numpy(data.pretrain_word_embedding))
    else:
        self.word_emb.weight.data.copy_(torch.from_numpy(my_utils.random_embedding(data.word_alphabet.size(), data.word_emb_dim)))

    # POS-tag embeddings, randomly initialized.
    postag_alphabet_id = data.feature_name2id['[POS]']
    self.postag_emb = nn.Embedding(data.feature_alphabets[postag_alphabet_id].size(), data.feature_emb_dims[postag_alphabet_id], data.pad_idx)
    self.postag_emb.weight.data.copy_(
        torch.from_numpy(my_utils.random_embedding(data.feature_alphabets[postag_alphabet_id].size(),
                                                   data.feature_emb_dims[postag_alphabet_id])))

    # Two independent position embeddings (relative position to each of the
    # two entities in a relation candidate), sharing one [POSITION] alphabet.
    position_alphabet_id = data.re_feature_name2id['[POSITION]']
    self.position1_emb = nn.Embedding(data.re_feature_alphabets[position_alphabet_id].size(), data.re_feature_emb_dims[position_alphabet_id], data.pad_idx)
    self.position1_emb.weight.data.copy_(
        torch.from_numpy(my_utils.random_embedding(data.re_feature_alphabets[position_alphabet_id].size(),
                                                   data.re_feature_emb_dims[position_alphabet_id])))
    self.position2_emb = nn.Embedding(data.re_feature_alphabets[position_alphabet_id].size(), data.re_feature_emb_dims[position_alphabet_id], data.pad_idx)
    self.position2_emb.weight.data.copy_(
        torch.from_numpy(my_utils.random_embedding(data.re_feature_alphabets[position_alphabet_id].size(),
                                                   data.re_feature_emb_dims[position_alphabet_id])))

    # LSTM input = word emb + POS emb + both position embs, concatenated.
    self.input_size = data.word_emb_dim + data.feature_emb_dims[postag_alphabet_id] + 2*data.re_feature_emb_dims[position_alphabet_id]
    self.rnn = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=num_layers, dropout=dropout, bidirectional=True)
    # Attention over the full (bidirectional) hidden_size output.
    self.attn = DotAttentionLayer(hidden_size, gpu)
def __init__(self, context_feature_size, data):
    """Build an MLP relation classifier over a pre-computed context feature
    plus instance-level features (entity types, entity tokens, token
    distance, entity distance), trained with cross-entropy.

    Args:
        context_feature_size: dimensionality of the external context feature.
        data: project-wide configuration/alphabet holder (re_feature_* tables,
            HP_gpu, pad_idx).
    """
    super(MLP, self).__init__()

    def _random_init_embedding(alphabet_id):
        # Embedding over the given RE feature alphabet, padded at
        # data.pad_idx, with weights drawn from my_utils.random_embedding.
        vocab_size = data.re_feature_alphabets[alphabet_id].size()
        emb_dim = data.re_feature_emb_dims[alphabet_id]
        emb = nn.Embedding(vocab_size, emb_dim, data.pad_idx)
        emb.weight.data.copy_(
            torch.from_numpy(my_utils.random_embedding(vocab_size, emb_dim)))
        return emb

    entity_type_alphabet_id = data.re_feature_name2id['[ENTITY_TYPE]']
    self.entity_type_emb = _random_init_embedding(entity_type_alphabet_id)

    entity_alphabet_id = data.re_feature_name2id['[ENTITY]']
    self.entity_emb = _random_init_embedding(entity_alphabet_id)
    # Attention pooling over the (multi-token) entity embeddings.
    self.dot_att = DotAttentionLayer(data.re_feature_emb_dims[entity_alphabet_id], data.HP_gpu)

    tok_num_alphabet_id = data.re_feature_name2id['[TOKEN_NUM]']
    self.tok_num_betw_emb = _random_init_embedding(tok_num_alphabet_id)

    et_num_alphabet_id = data.re_feature_name2id['[ENTITY_NUM]']
    self.et_num_emb = _random_init_embedding(et_num_alphabet_id)

    # Final feature = context + 2 entity-type embs + 2 entity embs
    #               + token-distance emb + entity-distance emb.
    self.input_size = (context_feature_size
                      + 2 * data.re_feature_emb_dims[entity_type_alphabet_id]
                      + 2 * data.re_feature_emb_dims[entity_alphabet_id]
                      + data.re_feature_emb_dims[tok_num_alphabet_id]
                      + data.re_feature_emb_dims[et_num_alphabet_id])
    relation_alphabet_id = data.re_feature_name2id['[RELATION]']
    self.linear = nn.Linear(self.input_size, data.re_feature_alphabet_sizes[relation_alphabet_id], bias=False)
    # CrossEntropyLoss takes raw logits (no softmax before it).
    self.criterion = nn.CrossEntropyLoss()
def train(train_data, dev_data, test_data, d, dictionary, dictionary_reverse, opt, fold_idx, isMeddra_dict):
    """Train the VSM-based normalization model.

    Builds word/dict alphabets (optionally extended with an external corpus),
    initializes word embeddings (pretrained or random), then runs mini-batch
    training with per-epoch dev evaluation, best-model checkpointing and
    early stopping.

    Returns:
        (best_dev_p, best_dev_r, best_dev_f) — best dev precision/recall/F1,
        or (-10, -10, -10) sentinels if no dev file was evaluated.
    """
    logging.info("train the vsm-based normalization model ...")

    # Optionally extend the training set with an external corpus
    # (only the 'tac' FDA format is supported).
    external_train_data = []
    if d.config.get('norm_ext_corpus') is not None:
        for k, v in d.config['norm_ext_corpus'].items():
            if k == 'tac':
                external_train_data.extend(
                    load_data_fda(v['path'], True, v.get('types'), v.get('types'), False, True))
            else:
                raise RuntimeError("not support external corpus")
    if len(external_train_data) != 0:
        train_data.extend(external_train_data)  # NOTE: mutates the caller's list

    # Build the word alphabet from the dictionary plus all available corpora,
    # then freeze it so unseen words map to UNK from here on.
    logging.info("build alphabet ...")
    word_alphabet = Alphabet('word')
    norm_utils.build_alphabet_from_dict(word_alphabet, dictionary, isMeddra_dict)
    norm_utils.build_alphabet(word_alphabet, train_data)
    if opt.dev_file:
        norm_utils.build_alphabet(word_alphabet, dev_data)
    if opt.test_file:
        norm_utils.build_alphabet(word_alphabet, test_data)
    norm_utils.fix_alphabet(word_alphabet)
    logging.info("alphabet size {}".format(word_alphabet.size()))

    # Word embedding: pretrained if configured, otherwise random init.
    if d.config.get('norm_emb') is not None:
        logging.info("load pretrained word embedding ...")
        pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(
            d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim, False)
        word_embedding = nn.Embedding(word_alphabet.size(), word_emb_dim, padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(pretrain_word_embedding))
        embedding_dim = word_emb_dim
    else:
        logging.info("randomly initialize word embedding ...")
        word_embedding = nn.Embedding(word_alphabet.size(), d.word_emb_dim, padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(
                random_embedding(word_alphabet.size(), d.word_emb_dim)))
        embedding_dim = d.word_emb_dim

    # Dictionary-concept alphabet and its vector representation.
    dict_alphabet = Alphabet('dict')
    norm_utils.init_dict_alphabet(dict_alphabet, dictionary)
    norm_utils.fix_alphabet(dict_alphabet)
    logging.info("init_vector_for_dict")
    poses, poses_lengths = init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict)

    vsm_model = VsmNormer(word_alphabet, word_embedding, embedding_dim, dict_alphabet, poses, poses_lengths)

    # Turn documents into (X, Y) training instances; the generator differs
    # for MedDRA vs EHR-style dictionaries.
    logging.info("generate instances for training ...")
    train_X = []
    train_Y = []
    for doc in train_data:
        if isMeddra_dict:
            temp_X, temp_Y = generate_instances(doc.entities, word_alphabet, dict_alphabet)
        else:
            temp_X, temp_Y = generate_instances_ehr(doc.entities, word_alphabet, dict_alphabet, dictionary_reverse)
        train_X.extend(temp_X)
        train_Y.extend(temp_Y)

    train_loader = DataLoader(MyDataset(train_X, train_Y), opt.batch_size, shuffle=True, collate_fn=my_collate)
    optimizer = optim.Adam(vsm_model.parameters(), lr=opt.lr, weight_decay=opt.l2)
    if opt.tune_wordemb == False:
        freeze_net(vsm_model.word_embedding)
    # Optional warm-up of the model on dictionary entries alone.
    if d.config['norm_vsm_pretrain'] == '1':
        dict_pretrain(dictionary, dictionary_reverse, d, isMeddra_dict, optimizer, vsm_model)

    best_dev_f = -10  # sentinel so any real F1 beats it
    best_dev_p = -10
    best_dev_r = -10
    bad_counter = 0  # epochs since last dev improvement (for early stop)

    logging.info("start training ...")
    for idx in range(opt.iter):
        epoch_start = time.time()
        vsm_model.train()
        train_iter = iter(train_loader)
        num_iter = len(train_loader)
        sum_loss = 0
        correct, total = 0, 0
        for i in range(num_iter):
            x, lengths, y = next(train_iter)
            l, y_pred = vsm_model.forward_train(x, lengths, y)
            sum_loss += l.item()
            l.backward()
            if opt.gradient_clip > 0:
                torch.nn.utils.clip_grad_norm_(vsm_model.parameters(), opt.gradient_clip)
            optimizer.step()
            # Gradients are cleared after the step rather than before backward.
            vsm_model.zero_grad()
            # Running training accuracy for logging.
            total += y.size(0)
            _, pred = torch.max(y_pred, 1)
            correct += (pred == y).sum().item()
        epoch_finish = time.time()
        accuracy = 100.0 * correct / total
        logging.info(
            "epoch: %s training finished. Time: %.2fs. loss: %.4f Accuracy %.2f" % (idx, epoch_finish - epoch_start, sum_loss / num_iter, accuracy))

        if opt.dev_file:
            p, r, f = norm_utils.evaluate(dev_data, dictionary, dictionary_reverse, vsm_model, None, None, d, isMeddra_dict)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            # No dev set: f == best_dev_f, so the save-branch below is never
            # taken; the model is saved once after the loop instead.
            f = best_dev_f
        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f" % (best_dev_f))
            if fold_idx is None:
                torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
            else:
                torch.save(
                    vsm_model,
                    os.path.join(opt.output, "vsm_{}.pkl".format(fold_idx + 1)))
            best_dev_f = f
            best_dev_p = p
            best_dev_r = r
            bad_counter = 0
        else:
            bad_counter += 1
        if len(opt.dev_file) != 0 and bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")
    # Without a dev set, save the final model unconditionally.
    if len(opt.dev_file) == 0:
        torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
    return best_dev_p, best_dev_r, best_dev_f
def __init__(self, data):
    """Build the relation-classification head over a word-sequence context of
    size data.HP_hidden_dim, combined with instance-level features (entity
    types, entity token embeddings, token distance, entity distance).

    Args:
        data: project-wide configuration/alphabet holder (re_feature_* tables,
            HP_hidden_dim, HP_gpu, pad_idx, average_batch_loss).
    """
    super(ClassifyModel, self).__init__()
    # Use the print function form for Python 2/3 compatibility
    # (the original used the Python-2-only print statement).
    print("build classify network...")
    self.gpu = data.HP_gpu
    self.average_batch = data.average_batch_loss

    def _random_init_embedding(alphabet_id):
        # Embedding over the given RE feature alphabet, padded at
        # data.pad_idx, with weights drawn from my_utils.random_embedding.
        vocab_size = data.re_feature_alphabets[alphabet_id].size()
        emb_dim = data.re_feature_emb_dims[alphabet_id]
        emb = nn.Embedding(vocab_size, emb_dim, data.pad_idx)
        emb.weight.data.copy_(
            torch.from_numpy(my_utils.random_embedding(vocab_size, emb_dim)))
        return emb

    relation_alphabet_id = data.re_feature_name2id['[RELATION]']
    label_size = data.re_feature_alphabet_sizes[relation_alphabet_id]
    # The context encoder is built elsewhere; kept for reference:
    # self.word_hidden = WordSequence(data, True, False, False)
    self.attn = DotAttentionLayer(data.HP_hidden_dim, self.gpu)

    # instance-level feature
    entity_type_alphabet_id = data.re_feature_name2id['[ENTITY_TYPE]']
    self.entity_type_emb = _random_init_embedding(entity_type_alphabet_id)

    entity_alphabet_id = data.re_feature_name2id['[ENTITY]']
    self.entity_emb = _random_init_embedding(entity_alphabet_id)
    # Attention pooling over the (multi-token) entity embeddings.
    self.dot_att = DotAttentionLayer(
        data.re_feature_emb_dims[entity_alphabet_id], data.HP_gpu)

    tok_num_alphabet_id = data.re_feature_name2id['[TOKEN_NUM]']
    self.tok_num_betw_emb = _random_init_embedding(tok_num_alphabet_id)

    et_num_alphabet_id = data.re_feature_name2id['[ENTITY_NUM]']
    self.et_num_emb = _random_init_embedding(et_num_alphabet_id)

    # Final feature = context + 2 entity-type embs + 2 entity embs
    #               + token-distance emb + entity-distance emb.
    self.input_size = (data.HP_hidden_dim
                       + 2 * data.re_feature_emb_dims[entity_type_alphabet_id]
                       + 2 * data.re_feature_emb_dims[entity_alphabet_id]
                       + data.re_feature_emb_dims[tok_num_alphabet_id]
                       + data.re_feature_emb_dims[et_num_alphabet_id])
    self.linear = nn.Linear(self.input_size, label_size, bias=False)
    # NOTE(review): size_average is deprecated in modern torch (use
    # reduction=); kept as-is for compatibility with the torch version
    # this project targets.
    self.loss_function = nn.NLLLoss(size_average=self.average_batch)
    self.frozen = False
def train(train_data, dev_data, test_data, d, dictionary, dictionary_reverse, opt, fold_idx, isMeddra_dict):
    """Train the ensemble normalization model.

    Two modes, selected by opt.ensemble:
      - 'learn': train a single learned Ensemble model (rule + vsm + neural
        combined with learned weights).
      - otherwise: train the VSM and neural normalizers independently; the
        rule-based sieve is initialized in both modes.

    Runs per-epoch dev evaluation, best-model checkpointing and early
    stopping.

    Returns:
        (best_dev_p, best_dev_r, best_dev_f) — best dev precision/recall/F1,
        or (-10, -10, -10) sentinels if no dev file was evaluated.
    """
    logging.info("train the ensemble normalization model ...")

    # Optionally extend the training set with an external corpus
    # (only the 'tac' FDA format is supported).
    external_train_data = []
    if d.config.get('norm_ext_corpus') is not None:
        for k, v in d.config['norm_ext_corpus'].items():
            if k == 'tac':
                external_train_data.extend(
                    load_data_fda(v['path'], True, v.get('types'), v.get('types'), False, True))
            else:
                raise RuntimeError("not support external corpus")
    if len(external_train_data) != 0:
        train_data.extend(external_train_data)  # NOTE: mutates the caller's list

    # Build and freeze the word alphabet from dictionary plus corpora.
    logging.info("build alphabet ...")
    word_alphabet = Alphabet('word')
    norm_utils.build_alphabet_from_dict(word_alphabet, dictionary, isMeddra_dict)
    norm_utils.build_alphabet(word_alphabet, train_data)
    if opt.dev_file:
        norm_utils.build_alphabet(word_alphabet, dev_data)
    if opt.test_file:
        norm_utils.build_alphabet(word_alphabet, test_data)
    norm_utils.fix_alphabet(word_alphabet)

    # Word embedding: pretrained if configured, otherwise random init.
    if d.config.get('norm_emb') is not None:
        logging.info("load pretrained word embedding ...")
        pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(
            d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim, False)
        word_embedding = nn.Embedding(word_alphabet.size(), word_emb_dim, padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(pretrain_word_embedding))
        embedding_dim = word_emb_dim
    else:
        logging.info("randomly initialize word embedding ...")
        word_embedding = nn.Embedding(word_alphabet.size(), d.word_emb_dim, padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(
                random_embedding(word_alphabet.size(), d.word_emb_dim)))
        embedding_dim = d.word_emb_dim

    dict_alphabet = Alphabet('dict')
    norm_utils.init_dict_alphabet(dict_alphabet, dictionary)
    norm_utils.fix_alphabet(dict_alphabet)

    # rule
    logging.info("init rule-based normer")
    multi_sieve.init(opt, train_data, d, dictionary, dictionary_reverse, isMeddra_dict)

    if opt.ensemble == 'learn':
        logging.info("init ensemble normer")
        poses = vsm.init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict)
        ensemble_model = Ensemble(word_alphabet, word_embedding, embedding_dim, dict_alphabet, poses)
        # NOTE(review): pretrain_neural_model / pretrain_vsm_model are not
        # parameters of this function — presumably module-level globals set
        # by a prior pretraining step; verify they are defined before this
        # branch runs.
        if pretrain_neural_model is not None:
            # Warm-start the ensemble's linear layers from pretrained models.
            ensemble_model.neural_linear.weight.data.copy_(
                pretrain_neural_model.linear.weight.data)
        if pretrain_vsm_model is not None:
            ensemble_model.vsm_linear.weight.data.copy_(
                pretrain_vsm_model.linear.weight.data)
        ensemble_train_X = []
        ensemble_train_Y = []
        for doc in train_data:
            temp_X, temp_Y = generate_instances(doc, word_alphabet, dict_alphabet, dictionary, dictionary_reverse, isMeddra_dict)
            ensemble_train_X.extend(temp_X)
            ensemble_train_Y.extend(temp_Y)
        ensemble_train_loader = DataLoader(MyDataset(ensemble_train_X, ensemble_train_Y), opt.batch_size, shuffle=True, collate_fn=my_collate)
        ensemble_optimizer = optim.Adam(ensemble_model.parameters(), lr=opt.lr, weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(ensemble_model.word_embedding)
    else:
        # vsm
        logging.info("init vsm-based normer")
        poses = vsm.init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict)
        # alphabet can share between vsm and neural since they don't change
        # but word_embedding cannot
        vsm_model = vsm.VsmNormer(word_alphabet, copy.deepcopy(word_embedding), embedding_dim, dict_alphabet, poses)
        vsm_train_X = []
        vsm_train_Y = []
        for doc in train_data:
            if isMeddra_dict:
                temp_X, temp_Y = vsm.generate_instances(
                    doc.entities, word_alphabet, dict_alphabet)
            else:
                temp_X, temp_Y = vsm.generate_instances_ehr(
                    doc.entities, word_alphabet, dict_alphabet, dictionary_reverse)
            vsm_train_X.extend(temp_X)
            vsm_train_Y.extend(temp_Y)
        vsm_train_loader = DataLoader(vsm.MyDataset(vsm_train_X, vsm_train_Y), opt.batch_size, shuffle=True, collate_fn=vsm.my_collate)
        vsm_optimizer = optim.Adam(vsm_model.parameters(), lr=opt.lr, weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(vsm_model.word_embedding)
        if d.config['norm_vsm_pretrain'] == '1':
            vsm.dict_pretrain(dictionary, dictionary_reverse, d, True, vsm_optimizer, vsm_model)

        # neural
        logging.info("init neural-based normer")
        neural_model = norm_neural.NeuralNormer(word_alphabet, copy.deepcopy(word_embedding), embedding_dim, dict_alphabet)
        neural_train_X = []
        neural_train_Y = []
        for doc in train_data:
            if isMeddra_dict:
                temp_X, temp_Y = norm_neural.generate_instances(
                    doc.entities, word_alphabet, dict_alphabet)
            else:
                temp_X, temp_Y = norm_neural.generate_instances_ehr(
                    doc.entities, word_alphabet, dict_alphabet, dictionary_reverse)
            neural_train_X.extend(temp_X)
            neural_train_Y.extend(temp_Y)
        neural_train_loader = DataLoader(norm_neural.MyDataset(neural_train_X, neural_train_Y), opt.batch_size, shuffle=True, collate_fn=norm_neural.my_collate)
        neural_optimizer = optim.Adam(neural_model.parameters(), lr=opt.lr, weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(neural_model.word_embedding)
        if d.config['norm_neural_pretrain'] == '1':
            neural_model.dict_pretrain(dictionary, dictionary_reverse, d, True, neural_optimizer, neural_model)

    best_dev_f = -10  # sentinel so any real F1 beats it
    best_dev_p = -10
    best_dev_r = -10
    bad_counter = 0  # epochs since last dev improvement (for early stop)

    logging.info("start training ...")
    for idx in range(opt.iter):
        epoch_start = time.time()
        if opt.ensemble == 'learn':
            ensemble_model.train()
            ensemble_train_iter = iter(ensemble_train_loader)
            ensemble_num_iter = len(ensemble_train_loader)
            for i in range(ensemble_num_iter):
                x, rules, lengths, y = next(ensemble_train_iter)
                y_pred = ensemble_model.forward(x, rules, lengths)
                l = ensemble_model.loss(y_pred, y)
                l.backward()
                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(ensemble_model.parameters(), opt.gradient_clip)
                ensemble_optimizer.step()
                # Gradients cleared after the step rather than before backward.
                ensemble_model.zero_grad()
        else:
            # Train the two sub-models independently within the same epoch.
            vsm_model.train()
            vsm_train_iter = iter(vsm_train_loader)
            vsm_num_iter = len(vsm_train_loader)
            for i in range(vsm_num_iter):
                x, lengths, y = next(vsm_train_iter)
                l, _ = vsm_model.forward_train(x, lengths, y)
                l.backward()
                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(vsm_model.parameters(), opt.gradient_clip)
                vsm_optimizer.step()
                vsm_model.zero_grad()

            neural_model.train()
            neural_train_iter = iter(neural_train_loader)
            neural_num_iter = len(neural_train_loader)
            for i in range(neural_num_iter):
                x, lengths, y = next(neural_train_iter)
                y_pred = neural_model.forward(x, lengths)
                l = neural_model.loss(y_pred, y)
                l.backward()
                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(neural_model.parameters(), opt.gradient_clip)
                neural_optimizer.step()
                neural_model.zero_grad()
        epoch_finish = time.time()
        logging.info("epoch: %s training finished. Time: %.2fs" % (idx, epoch_finish - epoch_start))

        if opt.dev_file:
            if opt.ensemble == 'learn':
                # logging.info("weight w1: %.4f, w2: %.4f, w3: %.4f" % (ensemble_model.w1.data.item(), ensemble_model.w2.data.item(), ensemble_model.w3.data.item()))
                p, r, f = norm_utils.evaluate(dev_data, dictionary, dictionary_reverse, None, None, ensemble_model, d, isMeddra_dict)
            else:
                p, r, f = norm_utils.evaluate(dev_data, dictionary, dictionary_reverse, vsm_model, neural_model, None, d, isMeddra_dict)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            # No dev set: f == best_dev_f, so the save-branch below is never
            # taken; models are saved once after the loop instead.
            f = best_dev_f

        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f" % (best_dev_f))
            # Checkpoint whichever model(s) this mode trains.
            if opt.ensemble == 'learn':
                if fold_idx is None:
                    torch.save(ensemble_model, os.path.join(opt.output, "ensemble.pkl"))
                else:
                    torch.save(
                        ensemble_model,
                        os.path.join(opt.output, "ensemble_{}.pkl".format(fold_idx + 1)))
            else:
                if fold_idx is None:
                    torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
                    torch.save(neural_model, os.path.join(opt.output, "norm_neural.pkl"))
                else:
                    torch.save(
                        vsm_model,
                        os.path.join(opt.output, "vsm_{}.pkl".format(fold_idx + 1)))
                    torch.save(
                        neural_model,
                        os.path.join(opt.output, "norm_neural_{}.pkl".format(fold_idx + 1)))
            best_dev_f = f
            best_dev_p = p
            best_dev_r = r
            bad_counter = 0
        else:
            bad_counter += 1
        if len(opt.dev_file) != 0 and bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")

    # Release the rule-based sieve; fully finalize on the last CV fold only.
    if fold_idx is None:
        multi_sieve.finalize(True)
    else:
        if fold_idx == opt.cross_validation - 1:
            multi_sieve.finalize(True)
        else:
            multi_sieve.finalize(False)

    # Without a dev set, save the final model(s) unconditionally.
    if len(opt.dev_file) == 0:
        if opt.ensemble == 'learn':
            torch.save(ensemble_model, os.path.join(opt.output, "ensemble.pkl"))
        else:
            torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
            torch.save(neural_model, os.path.join(opt.output, "norm_neural.pkl"))

    return best_dev_p, best_dev_r, best_dev_f
def train(train_data, dev_data, d, meddra_dict, opt, fold_idx):
    """Train (set up and evaluate) the VSM-based normalization model against
    a MedDRA dictionary.

    Unlike the iterative trainers in this project, this variant builds the
    model's alphabet/embedding state, evaluates once on dev (if given), and
    saves the model — there is no epoch loop.

    Returns:
        (best_dev_p, best_dev_r, best_dev_f) — dev precision/recall/F1, or
        (-10, -10, -10) sentinels if no dev file was evaluated.
    """
    logging.info("train the vsm-based normalization model ...")

    # Optionally extend the training set with an external corpus
    # (only the 'tac' FDA format is supported).
    external_train_data = []
    if d.config.get('norm_ext_corpus') is not None:
        for k, v in d.config['norm_ext_corpus'].items():
            if k == 'tac':
                external_train_data.extend(
                    load_data_fda(v['path'], True, v.get('types'), v.get('types'), False, True))
            else:
                raise RuntimeError("not support external corpus")
    if len(external_train_data) != 0:
        train_data.extend(external_train_data)  # NOTE: mutates the caller's list

    # This VsmNormer owns its alphabets; build them in place and freeze.
    vsm_model = VsmNormer()
    logging.info("build alphabet ...")
    norm_utils.build_alphabet(vsm_model.word_alphabet, train_data)
    if opt.dev_file:
        norm_utils.build_alphabet(vsm_model.word_alphabet, dev_data)
    norm_utils.build_alphabet_from_dict(vsm_model.word_alphabet, meddra_dict)
    norm_utils.fix_alphabet(vsm_model.word_alphabet)

    # Word embedding: pretrained if configured, otherwise random init.
    # NOTE(review): no padding_idx here, unlike the other trainers — confirm
    # whether that is intentional.
    if d.config.get('norm_emb') is not None:
        logging.info("load pretrained word embedding ...")
        pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(
            d.config.get('norm_emb'), vsm_model.word_alphabet, opt.word_emb_dim, False)
        vsm_model.word_embedding = nn.Embedding(vsm_model.word_alphabet.size(), word_emb_dim)
        vsm_model.word_embedding.weight.data.copy_(
            torch.from_numpy(pretrain_word_embedding))
        vsm_model.embedding_dim = word_emb_dim
    else:
        logging.info("randomly initialize word embedding ...")
        vsm_model.word_embedding = nn.Embedding(vsm_model.word_alphabet.size(), d.word_emb_dim)
        vsm_model.word_embedding.weight.data.copy_(
            torch.from_numpy(
                random_embedding(vsm_model.word_alphabet.size(), d.word_emb_dim)))
        vsm_model.embedding_dim = d.word_emb_dim

    if torch.cuda.is_available():
        vsm_model.word_embedding = vsm_model.word_embedding.cuda(vsm_model.gpu)

    # Precompute dictionary-concept vectors, then freeze the dict alphabet.
    logging.info("init_vector_for_dict")
    vsm_model.init_vector_for_dict(meddra_dict)
    norm_utils.fix_alphabet(vsm_model.dict_alphabet)

    vsm_model.train()

    best_dev_f = -10  # sentinels so any real F1 beats them
    best_dev_p = -10
    best_dev_r = -10

    if opt.dev_file:
        p, r, f = norm_utils.evaluate(dev_data, meddra_dict, vsm_model)
        logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
    else:
        # No dev set: f == best_dev_f, so the save-branch below is skipped;
        # the model is saved unconditionally at the end instead.
        f = best_dev_f
    if f > best_dev_f:
        logging.info("Exceed previous best f score on dev: %.4f" % (best_dev_f))
        if fold_idx is None:
            logging.info("save model to {}".format(
                os.path.join(opt.output, "vsm.pkl")))
            torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
        else:
            logging.info("save model to {}".format(
                os.path.join(opt.output, "vsm_{}.pkl".format(fold_idx + 1))))
            torch.save(
                vsm_model,
                os.path.join(opt.output, "vsm_{}.pkl".format(fold_idx + 1)))
        best_dev_f = f
        best_dev_p = p
        best_dev_r = r

    logging.info("train finished")

    if len(opt.dev_file) == 0:
        logging.info("save model to {}".format(
            os.path.join(opt.output, "vsm.pkl")))
        torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
    return best_dev_p, best_dev_r, best_dev_f
d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim, False) word_embedding = nn.Embedding(word_alphabet.size(), word_emb_dim, padding_idx=0) word_embedding.weight.data.copy_( torch.from_numpy(pretrain_word_embedding)) embedding_dim = word_emb_dim else: logging.info("randomly initialize word embedding ...") word_embedding = nn.Embedding(word_alphabet.size(), d.word_emb_dim, padding_idx=0) word_embedding.weight.data.copy_( torch.from_numpy( random_embedding(word_alphabet.size(), d.word_emb_dim))) embedding_dim = d.word_emb_dim vsm_model = VsmNormer(word_alphabet, word_embedding, embedding_dim) # generate data points instances_train = generate_instances(word_alphabet, datapoints_train) instances_test = generate_instances(word_alphabet, datapoints_test) # batch size always 1 train_loader = DataLoader(MyDataset(instances_train), 1, shuffle=True, collate_fn=my_collate) test_loader = DataLoader(MyDataset(instances_test), 1,