def process_one_doc(self, doc, entities, dictionary, dictionary_reverse):
    # Normalize every entity in one document: batch the generated instances,
    # run the model, and take the argmax concept for each entity.
    Xs, Ys = generate_instances_ehr(entities, self.dict_alphabet, dictionary_reverse)
    data_loader = DataLoader(MyDataset(Xs, Ys), opt.batch_size, shuffle=False, collate_fn=my_collate)
    data_iter = iter(data_loader)
    num_iter = len(data_loader)
    entity_start = 0
    for i in range(num_iter):
        x, mask, sentences, _, _, tokens_ent, mask_ent = next(data_iter)
        _, y_pred = self.forward(x, sentences, mask, tokens_ent, mask_ent)
        values, indices = torch.max(y_pred, 1)
        actual_batch_size = x.size(0)
        for batch_idx in range(actual_batch_size):
            entity = entities[entity_start + batch_idx]
            norm_id = get_dict_name(self.dict_alphabet, indices[batch_idx].item())
            concept = dictionary[norm_id]
            entity.norm_ids.append(norm_id)
            entity.norm_names.append(concept.names)
        entity_start += actual_batch_size
def process_one_doc(self, doc, entities, dictionary, dictionary_reverse, isMeddra_dict):
    # Variant with word and rule features. isMeddra_dict switches between the
    # MedDRA dictionary (id -> name string) and a concept dictionary whose
    # entries carry a list of names.
    Xs, Ys = generate_instances(doc, self.word_alphabet, self.dict_alphabet, dictionary, dictionary_reverse, isMeddra_dict)
    data_loader = DataLoader(MyDataset(Xs, Ys), opt.batch_size, shuffle=False, collate_fn=my_collate)
    data_iter = iter(data_loader)
    num_iter = len(data_loader)
    entity_start = 0
    for i in range(num_iter):
        words, rules, lengths, _ = next(data_iter)
        y_pred = self.forward(words, rules, lengths)
        values, indices = torch.max(y_pred, 1)
        actual_batch_size = lengths.size(0)
        for batch_idx in range(actual_batch_size):
            entity = entities[entity_start + batch_idx]
            norm_id = norm_utils.get_dict_name(self.dict_alphabet, indices[batch_idx].item())
            if isMeddra_dict:
                name = dictionary[norm_id]
                entity.norm_ids.append(norm_id)
                entity.norm_names.append(name)
            else:
                concept = dictionary[norm_id]
                entity.norm_ids.append(norm_id)
                entity.norm_names.append(concept.names)
        entity_start += actual_batch_size
def process_one_doc(self, doc, entities, dictionary, dictionary_reverse, isMeddra_dict):
    # Neural variant that also records confidences: the full probability
    # vector when ensembling by 'sum', otherwise the argmax probability.
    if isMeddra_dict:
        Xs, Ys = generate_instances(entities, self.word_alphabet, self.dict_alphabet)
    else:
        Xs, Ys = generate_instances_ehr(entities, self.word_alphabet, self.dict_alphabet, dictionary_reverse)
    data_loader = DataLoader(MyDataset(Xs, Ys), opt.batch_size, shuffle=False, collate_fn=my_collate)
    data_iter = iter(data_loader)
    num_iter = len(data_loader)
    entity_start = 0
    for i in range(num_iter):
        x, lengths, _ = next(data_iter)
        y_pred = self.forward(x, lengths)
        y_pred = self.normalize(y_pred)
        values, indices = torch.max(y_pred, 1)
        actual_batch_size = lengths.size(0)
        for batch_idx in range(actual_batch_size):
            entity = entities[entity_start + batch_idx]
            norm_id = norm_utils.get_dict_name(self.dict_alphabet, indices[batch_idx].item())
            if isMeddra_dict:
                name = dictionary[norm_id]
                entity.norm_ids.append(norm_id)
                entity.norm_names.append(name)
            else:
                concept = dictionary[norm_id]
                entity.norm_ids.append(norm_id)
                entity.norm_names.append(concept.names)
            if opt.ensemble == 'sum':
                entity.norm_confidences.append(y_pred[batch_idx].detach().cpu().numpy())
            else:
                entity.norm_confidences.append(values[batch_idx].item())
            entity.neural_id = norm_id
        entity_start += actual_batch_size
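# The three process_one_doc variants above all wrap their instances in
# MyDataset before batching. A minimal sketch of such a wrapper, assuming Xs
# and Ys are parallel lists (the repo's actual class may carry more state):
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, Xs, Ys):
        assert len(Xs) == len(Ys)
        self.Xs = Xs
        self.Ys = Ys

    def __len__(self):
        return len(self.Xs)

    def __getitem__(self, idx):
        # Each item is one (instance, label) pair; my_collate batches them.
        return self.Xs[idx], self.Ys[idx]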
def init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict):
    # Build a padded word-id tensor covering every concept name in the
    # dictionary (the first listed name when a concept has several),
    # together with the unpadded lengths.
    poses = []
    poses_lengths = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)
    max_len = 0
    for i in range(dict_size):
        if isMeddra_dict:
            concept_name = dictionary[norm_utils.get_dict_name(dict_alphabet, i)]
            tokens = my_tokenize(concept_name)
        else:
            concept = dictionary[norm_utils.get_dict_name(dict_alphabet, i)]
            tokens = my_tokenize(concept.names[0])
        pos = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            pos.append(word_id)
        if len(pos) > max_len:
            max_len = len(pos)
        poses.append(pos)
        poses_lengths.append(len(pos))
    poses = pad_sequence(poses, max_len)
    poses_lengths = torch.LongTensor(poses_lengths)
    if opt.gpu >= 0 and torch.cuda.is_available():
        poses = poses.cuda(opt.gpu)
        poses_lengths = poses_lengths.cuda(opt.gpu)
    return poses, poses_lengths
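# init_vector_for_dict relies on a pad_sequence helper that right-pads
# variable-length token-id lists to a common length and stacks them into one
# LongTensor. A minimal sketch, assuming pad id 0 (the repo's actual helper
# may differ):
import torch

def pad_sequence(seqs, max_len, pad_id=0):
    # seqs: list of lists of word ids; returns a (len(seqs), max_len) tensor.
    padded = [seq + [pad_id] * (max_len - len(seq)) for seq in seqs]
    return torch.LongTensor(padded)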
def process_one_doc(self, doc, entities, dictionary):
    # Score each entity mention individually (no batching) and record the
    # top concept with its confidence.
    for entity in entities:
        with torch.no_grad():
            tokens_id = self.batch_name_to_ids(entity.name)
            values, indices = self.forward(tokens_id)
            norm_id = norm_utils.get_dict_name(self.dict_alphabet, indices.item())
            name = dictionary[norm_id]
            entity.norm_ids.append(norm_id)
            entity.norm_names.append(name)
            entity.norm_confidences.append(values.item())
def merge_result(entities1, entities2, entities3, merge_entities, dictionary, isMeddra_dict, dict_alphabet, d):
    # Combine the rule-based (entities1), VSM (entities2), and neural
    # (entities3) predictions for each mention.
    if opt.ensemble == 'vote':
        for idx, merge_entity in enumerate(merge_entities):
            entity1 = entities1[idx]
            entity2 = entities2[idx]
            entity3 = entities3[idx]
            if entity1.rule_id is None:
                # No rule hit: take the VSM prediction whether or not it
                # agrees with the neural one (vsm is prior to others).
                merge_entity.norm_ids.append(entity2.norm_ids[0])
                merge_entity.norm_names.append(entity2.norm_names[0])
            else:
                # All three models voted: take the majority id.
                id_and_ticket = Counter()
                id_and_ticket[entity1.norm_ids[0]] += 1
                id_and_ticket[entity2.norm_ids[0]] += 1
                id_and_ticket[entity3.norm_ids[0]] += 1
                temp_id_name = {
                    entity1.norm_ids[0]: entity1.norm_names[0],
                    entity2.norm_ids[0]: entity2.norm_names[0],
                    entity3.norm_ids[0]: entity3.norm_names[0],
                }
                top_id, top_ct = id_and_ticket.most_common(1)[0]
                if top_ct == 1:
                    # Three-way tie: fall back to the VSM prediction
                    # (the confidence of a rule hit is always 1).
                    merge_entity.norm_ids.append(entity2.norm_ids[0])
                    merge_entity.norm_names.append(entity2.norm_names[0])
                else:
                    merge_entity.norm_ids.append(top_id)
                    merge_entity.norm_names.append(temp_id_name[top_id])
    elif opt.ensemble == 'sum':
        for idx, merge_entity in enumerate(merge_entities):
            entity1 = entities1[idx]
            entity2 = entities2[idx]
            entity3 = entities3[idx]
            # Weighted sum of the models' probability vectors; the weight set
            # depends on whether a rule fired ('norm_ensumble_sum_weight' is
            # the key's spelling in the config file).
            if entity1.rule_id is None:
                total = float(d.config['norm_ensumble_sum_weight']['1']['w2']) * entity2.norm_confidences[0] \
                    + float(d.config['norm_ensumble_sum_weight']['1']['w3']) * entity3.norm_confidences[0]
            else:
                total = float(d.config['norm_ensumble_sum_weight']['2']['w1']) * entity1.norm_confidences[0] \
                    + float(d.config['norm_ensumble_sum_weight']['2']['w2']) * entity2.norm_confidences[0] \
                    + float(d.config['norm_ensumble_sum_weight']['2']['w3']) * entity3.norm_confidences[0]
            index = total.argmax()
            norm_id = norm_utils.get_dict_name(dict_alphabet, index)
            if isMeddra_dict:
                name = dictionary[norm_id]
                merge_entity.norm_ids.append(norm_id)
                merge_entity.norm_names.append(name)
            else:
                concept = dictionary[norm_id]
                merge_entity.norm_ids.append(norm_id)
                merge_entity.norm_names.append(concept.names)
    else:
        raise RuntimeError("unknown ensemble configuration: %s" % opt.ensemble)
    return merge_entities
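# In the 'vote' branch above, when all three models disagree each candidate id
# gets exactly one vote, so most_common(1) returns a count of 1 and the code
# falls back to the VSM prediction. A small self-contained demo of that tie
# case (the concept ids here are made up):
from collections import Counter

votes = Counter(["C001", "C002", "C003"])  # rule, vsm, neural all differ
top_id, top_ct = votes.most_common(1)[0]
assert top_ct == 1  # no majority -> take the VSM prediction instead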