Ejemplo n.º 1
0
    def process_one_doc(self, doc, entities, dictionary, dictionary_reverse):
        """Predict one dictionary concept per entity and record it on the entity.

        Instances are built from ``entities`` with ``generate_instances_ehr``,
        fed through ``self.forward`` in batches of ``opt.batch_size``, and the
        highest-scoring column of each output row is translated to a
        dictionary id with ``get_dict_name``.

        Args:
            doc: Not used by this method.
            entities: Indexable collection of entities. Each processed entity
                gets one id appended to ``norm_ids`` and the matching
                concept's ``names`` appended to ``norm_names``.
            dictionary: Mapping from dictionary id to a concept object that
                exposes a ``names`` attribute.
            dictionary_reverse: Passed through to ``generate_instances_ehr``.

        Returns:
            None. ``entities`` is modified in place.
        """
        Xs, Ys = generate_instances_ehr(entities, self.dict_alphabet,
                                        dictionary_reverse)

        # shuffle=False keeps batch order stable, which the positional
        # entity lookup below relies on.
        data_loader = DataLoader(MyDataset(Xs, Ys),
                                 opt.batch_size,
                                 shuffle=False,
                                 collate_fn=my_collate)

        # Offset of the current batch's first row inside ``entities``.
        entity_start = 0

        # Pure inference: no tensor produced here is ever back-propagated,
        # so skip autograd bookkeeping.
        with torch.no_grad():
            for x, mask, sentences, _, _, tokens_ent, mask_ent in data_loader:

                _, y_pred = self.forward(x, sentences, mask, tokens_ent,
                                         mask_ent)

                # Index of the best-scoring dictionary entry for every row.
                _, indices = torch.max(y_pred, 1)

                actual_batch_size = x.size(0)

                for batch_idx in range(actual_batch_size):
                    # NOTE(review): assumes the i-th generated instance
                    # belongs to entities[i] -- confirm against
                    # generate_instances_ehr.
                    entity = entities[entity_start + batch_idx]
                    norm_id = get_dict_name(self.dict_alphabet,
                                            indices[batch_idx].item())

                    concept = dictionary[norm_id]
                    entity.norm_ids.append(norm_id)
                    entity.norm_names.append(concept.names)

                entity_start += actual_batch_size
Ejemplo n.º 2
0
    def process_one_doc(self, doc, entities, dictionary, dictionary_reverse, isMeddra_dict):
        """Predict one dictionary concept per entity and record it on the entity.

        Instances are built from ``doc`` with ``generate_instances``, fed
        through ``self.forward`` in batches of ``opt.batch_size``, and the
        highest-scoring column of each output row is translated to a
        dictionary id with ``norm_utils.get_dict_name``.

        Args:
            doc: Document handed to ``generate_instances``.
            entities: Indexable collection of entities. Each processed entity
                gets one id appended to ``norm_ids`` and one value appended to
                ``norm_names``.
            dictionary: Mapping from dictionary id to either the value stored
                directly as the name (``isMeddra_dict`` true) or a concept
                object exposing ``names`` (``isMeddra_dict`` false).
            dictionary_reverse: Passed through to ``generate_instances``.
            isMeddra_dict: Selects how a ``dictionary`` value is turned into
                the entry appended to ``norm_names``.

        Returns:
            None. ``entities`` is modified in place.
        """
        Xs, Ys = generate_instances(doc, self.word_alphabet, self.dict_alphabet, dictionary, dictionary_reverse, isMeddra_dict)

        # shuffle=False keeps batch order stable, which the positional
        # entity lookup below relies on.
        data_loader = DataLoader(MyDataset(Xs, Ys), opt.batch_size, shuffle=False, collate_fn=my_collate)

        # Offset of the current batch's first row inside ``entities``.
        entity_start = 0

        # Pure inference: no tensor produced here is ever back-propagated,
        # so skip autograd bookkeeping.
        with torch.no_grad():
            for words, rules, lengths, _ in data_loader:

                y_pred = self.forward(words, rules, lengths)

                # Index of the best-scoring dictionary entry for every row.
                _, indices = torch.max(y_pred, 1)

                actual_batch_size = lengths.size(0)

                for batch_idx in range(actual_batch_size):
                    # NOTE(review): assumes the i-th generated instance
                    # belongs to entities[i] -- confirm against
                    # generate_instances.
                    entity = entities[entity_start + batch_idx]
                    norm_id = norm_utils.get_dict_name(self.dict_alphabet, indices[batch_idx].item())

                    # Both dictionary flavours share the lookup and the id
                    # append; only the recorded name differs.
                    entry = dictionary[norm_id]
                    entity.norm_ids.append(norm_id)
                    entity.norm_names.append(entry if isMeddra_dict else entry.names)

                entity_start += actual_batch_size
Ejemplo n.º 3
0
    def process_one_doc(self, doc, entities, dictionary, dictionary_reverse,
                        isMeddra_dict):
        """Predict one dictionary concept per entity and record id, name and score.

        Instances are built from ``entities`` (``generate_instances`` when
        ``isMeddra_dict`` is true, ``generate_instances_ehr`` otherwise), fed
        through ``self.forward`` and ``self.normalize`` in batches of
        ``opt.batch_size``, and the highest-scoring column of each row is
        translated to a dictionary id with ``norm_utils.get_dict_name``.

        Args:
            doc: Not used by this method.
            entities: Indexable collection of entities. Each processed entity
                gets entries appended to ``norm_ids``, ``norm_names`` and
                ``norm_confidences``, and its ``neural_id`` set to the
                predicted id.
            dictionary: Mapping from dictionary id to either the value stored
                directly as the name (``isMeddra_dict`` true) or a concept
                object exposing ``names`` (``isMeddra_dict`` false).
            dictionary_reverse: Passed to ``generate_instances_ehr`` only.
            isMeddra_dict: Selects the instance generator and how a
                ``dictionary`` value is turned into the recorded name.

        Returns:
            None. ``entities`` is modified in place.
        """
        if isMeddra_dict:
            Xs, Ys = generate_instances(entities, self.word_alphabet,
                                        self.dict_alphabet)
        else:
            Xs, Ys = generate_instances_ehr(entities, self.word_alphabet,
                                            self.dict_alphabet,
                                            dictionary_reverse)

        # shuffle=False keeps batch order stable, which the positional
        # entity lookup below relies on.
        data_loader = DataLoader(MyDataset(Xs, Ys),
                                 opt.batch_size,
                                 shuffle=False,
                                 collate_fn=my_collate)

        # Offset of the current batch's first row inside ``entities``.
        entity_start = 0

        # Pure inference: the only tensor that leaves this method is
        # detached below, so skip autograd bookkeeping.
        with torch.no_grad():
            for x, lengths, _ in data_loader:

                y_pred = self.forward(x, lengths)

                y_pred = self.normalize(y_pred)

                # Best score and its dictionary index for every row.
                values, indices = torch.max(y_pred, 1)

                actual_batch_size = lengths.size(0)

                for batch_idx in range(actual_batch_size):
                    # NOTE(review): assumes the i-th generated instance
                    # belongs to entities[i] -- confirm against the
                    # instance generators.
                    entity = entities[entity_start + batch_idx]
                    norm_id = norm_utils.get_dict_name(
                        self.dict_alphabet, indices[batch_idx].item())

                    # Both dictionary flavours share the lookup and the id
                    # append; only the recorded name differs.
                    entry = dictionary[norm_id]
                    entity.norm_ids.append(norm_id)
                    entity.norm_names.append(
                        entry if isMeddra_dict else entry.names)

                    if opt.ensemble == 'sum':
                        # 'sum' ensembling needs the whole score row, not
                        # just the winning score.
                        entity.norm_confidences.append(
                            y_pred[batch_idx].detach().cpu().numpy())
                    else:
                        entity.norm_confidences.append(
                            values[batch_idx].item())

                    entity.neural_id = norm_id

                entity_start += actual_batch_size
Ejemplo n.º 4
0
def init_vector_for_dict(word_alphabet, dict_alphabet, dictionary,
                         isMeddra_dict):
    """Encode every dictionary entry's name as a padded sequence of word ids.

    For each index in ``dict_alphabet`` the entry's name is tokenized with
    ``my_tokenize``, each token is run through ``norm_utils.word_preprocess``
    and mapped to an id via ``word_alphabet.get_index``.

    Args:
        word_alphabet: Object providing ``get_index(token)``.
        dict_alphabet: Alphabet queried through ``norm_utils.get_dict_size``
            and ``norm_utils.get_dict_name``.
        dictionary: Mapping from dictionary id to the name itself
            (``isMeddra_dict`` true) or to a concept whose ``names[0]`` is
            used (``isMeddra_dict`` false).
        isMeddra_dict: Selects how the name is read from ``dictionary``.

    Returns:
        A pair ``(poses, poses_lengths)``: the result of
        ``pad_sequence(rows, longest_row_length)`` and a ``torch.LongTensor``
        of the unpadded row lengths. Both are moved to CUDA device
        ``opt.gpu`` when ``opt.gpu >= 0`` and CUDA is available.
    """
    id_rows = []
    for entry_idx in range(norm_utils.get_dict_size(dict_alphabet)):
        entry = dictionary[norm_utils.get_dict_name(dict_alphabet, entry_idx)]
        # MedDRA-style dictionaries store the name directly; otherwise the
        # first name of the concept is used.
        surface = entry if isMeddra_dict else entry.names[0]
        id_rows.append([
            word_alphabet.get_index(norm_utils.word_preprocess(piece))
            for piece in my_tokenize(surface)
        ])

    row_lengths = [len(row) for row in id_rows]
    longest = max(row_lengths, default=0)

    poses = pad_sequence(id_rows, longest)
    poses_lengths = torch.LongTensor(row_lengths)

    use_cuda = opt.gpu >= 0 and torch.cuda.is_available()
    if use_cuda:
        poses = poses.cuda(opt.gpu)
        poses_lengths = poses_lengths.cuda(opt.gpu)

    return poses, poses_lengths
Ejemplo n.º 5
0
    def process_one_doc(self, doc, entities, dict):
        """Normalize each entity on its own and record id, name and score.

        Every entity's ``name`` is converted with ``self.batch_name_to_ids``
        and passed to ``self.forward``, which yields a score and a dictionary
        index; the index is translated with ``norm_utils.get_dict_name``.

        Args:
            doc: Not used by this method.
            entities: Iterable of entities. Each gets one entry appended to
                ``norm_ids``, ``norm_names`` and ``norm_confidences``.
            dict: Mapping from dictionary id to the name that is recorded.

        Returns:
            None. ``entities`` is modified in place.
        """
        # Gradient tracking is disabled for the whole pass.
        with torch.no_grad():
            for mention in entities:
                name_ids = self.batch_name_to_ids(mention.name)

                score, best_index = self.forward(name_ids)

                concept_id = norm_utils.get_dict_name(self.dict_alphabet,
                                                      best_index.item())
                # Resolve the name before touching the entity so a failed
                # lookup leaves it unchanged.
                concept_name = dict[concept_id]

                mention.norm_ids.append(concept_id)
                mention.norm_names.append(concept_name)
                mention.norm_confidences.append(score.item())
Ejemplo n.º 6
0
def _merge_by_vote(entity1, entity2, entity3, merge_entity):
    """Append the voted id and name of one entity triple to ``merge_entity``."""
    if entity1.rule_id is not None:
        # The rule system produced an answer, so all three systems vote.
        ids = (entity1.norm_ids[0], entity2.norm_ids[0], entity3.norm_ids[0])
        names = (entity1.norm_names[0], entity2.norm_names[0],
                 entity3.norm_names[0])

        top_id, top_ct = Counter(ids).most_common(1)[0]
        if top_ct > 1:
            # At least two systems agree; for a shared id the name of the
            # later system wins.
            merge_entity.norm_ids.append(top_id)
            merge_entity.norm_names.append(dict(zip(ids, names))[top_id])
            return

    # Either there is no rule answer or all three systems disagree: the
    # second system (vsm) has priority over the others.
    merge_entity.norm_ids.append(entity2.norm_ids[0])
    merge_entity.norm_names.append(entity2.norm_names[0])


def _merge_by_sum(entity1, entity2, entity3, merge_entity, dictionary,
                  isMeddra_dict, dict_alphabet, d):
    """Append the id and name with the highest weighted score sum to ``merge_entity``."""
    if entity1.rule_id is None:
        # No rule answer: combine only the second and third systems.
        weights = d.config['norm_ensumble_sum_weight']['1']
        total = float(weights['w2']) * entity2.norm_confidences[0] + \
                float(weights['w3']) * entity3.norm_confidences[0]
    else:
        weights = d.config['norm_ensumble_sum_weight']['2']
        total = float(weights['w1']) * entity1.norm_confidences[0] + \
                float(weights['w2']) * entity2.norm_confidences[0] + \
                float(weights['w3']) * entity3.norm_confidences[0]

    # NOTE(review): relies on norm_confidences[0] being array-like with
    # .argmax() (one score per dictionary entry) -- confirm against producers.
    index = total.argmax()
    norm_id = norm_utils.get_dict_name(dict_alphabet, index)

    entry = dictionary[norm_id]
    merge_entity.norm_ids.append(norm_id)
    merge_entity.norm_names.append(entry if isMeddra_dict else entry.names)


def merge_result(entities1, entities2, entities3, merge_entities, dictionary,
                 isMeddra_dict, dict_alphabet, d):
    """Combine the predictions of three systems into ``merge_entities``.

    The strategy is selected by ``opt.ensemble``:

    * ``'vote'``: when ``entity1.rule_id`` is set, the id predicted by at
      least two of the three systems wins; in every other case the second
      system's first prediction is used.
    * ``'sum'``: the first confidence entries of the systems are combined
      with the weights in ``d.config['norm_ensumble_sum_weight']`` (group
      ``'1'`` without a rule answer, group ``'2'`` with one) and the
      arg-max index is translated with ``norm_utils.get_dict_name``.

    Args:
        entities1: Predictions of the first system; ``rule_id`` is read
            from each element.
        entities2: Predictions of the second system.
        entities3: Predictions of the third system.
        merge_entities: Entities receiving the merged result; one id and one
            name are appended to each. The three prediction sequences are
            indexed by position in this sequence.
        dictionary: Mapping from dictionary id to a name
            (``isMeddra_dict`` true) or a concept exposing ``names``.
            Only used for ``'sum'``.
        isMeddra_dict: Selects how a ``dictionary`` value becomes a name.
        dict_alphabet: Alphabet passed to ``norm_utils.get_dict_name``.
        d: Object whose ``config`` holds the ``'sum'`` weights.

    Returns:
        ``merge_entities``, modified in place.

    Raises:
        RuntimeError: If ``opt.ensemble`` is neither ``'vote'`` nor ``'sum'``.
    """
    if opt.ensemble == 'vote':
        for idx, merge_entity in enumerate(merge_entities):
            _merge_by_vote(entities1[idx], entities2[idx], entities3[idx],
                           merge_entity)

    elif opt.ensemble == 'sum':
        for idx, merge_entity in enumerate(merge_entities):
            _merge_by_sum(entities1[idx], entities2[idx], entities3[idx],
                          merge_entity, dictionary, isMeddra_dict,
                          dict_alphabet, d)

    else:
        raise RuntimeError(
            "unsupported opt.ensemble value {!r}; expected 'vote' or "
            "'sum'".format(opt.ensemble))

    return merge_entities