Example #1
def generate_dict_instances(dictionary, dict_alphabet, word_alphabet, isMeddra_dict):
    Xs = []
    Ys = []

    if isMeddra_dict:
        for concept_id, concept_name in dictionary.items():

            Y = norm_utils.get_dict_index(dict_alphabet, concept_id)
            if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                continue
            Ys.append(Y)

            tokens = my_tokenize(concept_name)
            word_ids = []
            for token in tokens:
                if token in stop_word:
                    continue
                token = norm_utils.word_preprocess(token)
                word_id = word_alphabet.get_index(token)
                word_ids.append(word_id)

            Xs.append(word_ids)
    else:
        for concept_id, concept in dictionary.items():
            Y = norm_utils.get_dict_index(dict_alphabet, concept_id)
            if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                continue

            # Only the first name of each concept is used; iterating over
            # concept.names instead would generate one instance per synonym.
            tokens = my_tokenize(concept.names[0])
            word_ids = []
            for token in tokens:
                if token in stop_word:
                    continue
                token = norm_utils.word_preprocess(token)
                word_id = word_alphabet.get_index(token)
                word_ids.append(word_id)

            Ys.append(Y)
            Xs.append(word_ids)


    return Xs, Ys
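
A minimal usage sketch with hypothetical data (dict_alphabet and word_alphabet are assumed to be alphabet instances built elsewhere in the project; norm_utils, my_tokenize, and stop_word come from its surrounding modules):

# Hypothetical MedDRA-style dictionary: concept id -> preferred name.
meddra_dict = {"10000081": "Abdominal pain", "10002026": "Anaphylactic shock"}
Xs, Ys = generate_dict_instances(meddra_dict, dict_alphabet, word_alphabet,
                                 isMeddra_dict=True)
# Xs[i] is the list of word ids for one concept name; Ys[i] is its label index.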
Example #2
def generate_instances(entities, word_alphabet, dict_alphabet):
    Xs = []
    Ys = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)

    for entity in entities:
        if len(entity.norm_ids) > 0:
            Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
            if not (0 <= Y < dict_size):
                continue
        else:
            Y = 0

        # mention
        tokens = my_tokenize(entity.name)
        mention = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            mention.append(word_id)

        Xs.append(mention)
        Ys.append(Y)

    return Xs, Ys
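
The returned Xs are ragged lists of word ids, so a model usually needs them padded into a fixed-shape batch first. A minimal sketch, assuming PyTorch and a padding id of 0 (pad_batch is illustrative, not part of this project):

import torch

def pad_batch(Xs, Ys, pad_id=0):
    # Pad every mention to the length of the longest one in the batch.
    max_len = max(len(x) for x in Xs)
    padded = [x + [pad_id] * (max_len - len(x)) for x in Xs]
    return (torch.tensor(padded, dtype=torch.long),
            torch.tensor(Ys, dtype=torch.long))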
Example #3
File: vsm.py Project: foxlf823/norm
def generate_instances_ehr(entities, word_alphabet, dict_alphabet, dictionary_reverse):
    Xs = []
    Ys = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)

    for entity in entities:

        if len(entity.norm_ids) > 0:
            if entity.norm_ids[0] in dictionary_reverse:
                cui_list = dictionary_reverse[entity.norm_ids[0]]
                Y = norm_utils.get_dict_index(dict_alphabet, cui_list[0])  # use the first id to generate instance
                if not (0 <= Y < dict_size):
                    raise RuntimeError("entity {}, {}, cui not in dict_alphabet".format(entity.id, entity.name))
            else:
                logging.debug("entity {}, {}, can't map to umls, ignored".format(entity.id, entity.name))
                continue
        else:
            Y = 0

        # mention
        tokens = my_tokenize(entity.name)
        mention = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            mention.append(word_id)

        Xs.append(mention)
        Ys.append(Y)


    return Xs, Ys
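
For context, dictionary_reverse is assumed to map a source vocabulary id (e.g. an EHR code) to a list of UMLS CUIs, of which only the first is used above:

# Hypothetical entry: one EHR code mapping to a single CUI.
dictionary_reverse = {"313002": ["C0016382"]}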
Example #4
def generate_instances(entities, word_alphabet, dict_alphabet):
    Xs = []
    Ys = []

    for entity in entities:
        if len(entity.norm_ids) > 0:
            # use the first id to generate the instance
            Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
            # for TAC, the id can be none or an OOV id
            if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                continue
        else:
            Y = 0

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)

        Xs.append(word_ids)
        Ys.append(Y)

    return Xs, Ys
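
Because an entity contributes to both lists or to neither, Xs and Ys stay aligned; a quick sanity check after a call (the variable names are placeholders):

Xs, Ys = generate_instances(entities, word_alphabet, dict_alphabet)
assert len(Xs) == len(Ys)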
Example #5
def convert_example_to_features(example, tokenizer, max_seq_length,
                                dict_alphabet):
    tokens = example["tokens"]
    segment_ids = example["segment_ids"]
    is_random_next = example["is_random_next"]
    masked_lm_positions = example["masked_lm_positions"]
    masked_lm_labels = example["masked_lm_labels"]

    tokens_ent = example['tokens_ent']
    tokens_ent_mask = example['tokens_ent_mask']
    ent_start = example['ent_start']
    norm_label = example['norm_label']

    assert len(tokens) == len(
        segment_ids
    ) <= max_seq_length  # The preprocessed data should be already truncated
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)

    # np.int and np.bool were removed in NumPy 1.24; use the builtin types.
    input_array = np.zeros(max_seq_length, dtype=int)
    input_array[:len(input_ids)] = input_ids

    mask_array = np.zeros(max_seq_length, dtype=bool)
    mask_array[:len(input_ids)] = 1

    segment_array = np.zeros(max_seq_length, dtype=bool)
    segment_array[:len(segment_ids)] = segment_ids

    # -1 marks positions that carry no LM label
    lm_label_array = np.full(max_seq_length, dtype=int, fill_value=-1)
    lm_label_array[masked_lm_positions] = masked_label_ids

    input_ids_ent = tokenizer.convert_tokens_to_ids(tokens_ent)
    norm_label_ids = [get_dict_index(dict_alphabet, l) for l in norm_label]

    input_array_ent = np.zeros(max_seq_length, dtype=int)
    input_array_ent[:len(input_ids_ent)] = input_ids_ent

    mask_array_ent = np.zeros(max_seq_length, dtype=bool)
    mask_array_ent[:len(tokens_ent_mask)] = tokens_ent_mask

    norm_label_array = np.full(max_seq_length, dtype=int, fill_value=-1)
    norm_label_array[ent_start] = norm_label_ids

    features = InputFeatures(input_ids=input_array,
                             input_mask=mask_array,
                             segment_ids=segment_array,
                             lm_label_ids=lm_label_array,
                             is_next=is_random_next,
                             input_ids_ent=input_array_ent,
                             input_mask_ent=mask_array_ent,
                             norm_label_ids=norm_label_array)
    return features
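
The -1 fill value in lm_label_array and norm_label_array is meant to be skipped by the loss. A minimal sketch of how that works with PyTorch's cross entropy (shapes are illustrative; note ignore_index must be set explicitly, since its default is -100):

import torch
import torch.nn as nn

loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
logits = torch.randn(2, 128, 30522)               # (batch, seq_len, vocab)
labels = torch.full((2, 128), -1, dtype=torch.long)
labels[0, 5] = 1037                               # a single labeled position
loss = loss_fct(logits.view(-1, 30522), labels.view(-1))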
Example #6
def generate_instances_ehr(entities, dict_alphabet, dictionary_reverse):
    Xs = []
    Ys = []

    for entity in entities:
        if len(entity.norm_ids) > 0:
            if entity.norm_ids[0] in dictionary_reverse:
                cui_list = dictionary_reverse[entity.norm_ids[0]]
                # use the first id to generate the instance
                Y = get_dict_index(dict_alphabet, cui_list[0])
                if not (0 <= Y < get_dict_size(dict_alphabet)):
                    raise RuntimeError(
                        "entity {}, {}, cui not in dict_alphabet".format(
                            entity.id, entity.name))
            else:
                logging.debug(
                    "entity {}, {}, can't map to umls, ignored".format(
                        entity.id, entity.name))
                continue
        else:
            Y = 0

        X = {}

        tokens = []
        for token in tokenizer.tokenize(entity.name):
            if token in stop_word:
                continue
            token = word_preprocess(token)
            tokens.append(token)

        tokens.insert(0, '[CLS]')
        tokens.append('[SEP]')
        word_ids = tokenizer.convert_tokens_to_ids(tokens)

        if len(word_ids) == 0:
            continue

        X['token'] = word_ids
        X['segment'] = [0] * len(word_ids)
        X['mask'] = [1] * len(word_ids)

        Xs.append(X)
        Ys.append(Y)  # appended together with X so the two lists stay aligned

    return Xs, Ys
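
Each X here is a dict of parallel lists (token, segment, mask) that must be padded to one length before batching. A hypothetical helper sketching that step:

def pad_bert_inputs(X, max_len, pad_id=0):
    # Zeros in 'mask' tell the model to ignore the padded positions.
    n = max_len - len(X['token'])
    return {'token': X['token'] + [pad_id] * n,
            'segment': X['segment'] + [0] * n,
            'mask': X['mask'] + [0] * n}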
Example #7
    def init_vector_for_dict(self, meddra_dict):
        self.dict_embedding = nn.Embedding(len(meddra_dict),
                                           self.embedding_dim)
        if torch.cuda.is_available():
            self.dict_embedding = self.dict_embedding.cuda(self.gpu)

        for concept_id, concept_name in meddra_dict.items():
            self.dict_alphabet.add(concept_id)
            with torch.no_grad():
                tokens_id = self.batch_name_to_ids(concept_name)
                length = tokens_id.size(1)
                emb = self.word_embedding(tokens_id)
                emb = emb.unsqueeze_(1)
                pool = functional.avg_pool2d(emb, (length, 1))
                index = norm_utils.get_dict_index(self.dict_alphabet,
                                                  concept_id)
                self.dict_embedding.weight.data[index] = pool[0][0]
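
The avg_pool2d over a (length, 1) window is simply a mean over the token dimension; a standalone check of that equivalence (random tensor, illustrative sizes):

import torch
import torch.nn.functional as functional

emb = torch.randn(1, 5, 100)                      # (batch, length, dim)
pooled_a = functional.avg_pool2d(emb.unsqueeze(1), (5, 1)).view(-1)
pooled_b = emb.mean(dim=1).view(-1)               # same values
assert torch.allclose(pooled_a, pooled_b, atol=1e-6)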
Example #8
def generate_instances(document, word_alphabet, dict_alphabet, dictionary,
                       dictionary_reverse, isMeddra_dict):
    Xs = []
    Ys = []

    # copy entities from gold entities
    pred_entities = []
    for gold in document.entities:
        pred = Entity()
        pred.id = gold.id
        pred.type = gold.type
        pred.spans = gold.spans
        pred.section = gold.section
        pred.name = gold.name
        pred_entities.append(pred)

    multi_sieve.runMultiPassSieve(document, pred_entities, dictionary,
                                  isMeddra_dict)

    for idx, entity in enumerate(document.entities):

        if isMeddra_dict:
            if len(entity.norm_ids) > 0:
                Y = norm_utils.get_dict_index(dict_alphabet,
                                              entity.norm_ids[0])
                if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                    continue
                Ys.append(Y)
            else:
                Ys.append(0)
        else:
            if len(entity.norm_ids) > 0:
                if entity.norm_ids[0] in dictionary_reverse:
                    cui_list = dictionary_reverse[entity.norm_ids[0]]
                    Y = norm_utils.get_dict_index(
                        dict_alphabet,
                        cui_list[0])  # use the first id to generate instance
                    if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                        raise RuntimeError(
                            "entity {}, {}, cui not in dict_alphabet".format(
                                entity.id, entity.name))
                    Ys.append(Y)
                else:
                    logging.info(
                        "entity {}, {}, can't map to umls, ignored".format(
                            entity.id, entity.name))
                    continue
            else:
                Ys.append(0)

        X = dict()

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)
        X['word'] = word_ids

        # One-hot encoding of the sieve's prediction; all zeros if it made none.
        X['rule'] = [0] * norm_utils.get_dict_size(dict_alphabet)
        if pred_entities[idx].rule_id is not None:
            X['rule'][norm_utils.get_dict_index(
                dict_alphabet, pred_entities[idx].rule_id)] = 1

        Xs.append(X)

    return Xs, Ys
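
The 'rule' feature built above is a one-hot vector over the dictionary: all zeros when the multi-pass sieve made no prediction, otherwise a single 1 at the sieve's predicted index. The same encoding in isolation (one_hot is illustrative, not part of this project):

def one_hot(index, size):
    vec = [0] * size
    if index is not None and 0 <= index < size:
        vec[index] = 1
    return vec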