Example #1
def generate_dict_instances(dictionary, dict_alphabet, word_alphabet, isMeddra_dict):
    Xs = []
    Ys = []

    if isMeddra_dict:
        for concept_id, concept_name in dictionary.items():

            Y = norm_utils.get_dict_index(dict_alphabet, concept_id)
            # skip concepts whose id is outside the dictionary alphabet
            if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                continue
            Ys.append(Y)

            tokens = my_tokenize(concept_name)
            word_ids = []
            for token in tokens:
                if token in stop_word:
                    continue
                token = norm_utils.word_preprocess(token)
                word_id = word_alphabet.get_index(token)
                word_ids.append(word_id)

            Xs.append(word_ids)
    else:
        for concept_id, concept in dictionary.items():
            Y = norm_utils.get_dict_index(dict_alphabet, concept_id)
            if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                continue

            # only the first of the concept's names is used to build the instance
            tokens = my_tokenize(concept.names[0])
            word_ids = []
            for token in tokens:
                if token in stop_word:
                    continue
                token = norm_utils.word_preprocess(token)
                word_id = word_alphabet.get_index(token)
                word_ids.append(word_id)

            Ys.append(Y)
            Xs.append(word_ids)


    return Xs, Ys
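
These examples assume helpers from the surrounding project (my_tokenize, stop_word, norm_utils.word_preprocess, norm_utils.get_dict_index, norm_utils.get_dict_size) plus module-level import torch, import numpy as np, and import logging. A minimal sketch of the alphabet interface they rely on, using a hypothetical Alphabet stand-in (not the project's actual class):

class Alphabet:
    def __init__(self):
        # index 0 is reserved for unknown instances in this sketch
        self.instance2index = {}
        self.instances = ["<UNK>"]

    def add(self, instance):
        # register an instance once, assigning it the next free index
        if instance not in self.instance2index:
            self.instance2index[instance] = len(self.instances)
            self.instances.append(instance)

    def get_index(self, instance):
        # unknown instances fall back to the <UNK> slot at index 0
        return self.instance2index.get(instance, 0)

    def size(self):
        return len(self.instances)
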
Example #2
def build_alphabet_from_dict(alphabet, dictionary, isMeddra_dict):
    # collect the preprocessed tokens of every concept name into the alphabet
    if isMeddra_dict:
        for concept_id, concept_name in dictionary.items():
            tokens = my_tokenize(concept_name)
            for word in tokens:
                alphabet.add(word_preprocess(word))
    else:
        for concept_id, concept in dictionary.items():
            for concept_name in concept.names:
                tokens = my_tokenize(concept_name)
                for word in tokens:
                    alphabet.add(word_preprocess(word))
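
A possible invocation, assuming a MedDRA-style dictionary maps a concept id to a single name string (toy data; Alphabet is the hypothetical stand-in sketched under Example #1):

meddra_dict = {"10019211": "Headache", "10028813": "Nausea"}  # made-up entries
word_alphabet = Alphabet()
build_alphabet_from_dict(word_alphabet, meddra_dict, isMeddra_dict=True)
# word_alphabet now holds the preprocessed tokens of every concept name
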
Example #3
def generate_instances(entities, word_alphabet, dict_alphabet):
    Xs = []
    Ys = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)

    for entity in entities:
        if len(entity.norm_ids) > 0:
            Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
            if not (0 <= Y < dict_size):
                continue
        else:
            Y = 0

        # mention
        tokens = my_tokenize(entity.name)
        mention = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            mention.append(word_id)

        Xs.append(mention)
        Ys.append(Y)

    return Xs, Ys
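
The entity objects consumed here need at least name and norm_ids; Example #8 additionally reads id, type, spans, and section, and its sieve sets rule_id. A minimal stand-in for experimenting outside the project (hypothetical, not the project's actual Entity):

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Entity:
    # only the fields these examples touch; the real class carries more
    id: str = ""
    type: str = ""
    spans: List = field(default_factory=list)
    section: str = ""
    name: str = ""
    norm_ids: List[str] = field(default_factory=list)
    rule_id: Optional[str] = None
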
Example #4
def generate_instances_ehr(entities, word_alphabet, dict_alphabet, dictionary_reverse):
    Xs = []
    Ys = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)

    for entity in entities:

        if len(entity.norm_ids) > 0:
            if entity.norm_ids[0] in dictionary_reverse:
                cui_list = dictionary_reverse[entity.norm_ids[0]]
                Y = norm_utils.get_dict_index(dict_alphabet, cui_list[0])  # use the first id to generate instance
                if not (0 <= Y < dict_size):
                    raise RuntimeError("entity {}, {}, cui not in dict_alphabet".format(entity.id, entity.name))
            else:
                logging.debug("entity {}, {}, can't map to umls, ignored".format(entity.id, entity.name))
                continue
        else:
            Y = 0

        # mention
        tokens = my_tokenize(entity.name)
        mention = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            mention.append(word_id)

        Xs.append(mention)
        Ys.append(Y)


    return Xs, Ys
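
dictionary_reverse maps an id from the source vocabulary back to a list of UMLS CUIs, and only the first CUI is used to build the instance. A toy illustration of the expected shape (made-up mapping):

dictionary_reverse = {"MedDRA:10019211": ["C0018681"]}  # hypothetical entry
cui_list = dictionary_reverse["MedDRA:10019211"]
# cui_list[0] is what generate_instances_ehr feeds to the dict alphabet
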
Example #5
def generate_instances(entities, word_alphabet, dict_alphabet):
    Xs = []
    Ys = []

    for entity in entities:
        if len(entity.norm_ids) > 0:
            # use the first id to generate the instance
            Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
            # for TAC data, the id can be none or an OOV id; skip those
            if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                continue
            Ys.append(Y)
        else:
            Ys.append(0)

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)

        Xs.append(word_ids)

    return Xs, Ys
Example #6
def init_vector_for_dict(word_alphabet, dict_alphabet, dictionary,
                         isMeddra_dict):

    # pos
    poses = []
    poses_lengths = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)
    max_len = 0
    for i in range(dict_size):

        # pos
        if isMeddra_dict:
            concept_name = dictionary[norm_utils.get_dict_name(dict_alphabet, i)]
            tokens = my_tokenize(concept_name)
        else:
            concept = dictionary[norm_utils.get_dict_name(dict_alphabet, i)]
            tokens = my_tokenize(concept.names[0])
        pos = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            pos.append(word_id)

        if len(pos) > max_len:
            max_len = len(pos)

        poses.append(pos)
        poses_lengths.append(len(pos))

    poses = pad_sequence(poses, max_len)
    poses_lengths = torch.LongTensor(poses_lengths)

    if opt.gpu >= 0 and torch.cuda.is_available():
        poses = poses.cuda(opt.gpu)
        poses_lengths = poses_lengths.cuda(opt.gpu)

    return poses, poses_lengths
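
Note that pad_sequence here takes (sequences, max_len), so it is a project helper rather than torch.nn.utils.rnn.pad_sequence. A plausible sketch of its behavior, assuming zero-padding into a (dict_size, max_len) LongTensor:

import torch

def pad_sequence(sequences, max_len):
    # zero-pad each id list to max_len and stack into a LongTensor;
    # a sketch of the assumed helper, not the project's actual code
    out = torch.zeros(len(sequences), max_len, dtype=torch.long)
    for i, seq in enumerate(sequences):
        out[i, :len(seq)] = torch.tensor(seq, dtype=torch.long)
    return out
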
Example #7
    def batch_name_to_ids(self, name):
        tokens = my_tokenize(name)
        length = len(tokens)
        tokens_id = np.zeros((1, length), dtype=np.int64)  # np.int was removed in NumPy 1.24
        for i, word in enumerate(tokens):
            word = norm_utils.word_preprocess(word)
            tokens_id[0, i] = self.word_alphabet.get_index(word)

        tokens_id = torch.from_numpy(tokens_id)

        if torch.cuda.is_available():
            return tokens_id.cuda(self.gpu)
        else:
            return tokens_id
Example #8
def generate_instances(document, word_alphabet, dict_alphabet, dictionary,
                       dictionary_reverse, isMeddra_dict):
    Xs = []
    Ys = []

    # copy entities from gold entities
    pred_entities = []
    for gold in document.entities:
        pred = Entity()
        pred.id = gold.id
        pred.type = gold.type
        pred.spans = gold.spans
        pred.section = gold.section
        pred.name = gold.name
        pred_entities.append(pred)

    multi_sieve.runMultiPassSieve(document, pred_entities, dictionary,
                                  isMeddra_dict)

    for idx, entity in enumerate(document.entities):

        if isMeddra_dict:
            if len(entity.norm_ids) > 0:
                Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
                if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                    continue
                Ys.append(Y)
            else:
                Ys.append(0)
        else:
            if len(entity.norm_ids) > 0:
                if entity.norm_ids[0] in dictionary_reverse:
                    cui_list = dictionary_reverse[entity.norm_ids[0]]
                    # use the first id to generate the instance
                    Y = norm_utils.get_dict_index(dict_alphabet, cui_list[0])
                    if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                        raise RuntimeError("entity {}, {}, cui not in dict_alphabet".format(entity.id, entity.name))
                    Ys.append(Y)
                else:
                    logging.info(
                        "entity {}, {}, can't map to umls, ignored".format(
                            entity.id, entity.name))
                    continue
            else:
                Ys.append(0)

        X = dict()

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)
        X['word'] = word_ids

        # one-hot feature over the dictionary for the sieve's rule-based prediction
        X['rule'] = [0] * norm_utils.get_dict_size(dict_alphabet)
        if pred_entities[idx].rule_id is not None:
            X['rule'][norm_utils.get_dict_index(dict_alphabet, pred_entities[idx].rule_id)] = 1

        Xs.append(X)

    return Xs, Ys
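
The rule feature is a one-hot vector over the dictionary: all zeros when the multi-pass sieve made no prediction for the mention, otherwise a single 1 at the sieve's predicted concept index. With toy numbers:

rule = [0] * 5  # pretend get_dict_size(dict_alphabet) == 5
rule[2] = 1     # pretend the sieve predicted the concept at index 2
# rule == [0, 0, 1, 0, 0]; together with X['word'] this forms one instance
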