def generate_dict_instances(dictionary, dict_alphabet, word_alphabet, isMeddra_dict):
    Xs = []
    Ys = []

    if isMeddra_dict:
        for concept_id, concept_name in dictionary.items():
            Y = norm_utils.get_dict_index(dict_alphabet, concept_id)
            if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                continue
            Ys.append(Y)

            tokens = my_tokenize(concept_name)
            word_ids = []
            for token in tokens:
                if token in stop_word:
                    continue
                token = norm_utils.word_preprocess(token)
                word_id = word_alphabet.get_index(token)
                word_ids.append(word_id)

            Xs.append(word_ids)
    else:
        for concept_id, concept in dictionary.items():
            Y = norm_utils.get_dict_index(dict_alphabet, concept_id)
            if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                continue

            # use only the first name of the concept to build the instance
            tokens = my_tokenize(concept.names[0])
            word_ids = []
            for token in tokens:
                if token in stop_word:
                    continue
                token = norm_utils.word_preprocess(token)
                word_id = word_alphabet.get_index(token)
                word_ids.append(word_id)

            Ys.append(Y)
            Xs.append(word_ids)

    return Xs, Ys
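# Hedged usage sketch (illustration only, not part of the original module):
# each in-vocabulary concept yields one (word-id list, label) pair. The
# dictionary below is a made-up MedDRA-style fixture; real ids and names come
# from the loaded terminology, and dict_alphabet/word_alphabet from this project.
#
#   dictionary = {'10019211': 'headache', '10028813': 'nausea'}
#   Xs, Ys = generate_dict_instances(dictionary, dict_alphabet, word_alphabet,
#                                    isMeddra_dict=True)
#   # Xs[i]: word ids of concept i's name; Ys[i]: its index in dict_alphabet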
def generate_instances(entities, word_alphabet, dict_alphabet):
    Xs = []
    Ys = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)

    for entity in entities:
        if len(entity.norm_ids) > 0:
            Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
            if not (0 <= Y < dict_size):
                continue
        else:
            Y = 0

        # mention
        tokens = my_tokenize(entity.name)
        mention = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            mention.append(word_id)

        Xs.append(mention)
        Ys.append(Y)

    return Xs, Ys
def generate_instances_ehr(entities, word_alphabet, dict_alphabet, dictionary_reverse):
    Xs = []
    Ys = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)

    for entity in entities:
        if len(entity.norm_ids) > 0:
            if entity.norm_ids[0] in dictionary_reverse:
                cui_list = dictionary_reverse[entity.norm_ids[0]]
                # use the first id to generate instance
                Y = norm_utils.get_dict_index(dict_alphabet, cui_list[0])
                if not (0 <= Y < dict_size):
                    raise RuntimeError("entity {}, {}, cui not in dict_alphabet".format(entity.id, entity.name))
            else:
                logging.debug("entity {}, {}, can't map to umls, ignored".format(entity.id, entity.name))
                continue
        else:
            Y = 0

        # mention
        tokens = my_tokenize(entity.name)
        mention = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            mention.append(word_id)

        Xs.append(mention)
        Ys.append(Y)

    return Xs, Ys
def generate_instances(entities, word_alphabet, dict_alphabet):
    Xs = []
    Ys = []

    for entity in entities:
        if len(entity.norm_ids) > 0:
            # use the first id to generate instance
            Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
            if 0 <= Y < norm_utils.get_dict_size(dict_alphabet):  # for tac, can be none or OOV id
                Ys.append(Y)
            else:
                continue
        else:
            Ys.append(0)

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)

        Xs.append(word_ids)

    return Xs, Ys
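# Hedged usage sketch: the entity-side counterpart of generate_dict_instances.
# `entities` are Entity objects with .name and .norm_ids populated; mentions
# whose first norm id is absent from dict_alphabet are skipped (for TAC the id
# can be none or OOV). Call shape, assuming a loaded document `doc`:
#
#   Xs, Ys = generate_instances(doc.entities, word_alphabet, dict_alphabet)
#   # Xs: list of word-id lists; Ys: parallel list of concept indices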
def batch_name_to_ids(self, name):
    tokens = my_tokenize(name)
    length = len(tokens)
    # np.int was removed in NumPy 1.24; np.int64 also makes torch.from_numpy
    # yield the LongTensor that embedding lookups expect
    tokens_id = np.zeros((1, length), dtype=np.int64)
    for i, word in enumerate(tokens):
        word = norm_utils.word_preprocess(word)
        tokens_id[0][i] = self.word_alphabet.get_index(word)

    tokens_id = torch.from_numpy(tokens_id)
    if torch.cuda.is_available():
        return tokens_id.cuda(self.gpu)
    else:
        return tokens_id
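# Standalone illustration of the same name-to-ids pattern (all names here are
# stand-ins, not this project's API): a plain dict plays word_alphabet, with 0
# assumed to be the unknown-word id.
def _demo_name_to_ids():
    import numpy as np
    import torch
    vocab = {'severe': 1, 'headache': 2}      # hypothetical vocabulary
    tokens = 'severe headache'.split()
    ids = np.zeros((1, len(tokens)), dtype=np.int64)
    for i, word in enumerate(tokens):
        ids[0][i] = vocab.get(word, 0)        # 0 for out-of-vocabulary words
    return torch.from_numpy(ids)              # tensor([[1, 2]]), shape (1, 2)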
def init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict):
    # pos
    poses = []
    poses_lengths = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)
    max_len = 0

    for i in range(dict_size):
        # pos
        if isMeddra_dict:
            concept_name = dictionary[norm_utils.get_dict_name(dict_alphabet, i)]
            tokens = my_tokenize(concept_name)
        else:
            concept = dictionary[norm_utils.get_dict_name(dict_alphabet, i)]
            tokens = my_tokenize(concept.names[0])

        pos = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            pos.append(word_id)
        if len(pos) > max_len:
            max_len = len(pos)

        poses.append(pos)
        poses_lengths.append(len(pos))

    poses = pad_sequence(poses, max_len)
    poses_lengths = torch.LongTensor(poses_lengths)

    if opt.gpu >= 0 and torch.cuda.is_available():
        poses = poses.cuda(opt.gpu)
        poses_lengths = poses_lengths.cuda(opt.gpu)

    return poses, poses_lengths
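# Standalone sketch of the padding step above, assuming the project's
# pad_sequence right-pads each id list to max_len with zeros and stacks them
# into a LongTensor (its exact behavior may differ; this is an illustration):
def _demo_pad_sequence():
    import torch
    seqs, max_len = [[3, 7], [5]], 2
    padded = torch.LongTensor([s + [0] * (max_len - len(s)) for s in seqs])
    lengths = torch.LongTensor([len(s) for s in seqs])
    return padded, lengths    # tensor([[3, 7], [5, 0]]), tensor([2, 1])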
def generate_instances(document, word_alphabet, dict_alphabet, dictionary, dictionary_reverse, isMeddra_dict):
    Xs = []
    Ys = []

    # copy entities from gold entities
    pred_entities = []
    for gold in document.entities:
        pred = Entity()
        pred.id = gold.id
        pred.type = gold.type
        pred.spans = gold.spans
        pred.section = gold.section
        pred.name = gold.name
        pred_entities.append(pred)

    multi_sieve.runMultiPassSieve(document, pred_entities, dictionary, isMeddra_dict)

    for idx, entity in enumerate(document.entities):
        if isMeddra_dict:
            if len(entity.norm_ids) > 0:
                Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
                if 0 <= Y < norm_utils.get_dict_size(dict_alphabet):
                    Ys.append(Y)
                else:
                    continue
            else:
                Ys.append(0)
        else:
            if len(entity.norm_ids) > 0:
                if entity.norm_ids[0] in dictionary_reverse:
                    cui_list = dictionary_reverse[entity.norm_ids[0]]
                    # use the first id to generate instance
                    Y = norm_utils.get_dict_index(dict_alphabet, cui_list[0])
                    if 0 <= Y < norm_utils.get_dict_size(dict_alphabet):
                        Ys.append(Y)
                    else:
                        raise RuntimeError("entity {}, {}, cui not in dict_alphabet".format(entity.id, entity.name))
                else:
                    logging.info("entity {}, {}, can't map to umls, ignored".format(entity.id, entity.name))
                    continue
            else:
                Ys.append(0)

        X = dict()

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)
        X['word'] = word_ids

        # one-hot vector over the dictionary, hot at the sieve's rule-based
        # prediction if it made one, all zeros otherwise
        X['rule'] = [0] * norm_utils.get_dict_size(dict_alphabet)
        if pred_entities[idx].rule_id is not None:
            X['rule'][norm_utils.get_dict_index(dict_alphabet, pred_entities[idx].rule_id)] = 1

        Xs.append(X)

    return Xs, Ys
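# Standalone sketch of the 'rule' feature built above: a one-hot vector over
# the dictionary, hot at the concept index predicted by the multi-pass sieve
# (dict_size and the index below are made-up values):
def _demo_rule_feature():
    dict_size, sieve_prediction = 5, 2
    rule = [0] * dict_size
    rule[sieve_prediction] = 1
    return rule    # [0, 0, 1, 0, 0]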