def generate_dict_instances(dictionary, dict_alphabet, word_alphabet, isMeddra_dict):
    """Build parallel lists of word-id sequences and labels from a dictionary.

    Args:
        dictionary: Mapping keyed by concept id. When ``isMeddra_dict`` is
            truthy each value is tokenized directly as the concept name;
            otherwise each value is an object whose ``names[0]`` is tokenized.
        dict_alphabet: Passed to ``norm_utils.get_dict_index`` and
            ``norm_utils.get_dict_size`` to obtain and range-check the label.
        word_alphabet: Object whose ``get_index(token)`` yields a word id.
        isMeddra_dict: Selects how the concept name is read from a value.

    Returns:
        ``(Xs, Ys)`` where ``Xs[i]`` is the list of word ids of the concept
        whose label index is ``Ys[i]``. Concepts whose index is not within
        ``[0, dict size)`` are skipped.
    """
    Xs, Ys = [], []

    for concept_id, concept in dictionary.items():
        label = norm_utils.get_dict_index(dict_alphabet, concept_id)
        # The size is queried per concept (and only for non-negative labels).
        if not (label >= 0 and label < norm_utils.get_dict_size(dict_alphabet)):
            continue

        # Only the first name is used when the value is a concept object.
        name = concept if isMeddra_dict else concept.names[0]

        word_ids = [
            word_alphabet.get_index(norm_utils.word_preprocess(tok))
            for tok in my_tokenize(name)
            if tok not in stop_word  # stop words are dropped before preprocessing
        ]

        Xs.append(word_ids)
        Ys.append(label)

    return Xs, Ys
def generate_instances(entities, word_alphabet, dict_alphabet):
    """Convert entities into parallel lists of mention word ids and labels.

    Args:
        entities: Iterable of objects exposing ``norm_ids`` and ``name``.
        word_alphabet: Object whose ``get_index(token)`` yields a word id.
        dict_alphabet: Passed to ``norm_utils.get_dict_size`` (read once, up
            front) and ``norm_utils.get_dict_index``.

    Returns:
        ``(Xs, Ys)``: ``Xs[i]`` is the list of word ids of an entity name and
        ``Ys[i]`` its label. An entity without norm ids gets label ``0``; an
        entity whose first norm id maps outside ``[0, dict size)`` is skipped.
    """
    mentions = []
    labels = []
    size = norm_utils.get_dict_size(dict_alphabet)

    for entity in entities:
        if len(entity.norm_ids) == 0:
            label = 0
        else:
            # Only the first norm id is considered.
            label = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
            if label < 0 or label >= size:
                continue

        mentions.append([
            word_alphabet.get_index(norm_utils.word_preprocess(tok))
            for tok in my_tokenize(entity.name)
        ])
        labels.append(label)

    return mentions, labels
def generate_instances_ehr(entities, word_alphabet, dict_alphabet, dictionary_reverse):
    """Convert entities into mention word ids and labels via a reverse dictionary.

    Args:
        entities: Iterable of objects exposing ``norm_ids``, ``name`` and ``id``.
        word_alphabet: Object whose ``get_index(token)`` yields a word id.
        dict_alphabet: Passed to ``norm_utils.get_dict_size`` (read once, up
            front) and ``norm_utils.get_dict_index``.
        dictionary_reverse: Mapping from an entity's first norm id to a list
            of ids; the first element of that list is looked up in
            ``dict_alphabet``.

    Returns:
        ``(Xs, Ys)``: ``Xs[i]`` is the list of word ids of an entity name and
        ``Ys[i]`` its label. Entities without norm ids get label ``0``;
        entities whose first norm id is absent from ``dictionary_reverse``
        are logged at debug level and skipped.

    Raises:
        RuntimeError: If the mapped id resolves to an index outside
            ``[0, dict size)``.
    """
    mentions = []
    labels = []
    size = norm_utils.get_dict_size(dict_alphabet)

    for entity in entities:
        if len(entity.norm_ids) == 0:
            label = 0
        else:
            norm_id = entity.norm_ids[0]
            if norm_id not in dictionary_reverse:
                logging.debug("entity {}, {}, can't map to umls, ignored".format(entity.id, entity.name))
                continue

            # Only the first mapped id is used to generate the instance.
            mapped_ids = dictionary_reverse[norm_id]
            label = norm_utils.get_dict_index(dict_alphabet, mapped_ids[0])
            if label < 0 or label >= size:
                raise RuntimeError("entity {}, {}, cui not in dict_alphabet".format(entity.id, entity.name))

        mentions.append([
            word_alphabet.get_index(norm_utils.word_preprocess(tok))
            for tok in my_tokenize(entity.name)
        ])
        labels.append(label)

    return mentions, labels
def generate_instances(entities, word_alphabet, dict_alphabet):
    """Convert entities into parallel lists of word-id sequences and labels.

    Args:
        entities: Iterable of objects exposing ``norm_ids`` and ``name``.
        word_alphabet: Object whose ``get_index(token)`` yields a word id.
        dict_alphabet: Passed to ``norm_utils.get_dict_index`` and
            ``norm_utils.get_dict_size``.

    Returns:
        ``(Xs, Ys)``: ``Xs[i]`` is the list of word ids of an entity name and
        ``Ys[i]`` its label. An entity without norm ids gets label ``0``; an
        entity whose first norm id maps outside ``[0, dict size)`` (e.g. a
        none or out-of-vocabulary id) is skipped.
    """
    Xs, Ys = [], []

    for entity in entities:
        if len(entity.norm_ids) == 0:
            label = 0
        else:
            # Only the first norm id is used to generate the instance.
            label = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
            # The size is queried per entity, and only for non-negative labels.
            if not (label >= 0 and label < norm_utils.get_dict_size(dict_alphabet)):
                continue

        word_ids = [
            word_alphabet.get_index(norm_utils.word_preprocess(tok))
            for tok in my_tokenize(entity.name)
        ]

        Ys.append(label)
        Xs.append(word_ids)

    return Xs, Ys
def convert_example_to_features(example, tokenizer, max_seq_length, dict_alphabet):
    """Convert one preprocessed example into fixed-length ``InputFeatures``.

    Args:
        example: Mapping providing the keys ``tokens``, ``segment_ids``,
            ``is_random_next``, ``masked_lm_positions``, ``masked_lm_labels``,
            ``tokens_ent``, ``tokens_ent_mask``, ``ent_start`` and
            ``norm_label``.
        tokenizer: Object whose ``convert_tokens_to_ids`` maps tokens to ids.
        max_seq_length: Length of every array stored in the result.
        dict_alphabet: Passed to ``get_dict_index`` to turn each entry of
            ``norm_label`` into an index.

    Returns:
        An ``InputFeatures`` built from arrays of length ``max_seq_length``:
        token ids and entity token ids are zero-padded, the masks are boolean,
        and the LM-label and norm-label arrays hold ``-1`` everywhere except
        at ``masked_lm_positions`` and ``ent_start`` respectively.

    Raises:
        AssertionError: If ``tokens`` and ``segment_ids`` differ in length or
            exceed ``max_seq_length``.
    """
    tokens = example["tokens"]
    segment_ids = example["segment_ids"]
    is_random_next = example["is_random_next"]
    masked_lm_positions = example["masked_lm_positions"]
    masked_lm_labels = example["masked_lm_labels"]
    tokens_ent = example['tokens_ent']
    tokens_ent_mask = example['tokens_ent_mask']
    ent_start = example['ent_start']
    norm_label = example['norm_label']

    # The preprocessed data should be already truncated
    assert len(tokens) == len(segment_ids) <= max_seq_length

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)

    # Builtin int/bool replace the np.int/np.bool aliases, which NumPy
    # deprecated in 1.20 and removed in 1.24; the resulting dtypes are the same.
    input_array = np.zeros(max_seq_length, dtype=int)
    input_array[:len(input_ids)] = input_ids

    mask_array = np.zeros(max_seq_length, dtype=bool)
    mask_array[:len(input_ids)] = 1

    segment_array = np.zeros(max_seq_length, dtype=bool)
    segment_array[:len(segment_ids)] = segment_ids

    # -1 marks positions that carry no masked-LM label.
    lm_label_array = np.full(max_seq_length, dtype=int, fill_value=-1)
    lm_label_array[masked_lm_positions] = masked_label_ids

    input_ids_ent = tokenizer.convert_tokens_to_ids(tokens_ent)
    norm_label_ids = [get_dict_index(dict_alphabet, label) for label in norm_label]

    input_array_ent = np.zeros(max_seq_length, dtype=int)
    input_array_ent[:len(input_ids_ent)] = input_ids_ent

    mask_array_ent = np.zeros(max_seq_length, dtype=bool)
    mask_array_ent[:len(tokens_ent_mask)] = tokens_ent_mask

    # -1 everywhere except at the positions listed in ent_start.
    norm_label_array = np.full(max_seq_length, dtype=int, fill_value=-1)
    norm_label_array[ent_start] = norm_label_ids

    features = InputFeatures(input_ids=input_array,
                             input_mask=mask_array,
                             segment_ids=segment_array,
                             lm_label_ids=lm_label_array,
                             is_next=is_random_next,
                             input_ids_ent=input_array_ent,
                             input_mask_ent=mask_array_ent,
                             norm_label_ids=norm_label_array)
    return features
def generate_instances_ehr(entities, dict_alphabet, dictionary_reverse):
    """Convert entities into tokenizer-based instances and labels.

    Relies on the names ``tokenizer``, ``stop_word``, ``word_preprocess``,
    ``get_dict_index`` and ``get_dict_size`` being available in the enclosing
    scope; they are not parameters of this function.

    Args:
        entities: Iterable of objects exposing ``norm_ids``, ``name`` and ``id``.
        dict_alphabet: Passed to ``get_dict_index`` and ``get_dict_size``.
        dictionary_reverse: Mapping from an entity's first norm id to a list
            of ids; the first element of that list is looked up in
            ``dict_alphabet``.

    Returns:
        ``(Xs, Ys)``: ``Xs[i]`` is a dict with keys ``'token'`` (ids of
        ``[CLS]`` + non-stop-word tokens + ``[SEP]``), ``'segment'`` (all
        zeros) and ``'mask'`` (all ones), and ``Ys[i]`` is its label.
        Entities without norm ids get label ``0``; entities whose first norm
        id is absent from ``dictionary_reverse`` are logged at debug level
        and skipped.

    Raises:
        RuntimeError: If the mapped id resolves to an index outside
            ``[0, dict size)``.
    """
    Xs = []
    Ys = []

    for entity in entities:
        if len(entity.norm_ids) > 0:
            norm_id = entity.norm_ids[0]
            if norm_id not in dictionary_reverse:
                logging.debug(
                    "entity {}, {}, can't map to umls, ignored".format(
                        entity.id, entity.name))
                continue

            # Only the first mapped id is used to generate the instance.
            cui_list = dictionary_reverse[norm_id]
            Y = get_dict_index(dict_alphabet, cui_list[0])
            if not (Y >= 0 and Y < get_dict_size(dict_alphabet)):
                raise RuntimeError(
                    "entity {}, {}, cui not in dict_alphabet".format(
                        entity.id, entity.name))
        else:
            Y = 0

        tokens = ['[CLS]']
        tokens.extend(
            word_preprocess(token)
            for token in tokenizer.tokenize(entity.name)
            if token not in stop_word
        )
        tokens.append('[SEP]')

        word_ids = tokenizer.convert_tokens_to_ids(tokens)
        if len(word_ids) == 0:
            continue

        # The label is recorded only once the instance is known to be kept,
        # so Xs and Ys cannot get out of step when an entity is skipped above.
        Xs.append({
            'token': word_ids,
            'segment': [0] * len(word_ids),
            'mask': [1] * len(word_ids),
        })
        Ys.append(Y)

    return Xs, Ys
def init_vector_for_dict(self, meddra_dict):
    """Create ``self.dict_embedding`` and fill one row per dictionary concept.

    For each ``(concept_id, concept_name)`` item the concept id is added to
    ``self.dict_alphabet``, the name is converted with
    ``self.batch_name_to_ids``, embedded with ``self.word_embedding`` and
    average-pooled; the pooled result is written into the embedding row given
    by ``norm_utils.get_dict_index``.

    Args:
        meddra_dict: Mapping from concept id to concept name. Its length
            determines the number of embedding rows.
    """
    embedding = nn.Embedding(len(meddra_dict), self.embedding_dim)
    if torch.cuda.is_available():
        embedding = embedding.cuda(self.gpu)
    self.dict_embedding = embedding

    for concept_id, concept_name in meddra_dict.items():
        self.dict_alphabet.add(concept_id)

        with torch.no_grad():
            name_ids = self.batch_name_to_ids(concept_name)
            n_tokens = name_ids.size(1)
            # NOTE(review): presumably name_ids is (1, n_tokens), so the
            # embedded tensor becomes (1, 1, n_tokens, dim) - TODO confirm.
            embedded = self.word_embedding(name_ids).unsqueeze_(1)
            # A kernel as tall as the name collapses the n_tokens axis.
            pooled = functional.avg_pool2d(embedded, (n_tokens, 1))
            row = norm_utils.get_dict_index(self.dict_alphabet, concept_id)
            self.dict_embedding.weight.data[row] = pooled[0][0]
def generate_instances(document, word_alphabet, dict_alphabet, dictionary, dictionary_reverse, isMeddra_dict):
    """Build instances with word ids and a rule vector for a document's entities.

    Copies of the document's entities (without norm ids) are handed to
    ``multi_sieve.runMultiPassSieve``; afterwards each copy's ``rule_id`` is
    read to build the ``'rule'`` feature.

    Args:
        document: Object exposing ``entities``; each entity provides ``id``,
            ``type``, ``spans``, ``section``, ``name`` and ``norm_ids``.
        word_alphabet: Object whose ``get_index(token)`` yields a word id.
        dict_alphabet: Passed to ``norm_utils.get_dict_index`` and
            ``norm_utils.get_dict_size``.
        dictionary: Forwarded to ``multi_sieve.runMultiPassSieve``.
        dictionary_reverse: Used only when ``isMeddra_dict`` is falsy; maps an
            entity's first norm id to a list whose first element is looked up
            in ``dict_alphabet``.
        isMeddra_dict: Selects how the label is derived and is forwarded to
            the sieve.

    Returns:
        ``(Xs, Ys)``: ``Xs[i]`` is a dict with ``'word'`` (word ids of the
        entity name) and ``'rule'`` (a list of dict-size zeros with a ``1`` at
        the index of the copy's ``rule_id`` when that is not ``None``);
        ``Ys[i]`` is the label. Entities without norm ids get label ``0``.

    Raises:
        RuntimeError: When ``isMeddra_dict`` is falsy and the mapped id
            resolves to an index outside ``[0, dict size)``.
    """
    # copy entities from gold entities
    copied_fields = ('id', 'type', 'spans', 'section', 'name')
    pred_entities = []
    for gold in document.entities:
        pred = Entity()
        for field in copied_fields:
            setattr(pred, field, getattr(gold, field))
        pred_entities.append(pred)

    multi_sieve.runMultiPassSieve(document, pred_entities, dictionary, isMeddra_dict)

    Xs, Ys = [], []
    for idx, entity in enumerate(document.entities):
        if len(entity.norm_ids) == 0:
            label = 0
        elif isMeddra_dict:
            label = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
            if not (label >= 0 and label < norm_utils.get_dict_size(dict_alphabet)):
                continue
        else:
            norm_id = entity.norm_ids[0]
            if norm_id not in dictionary_reverse:
                logging.info(
                    "entity {}, {}, can't map to umls, ignored".format(
                        entity.id, entity.name))
                continue

            # Only the first mapped id is used to generate the instance.
            mapped_ids = dictionary_reverse[norm_id]
            label = norm_utils.get_dict_index(dict_alphabet, mapped_ids[0])
            if not (label >= 0 and label < norm_utils.get_dict_size(dict_alphabet)):
                raise RuntimeError(
                    "entity {}, {}, cui not in dict_alphabet".format(
                        entity.id, entity.name))

        word_ids = [
            word_alphabet.get_index(norm_utils.word_preprocess(tok))
            for tok in my_tokenize(entity.name)
        ]

        # All zeros unless the matching copy carries a rule id.
        rule_id = pred_entities[idx].rule_id
        rule_vector = [0] * norm_utils.get_dict_size(dict_alphabet)
        if rule_id is not None:
            rule_vector[norm_utils.get_dict_index(dict_alphabet, rule_id)] = 1

        Xs.append({'word': word_ids, 'rule': rule_vector})
        Ys.append(label)

    return Xs, Ys