def load_pretrained_token_embeddings(self, sess, dataset, parameters, token_to_vector=None):
    # Assumes module-level imports: time, re, and helper_dataset as hd.
    if parameters['token_pretrained_embedding_filepath'] == '':
        return
    # Load embeddings
    start_time = time.time()
    print('Load token embeddings... ', end='', flush=True)
    if token_to_vector is None:
        token_to_vector = hd.load_pretrained_token_embeddings(parameters)

    initial_weights = sess.run(self.token_embedding_weights.read_value())
    number_of_loaded_word_vectors = 0
    number_of_token_original_case_found = 0
    number_of_token_lowercase_found = 0
    number_of_token_digits_replaced_with_zeros_found = 0
    number_of_token_lowercase_and_digits_replaced_with_zeros_found = 0

    # Try the exact token first, then fall back to lowercasing and/or
    # replacing digits with zeros to find a pretrained vector.
    for token in dataset.token_to_index.keys():
        if token in token_to_vector.keys():
            initial_weights[dataset.token_to_index[token]] = token_to_vector[token]
            number_of_token_original_case_found += 1
        elif parameters['check_for_lowercase'] and token.lower() in token_to_vector.keys():
            initial_weights[dataset.token_to_index[token]] = token_to_vector[token.lower()]
            number_of_token_lowercase_found += 1
        elif parameters['check_for_digits_replaced_with_zeros'] and re.sub(r'\d', '0', token) in token_to_vector.keys():
            initial_weights[dataset.token_to_index[token]] = token_to_vector[re.sub(r'\d', '0', token)]
            number_of_token_digits_replaced_with_zeros_found += 1
        elif parameters['check_for_lowercase'] and parameters['check_for_digits_replaced_with_zeros'] \
                and re.sub(r'\d', '0', token.lower()) in token_to_vector.keys():
            initial_weights[dataset.token_to_index[token]] = token_to_vector[re.sub(r'\d', '0', token.lower())]
            number_of_token_lowercase_and_digits_replaced_with_zeros_found += 1
        else:
            continue
        number_of_loaded_word_vectors += 1

    elapsed_time = time.time() - start_time
    print('done ({0:.2f} seconds)'.format(elapsed_time))
    print("number_of_token_original_case_found: {0}".format(number_of_token_original_case_found))
    print("number_of_token_lowercase_found: {0}".format(number_of_token_lowercase_found))
    print("number_of_token_digits_replaced_with_zeros_found: {0}".format(number_of_token_digits_replaced_with_zeros_found))
    print("number_of_token_lowercase_and_digits_replaced_with_zeros_found: {0}".format(number_of_token_lowercase_and_digits_replaced_with_zeros_found))
    print('number_of_loaded_word_vectors: {0}'.format(number_of_loaded_word_vectors))
    print("dataset.vocabulary_size: {0}".format(dataset.vocabulary_size))

    # Write the (partially pretrained) weight matrix back into the TF variable.
    sess.run(self.token_embedding_weights.assign(initial_weights))
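# Usage sketch (hypothetical, for illustration only): the embedding matrix is
# typically populated right after the TensorFlow graph has been built and its
# variables initialized, before training or prediction starts. The names
# `model`, `sess`, `dataset`, `parameters`, and `token_to_vector` below are
# assumed from the surrounding code, not defined here.
#
#   sess.run(tf.global_variables_initializer())
#   model.load_pretrained_token_embeddings(sess, dataset, parameters,
#                                          token_to_vector=token_to_vector)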
def predict(files, model_path, output_dir, format, use_lstm=True):
    # Must specify output format
    if format not in ['i2b2']:
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats: i2b2\n')
        sys.stderr.write('\n')
        exit(1)

    # Load model
    #if use_lstm==False:
    with open(model_path, 'rb') as f:
        model = pickle.load(f, encoding='latin1')

    if model._use_lstm:
        import helper_dataset as hd
        import DatasetCliner_experimental as Exp
        import entity_lstm as entity_model

        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = True

        temp_pretrained_dataset_adress = parameters['model_folder'] + os.sep + "dataset.pickle"
        model._pretrained_dataset = pickle.load(open(temp_pretrained_dataset_adress, 'rb'))
        model._pretrained_wordvector = hd.load_pretrained_token_embeddings(parameters)
        model._current_model = None

        '''
        updating_notes = []
        for i, txt in enumerate(sorted(files)):
            note = Document(txt)
            tokenized_sents = note.getTokenizedSentences()
            updating_notes += tokenized_sents
        print(updating_notes)

        fictional_labels = copy.deepcopy(tokenized_sents)
        for idx, x in enumerate(fictional_labels):
            for val_id, value in enumerate(x):
                fictional_labels[idx][val_id] = 'O'

        Datasets_tokens = {}
        Datasets_labels = {}
        Datasets_tokens['deploy'] = tokenized_sents
        Datasets_labels['deploy'] = fictional_labels

        dataset = Exp.Dataset()
        token_to_vector = dataset.load_dataset(Datasets_tokens, Datasets_labels, "", parameters,
                                               token_to_vector=model._pretrained_wordvector,
                                               pretrained_dataset=model._pretrained_dataset)
        parameters['Feature_vector_length'] = dataset.feature_vector_size
        parameters['use_features_before_final_lstm'] = False
        dataset.update_dataset("", ['deploy'], Datasets_tokens, Datasets_labels)
        model._pretrained_dataset = dataset

        model_LSTM = entity_model.EntityLSTM(dataset, parameters)
        model._current_model = model_LSTM
        '''
        print("END TEST")
        #exit()
        #model.parameters=None

    # Tell user if not predicting
    if not files:
        sys.stderr.write("\n\tNote: You did not supply any input files\n\n")
        exit()

    n = len(files)
    for i, txt in enumerate(sorted(files)):
        note = Document(txt)

        # Output file
        fname = os.path.splitext(os.path.basename(txt))[0] + '.' + 'con'
        out_path = os.path.join(output_dir, fname)

        #'''
        if os.path.exists(out_path):
            print('\tWARNING: prediction file already exists (%s)' % out_path)
            #continue
        #'''

        sys.stdout.write('%s\n' % ('-' * 30))
        sys.stdout.write('\n\t%d of %d\n' % (i + 1, n))
        sys.stdout.write('\t%s\n\n' % txt)

        # Predict concept labels
        labels = model.predict_classes_from_document(note)

        # Get predictions in proper format
        output = note.write(labels)

        # Output the concept predictions
        sys.stdout.write('\n\nwriting to: %s\n' % out_path)
        with open(out_path, 'w') as f:
            write(f, '%s\n' % output)
        sys.stdout.write('\n')
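# Example invocation (a sketch under assumed file paths; adjust to your setup).
# `files` is a list of plain-text note paths; predictions are written as .con
# files into `output_dir` in i2b2 concept format.
#
#   predict(files=glob.glob('data/examples/*.txt'),
#           model_path='models/example.model',
#           output_dir='data/predictions',
#           format='i2b2',
#           use_lstm=True)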
def load_dataset(self, avaliable_datasets_sent, avaliable_datasets_labels, dataset_filepaths, parameters, token_to_vector=None, pretrained_dataset=None):
    '''
    dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'
    '''
    start_time = time.time()
    print('Load dataset... \n')
    if parameters['token_pretrained_embedding_filepath'] != '':
        if token_to_vector is None:
            token_to_vector = hd.load_pretrained_token_embeddings(parameters)
    else:
        token_to_vector = {}

    all_tokens_in_pretraining_dataset = []
    all_characters_in_pretraining_dataset = []

    if parameters['use_pretrained_model']:
        #temp_pretrained_dataset_adress="./models/NN_models/1235-4/dataset.pickle"  #"./models/NN_models/1234-5/dataset.pickle"
        if pretrained_dataset is None:
            temp_pretrained_dataset_adress = parameters['model_folder'] + os.sep + "dataset.pickle"
            pretraining_dataset = pickle.load(open(temp_pretrained_dataset_adress, "rb"))
            print("Pre-loading Pre-trained dataset objects")
        else:
            pretraining_dataset = pretrained_dataset
            print("Pretrained dataset was pre-loaded")

        all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values()
        all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values()

    remap_to_unk_count_threshold = 1
    self.UNK_TOKEN_INDEX = 0
    self.PADDING_CHARACTER_INDEX = 0
    self.tokens_mapped_to_unk = []
    self.UNK = 'UNK'
    self.unique_labels = []
    labels = {}
    tokens = {}
    label_count = {}
    token_count = {}
    character_count = {}
    features = {}
    features_file_names = {}
    feature_vector_size = {}

    #deploy
    for dataset_type in ['train', 'valid', 'test', 'deploy']:
        Not_here = False
        if dataset_type not in avaliable_datasets_sent:
            Not_here = True
        #_parse_dataset(self, dataset_filepath,dataset_type,sentences_list="",tags_list="")
        if Not_here == False:
            labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type], features[dataset_type], \
                features_file_names[dataset_type], feature_vector_size[dataset_type] \
                = self._parse_dataset("", dataset_type, sentences_list=avaliable_datasets_sent[dataset_type], tags_list=avaliable_datasets_labels[dataset_type])
        if Not_here == True:
            labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type], features[dataset_type], \
                features_file_names[dataset_type], feature_vector_size[dataset_type] \
                = self._parse_dataset("", dataset_type, sentences_list=[], tags_list=[])

    token_count['all'] = {}
    for token in list(token_count['train'].keys()) + list(token_count['valid'].keys()) + list(token_count['test'].keys()) + list(token_count['deploy'].keys()):
        token_count['all'][token] = token_count['train'][token] + token_count['valid'][token] + token_count['test'][token] + token_count['deploy'][token]

    if parameters['load_all_pretrained_token_embeddings']:
        for token in token_to_vector:
            if token not in token_count['all']:
                token_count['all'][token] = -1
                token_count['train'][token] = -1
        for token in all_tokens_in_pretraining_dataset:
            if token not in token_count['all']:
                token_count['all'][token] = -1
                token_count['train'][token] = -1

    character_count['all'] = {}
    for character in list(character_count['train'].keys()) + list(character_count['valid'].keys()) + list(character_count['test'].keys()) + list(character_count['deploy'].keys()):
        character_count['all'][character] = character_count['train'][character] + character_count['valid'][character] + character_count['test'][character] + character_count['deploy'][character]
    for character in all_characters_in_pretraining_dataset:
        if character not in character_count['all']:
            character_count['all'][character] = -1
            character_count['train'][character] = -1

    label_count['all'] = {}
    for character in list(label_count['train'].keys()) + list(label_count['valid'].keys()) + list(label_count['test'].keys()) + list(label_count['deploy'].keys()):
        label_count['all'][character] = label_count['train'][character] + label_count['valid'][character] + label_count['test'][character] + label_count['deploy'][character]

    token_count['all'] = hd.order_dictionary(token_count['all'], 'value_key', reverse=True)
    label_count['all'] = hd.order_dictionary(label_count['all'], 'key', reverse=False)
    character_count['all'] = hd.order_dictionary(character_count['all'], 'value', reverse=True)
    if self.verbose: print('character_count[\'all\']: {0}'.format(character_count['all']))

    token_to_index = {}
    token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
    iteration_number = 0
    number_of_unknown_tokens = 0
    if self.verbose: print("parameters['remap_unknown_tokens_to_unk']: {0}".format(parameters['remap_unknown_tokens_to_unk']))
    if self.verbose: print("len(token_count['train'].keys()): {0}".format(len(token_count['train'].keys())))
    for token, count in token_count['all'].items():
        if iteration_number == self.UNK_TOKEN_INDEX:
            iteration_number += 1
        if parameters['remap_unknown_tokens_to_unk'] == 1 and \
            (token_count['train'][token] == 0 or \
            parameters['load_only_pretrained_token_embeddings']) and \
            not hd.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \
            token not in all_tokens_in_pretraining_dataset:
            token_to_index[token] = self.UNK_TOKEN_INDEX
            number_of_unknown_tokens += 1
            self.tokens_mapped_to_unk.append(token)
        else:
            token_to_index[token] = iteration_number
            iteration_number += 1

    infrequent_token_indices = []
    for token, count in token_count['train'].items():
        if 0 < count <= remap_to_unk_count_threshold:
            infrequent_token_indices.append(token_to_index[token])
    #if self.verbose: print("len(token_count['train']): {0}".format(len(token_count['train'])))
    #if self.verbose: print("len(infrequent_token_indices): {0}".format(len(infrequent_token_indices)))

    # Ensure that both B- and I- versions exist for each label
    labels_without_bio = set()
    for label in label_count['all'].keys():
        new_label = hd.remove_bio_from_label_name(label)
        labels_without_bio.add(new_label)
    for label in labels_without_bio:
        if label == 'O':
            continue
        if parameters['tagging_format'] == 'bioes':
            prefixes = ['B-', 'I-', 'E-', 'S-']
        else:
            prefixes = ['B-', 'I-']
        for prefix in prefixes:
            l = prefix + label
            if l not in label_count['all']:
                label_count['all'][l] = 0
    label_count['all'] = hd.order_dictionary(label_count['all'], 'key', reverse=False)

    if parameters['use_pretrained_model']:
        print("USE_PRETRAINED_MODEL ACTIVE")
        self.unique_labels = sorted(list(pretraining_dataset.label_to_index.keys()))
        # Make sure labels are compatible with the pretraining dataset.
        for label in label_count['all']:
            if label not in pretraining_dataset.label_to_index:
                raise AssertionError("The label {0} does not exist in the pretraining dataset. ".format(label) +
                                     "Please ensure that only the following labels exist in the dataset: {0}".format(', '.join(self.unique_labels)))
        label_to_index = pretraining_dataset.label_to_index.copy()
    else:
        label_to_index = {}
        iteration_number = 0
        for label, count in label_count['all'].items():
            label_to_index[label] = iteration_number
            iteration_number += 1
            self.unique_labels.append(label)

    character_to_index = {}
    iteration_number = 0
    for character, count in character_count['all'].items():
        if iteration_number == self.PADDING_CHARACTER_INDEX:
            iteration_number += 1
        character_to_index[character] = iteration_number
        iteration_number += 1

    token_to_index = hd.order_dictionary(token_to_index, 'value', reverse=False)
    if self.verbose: print('token_to_index: {0}'.format(token_to_index))
    index_to_token = hd.reverse_dictionary(token_to_index)
    if parameters['remap_unknown_tokens_to_unk'] == 1:
        index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
    if self.verbose: print('index_to_token: {0}'.format(index_to_token))

    label_to_index = hd.order_dictionary(label_to_index, 'value', reverse=False)
    index_to_label = hd.reverse_dictionary(label_to_index)

    character_to_index = hd.order_dictionary(character_to_index, 'value', reverse=False)
    index_to_character = hd.reverse_dictionary(character_to_index)

    self.token_to_index = token_to_index
    self.index_to_token = index_to_token
    self.index_to_character = index_to_character
    self.character_to_index = character_to_index
    self.index_to_label = index_to_label
    self.label_to_index = label_to_index
    self.tokens = tokens
    self.labels = labels

    dataset_types = ['train', 'test', 'valid', 'deploy']
    token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(dataset_types)

    self.token_indices = token_indices
    self.label_indices = label_indices
    self.character_indices_padded = character_indices_padded
    self.character_indices = character_indices
    self.token_lengths = token_lengths
    self.characters = characters
    self.label_vector_indices = label_vector_indices

    self.number_of_classes = max(self.index_to_label.keys()) + 1
    self.vocabulary_size = max(self.index_to_token.keys()) + 1
    self.alphabet_size = max(self.index_to_character.keys()) + 1

    # unique_labels_of_interest is used to compute F1-scores.
    self.unique_labels_of_interest = list(self.unique_labels)
    self.unique_labels_of_interest.remove('O')

    self.unique_label_indices_of_interest = []
    for lab in self.unique_labels_of_interest:
        self.unique_label_indices_of_interest.append(label_to_index[lab])

    self.infrequent_token_indices = infrequent_token_indices

    elapsed_time = time.time() - start_time
    print('done ({0:.2f} seconds)'.format(elapsed_time))

    self.feature_vector_size = 0

    self._log()

    return token_to_vector
def predict(files, model_path, output_dir, format, use_lstm=True):
    # Must specify output format
    if format not in ['i2b2']:
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats: i2b2\n')
        sys.stderr.write('\n')
        exit(1)

    # Load model
    #if use_lstm==False:
    with open(model_path, 'rb') as f:
        model = pickle.load(f, encoding='latin1')

    if model._use_lstm:
        import helper_dataset as hd
        import DatasetCliner_experimental as Exp
        import entity_lstm as entity_model

        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = True

        temp_pretrained_dataset_adress = parameters['model_folder'] + os.sep + "dataset.pickle"
        model._pretrained_dataset = pickle.load(open(temp_pretrained_dataset_adress, 'rb'))
        model._pretrained_wordvector = hd.load_pretrained_token_embeddings(parameters)
        model._current_model = None

        print("END TEST")
        #exit()
        #model.parameters=None

    # Tell user if not predicting
    if not files:
        sys.stderr.write("\n\tNote: You did not supply any input files\n\n")
        exit()

    n = len(files)
    for i, txt in enumerate(sorted(files)):
        note = Document(txt)

        # Output file
        fname = os.path.splitext(os.path.basename(txt))[0] + '.' + 'con'
        out_path = os.path.join(output_dir, fname)

        if os.path.exists(out_path):
            print()
            #print('\tWARNING: prediction file already exists (%s)' % out_path)
            #continue

        '''
        sys.stdout.write('%s\n' % ('-' * 30))
        sys.stdout.write('\n\t%d of %d\n' % (i+1,n))
        sys.stdout.write('\t%s\n\n' % txt)
        '''

        # Predict concept labels
        labels = model.predict_classes_from_document(note)

        # Get predictions in proper format
        output = note.write(labels)
        print("-----------OUTPUT----------\n")
        print(output)

        # Output the concept predictions
        sys.stdout.write('\n\nwriting to: %s\n' % out_path)
        with open(out_path, 'w') as f:
            write(f, '%s\n' % output)
        sys.stdout.write('\n')