def first_pass_data_and_labels(notes):
    '''
    first_pass_data_and_labels()

    Purpose: Interface with notes object to get text data and labels

    @param notes. List of Note objects
    @return <tuple> whose elements are:
              0) list of tokenized sentences
              1) list of labels for tokenized sentences

    >>> import os
    >>> from notes.note import Note
    >>> base_dir = os.path.join(os.getenv('CLINER_DIR'), 'tests', 'data')
    >>> txt = os.path.join(base_dir, 'single.txt')
    >>> con = os.path.join(base_dir, 'single.con')
    >>> note_tmp = Note('i2b2')
    >>> note_tmp.read(txt, con)
    >>> notes = [note_tmp]
    >>> first_pass_data_and_labels(notes)
    ([['The', 'score', 'stood', 'four', 'to', 'two', ',', 'with', 'but', 'one', 'inning', 'more', 'to', 'play', ',']], [['B', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']])
    '''
    # Get the data and annotations from the Note objects
    l_tokenized_sentences = [note.getTokenizedSentences() for note in notes]
    l_iob_labels = [note.getIOBLabels() for note in notes]

    tokenized_sentences = flatten(l_tokenized_sentences)
    iob_labels = flatten(l_iob_labels)

    return tokenized_sentences, iob_labels

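# The helpers used above recur throughout this module: flatten() collapses a
# list-of-lists so per-note sentences can be fed to a single vectorizer or
# classifier, while save_list_structure()/reconstruct_list() undo that
# flattening afterwards. A minimal sketch of the round-trip, assuming the
# helpers behave as they are used in this file (illustration only, not the
# module's actual implementation):
#
#   nested  = [['a', 'b'], ['c']]
#   offsets = save_list_structure(nested)   # e.g. records per-sentence lengths
#   flat    = flatten(nested)               # ['a', 'b', 'c']
#   assert reconstruct_list(flat, offsets) == nested
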
def __first_train(self, tokenized_sentences, Y, do_grid=False):
    """
    Model::__first_train()

    Purpose: Train the first pass classifiers (for IOB chunking)

    @param tokenized_sentences. <list> of tokenized sentences
    @param Y.                   <list-of-lists> of IOB labels for words
    @param do_grid.             <boolean> whether to perform a grid search
    @return None
    """
    if globals_cliner.verbosity > 0:
        print('first pass')

    if globals_cliner.verbosity > 0:
        print('\textracting features (pass one)')

    # Separate into prose vs. nonprose
    nested_prose_data, nested_prose_Y = list(zip(*[
        line_iob_tup for line_iob_tup in zip(tokenized_sentences, Y)
        if is_prose_sentence(line_iob_tup[0])]))
    nested_nonprose_data, nested_nonprose_Y = list(zip(*[
        line_iob_tup for line_iob_tup in zip(tokenized_sentences, Y)
        if not is_prose_sentence(line_iob_tup[0])]))

    # Extract features
    nested_prose_feats = feat_obj.IOB_prose_features(nested_prose_data)
    nested_nonprose_feats = feat_obj.IOB_nonprose_features(nested_nonprose_data)

    # Flatten lists (because the classifier expects flat input)
    prose_Y = flatten(nested_prose_Y)
    nonprose_Y = flatten(nested_nonprose_Y)

    # Rename because downstream code uses these names
    pchunks = prose_Y
    nchunks = nonprose_Y
    prose = nested_prose_feats
    nonprose = nested_nonprose_feats

    # Train classifiers for prose and nonprose
    pvec, pclf = self.__generic_first_train('prose', prose, pchunks, do_grid)
    nvec, nclf = self.__generic_first_train('nonprose', nonprose, nchunks, do_grid)

    # Save vectorizers
    self._first_prose_vec = pvec
    self._first_nonprose_vec = nvec

    # Save classifiers
    self._first_prose_clf = pclf
    self._first_nonprose_clf = nclf

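# The zip(*) idiom above partitions parallel (sentence, labels) pairs into two
# aligned streams. A small self-contained illustration of the same pattern,
# with is_prose_sentence() stubbed out as a simple length check purely for the
# sake of the example (the real predicate is defined elsewhere in the package):
#
#   sents  = [['pt', 'denies', 'pain', '.'], ['BP:', '120/80']]
#   labels = [['O', 'O', 'B', 'O'], ['B', 'I']]
#   is_prose = lambda s: len(s) > 3                     # stand-in only
#   prose, prose_Y = zip(*[p for p in zip(sents, labels) if is_prose(p[0])])
#   nonprose, nonprose_Y = zip(*[p for p in zip(sents, labels) if not is_prose(p[0])])
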
def __generic_first_predict(self, p_or_n, text_features, dvect, clf, do_grid=False):
    '''
    Model::__generic_first_predict()

    Purpose: Prediction that works for both prose and nonprose

    @param p_or_n.        <string> either "prose" or "nonprose"
    @param text_features. <list-of-lists> of feature dictionaries
    @param dvect.         <DictVectorizer>
    @param clf.           scikit-learn classifier
    @param do_grid.       <boolean> indicating whether to perform grid search
    '''
    # If nothing to predict, skip actual prediction
    if len(text_features) == 0:
        print('\tnothing to predict (pass one) ' + p_or_n)
        return []

    # Save list structure to reconstruct after vectorization
    offsets = save_list_structure(text_features)

    if globals_cliner.verbosity > 0:
        print('\tvectorizing features (pass one) ' + p_or_n)

    # Vectorize features
    X_feats = dvect.transform(flatten(text_features))

    if globals_cliner.verbosity > 0:
        print('\tpredicting labels (pass one) ' + p_or_n)

    # CRF requires reconstructed (per-sentence) lists
    if self._crf_enabled:
        X_feats = reconstruct_list(list(X_feats), offsets)
        lib = crf
    else:
        lib = sci

    # Predict IOB labels
    out = lib.predict(clf, X_feats)

    # Format labels from output
    predictions = reconstruct_list(out, offsets)
    return predictions

def __second_train(self, chunked_data, inds_list, con_labels, do_grid=False):
    """
    Model::__second_train()

    Purpose: Train the second pass classifier (concept classification)

    @param chunked_data  <list> of tokenized sentences after collapsing chunks
    @param inds_list     <list-of-lists> of indices
                           - assertion: len(chunked_data) == len(inds_list)
                           - one line of 'inds_list' contains a list of indices
                             into the corresponding line of 'chunked_data'
    @param con_labels    <list> of concept label strings
                           - assertion: there are sum(map(len, inds_list)) labels,
                             i.e. each index from inds_list maps to a label
    @param do_grid       <boolean> indicating whether to perform a grid search
    @return None
    """
    if globals_cliner.verbosity > 0:
        print('second pass')

    # Extract features
    if globals_cliner.verbosity > 0:
        print('\textracting features (pass two)')
    text_features = [feat_obj.concept_features(s, inds)
                     for s, inds in zip(chunked_data, inds_list)]
    flattened_text_features = flatten(text_features)

    if globals_cliner.verbosity > 0:
        print('\tvectorizing features (pass two)')

    # Vectorize labels
    numeric_labels = [concept_labels[y] for y in con_labels]

    # Vectorize features
    self._second_vec = DictVectorizer()
    vectorized_features = self._second_vec.fit_transform(flattened_text_features)

    if globals_cliner.verbosity > 0:
        print('\ttraining classifier (pass two)')

    # Train the model
    self._second_clf = sci.train(vectorized_features, numeric_labels, do_grid)

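# For the label vectorization above: `concept_labels` maps concept strings to
# integer ids, and `reverse_concept_labels` (used in __second_predict below)
# maps them back. A minimal sketch of what such a pair might look like for the
# i2b2 concept set -- the exact contents live elsewhere in the package, so
# treat this as an illustration only:
#
#   concept_labels         = {'problem': 0, 'treatment': 1, 'test': 2}
#   reverse_concept_labels = {v: k for k, v in concept_labels.items()}
#   assert reverse_concept_labels[concept_labels['problem']] == 'problem'
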
def second_pass_data_and_labels(notes):
    '''
    second_pass_data_and_labels()

    Purpose: Interface with notes object to get text data and labels

    @param notes. List of Note objects
    @return <tuple> whose elements are:
              0) list of chunked sentences
              1) list of list-of-indices designating chunks
              2) list of labels for chunks

    >>> import os
    >>> from notes.note import Note
    >>> base_dir = os.path.join(os.getenv('CLINER_DIR'), 'tests', 'data')
    >>> txt = os.path.join(base_dir, 'single.txt')
    >>> con = os.path.join(base_dir, 'single.con')
    >>> note_tmp = Note('i2b2')
    >>> note_tmp.read(txt, con)
    >>> notes = [note_tmp]
    >>> second_pass_data_and_labels(notes)
    ([['The score stood four to two', ',', 'with', 'but', 'one', 'inning', 'more', 'to', 'play', ',']], [[0]], ['problem'])
    '''
    # Get the data and annotations from the Note objects
    l_chunked_sentences = [note.getChunkedText() for note in notes]
    l_inds_list = [note.getConceptIndices() for note in notes]
    l_con_labels = [note.getConceptLabels() for note in notes]

    chunked_sentences = flatten(l_chunked_sentences)
    inds_list = flatten(l_inds_list)
    con_labels = flatten(l_con_labels)

    return chunked_sentences, inds_list, con_labels

def train(self, train_notes, val=[], test=[]):
    """
    ClinerModel::train()

    Purpose: Train a Machine Learning model on annotated data

    @param train_notes. A list of Note objects (containing text and annotations)
    @param val.         An optional list of Note objects used as validation data
    @param test.        An optional list of Note objects used as test data
    @return None
    """
    # Extract formatted data
    train_sents = flatten([n.getTokenizedSentences() for n in train_notes])
    train_labels = flatten([n.getTokenLabels() for n in train_notes])

    if test:
        test_sents = flatten([n.getTokenizedSentences() for n in test])
        test_labels = flatten([n.getTokenLabels() for n in test])
    else:
        test_sents = []
        test_labels = []

    if val:
        print("VAL")
        val_sents = flatten([n.getTokenizedSentences() for n in val])
        val_labels = flatten([n.getTokenLabels() for n in val])
        self.train_fit(train_sents, train_labels,
                       val_sents=val_sents, val_labels=val_labels,
                       test_sents=test_sents, test_labels=test_labels)
    else:
        print("NO DEV")
        self.train_fit(train_sents, train_labels, dev_split=0.1,
                       test_sents=test_sents, test_labels=test_labels)

    self._train_files = [n.getName() for n in train_notes + val]

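# A rough usage sketch for ClinerModel.train(), following the same Note setup
# as the doctests above. The ClinerModel constructor call shown here is an
# assumption (its real arguments may differ); only the Note reading calls
# mirror the doctests:
#
#   from notes.note import Note
#   note = Note('i2b2')
#   note.read('record.txt', 'record.con')
#   model = ClinerModel()           # hypothetical constructor call
#   model.train([note])             # no val/test notes -> 10% dev split
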
def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, hyperparams):
    '''
    generic_predict()

    Purpose: Predict labels with a model that works for both prose and nonprose

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                              into words
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained keras model.
    @param use_lstm.        Bool indicating whether clf is a CRF or LSTM.
    '''
    if use_lstm:
        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = True

        predictions = []
        sys.stdout.write('\n use_lstm \n')
        dataset = Exp.Dataset()

        # The dataset loader expects labels, so give the deploy data placeholder 'O' tags
        fictional_labels = copy.deepcopy(tokenized_sents)
        for idx, x in enumerate(fictional_labels):
            for val_id, value in enumerate(x):
                fictional_labels[idx][val_id] = 'O'

        Datasets_tokens = {}
        Datasets_labels = {}
        Datasets_tokens['deploy'] = tokenized_sents
        Datasets_labels['deploy'] = fictional_labels

        token_to_vector = dataset.load_dataset(Datasets_tokens, Datasets_labels, "",
                                               parameters,
                                               token_to_vector=tokens_to_vec,
                                               pretrained_dataset=pretrained_dataset)
        print(dataset.token_indices.keys())

        parameters['Feature_vector_length'] = dataset.feature_vector_size
        parameters['use_features_before_final_lstm'] = False

        dataset.update_dataset("", ['deploy'], Datasets_tokens, Datasets_labels)
        del Datasets_tokens
        del Datasets_labels

        model = entity_model.EntityLSTM(dataset, parameters)

        os.mkdir(parameters['conll_like_result_folder'])
        test_temp = os.path.join(parameters['conll_like_result_folder'], 'test/')
        train_temp = os.path.join(parameters['conll_like_result_folder'], 'train/')
        valid_temp = os.path.join(parameters['conll_like_result_folder'], 'valid/')
        os.mkdir(test_temp)
        os.mkdir(train_temp)
        os.mkdir(valid_temp)

        sess = tf.Session()
        with sess.as_default():
            transition_params_trained = model.restore_from_pretrained_model(
                parameters, dataset, sess,
                token_to_vector=token_to_vector,
                pretrained_dataset=pretrained_dataset)
            del token_to_vector
            predictions = training_predict_LSTM.prediction_step(
                sess, dataset, "deploy", model, 0,
                parameters['conll_like_result_folder'], transition_params_trained)
            sess.close()
            tf.reset_default_graph()
            shutil.rmtree(parameters['conll_like_result_folder'])

        return predictions, model

    # If nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        sys.stdout.write('\tnothing to predict %s\n' % p_or_n)
        return []

    sys.stdout.write('\tvectorizing words %s\n' % p_or_n)

    if use_lstm:
        print('todo: incorporate lstm')
    else:
        from cliner.feature_extraction.features import extract_features

        # vectorize tokenized sentences
        text_features = extract_features(tokenized_sents)
        flat_X_feats = vocab.transform(flatten(text_features))
        X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    sys.stdout.write('\tpredicting labels %s\n' % p_or_n)

    # Predict labels
    if use_lstm:
        print("TEST_PREDICT")
        exit()
    else:
        from cliner.machine_learning import crf
        predictions = crf.predict(clf, X)

    # Format labels from output
    return predictions

def generic_train(p_or_n, train_sents, train_labels, use_lstm,
                  val_sents=None, val_labels=None,
                  test_sents=None, test_labels=None, dev_split=None):
    '''
    generic_train()

    Train a model that works for both prose and nonprose

    @param p_or_n.       A string that indicates "prose", "nonprose", or "all"
    @param train_sents.  A list of sentences; each sentence is tokenized into words
    @param train_labels. Parallel to `train_sents`, 7-way labels for concept spans
    @param use_lstm.     Bool indicating whether to train a CRF or an LSTM.
    @param val_sents.    Validation data. Same format as train_sents
    @param val_labels.   Validation data. Same format as train_labels
    @param test_sents.   Test data. Same format as train_sents
    @param test_labels.  Test data. Same format as train_labels
    @param dev_split.    A real number from 0 to 1
    '''
    # Must have data to train on:
    if len(train_sents) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # if you should split the data into train/dev yourself
    if (not val_sents) and (dev_split is not None) and (dev_split > 0.0) and (len(train_sents) > 10):
        p = int(dev_split * 100)
        sys.stdout.write('\tCreating %d/%d train/dev split\n' % (100 - p, p))

        perm = list(range(len(train_sents)))
        random.shuffle(perm)
        train_sents = [train_sents[i] for i in perm]
        train_labels = [train_labels[i] for i in perm]

        ind = int(dev_split * len(train_sents))

        val_sents = train_sents[:ind]
        train_sents = train_sents[ind:]
        val_labels = train_labels[:ind]
        train_labels = train_labels[ind:]
    else:
        sys.stdout.write('\tUsing existing validation data\n')

    sys.stdout.write('\tvectorizing words %s\n' % p_or_n)

    if use_lstm:
        print("TESTING NEW DATASET OBJECT")
        dataset = Exp.Dataset()

        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = False

        Datasets_tokens = {}
        Datasets_labels = {}

        Datasets_tokens['train'] = train_sents
        Datasets_labels['train'] = train_labels

        if val_sents != None:
            Datasets_tokens['valid'] = val_sents
            Datasets_labels['valid'] = val_labels

        if test_sents != None:
            Datasets_tokens['test'] = test_sents
            Datasets_labels['test'] = test_labels

        dataset.load_dataset(Datasets_tokens, Datasets_labels, "", parameters)
        pickle.dump(dataset,
                    open(os.path.join(parameters['model_folder'], 'dataset.pickle'), 'wb'))

        print(Datasets_tokens['valid'][0])
        print(Datasets_tokens['test'][0])

        parameters['Feature_vector_length'] = dataset.feature_vector_size
        parameters['use_features_before_final_lstm'] = False
        parameters['learning_rate'] = 0.005

        sess = tf.Session()
        number_of_sent = list(range(len(dataset.token_indices['train'])))

        with sess.as_default():
            model = entity_model.EntityLSTM(dataset, parameters)
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)

            epoch_number = -1
            transition_params_trained = np.random.rand(5 + 2, 5 + 2)
            values = {}
            values["best"] = 0

            f1_dictionary = {}
            f1_dictionary['best'] = 0

            model_saver = tf.train.Saver(max_to_keep=100)

            print("START TRAINING")
            eval_dir = os.path.join(tmo_dir, 'cliner_eval_%d' % random.randint(0, 256) + os.sep)
            parameters['conll_like_result_folder'] = eval_dir

            test_temp = os.path.join(parameters['conll_like_result_folder'], 'test/')
            train_temp = os.path.join(parameters['conll_like_result_folder'], 'train/')
            valid_temp = os.path.join(parameters['conll_like_result_folder'], 'valid/')

            os.mkdir(parameters['conll_like_result_folder'])
            os.mkdir(test_temp)
            os.mkdir(train_temp)
            os.mkdir(valid_temp)

            while epoch_number < 90:
                average_loss_per_phrase = 0
                accuracy_per_phase = 0
                step = 0

                epoch_number += 1
                if epoch_number != 0:
                    sequence_numbers = list(range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        loss, accuracy, transition_params_trained = \
                            training_predict_LSTM.train_step(sess, dataset, sequence_number, model)
                        average_loss_per_phrase += loss
                        accuracy_per_phase += accuracy
                        step += 1
                        if step % 10 == 0:
                            print('Training {0:.2f}% done\n'.format(step / len(sequence_numbers) * 100))

                    model_saver.save(sess,
                                     os.path.join(parameters['model_folder'],
                                                  'model_{0:05d}.ckpt'.format(epoch_number)))

                    total_loss = average_loss_per_phrase
                    total_accuracy = accuracy_per_phase

                    average_loss_per_phrase = average_loss_per_phrase / len(number_of_sent)
                    accuracy_per_phase = accuracy_per_phase / len(number_of_sent)

                if epoch_number > 0:
                    f1, predictions = training_predict_LSTM.prediction_step(
                        sess, dataset, "test", model, epoch_number,
                        parameters['conll_like_result_folder'], transition_params_trained)
                    f1_train, _ = training_predict_LSTM.prediction_step(
                        sess, dataset, "train", model, epoch_number,
                        parameters['conll_like_result_folder'], transition_params_trained)
                    f1_valid, _ = training_predict_LSTM.prediction_step(
                        sess, dataset, "valid", model, epoch_number,
                        parameters['conll_like_result_folder'], transition_params_trained)

                    correctly_predicted_tokens = training_predict_LSTM.compute_train_accuracy(
                        parameters['conll_like_result_folder'] + "valid" + os.sep +
                        "epoche_" + str(epoch_number) + ".txt")

                    if f1_dictionary['best'] < float(f1_valid):
                        f1_dictionary['epoche'] = epoch_number
                        f1_dictionary['best'] = float(f1_valid)

                    if values["best"] < correctly_predicted_tokens:
                        values["epoche"] = epoch_number
                        values["best"] = correctly_predicted_tokens

                    print("NEW EPOCHE" + " " + str(epoch_number))
                    print("Current F1 on train" + " " + str(f1_train))
                    print("Current F1 on valid" + " " + str(f1_valid))
                    print("Current F1 on test" + " " + str(f1))
                    print("Current F1 best (validation): ")
                    print(f1_dictionary)

            shutil.rmtree(parameters['conll_like_result_folder'])
            return parameters, dataset, f1_dictionary['best']

    else:
        ########
        #  CRF
        ########
        from cliner.feature_extraction.features import extract_features

        # vectorize tokenized sentences
        text_features = extract_features(train_sents)
        # type(text_features): <type 'list'>

        # Collect list of feature types
        enabled_features = set()
        for sf in text_features:
            for wf in sf:
                for (feature_type, instance), value in wf.items():
                    if feature_type.startswith('prev'):
                        feature_type = 'PREV*'
                    if feature_type.startswith('next'):
                        feature_type = 'NEXT*'
                    enabled_features.add(feature_type)
        enabled_features = sorted(enabled_features)

        # Vectorize features
        vocab = DictVectorizer()
        flat_X_feats = vocab.fit_transform(flatten(text_features))
        X_feats = reconstruct_list(flat_X_feats, save_list_structure(text_features))

        # vectorize IOB labels
        Y_labels = [[tag2id[y] for y in y_seq] for y_seq in train_labels]

        assert len(X_feats) == len(Y_labels)
        for i in range(len(X_feats)):
            assert X_feats[i].shape[0] == len(Y_labels[i])

        # if there is specified validation data, then vectorize it
        if val_sents:
            # vectorize validation X
            val_text_features = extract_features(val_sents)
            flat_val_X_feats = vocab.transform(flatten(val_text_features))
            val_X = reconstruct_list(flat_val_X_feats, save_list_structure(val_text_features))
            # vectorize validation Y
            val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]
        else:
            val_X = None
            val_Y = None

        # if there is specified test data, then vectorize it
        if test_sents:
            # vectorize test X
            test_text_features = extract_features(test_sents)
            flat_test_X_feats = vocab.transform(flatten(test_text_features))
            test_X = reconstruct_list(flat_test_X_feats, save_list_structure(test_text_features))
            # vectorize test Y
            test_Y = [[tag2id[y] for y in y_seq] for y_seq in test_labels]
        else:
            test_X = None
            test_Y = None

    sys.stdout.write('\ttraining classifiers %s\n' % p_or_n)

    if use_lstm:
        # train using lstm
        clf, dev_score = keras_ml.train(X_seq_ids, Y_labels, tag2id, len(vocab),
                                        val_X_ids=val_X, val_Y_ids=val_Y,
                                        test_X_ids=test_X, test_Y_ids=test_Y)
    else:
        # train using crf
        from machine_learning import crf
        clf, dev_score = crf.train(X_feats, Y_labels,
                                   val_X=val_X, val_Y=val_Y,
                                   test_X=test_X, test_Y=test_Y)

    return vocab, clf, dev_score, enabled_features

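# A rough end-to-end sketch of how the CRF path of generic_train() and
# generic_predict() fit together. The sentence and label literals are made up
# purely for illustration; the functions and their return values are the ones
# defined above:
#
#   sents  = [['Patient', 'denies', 'chest', 'pain', '.']]
#   labels = [['O', 'O', 'B-problem', 'I-problem', 'O']]
#   vocab, clf, dev_score, feats = generic_train('all', sents, labels,
#                                                use_lstm=False, dev_split=0.0)
#   preds = generic_predict('all', sents, vocab, clf,
#                           use_lstm=False, hyperparams=None)
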
def __generic_first_train(self, p_or_n, text_features, iob_labels, do_grid=False):
    '''
    Model::__generic_first_train()

    Purpose: Training that works for both prose and nonprose

    @param p_or_n.        <string> either "prose" or "nonprose"
    @param text_features. <list-of-lists> of feature dictionaries
    @param iob_labels.    <list> of "I", "O", and "B" labels
    @param do_grid.       <boolean> indicating whether to perform grid search
    '''
    # Must have data to train on
    if len(text_features) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # Vectorize IOB labels
    Y_labels = [IOB_labels[y] for y in iob_labels]

    # Save list structure to reconstruct after vectorization
    offsets = save_list_structure(text_features)

    if globals_cliner.verbosity > 0:
        print('\tvectorizing features (pass one) ' + p_or_n)

    # Vectorize features
    dvect = DictVectorizer()
    X_feats = dvect.fit_transform(flatten(text_features))

    # CRF needs reconstructed (per-sentence) lists
    if self._crf_enabled:
        X_feats = reconstruct_list(list(X_feats), offsets)
        Y_labels = reconstruct_list(Y_labels, offsets)
        lib = crf
    else:
        lib = sci

    if globals_cliner.verbosity > 0:
        print('\ttraining classifiers (pass one) ' + p_or_n)

    # Train classifier
    clf = lib.train(X_feats, Y_labels, do_grid)

    return dvect, clf

def __second_predict(self, chunked_sentences, inds_list):
    # If first pass predicted no concepts, then skip
    # NOTE: Special case because SVM cannot have empty input
    if sum([len(inds) for inds in inds_list]) == 0:
        print("first pass predicted no concepts, skipping second pass")
        return []

    # Create object that is a wrapper for the features
    if globals_cliner.verbosity > 0:
        print('\textracting features (pass two)')

    # Extract features
    text_features = [feat_obj.concept_features(s, inds)
                     for s, inds in zip(chunked_sentences, inds_list)]
    flattened_text_features = flatten(text_features)

    if globals_cliner.verbosity > 0:
        print('\tvectorizing features (pass two)')

    # Vectorize features
    vectorized_features = self._second_vec.transform(flattened_text_features)

    if globals_cliner.verbosity > 0:
        print('\tpredicting labels (pass two)')

    # Predict concept labels
    out = sci.predict(self._second_clf, vectorized_features)

    # Line-by-line processing
    o = list(out)
    classifications = []
    for lineno, inds in enumerate(inds_list):

        # Skip empty line
        if not inds:
            continue

        # For each concept
        for ind in inds:

            # Get next concept
            concept = reverse_concept_labels[o.pop(0)]

            # Get start position (ex. 7th word of line)
            start = 0
            for i in range(ind):
                start += len(chunked_sentences[lineno][i].split())

            # Length of chunk
            length = len(chunked_sentences[lineno][ind].split())

            # Classification token
            classifications.append((concept, lineno + 1, start, start + length - 1))

    # Return classifications
    return classifications

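# A worked example of the span arithmetic above, using the chunked sentence
# from the second_pass_data_and_labels() doctest. The chunk at index 0 is
# 'The score stood four to two' (6 words), so the classification tuple covers
# token offsets 0 through 5 on (1-indexed) line 1. The values below only
# illustrate that arithmetic:
#
#   chunked_line = ['The score stood four to two', ',', 'with', 'but', 'one',
#                   'inning', 'more', 'to', 'play', ',']
#   ind    = 0
#   start  = sum(len(chunked_line[i].split()) for i in range(ind))   # 0
#   length = len(chunked_line[ind].split())                          # 6
#   # -> ('problem', 1, 0, 5)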