def predict(self, model, test_data, predict_prob=False):
    XL, XR = test_data

    # get expected length of model input
    model_input_len = model.input_shape[0][2]

    if model_input_len > XL.shape[2]:
        # pad input matrix to fit expected length
        filler = np.ones((1, 1, model_input_len))
        XL, _ = Network._pad_to_match_dimensions(XL, filler, 2, pad_left=True)
    else:
        XL = Network._strip_to_length(XL, model_input_len, 2)

    print "predicting..."
    labels = model.predict_classes([XL, XR])
    if predict_prob:
        probs = model.predict_proba([XL, XR])
    else:
        probs = None

    return labels, probs
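# Minimal illustrative sketch (not part of the pipeline above): the helper
# Network._pad_to_match_dimensions is assumed to left-pad the sequence axis
# (axis 2) so a shorter test tensor matches the length the trained model
# expects. np.pad reproduces the idea on a dummy tensor; the project helper
# may use a different fill value than the zeros used here.
def _example_left_pad_to_model_length(XL, model_input_len):
    """Left-pad XL along axis 2 up to model_input_len (illustration only)."""
    import numpy as np
    deficit = model_input_len - XL.shape[2]
    if deficit <= 0:
        return XL
    return np.pad(XL, ((0, 0), (0, 0), (deficit, 0)), mode='constant')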
def process_note(note, labels, del_list, label_index, probs):
    # get entity pairs, offsets, tokens, and event/timex entities
    entities = note.get_tlink_id_pairs()
    offsets = note.get_token_char_offsets()

    # flatten list of tokens
    tokenized_text = note.get_tokenized_text()
    tokens = []
    for line in tokenized_text:
        tokens += tokenized_text[line]

    # flatten list of labels
    event_timex_labels = []
    for label_list in note.get_labels():
        event_timex_labels += label_list

    # remove duplicates from del_list and sort it in descending order,
    # so that deleting by index does not shift the indexes still to be deleted
    del_list = list(set(del_list))
    del_list.sort()
    del_list.reverse()
    for index in del_list:
        del entities[index]

    note_labels = labels[label_index:label_index + len(entities)]
    note_label_nums = Network()._convert_str_labels_to_int(note_labels)

    processed_entities = {}
    used_indexes = []
    # for the same entity pair (regardless of order), keep only the prediction
    # with the higher score; this assumes the reversed pair directly follows
    # the forward pair in the list
    for i, note_label_num in enumerate(note_label_nums):
        if (entities[i][1], entities[i][0]) in processed_entities:
            if probs[i][note_label_num] > processed_entities[(entities[i][1], entities[i][0])]:
                used_indexes.append(i)      # reversed pair scores higher
            else:
                used_indexes.append(i - 1)  # keep the forward pair
        else:
            processed_entities[(entities[i][0], entities[i][1])] = probs[i][note_label_num]

    note_labels = [note_labels[x] for x in used_indexes]
    entities = [entities[x] for x in used_indexes]

    return event_timex_labels, note_labels, entities, offsets, tokens
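# Illustration only (hypothetical entity IDs and scores): assuming reversed
# pairs appear consecutively, only the higher-scoring direction of each pair
# survives. Here the (e2, e1) prediction scores 0.9 > 0.6, so index 1 is kept.
def _example_symmetric_pair_dedup():
    entities = [('e1', 'e2'), ('e2', 'e1')]
    scores = [0.6, 0.9]
    best, used = {}, []
    for i, pair in enumerate(entities):
        rev = (pair[1], pair[0])
        if rev in best:
            used.append(i if scores[i] > best[rev] else i - 1)
        else:
            best[pair] = scores[i]
    return [entities[i] for i in used]  # -> [('e2', 'e1')]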
def get_test_input(self, note):
    """Given a note, return data for every token"""
    max_id = len(note.id_to_tok)  # word IDs start at 1

    word_vectors = None
    attribute_vectors = None

    for sent_num in note.pre_processed_text:
        for tok in note.pre_processed_text[sent_num]:
            wordID = tok['id']
            word_index = int(wordID[1:])  # wordID example: 'w31'

            # context window of four tokens on each side, clamped to the note
            left_edge = max(1, word_index - 4)
            right_edge = min(max_id, word_index + 4)
            context_tokens = [note.id_to_tok['w' + str(x)]
                              for x in range(left_edge, right_edge + 1)]
            context_words = [x['token'] for x in context_tokens]

            vecs = self._extract_word_representations(context_words)
            if word_vectors is None:
                word_vectors = vecs
            else:
                word_vectors = Network._pad_and_concatenate(word_vectors, vecs, axis=0)

            attributes = np.array([tok.get('is_main_verb', False),
                                   tok.get('is_predicate', False),
                                   tok['pos'] == 'V',
                                   tok['pos'] == 'N'])
            attributes = attributes[np.newaxis, :]
            if attribute_vectors is None:
                attribute_vectors = attributes
            else:
                attribute_vectors = np.concatenate((attribute_vectors, attributes), axis=0)

    return word_vectors, attribute_vectors
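# Illustration only: the context window spans four word IDs to each side of
# the target token, clamped to the valid range [1, max_id]. For max_id = 10
# and target 'w2', the window covers w1..w6.
def _example_context_window(word_index, max_id, window=4):
    left_edge = max(1, word_index - window)
    right_edge = min(max_id, word_index + window)
    return ['w' + str(x) for x in range(left_edge, right_edge + 1)]

# _example_context_window(2, 10) -> ['w1', 'w2', 'w3', 'w4', 'w5', 'w6']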
def trainNetwork(gold_files, val_files, newsreader_dir, pair_type, ordered=False, no_val=False,
                 nolink_ratio=1.0, callbacks=[], train_dir='./'):
    '''
    Train a neural network for classification of temporal relations.
    '''
    print "Called trainNetwork"

    global N_CLASSES

    val_notes = None
    if not os.path.isfile(train_dir + 'training_data.pkl'):
        notes = get_notes(gold_files, newsreader_dir)
        if not no_val:
            val_notes = get_notes(val_files, newsreader_dir)

    network = Network()
    print "loading word vectors..."
    network.word_vectors = load_word2vec_binary(os.environ["TEA_PATH"] + '/GoogleNews-vectors-negative300.bin',
                                                verbose=0)

    if os.path.isfile(train_dir + 'training_data.pkl'):
        print "loading pkl file... this may take over 10 minutes"
        training_data = cPickle.load(open(train_dir + 'training_data.pkl'))
        print "training data size:", training_data[0].shape, training_data[1].shape, len(training_data[2])
    else:
        # nolink_ratio = # no-tlink cases / # tlink cases
        training_data = network._get_training_input(notes, pair_type=pair_type, nolink_ratio=nolink_ratio,
                                                    shuffle=True, ordered=ordered)
        print "training data size:", training_data[0].shape, training_data[1].shape, len(training_data[2])

    if not no_val and val_notes is not None:
        val_data = network._get_test_input(val_notes, pair_type=pair_type, ordered=ordered)
        print "validation data size:", val_data[0].shape, val_data[1].shape, len(val_data[2])
    else:
        val_data = None

    del network.word_vectors

    NNet, history = network.train_model(None, epochs=200, training_input=training_data, val_input=val_data,
                                        no_val=no_val, weight_classes=False, batch_size=100,
                                        encoder_dropout=0, decoder_dropout=0.5, input_dropout=0.6,
                                        reg_W=0, reg_B=0, reg_act=0, LSTM_size=256, dense_size=100,
                                        maxpooling=True, data_dim=300, max_len='auto',
                                        nb_classes=N_CLASSES, callbacks=callbacks, ordered=ordered)

    return NNet, history
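# Minimal usage sketch (hypothetical paths and helper name): train an
# intra-sentence model and persist it in the layout the prediction script
# below expects (<model_path>/intra/.arch.json and .weights.h5). The Keras 1.x
# to_json / save_weights calls are assumed, matching the API used above.
def _example_train_and_save(gold_files, val_files):
    import os
    model, history = trainNetwork(gold_files, val_files,
                                  newsreader_dir='newsreader_annotations/',
                                  pair_type='intra',
                                  nolink_ratio=1.0,
                                  train_dir='./model_destination/')
    arch_dir = os.path.join('./model_destination/', 'intra')
    if not os.path.isdir(arch_dir):
        os.makedirs(arch_dir)
    open(os.path.join(arch_dir, '.arch.json'), 'w').write(model.to_json())
    model.save_weights(os.path.join(arch_dir, '.weights.h5'))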
def main():
    global timenote_imported

    parser = argparse.ArgumentParser()

    parser.add_argument("predict_dir",
                        nargs=1,
                        help="Directory containing test input")

    parser.add_argument("intra_model_path",
                        help="Where trained model for intra-sentence pairs is located")

    parser.add_argument("cross_model_path",
                        help="Where trained model for cross-sentence pairs is located")

    parser.add_argument("dct_model_path",
                        help="Where trained model for events and document creation time is located")

    parser.add_argument("annotation_destination",
                        help="Where annotated files are written")

    parser.add_argument("newsreader_annotations",
                        help="Where newsreader pipeline parsed file objects go")

    parser.add_argument("--evaluate",
                        action='store_true',
                        default=False,
                        help="Use gold data from the given files to produce evaluation metrics")

    args = parser.parse_args()

    annotation_destination = args.annotation_destination
    newsreader_dir = args.newsreader_annotations

    if os.path.isdir(annotation_destination) is False:
        sys.exit("\n\noutput destination does not exist")
    if os.path.isdir(newsreader_dir) is False:
        sys.exit("invalid path for time note dir")

    predict_dir = args.predict_dir[0]
    if os.path.isdir(predict_dir) is False:
        sys.exit("\n\nno input directory exists at the given path")

    # pickled_timeml_notes = [os.path.basename(l) for l in glob.glob(newsreader_dir + "/*")]

    if '/*' != args.predict_dir[0][-2:]:
        predict_dir = predict_dir + '/*'

    # get files in directory
    files = glob.glob(predict_dir)

    gold_files = []
    tml_files = []

    for f in files:
        if f.endswith(".TE3input"):  # input file without tlinks
            tml_files.append(f)
        elif f.endswith(".tml"):
            gold_files.append(f)

    gold_files.sort()
    tml_files.sort()
    print "gold_files", gold_files

    # one-to-one pairing of annotated file and un-annotated
    # assert len(gold_files) == len(tml_files)

    network = Network()

    intra_model = model_from_json(open(os.path.join(args.intra_model_path, 'intra', '.arch.json')).read())
    intra_model.load_weights(os.path.join(args.intra_model_path, 'intra', '.weights.h5'))

    cross_model = model_from_json(open(os.path.join(args.cross_model_path, 'cross', '.arch.json')).read())
    cross_model.load_weights(os.path.join(args.cross_model_path, 'cross', '.weights.h5'))

    dct_model = model_from_json(open(os.path.join(args.dct_model_path, 'dct', '.arch.json')).read())
    dct_model.load_weights(os.path.join(args.dct_model_path, 'dct', '.weights.h5'))

    for i, tml in enumerate(gold_files):
        print '\n\nprocessing file {}/{} {}'.format(i + 1, len(gold_files), tml)

        if os.path.isfile(os.path.join(newsreader_dir, basename(tml) + ".parsed.pickle")):
            tmp_note = cPickle.load(open(os.path.join(newsreader_dir, basename(tml) + ".parsed.pickle"), "rb"))
        else:
            tmp_note = TimeNote(tml, tml)
            cPickle.dump(tmp_note, open(newsreader_dir + "/" + basename(tml) + ".parsed.pickle", "wb"))

        # notes.append(tmp_note)
        notes = [tmp_note]  # required to be a list

        intra_labels, intra_probs, intra_pair_index = network.single_predict(notes, intra_model, 'intra',
                                                                             predict_prob=True)
        intra_labels, intra_pair_index, intra_scores = network.smart_predict(intra_labels, intra_probs,
                                                                             intra_pair_index, type='str')

        cross_labels, cross_probs, cross_pair_index = network.single_predict(notes, cross_model, 'cross',
                                                                             predict_prob=True)
        cross_labels, cross_pair_index, cross_scores = network.smart_predict(cross_labels, cross_probs,
                                                                             cross_pair_index, type='str')

        timex_labels, timex_pair_index = predict_timex_rel(notes)

        dct_labels, dct_probs, dct_pair_index = network.single_predict(notes, dct_model, 'dct',
                                                                       predict_prob=True)
        dct_labels = network._convert_int_labels_to_str(dct_labels)
        dct_scores = [max(probs) for probs in dct_probs]
        assert len(dct_labels) == len(dct_scores)

        for note_index, note in enumerate(notes):
            note_id_pairs = []
            note_labels = []
            note_scores = []

            for key in intra_pair_index.keys():  # {(note_id, (ei, ej)): index}
                # the dictionary is dynamically changing, so we need to check
                if key not in intra_pair_index:
                    continue
                if key[0] == note_index:
                    note_id_pairs.append(key[1])
                    note_labels.append(intra_labels[intra_pair_index[key]])
                    note_scores.append(intra_scores[intra_pair_index[key]])
                    intra_pair_index.pop(key)
                    opposite_key = (key[0], (key[1][1], key[1][0]))
                    intra_pair_index.pop(opposite_key)

            for key in cross_pair_index.keys():  # {(note_id, (ei, ej)): index}
                # the dictionary is dynamically changing, so we need to check
                if key not in cross_pair_index:
                    continue
                if key[0] == note_index:
                    note_id_pairs.append(key[1])
                    note_labels.append(cross_labels[cross_pair_index[key]])
                    note_scores.append(cross_scores[cross_pair_index[key]])
                    cross_pair_index.pop(key)
                    opposite_key = (key[0], (key[1][1], key[1][0]))
                    cross_pair_index.pop(opposite_key)

            for key in timex_pair_index.keys():  # {(note_id, (t, t)): index}
                if key[0] == note_index:
                    note_id_pairs.append(key[1])
                    note_labels.append(timex_labels[timex_pair_index[key]])
                    note_scores.append(1.0)  # trust timex tlinks
                    timex_pair_index.pop(key)

            for key in dct_pair_index.keys():  # {(note_id, (ei, t0)): index}
                if key[0] == note_index:
                    note_id_pairs.append(key[1])
                    note_labels.append(dct_labels[dct_pair_index[key]])
                    note_scores.append(max(dct_probs[dct_pair_index[key]]))
                    # note_scores.append(0.0)
                    dct_pair_index.pop(key)

            # note_labels, note_scores = resolve_coref(note, note_id_pairs, note_labels, note_scores)
            note_labels = modify_tlinks(note_id_pairs, note_labels, note_scores)
            save_predictions(note, note_id_pairs, note_labels, annotation_destination)
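# Illustration only (hypothetical IDs): each *_pair_index maps
# (note_id, (source_id, target_id)) to a row in the corresponding label/score
# arrays, so predictions from the intra, cross, timex, and DCT models can be
# merged per note before modify_tlinks() resolves conflicts.
_example_pair_index = {
    (0, ('e1', 'e2')): 0,  # intra-sentence event pair, forward direction
    (0, ('e2', 'e1')): 1,  # same pair, reverse direction
    (0, ('e3', 't0')): 2,  # event paired with the document creation time
}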
def train_model(self, training_data, validation_data=None, model_destination='./', epochs=500,
                weight_classes=False, batch_size=256, encoder_dropout=0, decoder_dropout=0.5,
                input_dropout=0.5, reg_W=0, reg_B=0, reg_act=0, LSTM_size=128, dense_size=30,
                maxpooling=True, data_dim=300, max_len='auto'):

    XL, XR, Y = training_data
    print "training data shape: ", XL.shape

    # reformat labels so that they can be used by the NN
    # Y = to_categorical(Y, 2)

    # use weighting to assist with the imbalanced data set problem
    class_weight = None
    if weight_classes:
        N = len(Y)
        n_pos = sum(Y)
        neg_weight = 1.0 * n_pos / N  # inversely proportional to frequency
        class_weight = {1: 1 - neg_weight, 0: neg_weight}

    # infer maximum sequence length
    if max_len == 'auto':
        max_len = XL.shape[2]
    else:
        # pad input to reach max_len
        filler = np.ones((1, 1, max_len))
        XL, _ = Network._pad_to_match_dimensions(XL, filler, 2, pad_left=True)
        XR, _ = Network._pad_to_match_dimensions(XR, filler, 2, pad_left=True)

    model = self.get_untrained_model(encoder_dropout=encoder_dropout, decoder_dropout=decoder_dropout,
                                     input_dropout=input_dropout, reg_W=reg_W, reg_B=reg_B, reg_act=reg_act,
                                     LSTM_size=LSTM_size, dense_size=dense_size, maxpooling=maxpooling,
                                     data_dim=data_dim, max_len=max_len)

    # split off the first 20% as validation data; this way we get the same
    # validation split every time we use this data sample, and can test on it
    # afterwards to get a confusion matrix
    if validation_data is None:
        V_XL = XL[:(XL.shape[0] / 5), :, :]
        V_XR = XR[:(XR.shape[0] / 5), :, :]
        V_Y = Y[:(Y.shape[0] / 5), :]
        # V_labels = labels[:(Y.shape[0] / 5)]

        XL = XL[(XL.shape[0] / 5):, :, :]
        XR = XR[(XR.shape[0] / 5):, :, :]
        Y = Y[(Y.shape[0] / 5):, :]
    else:
        V_XL, V_XR, V_Y = validation_data

    # train the network
    print 'Training network...'
    earlystopping = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='auto')
    checkpoint = ModelCheckpoint(model_destination + 'model.h5', monitor='val_acc', save_best_only=True)
    training_history = model.fit([XL, XR], Y,
                                 nb_epoch=epochs,
                                 validation_split=0,
                                 class_weight=class_weight,
                                 batch_size=batch_size,
                                 validation_data=([V_XL, V_XR], V_Y),
                                 callbacks=[checkpoint, earlystopping])

    test = model.predict_classes([V_XL, V_XR])
    Network.class_confusion(test, V_Y, 2)

    return model, training_history.history
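# Illustration only (hypothetical counts): with weight_classes=True, each
# class is weighted by the other class's relative frequency, so the rarer
# class contributes more to the loss. For 1000 examples of which 100 are
# positive, class 1 is weighted 0.9 and class 0 is weighted 0.1.
def _example_class_weights(n_pos, n_total):
    neg_weight = 1.0 * n_pos / n_total
    return {1: 1 - neg_weight, 0: neg_weight}

# _example_class_weights(100, 1000) -> {1: 0.9, 0: 0.1}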
def get_input(self, notes, shuffle=True, neg_ratio=3):
    word_vectors = None
    attribute_vectors = None
    labels = []

    for note in notes:
        print "processing file ", note.annotated_note_path

        if hasattr(note, 'event_ids'):
            event_ids = note.event_ids
        else:
            id_chunk_map, event_ids, timex_ids, sentence_chunks = note.get_id_chunk_map()

        # every event tag corresponds to a list of words, pick the first word
        event_wordIDs = [note.id_to_wordIDs[x][0] for x in event_ids]

        max_id = len(note.id_to_tok)  # word IDs start at 1
        all_wordIDs = set(['w' + str(x) for x in range(1, max_id + 1)])
        nonevent_wordIDs = all_wordIDs - set(event_wordIDs)

        # negative sampling: keep at most neg_ratio non-event words per event word
        n_neg_samples = min(len(nonevent_wordIDs), neg_ratio * len(event_wordIDs))
        nonevent_wordIDs = list(nonevent_wordIDs)[0:n_neg_samples]

        training_wordIDs = event_wordIDs + nonevent_wordIDs

        for wordID in training_wordIDs:
            word_index = int(wordID[1:])  # wordID example: 'w31'
            left_edge = max(1, word_index - 4)
            right_edge = min(max_id, word_index + 4)
            context_tokens = [note.id_to_tok['w' + str(x)]
                              for x in range(left_edge, right_edge + 1)]
            context_words = [x['token'] for x in context_tokens]

            vecs = self._extract_word_representations(context_words)
            if word_vectors is None:
                word_vectors = vecs
            else:
                word_vectors = Network._pad_and_concatenate(word_vectors, vecs, axis=0)

            tok = note.id_to_tok[wordID]
            attributes = np.array([tok.get('is_main_verb', False),
                                   tok.get('is_predicate', False),
                                   tok['pos'] == 'V',
                                   tok['pos'] == 'N'])
            attributes = attributes[np.newaxis, :]
            if attribute_vectors is None:
                attribute_vectors = attributes
            else:
                attribute_vectors = np.concatenate((attribute_vectors, attributes), axis=0)

            if wordID in event_wordIDs:
                labels.append(1)
            else:
                labels.append(0)

    if shuffle:
        # shuffle all three arrays with the same RNG state so they stay aligned
        rng_state = np.random.get_state()
        np.random.shuffle(word_vectors)
        np.random.set_state(rng_state)
        np.random.shuffle(attribute_vectors)
        np.random.set_state(rng_state)
        np.random.shuffle(labels)

    return word_vectors, attribute_vectors, labels
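# Illustration only: restoring the RNG state before each shuffle applies the
# same permutation to every sequence of equal length, which is how the code
# above keeps features and labels aligned after shuffling.
def _example_aligned_shuffle(features, labels):
    import numpy as np
    state = np.random.get_state()
    np.random.shuffle(features)
    np.random.set_state(state)
    np.random.shuffle(labels)
    return features, labels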