def train(notes, train_timex=True, train_event=True, train_rel=True):
    """Train the TIMEX, EVENT, event-class and TLINK models over a set of notes.

    Args:
        notes: iterable of annotated note objects. Each note must expose
            note_path, get_timex_labels(), get_event_labels(),
            get_event_class_labels(), get_tlink_labels().
        train_timex: if truthy, train the BIO timex tagger.
        train_event: if truthy, train the EVENT/O tagger and the
            event-class tagger.
        train_rel: if truthy, train the temporal-relation (TLINK) classifier.

    Returns:
        (models, vectorizers): two dicts keyed "TIMEX", "EVENT",
        "EVENT_CLASS", "TLINK". Entries for stages that were not trained
        remain None.
    """
    # TODO: need to do some filtering of tokens
    # TODO: experiment with the feature of the 4 left and right taggings. do we
    # only utilize taggings for each pass or do we incorporate taggings in
    # different passes?

    timexLabels = []        # flattened BIO labelings for tokens in text.
    timexFeatures = []
    eventLabels = []        # flattened EVENT-or-O labelings for tokens in text.
    eventFeatures = []
    eventClassLabels = []   # flattened event-class labelings for tokens in text.
    eventClassFeatures = []
    tlinkLabels = []        # temporal relation labelings for entity pairs.
    tlinkFeatures = []

    timexClassifier, timexVectorizer = None, None
    eventClassifier, eventVectorizer = None, None
    eventClassClassifier, eventClassVectorizer = None, None
    tlinkClassifier, tlinkVectorizer = None, None

    for i, note in enumerate(notes):
        # single-argument print() form works identically under Python 2 and 3.
        print("note: {}".format(i))
        print("note path: {}".format(note.note_path))

        # Hoist the gold timex labeling: the original recomputed
        # note.get_timex_labels() up to three times per note.
        timexGold = note.get_timex_labels() if (train_timex or train_event) else None

        if train_timex:
            # extract features to perform BIO labeling for timexs
            for label in timexGold:
                timexLabels += label
            timexFeatures += features.extract_timex_feature_set(note, timexGold)

        if train_event:
            # extract features to perform EVENT or O labeling.
            eventGold = note.get_event_labels()
            for label in eventGold:
                eventLabels += label
            eventFeatures += features.extract_event_feature_set(note, eventGold,
                                                                timexLabels=timexGold)

            # extract features to perform event class labeling.
            eventClassGold = note.get_event_class_labels()
            for label in eventClassGold:
                eventClassLabels += label
            eventClassFeatures += features.extract_event_class_feature_set(note, eventClassGold,
                                                                           eventGold,
                                                                           timexLabels=timexGold)

        if train_rel:
            # extract features to classify relations between temporal entities.
            tlinkLabels += note.get_tlink_labels()
            tlinkFeatures += features.extract_tlink_features(note)

    # TODO: when predicting, if gold standard is provided evaluate F-measure
    # for each of the steps

    if train_timex:
        print("\tTRAINING TIMEX")
        # train model to perform BIO labeling for timexs
        timexClassifier, timexVectorizer = _trainTimex(timexFeatures, timexLabels, grid=True)

    if train_event:
        # train model to label as EVENT or O
        # TODO: filter non-timex only?
        eventClassifier, eventVectorizer = _trainEvent(eventFeatures, eventLabels, grid=True)

        # train model to label as a class of EVENT
        # TODO: filter event only?
        # TODO: should we be training over all tokens or those that are just EVENTs?
        eventClassClassifier, eventClassVectorizer = _trainEventClass(eventClassFeatures,
                                                                      eventClassLabels,
                                                                      grid=True)

    if train_rel:
        # train model to classify relations between temporal entities.
        # TODO: add features back in.
        tlinkClassifier, tlinkVectorizer = _trainTlink(tlinkFeatures, tlinkLabels, grid=True)

    # will be accessed later for dumping
    models = {"TIMEX": timexClassifier,
              "EVENT": eventClassifier,
              "EVENT_CLASS": eventClassClassifier,
              "TLINK": tlinkClassifier}
    vectorizers = {"TIMEX": timexVectorizer,
                   "EVENT": eventVectorizer,
                   "EVENT_CLASS": eventClassVectorizer,
                   "TLINK": tlinkVectorizer}

    return models, vectorizers
def predict(note, predict_timex=True, predict_event=True, predict_rel=True):
    """Predict TIMEX spans, EVENTs, event classes and TLINKs for one note.

    Requires the module-level models/vectorizers to have been loaded
    (``_models_loaded``); exits the process otherwise.

    Args:
        note: tokenized note object exposing get_tokenized_text(),
            set_tlinked_entities(), set_iob_labels(),
            get_token_char_offsets().
        predict_timex: if truthy, run the BIO timex tagger.
        predict_event: if truthy, run the EVENT/O and event-class taggers.
        predict_rel: TLINKs are only predicted when all three flags are
            truthy (relation features need both entity passes).

    Returns:
        (entity_labels, original_offsets, tlink_labels, tokens), where
        entity_labels is the flattened per-token IOB labeling.
    """
    # TODO: try and correct the flattening on the lists. might just end up being redundent?
    # TODO: refactor this code. a lot of it is redundant.
    # TODO: need to do some filtering of tokens
    # TODO: experiment with the feature of the 4 left and right taggings. do we
    # only utilize taggings for each pass or do we incorporate taggings in
    # different passes?
    global _models
    global _vects
    global _models_loaded

    if not _models_loaded:
        sys.exit("Models not loaded. Cannot predict")

    # get tokenized text
    tokenized_text = note.get_tokenized_text()

    # will be new iob_labels
    iob_labels = []
    timexLabels = []
    eventLabels = []
    eventClassLabels = []
    tlink_labels = []

    # One empty sub-list per line of text; tokens are gathered into a flat
    # list ordered by appearance within the document.
    tokens = []
    for line in tokenized_text:
        timexLabels.append([])
        eventLabels.append([])
        eventClassLabels.append([])
        iob_labels.append([])
        tokens += tokenized_text[line]

    # entity ids start at 2 — presumably id 1 is reserved elsewhere; TODO confirm.
    timex_count = 2

    if predict_timex:
        timexClassifier = _models["TIMEX"]
        timexVectorizer = _vects["TIMEX"]

        # get the timex feature set for the tokens within the note.
        timexFeatures = features.extract_timex_feature_set(note, timexLabels, predict=True)

        # sanity check
        assert len(tokens) == len(timexFeatures)

        # predict over the tokens and the features extracted.
        for t, f in zip(tokens, timexFeatures):
            features.update_features(t, f, timexLabels)
            X = timexVectorizer.transform([f])
            Y = list(timexClassifier.predict(X))
            line_idx = t["sentence_num"] - 1  # sentence_num is 1-based
            timexLabels[line_idx].append({'entity_label': Y[0],
                                          'entity_type': None if Y[0] == 'O' else 'TIMEX3',
                                          'entity_id': "t" + str(timex_count)})
            timex_count += 1
            iob_labels[line_idx].append(timexLabels[line_idx][-1])

    event_count = 2
    event_class_count = 2

    if predict_event:
        eventClassifier = _models["EVENT"]
        eventVectorizer = _vects["EVENT"]
        eventClassClassifier = _models["EVENT_CLASS"]
        eventClassVectorizer = _vects["EVENT_CLASS"]

        # get the event feature set for the tokens within the note.
        # don't get iob labels yet, they are inaccurate. need to predict first.
        eventFeatures = features.extract_event_feature_set(note, eventLabels, predict=True,
                                                           timexLabels=timexLabels)

        # sanity check
        assert len(tokens) == len(eventFeatures)

        # TODO: need to do some filter. if something is already labeled then just skip over it.

        # predict over the tokens and the features extracted.
        for t, f in zip(tokens, eventFeatures):
            features.update_features(t, f, eventLabels)
            X = eventVectorizer.transform([f])
            Y = list(eventClassifier.predict(X))
            eventLabels[t["sentence_num"] - 1].append({'entity_label': Y[0],
                                                       'entity_type': None if Y[0] == 'O' else 'EVENT',
                                                       'entity_id': "e" + str(event_count)})
            event_count += 1

        # get the event-class feature set for the tokens within the note.
        eventClassFeatures = features.extract_event_class_feature_set(note, eventClassLabels,
                                                                      eventLabels, predict=True,
                                                                      timexLabels=timexLabels)

        # sanity check
        assert len(tokens) == len(eventClassFeatures)

        # predict over the tokens and the features extracted.
        for t, f in zip(tokens, eventClassFeatures):
            # updates labels
            features.update_features(t, f, eventClassLabels)
            X = eventClassVectorizer.transform([f])
            Y = list(eventClassClassifier.predict(X))
            line_idx = t["sentence_num"] - 1
            eventClassLabels[line_idx].append({'entity_label': Y[0],
                                               'entity_type': None if Y[0] == 'O' else 'EVENT',
                                               'entity_id': 'e' + str(event_class_count)})
            event_class_count += 1
            # NOTE(review): this lookup assumes the timex pass already filled
            # iob_labels for every token; with predict_timex=False it would
            # raise IndexError — confirm flag combinations used by callers.
            if iob_labels[line_idx][t["token_offset"]]["entity_type"] is None:
                iob_labels[line_idx][t["token_offset"]] = eventClassLabels[line_idx][-1]

    # Debug output: report whether NO entities were predicted (i.e. every
    # label is 'O'). The original comparison was inverted — it answered
    # "are ALL labels non-O?" while asking about zero predictions.
    _totLabels = [l for line in eventClassLabels for l in line]
    print("predicted ZERO events? : {}".format(
        all(l["entity_label"] == 'O' for l in _totLabels)))

    _totLabels = [l for line in timexLabels for l in line]
    print("predicted ZERO timex? : {}".format(
        all(l["entity_label"] == 'O' for l in _totLabels)))

    # TLINKs need both entity passes, so all three flags must be set.
    if predict_timex and predict_event and predict_rel:
        tlinkVectorizer = _vects["TLINK"]
        tlinkClassifier = _models["TLINK"]

        note.set_tlinked_entities(timexLabels, eventClassLabels)
        note.set_iob_labels(iob_labels)

        print("PREDICT: getting tlink features")
        f = features.extract_tlink_features(note)
        X = tlinkVectorizer.transform(f)
        tlink_labels = list(tlinkClassifier.predict(X))

    entity_labels = [label for line in iob_labels for label in line]
    original_offsets = note.get_token_char_offsets()

    return entity_labels, original_offsets, tlink_labels, tokens