def fit(self, xs, ys):
    attribute_names = list(set(flatten(xs)))
    # Compute pairs of attributes: (attribute_name, is_child_before)
    self.attributes = [(a, True) for a in attribute_names] + [(a, False) for a in attribute_names]
    self.attributes.sort()
    DecisionTreeBase.fit(self, xs, ys)

def test(epochs=1):
    results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs,
                        validation_split=0.0, show_accuracy=True, verbose=1)
    probs = flatten(model.predict_proba(X_test, batch_size=batch_size))
    y_pred = [1 if p >= 0.5 else 0 for p in probs]
    r, p, f1 = rpf1(y_test, y_pred)
    print("recall", r, "precision", p, "f1", f1)
    return f1

def get_wordlevel_ys_by_code(lst_tag_sets, expected_tags):
    """ Convert a list of tag sets into a dictionary of binary label vectors, one per tag.

    Parameters
    ----------
    lst_tag_sets : list of sets of tags
        The labels for each word.
    expected_tags : collection of tags
        Tags that must appear in the output, even if they never occur in lst_tag_sets.

    Returns
    -------
    A dictionary mapping each expected tag to a binary numpy array of labels.
    """
    unique_tags = set(flatten(lst_tag_sets))
    tmp_ys_bycode = defaultdict(list)
    for tag_set in lst_tag_sets:
        for y in unique_tags:
            tmp_ys_bycode[y].append(1 if y in tag_set else 0)

    num_rows = len(list(tmp_ys_bycode.values())[0])

    # Convert to a dict of numpy arrays
    ys_bycode = dict()
    for tag in expected_tags:
        if tag in tmp_ys_bycode and len(tmp_ys_bycode[tag]) > 0:
            lst = tmp_ys_bycode[tag]
            ys_bycode[tag] = np.asarray(lst, dtype=np.int).reshape((len(lst),))
        else:
            ys_bycode[tag] = np.zeros(shape=(num_rows,), dtype=np.int)
    return ys_bycode

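# A minimal usage sketch of get_wordlevel_ys_by_code. The tag names below are made up for
# illustration, and it assumes the numpy / defaultdict imports used by the function above
# (note np.int is only valid on the older NumPy versions this code was written against):
example_tag_sets = [{"1"}, set(), {"1", "2"}]   # three words, two observed codes
example_ys = get_wordlevel_ys_by_code(example_tag_sets, expected_tags=["1", "2", "3"])
# example_ys["1"] -> array([1, 0, 1])
# example_ys["2"] -> array([0, 0, 1])
# example_ys["3"] -> array([0, 0, 0])   ("3" never occurs, so it is padded with zeros)
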
def test(epochs=1):
    model.fit(X_train, y_train, n_epochs=epochs, batch_size=64)  # 64 seems good for now
    predictions = flatten(model.predict(X_test))
    r, p, f1, cutoff = find_cutoff(y_test, predictions)
    print("recall", rnd(r), "precision", rnd(p), "f1", rnd(f1), "cutoff", rnd(cutoff))
    return f1

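# find_cutoff is not defined in these snippets. A plausible sketch, assuming it sweeps decision
# thresholds over the predicted probabilities and returns recall, precision, F1 and the best
# cutoff (sklearn is used here purely for illustration, not taken from the original code):
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def find_cutoff(y_true, probs, step=0.01):
    best_r, best_p, best_f1, best_cutoff = 0.0, 0.0, -1.0, 0.5
    for cutoff in np.arange(step, 1.0, step):
        y_pred = [1 if p >= cutoff else 0 for p in probs]
        p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
        if f1 > best_f1:
            best_r, best_p, best_f1, best_cutoff = r, p, f1, cutoff
    return best_r, best_p, best_f1, best_cutoff
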
def tally_code_frequencies(tagged_essays):
    freq = defaultdict(int)
    all_codes = set()
    for essay in tagged_essays:
        for i, sentence in enumerate(essay.sentences):
            words, tags = zip(*sentence)
            utags = set(flatten(tags))
            all_codes.update(utags)
            for t in utags:
                freq[t] += 1
    return freq

def test(epochs=1):
    results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs,
                        validation_split=0.0, show_accuracy=True, verbose=1)
    classes = flatten(model.predict_classes(X_test, batch_size=batch_size))
    r, p, f1 = rpf1(y_test, classes)
    print("recall", r, "precision", p, "f1", f1)
    return f1

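# rpf1 is also not shown in these snippets. Assuming it returns recall, precision and F1 for the
# positive class, a minimal stand-in could be:
from sklearn.metrics import precision_recall_fscore_support

def rpf1(y_true, y_pred):
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    return r, p, f1
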
def __train__(self, sentences):
    # use a set for fast membership tests when filtering candidate synonyms
    unique_words = set(flatten(sentences))
    syn_map = {}
    mapped = set()
    for sentence in sentences:
        tags = nltk.pos_tag(sentence)
        for wd, tag in tags:
            pair = (wd, tag)
            if pair in mapped:
                continue
            synonyms = [(s, tag) for s in self.get_synonyms_for_word(wd, tag) if s in unique_words]
            if len(synonyms) >= 1:
                matches = []
                for spair in synonyms:
                    if spair in syn_map:
                        matches.append(syn_map[spair])
                if len(matches) == 0:
                    synset = set(synonyms)
                    synset.add(pair)
                    for p in synset:
                        syn_map[p] = synset
                elif len(matches) == 1:
                    matches[0].add(pair)
                    syn_map[pair] = matches[0]
                else:
                    # merge the existing synonym sets
                    new_synset = set()
                    for m in matches:
                        new_synset.update(m)
                    # update the mapping to point at the new, larger set
                    for s in new_synset:
                        syn_map[s] = new_synset
            else:
                # no in-vocabulary synonyms - the word maps only to itself
                syn_map[pair] = set([pair])
            mapped.add(pair)

    self.synonym_map = {}
    processed = set()
    for values in syn_map.values():
        vid = id(values)
        if vid in processed:
            continue
        processed.add(vid)
        key = list(values)[0]
        for v in values:
            if v in self.synonym_map:
                raise Exception("Duplicate key %s" % str(v))
            self.synonym_map[v] = key

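# self.get_synonyms_for_word is not shown in these snippets. A hypothetical stand-in using
# NLTK's WordNet interface; the Penn-tag-to-WordNet mapping is an assumption for illustration,
# not the project's actual implementation:
from nltk.corpus import wordnet as wn

_PENN_TO_WORDNET = {"N": wn.NOUN, "V": wn.VERB, "J": wn.ADJ, "R": wn.ADV}

def get_synonyms_for_word(word, penn_tag):
    wn_pos = _PENN_TO_WORDNET.get(penn_tag[:1])
    if wn_pos is None:
        return set()
    lemmas = set()
    for synset in wn.synsets(word, pos=wn_pos):
        for lemma in synset.lemmas():
            lemmas.add(lemma.name().lower().replace("_", " "))
    lemmas.discard(word.lower())
    return lemmas
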
def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):
    feat_extractors = existing_extractors + [new_extractor]
    feat_config = dict(list(config.items()) + [("extractors", feat_extractors)])

    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    #mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    #essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)

    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(set((t for t in flatten(lst_all_tags) if t[0].isdigit())))

    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags

    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects. The sentences are lists
        # of featureextractortransformer.Word objects
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)

        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
        return td_X.shape, vd_X.shape

    #results = Parallel(n_jobs=CV_FOLDS)(
    #    delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #    for (essays_TD, essays_VD) in folds)

    td_col_sizes, vd_col_sizes = [], []
    for (essays_TD, essays_VD) in folds:
        td_x_shape, vd_x_shape = train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
        td_col_sizes.append(td_x_shape[1])
        vd_col_sizes.append(vd_x_shape[1])
    return np.mean(td_col_sizes), np.mean(vd_col_sizes)

def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags):
    wd_train_tags = set(wd_train_tags)
    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    print "\nFold %s" % fold
    print "Training Tagging Model"
    _, lst_every_tag = flatten_to_wordlevel_feat_tags(essay_feats)
    tag_freq = Counter(flatten(lst_every_tag))

    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)

    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
    #TODO: compute most common tags per word for training only (but not for evaluation)
    wd_td_ys = get_wordlevel_mostfrequent_ys(td_tags, wd_train_tags, tag_freq)

    # Get Actual Ys by code (dict of label to predictions)
    wd_td_ys_by_code = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_by_code = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)
    #TODO: get most common tags for each word, predict from that using multi class method

    """ TRAIN Tagger """
    model = fn_create_wd_cls()
    model.fit(td_X, wd_td_ys)

    wd_td_pred = model.predict(td_X)
    wd_vd_pred = model.predict(vd_X)

    """ TEST Tagger """
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_td_pred, wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_vd_pred, wd_test_tags)
    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_by_code, wd_vd_ys_by_code

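# get_by_code_from_powerset_predictions is not defined in these snippets. A hedged sketch of the
# assumed behaviour: expand each predicted label (a single tag, or a collection of tags when the
# classifier predicts over the power set of tag combinations) into one binary indicator list per tag.
from collections import defaultdict

def get_by_code_from_powerset_predictions(predictions, expected_tags):
    by_code = defaultdict(list)
    for label in predictions:
        tags = set(label) if isinstance(label, (set, frozenset, list, tuple)) else {label}
        for tag in expected_tags:
            by_code[tag].append(1 if tag in tags else 0)
    return by_code
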
def max_probs_to_words(vector):
    ixs = np.argmax(vector, axis=1)
    return ids_to_words(flatten(ixs))

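# flatten is imported from IterableFP throughout these snippets (see the import in the Passage
# example below). A minimal stand-in with the assumed behaviour of flattening arbitrarily nested
# iterables into a single list (strings treated as atoms):
def flatten(items):
    out = []
    for item in items:
        if hasattr(item, "__iter__") and not isinstance(item, str):
            out.extend(flatten(item))
        else:
            out.append(item)
    return out
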
feat_config = dict(config.items() + [("extractors", extractors)])

""" LOAD DATA """
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")

# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """
_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
regular_tags = list(set((t for t in flatten(lst_all_tags) if t[0].isdigit())))

""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags
wd_test_tags = regular_tags

""" CLASSIFIERS """
""" Log Reg + Log Reg is best!!! """
fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
#fn_create_wd_cls = lambda : LinearSVC(C=1.0)
#fn_create_wd_cls = lambda : RandomForestClassifier(n_jobs=8, max_depth=100)
#fn_create_wd_cls = lambda : GradientBoostingClassifier()
wd_algo = str(fn_create_wd_cls())
print "Classifier:", wd_algo

""" LOAD DATA """ mem_process_essays = memoize_to_disk( filename_prefix=processed_essay_filename_prefix)(load_process_essays) tagged_essays = mem_process_essays(**config) logger.info("Essays loaded") # most params below exist ONLY for the purposes of the hashing to and from disk mem_extract_features = memoize_to_disk( filename_prefix=features_filename_prefix)(extract_features) essay_feats = mem_extract_features(tagged_essays, **feat_config) logger.info("Features loaded") """ DEFINE TAGS """ _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats) #all_regular_tags = list((t for t in flatten(lst_all_tags) if t[0].isdigit())) all_regular_tags = list( set((t for t in flatten(lst_all_tags) if t.lower().strip() == "anaphor"))) tag_freq = Counter(all_regular_tags) regular_tags = list(tag_freq.keys()) """ works best with all the pair-wise causal relation codes """ wd_train_tags = regular_tags wd_test_tags = regular_tags """ CLASSIFIERS """ """ Log Reg + Log Reg is best!!! """ fn_create_wd_cls = lambda: LogisticRegression( ) # C=1, dual = False seems optimal wd_algo = str(fn_create_wd_cls()) print("Classifier:", wd_algo) folds = cross_validation(essay_feats, CV_FOLDS)
if __name__ == "__main__": "1,2,3 => 1" "else => 0" simple_dataset = [ ([1, 2, 3, 7, 8], 1), ([1, 2, 3, 4, 5], 1), ([7, 8, 1, 2, 3, 4, 5], 1), ([1, 2], 0), ([2, 3], 0), ([1, 3], 0), ([5, 6, 7], 0), ([4, 1, 2], 0), ([3, 7, 9], 0), ([1, 2, 4, 5, 6, 7, 8], 0), ] xs, ys = zip(*simple_dataset) attributes = list(set(flatten(xs))) dt = DecisionTree(attributes) dt.fit(xs, ys) predictions = dt.predict(xs) acc = accuracy(ys, predictions, class_value=1) print "\nAccuracy: " + str(acc) print "" print str(dt.tree) pass
def __init__(self, full_path, include_vague=True, include_normal=True, load_annotations=True, essay_text=None): self.include_normal = include_normal self.include_vague = include_vague if essay_text is None: self.full_path = full_path self.file_name = full_path.split("/")[-1] txt_file = full_path[:-4] + ".txt" with open(txt_file, "r+") as f: self.txt = f.read() else: if load_annotations: raise Exception( "Can't load annotations when pasing in essay as text string" ) self.full_path = "None" self.file_name = "None" self.txt = essay_text self.tagged_words = [] #list of list of tuples (words and tags) self.tagged_sentences = [] # list of sets of tags self.sentence_tags = [] self.id2annotation = {} self.split_sents = [] self.aborted_splits = [] if load_annotations: assert full_path.endswith(".ann") assert os.path.exists( txt_file ), "Missing associated text file for %s" % self.full_path with open(full_path, "r+") as f: lines = f.readlines() else: lines = [] codes_start = defaultdict(set) codes_end = defaultdict(set) def get_code(annotation): if ":" not in annotation.code: return annotation.code typ, id = annotation.code.split(":") """ strip off the trailing digit (e.g. Causer1:50) """ if typ[-1].isdigit(): typ = typ[:-1] return typ + ":" + id def process_causal_relations(causer, result): start = min(causer.start, result.start) end = max(causer.end, result.end) if start == end: return False cr_code = get_code(causer) + "->" + get_code(result) codes_start[start].add(cr_code) codes_end[end].add(cr_code) return True def process_text_annotation(annotation): if annotation.start == annotation.end: return False codes_end[annotation.end].add(get_code(annotation)) codes_start[annotation.start].add(get_code(annotation)) if hasattr(annotation, "dep_type"): codes_start[annotation.start].add(annotation.dep_type) codes_end[annotation.end].add(annotation.dep_type) return True annotations_with_dependencies = [] text_annotations = [] vague_ids = set() normal_ids = set() for line in lines: if len(line.strip()) < 2: continue if not line[1].isdigit(): print("Skipping annotation line: %s \n\tin essay %s" % (line.strip(), self.file_name)) continue first_char = line[0] if first_char == "T": if is_compound(line): annotation = CompoundTextAnnotation(line, self.txt) text_annotations.append(annotation.first_part) text_annotations.append(annotation.second_part) if annotation.third_part: text_annotations.append(annotation.third_part) """ DEBUGGING print "" print line.strip() print annotation.txt print "First: ", self.txt[annotation.first_part.start:annotation.first_part.end] print "Second: ", self.txt[annotation.second_part.start:annotation.second_part.end] print annotation.first_part.start, annotation.first_part.end, " ", print annotation.second_part.start, annotation.second_part.end """ else: try: annotation = TextAnnotation(line, self.txt) #Bad annotation, ignore if annotation.start == annotation.end: continue else: text_annotations.append(annotation) except Exception as e: import traceback print(traceback.format_exc()) raise e elif first_char == "A": annotation = AttributeAnnotation(line) if annotation.attribute == "Vague": vague_ids.add(annotation.target_id) if annotation.attribute == "Normal": normal_ids.add(annotation.target_id) for id in annotation.child_annotation_ids: annotation.child_annotations.append(self.id2annotation[id]) elif first_char == "R": annotation = RelationshipAnnotation(line) elif first_char == "E": annotation = EventAnnotation(line, self.id2annotation) 
annotations_with_dependencies.append(annotation) elif first_char == "#": annotation = NoteAnnotation(line) for id in annotation.child_annotation_ids: annotation.child_annotations.append(self.id2annotation[id]) else: raise Exception("Unknown annotation type") self.id2annotation[annotation.id] = annotation #end process lines for annotation in text_annotations: if not include_vague and annotation.id in vague_ids: continue if not include_normal and annotation.id in normal_ids: continue process_text_annotation(annotation) for annotation in annotations_with_dependencies: deps = annotation.dependencies() # group items grp_causer = dict() grp_result = dict() for dependency in deps: process_text_annotation(dependency) code = dependency.code splt = code.split(":") typ = splt[0] grp_key = 0 if typ[-1].isdigit(): grp_key = typ[-1] if code.startswith("Cause"): grp_causer[grp_key] = dependency elif code.startswith("Result"): grp_result[grp_key] = dependency else: pass if len(grp_causer) > 0 and len(grp_result) > 0: if len(grp_causer) == 1 and len(grp_result) == 1: causer = list(grp_causer.values())[0] result = list(grp_result.values())[0] process_causal_relations(causer, result) elif len(grp_causer) == len(grp_result): for key in grp_causer.keys(): causer = grp_causer[key] result = grp_result[key] process_causal_relations(causer, result) elif len(grp_causer) == 1: causer = list(grp_causer.values())[0] for key, result in grp_result.items(): process_causal_relations(causer, result) elif len(grp_result) == 1: result = list(grp_result.values())[0] for key, causer in grp_causer.items(): process_causal_relations(causer, result) else: raise Exception("Unbalanced CR codes") codes = set() current_word = "" current_sentence = [] def add_pair(current_word, current_sentence, codes, ch, ix): if current_word.strip() != "": pair = (current_word, codes) current_sentence.append(pair) self.tagged_words.append(pair) if ch.strip() != "" and ch != "/": if ix in codes_start: pair2 = (ch, codes_start[ix]) else: pair2 = (ch, set()) current_sentence.append(pair2) self.tagged_words.append(pair2) def onlyascii(s): out = "" for char in s: if ord(char) > 127: out += "" else: out += char return out def first_alnum(s): for c in s: if c.isalnum(): return c return "" def add_sentence(sentence, str_sent): sents = filter(lambda s: len(s) > 1 and s != '//', sent_tokenize(onlyascii(str_sent.strip()))) sents = list( map( lambda s: s.replace("/", " ").replace("-", " - ").replace( ")", " ) ").replace(" ", " ").strip(), sents)) # the code below handles cases where the sentences are not properly split and we get multiple sentences here if len(sents) > 1: # Only valid splits start with a initial capital letter new_sents = [] for s in sents: if len(new_sents) > 0 and (s.strip() == "\"" or s.strip()[0] in ("(", ")") or s.strip()[0].islower() or first_alnum(s).islower()): self.aborted_splits.append(new_sents[-1]) self.aborted_splits.append(s) new_sents[-1] += " " + s else: new_sents.append(s) sents = new_sents if len(sents) > 1: # filter to # of full sentences, and we should get at least this many out expected_min_sents = len( [s for s in sents if len(s.strip().split(" ")) > 1]) unique_wds = set( map(lambda s: s.lower(), list(zip(*sentence))[0])) processed = [] partitions = [] for i, sent in enumerate(sents): # last but one only if i < (len(sents) - 1): last = sent.split(" ")[-1] if last.lower() == "temp.": expected_min_sents -= 1 continue if last[-1] in {".", "?", "?", "\n"}: last = last[-1] elif last.lower() not in unique_wds: last = last[-1] 
assert last.lower() in unique_wds tokens = sents[i + 1].split() first = tokens[0] if first == "//" and len(tokens) > 1: first = tokens[1] if first.lower() not in unique_wds: if not first[-1].isalnum(): first = first[:-1] if not first[0].isalnum(): first = first[0] assert first.lower( ) in unique_wds, "first.lower():%s not in unique_wds" % first.lower( ) partitions.append((last, first)) if len(partitions) == 0: # handle the temp. error from above (one sentence where there is a temp. Increase) self.tagged_sentences.append(sentence) return current = [] for j in range(0, len(sentence) - 1): wd, tag = sentence[j] current.append((wd, tag)) if len(partitions) > 0: last, first = partitions[0] if last == wd: nextWd, nextTg = sentence[j + 1] if first.startswith(nextWd): self.tagged_sentences.append(current) processed.append(list(zip(*current))[0]) current = [] partitions = partitions[1:] current.append(sentence[-1]) self.tagged_sentences.append(current) processed.append(list(zip(*current))[0]) assert len(processed) >= max(2, expected_min_sents) self.split_sents.append(processed) else: self.tagged_sentences.append(sentence) str_sent = "" for ix, ch in enumerate(self.txt): if ch.isalnum() or ch == "'": current_word += ch else: add_pair(current_word, current_sentence, codes.copy(), ch, ix) str_sent += current_word + ch # don't always split on periods, as not all periods terminate sentences (e.g. acronyms) if len(current_sentence) > 0 and \ ((ch in {"\n", "!", "?"}) or (ch == "/" and ix > 0 and self.txt[ix-1] in {"\n", ".", "!", "?"}) ): add_sentence(current_sentence, str_sent) current_sentence = [] str_sent = "" current_word = "" if ix in codes_start: codes.update(codes_start[ix]) if ix in codes_end: codes.difference_update(codes_end[ix]) # add any remaining add_pair(current_word, current_sentence, codes.copy(), "", ix) if len(current_sentence) > 0: self.tagged_sentences.append(current_sentence) for sent in self.tagged_sentences: tags = list(zip(*sent))[1] self.sentence_tags.append(set(flatten(tags)))
# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """
gw_codes = GWConceptCodes()
tag_freq = get_tag_freq(tagged_essays)
freq_tags = list(set((tag for tag, freq in tag_freq.items()
                      if freq >= MIN_TAG_FREQ and gw_codes.is_valid_code(tag))))
non_causal = [t for t in freq_tags if "->" not in t]
only_causal = [t for t in freq_tags if "->" in t]

_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
regular_tags = list(set((t for t in flatten(lst_all_tags)
                         if "->" not in t and ":" not in t and gw_codes.is_valid_code(t))))

CAUSE_TAGS = ["Causer", "Result", "explicit"]
CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]  # + ["explicit"]

""" """
#
wd_train_tags = list(set(regular_tags + only_causal + CAUSE_TAGS + CAUSAL_REL_TAGS))
wd_test_tags = wd_train_tags  # tags to evaluate against

""" CLASSIFIERS """
""" Log Reg + Log Reg is best!!! """
f_output_file = open(out_predictions_file, "w+")

    spelling_correct=SPELLING_CORRECT, replace_nums=REPLACE_NUMS, stem=STEM,
    remove_stop_words=REMOVE_STOP_WORDS, remove_punctuation=REMOVE_PUNCTUATION,
    lower_case=LOWER_CASE, include_vague=INCLUDE_VAGUE, include_normal=INCLUDE_NORMAL)

# FEATURE SETTINGS
WINDOW_SIZE = 7
CV_FOLDS = 5
# END FEATURE SETTINGS
offset = (WINDOW_SIZE - 1) / 2

# don't memoize as it's massive and also fast
word_projector_transformer = WordProjectorTransformer(offset)
essay_feats = word_projector_transformer.transform(tagged_essays)

_, lst_all_tags = flatten_to_wordlevel_vectors_tags(essay_feats)
all_tags = set(flatten(lst_all_tags))

# use more tags for training for sentence level classifier
regular_tags = [t for t in all_tags if t[0].isdigit()]
cause_tags = ["Causer", "Result", "explicit"]
causal_rel_tags = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]  # + ["explicit"]

wd_train_tags = regular_tags + cause_tags
wd_test_tags = regular_tags

folds = cross_validation(essay_feats, CV_FOLDS)
lst_td_wt_mean_prfa, lst_vd_wt_mean_prfa, lst_td_mean_prfa, lst_vd_mean_prfa = [], [], [], []
td_all_metricsByTag = defaultdict(list)
vd_all_metricsByTag = defaultdict(list)

feat_config = dict(list(config.items()) + [("extractors", extractors)])

""" LOAD DATA """
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")

# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """
_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
#all_regular_tags = list((t for t in flatten(lst_all_tags) if t[0].isdigit()))
all_regular_tags = list(set((t for t in flatten(lst_all_tags) if t.lower().strip() == "anaphor")))

tag_freq = Counter(all_regular_tags)
regular_tags = list(tag_freq.keys())

""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags
wd_test_tags = regular_tags

""" CLASSIFIERS """
""" Log Reg + Log Reg is best!!! """
fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
wd_algo = str(fn_create_wd_cls())
print("Classifier:", wd_algo)

folds = cross_validation(essay_feats, CV_FOLDS)

def __init__(self, full_path, include_vague = True, include_normal = True, load_annotations = True, essay_text = None): self.include_normal = include_normal self.include_vague = include_vague txt_file = full_path[:-4] + ".txt" if essay_text is None: self.full_path = full_path self.file_name = full_path.split("/")[-1] with open(txt_file, "r+") as f: self.txt = f.read() else: if load_annotations: raise Exception("Can't load annotations when passing in essay as text string") self.full_path = "None" self.file_name = "None" self.txt = essay_text self.tagged_words = [] #list of list of tuples (words and tags) self.tagged_sentences = [] # list of sets of tags self.sentence_tags = [] self.id2annotation = {} self.split_sents = [] self.aborted_splits = [] if load_annotations: assert full_path.endswith(".ann") assert os.path.exists(txt_file), "Missing associated text file for %s" % self.full_path with open(full_path, "r+") as f: lines = f.readlines() else: lines = [] codes_start = defaultdict(set) codes_end = defaultdict(set) def get_code(annotation): if ":" not in annotation.code: return annotation.code typ, id = annotation.code.split(":") """ strip off the trailing digit (e.g. Causer1:50) """ if typ[-1].isdigit(): typ = typ[:-1] return typ + ":" + id def process_causal_relations(causer, result, antecedent_mapping): start = min(causer.start, result.start) end = max(causer.end, result.end) if start == end: return False causer_code = get_code(causer) result_code = get_code(result) cr_code = causer_code + "->" + result_code codes_start[start].add(cr_code) codes_end[end].add(cr_code) if ANAPHORA.lower() in causer_code.lower(): #assert causer.id in antecedent_mapping, "No antecedent mapping found for annotation:" + str(causer) reference_ids = antecedent_mapping[causer.id] for ref_id in reference_ids: reference_annotation = self.id2annotation[ref_id] cr_ana_code = "{causer_code}[{reference_code}]->{result_code}".format( causer_code=causer_code, reference_code=reference_annotation.code, result_code=result_code ) codes_start[start].add(cr_ana_code) codes_end[end].add(cr_ana_code) if ANAPHORA.lower() in result_code.lower(): #assert result.id in antecedent_mapping, "No antecedent mapping found for annotation:" + str(result) reference_ids = antecedent_mapping[result.id] for ref_id in reference_ids: reference_annotation = self.id2annotation[ref_id] cr_ana_code = "{causer_code}->{result_code}[{reference_code}]".format( causer_code=causer_code, reference_code=reference_annotation.code, result_code=result_code ) codes_start[start].add(cr_ana_code) codes_end[end].add(cr_ana_code) return True def process_text_annotation_and_add_codes(annotation): if annotation.start == annotation.end: return False codes_end[annotation.end].add(get_code(annotation)) codes_start[annotation.start].add(get_code(annotation)) if hasattr(annotation, "dep_type"): codes_start[annotation.start].add(annotation.dep_type) codes_end[annotation.end].add(annotation.dep_type) return True def process_anaphoric_reference(anaphora_annotation, reference_annotation)->None: """ Adds start and end code for anaphoric references, resolving them with the location of the anphora tag, but the code from the reference annotation :param anaphora_annotation: text annotation with the Anaphor tag :param reference_annotation: text annotation that was a reference for that tag :return: None """ assert anaphora_annotation.code == ANAPHORA, "Code is not anaphora" # prepend Anaphor tag to the code so we can differentiate from the regular codes reference_code = 
"{anaphora}:[{code}]".format(anaphora=ANAPHORA, code=get_code(reference_annotation)) codes_end[anaphora_annotation.end].add(reference_code) codes_start[anaphora_annotation.start].add(reference_code) annotations_with_dependencies_inc_crels = [] text_annotations = [] antecedent_mapping = defaultdict(set) # map of id to a Set[id] for resolving co-references vague_ids = set() normal_ids = set() for line in lines: if len(line.strip()) < 2: continue if not line[1].isdigit(): print("Skipping annotation line: %s \n\tin essay %s" % (line.strip(), self.file_name)) continue first_char = line[0] if first_char == "T": if is_compound(line): annotation = CompoundTextAnnotation(line, self.txt) text_annotations.append(annotation.first_part) text_annotations.append(annotation.second_part) if annotation.third_part: text_annotations.append(annotation.third_part) """ DEBUGGING print "" print line.strip() print annotation.txt print "First: ", self.txt[annotation.first_part.start:annotation.first_part.end] print "Second: ", self.txt[annotation.second_part.start:annotation.second_part.end] print annotation.first_part.start, annotation.first_part.end, " ", print annotation.second_part.start, annotation.second_part.end """ else: try: annotation = TextAnnotation(line, self.txt) #Bad annotation, ignore if annotation.start == annotation.end: continue else: text_annotations.append(annotation) except Exception as e: import traceback print(traceback.format_exc()) raise e elif first_char == "A": annotation = AttributeAnnotation(line) if annotation.attribute == "Vague": vague_ids.add(annotation.target_id) if annotation.attribute == "Normal": normal_ids.add(annotation.target_id) for id in annotation.child_annotation_ids: annotation.child_annotations.append(self.id2annotation[id]) elif first_char == "R": annotation = RelationshipAnnotation(line) antecedent_mapping[annotation.arg1_code].add(annotation.arg2_code) elif first_char == "E": annotation = EventAnnotation(line, self.id2annotation) annotations_with_dependencies_inc_crels.append(annotation) elif first_char == "#": annotation = NoteAnnotation(line) for id in annotation.child_annotation_ids: annotation.child_annotations.append(self.id2annotation[id]) else: raise Exception("Unknown annotation type") self.id2annotation[annotation.id] = annotation #end process lines for annotation in text_annotations: if not include_vague and annotation.id in vague_ids: continue if not include_normal and annotation.id in normal_ids: continue # add concept code to is_valid = process_text_annotation_and_add_codes(annotation) # Process / resolve antecedent relations (anaphora refs) if is_valid and annotation.code == ANAPHORA: # some anaphora tags don't have antecedent mappings if annotation.id in antecedent_mapping: # get all references ref_ids = antecedent_mapping[annotation.id] # for each reference, add a new "Anaphora:<code>" tag for id in ref_ids: reference_annotation = self.id2annotation[id] process_anaphoric_reference(anaphora_annotation=annotation, reference_annotation=reference_annotation) # process causal relations, for annotation in annotations_with_dependencies_inc_crels: deps = annotation.dependencies() # group items grp_causer = dict() grp_result = dict() for dependency in deps: process_text_annotation_and_add_codes(dependency) code = dependency.code splt = code.split(":") typ = splt[0] grp_key = 0 if typ[-1].isdigit(): grp_key = typ[-1] if code.startswith("Cause"): grp_causer[grp_key] = dependency elif code.startswith("Result"): grp_result[grp_key]= dependency else: pass if 
len(grp_causer) > 0 and len(grp_result) > 0: if len(grp_causer) == 1 and len(grp_result) == 1: causer = list(grp_causer.values())[0] result = list(grp_result.values())[0] process_causal_relations(causer, result, antecedent_mapping) elif len(grp_causer) == len(grp_result): for key in grp_causer.keys(): causer = grp_causer[key] result = grp_result[key] process_causal_relations(causer, result, antecedent_mapping) elif len(grp_causer) == 1: causer = list(grp_causer.values())[0] for key, result in grp_result.items(): process_causal_relations(causer, result, antecedent_mapping) elif len(grp_result) == 1: result = list(grp_result.values())[0] for key, causer in grp_causer.items(): process_causal_relations(causer, result, antecedent_mapping) else: raise Exception("Unbalanced CR codes") codes = set() current_word = "" current_sentence = [] def add_pair(current_word, current_sentence, codes, ch, ix): if current_word.strip() != "": pair = (current_word, codes) current_sentence.append(pair) self.tagged_words.append(pair) if ch.strip() != "" and ch != "/": if ix in codes_start: pair2 = (ch, codes_start[ix]) else: pair2 = (ch, set()) current_sentence.append(pair2) self.tagged_words.append(pair2) def onlyascii(s): out = "" for char in s: if ord(char) > 127: out += "" else: out += char return out def first_alnum(s): for c in s: if c.isalnum(): return c return "" def add_sentence(sentence, str_sent): sents = filter(lambda s: len(s) > 1 and s != '//', sent_tokenize(onlyascii(str_sent.strip()))) sents = list(map(lambda s: s.replace("/", " ").replace("-", " - ").replace(")", " ) ").replace(" "," ").strip(), sents)) # the code below handles cases where the sentences are not properly split and we get multiple sentences here if len(sents) > 1: # Only valid splits start with a initial capital letter new_sents = [] for s in sents: if len(new_sents) > 0 and (s.strip() =="\"" or s.strip()[0] in ("(",")") or s.strip()[0].islower() or first_alnum(s).islower()): self.aborted_splits.append(new_sents[-1]) self.aborted_splits.append(s) new_sents[-1] += " " + s else: new_sents.append(s) sents = new_sents if len(sents) > 1: # filter to # of full sentences, and we should get at least this many out expected_min_sents = len([s for s in sents if len(s.strip().split(" ")) > 1]) unique_wds = set(map(lambda s: s.lower(), list(zip(*sentence))[0])) processed = [] partitions = [] for i, sent in enumerate(sents): # last but one only if i < (len(sents) - 1): last = sent.split(" ")[-1] if last.lower() == "temp.": expected_min_sents -= 1 continue if last[-1] in {".", "?", "?", "\n"}: last = last[-1] elif last.lower() not in unique_wds: last = last[-1] assert last.lower() in unique_wds tokens = sents[i + 1].split() first = tokens[0] if first == "//" and len(tokens) > 1: first = tokens[1] if first.lower() not in unique_wds: if not first[-1].isalnum(): first = first[:-1] if not first[0].isalnum(): first = first[0] assert first.lower() in unique_wds, "first.lower():%s not in unique_wds" % first.lower() partitions.append((last, first)) if len(partitions) == 0: # handle the temp. error from above (one sentence where there is a temp. 
Increase) self.tagged_sentences.append(sentence) return current = [] for j in range(0, len(sentence)-1): wd,tag = sentence[j] current.append((wd, tag)) if len(partitions) > 0: last, first = partitions[0] if last == wd: nextWd, nextTg = sentence[j + 1] if first.startswith(nextWd): self.tagged_sentences.append(current) processed.append(list(zip(*current))[0]) current = [] partitions = partitions[1:] current.append(sentence[-1]) self.tagged_sentences.append(current) processed.append(list(zip(*current))[0]) assert len(processed) >= max(2,expected_min_sents) self.split_sents.append(processed) else: self.tagged_sentences.append(sentence) str_sent = "" for ix, ch in enumerate(self.txt): if ch.isalnum() or ch == "'": current_word += ch else: add_pair(current_word, current_sentence, codes.copy(), ch, ix) str_sent += current_word + ch # don't always split on periods, as not all periods terminate sentences (e.g. acronyms) if len(current_sentence) > 0 and \ ((ch in {"\n", "!", "?"}) or (ch == "/" and ix > 0 and self.txt[ix-1] in {"\n", ".", "!", "?"}) ): add_sentence(current_sentence, str_sent) current_sentence = [] str_sent = "" current_word = "" if ix in codes_start: # add in all new codes here codes.update(codes_start[ix]) if ix in codes_end: # remove all codes from the set that end here # this is basically like a remove ALL based on what's in 'codes_end[ix]' codes.difference_update(codes_end[ix]) # add any remaining add_pair(current_word, current_sentence, codes.copy(), "", ix) if len(current_sentence) > 0: self.tagged_sentences.append(current_sentence) for sent in self.tagged_sentences: tags = list(zip(*sent))[1] self.sentence_tags.append(set(flatten(tags)))
if __name__ == "__main__": "1,2,3 => 1" "else => 0" simple_dataset = [ ([1, 2, 3, 7, 8], 1 ), ([1, 2, 3, 4, 5], 1 ), ([7, 8, 1, 2, 3, 4, 5], 1 ), ([1, 2], 0 ), ([2, 3], 0 ), ([1, 3], 0 ), ([5, 6, 7], 0 ), ([4, 1, 2], 0 ), ([3, 7, 9], 0 ), ([1, 2, 4, 5, 6, 7, 8], 0 ), ] xs, ys = zip(*simple_dataset) attributes = list(set(flatten(xs))) dt = DecisionTree(attributes) dt.fit(xs, ys) predictions = dt.predict(xs) acc = accuracy(ys, predictions, class_value=1) print "\nAccuracy: " + str(acc) print "" print str(dt.tree) pass
def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):
    feat_extractors = existing_extractors + [new_extractor]
    feat_config = dict(config.items() + [("extractors", feat_extractors)])

    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    #mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    #essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)

    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(set((t for t in flatten(lst_all_tags) if t[0].isdigit())))

    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags

    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects. The sentences are lists
        # of featureextractortransformer.Word objects
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)

        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
        wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
        wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

        """ TRAIN Tagger """
        tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, lambda: LogisticRegression(),
                                                        wd_train_tags, verbose=False)

        """ TEST Tagger """
        td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
        vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)
        return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag

    #results = Parallel(n_jobs=CV_FOLDS)(
    #    delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #    for (essays_TD, essays_VD) in folds)

    results = [train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
               for (essays_TD, essays_VD) in folds]

    for result in results:
        td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

    # print results for each code
    """ Persist Results to Mongo DB """
    SUFFIX = "_FEAT_SELECTION"
    CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX
    parameters = dict(config)
    parameters["extractors"] = map(lambda fn: fn.func_name, feat_extractors)
    parameters["min_feat_freq"] = MIN_FEAT_FREQ

    wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag,
                                               cv_wd_td_predictions_by_tag, parameters, wd_algo)
    wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag,
                                               cv_wd_vd_predictions_by_tag, parameters, wd_algo)

    avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    return avg_f1

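# merge_dictionaries is not shown in these snippets. Given how it is called above, it is assumed
# to append each fold's per-tag values onto the cross-fold accumulator (a defaultdict of lists):
def merge_dictionaries(fold_results, accumulator):
    for tag, values in fold_results.items():
        accumulator[tag].extend(values)
    return accumulator
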
    modified_sentence = filter2min_word_freq(sentence)
    modified_sentence = filterout_punctuation(modified_sentence)
    if len(modified_sentence) == 0:
        continue

    bookend(modified_sentence)
    new_windows = split_into_windows(modified_sentence, window_size=WINDOW_SIZE)
    assert_windows_correct(new_windows)

    # tagged words
    sentences.append(sentence)
    # words only
    wds, tags = zip(*sentence)
    tokenized_sentences.append(wds)

    ix2sentTags[i] = set(flatten(tags))
    ix2windows[i] = new_windows
    ix2sents[i] = modified_sentence
    i += 1

""" Assert tags set correctly """
print "Windows loaded correctly!\n"
print "\n".join(sorted(removed))

""" Extract Features """
from WindowFeatures import extract_positional_word_features, extract_word_features
from NgramGenerator import compute_ngrams

def extract_positional_bigram_features(window, mid_ix, feature_val=1):
    bi_grams = compute_ngrams(window, max_len=2, min_len=2)

from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load
from passage.preprocessing import Tokenizer
from passage.theano_utils import intX
from passage.iterators import SortedPadded
import theano.tensor as T
from IterableFP import flatten

#tokenizer = Tokenizer() #SH: doesn't work for some reason
#train_tokens = tokenizer.fit_transform(["The big fat frog jumped out of the pond", "frogs are amphibians", "toads are similar to frogs"])
train_tokens = [[1, 2, 4, 3, 6], [1, 2, 3], [3, 1, 2, 4, 3]]
num_feats = len(set(flatten(train_tokens)))

def get_labels(id):
    if id == 3:
        return [1, 0]
    else:
        return [0, 1]

# NOTE: Python 2 only - tuple parameter unpacking in lambdas was removed in Python 3
seq_labels = map(lambda (l): map(get_labels, l), train_tokens)

layers = [
    Embedding(size=128, n_features=num_feats),
    GatedRecurrent(size=128, seq_output=True),
    Dense(size=num_feats, activation='softmax')

]

feat_config = dict(config.items() + [("extractors", extractors)])

""" LOAD DATA """
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")

# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """
_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
regular_tags = list(set((t for t in flatten(lst_all_tags) if t[0].isdigit())))

CAUSE_TAGS = ["Causer", "Result", "explicit"]
CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]  # + ["explicit"]

""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags  #+ CAUSE_TAGS
wd_test_tags = regular_tags  #+ CAUSE_TAGS

# tags from tagging model used to train the stacked model
sent_input_feat_tags = wd_train_tags
# find interactions between these predicted tags from the word tagger to feed to the sentence tagger
sent_input_interaction_tags = wd_train_tags
# tags to train (as output) for the sentence based classifier
sent_output_train_test_tags = list(set(regular_tags + CAUSE_TAGS + CAUSAL_REL_TAGS))

# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """
gw_codes = GWConceptCodes()
tag_freq = get_tag_freq(tagged_essays)
freq_tags = list(set((tag for tag, freq in tag_freq.items()
                      if freq >= MIN_TAG_FREQ and gw_codes.is_valid_code(tag))))
non_causal = [t for t in freq_tags if "->" not in t]
only_causal = [t for t in freq_tags if "->" in t]

_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
regular_tags = list(set((t for t in flatten(lst_all_tags)
                         if "->" not in t and ":" not in t and gw_codes.is_valid_code(t))))

CAUSE_TAGS = ["Causer", "Result", "explicit"]
CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]  # + ["explicit"]

""" """
#
wd_train_tags = list(set(regular_tags + CAUSE_TAGS))
wd_test_tags = wd_train_tags

# tags from tagging model used to train the stacked model
sent_input_feat_tags = wd_train_tags
# find interactions between these predicted tags from the word tagger to feed to the sentence tagger
sent_input_interaction_tags = list(set(wd_train_tags))
