Example #1
    def fit(self, xs, ys):
        attribute_names = list(set(flatten(xs)))
        # Compute pairs of attributes : (attribute_name, is_child_before)
        self.attributes = [(a, True) for a in attribute_names] + [(a, False) for a in attribute_names]
        self.attributes.sort()
        DecisionTreeBase.fit(self, xs, ys)
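These examples all rely on a flatten helper that collapses one level of nesting into a single list (for example, a list of per-word tag lists into one flat list of tags). The project's own implementation is not shown here; a minimal sketch with that assumed behaviour:

from itertools import chain

def flatten(list_of_lists):
    # collapse exactly one level of nesting into a flat list
    return list(chain.from_iterable(list_of_lists))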
Example #2
def test(epochs = 1):
    results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.0, show_accuracy=True, verbose=1)
    probs = flatten( model.predict_proba(X_test, batch_size=batch_size) )
    y_pred = [1 if p >= 0.5 else 0 for p in probs]
    r, p, f1 = rpf1(y_test, y_pred)
    print("recall", r, "precision", p, "f1", f1)
    return f1
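rpf1 is not defined in this snippet; it is assumed to return recall, precision and F1 for binary labels, in that order. A minimal stand-in built on scikit-learn (an assumption, not the project's own helper):

from sklearn.metrics import f1_score, precision_score, recall_score

def rpf1(y_true, y_pred):
    # (recall, precision, f1), matching the order the examples unpack
    return recall_score(y_true, y_pred), precision_score(y_true, y_pred), f1_score(y_true, y_pred)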
Example #3
def get_wordlevel_ys_by_code(lst_tag_sets, expected_tags):
    """
    Convert a list of tagsets to a dictionary of ys values per tag label

    Parameters
    ----------
    lst_tag_sets : a list of sets of tags
        List of tag labels for each word
    expected_tags : a collection of tags
        Tags for which a label vector is always returned, even if the tag
        never occurs in lst_tag_sets

    Returns
    ----------
    A dictionary of codes mapping to binary labels for that code

    """
    unique_tags = set(flatten(lst_tag_sets))
    tmp_ys_bycode = defaultdict(list)
    for tag_set in lst_tag_sets:
        for y in unique_tags:
            tmp_ys_bycode[y].append(1 if y in tag_set else 0)

    num_rows = len(list(tmp_ys_bycode.values())[0])

    # Convert to a dict of numpy arrays
    ys_bycode = dict()
    for tag in expected_tags:
        if tag in tmp_ys_bycode and len(tmp_ys_bycode[tag]) > 0:
            lst = tmp_ys_bycode[tag]
            ys_bycode[tag] = np.asarray(lst, dtype=int).reshape((len(lst), ))
        else:
            ys_bycode[tag] = np.zeros(shape=(num_rows,), dtype=int)
    return ys_bycode
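A small hypothetical call to illustrate the output shape: one binary numpy vector per expected tag, aligned with the word positions in the input.

# hypothetical input: three words with their tag sets
lst_tag_sets = [{"1", "2"}, {"2"}, set()]
ys_by_code = get_wordlevel_ys_by_code(lst_tag_sets, expected_tags=["1", "2", "3"])
# ys_by_code["1"] -> array([1, 0, 0])
# ys_by_code["2"] -> array([1, 1, 0])
# ys_by_code["3"] -> array([0, 0, 0])   (tag never seen, so all zeros)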
def test(epochs=1):
    model.fit(X_train, y_train, n_epochs=epochs,
              batch_size=64)  #64 seems good for now
    predictions = flatten(model.predict(X_test))
    r, p, f1, cutoff = find_cutoff(y_test, predictions)
    print("recall", rnd(r), "precision", rnd(p), "f1", rnd(f1), "cutoff",
          rnd(cutoff))
    return f1
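find_cutoff is another assumed helper: it presumably sweeps decision thresholds over the predicted probabilities and returns the recall, precision, F1 and threshold that maximise F1 (rnd is assumed to round for display). A rough sketch under that assumption:

import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

def find_cutoff(y_true, probs, candidates=np.arange(0.05, 0.96, 0.05)):
    # pick the probability threshold that maximises F1 on these predictions
    best = (0.0, 0.0, 0.0, 0.5)  # (recall, precision, f1, cutoff)
    for cutoff in candidates:
        y_pred = [1 if p >= cutoff else 0 for p in probs]
        f1 = f1_score(y_true, y_pred)
        if f1 > best[2]:
            best = (recall_score(y_true, y_pred), precision_score(y_true, y_pred), f1, cutoff)
    return best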
def tally_code_frequencies(tagged_essays):
    freq = defaultdict(int)
    all_codes = set()
    for essay in tagged_essays:
        for i, sentence in enumerate(essay.sentences):
            words, tags = zip(*sentence)
            utags = set(flatten(tags))
            all_codes.update(utags)
            for t in utags:
                freq[t] += 1
    return freq
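Note that the counts are per sentence rather than per word: each tag is counted once for every sentence in which it occurs at least once. A hypothetical usage, assuming tagged_essays is already loaded:

freq = tally_code_frequencies(tagged_essays)
for tag, count in sorted(freq.items(), key=lambda kv: -kv[1]):
    print(tag, count)  # tags ordered from most to least frequent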
def test(epochs=1):
    results = model.fit(X_train,
                        y_train,
                        batch_size=batch_size,
                        nb_epoch=epochs,
                        validation_split=0.0,
                        show_accuracy=True,
                        verbose=1)
    classes = flatten(model.predict_classes(X_test, batch_size=batch_size))
    r, p, f1 = rpf1(y_test, classes)
    print("recall", r, "precision", p, "f1", f1)
    return f1
Example #7
    def __train__(self, sentences):

        unique_words = set(flatten(sentences))
        syn_map = {}
        mapped = set()
        for sentence in sentences:
            tags = nltk.pos_tag(sentence)
            for wd, tag in tags:
                pair = (wd, tag)
                if pair in mapped:
                    continue
                synonyms = [(s, tag)
                            for s in self.get_synonyms_for_word(wd, tag)
                            if s in unique_words]
                if len(synonyms) >= 1:
                    matches = []
                    for spair in synonyms:
                        if spair in syn_map:
                            matches.append(syn_map[spair])
                    if len(matches) == 0:
                        synset = set(synonyms)
                        synset.add(pair)
                        for p in synset:
                            syn_map[p] = synset
                    elif len(matches) == 1:
                        matches[0].add(pair)
                        syn_map[pair] = matches[0]
                    else:
                        #merge existing synonym lists
                        new_synset = set()
                        for m in matches:
                            new_synset.update(m)
                        #update mapping to map to new larger set
                        for s in new_synset:
                            syn_map[s] = new_synset
                else:  # no synonyms found in the corpus
                    syn_map[pair] = set([pair])
                mapped.add(pair)
        self.synonym_map = {}

        processed = set()
        for values in syn_map.values():
            vid = id(values)
            if vid in processed:
                continue
            processed.add(vid)
            key = list(values)[0]
            for v in values:
                if v in self.synonym_map:
                    raise Exception("Duplicate key %s" % str(v))
                self.synonym_map[v] = key
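After training, self.synonym_map sends every (word, POS tag) pair to a single canonical pair chosen from its synonym cluster. A hypothetical companion method (name and usage are assumptions) showing how the map could normalise a sentence before feature extraction:

    def normalise(self, sentence):
        # hypothetical helper: replace each word with the canonical member of its synonym cluster
        normalised = []
        for wd, tag in nltk.pos_tag(sentence):
            canonical_wd, _ = self.synonym_map.get((wd, tag), (wd, tag))
            normalised.append(canonical_wd)
        return normalised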
Example #8
def evaluate_feature_set(config, existing_extractors, new_extractor,
                         features_filename_prefix):

    feat_extractors = existing_extractors + [new_extractor]
    feat_config = dict(
        list(config.items()) + [("extractors", feat_extractors)])
    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    #mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    #essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)
    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(
        set((t for t in flatten(lst_all_tags) if t[0].isdigit())))
    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags
    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects. The sentences are lists
        # of featureextractortransformer.Word objects
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                                sparse=SPARSE_WD_FEATS)
        td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
        return td_X.shape, vd_X.shape

    #results = Parallel(n_jobs=CV_FOLDS)(
    #        delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #            for (essays_TD, essays_VD) in folds)

    td_col_sizes, vd_col_sizes = [], []
    for (essays_TD, essays_VD) in folds:
        td_x_shape, vd_x_shape = train_tagger(essays_TD, essays_VD,
                                              wd_test_tags, wd_train_tags)
        td_col_sizes.append(td_x_shape[1])
        vd_col_sizes.append(vd_x_shape[1])
    return np.mean(td_col_sizes), np.mean(vd_col_sizes)
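This variant only returns the mean number of feature-matrix columns across folds, which is useful for gauging how much a candidate extractor inflates the feature space. A hypothetical call (the config, extractor and prefix names here are assumptions):

mean_td_cols, mean_vd_cols = evaluate_feature_set(
    config, existing_extractors, new_extractor=candidate_extractor,
    features_filename_prefix="features_")
print("mean train columns:", mean_td_cols, "mean validation columns:", mean_vd_cols)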
def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags):

    wd_train_tags = set(wd_train_tags)

    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    print "\nFold %s" % fold
    print "Training Tagging Model"

    _, lst_every_tag = flatten_to_wordlevel_feat_tags(essay_feats)
    tag_freq = Counter(flatten(lst_every_tag))

    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)

    #TODO: compute most common tags per word for training only (but not for evaluation)
    wd_td_ys = get_wordlevel_mostfrequent_ys(td_tags, wd_train_tags, tag_freq)

    # Get Actual Ys by code (dict of label to predictions)
    wd_td_ys_by_code = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_by_code = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

    #TODO: get most common tags for each word, predict from that using multi class method

    """ TRAIN Tagger """
    model = fn_create_wd_cls()
    model.fit(td_X, wd_td_ys)

    wd_td_pred = model.predict(td_X)
    wd_vd_pred = model.predict(vd_X)

    """ TEST Tagger """
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_td_pred, wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_vd_pred, wd_test_tags)

    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_by_code, wd_vd_ys_by_code
Example #12
                                   remove_punctuation=REMOVE_PUNCTUATION,
                                   lower_case=LOWER_CASE,
                                   include_vague=INCLUDE_VAGUE,
                                   include_normal=INCLUDE_NORMAL)
# FEATURE SETTINGS
WINDOW_SIZE = 7
CV_FOLDS = 5
# END FEATURE SETTINGS
offset = (WINDOW_SIZE - 1) // 2

# don't memoize as it's massive and also fast
word_projector_transformer = WordProjectorTransformer(offset)
essay_feats = word_projector_transformer.transform(tagged_essays)

_, lst_all_tags = flatten_to_wordlevel_vectors_tags(essay_feats)
all_tags = set(flatten(lst_all_tags))

# use more tags for training for sentence level classifier

regular_tags = [t for t in all_tags if t[0].isdigit()]
cause_tags = ["Causer", "Result", "explicit"]
causal_rel_tags = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]  # + ["explicit"]

wd_train_tags = regular_tags + cause_tags
wd_test_tags = regular_tags

folds = cross_validation(essay_feats, CV_FOLDS)
lst_td_wt_mean_prfa, lst_vd_wt_mean_prfa, lst_td_mean_prfa, lst_vd_mean_prfa = [], [], [], []
td_all_metricsByTag = defaultdict(list)
vd_all_metricsByTag = defaultdict(list)
def max_probs_to_words(vector):
    ixs = np.argmax(vector, axis=1)
    return ids_to_words(flatten(ixs))
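For illustration: given a (timesteps x vocabulary) matrix of per-step probabilities, np.argmax(..., axis=1) picks the most likely vocabulary id at each step, and the assumed ids_to_words helper maps those ids back to tokens.

import numpy as np

# hypothetical probability matrix: 3 timesteps over a vocabulary of 4 ids
probs = np.array([[0.1, 0.7, 0.1, 0.1],
                  [0.2, 0.1, 0.6, 0.1],
                  [0.9, 0.05, 0.03, 0.02]])
print(np.argmax(probs, axis=1))  # -> [1 2 0], the ids that ids_to_words would map to tokens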
    def fit(self, xs, ys):
        attribute_names = list(set(flatten(xs)))
        # Compute pairs of attributes : (attribute_name, is_child_before)
        self.attributes = [(a, True) for a in attribute_names] + [(a, False) for a in attribute_names]
        self.attributes.sort()
        DecisionTreeBase.fit(self, xs, ys)
feat_config = dict(list(config.items()) + [("extractors", extractors)])

""" LOAD DATA """
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays( **config )
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """

_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
regular_tags = list(set((t for t in flatten(lst_all_tags) if t[0].isdigit())))

""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags
wd_test_tags  = regular_tags

""" CLASSIFIERS """
""" Log Reg + Log Reg is best!!! """
fn_create_wd_cls   = lambda: LogisticRegression() # C=1, dual = False seems optimal
#fn_create_wd_cls   = lambda : LinearSVC(C=1.0)
#fn_create_wd_cls    = lambda : RandomForestClassifier(n_jobs=8, max_depth=100)
#fn_create_wd_cls   = lambda : GradientBoostingClassifier()

wd_algo   = str(fn_create_wd_cls())
print "Classifier:", wd_algo
""" LOAD DATA """
mem_process_essays = memoize_to_disk(
    filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(
    filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")
""" DEFINE TAGS """

_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
#all_regular_tags = list((t for t in flatten(lst_all_tags) if t[0].isdigit()))
all_regular_tags = list(
    set((t for t in flatten(lst_all_tags) if t.lower().strip() == "anaphor")))
tag_freq = Counter(all_regular_tags)
regular_tags = list(tag_freq.keys())
""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags
wd_test_tags = regular_tags
""" CLASSIFIERS """
""" Log Reg + Log Reg is best!!! """
fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
wd_algo = str(fn_create_wd_cls())
print("Classifier:", wd_algo)

folds = cross_validation(essay_feats, CV_FOLDS)
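cross_validation is assumed to partition the essays into CV_FOLDS (training, validation) pairs. A minimal sketch with that assumed behaviour (the project's own splitter may shuffle or stratify differently):

def cross_validation(items, num_folds):
    # assumed behaviour: return one (training, validation) pair per fold
    folds = []
    for k in range(num_folds):
        validation = [item for i, item in enumerate(items) if i % num_folds == k]
        training = [item for i, item in enumerate(items) if i % num_folds != k]
        folds.append((training, validation))
    return folds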

Example #17

if __name__ == "__main__":

    "1,2,3 => 1"
    "else => 0"
    simple_dataset = [
        ([1, 2, 3, 7, 8], 1),
        ([1, 2, 3, 4, 5], 1),
        ([7, 8, 1, 2, 3, 4, 5], 1),
        ([1, 2], 0),
        ([2, 3], 0),
        ([1, 3], 0),
        ([5, 6, 7], 0),
        ([4, 1, 2], 0),
        ([3, 7, 9], 0),
        ([1, 2, 4, 5, 6, 7, 8], 0),
    ]

    xs, ys = zip(*simple_dataset)
    attributes = list(set(flatten(xs)))

    dt = DecisionTree(attributes)
    dt.fit(xs, ys)
    predictions = dt.predict(xs)

    acc = accuracy(ys, predictions, class_value=1)
    print "\nAccuracy: " + str(acc)
    print ""
    print str(dt.tree)
    pass
    def __init__(self,
                 full_path,
                 include_vague=True,
                 include_normal=True,
                 load_annotations=True,
                 essay_text=None):

        self.include_normal = include_normal
        self.include_vague = include_vague

        if essay_text is None:
            self.full_path = full_path
            self.file_name = full_path.split("/")[-1]
            txt_file = full_path[:-4] + ".txt"
            with open(txt_file, "r+") as f:
                self.txt = f.read()
        else:
            if load_annotations:
                raise Exception(
                    "Can't load annotations when pasing in essay as text string"
                )
            self.full_path = "None"
            self.file_name = "None"
            self.txt = essay_text

        self.tagged_words = []
        #list of list of tuples (words and tags)
        self.tagged_sentences = []
        # list of sets of tags
        self.sentence_tags = []
        self.id2annotation = {}
        self.split_sents = []
        self.aborted_splits = []

        if load_annotations:
            assert full_path.endswith(".ann")
            assert os.path.exists(
                txt_file
            ), "Missing associated text file for %s" % self.full_path

            with open(full_path, "r+") as f:
                lines = f.readlines()
        else:
            lines = []

        codes_start = defaultdict(set)
        codes_end = defaultdict(set)

        def get_code(annotation):
            if ":" not in annotation.code:
                return annotation.code

            typ, id = annotation.code.split(":")
            """ strip off the trailing digit (e.g. Causer1:50) """
            if typ[-1].isdigit():
                typ = typ[:-1]
            return typ + ":" + id

        def process_causal_relations(causer, result):
            start = min(causer.start, result.start)
            end = max(causer.end, result.end)
            if start == end:
                return False
            cr_code = get_code(causer) + "->" + get_code(result)
            codes_start[start].add(cr_code)
            codes_end[end].add(cr_code)
            return True

        def process_text_annotation(annotation):
            if annotation.start == annotation.end:
                return False
            codes_end[annotation.end].add(get_code(annotation))
            codes_start[annotation.start].add(get_code(annotation))
            if hasattr(annotation, "dep_type"):
                codes_start[annotation.start].add(annotation.dep_type)
                codes_end[annotation.end].add(annotation.dep_type)
            return True

        annotations_with_dependencies = []
        text_annotations = []
        vague_ids = set()
        normal_ids = set()
        for line in lines:
            if len(line.strip()) < 2:
                continue

            if not line[1].isdigit():
                print("Skipping annotation line: %s \n\tin essay %s" %
                      (line.strip(), self.file_name))
                continue

            first_char = line[0]
            if first_char == "T":
                if is_compound(line):
                    annotation = CompoundTextAnnotation(line, self.txt)
                    text_annotations.append(annotation.first_part)
                    text_annotations.append(annotation.second_part)
                    if annotation.third_part:
                        text_annotations.append(annotation.third_part)
                    """ DEBUGGING
                    print ""
                    print line.strip()
                    print annotation.txt
                    print "First:  ", self.txt[annotation.first_part.start:annotation.first_part.end]
                    print "Second: ", self.txt[annotation.second_part.start:annotation.second_part.end]
                    print annotation.first_part.start, annotation.first_part.end, " ",
                    print annotation.second_part.start, annotation.second_part.end
                    """
                else:
                    try:
                        annotation = TextAnnotation(line, self.txt)
                        #Bad annotation, ignore
                        if annotation.start == annotation.end:
                            continue
                        else:
                            text_annotations.append(annotation)
                    except Exception as e:
                        import traceback
                        print(traceback.format_exc())
                        raise e

            elif first_char == "A":
                annotation = AttributeAnnotation(line)
                if annotation.attribute == "Vague":
                    vague_ids.add(annotation.target_id)
                if annotation.attribute == "Normal":
                    normal_ids.add(annotation.target_id)
                for id in annotation.child_annotation_ids:
                    annotation.child_annotations.append(self.id2annotation[id])
            elif first_char == "R":
                annotation = RelationshipAnnotation(line)
            elif first_char == "E":
                annotation = EventAnnotation(line, self.id2annotation)
                annotations_with_dependencies.append(annotation)
            elif first_char == "#":
                annotation = NoteAnnotation(line)
                for id in annotation.child_annotation_ids:
                    annotation.child_annotations.append(self.id2annotation[id])
            else:
                raise Exception("Unknown annotation type")
            self.id2annotation[annotation.id] = annotation
        #end process lines

        for annotation in text_annotations:
            if not include_vague and annotation.id in vague_ids:
                continue
            if not include_normal and annotation.id in normal_ids:
                continue
            process_text_annotation(annotation)

        for annotation in annotations_with_dependencies:
            deps = annotation.dependencies()
            # group items
            grp_causer = dict()
            grp_result = dict()
            for dependency in deps:
                process_text_annotation(dependency)

                code = dependency.code
                splt = code.split(":")
                typ = splt[0]
                grp_key = 0
                if typ[-1].isdigit():
                    grp_key = typ[-1]
                if code.startswith("Cause"):
                    grp_causer[grp_key] = dependency
                elif code.startswith("Result"):
                    grp_result[grp_key] = dependency
                else:
                    pass

            if len(grp_causer) > 0 and len(grp_result) > 0:
                if len(grp_causer) == 1 and len(grp_result) == 1:
                    causer = list(grp_causer.values())[0]
                    result = list(grp_result.values())[0]
                    process_causal_relations(causer, result)
                elif len(grp_causer) == len(grp_result):
                    for key in grp_causer.keys():
                        causer = grp_causer[key]
                        result = grp_result[key]
                        process_causal_relations(causer, result)
                elif len(grp_causer) == 1:
                    causer = list(grp_causer.values())[0]
                    for key, result in grp_result.items():
                        process_causal_relations(causer, result)
                elif len(grp_result) == 1:
                    result = list(grp_result.values())[0]
                    for key, causer in grp_causer.items():
                        process_causal_relations(causer, result)
                else:
                    raise Exception("Unbalanced CR codes")

        codes = set()
        current_word = ""
        current_sentence = []

        def add_pair(current_word, current_sentence, codes, ch, ix):
            if current_word.strip() != "":
                pair = (current_word, codes)
                current_sentence.append(pair)
                self.tagged_words.append(pair)
            if ch.strip() != "" and ch != "/":
                if ix in codes_start:
                    pair2 = (ch, codes_start[ix])
                else:
                    pair2 = (ch, set())
                current_sentence.append(pair2)
                self.tagged_words.append(pair2)

        def onlyascii(s):
            out = ""
            for char in s:
                if ord(char) > 127:
                    out += ""
                else:
                    out += char
            return out

        def first_alnum(s):
            for c in s:
                if c.isalnum():
                    return c
            return ""

        def add_sentence(sentence, str_sent):

            sents = filter(lambda s: len(s) > 1 and s != '//',
                           sent_tokenize(onlyascii(str_sent.strip())))
            sents = list(
                map(
                    lambda s: s.replace("/", " ").replace("-", " - ").replace(
                        ")", " ) ").replace("  ", " ").strip(), sents))

            # the code below handles cases where the sentences are not properly split and we get multiple sentences here
            if len(sents) > 1:
                # Only valid splits start with an initial capital letter

                new_sents = []
                for s in sents:
                    if len(new_sents) > 0 and (s.strip() == "\""
                                               or s.strip()[0] in ("(", ")")
                                               or s.strip()[0].islower()
                                               or first_alnum(s).islower()):
                        self.aborted_splits.append(new_sents[-1])
                        self.aborted_splits.append(s)
                        new_sents[-1] += " " + s
                    else:
                        new_sents.append(s)
                sents = new_sents

            if len(sents) > 1:

                # filter to # of full sentences, and we should get at least this many out
                expected_min_sents = len(
                    [s for s in sents if len(s.strip().split(" ")) > 1])

                unique_wds = set(
                    map(lambda s: s.lower(),
                        list(zip(*sentence))[0]))

                processed = []
                partitions = []
                for i, sent in enumerate(sents):
                    # last but one only
                    if i < (len(sents) - 1):
                        last = sent.split(" ")[-1]
                        if last.lower() == "temp.":
                            expected_min_sents -= 1
                            continue
                        if last[-1] in {".", "?", "?", "\n"}:
                            last = last[-1]
                        elif last.lower() not in unique_wds:
                            last = last[-1]

                        assert last.lower() in unique_wds

                        tokens = sents[i + 1].split()
                        first = tokens[0]
                        if first == "//" and len(tokens) > 1:
                            first = tokens[1]
                        if first.lower() not in unique_wds:
                            if not first[-1].isalnum():
                                first = first[:-1]
                            if not first[0].isalnum():
                                first = first[0]

                        assert first.lower() in unique_wds, "first.lower():%s  not in unique_wds" % first.lower()

                        partitions.append((last, first))

                if len(partitions) == 0:
                    # handle the temp. error from above (one sentence where there is a temp. Increase)
                    self.tagged_sentences.append(sentence)
                    return
                current = []
                for j in range(0, len(sentence) - 1):
                    wd, tag = sentence[j]
                    current.append((wd, tag))
                    if len(partitions) > 0:
                        last, first = partitions[0]
                        if last == wd:
                            nextWd, nextTg = sentence[j + 1]
                            if first.startswith(nextWd):
                                self.tagged_sentences.append(current)
                                processed.append(list(zip(*current))[0])
                                current = []
                                partitions = partitions[1:]
                current.append(sentence[-1])
                self.tagged_sentences.append(current)
                processed.append(list(zip(*current))[0])
                assert len(processed) >= max(2, expected_min_sents)
                self.split_sents.append(processed)
            else:
                self.tagged_sentences.append(sentence)

        str_sent = ""
        for ix, ch in enumerate(self.txt):

            if ch.isalnum() or ch == "'":
                current_word += ch
            else:
                add_pair(current_word, current_sentence, codes.copy(), ch, ix)
                str_sent += current_word + ch
                # don't always split on periods, as not all periods terminate sentences (e.g. acronyms)
                if len(current_sentence) > 0 and \
                        ((ch in {"\n", "!", "?"}) or
                         (ch == "/" and ix > 0 and self.txt[ix-1] in {"\n", ".", "!", "?"})
                        ):
                    add_sentence(current_sentence, str_sent)
                    current_sentence = []
                    str_sent = ""
                current_word = ""

            if ix in codes_start:
                codes.update(codes_start[ix])
            if ix in codes_end:
                codes.difference_update(codes_end[ix])

        # add any remaining
        add_pair(current_word, current_sentence, codes.copy(), "", ix)
        if len(current_sentence) > 0:
            self.tagged_sentences.append(current_sentence)
        for sent in self.tagged_sentences:
            tags = list(zip(*sent))[1]
            self.sentence_tags.append(set(flatten(tags)))
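Assuming this constructor belongs to the project's essay class (the enclosing class statement is not shown, so the name Essay and the file path below are assumptions), a hypothetical instantiation parses a .ann/.txt pair and exposes word- and sentence-level tags:

essay = Essay("/path/to/essay01.ann", include_vague=True, include_normal=True)
print(len(essay.tagged_sentences))    # number of detected sentences
print(essay.tagged_sentences[0][:5])  # first five (word, tag-set) pairs
print(essay.sentence_tags[0])         # union of tags in the first sentence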
Example #19
    filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")
""" DEFINE TAGS """
gw_codes = GWConceptCodes()
tag_freq = get_tag_freq(tagged_essays)
freq_tags = list(
    set((tag for tag, freq in tag_freq.items()
         if freq >= MIN_TAG_FREQ and gw_codes.is_valid_code(tag))))

non_causal = [t for t in freq_tags if "->" not in t]
only_causal = [t for t in freq_tags if "->" in t]

_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
regular_tags = list(
    set((t for t in flatten(lst_all_tags)
         if "->" not in t and ":" not in t and gw_codes.is_valid_code(t))))

CAUSE_TAGS = ["Causer", "Result", "explicit"]
CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]  # + ["explicit"]
"""  """
#
wd_train_tags = list(
    set(regular_tags + only_causal + CAUSE_TAGS + CAUSAL_REL_TAGS))
wd_test_tags = wd_train_tags

# tags to evaluate against
""" CLASSIFIERS """
""" Log Reg + Log Reg is best!!! """

f_output_file = open(out_predictions_file, "w+")
                                       spelling_correct=SPELLING_CORRECT,
                                       replace_nums=REPLACE_NUMS, stem=STEM, remove_stop_words=REMOVE_STOP_WORDS,
                                       remove_punctuation=REMOVE_PUNCTUATION, lower_case=LOWER_CASE,
                                       include_vague=INCLUDE_VAGUE, include_normal=INCLUDE_NORMAL)
# FEATURE SETTINGS
WINDOW_SIZE         = 7
CV_FOLDS            = 5
# END FEATURE SETTINGS
offset = (WINDOW_SIZE-1) // 2

# don't memoize as it's massive and also fast
word_projector_transformer = WordProjectorTransformer(offset)
essay_feats = word_projector_transformer.transform(tagged_essays)

_, lst_all_tags = flatten_to_wordlevel_vectors_tags(essay_feats)
all_tags = set(flatten(lst_all_tags))

# use more tags for training for sentence level classifier

regular_tags = [t for t in all_tags if t[0].isdigit()]
cause_tags = ["Causer", "Result", "explicit"]
causal_rel_tags = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]# + ["explicit"]

wd_train_tags = regular_tags + cause_tags
wd_test_tags  = regular_tags


folds = cross_validation(essay_feats, CV_FOLDS)
lst_td_wt_mean_prfa, lst_vd_wt_mean_prfa, lst_td_mean_prfa, lst_vd_mean_prfa = [], [], [], []
td_all_metricsByTag = defaultdict(list)
vd_all_metricsByTag = defaultdict(list)
Example #21
feat_config = dict(list(config.items()) + [("extractors", extractors)])

""" LOAD DATA """
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays( **config )
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """

_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
#all_regular_tags = list((t for t in flatten(lst_all_tags) if t[0].isdigit()))
all_regular_tags = list(set((t for t in flatten(lst_all_tags) if t.lower().strip() == "anaphor" )))
tag_freq = Counter(all_regular_tags)
regular_tags = list(tag_freq.keys())

""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags
wd_test_tags  = regular_tags

""" CLASSIFIERS """
""" Log Reg + Log Reg is best!!! """
fn_create_wd_cls   = lambda: LogisticRegression() # C=1, dual = False seems optimal
wd_algo   = str(fn_create_wd_cls())
print("Classifier:", wd_algo)

folds = cross_validation(essay_feats, CV_FOLDS)
        modified_sentence = filter2min_word_freq(sentence)
        modified_sentence = filterout_punctuation(modified_sentence)
        if len(modified_sentence) == 0:
            continue

        bookend(modified_sentence)
        new_windows = split_into_windows(modified_sentence, window_size= WINDOW_SIZE)
        assert_windows_correct(new_windows)

        # tagged words
        sentences.append(sentence)
        # words only
        wds, tags = zip(*sentence)
        tokenized_sentences.append(wds)
        ix2sentTags[i] = set(flatten(tags))

        ix2windows[i] = new_windows
        ix2sents[i] = modified_sentence
        i += 1

""" Assert tags set correctly """
print "Windows loaded correctly!\n"

print "\n".join(sorted(removed))

""" Extract Features """
from WindowFeatures import extract_positional_word_features, extract_word_features
from NgramGenerator import compute_ngrams

def extract_positional_bigram_features(window, mid_ix, feature_val = 1):
    def __init__(self, full_path, include_vague = True, include_normal = True, load_annotations = True, essay_text = None):

        self.include_normal = include_normal
        self.include_vague = include_vague

        txt_file = full_path[:-4] + ".txt"
        if essay_text is None:
            self.full_path = full_path
            self.file_name = full_path.split("/")[-1]
            with open(txt_file, "r+") as f:
                self.txt = f.read()
        else:
            if load_annotations:
                raise Exception("Can't load annotations when passing in essay as text string")
            self.full_path = "None"
            self.file_name = "None"
            self.txt = essay_text

        self.tagged_words = []
        #list of list of tuples (words and tags)
        self.tagged_sentences = []
        # list of sets of tags
        self.sentence_tags = []
        self.id2annotation = {}
        self.split_sents = []
        self.aborted_splits = []

        if load_annotations:
            assert full_path.endswith(".ann")
            assert os.path.exists(txt_file), "Missing associated text file for %s" % self.full_path

            with open(full_path, "r+") as f:
                lines = f.readlines()
        else:
            lines = []

        codes_start = defaultdict(set)
        codes_end = defaultdict(set)

        def get_code(annotation):
            if ":" not in annotation.code:
                return annotation.code

            typ, id = annotation.code.split(":")
            """ strip off the trailing digit (e.g. Causer1:50) """
            if typ[-1].isdigit():
                typ = typ[:-1]
            return typ + ":" + id

        def process_causal_relations(causer, result, antecedent_mapping):
            start = min(causer.start, result.start)
            end = max(causer.end, result.end)
            if start == end:
                return False
            causer_code = get_code(causer)
            result_code = get_code(result)

            cr_code = causer_code + "->" + result_code
            codes_start[start].add(cr_code)
            codes_end[end].add(cr_code)

            if ANAPHORA.lower() in causer_code.lower():
                #assert causer.id in antecedent_mapping, "No antecedent mapping found for annotation:" + str(causer)
                reference_ids = antecedent_mapping[causer.id]
                for ref_id in reference_ids:
                    reference_annotation = self.id2annotation[ref_id]
                    cr_ana_code = "{causer_code}[{reference_code}]->{result_code}".format(
                        causer_code=causer_code, reference_code=reference_annotation.code,
                        result_code=result_code
                    )
                    codes_start[start].add(cr_ana_code)
                    codes_end[end].add(cr_ana_code)

            if ANAPHORA.lower() in result_code.lower():
                #assert result.id in antecedent_mapping, "No antecedent mapping found for annotation:" + str(result)
                reference_ids = antecedent_mapping[result.id]
                for ref_id in reference_ids:
                    reference_annotation = self.id2annotation[ref_id]
                    cr_ana_code = "{causer_code}->{result_code}[{reference_code}]".format(
                        causer_code=causer_code, reference_code=reference_annotation.code,
                        result_code=result_code
                    )
                    codes_start[start].add(cr_ana_code)
                    codes_end[end].add(cr_ana_code)

            return True

        def process_text_annotation_and_add_codes(annotation):
            if annotation.start == annotation.end:
                return False
            codes_end[annotation.end].add(get_code(annotation))
            codes_start[annotation.start].add(get_code(annotation))
            if hasattr(annotation, "dep_type"):
                codes_start[annotation.start].add(annotation.dep_type)
                codes_end[annotation.end].add(annotation.dep_type)
            return True

        def process_anaphoric_reference(anaphora_annotation, reference_annotation)->None:
            """
            Adds start and end code for anaphoric references, resolving them with the
            location of the anaphora tag, but the code from the reference annotation
            :param anaphora_annotation:  text annotation with the Anaphor tag
            :param reference_annotation:  text annotation that was a reference for that tag
            :return: None
            """
            assert anaphora_annotation.code == ANAPHORA, "Code is not anaphora"
            # prepend Anaphor tag to the code so we can differentiate from the regular codes
            reference_code =  "{anaphora}:[{code}]".format(anaphora=ANAPHORA, code=get_code(reference_annotation))
            codes_end[anaphora_annotation.end].add(reference_code)
            codes_start[anaphora_annotation.start].add(reference_code)

        annotations_with_dependencies_inc_crels = []
        text_annotations = []
        antecedent_mapping = defaultdict(set) # map of id to a Set[id] for resolving co-references
        vague_ids = set()
        normal_ids = set()
        for line in lines:
            if len(line.strip()) < 2:
                continue

            if not line[1].isdigit():
                print("Skipping annotation line: %s \n\tin essay %s" % (line.strip(), self.file_name))
                continue

            first_char = line[0]
            if first_char == "T":
                if is_compound(line):
                    annotation = CompoundTextAnnotation(line, self.txt)
                    text_annotations.append(annotation.first_part)
                    text_annotations.append(annotation.second_part)
                    if annotation.third_part:
                        text_annotations.append(annotation.third_part)

                    """ DEBUGGING
                    print ""
                    print line.strip()
                    print annotation.txt
                    print "First:  ", self.txt[annotation.first_part.start:annotation.first_part.end]
                    print "Second: ", self.txt[annotation.second_part.start:annotation.second_part.end]
                    print annotation.first_part.start, annotation.first_part.end, " ",
                    print annotation.second_part.start, annotation.second_part.end
                    """
                else:
                    try:
                        annotation = TextAnnotation(line, self.txt)
                        #Bad annotation, ignore
                        if annotation.start == annotation.end:
                            continue
                        else:
                            text_annotations.append(annotation)
                    except Exception as e:
                        import traceback
                        print(traceback.format_exc())
                        raise e

            elif first_char == "A":
                annotation = AttributeAnnotation(line)
                if annotation.attribute == "Vague":
                    vague_ids.add(annotation.target_id)
                if annotation.attribute == "Normal":
                    normal_ids.add(annotation.target_id)
                for id in annotation.child_annotation_ids:
                    annotation.child_annotations.append(self.id2annotation[id])
            elif first_char == "R":
                annotation = RelationshipAnnotation(line)
                antecedent_mapping[annotation.arg1_code].add(annotation.arg2_code)
            elif first_char == "E":
                annotation = EventAnnotation(line, self.id2annotation)
                annotations_with_dependencies_inc_crels.append(annotation)
            elif first_char == "#":
                annotation = NoteAnnotation(line)
                for id in annotation.child_annotation_ids:
                    annotation.child_annotations.append(self.id2annotation[id])
            else:
                raise Exception("Unknown annotation type")
            self.id2annotation[annotation.id] = annotation
        #end process lines

        for annotation in text_annotations:
            if not include_vague and annotation.id in vague_ids:
                continue
            if not include_normal and annotation.id in normal_ids:
                continue

            # add concept codes to the codes_start / codes_end maps
            is_valid = process_text_annotation_and_add_codes(annotation)
            # Process / resolve antecedent relations (anaphora refs)
            if is_valid and annotation.code == ANAPHORA:
                # some anaphora tags don't have antecedent mappings
                if annotation.id in antecedent_mapping:
                    # get all references
                    ref_ids = antecedent_mapping[annotation.id]
                    # for each reference, add a new "Anaphora:<code>" tag
                    for id in ref_ids:
                        reference_annotation = self.id2annotation[id]
                        process_anaphoric_reference(anaphora_annotation=annotation,
                                                reference_annotation=reference_annotation)
        # process causal relations,
        for annotation in annotations_with_dependencies_inc_crels:
            deps = annotation.dependencies()
            # group items
            grp_causer = dict()
            grp_result = dict()
            for dependency in deps:
                process_text_annotation_and_add_codes(dependency)

                code = dependency.code
                splt = code.split(":")
                typ = splt[0]
                grp_key = 0
                if typ[-1].isdigit():
                    grp_key = typ[-1]
                if code.startswith("Cause"):
                    grp_causer[grp_key] = dependency
                elif code.startswith("Result"):
                    grp_result[grp_key]= dependency
                else:
                    pass

            if len(grp_causer) > 0 and len(grp_result) > 0:
                if len(grp_causer) == 1 and len(grp_result) == 1:
                    causer = list(grp_causer.values())[0]
                    result = list(grp_result.values())[0]
                    process_causal_relations(causer, result, antecedent_mapping)
                elif len(grp_causer) == len(grp_result):
                    for key in grp_causer.keys():
                        causer = grp_causer[key]
                        result = grp_result[key]
                        process_causal_relations(causer, result, antecedent_mapping)
                elif len(grp_causer) == 1:
                    causer = list(grp_causer.values())[0]
                    for key, result in grp_result.items():
                        process_causal_relations(causer, result, antecedent_mapping)
                elif len(grp_result) == 1:
                    result = list(grp_result.values())[0]
                    for key, causer in grp_causer.items():
                        process_causal_relations(causer, result, antecedent_mapping)
                else:
                    raise Exception("Unbalanced CR codes")

        codes = set()
        current_word = ""
        current_sentence = []

        def add_pair(current_word, current_sentence, codes, ch, ix):
            if current_word.strip() != "":
                pair = (current_word, codes)
                current_sentence.append(pair)
                self.tagged_words.append(pair)
            if ch.strip() != "" and ch != "/":
                if ix in codes_start:
                    pair2 = (ch, codes_start[ix])
                else:
                    pair2 = (ch, set())
                current_sentence.append(pair2)
                self.tagged_words.append(pair2)

        def onlyascii(s):
            out = ""
            for char in s:
                if ord(char) > 127:
                    out += ""
                else:
                    out += char
            return out

        def first_alnum(s):
            for c in s:
                if c.isalnum():
                    return c
            return ""

        def add_sentence(sentence, str_sent):

            sents = filter(lambda s: len(s) > 1 and s != '//', sent_tokenize(onlyascii(str_sent.strip())))
            sents = list(map(lambda s: s.replace("/", " ").replace("-", " - ").replace(")", " ) ").replace("  "," ").strip(), sents))

            # the code below handles cases where the sentences are not properly split and we get multiple sentences here
            if len(sents) > 1:
                # Only valid splits start with an initial capital letter

                new_sents = []
                for s in sents:
                    if len(new_sents) > 0 and (s.strip() =="\"" or s.strip()[0] in ("(",")") or s.strip()[0].islower() or first_alnum(s).islower()):
                        self.aborted_splits.append(new_sents[-1])
                        self.aborted_splits.append(s)
                        new_sents[-1] += " " + s
                    else:
                        new_sents.append(s)
                sents = new_sents

            if len(sents) > 1:

                # filter to # of full sentences, and we should get at least this many out
                expected_min_sents = len([s for s in sents if len(s.strip().split(" ")) > 1])

                unique_wds = set(map(lambda s: s.lower(), list(zip(*sentence))[0]))

                processed = []
                partitions = []
                for i, sent in enumerate(sents):
                    # last but one only
                    if i < (len(sents) - 1):
                        last = sent.split(" ")[-1]
                        if last.lower() == "temp.":
                            expected_min_sents -= 1
                            continue
                        if last[-1] in {".", "?", "?", "\n"}:
                            last = last[-1]
                        elif last.lower() not in unique_wds:
                            last = last[-1]

                        assert last.lower() in unique_wds

                        tokens = sents[i + 1].split()
                        first = tokens[0]
                        if first == "//" and len(tokens) > 1:
                            first = tokens[1]
                        if first.lower() not in unique_wds:
                            if not first[-1].isalnum():
                                first = first[:-1]
                            if not first[0].isalnum():
                                first = first[0]

                        assert first.lower() in unique_wds, "first.lower():%s  not in unique_wds" % first.lower()

                        partitions.append((last, first))

                if len(partitions) == 0:
                    # handle the temp. error from above (one sentence where there is a temp. Increase)
                    self.tagged_sentences.append(sentence)
                    return
                current = []
                for j in range(0, len(sentence)-1):
                    wd,tag = sentence[j]
                    current.append((wd, tag))
                    if len(partitions) > 0:
                        last, first = partitions[0]
                        if last == wd:
                            nextWd, nextTg = sentence[j + 1]
                            if first.startswith(nextWd):
                                self.tagged_sentences.append(current)
                                processed.append(list(zip(*current))[0])
                                current = []
                                partitions = partitions[1:]
                current.append(sentence[-1])
                self.tagged_sentences.append(current)
                processed.append(list(zip(*current))[0])
                assert len(processed) >= max(2,expected_min_sents)
                self.split_sents.append(processed)
            else:
                self.tagged_sentences.append(sentence)

        str_sent = ""
        for ix, ch in enumerate(self.txt):

            if ch.isalnum() or ch == "'":
                current_word += ch
            else:
                add_pair(current_word, current_sentence, codes.copy(), ch, ix)
                str_sent += current_word + ch
                # don't always split on periods, as not all periods terminate sentences (e.g. acronyms)
                if len(current_sentence) > 0 and \
                        ((ch in {"\n", "!", "?"}) or
                         (ch == "/" and ix > 0 and self.txt[ix-1] in {"\n", ".", "!", "?"})
                        ):
                    add_sentence(current_sentence, str_sent)
                    current_sentence = []
                    str_sent = ""
                current_word = ""

            if ix in codes_start:
                # add in all new codes here
                codes.update(codes_start[ix])
            if ix in codes_end:
                # remove all codes from the set that end here
                # this is basically like a remove ALL based on what's in 'codes_end[ix]'
                codes.difference_update(codes_end[ix])

        # add any remaining
        add_pair(current_word, current_sentence, codes.copy(), "", ix)
        if len(current_sentence) > 0:
            self.tagged_sentences.append(current_sentence)
        for sent in self.tagged_sentences:
            tags = list(zip(*sent))[1]
            self.sentence_tags.append(set(flatten(tags)))
if __name__ == "__main__":

    "1,2,3 => 1"
    "else => 0"
    simple_dataset = [
        ([1, 2, 3, 7, 8], 1 ),
        ([1, 2, 3, 4, 5], 1 ),
        ([7, 8, 1, 2, 3, 4, 5], 1 ),

        ([1, 2], 0 ),
        ([2, 3], 0 ),
        ([1, 3], 0 ),
        ([5, 6, 7], 0 ),
        ([4, 1, 2], 0 ),
        ([3, 7, 9], 0 ),
        ([1, 2, 4, 5, 6, 7, 8], 0 ),
    ]

    xs, ys = zip(*simple_dataset)
    attributes = list(set(flatten(xs)))

    dt = DecisionTree(attributes)
    dt.fit(xs, ys)
    predictions = dt.predict(xs)

    acc = accuracy(ys, predictions, class_value=1)
    print "\nAccuracy: " + str(acc)
    print ""
    print str(dt.tree)
    pass
feat_config = dict(list(config.items()) + [("extractors", extractors)])

""" LOAD DATA """
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays( **config )
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """

_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
#all_regular_tags = list((t for t in flatten(lst_all_tags) if t[0].isdigit()))
all_regular_tags = list(set((t for t in flatten(lst_all_tags) if t.lower().strip() == "anaphor" )))
tag_freq = Counter(all_regular_tags)
regular_tags = list(tag_freq.keys())

""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags
wd_test_tags  = regular_tags

""" CLASSIFIERS """
""" Log Reg + Log Reg is best!!! """
fn_create_wd_cls   = lambda: LogisticRegression() # C=1, dual = False seems optimal
wd_algo   = str(fn_create_wd_cls())
print("Classifier:", wd_algo)

folds = cross_validation(essay_feats, CV_FOLDS)
def max_probs_to_words(vector):
    ixs = np.argmax(vector, axis=1)
    return ids_to_words(flatten(ixs))
def test(epochs = 1):
    results = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_split=0.0, show_accuracy=True, verbose=1)
    classes = flatten( model.predict_classes(X_test, batch_size=batch_size) )
    r, p, f1 = rpf1(y_test, classes)
    print("recall", r, "precision", p, "f1", f1)
    return f1
def test(epochs=1):
    model.fit(X_train, y_train, n_epochs=epochs, batch_size=64)#64 seems good for now
    predictions = flatten(model.predict(X_test))
    r, p, f1, cutoff = find_cutoff(y_test, predictions)
    print("recall", rnd(r), "precision", rnd(p), "f1", rnd(f1), "cutoff", rnd(cutoff))
    return f1
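find_cutoff is called above but not shown in these examples. A plausible sketch - an assumption, not the original implementation - is a brute-force search over probability thresholds that returns the recall, precision, F1 and cutoff of the best-scoring threshold:

import numpy as np

def find_cutoff_sketch(y_true, probs, candidates=None):
    # assumed behaviour: return (recall, precision, f1, cutoff) for the F1-maximising threshold
    y_true, probs = np.asarray(y_true), np.asarray(probs)
    if candidates is None:
        candidates = np.arange(0.05, 1.0, 0.05)
    best = (0.0, 0.0, 0.0, 0.5)
    for cutoff in candidates:
        y_pred = (probs >= cutoff).astype(int)
        tp = np.sum((y_pred == 1) & (y_true == 1))
        fp = np.sum((y_pred == 1) & (y_true == 0))
        fn = np.sum((y_pred == 0) & (y_true == 1))
        p  = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
        r  = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0.0
        if f1 > best[2]:
            best = (r, p, f1, cutoff)
    return best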
Exemple #29
0
def evaluate_feature_set(config, existing_extractors, new_extractor,
                         features_filename_prefix):

    feat_extractors = existing_extractors + [new_extractor]
    feat_config = dict(list(config.items()) + [("extractors", feat_extractors)])
    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    #mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    #essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)
    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(
        set((t for t in flatten(lst_all_tags) if t[0].isdigit())))
    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags
    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression(
    )  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(
        list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(
        list), defaultdict(list)
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects. The sentences are lists
        # of featureextractortransformer.Word objects
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                                sparse=SPARSE_WD_FEATS)
        td_X, vd_X = feature_transformer.fit_transform(
            td_feats), feature_transformer.transform(vd_feats)
        wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
        wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)
        """ TRAIN Tagger """
        tag2word_classifier = train_classifier_per_code(
            td_X,
            wd_td_ys_bytag,
            lambda: LogisticRegression(),
            wd_train_tags,
            verbose=False)
        """ TEST Tagger """
        td_wd_predictions_by_code = test_classifier_per_code(
            td_X, tag2word_classifier, wd_test_tags)
        vd_wd_predictions_by_code = test_classifier_per_code(
            vd_X, tag2word_classifier, wd_test_tags)
        return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag

    #results = Parallel(n_jobs=CV_FOLDS)(
    #        delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #            for (essays_TD, essays_VD) in folds)

    results = [
        train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
        for (essays_TD, essays_VD) in folds
    ]

    for result in results:
        td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code,
                           cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code,
                           cv_wd_vd_predictions_by_tag)

    # print results for each code
    """ Persist Results to Mongo DB """
    SUFFIX = "_FEAT_SELECTION"
    CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX
    parameters = dict(config)
    parameters["extractors"] = [fn.__name__ for fn in feat_extractors]
    parameters["min_feat_freq"] = MIN_FEAT_FREQ

    wd_td_objectid = processor.persist_results(CB_TAGGING_TD,
                                               cv_wd_td_ys_by_tag,
                                               cv_wd_td_predictions_by_tag,
                                               parameters, wd_algo)
    wd_vd_objectid = processor.persist_results(CB_TAGGING_VD,
                                               cv_wd_vd_ys_by_tag,
                                               cv_wd_vd_predictions_by_tag,
                                               parameters, wd_algo)

    avg_f1 = float(
        processor.get_metric(CB_TAGGING_VD, wd_vd_objectid,
                             __MICRO_F1__)["f1_score"])
    return avg_f1
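The _FEAT_SELECTION suffix suggests evaluate_feature_set is driven by a feature-selection loop. A hedged sketch of a greedy forward search over candidate extractors (the driver below, and names such as candidate_extractors, are assumptions rather than the project's actual code):

def greedy_feature_selection(config, candidate_extractors, features_filename_prefix):
    # greedily add whichever extractor most improves the cross-validated micro F1
    selected, best_f1 = [], -1.0
    remaining = list(candidate_extractors)
    improved = True
    while improved and remaining:
        improved = False
        scored = [(evaluate_feature_set(config, selected, ext, features_filename_prefix), ext)
                  for ext in remaining]
        f1, best_ext = max(scored, key=lambda pair: pair[0])
        if f1 > best_f1:
            best_f1, improved = f1, True
            selected.append(best_ext)
            remaining.remove(best_ext)
    return selected, best_f1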
Exemple #30
0
        modified_sentence = filter2min_word_freq(sentence)
        modified_sentence = filterout_punctuation(modified_sentence)
        if len(modified_sentence) == 0:
            continue

        bookend(modified_sentence)
        new_windows = split_into_windows(modified_sentence,
                                         window_size=WINDOW_SIZE)
        assert_windows_correct(new_windows)

        # tagged words
        sentences.append(sentence)
        # words only
        wds, tags = zip(*sentence)
        tokenized_sentences.append(wds)
        ix2sentTags[i] = set(flatten(tags))

        ix2windows[i] = new_windows
        ix2sents[i] = modified_sentence
        i += 1
""" Assert tags set correctly """
print("Windows loaded correctly!\n")

print("\n".join(sorted(removed)))
""" Extract Features """
from WindowFeatures import extract_positional_word_features, extract_word_features
from NgramGenerator import compute_ngrams


def extract_positional_bigram_features(window, mid_ix, feature_val=1):
    bi_grams = compute_ngrams(window, max_len=2, min_len=2)
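The function above is cut short. A self-contained sketch of the usual idea behind positional bigram features - one feature per bigram, keyed by the bigram's offset from the window's mid-point - where the feature-naming scheme is an assumption, not the project's compute_ngrams-based implementation:

def extract_positional_bigram_features_sketch(window, mid_ix, feature_val=1):
    # illustrative stand-in: key each bigram by its offset from the middle of the window
    feats = {}
    for start in range(len(window) - 1):
        bigram = window[start:start + 2]
        offset = start - mid_ix
        feats["bigram:%i:%s" % (offset, "_".join(bigram))] = feature_val
    return feats

# extract_positional_bigram_features_sketch(["the", "fat", "frog"], mid_ix=1)
#   -> {"bigram:-1:the_fat": 1, "bigram:0:fat_frog": 1}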
from passage.layers import Embedding, GatedRecurrent, Dense

from passage.models import RNN
from passage.utils import save, load
from passage.preprocessing import Tokenizer
from passage.theano_utils import intX
from passage.iterators import SortedPadded
import theano.tensor as T
from IterableFP import flatten

#tokenizer = Tokenizer()
#SH: doesn't work for some reason
#train_tokens = tokenizer.fit_transform(["The big fat frog jumped out of the pond","frogs are amphibians", "toads are similar to frogs"])

train_tokens = [[1, 2, 4, 3, 6], [1, 2, 3], [3, 1, 2, 4, 3]]
num_feats = len(set(flatten(train_tokens)))


def get_labels(id):
    # toy labelling scheme: token id 3 maps to [1, 0], every other id maps to [0, 1]
    if id == 3:
        return [1, 0]
    else:
        return [0, 1]


seq_labels = [[get_labels(t) for t in l] for l in train_tokens]

layers = [
    Embedding(size=128, n_features=num_feats),
    GatedRecurrent(size=128, seq_output=True),
    Dense(size=num_feats, activation='softmax')
]
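The example stops at the layer list. A hedged sketch of how a passage model built from these layers is usually wrapped and trained - the cost string, the save path and the epoch/batch settings are assumptions (the fit call mirrors the model.fit(..., n_epochs=..., batch_size=...) usage earlier on this page), and a sequence-output model like this may also need the SortedPadded iterator configured:

# assumption: illustrative wiring only, not the original script's training code
model = RNN(layers=layers, cost='CategoricalCrossEntropy')
model.fit(train_tokens, seq_labels, n_epochs=10, batch_size=2)
save(model, 'flatten_rnn_demo.pkl')  # hypothetical output path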

feat_config = dict(list(config.items()) + [("extractors", extractors)])
""" LOAD DATA """
mem_process_essays = memoize_to_disk(
    filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(
    filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")
""" DEFINE TAGS """
_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
regular_tags = list(set((t for t in flatten(lst_all_tags) if t[0].isdigit())))

CAUSE_TAGS = ["Causer", "Result", "explicit"]
CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]  # + ["explicit"]
""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags  #+ CAUSE_TAGS
wd_test_tags = regular_tags  #+ CAUSE_TAGS

# tags from tagging model used to train the stacked model
sent_input_feat_tags = wd_train_tags
# find interactions between these predicted tags from the word tagger to feed to the sentence tagger
sent_input_interaction_tags = wd_train_tags
# tags to train (as output) for the sentence based classifier
sent_output_train_test_tags = list(
    set(regular_tags + CAUSE_TAGS + CAUSAL_REL_TAGS))
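A hedged sketch of what "interactions" between the word tagger's predicted tags might look like as sentence-level features - the representation below (max predicted probability per tag plus pairwise products) is an assumption, not the project's actual feature code:

from itertools import combinations

def sentence_interaction_features_sketch(tag2word_probs, interaction_tags):
    # tag2word_probs: dict of tag -> per-word predicted probabilities for one sentence (hypothetical input)
    feats = {}
    for tag in interaction_tags:
        feats["max_" + tag] = max(tag2word_probs.get(tag, [0.0]) or [0.0])
    for a, b in combinations(sorted(interaction_tags), 2):
        feats["inter_%s_%s" % (a, b)] = feats["max_" + a] * feats["max_" + b]
    return feats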
# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """
gw_codes = GWConceptCodes()
tag_freq = get_tag_freq(tagged_essays)
freq_tags = list(set((tag for tag, freq in tag_freq.items()
                      if freq >= MIN_TAG_FREQ and gw_codes.is_valid_code(tag))))

non_causal  = [t for t in freq_tags if "->" not in t]
only_causal = [t for t in freq_tags if "->" in t]

_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
regular_tags = list(set((t for t in flatten(lst_all_tags)
                         if "->" not in t and ":" not in t
                         and gw_codes.is_valid_code(t))))

CAUSE_TAGS = ["Causer", "Result", "explicit"]
CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]# + ["explicit"]

"""  """
#
wd_train_tags = list(set(regular_tags + only_causal + CAUSE_TAGS + CAUSAL_REL_TAGS))
wd_test_tags  = wd_train_tags

# tags to evaluate against

""" CLASSIFIERS """
""" Log Reg + Log Reg is best!!! """
def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):

    feat_extractors = existing_extractors + [new_extractor]
    feat_config = dict(list(config.items()) + [("extractors", feat_extractors)])
    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    #mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    #essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)
    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(set((t for t in flatten(lst_all_tags) if t[0].isdigit())))
    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags
    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects. The sentences are lists
        # of featureextractortransformer.Word objects
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
        wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
        wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)
        """ TRAIN Tagger """
        tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, lambda: LogisticRegression(),
                                                        wd_train_tags, verbose=False)
        """ TEST Tagger """
        td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
        vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)
        return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag

    #results = Parallel(n_jobs=CV_FOLDS)(
    #        delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #            for (essays_TD, essays_VD) in folds)

    results = [train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
               for (essays_TD, essays_VD) in folds]

    for result in results:
        td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

    # print results for each code
    """ Persist Results to Mongo DB """
    SUFFIX = "_FEAT_SELECTION"
    CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX
    parameters = dict(config)
    parameters["extractors"] = [fn.__name__ for fn in feat_extractors]
    parameters["min_feat_freq"] = MIN_FEAT_FREQ

    wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag,
                                               cv_wd_td_predictions_by_tag, parameters, wd_algo)
    wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag,
                                               cv_wd_vd_predictions_by_tag, parameters, wd_algo)

    avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    return avg_f1
# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """
gw_codes = GWConceptCodes()
tag_freq = get_tag_freq(tagged_essays)
freq_tags = list(set((tag for tag, freq in tag_freq.items()
                      if freq >= MIN_TAG_FREQ and gw_codes.is_valid_code(tag))))

non_causal  = [t for t in freq_tags if "->" not in t]
only_causal = [t for t in freq_tags if "->" in t]

_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
regular_tags = list(set((t for t in flatten(lst_all_tags)
                         if "->" not in t and ":" not in t
                         and gw_codes.is_valid_code(t))))

CAUSE_TAGS = ["Causer", "Result", "explicit"]
CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]# + ["explicit"]

"""  """
#
wd_train_tags = list(set(regular_tags + CAUSE_TAGS))
wd_test_tags  = wd_train_tags

# tags from tagging model used to train the stacked model
sent_input_feat_tags = wd_train_tags
# find interactions between these predicted tags from the word tagger to feed to the sentence tagger
sent_input_interaction_tags = list(set(wd_train_tags))