def load_bratt_essays(directory=None,
                      include_vague=True,
                      include_normal=True,
                      load_annotations=True):
    """Load BRAT-annotated essays from a directory into ``Essay`` objects.

    Parameters
    ----------
    directory : str, optional
        Folder containing the ``.ann`` / ``.txt`` file pairs. Defaults to the
        ``EBA1415_Merged`` folder under the configured data directory.
    include_vague, include_normal : bool
        Passed through to the ``Essay`` constructor (semantics defined there).
    load_annotations : bool
        If True, discover essays via ``.ann`` files; otherwise via ``.txt``.

    Returns
    -------
    list of Essay
        Successfully parsed essays. Essays with more than 60 sentences are
        warned about and *excluded*; files whose text is a "no essay" /
        "no text" placeholder are skipped; parse errors are logged and the
        offending file is skipped (best-effort batch load).
    """
    import warnings

    bratt_root_folder = directory
    if not bratt_root_folder:
        settings = Settings.Settings()
        bratt_root_folder = settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"

    # Raw strings so the regex escapes are explicit (avoids the deprecated
    # "\." inside a plain string literal).
    pattern = r"\.ann$" if load_annotations else r"\.txt$"
    files = find_files(bratt_root_folder, pattern, remove_empty=True)
    print(len(files), "files found")

    essays = []
    for f in files:
        try:
            txt_file = f[:-4] + ".txt"
            with open(txt_file) as fin:
                contents = fin.read().strip().lower()
            # Placeholder files start with a "no essay" / "no text" marker.
            if "no essay" in contents[:20] or "no text" in contents[:20]:
                print("Skipping %s file as .txt file is %s" % (f, contents))
                continue

            essay = Essay(f,
                          include_vague=include_vague,
                          include_normal=include_normal,
                          load_annotations=load_annotations)
            if len(essay.tagged_sentences) > 60:
                # NOTE(review): the threshold tests tagged_sentences but the
                # reported count uses sentence_tags — confirm these agree.
                msg = "Too many sentences (%s) in essay %s" % (
                    str(len(essay.sentence_tags)), essay.file_name)
                warnings.warn(msg)
                print(msg)
            else:
                essays.append(essay)
        except Exception:
            # Best-effort batch load: log the traceback and continue with the
            # remaining files rather than aborting the whole run.
            from traceback import format_exc
            print(format_exc())
            print("Error processing file: ", f)

    print("%s essays processed" % str(len(essays)))
    return essays
def get_essays(folder, partition):
    """Load the pickled essays for one data partition.

    Parameters
    ----------
    folder : str
        Directory searched (via ``find_files``) for ``.dill`` pickle files.
    partition : str
        ``"Training"`` selects the file containing ``"train"`` in its name;
        any other value selects the file containing ``"test"``.

    Returns
    -------
    The unpickled essay collection.

    Raises
    ------
    AssertionError
        If the partition does not match exactly one ``.dill`` file.
    """
    # BUG FIX: escape the dot — ".*.dill" matched any character before
    # "dill" (e.g. "foo_dill"), not just the ".dill" extension.
    essay_files = find_files(folder, regex=r".*\.dill")
    if partition == "Training":
        essay_files = [e for e in essay_files if "train" in e]
    else:
        essay_files = [e for e in essay_files if "test" in e]
    assert len(essay_files) == 1, \
        "Expected exactly one %s .dill file, found %d" % (partition, len(essay_files))
    print("Found file", essay_files[0])
    # NOTE(review): dill.load executes arbitrary code from the pickle —
    # only load files produced by this project.
    with open(essay_files[0], "rb") as f:
        loaded_essays = dill.load(f)
    return loaded_essays
def load_bratt_essays(directory = None, include_vague = True, include_normal = True, load_annotations = True):
    """Discover BRAT essay files under a folder and parse them into Essay objects.

    Essays whose text is a placeholder, that fail to parse, or that exceed 60
    sentences are skipped; everything else is returned as a list.
    """
    import warnings

    root = directory
    if not root:
        root = Settings.Settings().data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"

    # Annotation mode discovers .ann files; plain mode discovers .txt files.
    suffix_regex = "\.ann$" if load_annotations else "\.txt$"
    files = find_files(root, suffix_regex, remove_empty=True)
    print(len(files), "files found")

    loaded = []
    for fname in files:
        try:
            with open(fname[:-4] + ".txt") as handle:
                body = handle.read().strip().lower()
                # Skip placeholder files that contain no real essay text.
                if "no essay" in body[:20] or "no text" in body[0:20]:
                    print("Skipping %s file as .txt file is %s'" % (fname, body))
                    continue

            parsed = Essay(fname, include_vague=include_vague, include_normal=include_normal, load_annotations=load_annotations)
            if len(parsed.tagged_sentences) > 60:
                warnings.warn("Too many sentences (%s) in essay %s" % (str(len(parsed.sentence_tags)), parsed.file_name))
                print("Too many sentences (%s) in essay %s" % (str(len(parsed.sentence_tags)), parsed.file_name))
            else:
                loaded.append(parsed)
        except Exception:
            # Log and move on — a single bad file should not stop the batch.
            from traceback import format_exc
            print(format_exc())
            print("Error processing file: ", fname)

    print("%s essays processed" % str(len(loaded)))
    return loaded
def move(from_dir, to_dir, regex, ignore_empty = True, replace_existing = False):
    lst_files = find_files(from_dir, regex, remove_empty = not ignore_empty)
    cnt = 0
    for f in lst_files:
        dfile = os.path.join(to_dir, os.path.basename(f))
        if os.path.exists(dfile):
            if not replace_existing:
                print "%s already exists" % dfile
            else:
                os.remove(dfile)
        else:

            shutil.copyfile(f, dfile)
            cnt += 1
    print "Moved %s files" % str(cnt)
def replace_if_newer(from_dir, to_dir, regex, ignore_empty=True):
    lst_files = find_files(from_dir, regex, remove_empty=ignore_empty)
    cnt = 0
    for from_file in lst_files:
        to_file = os.path.join(to_dir, os.path.basename(from_file))
        if os.path.exists(to_file):
            from_time = time.ctime(os.path.getmtime(from_file))
            to_time = time.ctime(os.path.getmtime(to_file))

            if from_time >= to_time:
                os.remove(to_file)
                shutil.copyfile(from_file, to_file)
            else:
                print "Passing on %s as destination file is newer" % from_file
        else:

            shutil.copyfile(from_file, to_file)
            cnt += 1
    print "Moved %s files" % str(cnt)
# Example 6
# --- Script fragment: load (disk-memoised) pre-processed essays and the
# --- matching Stanford co-reference output for one data partition.
# NOTE(review): root_folder, partition, get_config, memoize_to_disk,
# load_process_essays and find_files are defined elsewhere in this file.
target_folder = root_folder + partition + "/"
processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_"

config = get_config(target_folder)
# override this so we don't replace INFREQUENT words
#config["min_df"] = 0

# Cache the (slow) essay pre-processing to disk, keyed by the pickle prefix.
mem_process_essays = memoize_to_disk(
    filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
print("{0} essays loaded".format(len(tagged_essays)))

coref_root = root_folder + "CoReference/"
coref_folder = coref_root + partition

# One ".tagged" co-reference file is expected per essay (asserted below).
coref_files = find_files(coref_folder, ".*\.tagged")
print("{0} co-ref tagged files loaded".format(len(coref_files)))
assert len(coref_files) == len(tagged_essays)


def parse_stanfordnlp_tagged_essays(coref_files):
    """Parse StanfordNLP co-reference ".tagged" files, keyed by essay.

    NOTE(review): this definition is TRUNCATED in this chunk — the body of the
    inner ``for line in lines`` loop (and any return statement) is missing, so
    the full behaviour and return value cannot be confirmed from here.
    """
    DELIM = "->"       # field delimiter within a tagged line -- TODO confirm usage
    DELIM_TAG = "|||"  # delimiter between multiple tags      -- TODO confirm usage

    # Accumulates results per essay file -- presumably fname -> tagged lines.
    essay2tagged = {}
    for fname in coref_files:
        with open(fname) as f:
            lines = f.readlines()

        tagged_lines = []
        for line in lines:
# --- Script fragment (near-duplicate of the one above): load essays plus
# --- co-reference output, then build windowed feature extractors.
# NOTE(review): root_folder, partition, get_config, memoize_to_disk,
# load_process_essays, find_files and the fact_extract_* factories are
# defined elsewhere in this file.
target_folder = root_folder + partition + "/"
processed_essay_filename_prefix =  root_folder + "Pickled/essays_proc_pickled_"

config = get_config(target_folder)

# LOAD ESSAYS
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
# map parsed essays to essay name

print("{0} essays loaded".format(len(tagged_essays)))

# LOAD COREF RESULTS
coref_root = root_folder + "CoReference/"
coref_folder = coref_root + partition
coref_files = find_files(coref_folder, ".*\.tagged")
print("{0} co-ref tagged files loaded".format(len(coref_files)))
assert len(coref_files) == len(tagged_essays)

config["window_size"] = 9
# Tokens either side of the focus word: (window_size - 1) / 2 = 4.
offset = int((config["window_size"] - 1) / 2)

# Feature-extractor factories ("biigram"/"triigram" spellings are original).
unigram_window_stemmed  = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed  = fact_extract_ngram_features_stemmed(offset, 2)
triigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 3)
unigram_bow_window      = fact_extract_bow_ngram_features(offset, 1)

#optimal CB feature set
# NOTE(review): this list is TRUNCATED in this chunk — further entries (and
# the closing bracket) are outside the visible range.
extractors = [
    unigram_window_stemmed,
    biigram_window_stemmed,