def load_bratt_essays(directory=None, include_vague=True, include_normal=True, load_annotations=True):
    """Load Brat-annotated essays from *directory* (or the configured default).

    Parameters
    ----------
    directory : str, optional
        Root folder containing the paired .ann/.txt files.  When falsy, the
        path is built from Settings (the EBA1415_Merged dataset).
    include_vague, include_normal : bool
        Passed straight through to the Essay constructor to control which
        tags are kept.
    load_annotations : bool
        When True, essays are discovered via their .ann files; otherwise via
        their .txt files.

    Returns
    -------
    list
        Essay objects that parsed successfully.  Placeholder files whose text
        starts with "no essay"/"no text" are skipped, as are essays with more
        than 60 sentences.  Per-file errors are logged and swallowed so one
        bad file does not abort the whole load.
    """
    import warnings

    bratt_root_folder = directory
    if not bratt_root_folder:
        settings = Settings.Settings()
        bratt_root_folder = settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"

    # Raw strings so "\." is a literal dot in the regex (the original plain
    # strings trigger an invalid-escape-sequence warning on modern Python).
    if load_annotations:
        files = find_files(bratt_root_folder, r"\.ann$", remove_empty=True)
    else:
        files = find_files(bratt_root_folder, r"\.txt$", remove_empty=True)
    print(len(files), "files found")

    essays = []
    for f in files:
        try:
            # The companion .txt file (same basename) holds the raw essay text.
            txt_file = f[:-4] + ".txt"
            with open(txt_file) as fin:
                contents = fin.read().strip().lower()
            if "no essay" in contents[:20] or "no text" in contents[0:20]:
                print("Skipping %s file as .txt file is %s'" % (f, contents))
                continue

            essay = Essay(f, include_vague=include_vague, include_normal=include_normal,
                          load_annotations=load_annotations)
            # Guard against pathological documents.
            # NOTE(review): the condition reads essay.tagged_sentences while the
            # messages read essay.sentence_tags — presumably both exist on the
            # Essay class; confirm they refer to the same count.
            if len(essay.tagged_sentences) > 60:
                warnings.warn("Too many sentences (%s) in essay %s" % (str(len(essay.sentence_tags)), essay.file_name))
                print("Too many sentences (%s) in essay %s" % (str(len(essay.sentence_tags)), essay.file_name))
            else:
                essays.append(essay)
        except Exception:
            # Best effort: report the traceback and continue with the next file.
            from traceback import format_exc
            print(format_exc())
            print("Error processing file: ", f)

    print("%s essays processed" % str(len(essays)))
    return essays
def get_essays(folder, partition):
    """Load the pickled (dill) essay collection for one partition.

    Parameters
    ----------
    folder : str
        Directory searched (via find_files) for .dill files.
    partition : str
        "Training" selects the single file whose name contains "train";
        any other value selects the one containing "test".

    Returns
    -------
    The unpickled essay collection.  Exactly one matching file must exist.
    """
    # Raw string with an escaped dot: match a literal ".dill" suffix.  The
    # original ".*.dill" let the dot match any character, so e.g. "Xdill"
    # would also have matched.
    essay_files = find_files(folder, regex=r".*\.dill")
    if partition == "Training":
        essay_files = [e for e in essay_files if "train" in e]
    else:
        essay_files = [e for e in essay_files if "test" in e]
    assert len(essay_files) == 1
    print("Found file", essay_files[0])
    with open(essay_files[0], "rb") as f:
        loaded_essays = dill.load(f)
    return loaded_essays
def load_bratt_essays(directory=None, include_vague=True, include_normal=True, load_annotations=True):
    """Load Brat-annotated essays from *directory* (or the configured default).

    NOTE(review): this is a verbatim duplicate of an earlier definition in
    this file; only the later definition is effective at import time.
    Consider deleting one copy.

    Parameters
    ----------
    directory : str, optional
        Root folder containing the paired .ann/.txt files.  When falsy, the
        path is built from Settings (the EBA1415_Merged dataset).
    include_vague, include_normal : bool
        Passed straight through to the Essay constructor.
    load_annotations : bool
        When True, essays are discovered via their .ann files; otherwise via
        their .txt files.

    Returns
    -------
    list
        Essay objects that parsed successfully.  Placeholder files and essays
        with more than 60 sentences are skipped; per-file errors are logged
        and swallowed.
    """
    import warnings

    bratt_root_folder = directory
    if not bratt_root_folder:
        settings = Settings.Settings()
        bratt_root_folder = settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"

    # Raw strings so "\." is a literal dot (avoids the invalid-escape warning
    # on modern Python).
    if load_annotations:
        files = find_files(bratt_root_folder, r"\.ann$", remove_empty=True)
    else:
        files = find_files(bratt_root_folder, r"\.txt$", remove_empty=True)
    print(len(files), "files found")

    essays = []
    for f in files:
        try:
            # The companion .txt file (same basename) holds the raw essay text.
            txt_file = f[:-4] + ".txt"
            with open(txt_file) as fin:
                contents = fin.read().strip().lower()
            if "no essay" in contents[:20] or "no text" in contents[0:20]:
                print("Skipping %s file as .txt file is %s'" % (f, contents))
                continue

            essay = Essay(f, include_vague=include_vague, include_normal=include_normal,
                          load_annotations=load_annotations)
            # NOTE(review): condition uses tagged_sentences, messages use
            # sentence_tags — confirm both attributes exist and agree.
            if len(essay.tagged_sentences) > 60:
                warnings.warn("Too many sentences (%s) in essay %s" % (str(len(essay.sentence_tags)), essay.file_name))
                print("Too many sentences (%s) in essay %s" % (str(len(essay.sentence_tags)), essay.file_name))
            else:
                essays.append(essay)
        except Exception:
            # Best effort: report the traceback and continue with the next file.
            from traceback import format_exc
            print(format_exc())
            print("Error processing file: ", f)

    print("%s essays processed" % str(len(essays)))
    return essays
def move(from_dir, to_dir, regex, ignore_empty=True, replace_existing=False):
    """Copy files matching *regex* from *from_dir* into *to_dir*.

    NOTE(review): despite the name, source files are never deleted — this
    function copies rather than moves.

    Parameters
    ----------
    from_dir, to_dir : str
        Source and destination directories.
    regex : str
        Pattern handed to find_files to select source files.
    ignore_empty : bool
        NOTE(review): passed as remove_empty=not ignore_empty, which is the
        opposite polarity of replace_if_newer (remove_empty=ignore_empty) —
        confirm which one find_files actually expects.
    replace_existing : bool
        When True, an existing destination file is overwritten; otherwise it
        is left alone and a message is printed.
    """
    lst_files = find_files(from_dir, regex, remove_empty=not ignore_empty)
    cnt = 0
    for f in lst_files:
        dfile = os.path.join(to_dir, os.path.basename(f))
        if os.path.exists(dfile):
            if not replace_existing:
                print("%s already exists" % dfile)
                continue
            os.remove(dfile)
        # Bug fix: the original only copied (and counted) when the destination
        # did NOT exist, so replace_existing=True deleted the old file without
        # ever writing the replacement.  The copy now always runs here.
        shutil.copyfile(f, dfile)
        cnt += 1
    print("Moved %s files" % str(cnt))
def replace_if_newer(from_dir, to_dir, regex, ignore_empty=True):
    """Copy files matching *regex* from *from_dir* into *to_dir*, overwriting
    a destination file only when the source is at least as new.

    Parameters
    ----------
    from_dir, to_dir : str
        Source and destination directories.
    regex : str
        Pattern handed to find_files to select source files.
    ignore_empty : bool
        Forwarded to find_files as remove_empty.
    """
    lst_files = find_files(from_dir, regex, remove_empty=ignore_empty)
    cnt = 0
    for from_file in lst_files:
        to_file = os.path.join(to_dir, os.path.basename(from_file))
        if os.path.exists(to_file):
            # Bug fix: compare raw modification times as floats.  The original
            # compared time.ctime() strings ("Mon Jan  1 ..."), which do not
            # sort chronologically, so the newer-file test was unreliable.
            from_time = os.path.getmtime(from_file)
            to_time = os.path.getmtime(to_file)
            if from_time >= to_time:
                os.remove(to_file)
                shutil.copyfile(from_file, to_file)
                cnt += 1  # bug fix: replaced files are now counted as well
            else:
                print("Passing on %s as destination file is newer" % from_file)
        else:
            shutil.copyfile(from_file, to_file)
            cnt += 1
    print("Moved %s files" % str(cnt))
target_folder = root_folder + partition + "/" processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_" config = get_config(target_folder) # override this so we don't replace INFREQUENT words #config["min_df"] = 0 mem_process_essays = memoize_to_disk( filename_prefix=processed_essay_filename_prefix)(load_process_essays) tagged_essays = mem_process_essays(**config) print("{0} essays loaded".format(len(tagged_essays))) coref_root = root_folder + "CoReference/" coref_folder = coref_root + partition coref_files = find_files(coref_folder, ".*\.tagged") print("{0} co-ref tagged files loaded".format(len(coref_files))) assert len(coref_files) == len(tagged_essays) def parse_stanfordnlp_tagged_essays(coref_files): DELIM = "->" DELIM_TAG = "|||" essay2tagged = {} for fname in coref_files: with open(fname) as f: lines = f.readlines() tagged_lines = [] for line in lines:
target_folder = root_folder + partition + "/" processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_" config = get_config(target_folder) # LOAD ESSAYS mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays) tagged_essays = mem_process_essays(**config) # map parsed essays to essay name print("{0} essays loaded".format(len(tagged_essays))) # LOAD COREF RESULTS coref_root = root_folder + "CoReference/" coref_folder = coref_root + partition coref_files = find_files(coref_folder, ".*\.tagged") print("{0} co-ref tagged files loaded".format(len(coref_files))) assert len(coref_files) == len(tagged_essays) config["window_size"] = 9 offset = int((config["window_size"] - 1) / 2) unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset) biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2) triigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 3) unigram_bow_window = fact_extract_bow_ngram_features(offset, 1) #optimal CB feature set extractors = [ unigram_window_stemmed, biigram_window_stemmed,