LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
folder = root_folder + "Training/"
processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_"
features_filename_prefix = root_folder + "Pickled/feats_pickled_"

config = get_config(folder)

# --- Load essays (result is memoized to disk under the "Pickled/" prefix) ---
mem_process_essays = memoize_to_disk(
    filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")
# --- End load essays ---


def evaluate_window_size(config, window_size, features_filename_prefix):
    """Extract windowed features for one window size and define the tag sets.

    Args:
        config: settings dict passed as keyword arguments to the essay loader
            and the feature extractor. It is updated IN PLACE with
            ``window_size``, so the caller's dict changes too.
        window_size: total window width; the per-side offset is
            ``(window_size - 1) // 2``.
        features_filename_prefix: filename prefix used by ``memoize_to_disk``
            for the extracted features.

    NOTE(review): no return statement is visible in this section of the
    file -- confirm whether evaluation/return logic lives elsewhere.
    """
    config["window_size"] = window_size

    # --- Feature extraction ---
    # Floor division keeps the offset an int on Python 2 and Python 3 alike
    # (plain "/" yields a float under Python 3).
    offset = (config["window_size"] - 1) // 2

    # The next three extractors are built but not part of `extractors` below.
    unigram_window = fact_extract_positional_word_features(offset)
    bigram_window = fact_extract_positional_ngram_features(offset, 2)
    trigram_window = fact_extract_positional_ngram_features(offset, 3)

    unigram_bow_window = fact_extract_bow_ngram_features(offset, 1)
    unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
    biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)
    trigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 3)

    extractors = [
        unigram_bow_window,
        unigram_window_stemmed,
        biigram_window_stemmed,
        trigram_window_stemmed,
        extract_brown_cluster,
        extract_dependency_relation,
    ]

    # Copy-then-assign instead of `config.items() + [...]`, which raises
    # TypeError on Python 3 where items() is a view rather than a list.
    feat_config = dict(config)
    feat_config["extractors"] = extractors

    # --- Load data ---
    mem_process_essays = memoize_to_disk(
        filename_prefix=processed_essay_filename_prefix)(load_process_essays)
    tagged_essays = mem_process_essays(**config)
    logger.info("Essays loaded")

    # most params below exist ONLY for the purposes of the hashing to and from disk
    mem_extract_features = memoize_to_disk(
        filename_prefix=features_filename_prefix)(extract_features)
    essay_feats = mem_extract_features(tagged_essays, **feat_config)
    logger.info("Features loaded")

    # --- Define tags ---
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    # Keep only the tags whose first character is a digit.
    regular_tags = list(set(t for t in flatten(lst_all_tags) if t[0].isdigit()))

    # works best with all the pair-wise causal relation codes
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags
MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
STEM = True
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
folder = root_folder + "Training/"
processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_"

config = get_config(folder)
print(config)

# Load the essays through the disk-backed memoizer.
mem_process_essays = memoize_to_disk(
    filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")
len(tagged_essays)  # bare expression; has no effect when run as a script

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------

tag_freq = get_tag_freq(tagged_essays)
# Keep tags whose first character is a digit.
# NOTE(review): the frequency test is `freq >= 0` and MIN_TAG_FREQ is not used
# here -- confirm whether filtering by MIN_TAG_FREQ was intended.
regular_tags = list(set(
    tag for tag, freq in tag_freq.items()
    if freq >= 0 and tag[0].isdigit()
))

# --- Feature extraction ---
config["window_size"] = 11
# Floor division keeps the offset an int on Python 2 and Python 3 alike
# (plain "/" yields 5.0 rather than 5 under Python 3).
offset = (config["window_size"] - 1) // 2

cv_wd_td_ys_by_tag = defaultdict(list)
cv_wd_td_predictions_by_tag = defaultdict(list)
# Stemmed bigram/trigram extractors; `offset` is bound before this section.
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)
trigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 3)

# modified to use new optimal feats
extractors = [
    unigram_bow_window,
    unigram_window_stemmed,
    biigram_window_stemmed,
    # trigram_window_stemmed,  (disabled)
    extract_brown_cluster,
    # extract_dependency_relation  (disabled)
]

# Copy-then-assign instead of `config.items() + [...]`, which raises
# TypeError on Python 3 where items() is a view rather than a list.
feat_config = dict(config)
feat_config["extractors"] = extractors

# --- Load data ---
mem_process_essays = memoize_to_disk(
    filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")

# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(
    filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")

# --- Define tags ---
_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
# Keep only the tags whose first character is a digit.
regular_tags = list(set(t for t in flatten(lst_all_tags) if t[0].isdigit()))

CAUSE_TAGS = ["Causer", "Result", "explicit"]
CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]  # + ["explicit"]

# works best with all the pair-wise causal relation codes
wd_train_tags = regular_tags  # + CAUSE_TAGS