# Paths / configuration for the Coral Bleaching causal-relation experiments.
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"
# pickled training data (not read in this chunk; presumably consumed elsewhere)
training_pickled = settings.data_directory + "CoralBleaching/Thesis_Dataset/training.pl"
# NOTE: These predictions are generated from the "./notebooks/SEARN/Keras - Train Tagger and Save CV Predictions For Word Tags.ipynb" notebook
# used as inputs to parsing model
rnn_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"

config = get_config(training_folder)
processor = ResultsProcessor(dbname="metrics_causal")

# Get Test Data In Order to Get Test CRELS
# load the test essays to make sure we compute metrics over the test CR labels
test_config = get_config(test_folder)
tagged_essays_test = load_process_essays(**test_config)
########################################################

# Word-level tag predictions from the Bi-LSTM tagger (see NOTE on the notebook above)
fname = rnn_predictions_folder + "essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
with open(fname, "rb") as f:
    pred_tagged_essays = dill.load(f)

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" % len(pred_tagged_essays))  # should be 902

# In[7]:

# Tag-name constants for the components of a causal relation.
# BUG FIX: CAUSER was "******" (a redaction/corruption artifact); restored to
# "Causer", matching the capitalization convention of RESULT = "Result".
CAUSER = "Causer"
RESULT = "Result"
EXPLICIT = "explicit"
# construct unique key using settings for pickling
settings = Settings.Settings()
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
training_folder = root_folder + "Training/"
test_folder = root_folder + "Test/"
models_folder = settings.data_directory + "SkinCancer/models/CRF"

""" Load Configs """
train_config = get_config(training_folder)
train_config["window_size"] = 9
# BUG FIX: use floor division so offset stays an int on Python 3
# (plain / returns a float there; // is identical to Py2's int /).
offset = (train_config["window_size"] - 1) // 2

# test config is a copy of the train config pointing at the Test folder
test_config = dict(train_config.items())
test_config["folder"] = test_folder

""" Load Data """
train_tagged_essays = load_process_essays(**train_config)
test_tagged_essays = load_process_essays(**test_config)
logger.info("Essays loaded - Train: %i Test %i" % (len(train_tagged_essays), len(test_tagged_essays)))

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------

""" Define Tags """
tag_freq = get_tag_freq(train_tagged_essays)
# Keep only concept-code tags (first char is a digit). NOTE(review): freq >= 0
# keeps every observed tag -- presumably a placeholder for a real frequency
# cut-off; confirm the intended threshold.
regular_tags = list(set((tag for tag, freq in tag_freq.items() if freq >= 0 and tag[0].isdigit())))

""" FEATURE EXTRACTION """
unigram_window_stemmed = fact_extract_positional_word_features(offset, True)
# not hashed as don't affect persistence of feature processing
config = get_config(data)

""" FEATURE EXTRACTION """
# BUG FIX (Python 3 compatibility; behavior identical under Python 2):
#  - // for integer division (plain / yields a float in Py3)
#  - dict(list(...) + [...]) since dict_items no longer supports +
#  - pickle files opened in binary mode (was "w+" text mode, which breaks
#    pickle.dump on Py3)
offset = (config["window_size"] - 1) // 2
unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)
extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(list(config.items()) + [("extractors", extractors)])

""" LOAD DATA """
tagged_essays = load_process_essays(**config)
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk

# Collapse all variants of a tag into one tag
feature_extractor = FeatureExtractorTransformer(extractors)
essay_feats = feature_extractor.transform(tagged_essays)
logger.info("Features loaded")

# Persist both the processed essays and their extracted features.
with open(serialized_essays, "wb") as f_essays:
    pickle.dump(tagged_essays, f_essays)
with open(serialized_features, "wb") as f_feats:
    pickle.dump(essay_feats, f_feats)
# Paths / configuration for the Coral Bleaching causal-relation parsing experiments.
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"
# pickled training data (not read in this chunk; presumably consumed elsewhere)
training_pickled = settings.data_directory + "CoralBleaching/Thesis_Dataset/training.pl"
# NOTE: These predictions are generated from the "./notebooks/SEARN/Keras - Train Tagger and Save CV Predictions For Word Tags.ipynb" notebook
# used as inputs to parsing model
rnn_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"

config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal")

# Get Test Data In Order to Get Test CRELS
# load the test essays to make sure we compute metrics over the test CR labels
test_config = get_config(test_folder)
tagged_essays_test = load_process_essays(**test_config)
########################################################

# word-level tag predictions produced by the Bi-LSTM tagger notebook (NOTE above)
fname = rnn_predictions_folder + "essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
with open(fname, "rb") as f:
    pred_tagged_essays = dill.load(f)

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" % len(pred_tagged_essays))  # should be 902

# causal-relation tags drawn from both the predicted train essays and the test essays
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays, tag_essays_test=tagged_essays_test)
cv_folds = cross_validation(pred_tagged_essays, CV_FOLDS)  # type: List[Tuple[Any,Any]]

# NOTE(review): signature continues beyond this chunk -- left untouched.
def evaluate_features(
        collection_prefix: str,
        folds: List[Tuple[Any, Any]],
# optimal SC feature set
extractors = [unigram_bow_window,
              unigram_window_stemmed,
              biigram_window_stemmed,
              #trigram_window_stemmed,
              extract_brown_cluster,
              #extract_dependency_relation
              ]

# For mongo
# BUG FIX (Python 3 compatibility; identical output under Python 2):
#  - fn.__name__ instead of the Py2-only fn.func_name
#  - list(map(...)) so the names are materialized (map is lazy in Py3)
#  - dict(list(...) + [...]) since dict_items no longer supports +
extractor_names = list(map(lambda fn: fn.__name__, extractors))
print("Extractors\n\t" + "\n\t".join(extractor_names))

feat_config = dict(list(train_config.items()) + [("extractors", extractors)])

""" LOAD DATA """
train_tagged_essays = load_process_essays(**train_config)
# test config is a copy of the train config pointing at the Test folder
test_config = dict(train_config.items())
test_config["folder"] = test_folder
test_tagged_essays = load_process_essays(**test_config)
logger.info("Essays loaded- Train: %i Test %i" % (len(train_tagged_essays), len(test_tagged_essays)))
# most params below exist ONLY for the purposes of the hashing to and from disk

train_essay_feats = extract_features(train_tagged_essays, **feat_config)
test_essay_feats = extract_features(test_tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """
_, lst_all_tags = flatten_to_wordlevel_feat_tags(train_essay_feats)
# keep only concept-code tags (those whose first character is a digit)
all_regular_tags = list((t for t in flatten(lst_all_tags) if t[0].isdigit()))
level=logging.INFO) logger = logging.getLogger() # not hashed as don't affect persistence of feature processing config = get_config(data) """ FEATURE EXTRACTION """ offset = (config["window_size"] - 1) / 2 unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset) biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2) extractors = [unigram_window_stemmed, biigram_window_stemmed] feat_config = dict(config.items() + [("extractors", extractors)]) """ LOAD DATA """ tagged_essays = load_process_essays(**config) logger.info("Essays loaded") # most params below exist ONLY for the purposes of the hashing to and from disk # Collapse all variants of a tag into one tag feature_extractor = FeatureExtractorTransformer(extractors) essay_feats = feature_extractor.transform(tagged_essays) logger.info("Features loaded") with open(serialized_essays, "w+") as f_essays: pickle.dump(tagged_essays, f_essays) with open(serialized_features, "w+") as f_feats: pickle.dump(essay_feats, f_feats)