settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training/"
test_folder = root_folder + "Test/"
training_pickled = root_folder + "training.pl"
# NOTE: These predictions are generated from the "./notebooks/SEARN/Keras - Train Tagger and Save CV Predictions For Word Tags.ipynb" notebook
# used as inputs to parsing model
rnn_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"

config = get_config(training_folder)
processor = ResultsProcessor(dbname="metrics_causal")

# Get Test Data In Order to Get Test CRELS
# load the test essays to make sure we compute metrics over the test CR labels
test_config = get_config(test_folder)
tagged_essays_test = load_process_essays(**test_config)
########################################################

fname = rnn_predictions_folder + "essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
with open(fname, "rb") as f:
    pred_tagged_essays = dill.load(f)

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" %
            len(pred_tagged_essays))  # should be 902

# In[7]:

CAUSER = "******"
RESULT = "Result"
EXPLICIT = "explicit"
# construct unique key using settings for pickling
settings = Settings()
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
training_folder = root_folder + "Training/"
test_folder = root_folder + "Test/"

models_folder = settings.data_directory + "SkinCancer/models/CRF"
""" Load Configs """
train_config = get_config(training_folder)
train_config["window_size"] = 9
offset = (train_config["window_size"] - 1) // 2  # integer division: the original Python 2 "/" would yield a float in Python 3
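# Illustration of the window arithmetic: window_size = 9 gives
# offset = (9 - 1) // 2 = 4, i.e. each word is featurized together with the
# 4 words on either side of it: words[max(0, i - 4): i + 5] for word index i.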

test_config = dict(train_config)
test_config["folder"] = test_folder
""" Load Data """
train_tagged_essays = load_process_essays(**train_config)
test_tagged_essays = load_process_essays(**test_config)

logger.info("Essays loaded - Train: %i Test %i" %
            (len(train_tagged_essays), len(test_tagged_essays)))

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------
""" Define Tags """
tag_freq = get_tag_freq(train_tagged_essays)
regular_tags = list(
    set((tag for tag, freq in tag_freq.items()
         if freq >= 0 and tag[0].isdigit())))
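# Example of the filter above: with tag_freq = {"50": 12, "Causer": 7, "3": 1}
# it keeps ["50", "3"] - only tags whose first character is a digit (the
# numbered concept codes). Note freq >= 0 is always true, so no frequency
# cut is actually applied here.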
""" FEATURE EXTRACTION """

unigram_window_stemmed = fact_extract_positional_word_features(offset, True)
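# fact_extract_positional_word_features is a factory: it binds the offset
# (and, here, a stemming flag) and returns an extractor closure. A minimal
# sketch of the idea - the helper name and feature-key format below are
# assumptions for illustration, not the original implementation:
def fact_positional_word_features_sketch(offset):
    def extract(words, idx):
        # One binary feature per (relative position, word) pair in the window.
        feats = {}
        for rel in range(-offset, offset + 1):
            i = idx + rel
            if 0 <= i < len(words):
                feats["WD:%i_%s" % (rel, words[i])] = 1
        return feats
    return extract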
Code example #3
# not hashed, as these don't affect the persistence of the feature processing

config = get_config(data)  # "data" (defined earlier in the original script) is the dataset folder path

""" FEATURE EXTRACTION """
offset = (config["window_size"] - 1) // 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(list(config.items()) + [("extractors", extractors)])
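# Note: in Python 3, dict views do not support "+", hence the list(...)
# wrapper above; an equivalent modern spelling would be:
#     feat_config = {**config, "extractors": extractors}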

""" LOAD DATA """
tagged_essays = load_process_essays(**config)
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk

# Collapse all variants of a tag into one tag
feature_extractor = FeatureExtractorTransformer(extractors)

essay_feats = feature_extractor.transform(tagged_essays)
logger.info("Features loaded")

with open(serialized_essays, "wb") as f_essays:
    pickle.dump(tagged_essays, f_essays)

with open(serialized_features, "wb") as f_feats:
    pickle.dump(essay_feats, f_feats)
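# Round-trip sanity check (illustrative only, not part of the original script):
# the pickles above can be reloaded the same way, again in binary mode.
#
# with open(serialized_essays, "rb") as f_essays:
#     tagged_essays = pickle.load(f_essays)
# with open(serialized_features, "rb") as f_feats:
#     essay_feats = pickle.load(f_feats)
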
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training/"
test_folder = root_folder + "Test/"
training_pickled = root_folder + "training.pl"
# NOTE: These predictions are generated from the "./notebooks/SEARN/Keras - Train Tagger and Save CV Predictions For Word Tags.ipynb" notebook
# used as inputs to parsing model
rnn_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"

config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal")

# Get Test Data In Order to Get Test CRELS
# load the test essays to make sure we compute metrics over the test CR labels
test_config = get_config(test_folder)
tagged_essays_test = load_process_essays(**test_config)
########################################################

fname = rnn_predictions_folder + "essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
with open(fname, "rb") as f:
    pred_tagged_essays = dill.load(f)

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" % len(pred_tagged_essays))  # should be 902

cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays, tag_essays_test=tagged_essays_test)
cv_folds = cross_validation(pred_tagged_essays, CV_FOLDS)  # type: List[Tuple[Any,Any]]
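# cross_validation splits the essays into CV_FOLDS (train, test) pairs. A
# minimal sketch of such a splitter, assuming simple round-robin assignment
# (the original implementation may shuffle or stratify differently):
def simple_cv_folds(essays, k):
    # Fold i tests on every k-th essay (offset i) and trains on the rest.
    return [([e for j, e in enumerate(essays) if j % k != i],
             [e for j, e in enumerate(essays) if j % k == i])
            for i in range(k)]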

def evaluate_features(
        collection_prefix: str,
        folds: List[Tuple[Any, Any]],
        # ... (remaining parameters and body are truncated in this excerpt)

# optimal SC feature set
extractors = [unigram_bow_window,
              unigram_window_stemmed,
              biigram_window_stemmed,
              #trigram_window_stemmed,
              extract_brown_cluster,
              #extract_dependency_relation
]

# For mongo
extractor_names = [fn.__name__ for fn in extractors]  # fn.func_name in the original Python 2 code
print("Extractors\n\t" + "\n\t".join(extractor_names))

feat_config = dict(list(train_config.items()) + [("extractors", extractors)])
""" LOAD DATA """
train_tagged_essays = load_process_essays(**train_config)

test_config = dict(train_config)
test_config["folder"] = test_folder

test_tagged_essays = load_process_essays(**test_config)
logger.info("Essays loaded- Train: %i Test %i" % (len(train_tagged_essays), len(test_tagged_essays)))

# most params below exist ONLY for the purposes of the hashing to and from disk
train_essay_feats = extract_features(train_tagged_essays, **feat_config)
test_essay_feats  = extract_features(test_tagged_essays,  **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """
_, lst_all_tags = flatten_to_wordlevel_feat_tags(train_essay_feats)
all_regular_tags = list((t for t in flatten(lst_all_tags) if t[0].isdigit()))
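# Same digit-prefix convention as before: flattening the per-word tag lists
# and keeping tags whose first character is a digit selects the numbered
# concept codes, e.g. ["50", "Causer", "3"] -> "50" and "3" survive.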