Code Example #1
    def __init__(self, models_folder, essays_folder, spell_check_dict):

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        if not models_folder.endswith("/"):
            models_folder += "/"
        if not essays_folder.endswith("/"):
            essays_folder += "/"

        self.logger = logging.getLogger()
        cfg = get_config(essays_folder)
        self.config = cfg
        self.essays_folder = essays_folder

        # Create spell checker
        # Need annotations here purely to load the tags
        tagged_essays = load_bratt_essays(essays_folder, include_vague=cfg["include_vague"], include_normal=cfg["include_normal"], load_annotations=True)
        self.__set_tags_(tagged_essays)
        self.wd_sent_freq = defaultdict(int)
        self.spelling_corrector = build_spelling_corrector(tagged_essays, self.config["lower_case"], self.wd_sent_freq, folder=spell_check_dict)

        # must be an int because it is used in slices; in Python 3.x the
        # division below would otherwise yield a float, hence the int() cast
        offset = int((self.config["window_size"] - 1) / 2)
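        # e.g. window_size = 5 gives offset = 2: the feature window is
        # presumably tokens[i - offset : i + offset + 1], two words each side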

        unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
        biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

        extractors = [unigram_window_stemmed, biigram_window_stemmed]

        # most params below exist ONLY for the purposes of the hashing to and from disk
        self.feature_extractor = FeatureExtractorTransformer(extractors)

        # load models
        self.logger.info("Loading pickled models")
        store = ModelStore(models_folder=models_folder)

        self.feature_transformer = store.get_transformer()
        self.logger.info("Loaded Transformer")
        self.tag_2_wd_classifier = store.get_tag_2_wd_classifier()
        self.logger.info("Loaded word tagging model")
        self.tag_2_sent_classifier = store.get_tag_2_sent_classifier()
        self.logger.info("Loaded sentence classifier")
Code Example #2
CV_FOLDS            = 5

MIN_TAG_FREQ        = 5
LOOK_BACK           = 0     # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling

settings = Settings.Settings()

root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
folder =                            root_folder + "Training/"
processed_essay_filename_prefix =   root_folder + "Pickled/essays_proc_pickled_"
features_filename_prefix =          root_folder + "Pickled/feats_pickled_"

config = get_config(folder)

""" Load Essays """
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays( **config )
logger.info("Essays loaded")
""" End load Essays """

def evaluate_window_size(config, window_size, features_filename_prefix):

    config["window_size"] = window_size

    """ FEATURE EXTRACTION """
    offset = (config["window_size"] - 1) // 2

    unigram_window = fact_extract_positional_word_features(offset)
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"

coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

# Get Test Data In Order to Get Test CRELS
# load the test essays to make sure we compute metrics over the test CR labels
config = get_config(coref_output_folder)
results_processor = ResultsProcessor(dbname="metrics_causal_model")
########################################################

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" % len(pred_tagged_essays_train))  # should be 902

cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)
cv_folds  = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]

def evaluate_model(
        collection_prefix: str,
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        cost_function_name: str,
        beta: float,
Code Example #4
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings

settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"
training_pickled = settings.data_directory + "CoralBleaching/Thesis_Dataset/training.pl"
# NOTE: These predictions are generated from the "./notebooks/SEARN/Keras - Train Tagger and Save CV Predictions For Word Tags.ipynb" notebook
# used as inputs to parsing model
rnn_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"

config = get_config(training_folder)

# Get Test Data In Order to Get Test CRELS
# load the test essays to make sure we compute metrics over the test CR labels
test_config = get_config(test_folder)
tagged_essays_test = load_process_essays(**test_config)
########################################################

fname = rnn_predictions_folder + "essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
with open(fname, "rb") as f:
    pred_tagged_essays = dill.load(f)

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" %
            len(pred_tagged_essays))  # should be 902
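
# The .dill filename above encodes the tagger's hyper-parameters directly in
# the name. A small helper in the same spirit (a sketch, not from the
# original source):
def params_to_fname(prefix, params):
    # {"hidden_size": 256, "num_rnns": 2} -> "<prefix>_hidden_size-256_num_rnns-2.dill"
    parts = "_".join("{0}-{1}".format(k, v) for k, v in sorted(params.items()))
    return "{0}_{1}.dill".format(prefix, parts)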
Code Example #5
MIN_FEAT_FREQ = 5  # 5 best so far
CV_FOLDS = 5

MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags

STEM = True
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
folder =                            root_folder + "Training/"
processed_essay_filename_prefix =   root_folder + "Pickled/essays_proc_pickled_"

config = get_config(folder)
print(config)

mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")
print("{0} essays loaded".format(len(tagged_essays)))

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------

tag_freq = get_tag_freq(tagged_essays)
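# NOTE: `freq >= 0` keeps every tag, so the effective filter below is
# `tag[0].isdigit()` (concept codes start with a digit). Filtering on
# MIN_TAG_FREQ instead would drop the infrequent tags.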
regular_tags = list(set((tag for tag, freq in tag_freq.items() if freq >= 0 and tag[0].isdigit())))

""" FEATURE EXTRACTION """
config["window_size"] = 11
Code Example #6
from featureextractortransformer import FeatureExtractorTransformer
from load_data import load_process_essays

from featureextractionfunctions import *
from window_based_tagger_config import get_config

import cPickle as pickle
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()

# not hashed, as these settings don't affect the persistence of the feature processing

config = get_config(data)
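# NOTE: `data` above (the essay folder passed to get_config) is not defined
# in this snippet; it presumably comes from earlier in the original script.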

""" FEATURE EXTRACTION """
offset = (config["window_size"] - 1) // 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(config.items() + [("extractors", extractors)])
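# NOTE: `dict(a.items() + [...])` is a Python 2 idiom (items() returns a list
# there). Under Python 3 the equivalent would be:
#     feat_config = dict(config); feat_config["extractors"] = extractors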

""" LOAD DATA """
tagged_essays = load_process_essays( **config )
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk
Code Example #7
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings

settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"
training_pickled = settings.data_directory + "CoralBleaching/Thesis_Dataset/training.pl"
# NOTE: These predictions are generated from the "./notebooks/SEARN/Keras - Train Tagger and Save CV Predictions For Word Tags.ipynb" notebook
# used as inputs to parsing model
rnn_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"

config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal")

# Get Test Data In Order to Get Test CRELS
# load the test essays to make sure we compute metrics over the test CR labels
test_config = get_config(test_folder)
tagged_essays_test = load_process_essays(**test_config)
########################################################

fname = rnn_predictions_folder + "essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
with open(fname, "rb") as f:
    pred_tagged_essays = dill.load(f)

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" % len(pred_tagged_essays))  # should be 902
Code Example #8
MIN_FEAT_FREQ       = 5        # 5 best so far
CV_FOLDS            = 5

MIN_TAG_FREQ        = 5
LOOK_BACK           = 0     # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling

settings = Settings.Settings()

root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
training_folder                     = root_folder + "Training/"
test_folder                         = root_folder + "Test/"

train_config = get_config(training_folder)

""" FEATURE EXTRACTION """
train_config["window_size"] = 9
offset = (train_config["window_size"] - 1) // 2

unigram_window_stemmed  = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed  = fact_extract_ngram_features_stemmed(offset, 2)
triigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 3)
unigram_bow_window      = fact_extract_bow_ngram_features(offset, 1)

# optimal SkinCancer (SC) feature set
extractors = [unigram_bow_window,
              unigram_window_stemmed,
              biigram_window_stemmed,
              # triigram_window_stemmed,
              ]
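
# Each `fact_extract_*` call above is a factory: it returns a feature
# extractor with `offset` (and the n-gram size) baked in as a closure. A
# sketch of the pattern (an assumption -- the real factories' signatures and
# feature naming may differ):
def fact_extract_ngram_sketch(offset, ngram_size):
    def extract(words, idx):
        # n-gram indicator features over the window centred on token idx
        window = words[max(0, idx - offset): idx + offset + 1]
        feats = {}
        for i in range(len(window) - ngram_size + 1):
            feats["ngram_" + "_".join(window[i:i + ngram_size])] = 1
        return feats
    return extract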
Code Example #9
MIN_FEAT_FREQ = 5  # 5 best so far
CV_FOLDS = 5

MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training/"
test_folder = root_folder + "Test/"

models_folder = settings.data_directory + "CoralBleaching/models/CRF"
""" Load Configs """
train_config = get_config(training_folder)
train_config["window_size"] = 9
offset = (train_config["window_size"] - 1) // 2

test_config = dict(train_config.items())
test_config["folder"] = test_folder
""" Load Data """
train_tagged_essays = load_process_essays(**train_config)
test_tagged_essays = load_process_essays(**test_config)

logger.info("Essays loaded - Train: %i Test %i" %
            (len(train_tagged_essays), len(test_tagged_essays)))

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------
""" Define Tags """
Code Example #10
from FindFiles import find_files
from Settings import Settings
from load_data import load_process_essays
from window_based_tagger_config import get_config
import dill

CV_FOLDS = 5
DEV_SPLIT = 0.1

settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
partition = "Training"  # Training | Test
target_folder = root_folder + partition + "/"
processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_"

config = get_config(target_folder)
# override this so we don't replace INFREQUENT words
#config["min_df"] = 0

mem_process_essays = memoize_to_disk(
    filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
print("{0} essays loaded".format(len(tagged_essays)))

coref_root = root_folder + "CoReference/"
coref_folder = coref_root + partition

coref_files = find_files(coref_folder, r".*\.tagged")
print("{0} co-ref tagged files loaded".format(len(coref_files)))
assert len(coref_files) == len(tagged_essays)
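
# The assert above only checks that the counts line up. To pair each essay
# with its co-reference file one might key both by file stem (a sketch; the
# essay object's `name` attribute is an assumption):
import os

essay_by_name = {os.path.splitext(os.path.basename(e.name))[0]: e
                 for e in tagged_essays}
coref_by_name = {os.path.basename(f).replace(".tagged", ""): f
                 for f in coref_files}
assert set(essay_by_name.keys()) == set(coref_by_name.keys())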
Code Example #11
from featureextractortransformer import FeatureExtractorTransformer
from load_data import load_process_essays

from featureextractionfunctions import *
from window_based_tagger_config import get_config

import cPickle as pickle
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logger = logging.getLogger()

# not hashed, as these settings don't affect the persistence of the feature processing

config = get_config(data)
""" FEATURE EXTRACTION """
offset = (config["window_size"] - 1) // 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(config.items() + [("extractors", extractors)])
""" LOAD DATA """
tagged_essays = load_process_essays(**config)
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk

# Collapse all variants of a tag into one tag
feature_extractor = FeatureExtractorTransformer(extractors)
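
# "Collapse all variants of a tag into one tag" plausibly means mapping
# variants such as "5b" or "Causer:5" down to the bare concept code "5".
# A sketch only -- the actual collapsing rules are not shown in this snippet:
import re

def collapse_tag(tag):
    m = re.search(r"\d+", tag)
    return m.group(0) if m else tag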
Code Example #12
MIN_TAG_FREQ        = 5
LOOK_BACK           = 0     # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()

model_store = ModelStore()

""" PETER - CHANGE THESE FILE PATHS """
folder =                settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"   # Location where the training data is, use EBA_Pre and Post test essays preferably
test_folder=            settings.data_directory + "CoralBleaching/BrattData/Merged/"                # Location where the new essays to tag are located
out_predictions_file =  settings.data_directory + "CoralBleaching/Results/predictions.txt"          # File to dump the predictions to

config = get_config(folder)

""" FEATURE EXTRACTION """
offset = (config["window_size"] - 1) // 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

#pos_tag_window = fact_extract_positional_POS_features(offset)
#pos_tag_plus_wd_window = fact_extract_positional_POS_features_plus_word(offset)
#head_wd_window = fact_extract_positional_head_word_features(offset)
#pos_dep_vecs = fact_extract_positional_dependency_vectors(offset)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(config.items() + [("extractors", extractors)])
Code Example #13
LOOK_BACK = 0  # how many sentences to look back when predicting tags

settings = Settings.Settings()

root = settings.data_directory + "/GlobalWarming/BrattFiles/merged/"
""" INPUT - two serialized files, one for the pre-processed essays, the other for the features """
""" OUTPUT """
processed_essay_filename_prefix = root + "Pickled/essays_proc_pickled_"
features_filename_prefix = root + "Pickled/feats_pickled_"

out_predictions_file = root + "Experiment/Output/predictions.txt"
out_predicted_margins_file = root + "Experiment/Output/predicted_confidence.txt"
out_metrics_file = root + "Experiment/Output/metrics.txt"
out_categories_file = root + "Experiment/Output/categories.txt"

config = get_config(root)
""" FEATURE EXTRACTION """
offset = (config["window_size"] - 1) // 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(config.items() + [("extractors", extractors)])
""" LOAD DATA """
mem_process_essays = memoize_to_disk(
    filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
# replace periods in tags so that we can store results in mongo
replace_periods(tagged_essays)
logger.info("Essays loaded")
Code Example #14
CV_FOLDS = 5

MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()

model_store = ModelStore()
""" PETER - CHANGE THESE FILE PATHS """
folder = settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"  # Location where the training data is, use EBA_Pre and Post test essays preferably
test_folder = settings.data_directory + "CoralBleaching/BrattData/Merged/"  # Location where the new essays to tag are located
out_predictions_file = settings.data_directory + "CoralBleaching/Results/predictions.txt"  # File to dump the predictions to

config = get_config(folder)
""" FEATURE EXTRACTION """
offset = (config["window_size"] - 1) // 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

#pos_tag_window = fact_extract_positional_POS_features(offset)
#pos_tag_plus_wd_window = fact_extract_positional_POS_features_plus_word(offset)
#head_wd_window = fact_extract_positional_head_word_features(offset)
#pos_dep_vecs = fact_extract_positional_dependency_vectors(offset)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(config.items() + [("extractors", extractors)])
""" LOAD DATA """
tagged_essays = load_process_essays(**config)
Code Example #15
SPARSE_WD_FEATS     = True

MIN_FEAT_FREQ       = 5        # 5 best so far
CV_FOLDS            = 5

MIN_TAG_FREQ        = 5
LOOK_BACK           = 0     # how many sentences to look back when predicting tags
# end not hashed

settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
partition = "Training" # Training | Test
target_folder = root_folder + partition + "/"
processed_essay_filename_prefix =  root_folder + "Pickled/essays_proc_pickled_"

config = get_config(target_folder)

# LOAD ESSAYS
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
# map parsed essays to essay name

print("{0} essays loaded".format(len(tagged_essays)))

# LOAD COREF RESULTS
coref_root = root_folder + "CoReference/"
coref_folder = coref_root + partition
coref_files = find_files(coref_folder, r".*\.tagged")
print("{0} co-ref tagged files loaded".format(len(coref_files)))
assert len(coref_files) == len(tagged_essays)