# Window size used for feature extraction in this run.
train_config["window_size"] = 9
# Floor division keeps `offset` an int on Python 3 as well as Python 2
# (true division would yield 4.0 here).
# NOTE(review): presumably the number of positions either side of the target
# word -- confirm against the feature-extractor factories below.
offset = (train_config["window_size"] - 1) // 2

# The test configuration is a copy of the training one; only "folder" differs.
test_config = dict(train_config.items())
test_config["folder"] = test_folder

# Load Data
train_tagged_essays = load_process_essays(**train_config)
test_tagged_essays = load_process_essays(**test_config)

logger.info("Essays loaded - Train: %i Test %i" %
            (len(train_tagged_essays), len(test_tagged_essays)))

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------
# Define Tags
tag_freq = get_tag_freq(train_tagged_essays)
# Keep each distinct tag whose first character is a digit.
# NOTE(review): if `tag_freq` holds counts, `freq >= 0` never filters anything;
# confirm whether a real minimum-frequency threshold was intended.
regular_tags = list(
    set((tag for tag, freq in tag_freq.items()
         if freq >= 0 and tag[0].isdigit())))

# FEATURE EXTRACTION
# Each factory call below is given the same `offset`; all three request
# stemmed words.
unigram_window_stemmed = fact_extract_positional_word_features(offset, True)
biigram_window_stemmed = fact_extract_ngram_features(offset=offset,
                                                     ngram_size=2,
                                                     stem_words=True)
trigram_window_stemmed = fact_extract_ngram_features(offset=offset,
                                                     ngram_size=3,
                                                     stem_words=True)
unigram_bow_window = fact_extract_ngram_features(offset=offset,
                                                 ngram_size=1,
                                                 positional=False,
test_folder = root_folder + "Test/"

# Settings come from the training folder.
train_config = get_config(training_folder)

# Test settings: a copy of the training settings pointed at the test folder.
test_config = dict(train_config.items())
test_config["folder"] = test_folder

train_tagged_essays = load_process_essays(**train_config)
test_tagged_essays = load_process_essays(**test_config)

logger.info(
    "Essays loaded - Train: %i Test %i"
    % (len(train_tagged_essays), len(test_tagged_essays))
)

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------

tag_freq = get_tag_freq(train_tagged_essays)
# Distinct tags whose first character is a digit (the frequency test is kept
# exactly as written).
regular_tags = list(
    {tag for tag, freq in tag_freq.items() if freq >= 0 and tag[0].isdigit()}
)

# FEATURE EXTRACTION

# Per-tag accumulators, one pair for the "td" side and one for the "vd" side.
cv_wd_td_ys_by_tag = defaultdict(list)
cv_wd_td_predictions_by_tag = defaultdict(list)
cv_wd_vd_ys_by_tag = defaultdict(list)
cv_wd_vd_predictions_by_tag = defaultdict(list)

# A single "fold": train on the training essays, evaluate on the test essays.
folds = [(train_tagged_essays, test_tagged_essays)]

# One job per fold; each job receives the fold's two essay sets, the tag
# list and the fold index.
results = Parallel(n_jobs=len(folds))(
    delayed(train_classifer_on_fold)(essays_TD, essays_VD, regular_tags, fold)
    for fold, (essays_TD, essays_VD) in enumerate(folds)
)

for result in results:
    # Each result is a 4-tuple; unpack it into its named parts.
    (wd_td_ys_bytag,
     wd_vd_ys_bytag,
     td_wd_predictions_by_code,
     vd_wd_predictions_by_code) = result
# Dataset locations, all derived from the configured data directory.
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
folder = root_folder + "Training/"
processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_"

config = get_config(folder)
print(config)

# Wrap the essay loader with `memoize_to_disk`, keyed on the pickled-file prefix.
# NOTE(review): presumably this caches processed essays on disk between runs --
# confirm against memoize_to_disk.
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------

tag_freq = get_tag_freq(tagged_essays)
# Keep each distinct tag whose first character is a digit.
# NOTE(review): if `tag_freq` holds counts, `freq >= 0` never filters anything;
# confirm whether a real minimum-frequency threshold was intended.
regular_tags = list(set((tag for tag, freq in tag_freq.items() if freq >= 0 and tag[0].isdigit())))

# FEATURE EXTRACTION
config["window_size"] = 11
# Floor division keeps `offset` an int on Python 3 as well as Python 2
# (true division would yield 5.0 here).
offset = (config["window_size"] - 1) // 2

# Per-tag accumulators, one pair for the "td" side and one for the "vd" side.
cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
folds = cross_validation(tagged_essays, CV_FOLDS)

# One job per fold; each job receives the fold's two essay sets, the tag
# list and the fold index.
results = Parallel(n_jobs=CV_FOLDS)(
            delayed(train_classifer_on_fold)(essays_TD, essays_VD, regular_tags, fold)
                for fold, (essays_TD, essays_VD) in enumerate(folds))
for result in results:
# Dataset locations, all derived from the configured data directory.
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
folder = root_folder + "Training/"
processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_"

config = get_config(folder)
print(config)

# Wrap the essay loader with `memoize_to_disk`, keyed on the pickled-file prefix.
# NOTE(review): presumably this caches processed essays on disk between runs --
# confirm against memoize_to_disk.
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------

tag_freq = get_tag_freq(tagged_essays)
# Keep each distinct tag whose first character is a digit.
# NOTE(review): if `tag_freq` holds counts, `freq >= 0` never filters anything;
# confirm whether a real minimum-frequency threshold was intended.
regular_tags = list(set((tag for tag, freq in tag_freq.items() if freq >= 0 and tag[0].isdigit())))

# FEATURE EXTRACTION
config["window_size"] = 9
# Floor division keeps `offset` an int on Python 3 as well as Python 2
# (true division would yield 4.0 here).
# NOTE(review): presumably the number of positions either side of the target
# word -- confirm against the feature-extractor factories below.
offset = (config["window_size"] - 1) // 2

# Each factory call below is given the same `offset`; the first three request
# stemmed words, the last is non-positional and unstemmed.
unigram_window_stemmed = fact_extract_positional_word_features(offset, True)
biigram_window_stemmed = fact_extract_ngram_features(offset=offset, ngram_size=2, stem_words=True)
trigram_window_stemmed = fact_extract_ngram_features(offset=offset, ngram_size=3, stem_words=True)
unigram_bow_window = fact_extract_ngram_features(offset=offset, ngram_size=1, positional=False, stem_words=False)

extractors = [
    unigram_bow_window,

    unigram_window_stemmed,