Ejemplo n.º 1
0
def evaluate_feature_set(config, existing_extractors, new_extractor,
                         features_filename_prefix):
    """Report the average vectorized feature width with one extra extractor.

    Features are extracted for the module-level ``tagged_essays`` using
    ``existing_extractors`` plus ``new_extractor``. The result is split into
    ``CV_FOLDS`` folds; for each fold a ``FeatureVectorizer`` is fitted on the
    training part and applied to the validation part, and the resulting
    column counts are averaged over the folds.

    Args:
        config: Mapping of keyword arguments for ``extract_features``. An
            ``"extractors"`` entry is appended, overriding any existing one.
        existing_extractors: List of extractors already in use.
        new_extractor: Extractor appended to ``existing_extractors``.
        features_filename_prefix: Not used by this body; only the disabled
            ``memoize_to_disk`` path referenced it.

    Returns:
        Tuple ``(mean_td_columns, mean_vd_columns)``: ``np.mean`` of the
        second shape dimension of the training and validation matrices.
    """
    all_extractors = existing_extractors + [new_extractor]
    config_pairs = list(config.items())
    config_pairs.append(("extractors", all_extractors))
    feat_config = dict(config_pairs)

    # Load features (disk memoization is not used).
    essay_feats = extract_features(tagged_essays, **feat_config)

    # Define tags: keep only tags whose first character is a digit.
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(
        {tag for tag in flatten(lst_all_tags) if tag[0].isdigit()})
    # Training and test tags are the same list object.
    wd_train_tags = wd_test_tags = regular_tags

    # Classifier factory; instantiated once here only for its string form.
    def fn_create_wd_cls():
        return LogisticRegression()  # C=1, dual = False seems optimal

    wd_algo = str(fn_create_wd_cls())

    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        """Vectorize one fold and return the (training, validation) shapes.

        TD and VD are lists of Essay objects. The sentences are lists of
        featureextractortransformer.Word objects. The two tag arguments are
        accepted but not used.
        """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        vectorizer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                       sparse=SPARSE_WD_FEATS)
        # Fit on the training features only, then reuse for validation.
        td_X = vectorizer.fit_transform(td_feats)
        vd_X = vectorizer.transform(vd_feats)
        return td_X.shape, vd_X.shape

    td_col_sizes = []
    vd_col_sizes = []
    for training_essays, validation_essays in folds:
        td_shape, vd_shape = train_tagger(training_essays, validation_essays,
                                          wd_test_tags, wd_train_tags)
        td_col_sizes.append(td_shape[1])
        vd_col_sizes.append(vd_shape[1])
    return np.mean(td_col_sizes), np.mean(vd_col_sizes)
    def run(self, min_wd_cnt=5, stem=True, spelling_correct=True, folds=10):
        """Load, preprocess and featurize the tagged sentences, then visit each code.

        Stop words are deliberately not removed.

        Args:
            min_wd_cnt: Forwarded to ``self.process_sentences``.
            stem: Forwarded to ``self.process_sentences``.
            spelling_correct: Forwarded to ``self.process_sentences``.
            folds: Forwarded to ``cross_validation`` together with the
                sentence indices.

        Returns:
            None. The per-code loop currently only looks up the tags for each
            code; nothing is trained or evaluated yet.
        """
        sentences, tagged_sentences = self.load_tagged_sentences()
        processed_sentences = self.process_sentences(
            sentences, spelling_correct, stem, min_wd_cnt)
        # Build a real list before handing it to numpy: on Python 3 ``map``
        # returns a lazy iterator, which ``np.asarray`` would wrap as a 0-d
        # object array instead of one entry per sentence.
        sentence_features = np.asarray(
            [self.features_for_sentence(sentence)
             for sentence in processed_sentences])

        cross_validation_ixs = cross_validation(range(len(sentences)), folds)
        codes = sorted(set(flatten(tagged_sentences)))

        for code in codes:
            code_tags = self.tags_for_code(code, tagged_sentences)
def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):
    """Average the vectorized feature width obtained with one added extractor.

    Extracts features for the module-level ``tagged_essays`` with
    ``existing_extractors`` plus ``new_extractor``, splits them into
    ``CV_FOLDS`` folds, vectorizes every fold (fitting on the training part
    only) and averages the number of columns over the folds.

    Args:
        config: Mapping of keyword arguments for ``extract_features``. An
            ``"extractors"`` entry is appended, overriding any existing one.
        existing_extractors: List of extractors already in use.
        new_extractor: Extractor appended to ``existing_extractors``.
        features_filename_prefix: Not used by this body; only the disabled
            ``memoize_to_disk`` path referenced it.

    Returns:
        Tuple ``(mean_td_columns, mean_vd_columns)`` computed with ``np.mean``
        from the second shape dimension of each fold's matrices.
    """
    combined_extractors = existing_extractors + [new_extractor]
    settings = list(config.items())
    settings.append(("extractors", combined_extractors))
    feat_config = dict(settings)

    # LOAD FEATURES (disk memoization is not used)
    essay_feats = extract_features(tagged_essays, **feat_config)

    # DEFINE TAGS: only tags whose first character is a digit
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list({tag for tag in flatten(lst_all_tags) if tag[0].isdigit()})
    wd_train_tags = wd_test_tags = regular_tags

    # CLASSIFIERS: the factory is called once, only to record its string form
    def fn_create_wd_cls():
        return LogisticRegression()  # C=1, dual = False seems optimal

    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        """Return the shapes of the vectorized training and validation data.

        TD and VD are lists of Essay objects. The sentences are lists of
        featureextractortransformer.Word objects. The tag arguments are
        accepted but not used.
        """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        vectorizer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        td_X = vectorizer.fit_transform(td_feats)
        vd_X = vectorizer.transform(vd_feats)
        return td_X.shape, vd_X.shape

    td_col_sizes = []
    vd_col_sizes = []
    for fold_td, fold_vd in folds:
        td_shape, vd_shape = train_tagger(fold_td, fold_vd, wd_test_tags, wd_train_tags)
        td_col_sizes.append(td_shape[1])
        vd_col_sizes.append(vd_shape[1])
    return np.mean(td_col_sizes), np.mean(vd_col_sizes)
Ejemplo n.º 4
0
    def run(self, min_wd_cnt=5, stem=True, spelling_correct=True, folds=10):
        """Load, preprocess and featurize the tagged sentences, then visit each code.

        Stop words are deliberately not removed.

        Args:
            min_wd_cnt: Forwarded to ``self.process_sentences``.
            stem: Forwarded to ``self.process_sentences``.
            spelling_correct: Forwarded to ``self.process_sentences``.
            folds: Forwarded to ``cross_validation`` together with the
                sentence indices.

        Returns:
            None. The per-code loop currently only looks up the tags for each
            code; nothing is trained or evaluated yet.
        """
        sentences, tagged_sentences = self.load_tagged_sentences()
        processed_sentences = self.process_sentences(sentences,
                                                     spelling_correct, stem,
                                                     min_wd_cnt)
        # Build a real list before handing it to numpy: on Python 3 ``map``
        # returns a lazy iterator, which ``np.asarray`` would wrap as a 0-d
        # object array instead of one entry per sentence.
        sentence_features = np.asarray(
            [self.features_for_sentence(sentence)
             for sentence in processed_sentences])

        cross_validation_ixs = cross_validation(range(len(sentences)), folds)
        codes = sorted(set(flatten(tagged_sentences)))

        for code in codes:
            code_tags = self.tags_for_code(code, tagged_sentences)
#assert set(CAUSE_TAGS).issubset(set(sent_input_feat_tags)), "To extract causal relations, we need Causer tags"
# tags to evaluate against
""" CLASSIFIERS """
""" Log Reg + Log Reg is best!!! """

# Open the predictions report ("w+" truncates an existing file) and write the
# pipe-delimited header row.
# NOTE(review): no close() for this handle is visible in this fragment —
# confirm it is closed once the predictions have been written.
f_output_file = open(out_predictions_file, "w+")
f_output_file.write(
    "Essay|Sent Number|Processed Sentence|Concept Codes|Predictions\n")

# Gather metrics per fold
# Four defaultdict(list) accumulators; judging by the names they hold actual
# labels ("ys") and predictions for the training ("td") and validation ("vd")
# data — confirm against the code that fills them.
cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(
    list), defaultdict(list)
cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(
    list), defaultdict(list)

# Split the extracted essay features into CV_FOLDS folds.
folds = cross_validation(essay_feats, CV_FOLDS)


def pad_str(val):
    """Return ``str(val)`` left-justified to 20 characters plus two spaces."""
    return "{:<20}  ".format(str(val))


def toDict(obj):
    """Return the attribute dictionary (``__dict__``) of ``obj``."""
    return getattr(obj, "__dict__")


#TODO Parallelize
for i, (essays_TD, essays_VD) in enumerate(folds):

    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
# Get Test Data In Order to Get Test CRELS
# load the test essays to make sure we compute metrics over the test CR labels
test_config = get_config(test_folder)
tagged_essays_test = load_process_essays(**test_config)
########################################################

# Load the dill-serialized predicted-tag essays from the RNN predictions folder.
# NOTE(review): dill.load can execute arbitrary code while unpickling — only
# point this at trusted files.
fname = rnn_predictions_folder + "essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
with open(fname, "rb") as f:
    pred_tagged_essays = dill.load(f)

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" % len(pred_tagged_essays))  # should be 902

# The tag list is derived from both the predicted essays and the test essays.
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays, tag_essays_test=tagged_essays_test)
# Split the predicted essays into CV_FOLDS cross-validation folds.
cv_folds = cross_validation(pred_tagged_essays, CV_FOLDS)  # type: List[Tuple[Any,Any]]

def evaluate_features(
        collection_prefix: str,
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        cost_function_name: str,
        beta: float,
        base_learner: Any,
        ngrams: int,
        stemmed: bool,
        down_sample_rate=1.0) -> float:
    if down_sample_rate < 1.0:
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
# Feature extractors that are combined into one composite extractor below;
# the commented-out entries are currently disabled.
extractors = [
    unigram_bow_window,

    unigram_window_stemmed,
    biigram_window_stemmed,
    #trigram_window_stemmed,

    extract_brown_cluster,
    #extract_dependency_relation
]

comp_feat_extactor = fact_composite_feature_extractor(extractors)

# Cross-fold accumulators (defaultdict(list)) that the merge loop below fills.
cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
folds = cross_validation(tagged_essays, CV_FOLDS)

# Run train_classifer_on_fold once per fold, with as many jobs as folds.
# (Parallel/delayed are presumably joblib's — confirm against the imports.)
results = Parallel(n_jobs=CV_FOLDS)(
            delayed(train_classifer_on_fold)(essays_TD, essays_VD, regular_tags, fold)
                for fold, (essays_TD, essays_VD) in enumerate(folds))

# Each result is a 4-tuple; merge every element into its cross-fold accumulator.
# NOTE(review): merge_dictionaries presumably merges its first argument into
# its second — confirm against its definition.
for result in results:
    wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code = result

    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
    merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

logger.info("Training completed")
results_processor = ResultsProcessor(dbname="metrics_coref_causal")

# Load the dill-serialized training and test essays from the coref output folder.
# NOTE(review): dill.load can execute arbitrary code while unpickling — only
# point this at trusted files.
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" % len(pred_tagged_essays_train))  # should be 902

# The tag list is derived from both the training and the test essays.
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)
# Disabled alternative: a single train/test "fold" instead of cross-validation.
# cv_folds  = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]
cv_folds = cross_validation(pred_tagged_essays_train, CV_FOLDS)

def evaluate_model(
        collection_prefix: str,
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        cost_function_name: str,
        beta: float,
        ngrams: int,
        stemmed: bool,
        max_epochs: int,
        down_sample_rate=1.0) -> float:

    if down_sample_rate < 1.0:
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):
    """Cross-validate a per-tag word tagger with one extra extractor; return its F1.

    Features for the module-level ``tagged_essays`` are extracted with
    ``existing_extractors`` plus ``new_extractor``. For each of the
    ``CV_FOLDS`` folds the features are vectorized, classifiers are trained
    via ``train_classifier_per_code`` with a ``LogisticRegression`` factory,
    and predictions are made for both parts of the fold. Labels and
    predictions from all folds are merged and persisted through the
    module-level ``processor``.

    Args:
        config: Dict of keyword arguments for ``extract_features``; it is also
            copied into the persisted parameters. An ``"extractors"`` entry is
            added, overriding any existing one.
        existing_extractors: List of extractor functions already selected.
        new_extractor: Candidate extractor function appended to the list.
        features_filename_prefix: Not used by this body; only the disabled
            ``memoize_to_disk`` path referenced it.

    Returns:
        float: the ``"f1_score"`` field of the ``__MICRO_F1__`` metric that
        ``processor`` reports for the persisted validation results.
    """
    feat_extractors = existing_extractors + [new_extractor]
    # ``dict.items()`` is a view on Python 3 and cannot be concatenated with a
    # list, so copy the config and set the key instead.
    feat_config = dict(config)
    feat_config["extractors"] = feat_extractors

    # LOAD FEATURES (disk memoization is not used)
    essay_feats = extract_features(tagged_essays, **feat_config)

    # DEFINE TAGS: only tags whose first character is a digit
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(set(t for t in flatten(lst_all_tags) if t[0].isdigit()))
    # works best with all the pair-wise causal relation codes
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags

    # CLASSIFIERS
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        """Train on one fold and predict on its training and validation parts.

        TD and VD are lists of Essay objects. The sentences are lists of
        featureextractortransformer.Word objects.

        Returns:
            Tuple ``(td_predictions, vd_predictions, td_ys, vd_ys)``.
        """
        # Data partitioning: fit the vectorizer on the training part only.
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        td_X = feature_transformer.fit_transform(td_feats)
        vd_X = feature_transformer.transform(vd_feats)
        wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
        wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

        # Train the tagger with the same factory that ``wd_algo`` describes.
        tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, fn_create_wd_cls,
                                                        wd_train_tags, verbose=False)

        # Test the tagger on both parts of the fold.
        td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
        vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)
        return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag

    # Folds are processed sequentially (a Parallel/delayed variant was disabled).
    results = [train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
               for (essays_TD, essays_VD) in folds]

    for result in results:
        td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

    # Persist results to Mongo DB
    SUFFIX = "_FEAT_SELECTION"
    CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX
    parameters = dict(config)
    # ``__name__`` exists on both Python 2 and 3 (``func_name`` is Python 2
    # only), and a concrete list is stored rather than a lazy ``map`` object.
    parameters["extractors"] = [fn.__name__ for fn in feat_extractors]
    parameters["min_feat_freq"] = MIN_FEAT_FREQ

    wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag,
                                               cv_wd_td_predictions_by_tag, parameters, wd_algo)
    wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag,
                                               cv_wd_vd_predictions_by_tag, parameters, wd_algo)

    avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    return avg_f1
            # for all other tags, a 0
            for tag in (vtags - set([EMPTY_TAG, pred_tag])):
                pred_ys_by_tag[tag].append(0)
        if EMPTY_TAG in pred_ys_by_tag:
            del pred_ys_by_tag[EMPTY_TAG]
    return pred_ys_by_tag


def train_dev_split(lst, dev_split):
    """Randomly split ``lst`` into a training part and a dev part.

    Args:
        lst: Sequence of items to split. It is copied before shuffling, so the
            caller's sequence keeps its original order.
        dev_split: Fraction of the items that goes to the dev part.

    Returns:
        Tuple ``(training_items, dev_items)`` of lists. The training part
        holds ``int((1.0 - dev_split) * len(lst))`` items, the dev part the
        remainder.
    """
    # Shuffle a private copy: shuffling ``lst`` itself would silently reorder
    # the caller's data as a side effect.
    shuffled = list(lst)
    shuffle(shuffled)
    num_training = int((1.0 - dev_split) * len(shuffled))
    return shuffled[:num_training], shuffled[num_training:]


# Build per-fold datasets, keyed by fold index: each fold's training essays
# are further split into train/dev; its validation essays become the test set.
folds = cross_validation(tagged_essays, CV_FOLDS)
fold2training_data = {}
fold2dev_data = {}
fold2test_data = {}

for i, (essays_TD, essays_VD) in enumerate(folds):
    # further split into train and dev test
    essays_train, essays_dev = train_dev_split(essays_TD, DEV_SPLIT)
    fold2training_data[i] = get_training_data(essays_train)
    fold2dev_data[i] = get_training_data(essays_dev)
    # Test Data
    fold2test_data[i] = get_training_data(essays_VD)

# ## Load Glove 100 Dim Embeddings

# see /Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/DeepLearning/WordVectors/pickle_glove_embedding.py
Ejemplo n.º 11
0
Archivo: main.py Proyecto: cxf78/-
from LoadData import loadTestData
from LoadData import loadTrainData
from Knn import Knn
from CrossValidation import cross_validation
import numpy as np
import math


# Load the training data from the cifar-10-batches-py/ folder and print the
# shapes of the data and label arrays.
trainData,trainLabel=loadTrainData("cifar-10-batches-py/")
print(np.shape(trainData),np.shape(trainLabel))

# Same for the test data.
testData,testLabel=loadTestData("cifar-10-batches-py/")
print(np.shape(testData),np.shape(testLabel))

# Keep only the first 100 training and first 10 test samples.
trainData=trainData[:100]
trainLabel=trainLabel[:100]
testData=testData[:10]
testLabel=testLabel[:10]

# NOTE(review): the third argument (4) is presumably the fold count or the
# k of the KNN classifier — confirm against CrossValidation.cross_validation.
# The truncated test data is not used in the visible code.
accuracy = cross_validation(trainData,trainLabel,4)
print(accuracy)
            pred_ys_by_tag[pred_tag].append(1)
            # for all other tags, a 0
            for tag in (vtags - set([EMPTY_TAG, pred_tag])):
                pred_ys_by_tag[tag].append(0)
        if EMPTY_TAG in pred_ys_by_tag:
            del pred_ys_by_tag[EMPTY_TAG]
    return pred_ys_by_tag

def train_dev_split(lst, dev_split):
    """Randomly split ``lst`` into a training part and a dev part.

    Args:
        lst: Sequence of items to split. It is copied before shuffling, so the
            caller's sequence keeps its original order.
        dev_split: Fraction of the items that goes to the dev part.

    Returns:
        Tuple ``(training_items, dev_items)`` of lists. The training part
        holds ``int((1.0 - dev_split) * len(lst))`` items, the dev part the
        remainder.
    """
    # Shuffle a private copy: shuffling ``lst`` itself would silently reorder
    # the caller's data as a side effect.
    shuffled = list(lst)
    shuffle(shuffled)
    num_training = int((1.0 - dev_split) * len(shuffled))
    return shuffled[:num_training], shuffled[num_training:]


# Build per-fold datasets, keyed by fold index: each fold's training essays
# are further split into train/dev; its validation essays become the test set.
folds = cross_validation(tagged_essays, CV_FOLDS)
fold2training_data = {}
fold2dev_data = {}
fold2test_data = {}

for i, (essays_TD, essays_VD) in enumerate(folds):
    # further split into train and dev test
    essays_train, essays_dev = train_dev_split(essays_TD, DEV_SPLIT)
    fold2training_data[i] = get_training_data(essays_train)
    fold2dev_data[i]     = get_training_data(essays_dev)
    # Test Data
    fold2test_data[i]     = get_training_data(essays_VD)

# ## Load Glove 100 Dim Embeddings

# see /Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/DeepLearning/WordVectors/pickle_glove_embedding.py
# Hyper-parameters for this run. dual, C, penalty and fit_intercept are read
# by the BASE_LEARNER_FACT lambda each time it is called.
stemmed = True
cost_function_name = micro_f1_cost_plusepsilon.__name__
dual = True
fit_intercept = True
beta = 0.5
max_epochs = 2
C = 0.5
penalty = "l2"

# Note these also differ for SC dataset
BASE_LEARNER_FACT = lambda: LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept)
best_extractor_names = ['single_words', 'between_word_features', 'label_set',
                        'three_words', 'third_order', 'unigrams']  # type: List[str]

test_folds = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]
cv_folds = cross_validation(pred_tagged_essays_train, CV_FOLDS)  # type: List[Tuple[Any,Any]]

# NOTE(review): the result is named "test" but the call passes cv_folds, and
# test_folds is not used in the visible code — confirm which folds are intended.
# ngrams and all_extractor_fns are not defined in this fragment.
result_test_essay_level = evaluate_model_essay_level(
    folds=cv_folds,
    extractor_fn_names_lst=best_extractor_names,
    all_extractor_fns=all_extractor_fns,
    ngrams=ngrams,
    beta=beta,
    stemmed=stemmed,
    down_sample_rate=1.0,
    max_epochs=max_epochs)

# The result is unpacked as a 5-tuple.
models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, cv_sent_vd_ys_by_tag = result_test_essay_level

# Mean metrics are computed from the training ("td") labels and predictions only.
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag)
print(get_micro_metrics(metrics_to_df(mean_metrics)))
Ejemplo n.º 14
0
    for code in codes:
        print "Training for :", code
        cls = fn_create_cls()
        code2classifier[code] = cls
        ys = np.asarray(yByCode[code])
        #ys = map(map_y, ys)
        cls.fit(xs, ys)
    return code2classifier


# NOTE: this fragment is Python 2 code (print statements).
# The classifier class is passed to train() as the classifier argument.
fn_classifier = LinearSVC
SPLITS = 2
causal_codes = cr_codes + ["explicit"]

# Cross-validate over sentence indices rather than over the sentences themselves.
ixs = range(len(sentences))
folds = cross_validation(ixs, SPLITS)
td_metrics = []
vd_metrics = []

# NOTE(review): ix_valid, td_metrics and vd_metrics are not used in the visible
# part of the loop — the rest of the loop body is presumably outside this fragment.
for num, (ix_train, ix_valid) in enumerate(folds):
    print "Fold:", num + 1

    # Train sequential classifier
    xs_t, yByCode_t = extract_xs_ys(ix_train, ix2xs, ix2ys, all_codes)
    code2cls = train(all_codes, xs_t, yByCode_t, fn_classifier)

    print "Training Sentence Classifier"
    # Extract new data points and target classes
    # The trained per-code classifiers are passed in together with ix2xs; the
    # result is then used as the feature source for the sentence-level data.
    ix2xs_sent = to_sentence_level_predictions(ix2xs, code2cls)
    newxs_t, newyByCode_t = extract_xs_ys(ix_train, ix2xs_sent, ix2ys_sent,
                                          all_codes + causal_codes)
    code2classifier = {}
    for code in codes:
        print "Training for :", code
        cls = fn_create_cls()
        code2classifier[code] = cls
        ys = np.asarray(yByCode[code])
        #ys = map(map_y, ys)
        cls.fit(xs, ys)
    return code2classifier

# NOTE: this fragment is Python 2 code (print statements).
# The classifier class is passed to train() as the classifier argument.
fn_classifier = LinearSVC
SPLITS = 2
causal_codes = cr_codes + ["explicit"]

# Cross-validate over sentence indices rather than over the sentences themselves.
ixs = range(len(sentences))
folds = cross_validation(ixs, SPLITS)
td_metrics = []
vd_metrics = []

# NOTE(review): ix_valid, td_metrics and vd_metrics are not used in the visible
# part of the loop — the rest of the loop body is presumably outside this fragment.
for num, (ix_train, ix_valid) in enumerate(folds):
    print "Fold:", num + 1

    # Train sequential classifier
    xs_t, yByCode_t = extract_xs_ys(ix_train, ix2xs, ix2ys, all_codes)
    code2cls = train(all_codes, xs_t, yByCode_t, fn_classifier)

    print "Training Sentence Classifier"
    # Extract new data points and target classes
    # The trained per-code classifiers are passed in together with ix2xs; the
    # result is then used as the feature source for the sentence-level data.
    ix2xs_sent = to_sentence_level_predictions(ix2xs, code2cls)
    newxs_t, newyByCode_t = extract_xs_ys(ix_train, ix2xs_sent, ix2ys_sent, all_codes + causal_codes)
    wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)
    """ TRAIN Tagger """

    create_classifier = lambda : LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept)
    if fold == 0:
        print(create_classifier())
    tag2word_classifier = train_classifier_per_code(
        td_X, wd_td_ys_bytag, create_classifier, wd_train_tags, verbose=False)
    """ TEST Tagger """
    td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
    vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)
    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag

folds = cross_validation(essay_feats, CV_FOLDS)

def evaluate_tagger(dual, C, penalty, fit_intercept):

    hyper_opt_params = locals()
    # Gather metrics per fold
    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    """ This doesn't run in parallel ! Sequential operation takes exactly same duration """
    # results = Parallel(n_jobs=CV_FOLDS, verbose=0, backend='multiprocessing')(
    #         delayed(train_tagger)(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags, dual, C, penalty, fit_intercept)
    #             for fold, (essays_TD, essays_VD) in enumerate(folds))
    #
    # for result in results:
    #     td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
logger.info("Essays loaded")
len(tagged_essays)  # result is discarded

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------

tag_freq = get_tag_freq(tagged_essays)
# Unique tags with a non-negative frequency whose first character is a digit.
regular_tags = list({
    tag
    for tag, freq in tag_freq.items()
    if freq >= 0 and tag[0].isdigit()
})

""" FEATURE EXTRACTION """
config["window_size"] = 11
# Half-width of the window (presumably the number of positions on each side of
# the centre — confirm against the extractors). Floor division keeps it an
# int on Python 3, where "/" would yield the float 5.0.
offset = (config["window_size"] - 1) // 2

# Cross-fold accumulators (defaultdict(list)) that the merge loop below fills.
cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
folds = cross_validation(tagged_essays, CV_FOLDS)

# Run train_classifer_on_fold once per fold, with as many jobs as folds.
# (Parallel/delayed are presumably joblib's — confirm against the imports.)
results = Parallel(n_jobs=CV_FOLDS)(
            delayed(train_classifer_on_fold)(essays_TD, essays_VD, regular_tags, fold)
                for fold, (essays_TD, essays_VD) in enumerate(folds))

# Each result is a 4-tuple; merge every element into its cross-fold accumulator.
# NOTE(review): merge_dictionaries presumably merges its first argument into
# its second — confirm against its definition.
for result in results:
    wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code = result

    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
    merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

logger.info("Training completed")
Ejemplo n.º 18
0
                    hidden_layer_sizes=(10, 2),
                    random_state=111)
# Candidate models and their display names, in matching order.
# NOTE(review): "MuliLayer" looks like a typo for "MultiLayer"; it is a runtime
# string, so it is left unchanged here.
models = [forest, gdBoost, mlp]
names = ["Random Forest", "Gradient Boosting", "MuliLayer Perceptrons"]

# Vars to select the best suited model
# A scenario only replaces the current best when its mean is strictly greater
# than bestMean, so bestModel stays None if no scenario scores above 0.0.
bestModel = None
bestName = "none"
bestMean = 0.0
x_best = []
scenario = "none"

# Using all the features
print("------------------------------------------")
print("------All Features -----------------------")
# cross_validation is unpacked here as a (model, name, mean) triple.
model, name, mean = cross_validation(x_norm, y, models, names)
if (mean > bestMean):
    bestModel, bestName, bestMean = model, name, mean
    x_best = x_norm
    scenario = "all"

# Removing features with low variance
print("------------------------------------------")
print("------Removing features with low variance -----------------------")
# Apply a VarianceThreshold of 0.01 to x_norm and evaluate the same models on
# the transformed features.
sel = VarianceThreshold(threshold=(0.01))
x_case = sel.fit_transform(x_norm)
model, name, mean = cross_validation(x_case, y, models, names)
if (mean > bestMean):
    bestModel, bestName, bestMean = model, name, mean
    x_best = x_case
    scenario = "variance"
            for tag in tags:
                stag_freq[tag] += 1

# TODO - don't ignore Anaphor, other and rhetoricals here
# Causal-relation tags: contain "->" and none of the excluded substrings.
cr_tags = [
    t for t in stag_freq
    if "->" in t
    and "Anaphor" not in t
    and "other" not in t
    and "rhetorical" not in t
    and "factor" not in t
]

# Regular tags: no "->" and a digit as the first character.
regular_tags = {t for t in stag_freq if "->" not in t and t[0].isdigit()}
#regular_tags = set((t for t in stag_freq.keys() if ( "->" not in t) and (t == "explicit" or t[0].isdigit())))
vtags = set(regular_tags)

assert "explicit" not in vtags, "explicit should NOT be in the regular tags"

cv_folds = cross_validation(pred_tagged_essays, CV_FOLDS)  # type: List[Tuple[Any,Any]]


def get_functions_by_name(function_names, functions):
    """Return the items of ``functions`` whose ``__name__`` is in ``function_names``.

    The result is a list that keeps the order of ``functions``.
    """
    selected = []
    for candidate in functions:
        if candidate.__name__ in function_names:
            selected.append(candidate)
    return selected


def get_function_names(functions):
    """Return a list with the ``__name__`` of every item in ``functions``, in order."""
    return [fn.__name__ for fn in functions]


def evaluate_features(folds: List[Tuple[Any, Any]],
                      extractor_names: Set[str],
                      cost_function_name: str,
                      beta: float = 0.3,
                      base_learner: Any = LogisticRegression,
def evaluate_feature_set(config, existing_extractors):
    """Cross-validate a word tagger restricted to the "anaphor" tag.

    Features for the module-level ``tagged_essays`` are extracted with
    ``existing_extractors``. The tag set is every tag that equals
    ``"anaphor"`` after lower-casing and stripping. Each of the ``CV_FOLDS``
    folds is vectorized, per-tag classifiers are trained on its training
    part, and labels and predictions for both parts are merged into
    cross-fold accumulators.

    Args:
        config: Mapping of keyword arguments for ``extract_features``; an
            ``"extractors"`` entry is added, overriding any existing one.
        existing_extractors: Extractors passed on as ``"extractors"``.

    Returns:
        Always ``0``. Persisting the results to Mongo DB and reading back the
        F1 score is disabled, so the merged results are not used.
    """
    feat_extractors = existing_extractors
    feat_config = dict(config.items())
    feat_config["extractors"] = feat_extractors

    # Load features (disk memoization is not used).
    essay_feats = extract_features(tagged_essays, **feat_config)

    # Define tags: only tags that normalize to "anaphor".
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(
        {tag for tag in flatten(lst_all_tags)
         if tag.lower().strip() == "anaphor"})
    wd_train_tags = wd_test_tags = regular_tags

    # Classifier factory; also instantiated once for its string form.
    def fn_create_wd_cls():
        return LogisticRegression()  # C=1, dual = False seems optimal

    wd_algo = str(fn_create_wd_cls())

    # Cross-fold accumulators.
    cv_wd_td_ys_by_tag = defaultdict(list)
    cv_wd_td_predictions_by_tag = defaultdict(list)
    cv_wd_vd_ys_by_tag = defaultdict(list)
    cv_wd_vd_predictions_by_tag = defaultdict(list)
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        """Train on one fold and predict on its training and validation parts.

        TD and VD are lists of Essay objects. The sentences are lists of
        featureextractortransformer.Word objects.

        Returns:
            Tuple ``(td_predictions, vd_predictions, td_ys, vd_ys)``.
        """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        vectorizer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                       sparse=SPARSE_WD_FEATS)
        # Fit on the training features only, then reuse for validation.
        td_X = vectorizer.fit_transform(td_feats)
        vd_X = vectorizer.transform(vd_feats)
        td_ys = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
        vd_ys = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

        # Train the tagger.
        tag2word_classifier = train_classifier_per_code(
            td_X, td_ys, fn_create_wd_cls, wd_train_tags, verbose=False)

        # Test the tagger on both parts of the fold.
        td_predictions = test_classifier_per_code(
            td_X, tag2word_classifier, wd_test_tags)
        vd_predictions = test_classifier_per_code(
            vd_X, tag2word_classifier, wd_test_tags)
        return td_predictions, vd_predictions, td_ys, vd_ys

    # Folds are processed sequentially; all folds finish before any merging.
    results = [
        train_tagger(fold_td, fold_vd, wd_test_tags, wd_train_tags)
        for fold_td, fold_vd in folds
    ]

    for td_predictions, vd_predictions, td_ys, vd_ys in results:
        merge_dictionaries(td_ys, cv_wd_td_ys_by_tag)
        merge_dictionaries(vd_ys, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_predictions, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_predictions, cv_wd_vd_predictions_by_tag)

    # Persisting to Mongo DB (processor.persist_results under the
    # "CB_TAGGING_TD"/"CB_TAGGING_VD" + "_FEAT_SELECTION" collections) and the
    # micro-F1 lookup are disabled, so a constant is returned.
    return 0