def setUp(self):
    """Build the baseline configuration: a sparse decision-tree extractor
    with the full default feature set and no reduction/selection/scaling."""
    full_feature_set = make_feature_list("""
        bag_of_words
        bag_of_pos
        bag_of_word_bigrams
        bag_of_wordpos
        bag_of_wordpos_bigrams
        bag_of_words_in_between
        bag_of_pos_in_between
        bag_of_word_bigrams_in_between
        bag_of_wordpos_in_between
        bag_of_wordpos_bigrams_in_between
        entity_order
        entity_distance
        other_entities_in_between
        in_same_sentence
        verbs_count_in_between
        verbs_count
        total_number_of_entities
        symbols_in_between
        number_of_tokens
    """)
    self.config = {
        "classifier": "dtree",
        "classifier_args": {},
        "dimensionality_reduction": None,
        "dimensionality_reduction_dimension": None,
        "feature_selection": None,
        "feature_selection_dimension": None,
        "scaler": False,
        "sparse": True,
        "features": full_feature_set,
    }
def test_configuration_with_arguments(self, mocked_feature):
    """A feature spec listed with arguments ("Name arg") must instantiate the
    mocked feature exactly once, receiving the parsed argument tuple."""
    extra_features = make_feature_list("""
        BagOfVerbStems True
        BagOfVerbLemmas True
        BagOfVerbLemmas False
    """)
    # Rebind (not mutate) the feature list, extending it with the arg'd specs.
    self.config["features"] = self.config["features"] + extra_features
    FactExtractor(self.config)
    self.assertEqual(mocked_feature.call_count, 1)
    self.assertEqual(mocked_feature.call_args, ((True,),))
def setUp(self):
    """Build an SVC configuration whose features are split into a dense
    (single-value) group and a sparse (bag-of-*) group."""
    dense = make_feature_list("""
        entity_order
        entity_distance
        other_entities_in_between
        verbs_count_in_between
        verbs_count
        total_number_of_entities
        symbols_in_between
        number_of_tokens
    """)
    sparse = make_feature_list("""
        bag_of_words
        bag_of_pos
        bag_of_words_in_between
        bag_of_pos_in_between
    """)
    self.config = {
        "classifier": "svc",
        "classifier_args": {},
        "dense_features": dense,
        "sparse_features": sparse,
    }
def iter_configs(input_file_path, dbname):
    """Yield one experiment-config dict per (classifier, train%, shuffle-seed)
    combination.

    The input file is hashed (MD5) so every emitted config records exactly
    which data file it was generated against.

    :param input_file_path: path to the labeled input data file.
    :param dbname: database name, recorded verbatim in each config.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Fix: the original did open(...).read() and leaked the file handle;
    # a context manager closes it deterministically.
    with open(input_file_path, "rb") as input_file:
        input_file_md5 = hashlib.md5(input_file.read()).hexdigest()
    base = {
        # Experiment configuration
        u"config_version": u"1",
        u"data_shuffle_seed": None,
        u"train_percentage": None,
        u"input_file_path": input_file_path,
        u"input_file_md5": input_file_md5,
        u"database_name": dbname,
        # Classifier configuration
        u"classifier": u"svm",
        u"classifier_args": dict(),
        u"dimensionality_reduction": None,
        u"dimensionality_reduction_dimension": None,
        u"feature_selection": None,
        u"feature_selection_dimension": None,
        u"scaler": True,
        u"sparse": False,
        u"features": make_feature_list(u"""
            bag_of_words
            bag_of_pos
            bag_of_word_bigrams
            bag_of_wordpos
            bag_of_wordpos_bigrams
            bag_of_words_in_between
            bag_of_pos_in_between
            bag_of_word_bigrams_in_between
            bag_of_wordpos_in_between
            bag_of_wordpos_bigrams_in_between
            entity_order
            entity_distance
            other_entities_in_between
            in_same_sentence
            verbs_count_in_between
            verbs_count
            total_number_of_entities
            symbols_in_between
            number_of_tokens
            BagOfVerbStems True
            BagOfVerbStems False
            BagOfVerbLemmas True
            BagOfVerbLemmas False
        """)
    }
    patch = {
        u"train_percentage": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        u"data_shuffle_seed": [u"domino" + str(i) for i in range(10)],
    }
    classifiers = [
        (u"sgd", {}),
        (u"naivebayes", {}),
        (u"naivebayes_m", {}),
        (u"dtree", {u"max_depth": 4, u"min_samples_leaf": 5}),
        (u"logit", {}),
        (u"svm", {}),
        (u"adaboost", {}),
    ]
    for classifier, args in classifiers:
        base[u"classifier"] = classifier
        base[u"classifier_args"] = args
        # Scaling is disabled only for naivebayes_m — presumably because it
        # cannot handle the scaled feature values; TODO confirm with the
        # classifier implementation.
        base[u"scaler"] = classifier != "naivebayes_m"
        for config in apply_dict_combinations(base, patch):
            yield config
"sparse": False, "features": make_feature_list(""" bag_of_words bag_of_pos bag_of_word_bigrams bag_of_wordpos bag_of_wordpos_bigrams bag_of_words_in_between bag_of_pos_in_between bag_of_word_bigrams_in_between bag_of_wordpos_in_between bag_of_wordpos_bigrams_in_between entity_order entity_distance other_entities_in_between in_same_sentence verbs_count_in_between verbs_count total_number_of_entities symbols_in_between number_of_tokens BagOfVerbStems True BagOfVerbStems False BagOfVerbLemmas True BagOfVerbLemmas False """), }
# In this file we have defined the default values for IEPY settings # These default values will be always used except explicitely said. from iepy.utils import make_feature_list extractor_config = { "classifier": "svc", "classifier_args": {}, "dense_features": make_feature_list(""" entity_order entity_distance other_entities_in_between verbs_count_in_between verbs_count total_number_of_entities symbols_in_between number_of_tokens """), "sparse_features": make_feature_list(""" bag_of_words bag_of_pos bag_of_words_in_between bag_of_pos_in_between """) }
def iter_configs(input_file_path, dbname):
    """Yield experiment configs: an SVM grid search (rbf/poly/linear kernels,
    optionally with k-best feature selection) followed by an AdaBoost sweep.

    The input file is hashed (MD5) so every emitted config records exactly
    which data file it was generated against.

    :param input_file_path: path to the labeled input data file.
    :param dbname: database name, recorded verbatim in each config.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Fix: the original did open(...).read() and leaked the file handle;
    # a context manager closes it deterministically.
    with open(input_file_path, "rb") as input_file:
        input_file_md5 = hashlib.md5(input_file.read()).hexdigest()
    base = {
        # Experiment configuration
        u"config_version": u"1",
        u"data_shuffle_seed": None,
        u"train_percentage": None,
        u"input_file_path": input_file_path,
        u"input_file_md5": input_file_md5,
        u"database_name": dbname,
        # Classifier configuration
        u"classifier": u"svm",
        u"classifier_args": dict(),
        u"dimensionality_reduction": None,
        u"dimensionality_reduction_dimension": None,
        u"feature_selection": None,
        u"feature_selection_dimension": 1000,
        u"scaler": True,
        u"sparse": False,
        u"features": make_feature_list(u"""
            bag_of_words
            bag_of_pos
            bag_of_word_bigrams
            bag_of_wordpos
            bag_of_wordpos_bigrams
            bag_of_words_in_between
            bag_of_pos_in_between
            bag_of_word_bigrams_in_between
            bag_of_wordpos_in_between
            bag_of_wordpos_bigrams_in_between
            entity_order
            entity_distance
            other_entities_in_between
            in_same_sentence
            verbs_count_in_between
            verbs_count
            total_number_of_entities
            symbols_in_between
            number_of_tokens
            BagOfVerbStems True
            BagOfVerbStems False
            BagOfVerbLemmas True
            BagOfVerbLemmas False
        """)
    }

    # SVM
    ######
    patch = {
        u"train_percentage": [0.05 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"daddycool" + str(i) for i in range(20)],
        u"feature_selection": [None, "kbest"],
    }
    svm_args_patches = [
        {u"kernel": [u"rbf"], u"C": [1, 10, 100],
         u"gamma": [0.0, 0.001, 0.0001]},
        {u"kernel": [u"poly"], u"C": [1, 10, 100], u"degree": [2, 3, 4],
         u"gamma": [0.0, 0.001, 0.0001]},
        {u"kernel": [u"linear"], u"C": [1, 10, 100]},
    ]
    for argpatch in svm_args_patches:
        for argconfig in apply_dict_combinations({}, argpatch):
            base[u"classifier_args"] = argconfig
            for config in apply_dict_combinations(base, patch):
                yield config

    # Adaboost
    ###########
    # Reuse `base`, overriding only what differs from the SVM runs.
    base.update({
        u"classifier": u"adaboost",
        u"feature_selection_dimension": None,
        u"scaler": False,
    })
    patch = {
        u"train_percentage": [0.05 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"daddycool" + str(i) for i in range(10)],
    }
    argpatch = {
        u"n_estimators": [5, 10, 20, 50],
        u"learning_rate": [0.9, 1.0, 1.1],
        u"max_depth": [1, 2, 3],
    }
    for argconfig in apply_dict_combinations({}, argpatch):
        base[u"classifier_args"] = argconfig
        for config in apply_dict_combinations(base, patch):
            yield config
def iter_configs(input_file_path, dbname):
    """Yield experiment configs for SVM class-weight exploration: an RBF-kernel
    grid, then a degree-4 poly-kernel grid with k-best feature selection.

    The input file is hashed (MD5) so every emitted config records exactly
    which data file it was generated against.

    :param input_file_path: path to the labeled input data file.
    :param dbname: database name, recorded verbatim in each config.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Fix: the original did open(...).read() and leaked the file handle;
    # a context manager closes it deterministically.
    with open(input_file_path, "rb") as input_file:
        input_file_md5 = hashlib.md5(input_file.read()).hexdigest()
    base = {
        # Experiment configuration
        u"config_version": u"1",
        u"data_shuffle_seed": None,
        u"train_percentage": None,
        u"input_file_path": input_file_path,
        u"input_file_md5": input_file_md5,
        u"database_name": dbname,
        # Classifier configuration
        u"classifier": u"svm",
        u"classifier_args": dict(),
        u"dimensionality_reduction": None,
        u"dimensionality_reduction_dimension": None,
        u"feature_selection": None,
        u"feature_selection_dimension": None,
        u"scaler": True,
        u"sparse": False,
        u"features": make_feature_list(u"""
            bag_of_words
            bag_of_pos
            bag_of_word_bigrams
            bag_of_wordpos
            bag_of_wordpos_bigrams
            bag_of_words_in_between
            bag_of_pos_in_between
            bag_of_word_bigrams_in_between
            bag_of_wordpos_in_between
            bag_of_wordpos_bigrams_in_between
            entity_order
            entity_distance
            other_entities_in_between
            in_same_sentence
            verbs_count_in_between
            verbs_count
            total_number_of_entities
            symbols_in_between
            number_of_tokens
            BagOfVerbStems True
            BagOfVerbStems False
            BagOfVerbLemmas True
            BagOfVerbLemmas False
        """)
    }

    # Shared class_weight grid: {True: w_pos, False: w_neg} pairs.
    class_weights = [
        {True: 10, False: 1},
        {True: 1, False: 10},
        {True: 1, False: 1},
        {True: 1, False: 0.1},
        {True: 0.1, False: 1},
    ]

    # RBF
    ######
    patch = {
        u"train_percentage": [0.07 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"sussieq" + str(i) for i in range(20)],
    }
    argpatch = {
        u"kernel": [u"rbf"],
        u"gamma": [0.0, 1e-4, 1e-5, 1e-6],
        u"class_weight": class_weights,
    }
    for argconfig in apply_dict_combinations({}, argpatch):
        base[u"classifier_args"] = argconfig
        for config in apply_dict_combinations(base, patch):
            yield config

    # POLY
    #######
    base[u"feature_selection"] = "kbest"
    patch = {
        u"train_percentage": [0.07 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"sussieq" + str(i) for i in range(20)],
        u"feature_selection_dimension": [500, 1000, 2000, 4000],
    }
    argpatch = {
        u"kernel": [u"poly"],
        u"degree": [4],
        u"gamma": [0.0, 1e-4, 1e-5, 1e-6],
        u"class_weight": class_weights,
    }
    for argconfig in apply_dict_combinations({}, argpatch):
        base[u"classifier_args"] = argconfig
        for config in apply_dict_combinations(base, patch):
            yield config
file will be added to the configurations. """ import os import hashlib from itertools import product from copy import deepcopy from iepy.utils import make_feature_list from utils import apply_dict_combinations, check_configs features = make_feature_list(u""" bag_of_words_in_between bag_of_pos_in_between bag_of_wordpos_in_between entity_order entity_distance other_entities_in_between verbs_count_in_between""") # Proposing as classifiers the champions of the # experimentation/classifier/config_round5.py # Note: they will suffer small modifications (on feature_selection_dimension) candidate_classifiers = [ { u'classifier': u'svm', u'classifier_args': { u'class_weight': { u'false': 1, u'true': 1 },
def __init__(self, db_connector, seed_facts, gold_standard=None):
    """Not blocking.

    :param db_connector: database access object, stored as-is.
    :param seed_facts: iterable of facts that bootstrap the knowledge base.
    :param gold_standard: optional reference data for the evaluation step.
    """
    self.db_con = db_connector
    # Every seed fact enters the knowledge base as evidence with score 1.
    self.knowledge = Knowledge(
        {Evidence(fact, None, None, None): 1 for fact in seed_facts})
    self.evidence_threshold = 0.99
    self.fact_threshold = 0.99
    self.questions = Knowledge()
    self.answers = {}
    self.gold_standard = gold_standard
    # Pipeline steps, cycled forever; the None entry is a pause so the caller
    # can collect question answers before the second half of step 2 runs.
    self.steps = [
        self.generalize_knowledge,   # Step 1
        self.generate_questions,     # Step 2, first half
        None,                        # Pause to wait question answers
        self.filter_evidence,        # Step 2, second half
        self.learn_fact_extractors,  # Step 3
        self.extract_facts,          # Step 5
        self.filter_facts,           # Step 6
        self.evaluate,               # Optional evaluation step
    ]
    self.step_iterator = itertools.cycle(self.steps)
    # Build relation description: map each relation label to the pair of
    # entity kinds it connects; reject seeds where the same label appears
    # with two different kind pairs.
    self.relations = {}
    for evidence in self.knowledge:
        fact = evidence.fact
        kinds = (fact.e1.kind, fact.e2.kind)
        known_kinds = self.relations.get(fact.relation)
        if known_kinds is not None and known_kinds != kinds:
            raise ValueError("Ambiguous kinds for relation %r" % fact.relation)
        self.relations[fact.relation] = kinds
    # Classifier configuration
    self.extractor_config = {
        "classifier": "dtree",
        "classifier_args": {},
        "dimensionality_reduction": None,
        "dimensionality_reduction_dimension": None,
        "feature_selection": None,
        "feature_selection_dimension": None,
        "scaler": False,
        "sparse": False,
        "features": make_feature_list("""
            bag_of_words
            bag_of_pos
            bag_of_word_bigrams
            bag_of_wordpos
            bag_of_wordpos_bigrams
            bag_of_words_in_between
            bag_of_pos_in_between
            bag_of_word_bigrams_in_between
            bag_of_wordpos_in_between
            bag_of_wordpos_bigrams_in_between
            entity_order
            entity_distance
            other_entities_in_between
            in_same_sentence
            verbs_count_in_between
            verbs_count
            total_number_of_entities
            symbols_in_between
            number_of_tokens
            BagOfVerbStems True
            BagOfVerbStems False
            BagOfVerbLemmas True
            BagOfVerbLemmas False
        """),
    }