def iter_configs(input_file_path, dbname):
    """Yield experiment configs: round-4 champions crossed with classifiers.

    Each champion config is cloned, stamped with the input-file identity
    (absolute path + MD5) and the database name, paired with every
    candidate classifier, and expanded over prediction-method, seed and
    sorting options.  Every resulting config is yielded twice: as-is and
    with ``feature_selection_dimension`` forced to 10.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Hash the corpus so each config records exactly which data it targets.
    # Context manager fixes the original's leaked file handle.
    with open(input_file_path, 'rb') as f:
        hasher = hashlib.md5(f.read())
    base = {
        # Experiment configuration
        u'config_version': u'6',
        u'input_file_path': input_file_path,
        u'input_file_md5': hasher.hexdigest(),
        u'database_name': dbname
    }
    prediction_patch = {
        # Was a mixed bytes/unicode list; normalized to unicode
        # (equal for ASCII, so comparisons below are unaffected).
        u'method': [u'predict', u'predict_proba', u'decision_function'],
    }
    seed_patch = {
        u'shuffle': [u"%i lemon and half lemon" % i for i in range(3)]
    }
    for champ, classifier in product(loop_champions_round_4,
                                     candidate_classifiers):
        champ = deepcopy(champ)
        champ.update(base)
        champ['classifier_config'] = deepcopy(classifier)
        prediction_options_range = list(
            apply_dict_combinations(champ['prediction_config'],
                                    prediction_patch))
        seed_options_range = list(
            apply_dict_combinations(champ[u'seed_facts'], seed_patch))
        patch = {
            u'prediction_config': prediction_options_range,
            u'questions_sorting': [u'score', u'certainty'],
            u'seed_facts': seed_options_range
        }
        for config in apply_dict_combinations(champ, patch):
            # Deep-copy before mutating: combinations may share sub-dicts.
            config = deepcopy(config)
            if config['prediction_config']['method'] == u'predict_proba':
                # SVC needs probability=True for predict_proba to exist.
                config['classifier_config']['classifier_args'][
                    u'probability'] = True
            if config['prediction_config']['method'] == u'decision_function':
                config['classifier_config'][
                    'sparse'] = False  # in some cases is failing
            yield config
            # Second variant of the same config, with feature selection
            # capped at 10 dimensions.
            config = deepcopy(config)
            config['classifier_config']['feature_selection_dimension'] = 10
            yield config
def iter_configs(input_file_path, dbname):
    """Yield classifier-comparison experiment configs (version 1).

    Builds one base configuration tied to *input_file_path* (recorded by
    absolute path and MD5) and *dbname*, then sweeps seven classifiers
    over a grid of train percentages and shuffle seeds.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Context manager fixes the original's leaked file handle.
    with open(input_file_path, "rb") as f:
        hasher = hashlib.md5(f.read())
    base = {
        # Experiment configuration
        u"config_version": u"1",
        u"data_shuffle_seed": None,
        u"train_percentage": None,
        u"input_file_path": input_file_path,
        u"input_file_md5": hasher.hexdigest(),
        u"database_name": dbname,

        # Classifier configuration
        u"classifier": u"svm",
        u"classifier_args": dict(),
        u"dimensionality_reduction": None,
        u"dimensionality_reduction_dimension": None,
        u"feature_selection": None,
        u"feature_selection_dimension": None,
        u"scaler": True,
        u"sparse": False,
        u"features": make_feature_list(u"""
                bag_of_words
                bag_of_pos
                bag_of_word_bigrams
                bag_of_wordpos
                bag_of_wordpos_bigrams
                bag_of_words_in_between
                bag_of_pos_in_between
                bag_of_word_bigrams_in_between
                bag_of_wordpos_in_between
                bag_of_wordpos_bigrams_in_between
                entity_order
                entity_distance
                other_entities_in_between
                in_same_sentence
                verbs_count_in_between
                verbs_count
                total_number_of_entities
                symbols_in_between
                number_of_tokens
                BagOfVerbStems True
                BagOfVerbStems False
                BagOfVerbLemmas True
                BagOfVerbLemmas False
            """)
    }
    patch = {u"train_percentage": [0.1, 0.2, 0.3, 0.4, 0.5,
                                   0.6, 0.7, 0.8, 0.9],
             u"data_shuffle_seed": [u"domino" + str(i) for i in range(10)]}
    xs = [(u"sgd", {}),
          (u"naivebayes", {}),
          (u"naivebayes_m", {}),
          (u"dtree", {u"max_depth": 4, u"min_samples_leaf": 5}),
          (u"logit", {}),
          (u"svm", {}),
          (u"adaboost", {})]
    for classifier, args in xs:
        base[u"classifier"] = classifier
        base[u"classifier_args"] = args
        base[u"scaler"] = True
        if classifier == "naivebayes_m":
            # Multinomial NB cannot take the negative values a scaler
            # may produce, so scaling is disabled for it.
            base[u"scaler"] = False
        for config in apply_dict_combinations(base, patch):
            yield config
def iter_configs(input_file_path, dbname):
    """Yield bootstrap experiment configs (version 1).

    Expands a base bootstrap configuration over prediction methods,
    thresholds, seed-fact options and every candidate classifier.  SVM
    configs using ``predict_proba`` are split in two: a true
    ``predict_proba`` variant and a ``decision_function`` variant.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Context manager fixes the original's leaked file handle.
    with open(input_file_path, 'rb') as f:
        hasher = hashlib.md5(f.read())
    base = {
        # Experiment configuration
        u'experiment': u'bootstrap',
        u'config_version': u'1',
        u'data_shuffle_seed': "a-ha",
        u'input_file_path': input_file_path,
        u'input_file_md5': hasher.hexdigest(),
        u'database_name': dbname,

        # Human In The Middle configuration
        u'answers_per_round': 5,
        u'max_number_of_rounds': 15,

        # Bootstrap configuration
        u'prediction_config': {
            # Placeholder only (was the typo u'predic'); always replaced
            # below by prediction_patch combinations.
            u'method': u'predict',
            u'scale_to_range': [0.1, 0.9]
        },
        # threshold are expressed as delta to max, so it's uniformly
        # expressed having scaling enabled or not
        u'fact_threshold_distance': 0.01,
        u'evidence_threshold_distance': 0.01,
        u'questions_sorting': 'score',
        u'seed_facts': {
            u'number_to_use': 5,
            u'shuffle': u"it's a trap"
        },

        # Classifier configuration
        u'classifier_config': {}  # to be filled with each candidate-classifier
    }
    prediction_patch = {
        u'method': [u'predict', u'predict_proba'],
        u'scale_to_range': [None, [0.1, 0.9]]
    }
    prediction_options_range = list(
        apply_dict_combinations(base['prediction_config'], prediction_patch)
    )
    seed_patch = {
        u'number_to_use': [5, 10],
        u'shuffle': [u"%i lemon and half lemon" % i for i in range(2)]
    }
    seed_options_range = list(
        apply_dict_combinations(base[u'seed_facts'], seed_patch)
    )
    patch = {
        u'answers_per_round': [5, 15, 25],
        u'prediction_config': prediction_options_range,
        u'fact_threshold_distance': [0.01, 0.05],
        u'evidence_threshold_distance': [0.01, 0.05],
        u'questions_sorting': [u'score', u'certainty'],
        u'seed_facts': seed_options_range
    }
    for classifier_config in candidate_classifiers:
        base[u'classifier_config'] = classifier_config
        for config in apply_dict_combinations(base, patch):
            # Threshold adjustments: distances-to-max become absolute
            # thresholds relative to the (possibly scaled) max score.
            max_score = 1.0
            if config[u'prediction_config']['scale_to_range']:
                max_score = max(config[u'prediction_config']['scale_to_range'])
            config[u'fact_threshold'] = max_score - config.pop(
                u'fact_threshold_distance')
            config[u'evidence_threshold'] = max_score - config.pop(
                u'evidence_threshold_distance')
            if (config[u'classifier_config'][u'classifier'] == u'svm' and
                    config[u'prediction_config'][u'method'] ==
                    u'predict_proba'):
                # we'll split this config in 2 options: actual
                # predict_proba, and decision_function
                config_copied = deepcopy(config)
                config_copied[u'classifier_config'][u'classifier_args'][
                    u'probability'] = True
                yield config_copied
                # http://scikit-learn.org/stable/modules/svm.html#scores-and-probabilities
                config[u'classifier_config'][u'classifier_args'][
                    u'probability'] = False
                config[u'prediction_config'][u'method'] = u'decision_function'
            yield config
def iter_configs(input_file_path, dbname):
    """Yield SVM and AdaBoost grid-search experiment configs.

    First sweeps three SVM kernel families (rbf, poly, linear) over C /
    gamma / degree grids, then an AdaBoost grid, each crossed with train
    percentages, shuffle seeds and (for SVM) feature selection.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Context manager fixes the original's leaked file handle.
    with open(input_file_path, "rb") as f:
        hasher = hashlib.md5(f.read())
    base = {
        # Experiment configuration
        u"config_version": u"1",
        u"data_shuffle_seed": None,
        u"train_percentage": None,
        u"input_file_path": input_file_path,
        u"input_file_md5": hasher.hexdigest(),
        u"database_name": dbname,

        # Classifier configuration
        u"classifier": u"svm",
        u"classifier_args": dict(),
        u"dimensionality_reduction": None,
        u"dimensionality_reduction_dimension": None,
        u"feature_selection": None,
        u"feature_selection_dimension": 1000,
        u"scaler": True,
        u"sparse": False,
        u"features": make_feature_list(u"""
                bag_of_words
                bag_of_pos
                bag_of_word_bigrams
                bag_of_wordpos
                bag_of_wordpos_bigrams
                bag_of_words_in_between
                bag_of_pos_in_between
                bag_of_word_bigrams_in_between
                bag_of_wordpos_in_between
                bag_of_wordpos_bigrams_in_between
                entity_order
                entity_distance
                other_entities_in_between
                in_same_sentence
                verbs_count_in_between
                verbs_count
                total_number_of_entities
                symbols_in_between
                number_of_tokens
                BagOfVerbStems True
                BagOfVerbStems False
                BagOfVerbLemmas True
                BagOfVerbLemmas False
            """)
    }

    # SVM
    ######
    patch = {
        u"train_percentage": [0.05 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"daddycool" + str(i) for i in range(20)],
        u"feature_selection": [None, "kbest"]
    }
    svm_args_patches = [
        {u"kernel": [u"rbf"], u"C": [1, 10, 100],
         u"gamma": [0.0, 0.001, 0.0001]},
        {u"kernel": [u"poly"], u"C": [1, 10, 100], u"degree": [2, 3, 4],
         u"gamma": [0.0, 0.001, 0.0001]},
        {u"kernel": [u"linear"], u"C": [1, 10, 100]},
    ]
    for argpatch in svm_args_patches:
        for argconfig in apply_dict_combinations({}, argpatch):
            base[u"classifier_args"] = argconfig
            for config in apply_dict_combinations(base, patch):
                yield config

    # Adaboost
    ###########
    # Reuse (mutate) the same base dict for the AdaBoost sweep.
    base.update({
        u"classifier": u"adaboost",
        u"feature_selection_dimension": None,
        u"scaler": False,
    })
    patch = {
        u"train_percentage": [0.05 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"daddycool" + str(i) for i in range(10)]
    }
    argpatch = {
        u"n_estimators": [5, 10, 20, 50],
        u"learning_rate": [0.9, 1.0, 1.1],
        u"max_depth": [1, 2, 3]
    }
    for argconfig in apply_dict_combinations({}, argpatch):
        base[u"classifier_args"] = argconfig
        for config in apply_dict_combinations(base, patch):
            yield config
def iter_configs(input_file_path, dbname):
    """Yield bootstrap experiment configs (version 4).

    Like the earlier bootstrap rounds, but: combinations where the fact
    threshold distance is smaller than the evidence one are skipped, and
    SVM ``predict_proba`` configs are rewritten to ``decision_function``
    (disabling sparse input for rbf kernels) instead of being split.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Context manager fixes the original's leaked file handle.
    with open(input_file_path, 'rb') as f:
        hasher = hashlib.md5(f.read())
    base = {
        # Experiment configuration
        u'experiment': u'bootstrap',
        u'config_version': u'4',
        u'data_shuffle_seed': "a-ha",
        u'input_file_path': input_file_path,
        u'input_file_md5': hasher.hexdigest(),
        u'database_name': dbname,

        # Human In The Middle configuration
        u'answers_per_round': 5,
        u'max_number_of_rounds': 15,

        # Bootstrap configuration
        u'prediction_config': {
            u'method': u'decision_function',
            u'scale_to_range': [0.1, 0.9]
        },
        # threshold are expressed as delta to max, so it's uniformly
        # expressed having scaling enabled or not
        u'fact_threshold_distance': 0.01,
        u'evidence_threshold_distance': 0.01,
        u'questions_sorting': u'score',
        u'drop_guesses_each_round': True,
        u'seed_facts': {
            u'number_to_use': 5,
            u'shuffle': u"it's a trap"
        },

        # Classifier configuration
        u'classifier_config': {}  # to be filled with each candidate-classifier
    }
    prediction_patch = {
        u'method': [u'predict', u'predict_proba'],
        u'scale_to_range': [None, [0.1, 0.9]]
    }
    prediction_options_range = list(
        apply_dict_combinations(base['prediction_config'], prediction_patch))
    seed_patch = {
        u'shuffle': [u"%i lemon and half lemon" % i for i in range(2)]
    }
    seed_options_range = list(
        apply_dict_combinations(base[u'seed_facts'], seed_patch))
    patch = {
        u'answers_per_round': [3, 5],
        u'prediction_config': prediction_options_range,
        u'fact_threshold_distance': [0.01, 0.05, 0.1],
        u'evidence_threshold_distance': [0.05, 0.1],
        u'questions_sorting': [u'score', u'certainty'],
        u'seed_facts': seed_options_range
    }
    for classifier_config in candidate_classifiers:
        # NOTE(review): classifier_config is not deep-copied here, so the
        # in-place mutations below may alias across yielded configs —
        # confirm apply_dict_combinations copies nested dicts.
        base[u'classifier_config'] = classifier_config
        for config in apply_dict_combinations(base, patch):
            # Threshold adjustments
            if config[u'fact_threshold_distance'] < config[
                    u'evidence_threshold_distance']:
                # Based on champions of prior rounds, delta for facts shall
                # be greater than delta for evidences.
                # skipping...
                continue
            max_score = 1.0
            if config[u'prediction_config']['scale_to_range']:
                max_score = max(config[u'prediction_config']['scale_to_range'])
            config[u'fact_threshold'] = max_score - config.pop(
                u'fact_threshold_distance')
            config[u'evidence_threshold'] = max_score - config.pop(
                u'evidence_threshold_distance')
            if (config[u'classifier_config'][u'classifier'] == u'svm' and
                    config[u'prediction_config'][u'method'] ==
                    u'predict_proba'):
                # For SVMs, predict_proba will be replaced with
                # decision_function:
                # http://scikit-learn.org/stable/modules/svm.html#scores-and-probabilities
                config[u'classifier_config'][u'classifier_args'][
                    u'probability'] = False
                config[u'prediction_config'][u'method'] = u'decision_function'
                # Also, decision_function with rbf doesn't run on sparse
                # matrixes.
                if config[u'classifier_config'][u'classifier_args'].get(
                        'kernel', '') == 'rbf':
                    config[u'classifier_config'][u'sparse'] = False
            yield config
def iter_configs(input_file_path, dbname):
    """Yield SVM class-weight grid-search experiment configs.

    Two sweeps over *input_file_path* / *dbname*: an rbf-kernel grid over
    gamma and class-weight pairs, then a degree-4 poly-kernel grid that
    additionally varies the kbest feature-selection dimension.
    """
    input_file_path = os.path.abspath(input_file_path)
    # Context manager fixes the original's leaked file handle.
    with open(input_file_path, "rb") as f:
        hasher = hashlib.md5(f.read())
    base = {
        # Experiment configuration
        u"config_version": u"1",
        u"data_shuffle_seed": None,
        u"train_percentage": None,
        u"input_file_path": input_file_path,
        u"input_file_md5": hasher.hexdigest(),
        u"database_name": dbname,

        # Classifier configuration
        u"classifier": u"svm",
        u"classifier_args": dict(),
        u"dimensionality_reduction": None,
        u"dimensionality_reduction_dimension": None,
        u"feature_selection": None,
        u"feature_selection_dimension": None,
        u"scaler": True,
        u"sparse": False,
        u"features": make_feature_list(u"""
                bag_of_words
                bag_of_pos
                bag_of_word_bigrams
                bag_of_wordpos
                bag_of_wordpos_bigrams
                bag_of_words_in_between
                bag_of_pos_in_between
                bag_of_word_bigrams_in_between
                bag_of_wordpos_in_between
                bag_of_wordpos_bigrams_in_between
                entity_order
                entity_distance
                other_entities_in_between
                in_same_sentence
                verbs_count_in_between
                verbs_count
                total_number_of_entities
                symbols_in_between
                number_of_tokens
                BagOfVerbStems True
                BagOfVerbStems False
                BagOfVerbLemmas True
                BagOfVerbLemmas False
            """)
    }

    # Class weights: keys are the boolean relation labels
    # (True = positive evidence, False = negative).
    class_weights = [
        {True: 10, False: 1},
        {True: 1, False: 10},
        {True: 1, False: 1},
        {True: 1, False: 0.1},
        {True: 0.1, False: 1},
    ]

    # RBF
    ######
    patch = {
        u"train_percentage": [0.07 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"sussieq" + str(i) for i in range(20)]
    }
    argpatch = {
        u"kernel": [u"rbf"],
        u"gamma": [0.0, 1e-4, 1e-5, 1e-6],
        u"class_weight": class_weights
    }
    for argconfig in apply_dict_combinations({}, argpatch):
        base[u"classifier_args"] = argconfig
        for config in apply_dict_combinations(base, patch):
            yield config

    # POLY
    #######
    base[u"feature_selection"] = "kbest"
    patch = {
        u"train_percentage": [0.07 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"sussieq" + str(i) for i in range(20)],
        u"feature_selection_dimension": [500, 1000, 2000, 4000]
    }
    argpatch = {
        u"kernel": [u"poly"],
        u"degree": [4],
        u"gamma": [0.0, 1e-4, 1e-5, 1e-6],
        u"class_weight": class_weights
    }
    for argconfig in apply_dict_combinations({}, argpatch):
        base[u"classifier_args"] = argconfig
        for config in apply_dict_combinations(base, patch):
            yield config