def single_features():
    """Evaluate every single feature with every classifier, with and without
    class balancing, over 15 repeated runs; report a confidence interval.

    For each (balanced, classifier, feature) combination the test accuracy is
    collected 15 times, a confidence interval [lo, hi] is computed, and a
    LaTeX-style table row is printed and appended to
    out/conf_int/base/<feature>.txt.
    """
    train = get_train_dev_data()
    test = get_test_data()
    # Run the unbalanced pass first, then the balanced one.
    for balanced in (False, True):
        for classifier in classifier_names:
            # Some classifiers have no class-weight support; skip those
            # in the balanced pass.
            if balanced and classifier in classifiers_not_supporting_balancing:
                continue
            for feature_name in all_feature_names:
                print(feature_name)
                accs = []
                # 15 repetitions to estimate run-to-run variance.
                for run in range(15):
                    run_params = SingleBaseParams(
                        feature_name,
                        classifier,
                        dirname=sklearn_master_name,
                        params={'balanced': balanced},
                        pca=False,
                    )
                    acc = generate_probabilities(
                        train, test, run_params, test_only=True)
                    print('\t', run, feature_name, classifier, acc)
                    accs.append(acc * 100)
                lo, hi = confidence_intervals(accs)
                # Midpoint of the interval is reported as the accuracy.
                mid = float(np.mean([lo, hi]))
                balanced_str = "balanced" if balanced else "not_balanced"
                results = "%s & %s & %s & %.3f & %.1f-%.1f" % (
                    balanced_str, feature_name, classifier, mid, lo, hi)
                print(results)
                # Append so results from multiple invocations accumulate.
                with open("out/conf_int/base/" + feature_name + ".txt", 'a') as f:
                    f.write("%s\n" % results)
def multiple_features(dirname):
    """Evaluate the most promising feature subsets with every classifier
    over 10 repeated runs; report confidence intervals and track the best.

    Args:
        dirname: experiment directory name; also selects which precomputed
            feature subsets to load and where results are appended
            (out/conf_int/<dirname>/all.txt).
    """
    train = get_train_dev_data()
    test = get_test_data()
    all_subsets = get_feature_name_sets_with_highest_probable_accuracy(dirname)
    n_features = len(all_subsets)
    n_classifiers = len(classifier_names)
    print(n_features)
    max_a = 0.
    # Only the unbalanced setting is evaluated here; keep the list so the
    # balanced pass can be re-enabled by adding True.
    balanced_options = [False]
    n_balanced = len(balanced_options)
    # Total combinations, used for the progress percentage.
    # BUG FIX: the original formula scaled the balanced index by 2 instead of
    # n_classifiers and divided by n_classifiers * n_features * 2 * 2, so the
    # reported progress was mis-weighted and never approached 100%.
    total = n_balanced * n_classifiers * n_features
    for i_bal, balanced in enumerate(balanced_options):
        for ic, classifier in enumerate(classifier_names):
            if balanced and classifier in classifiers_not_supporting_balancing:
                continue
            for ifnl, feature_names_list in enumerate(all_subsets):
                print('Balanced', balanced)
                print('Classifier: ', ic, ' out of ', n_classifiers, classifier)
                print('Features ', ifnl, ' out of ', n_features)
                done = (i_bal * n_classifiers + ic) * n_features + ifnl
                print('Progress ', done / total * 100., '%')
                accs = []
                # 10 repetitions to estimate run-to-run variance.
                for i in range(10):
                    params = MultiBaseParams(
                        features=feature_names_list,
                        classifier=classifier,
                        dirname=dirname,
                        pca=False,
                        params={'balanced': balanced},
                    )
                    a = generate_probabilities(train, test, params, test_only=True)
                    accs.append(a * 100)
                l, r = confidence_intervals(accs)
                # Midpoint of the interval is reported as the accuracy.
                m = float(np.mean([l, r]))
                balanced_str = "balanced" if balanced else "not_balanced"
                results = "%s & %s & %s & %.3f & %.1f-%.1f" % (
                    balanced_str, ' '.join(feature_names_list), classifier,
                    m, l, r)
                print(results)
                # Append so results from multiple invocations accumulate.
                with open("out/conf_int/" + dirname + "/all.txt", 'a') as f:
                    f.write("%s\n" % results)
                max_a = max(max_a, m)
                print("max accuracy: %0.4f current accuracy %0.4f\n" % (max_a, m))
def multiple_features(test_only=False):
    """Run every classifier on every feature subset and persist the
    generated probabilities.

    NOTE(review): another ``multiple_features`` definition exists in this
    file; if both live in one module the later one shadows this one.

    Args:
        test_only: forwarded to ``generate_probabilities``; when True only
            the test split is scored.
    """
    train = get_train_dev_data()
    test = get_test_data()
    subsets = get_all_feature_subsets()
    for clf_name in classifier_names:
        for feature_subset in subsets:
            print(feature_subset)
            run_params = MultiBaseParams(
                features=feature_subset,
                classifier=clf_name,
                dirname="new",
                params={},
                pca=False,
            )
            generate_probabilities(train, test, run_params, test_only=test_only)
            # Persist the parameter configuration alongside the outputs.
            run_params.serialize()
def single_features():
    """Grid-search each classifier's hyperparameters on every single feature.

    The per-classifier parameter grids come from the module-level
    ``classifiers`` mapping (classifier name -> grid).
    """
    train = get_train_dev_data()
    test = get_test_data()
    for clf_name, grid in classifiers.items():
        for feature in all_feature_names:
            print(feature)
            search_params = SingleBaseParams(
                feature_name=feature,
                classifier=clf_name,
                dirname="new",
                params={},
                pca=False,
            )
            run_grid_search(
                train=train,
                test=test,
                params=search_params,
                parameters=grid,
            )
def single_features(test_only=False):
    """Generate probabilities for the publication-type feature alone,
    with every classifier, and persist each run's parameters.

    Args:
        test_only: forwarded to ``generate_probabilities``; when True only
            the test split is scored.
    """
    train = get_train_dev_data()
    test = get_test_data()
    # Only the publication-type feature is evaluated in this pass.
    features_to_run = [publication_type_key]
    for clf_name in classifier_names:
        for feature in features_to_run:
            print(feature)
            run_params = SingleBaseParams(
                feature_name=feature,
                classifier=clf_name,
                dirname="new",
                params={},
                pca=False,
            )
            generate_probabilities(
                train=train,
                test=test,
                params=run_params,
                test_only=test_only,
            )
            # Persist the parameter configuration alongside the outputs.
            run_params.serialize()
def multiple_features():
    """Grid-search each classifier's hyperparameters on a fixed pair of
    features (publication type + publication year).

    The per-classifier parameter grids come from the module-level
    ``classifiers`` mapping (classifier name -> grid).
    """
    train = get_train_dev_data()
    test = get_test_data()
    # Originally iterated get_all_feature_subsets(); restricted here to a
    # single hand-picked pair.
    subsets = [[publication_type_key, publication_year_key]]
    for clf_name, grid in classifiers.items():
        for feature_subset in subsets:
            print(feature_subset)
            search_params = MultiBaseParams(
                features=feature_subset,
                classifier=clf_name,
                dirname="new",
                params={},
                pca=False,
            )
            run_grid_search(train, test, search_params, parameters=grid)