def generate_candidates(num_to_generate, out_path, pos, neg, exclude):

    # Open the file to which we will write candidates
    out_f = open(out_path, 'w')

    # Read in the extracted features, which we'll also need for a couple things
    features = extract_features.FeatureAccumulator(
        load=BEST_WORDNET_ONLY_FEATURES_PATH)

    # Make the best performing classifier.  This is what we'll use to score
    # the "relationalness" of new words.
    clf = classifier.make_classifier(
        kind='svm',
        features=features,
        positives=pos,
        negatives=neg,
        **BEST_CLASSIFIER_CONFIG
    )

    # Now generate the candidates.  We only keep track of the number of
    # positives generated, because there are always more negatives
    num_generated = 0
    for token in features.dictionary.get_token_list():
        if token in exclude:
            print '\t\tx\t%s' % token
            continue
        score = clf.score(token)[0]
        if score > clf.threshold:
            print '%s\t+' % token
            out_f.write('%s\t+\t%f\n' % (token, score))
            num_generated += 1
            if num_generated == num_to_generate:
                break
        else:
            print '\t-\t%s' % token
            out_f.write('%s\t-\t%f\n' % (token, score))
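# A minimal usage sketch for generate_candidates.  The seed sets and output
# path below are hypothetical; only the parameter names come from the
# function above.
def example_generate_candidates():
    seed_positives = set(['mother', 'employer'])    # hypothetical seed words
    seed_negatives = set(['rock', 'bicycle'])       # hypothetical seed words
    generate_candidates(
        num_to_generate=500,                        # stop after 500 positives
        out_path='candidates.tsv',                  # hypothetical output path
        pos=seed_positives,
        neg=seed_negatives,
        exclude=seed_positives | seed_negatives,    # don't re-score the seeds
    )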
def get_map_evaluator(
    kind='svm',
    on_unk=False,
    syntax_feature_types=['baseline', 'dependency', 'hand_picked'],
    semantic_similarity=None,
    syntactic_multiplier=1.0,
    semantic_multiplier=1.0,
    k=3,
):
    classifier = c.make_classifier(
        kind=kind,
        on_unk=on_unk,
        syntax_feature_types=syntax_feature_types,
        semantic_similarity=semantic_similarity,
        syntactic_multiplier=syntactic_multiplier,
        semantic_multiplier=semantic_multiplier,
        k=k,
    )
    train_positives, train_negatives, train_neutrals = get_train_sets()
    test_positives, test_negatives, test_neutrals = get_test_sets()
    evaluator = RelationalNounMapEvaluator(
        classifier, train_positives, train_negatives,
        test_positives, test_negatives)
    return evaluator
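# Hypothetical example of building a MAP evaluator with non-default settings.
# Only the parameter names come from get_map_evaluator above; the particular
# values (including the 'cosine' similarity name) are assumptions.
def example_get_map_evaluator():
    evaluator = get_map_evaluator(
        kind='svm',
        syntax_feature_types=['baseline', 'dependency', 'hand_picked'],
        semantic_similarity='cosine',   # assumed similarity-measure name
        k=5,
    )
    return evaluator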
def test_classifier_with_mass_fallback():
    clf = classifier.make_classifier(fallback_enabled=True)
    assert clf is not None

    predictions = classifier.predict(clf, dragoon_data)
    assert predictions[0] == True
    assert predictions[1] == True
def test_make_classifier():
    clf = classifier.make_classifier()
    assert clf is not None

    predictions = classifier.predict(clf, dragoon_data)
    assert predictions[0] == True
    assert predictions[1] == True
def test_harris_paper_data():
    # Load up the real harris data from the paper
    data = pd.read_csv('tests/table_harris.csv')
    paper_prediction = data['BHS']
    paper_prediction_fallback = data['BHS (Fallback)']
    data = data.drop(columns=['BHS', 'BHS (Fallback)', 'Cluster Name'])

    clf = classifier.make_classifier()
    clf_fallback = classifier.make_classifier(fallback_enabled=True)
    assert clf is not None
    assert clf_fallback is not None

    assert arrays_almost_same(paper_prediction.values, clf.predict(data))
    assert arrays_almost_same(paper_prediction_fallback.values,
                              clf_fallback.predict(data))
def test_classifier_no_estimate():
    clf = classifier.make_classifier(use_relaxation_time_estimate=False)
    assert clf is not None

    # Should raise since dragoon_data lacks Half-Mass Relaxation Time
    with pytest.raises(KeyError):
        predictions = classifier.predict(clf, dragoon_data)

    # Let's add some fake values and make sure it works
    dragoon_copy = dragoon_data.copy()
    dragoon_copy['Half-Mass Relaxation Time'] = 10000
    predictions = classifier.predict(clf, dragoon_copy)
    assert len(predictions) == 3
def generate_candidates_ordinal(
    num_to_generate, out_path, pos, neg, neut, exclude,
    kernel=None, features=None
):

    # Open the file to which we will write candidates
    out_f = open(out_path, 'w')

    # Read in the extracted features, which we'll also need for a couple things
    if features is None:
        features = extract_features.FeatureAccumulator(
            load=BEST_WORDNET_ONLY_FEATURES_PATH)

    # Make the best performing classifier.  This is what we'll use to score
    # the "relationalness" of new words.
    clf = classifier.make_classifier(
        kind='osvm',
        kernel=kernel,
        features=features,
        positives=pos,
        negatives=neg,
        neutrals=neut,
        **BEST_CLASSIFIER_CONFIG
    )

    # Now generate the candidates.  We only keep track of the number of
    # positives generated, because there are always more negatives
    num_generated = 0
    filtered_tokens = [
        t for t in features.dictionary.get_token_list()
        if t not in exclude
    ]
    for token, score in clf.score_parallel(filtered_tokens):
        if score >= 1:
            print '%s\t+' % token
            out_f.write('%s\t+\t%f\n' % (token, score))
            num_generated += 1
            if num_generated == num_to_generate:
                break
        elif score > -1:
            print '\t0\t%s' % token
            out_f.write('%s\t0\t%f\n' % (token, score))
        else:
            print '\t-\t%s' % token
            out_f.write('%s\t-\t%f\n' % (token, score))
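# Sketch: reuse a single FeatureAccumulator across ordinal candidate runs so
# the feature file is only loaded once.  The paths and seed sets here are
# hypothetical; the parameter names come from generate_candidates_ordinal
# above.
def example_generate_candidates_ordinal():
    shared_features = extract_features.FeatureAccumulator(
        load=BEST_WORDNET_ONLY_FEATURES_PATH)
    seed_positives = set(['mother', 'employer'])    # hypothetical seed words
    seed_negatives = set(['rock', 'bicycle'])       # hypothetical seed words
    seed_neutrals = set(['neighbor'])               # hypothetical neutral seed
    generate_candidates_ordinal(
        num_to_generate=200,
        out_path='candidates-ordinal.tsv',          # hypothetical output path
        pos=seed_positives,
        neg=seed_negatives,
        neut=seed_neutrals,
        exclude=seed_positives | seed_negatives | seed_neutrals,
        features=shared_features,                   # skip re-loading features
    )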
def evaluate_classifier(name, classifier_definition, features, out_path=None):
    """
    Assesses the performance of the classifier defined by the dictionary
    ``classifier_definition``.  That dictionary should provide the arguments
    needed to construct the classifier when provided to the function
    classifier.make_classifier.
    """
    print 'evaluating:', json.dumps(classifier_definition, indent=2)
    print 'writing result to: %s' % out_path
    errors_path = classifier_definition.get(
        'error-analysis-path', 'analyze-errors.tsv')
    print 'errors_path:', errors_path
    analyze_errors_f = open(os.path.join(DATA_DIR, errors_path), 'w')
    if out_path is not None:
        out_file = open(out_path, 'a')

    # Some of the "classifier definition" settings control the train / test
    # data and features supplied to the classifier, rather than the
    # classifier's config.  Handle those settings now.

    # Get the desired dataset
    data_source = classifier_definition.get('data_source', None)
    seed = classifier_definition.get('seed', 0)
    print 'seed:', seed
    if data_source == 'seed':
        train, test = utils.get_train_test_seed_split(features.dictionary)
    elif data_source == 'crowdflower-annotated-top':
        train, test = annotations.Annotations(
            features.dictionary).get_train_test('top', seed=seed)
    elif data_source == 'crowdflower-annotated-rand':
        train, test = annotations.Annotations(
            features.dictionary).get_train_test('rand', seed=seed)
    elif data_source == 'crowdflower-dev-top':
        train, test = annotations.Annotations(
            features.dictionary).get_train_dev('top')
    elif data_source == 'crowdflower-dev-rand':
        train, test = annotations.Annotations(
            features.dictionary).get_train_dev('rand')
    elif data_source == 'crowdflower-dev':
        train, test = annotations.Annotations(
            features.dictionary).get_train_dev()
    else:
        raise ValueError('Unexpected data_source: %s' % data_source)

    # Binarize the dataset if desired by converting items labelled `neutral`
    # into either positive or negative.
    binarize_mode = classifier_definition.get('binarize_mode', None)
    if binarize_mode is not None:
        if binarize_mode == '+/0-':
            train['neg'] = train['neut'] | train['neg']
            test['neg'] = test['neut'] | test['neg']
        elif binarize_mode == '+0/-':
            train['pos'] = train['pos'] | train['neut']
            test['pos'] = test['pos'] | test['neut']
        else:
            raise ValueError('Unexpected binarize_mode: %s' % binarize_mode)
        train['neut'] = set()
        test['neut'] = set()

    # Convert the training and test sets into a vectorized format.  In the
    # "kernel" format, the feature vectors are just token ids (the kernel
    # function is "smart" and knows how to compute the kernels given ids).
    data_format = classifier_definition.get('data_format')
    if data_format == 'kernel':
        X_train, Y_train = utils.make_kernel_vector(train, features)
        X_test, Y_test = utils.make_kernel_vector(test, features)
        classifier_definition['verbose'] = 1
        precomputed_kernel = kernels.PrecomputedKernel(
            features, classifier_definition)

        # Trigger the various necessary lazy calculations on the features
        # class by doing one kernel calculation
        precomputed_kernel.eval_pair_token('car', 'tree')

        # Now precompute values of the kernel for all example pairs
        num_processes = classifier_definition.get('kernel_processes', 4)
        precomputed_kernel.precompute_parallel(
            examples=X_train + X_test, num_processes=num_processes)
        classifier_definition['pre-bound-kernel'] = precomputed_kernel

    # Or convert the training and test sets into a numpy sparse matrix format
    elif data_format == 'vector':
        count_based_features = classifier_definition.get(
            'count_based_features')
        non_count_features = classifier_definition.get('non_count_features')
        count_feature_mode = classifier_definition.get('count_feature_mode')
        whiten = classifier_definition.get('whiten', False)
        feature_threshold = classifier_definition.get('feature_threshold', 0.5)
        Q_train, X_train, Y_train = utils.make_vectors(
            train, features, count_based_features, non_count_features,
            count_feature_mode, whiten=whiten, threshold=feature_threshold)
        Q_test, X_test, Y_test = utils.make_vectors(
            test, features, count_based_features, non_count_features,
            count_feature_mode, whiten=whiten, threshold=feature_threshold)

    else:
        raise ValueError('Unexpected data_format: %s' % data_format)

    # Allow automatically re-weighting the classes to help with unbalanced
    # classifications.
    if 'class_weight' in classifier_definition:
        if classifier_definition['class_weight'] == 'auto':
            classifier_definition['class_weight'] = get_class_weights(Y_train)

    # Make the classifier
    kind = classifier_definition.get('kind')
    clf = c.make_classifier(
        kind=kind, X_train=X_train, Y_train=Y_train, features=features,
        classifier_definition=classifier_definition)

    # We can either tune the decision threshold
    if classifier_definition.get('find_threshold', False):
        decision_threshold = clf.find_threshold(X_test, Y_test).tolist()

    # Or set it based on a prior fitted value
    elif classifier_definition.get('use_threshold', None) is not None:
        decision_threshold = classifier_definition['use_threshold']
        clf.set_threshold(decision_threshold)

    # Or stick with the default decision threshold built into the classifier
    else:
        decision_threshold = None

    ## If binarize mode has been set, then the classifier is already
    ## configured to treat the problem as a binary classification,
    ## so finding the classification threshold is straightforward.
    #if binarize_mode is '+0/-':
    #    best_loose_f1, loose_threshold = utils.find_threshold(
    #        clf, X_test, Y_test)
    #    best_strict_f1, strict_threshold = None, None
    #    clf.set_threshold(threshold)
    #elif binarize_mode is '+/0-':
    #    best_strict_f1, strict_threshold = utils.find_threshold(
    #        clf, X_test, Y_test)
    #    best_loose_f1, loose_threshold = None, None
    #    clf.set_threshold(threshold)
    #else:
    #    best_strict_f1, strict_threshold = utils.find_threshold(
    #        clf, X_test, Y_test_strict, positive=set([1,0]),
    #        negative=set([-1])
    #    )
    #    best_loose_f1, loose_threshold = utils.find_threshold(
    #        clf, X_test, Y_test_strict, positive=set([1]),
    #        negative=set([0,-1])
    #    )

    # Test it on the test set, generating a confusion matrix
    Y_predicted = clf.predict(X_test)
    for word, pred, actual in zip(Q_test, Y_predicted, Y_test):
        if actual == 1:
            print '%s : %d' % (word, pred)
            analyze_errors_f.write('%s\t%d\n' % (word, pred))
    confusion_matrix = generate_confusion_matrix(clf, X_test, Y_test)

    # Calculate the F1 relative to each class, and the macro-average
    if binarize_mode is None:
        f1s, macro_f1 = calculate_f1(confusion_matrix)
        tight_f1 = f1s[1]
        loose_f1 = calculate_f1_loose(confusion_matrix)
    elif binarize_mode == '+0/-':
        loose_f1 = calculate_simple_f1(confusion_matrix)
        tight_f1 = None
        macro_f1 = None
    elif binarize_mode == '+/0-':
        tight_f1 = calculate_simple_f1(confusion_matrix)
        loose_f1 = None
        macro_f1 = None

    # Calculate the MAP
    AP = calculate_classifier_MAP(clf, X_test, Y_test, [1, 0])

    results = {
        'confusion_matrix': confusion_matrix,
        'precision': loose_f1[0],
        'recall': loose_f1[1],
        'f1': loose_f1[1],
        'AP': AP,
        'threshold': decision_threshold
    }

    performance_record = (
        '{"name":"%s",\n\n' % name
        + '"run-specification":' + json.dumps(classifier_definition, indent=2)
        + ',' + '\n\n'
        + '"results":' + json.dumps(results) + '}\n\n\n'
    )
    print performance_record
    if out_path is not None:
        out_file.write(performance_record)

    return clf, results
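# A sketch of the kind of classifier_definition dict that evaluate_classifier
# reads.  Every key below is consulted somewhere in the function above; the
# particular values are illustrative assumptions, not a tuned configuration.
EXAMPLE_CLASSIFIER_DEFINITION = {
    'kind': 'svm',
    'data_source': 'crowdflower-annotated-top',
    'seed': 0,
    'binarize_mode': '+/0-',
    'data_format': 'vector',
    'count_based_features': ['baseline', 'dependency'],   # assumed feature names
    'non_count_features': ['hand_picked'],                # assumed feature names
    'count_feature_mode': 'normalized',                   # assumed mode name
    'whiten': False,
    'feature_threshold': 0.5,
    'class_weight': 'auto',
    'find_threshold': True,
    'error-analysis-path': 'analyze-errors.tsv',
}
# Typical call (features is a FeatureAccumulator, as in the functions above):
#   clf, results = evaluate_classifier(
#       'example-run', EXAMPLE_CLASSIFIER_DEFINITION, features)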
train_filename = sys.argv[i]
i += 1

print >>sys.stderr, "reading labelled dataset from '" + train_filename + "'..."
input = open(train_filename, "r") if train_filename != "-" else sys.stdin
input.readline()
X = numpy.loadtxt(input, delimiter=",", dtype=numpy.uint8)
labels = X[:, 0]
X = X[:, 1:].astype(float)

print >>sys.stderr, "training KNN with", min(train_threshold, X.shape[0]), "training instances and k=", k, "..."
clf = make_classifier(preprocess(X[:train_threshold]), labels[:train_threshold],
                      name="KNN", params=[k])

print >>sys.stderr, "making predictions for", max(0, X.shape[0] - train_threshold), "instances ..."
predictions = clf.predict(preprocess(X[train_threshold:]))

print >>sys.stderr, "evaluating ..."
if verbose:
    for i in range(len(predictions)):
        print labels[train_threshold:][i], predictions[i]
        if labels[train_threshold:][i] != predictions[i]:
            print >>sys.stderr, "should be:", labels[train_threshold:][i], ", was:", predictions[i]
            put_image(X[train_threshold:][i], 0, sys.stderr)
            print >>sys.stderr
else:
    for i in range(len(predictions)):