Example #1
def generate_candidates(num_to_generate, out_path, pos, neg, exclude):

    # Open the file to which we will write candidates
    out_f = open(out_path, 'w')

    # Read in the extracted features, which we'll also need for a couple things
    features = extract_features.FeatureAccumulator(
        load=BEST_WORDNET_ONLY_FEATURES_PATH)

    # Make the best performing classifier.  This is what we'll use to score the
    # "relationalness" of new words.
    clf = classifier.make_classifier(kind='svm',
                                     features=features,
                                     positives=pos,
                                     negatives=neg,
                                     **BEST_CLASSIFIER_CONFIG)

    # Now generate the candidates.  We only keep track of the number of
    # positives generated, because there are always more negatives
    num_generated = 0
    for token in features.dictionary.get_token_list():
        if token in exclude:
            print '\t\tx\t%s' % token
            continue
        score = clf.score(token)[0]
        if score > clf.threshold:
            print '%s\t+' % token
            out_f.write('%s\t+\t%f\n' % (token, score))
            num_generated += 1
            if num_generated == num_to_generate:
                break
        else:
            print '\t-\t%s' % token
            out_f.write('%s\t-\t%f\n' % (token, score))
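Below is a minimal usage sketch for generate_candidates. The seed sets, the exclusion set, and the output path are hypothetical values chosen for illustration; they are not taken from the original repository.

# Hypothetical invocation of generate_candidates (names and paths are
# illustrative only).
seed_positives = {'father', 'employee', 'successor'}
seed_negatives = {'table', 'mountain', 'idea'}

generate_candidates(
    num_to_generate=500,
    out_path='candidates.tsv',
    pos=seed_positives,
    neg=seed_negatives,
    exclude=seed_positives | seed_negatives,
)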
Example #2
def get_map_evaluator(
    kind='svm',
    on_unk=False,
    syntax_feature_types=['baseline', 'dependency', 'hand_picked'],
    semantic_similarity=None,
    syntactic_multiplier=1.0,
    semantic_multiplier=1.0,
    k=3,
):
    classifier = c.make_classifier(
        kind=kind,
        on_unk=on_unk,
        syntax_feature_types=syntax_feature_types,
        semantic_similarity=semantic_similarity,
        syntactic_multiplier=syntactic_multiplier,
        semantic_multiplier=semantic_multiplier,
        k=k,
    )
    train_positives, train_negatives, train_neutrals = get_train_sets()
    test_positives, test_negatives, test_neutrals = get_test_sets()
    evaluator = RelationalNounMapEvaluator(classifier, train_positives,
                                           train_negatives, test_positives,
                                           test_negatives)

    return evaluator
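The factory above simply wires a classifier configuration into a RelationalNounMapEvaluator. A brief sketch of how it might be driven with two different configurations follows; the keyword values (including the 'res' similarity name) are assumptions made for illustration only.

# Hypothetical configurations exercising the parameters exposed above.
syntax_only = get_map_evaluator(
    kind='svm',
    syntax_feature_types=['baseline', 'dependency', 'hand_picked'],
    semantic_similarity=None,
)
with_semantics = get_map_evaluator(
    kind='svm',
    semantic_similarity='res',   # assumed similarity name, for illustration
    semantic_multiplier=2.0,
)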
Example #3
def test_classifier_with_mass_fallback():
    clf = classifier.make_classifier(fallback_enabled=True)
    assert clf is not None

    predictions = classifier.predict(clf, dragoon_data)
    assert predictions[0] == True
    assert predictions[1] == True
Example #4
def test_make_classifier():
    clf = classifier.make_classifier()
    assert clf is not None

    predictions = classifier.predict(clf, dragoon_data)
    assert predictions[0] == True
    assert predictions[1] == True
Example #5
def test_harris_paper_data():
    # Load up the real harris data from the paper
    data = pd.read_csv('tests/table_harris.csv')

    paper_prediction = data['BHS']
    paper_prediction_fallback = data['BHS (Fallback)']

    data = data.drop(columns=['BHS', 'BHS (Fallback)', 'Cluster Name'])

    clf = classifier.make_classifier()
    clf_fallback = classifier.make_classifier(fallback_enabled=True)
    assert clf is not None
    assert clf_fallback is not None

    assert arrays_almost_same(paper_prediction.values, clf.predict(data))
    assert arrays_almost_same(paper_prediction_fallback.values,
                              clf_fallback.predict(data))
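The arrays_almost_same helper used above is not shown. The sketch below is one plausible implementation: it treats two label arrays as "almost the same" when nearly all entries agree, which is a guess at the intended behaviour rather than the original helper.

import numpy as np

def arrays_almost_same(a, b, min_agreement=0.95):
    # Guess at the unshown helper: arrays match in shape and agree on at
    # least min_agreement of their entries.
    a, b = np.asarray(a), np.asarray(b)
    return a.shape == b.shape and np.mean(a == b) >= min_agreement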
Example #6
def test_classifier_no_estimate():
    clf = classifier.make_classifier(use_relaxation_time_estimate=False)
    assert clf is not None

    # Should raise since dragoon_data lacks Half-Mass Relaxation Time
    with pytest.raises(KeyError):
        predictions = classifier.predict(clf, dragoon_data)

    # Let's add some fake values and make sure it works
    dragoon_copy = dragoon_data.copy()
    dragoon_copy['Half-Mass Relaxation Time'] = 10000

    predictions = classifier.predict(clf, dragoon_copy)
    assert len(predictions) == 3
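These test examples also rely on a module-level dragoon_data fixture that is never shown. The sketch below is only a guess at its shape: a three-row pandas DataFrame (matching the len(predictions) == 3 assertion), with every column name except 'Half-Mass Relaxation Time' invented for illustration.

import pandas as pd

# Hypothetical stand-in for the unshown dragoon_data fixture.
dragoon_data = pd.DataFrame({
    'Cluster Name': ['Sim A', 'Sim B', 'Sim C'],   # illustrative labels
    'Mass': [1.2e5, 8.0e4, 2.5e5],                 # illustrative values
    'Core Radius': [0.9, 1.4, 0.6],                # illustrative values
})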
Example #7
def generate_candidates_ordinal(num_to_generate,
                                out_path,
                                pos,
                                neg,
                                neut,
                                exclude,
                                kernel=None,
                                features=None):

    # Open the file to which we will write candidates
    out_f = open(out_path, 'w')

    # Read in the extracted features, which we'll also need for a couple things
    if features is None:
        features = extract_features.FeatureAccumulator(
            load=BEST_WORDNET_ONLY_FEATURES_PATH)

    # Make the best performing classifier.  This is what we'll use to score the
    # "relationalness" of new words.
    clf = classifier.make_classifier(kind='osvm',
                                     kernel=kernel,
                                     features=features,
                                     positives=pos,
                                     negatives=neg,
                                     neutrals=neut,
                                     **BEST_CLASSIFIER_CONFIG)

    # Now generate the candidates.  We only keep track of the number of
    # positives generated, because there are always more negatives
    num_generated = 0
    filtered_tokens = [
        t for t in features.dictionary.get_token_list() if t not in exclude
    ]

    for token, score in clf.score_parallel(filtered_tokens):
        if score >= 1:
            print '%s\t+' % token
            out_f.write('%s\t+\t%f\n' % (token, score))
            num_generated += 1
            if num_generated == num_to_generate:
                break
        elif score > -1:
            print '\t0\t%s' % token
            out_f.write('%s\t0\t%f\n' % (token, score))
        else:
            print '\t-\t%s' % token
            out_f.write('%s\t-\t%f\n' % (token, score))
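This ordinal variant writes three bands (+, 0, -) based on the score and can reuse a pre-built feature accumulator. A sketch of a call is shown below, reusing the hypothetical seed sets from the Example #1 sketch and adding an equally hypothetical neutral set.

# Hypothetical invocation of the ordinal variant, sharing one feature
# accumulator across calls (paths and seed sets are illustrative).
seed_neutrals = {'driver', 'author'}
shared_features = extract_features.FeatureAccumulator(
    load=BEST_WORDNET_ONLY_FEATURES_PATH)

generate_candidates_ordinal(
    num_to_generate=500,
    out_path='candidates-ordinal.tsv',
    pos=seed_positives,
    neg=seed_negatives,
    neut=seed_neutrals,
    exclude=seed_positives | seed_negatives | seed_neutrals,
    kernel=None,               # use the classifier's default kernel
    features=shared_features,  # skip re-reading features from disk
)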
Example #8
def evaluate_classifier(name, classifier_definition, features, out_path=None):
    """
    Assesses the performance of the classifier defined by the dictionary
    ``classifier_definitions``.  That dictionary should provide the arguments
    needed to construct the classifier when provided to the function
    classifier.make_classifier.
    """

    print 'evaluating:', json.dumps(classifier_definition, indent=2)
    print 'writing result to:%s' % out_path

    errors_path = classifier_definition.get('error-analysis-path',
                                            'analyze-errors.tsv')
    print 'errors_path:', errors_path
    analyze_errors_f = open(os.path.join(DATA_DIR, errors_path), 'w')

    if out_path is not None:
        out_file = open(out_path, 'a')

    # Some of the "classifier definition" settings control the train / test
    # data and features supplied to the classifier, rather than the
    # classifier's config.  Handle those settings now.

    # Get the desired dataset
    data_source = classifier_definition.get('data_source', None)
    seed = classifier_definition.get('seed', 0)
    print 'seed:', seed
    if data_source == 'seed':
        train, test = utils.get_train_test_seed_split(features.dictionary)
    elif data_source == 'crowdflower-annotated-top':
        train, test = annotations.Annotations(
            features.dictionary).get_train_test('top', seed=seed)
    elif data_source == 'crowdflower-annotated-rand':
        train, test = annotations.Annotations(
            features.dictionary).get_train_test('rand', seed=seed)
    elif data_source == 'crowdflower-dev-top':
        train, test = annotations.Annotations(
            features.dictionary).get_train_dev('top')
    elif data_source == 'crowdflower-dev-rand':
        train, test = annotations.Annotations(
            features.dictionary).get_train_dev('rand')
    elif data_source == 'crowdflower-dev':
        train, test = annotations.Annotations(
            features.dictionary).get_train_dev()
    else:
        raise ValueError('Unexpected data_source: %s' % data_source)

    # Binarize the dataset if desired by converting items labelled `neutral`
    # into either positive or negative.
    binarize_mode = classifier_definition.get('binarize_mode', None)
    if binarize_mode is not None:
        if binarize_mode == '+/0-':
            train['neg'] = train['neut'] | train['neg']
            test['neg'] = test['neut'] | test['neg']
        elif binarize_mode == '+0/-':
            train['pos'] = train['pos'] | train['neut']
            test['pos'] = test['pos'] | test['neut']
        else:
            raise ValueError('Unexpected binarize_mode: %s' % binarize_mode)
        train['neut'] = set()
        test['neut'] = set()

    # Convert the training and test sets into a vectorized format. In the
    # "kernel" format, the feature vectors are just token ids (the kernel
    # function is "smart" and knows how to compute the kernels given ids).
    data_format = classifier_definition.get('data_format')
    if data_format == 'kernel':
        X_train, Y_train = utils.make_kernel_vector(train, features)
        X_test, Y_test = utils.make_kernel_vector(test, features)
        classifier_definition['verbose'] = 1
        precomputed_kernel = kernels.PrecomputedKernel(features,
                                                       classifier_definition)

        # Trigger the various lazy calculations needed on the features class
        # by doing one kernel calculation
        precomputed_kernel.eval_pair_token('car', 'tree')
        # Now precompute values of the kernel for all example pairs
        num_processes = classifier_definition.get('kernel_processes', 4)
        precomputed_kernel.precompute_parallel(examples=X_train + X_test,
                                               num_processes=num_processes)
        classifier_definition['pre-bound-kernel'] = precomputed_kernel

    # Or convert the training and test sets into a numpy sparse matrix format
    elif data_format == 'vector':
        count_based_features = classifier_definition.get(
            'count_based_features')
        non_count_features = classifier_definition.get('non_count_features')
        count_feature_mode = classifier_definition.get('count_feature_mode')
        whiten = classifier_definition.get('whiten', False)
        feature_threshold = classifier_definition.get('feature_threshold', 0.5)

        Q_train, X_train, Y_train = utils.make_vectors(
            train,
            features,
            count_based_features,
            non_count_features,
            count_feature_mode,
            whiten=whiten,
            threshold=feature_threshold)
        Q_test, X_test, Y_test = utils.make_vectors(
            test,
            features,
            count_based_features,
            non_count_features,
            count_feature_mode,
            whiten=whiten,
            threshold=feature_threshold)

    else:
        raise ValueError('Unexpected data_format: %s' % data_format)

    # Allow automatically re-weighting the class to help with unbalanced
    # classifications.
    if 'class_weight' in classifier_definition:
        if classifier_definition['class_weight'] == 'auto':
            classifier_definition['class_weight'] = get_class_weights(Y_train)

    # Make the classifier
    kind = classifier_definition.get('kind')
    clf = c.make_classifier(kind=kind,
                            X_train=X_train,
                            Y_train=Y_train,
                            features=features,
                            classifier_definition=classifier_definition)

    # We can either tune the decision threshold
    if classifier_definition.get('find_threshold', False):
        decision_threshold = clf.find_threshold(X_test, Y_test).tolist()

    # Or set it based on a prior fitted value
    elif classifier_definition.get('use_threshold', None) is not None:
        decision_threshold = classifier_definition['use_threshold']
        clf.set_threshold(decision_threshold)

    # Or stick with the default decision threshold built into the classifier
    else:
        decision_threshold = None

        ## If binarize mode has been set, then the classifier is already
        ## configured to treat the problem as a binary classification,
        ## so finding the classification threshold is straightforward.
        #if binarize_mode is '+0/-'
        #    best_loose_f1, loose_threshold = utils.find_threshold(
        #        clf, X_test, Y_test)
        #    best_strict_f1, strict_threshold = None, None
        #    clf.set_threshold(threshold)

        #elif binarize_mode is '+/0-'
        #    best_strict_f1, strict_threshold = utils.find_threshold(
        #        clf, X_test, Y_test)
        #    best_loose_f1, loose_threshold = None, None
        #    clf.set_threshold(threshold)

        #else:
        #    best_strict_f1, strict_threshold = utils.find_threshold(
        #        clf, X_test, Y_test_strict, positive=set([1,0]),
        #        negative=set([-1])
        #    )
        #    best_loose_f1, loose_threshold = utils.find_threshold(
        #        clf, X_test, Y_test_strict, positive=set([1]),
        #        negative=set([0,-1])
        #    )

    # Test it on the test set, generating a confusion matrix

    Y_predicted = clf.predict(X_test)
    for word, pred, actual in zip(Q_test, Y_predicted, Y_test):
        if actual == 1:
            print '%s : %d' % (word, pred)
            analyze_errors_f.write('%s\t%d\n' % (word, pred))

    confusion_matrix = generate_confusion_matrix(clf, X_test, Y_test)

    # Calculate the F1 relative to each class, and the macro-average
    if binarize_mode is None:
        f1s, macro_f1 = calculate_f1(confusion_matrix)
        tight_f1 = f1s[1]
        loose_f1 = calculate_f1_loose(confusion_matrix)

    elif binarize_mode == '+0/-':
        loose_f1 = calculate_simple_f1(confusion_matrix)
        tight_f1 = None
        macro_f1 = None

    elif binarize_mode == '+/0-':
        tight_f1 = calculate_simple_f1(confusion_matrix)
        loose_f1 = None
        macro_f1 = None

    # Calculate the MAP
    AP = calculate_classifier_MAP(clf, X_test, Y_test, [1, 0])

    results = {
        'confusion_matrix': confusion_matrix,
        'precision': loose_f1[0],
        'recall': loose_f1[1],
        'f1': loose_f1[1],
        'AP': AP,
        'threshold': decision_threshold
    }

    performance_record = ('{"name":"%s",\n\n' % name + '"run-specification":' +
                          json.dumps(classifier_definition, indent=2) + ',' +
                          '\n\n' + '"results":' + json.dumps(results) +
                          '}\n\n\n')

    print performance_record
    if out_path is not None:
        out_file.write(performance_record)

    return clf, results
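For reference, here is a hedged sketch of the kind of classifier_definition dictionary the function above consumes. It uses only keys that evaluate_classifier actually reads ('kind', 'data_source', 'seed', 'binarize_mode', 'data_format', and the vector-format feature settings); the specific values are illustrative guesses, not a configuration from the original experiments.

example_definition = {
    'kind': 'svm',
    'data_source': 'crowdflower-annotated-top',
    'seed': 0,
    'binarize_mode': '+0/-',
    'data_format': 'vector',
    'count_based_features': ['baseline', 'dependency'],  # illustrative
    'non_count_features': ['suffix'],                    # illustrative
    'count_feature_mode': 'normalized',                  # illustrative
    'whiten': False,
    'feature_threshold': 0.5,
    'class_weight': 'auto',
    'find_threshold': True,
}

# clf, results = evaluate_classifier(
#     'example-run', example_definition, features, out_path='results.json')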
Example #9
		train_filename = sys.argv[i]
	i += 1

print >>sys.stderr, "reading labelled dataset from '" + train_filename + "'..."

input = open(train_filename, "r") if train_filename != "-" else sys.stdin

input.readline()

X = numpy.loadtxt(input, delimiter=",", dtype=numpy.uint8)

labels = X[:,0]
X=X[:,1:].astype(float)

print >>sys.stderr, "training KNN with", min(train_threshold, X.shape[0]), "training instances and k=", k, "..."
clf = make_classifier(preprocess(X[:train_threshold]), labels[:train_threshold], name="KNN", params=[k])

print >>sys.stderr, "making predicitions for", max(0,X.shape[0]-train_threshold), "instances ..."
predictions = clf.predict(preprocess(X[train_threshold:]))

print >>sys.stderr, "evaluating ..."

if verbose:
	for i in range(len(predictions)):
		print labels[train_threshold:][i], predictions[i]
		if labels[train_threshold:][i] != predictions[i]:
			print >>sys.stderr, "should be:", labels[train_threshold:][i], ", was:", predictions[i]
			put_image(X[train_threshold:][i], 0, sys.stderr)
			print >>sys.stderr
else:
	for i in range(len(predictions)):