Example #1
def pickle_similarities():
    """
    Pickle similarities based on all records
    """
    # TODO this is kind of wrong since the similarities will change as the word features are generated per split
    records = load_records()

    # set up extractor using desired features
    extractor = FeatureExtractor(word_gap=True, count_dict=True, phrase_count=True, word_features=5)
    extractor.create_dictionaries(records, how_many=5)

    data, _ = extractor.generate_features(records)
    vec = DictVectorizer()  # assumed: vectorise the feature dicts, as the other examples do
    data = vec.fit_transform(data).toarray()
    similarities = get_similarities(data)

    pickle.dump(similarities, open('pickles/similarities_all.p', 'wb'))
Example #2
def build_pipeline(bag_of_words):
    """
    Set up classifier and extractor here to avoid repetition
    """
    if bag_of_words:
        clf = Pipeline([('normaliser', preprocessing.Normalizer(norm='l2')),
                        ('svm', LinearSVC(dual=True, C=1))])
        # set up extractor using desired features
        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=False, pos=False, combo=True,
                                     entity_type=True, word_features=False, bag_of_words=True, bigrams=True)
        sim = pickle.load(open('pickles/cross_valid_similarities_orig_bag_of_words.p', 'rb'))

    else:
        clf = Pipeline([('normaliser', preprocessing.Normalizer(norm='l2')),
                        ('svm', SVC(kernel='rbf', gamma=100, cache_size=2000, C=10))])
        # set up extractor using desired features
        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=True, pos=True, combo=True,
                                     entity_type=True, word_features=False, bag_of_words=False, bigrams=False)
        # below must be used to create active features
        #extractor.create_dictionaries(all_records, how_many=5)
        sim = pickle.load(open('pickles/cross_valid_similarities_orig_features_only.p', 'rb'))

    return clf, extractor, sim
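
A minimal usage sketch of the build_pipeline above (hedged: load_records is a project helper not shown here, and since this pipeline has no vectoriser step the feature dicts are vectorised first, as the ROC example further down does):

from sklearn.feature_extraction import DictVectorizer

# Assumed usage: build the bag-of-words variant, extract features, vectorise
# the dicts outside the pipeline, then fit the classifier.
clf, extractor, sim = build_pipeline(bag_of_words=True)
records = load_records()                  # assumed project helper
features, labels = extractor.generate_features(records)
vec = DictVectorizer()
data = vec.fit_transform(features)        # sparse output is fine for Normalizer + LinearSVC
clf.fit(data, labels)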
Example #3
def pickle_similarities(orig_only, bag_of_words):
    """
    Pickle similarities for newly annotated data only
    """
    # TODO this is kind of wrong since the similarities will change as the word features are generated per split
    orig_records, new_records = load_records(orig_only)
    all_records = orig_records + new_records
    orig_length = len(orig_records)

    # set up extractor using desired features
    if bag_of_words:
        # FOR THE SPARSE LINEAR SVM
        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=False, pos=False, combo=True,
                                     entity_type=True, word_features=False, bag_of_words=True, bigrams=True)
        if orig_only:
            f_name = 'pickles/orig_no_accents_similarities_bag_of_words.p'
        else:
            f_name = 'pickles/similarities_bag_of_words.p'

    else:
        # FOR THE FEATURES ONE
        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=True, pos=True, combo=True,
                                     entity_type=True, word_features=False, bag_of_words=False, bigrams=False)
        # BELOW FOR SPECIFIC WORD FEATURES
        #extractor.create_dictionaries(all_records, how_many=5)
        #extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=False, word_features=True)
        if orig_only:
            f_name = 'pickles/orig_no_accents_similarities_features_only.p'
        else:
            f_name = 'pickles/similarities_features_only.p'

    similarities = get_similarities(all_records, extractor, orig_length)

    # only want to pickle new data since orig always used for training
    pickle.dump(similarities[orig_length:], open(f_name, 'wb'))
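
As the final comment notes, only the rows for the newly annotated records are written out; a minimal sketch (assuming the non-orig_only, bag-of-words file name produced above) of reading the pickle back:

import pickle

# The pickled array covers the new records only, so row i here corresponds to
# record orig_length + i in the combined orig + new list.
with open('pickles/similarities_bag_of_words.p', 'rb') as f:
    new_sim = pickle.load(f)
print(len(new_sim))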
Example #4
def build_pipeline(which, train):
    """
    Set up classifier here to avoid repetition
    """
    if which == 'bag_of_words':
        clf = Pipeline([('vectoriser', DictVectorizer()),
                        #('scaler', preprocessing.StandardScaler(with_mean=False)),
                        ('normaliser', preprocessing.Normalizer(norm='l2')),
                        ('svm', LinearSVC(dual=True, C=1))])
        # set up extractor using desired features
        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=True, pos=False, combo=True,
                                     entity_type=True, word_features=False, bag_of_words=True, bigrams=True)

    elif which == 'word_features':
        clf = Pipeline([('vectoriser', DictVectorizer(sparse=False)),
                        #('scaler', preprocessing.StandardScaler(with_mean=False)),
                        ('normaliser', preprocessing.Normalizer()),
                        #('svm', SVC(kernel='poly', coef0=1, degree=2, gamma=10, C=1, cache_size=2000))])
                        #('svm', SVC(kernel='rbf', gamma=1, cache_size=1000, C=1))])
                        #('svm', SVC(kernel='linear', cache_size=1000, C=1))])
                        ('svm', LinearSVC(dual=True, C=1))])

        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=True, word_features=True,
                                     combo=True, pos=True, entity_type=True, bag_of_words=False, bigrams=False)
        extractor.create_dictionaries(train, how_many=5)

    else:
        clf = Pipeline([('vectoriser', DictVectorizer(sparse=False)),
                        #('scaler', preprocessing.StandardScaler(with_mean=False)),
                        ('normaliser', preprocessing.Normalizer()),
                        #('svm', SVC(kernel='poly', coef0=1, degree=3, gamma=1, C=1, cache_size=2000))])
                        #('svm', SVC(kernel='rbf', gamma=100, cache_size=1000, C=10))])
                        #('svm', SVC(kernel='linear', cache_size=1000, C=1))])
                        ('svm', LinearSVC(dual=True, C=1))])

        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=True, word_features=False,
                                     combo=True, pos=True, entity_type=True, bag_of_words=False, bigrams=False)

    return clf, extractor
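
A hypothetical driver for this build_pipeline (assumptions: load_records is a project helper not shown here, and the 400-record split point is made up); only the 'word_features' branch needs the training records, because create_dictionaries builds its word lists from them:

# Hypothetical usage: split the records, build pipeline and extractor, fit and score.
# This pipeline vectorises the feature dicts itself via DictVectorizer.
records = load_records()                      # assumed project helper
train, test = records[:400], records[400:]    # hypothetical split point
clf, extractor = build_pipeline('word_features', train)
train_data, train_labels = extractor.generate_features(train)
clf.fit(train_data, train_labels)
test_data, test_labels = extractor.generate_features(test)
print(clf.score(test_data, test_labels))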
Example #5
def plot_roc_curve():
    """
    Plot ROC curve; not cross-validated for now
    """
    clf = build_pipeline()
    extractor = FeatureExtractor(word_gap=True,
                                 word_features=True,
                                 count_dict=True,
                                 phrase_count=True)

    features, labels = load_features_data(extractor)
    # transform from dict into array for training
    vec = DictVectorizer()
    data = vec.fit_transform(features).toarray()

    # split data into train and test, may want to use cross validation later
    train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(
        data, labels, train_size=0.9, random_state=1)
    clf.fit(train_data, train_labels)

    confidence = clf.decision_function(test_data)
    fpr, tpr, thresholds = metrics.roc_curve(test_labels, confidence)
    auroc = metrics.auc(fpr, tpr)

    print len(fpr), len(tpr)
    # set up the figure
    plt.figure()
    #plt.grid()
    plt.xlabel('FP rate')
    plt.ylabel('TP rate')
    plt.title('Receiver operating characteristic')
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auroc)
    plt.plot([0, 1], [0, 1], 'k--')

    plt.legend(loc='best')
    filepath = 'results/' + time_stamped('roc.png')
    plt.savefig(filepath, format='png')
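
The docstring notes the curve is not cross-validated; a sketch of a pooled cross-validated variant (assuming clf, data and labels are built exactly as above, and keeping the old sklearn.cross_validation API used throughout these examples):

import numpy as np
from sklearn import cross_validation, metrics

# Pool decision values over stratified folds, then compute one ROC curve and
# AUC from the pooled scores.
labels = np.asarray(labels)
cv = cross_validation.StratifiedKFold(labels, shuffle=True, n_folds=10, random_state=0)
all_conf, all_true = [], []
for train_idx, test_idx in cv:
    clf.fit(data[train_idx], labels[train_idx])
    all_conf.extend(clf.decision_function(data[test_idx]))
    all_true.extend(labels[test_idx])
fpr, tpr, _ = metrics.roc_curve(all_true, all_conf)
print(metrics.auc(fpr, tpr))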
Example #6
def tune_parameters(type):
    """
    Find best parameters for given kernels (and features)
    """
    records = load_records()

    if type == 'bag_of_words':
        print 'bag of words'
        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=False,
                                     word_features=False,
                                     combo=True,
                                     pos=False,
                                     entity_type=True,
                                     bag_of_words=True,
                                     bigrams=True)
        data, labels = extractor.generate_features(records)
        cv = cross_validation.StratifiedKFold(labels,
                                              shuffle=True,
                                              n_folds=10,
                                              random_state=0)

        # use linear svm for sparse bag of words feature vector
        pipeline = Pipeline([('vectoriser', DictVectorizer()),
                             ('normaliser', preprocessing.Normalizer()),
                             ('svm', LinearSVC(dual=True))])

        param_grid = [{'svm__C': np.array([0.001, 0.1, 1, 10, 100])}]

    elif type == 'linear':
        print 'linear'
        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=True,
                                     word_features=False,
                                     combo=True,
                                     pos=False,
                                     entity_type=True,
                                     bag_of_words=False)
        data, labels = extractor.generate_features(records)
        cv = cross_validation.StratifiedKFold(labels,
                                              shuffle=True,
                                              n_folds=10,
                                              random_state=0)

        # use linear svm on the engineered (non bag-of-words) feature vector
        pipeline = Pipeline([('vectoriser', DictVectorizer()),
                             ('normaliser', preprocessing.Normalizer()),
                             ('svm', LinearSVC(dual=True))])

        param_grid = [{'svm__C': np.array([1, 10, 100, 1000, 10000])}]

    elif type == 'rbf':
        print 'rbf'
        # non bag-of-words features
        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=True,
                                     word_features=False,
                                     combo=True,
                                     pos=False,
                                     entity_type=True,
                                     bag_of_words=False)
        data, labels = extractor.generate_features(records)
        cv = cross_validation.StratifiedKFold(labels,
                                              shuffle=True,
                                              n_folds=10,
                                              random_state=0)

        # use rbf kernel svm on the engineered (non bag-of-words) feature vector
        pipeline = Pipeline([('vectoriser', DictVectorizer()),
                             ('normaliser', preprocessing.Normalizer()),
                             ('svm', SVC(kernel='rbf', cache_size=2000))])

        param_grid = [{
            'svm__C': np.array([1, 10, 100, 1000, 10000]),
            'svm__gamma': np.array([0.01, 0.1, 1, 100, 1000])
        }]

    elif type == 'poly':
        print 'poly'
        # non bag-of-words features
        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=True,
                                     word_features=False,
                                     combo=True,
                                     pos=False,
                                     entity_type=True,
                                     bag_of_words=False)
        data, labels = extractor.generate_features(records)
        cv = cross_validation.StratifiedKFold(labels,
                                              shuffle=True,
                                              n_folds=10,
                                              random_state=0)

        # use polynomial kernel svm on the engineered (non bag-of-words) feature vector
        pipeline = Pipeline([
            ('vectoriser', DictVectorizer()),
            ('normaliser', preprocessing.Normalizer()),
            #('svm', SVC(kernel='poly', cache_size=2000))])
            ('svm', SVC(kernel='poly', degree=3, C=1, gamma=2,
                        cache_size=2000))
        ])

        #param_grid = [{'svm__C': np.array([1, 10]), 'svm__gamma': np.array([1, 10]),
        #'svm__degree': np.array([2, 3, 4, 5]), 'svm__coef0': np.array([1, 2, 3, 4])}]
        param_grid = [{'svm__coef0': np.array([1, 2, 3, 4])}]

    elif type == 'sigmoid':
        print 'sigmoid'
        # non bag-of-words features
        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=True,
                                     word_features=False,
                                     combo=True,
                                     pos=False,
                                     entity_type=True,
                                     bag_of_words=False)
        data, labels = extractor.generate_features(records)
        cv = cross_validation.StratifiedKFold(labels,
                                              shuffle=True,
                                              n_folds=10,
                                              random_state=0)

        # use sigmoid kernel svm on the engineered (non bag-of-words) feature vector
        pipeline = Pipeline([('vectoriser', DictVectorizer()),
                             ('normaliser', preprocessing.Normalizer()),
                             ('svm', SVC(kernel='sigmoid', cache_size=2000))])

        param_grid = [{
            'svm__C': np.array([1, 10]),
            'svm__gamma': np.array([0.001, 0.1, 1, 10]),
            'svm__coef0': np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        }]

    clf = GridSearchCV(pipeline,
                       param_grid,
                       cv=cv,
                       scoring='f1',
                       n_jobs=-1,
                       verbose=True)
    clf.fit(data, labels)
    print clf.best_estimator_
    print clf.best_params_
    print clf.best_score_
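
An assumed driver for the grid searches above; each call prints the best estimator, parameters and mean cross-validated F1 score for that branch:

# Run every kernel/feature branch defined in tune_parameters.
for kernel in ('bag_of_words', 'linear', 'rbf', 'poly', 'sigmoid'):
    tune_parameters(kernel)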
Example #7
def create_results():
    """
    Get cross-validated scores for classifiers built with various parameters
    Write results to a CSV file for easy human analysis
    """
    # test a variety of features and algorithms
    clf = build_pipeline()

    # set up output file
    with open('results/feature_selection/bigram_features.csv', 'wb') as f_out:
        csv_writer = csv.writer(f_out, delimiter=',')
        csv_writer.writerow([
            'features', 'accuracy', 'auroc', 'true_P', 'true_R', 'true_F',
            'false_P', 'false_R', 'false_F', 'average_P', 'average_R',
            'average_F'
        ])

        # bag-of-words + bigram feature combinations
        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=False,
                                     word_features=False,
                                     combo=False,
                                     pos=False,
                                     entity_type=False,
                                     bag_of_words=True,
                                     bigrams=True)
        write_scores(csv_writer, clf, extractor, -1, 'bag of words + bigrams')
        print 'done'

        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=False,
                                     word_features=False,
                                     combo=False,
                                     pos=False,
                                     entity_type=True,
                                     bag_of_words=True,
                                     bigrams=True)
        write_scores(csv_writer, clf, extractor, -1,
                     'bag of words + bigrams + type')
        print 'done'

        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=False,
                                     word_features=False,
                                     combo=True,
                                     pos=False,
                                     entity_type=True,
                                     bag_of_words=True,
                                     bigrams=True)
        write_scores(csv_writer, clf, extractor, -1,
                     'bag of words + bigrams + type + combo set')
        print 'done'

        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=True,
                                     word_features=False,
                                     combo=True,
                                     pos=False,
                                     entity_type=True,
                                     bag_of_words=True,
                                     bigrams=True)
        write_scores(csv_writer, clf, extractor, -1,
                     'bag of words + bigrams + type + combo_set + phrase')
        print 'done'
        '''
        # BAG OF WORDS
        # first using all words and no other features
        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=False, word_features=False,
                                     combo=False, pos=False, entity_type=False, bag_of_words=True)
        write_scores(csv_writer, clf, extractor, -1, 'bag of words')
        print 'done'

        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=False, word_features=False,
                                     combo=False, pos=False, entity_type=True, bag_of_words=True)
        write_scores(csv_writer, clf, extractor, -1, 'bag of words, type')
        print 'done'

        extractor = FeatureExtractor(word_gap=True, count_dict=False, phrase_count=False, word_features=False,
                                     combo=False, pos=False, entity_type=False, bag_of_words=True)
        write_scores(csv_writer, clf, extractor, -1, 'bag of words, gap')
        print 'done'

        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=True, word_features=False,
                                     combo=False, pos=False, entity_type=False, bag_of_words=True)
        write_scores(csv_writer, clf, extractor, -1, 'bag of words, phrase count')
        print 'done'

        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=False, word_features=False,
                                     combo=False, pos=True, entity_type=False, bag_of_words=True)
        write_scores(csv_writer, clf, extractor, -1, 'bag of words, non-count pos')
        print 'done'

        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=False, word_features=False,
                                     combo=True, pos=False, entity_type=False, bag_of_words=True)
        write_scores(csv_writer, clf, extractor, -1, 'bag of words, non-count combo')
        print 'done'

        extractor = FeatureExtractor(word_gap=False, count_dict=True, phrase_count=False, word_features=False,
                                     combo=False, pos=True, entity_type=False, bag_of_words=True)
        write_scores(csv_writer, clf, extractor, -1, 'bag of words, count pos')
        print 'done'

        extractor = FeatureExtractor(word_gap=False, count_dict=True, phrase_count=False, word_features=False,
                                     combo=True, pos=False, entity_type=False, bag_of_words=True)
        write_scores(csv_writer, clf, extractor, -1, 'bag of words, count combo')
        print 'done'
        '''
        '''
        # NO BAG OF WORDS
        # first using all words and no other features
        #extractor = FeatureExtractor(word_gap=True, count_dict=True, phrase_count=True, word_features=False,
                                     #combo=True, pos=True, entity_type=True, bag_of_words=False)
        #write_scores(csv_writer, clf, extractor, -1, 'all')
        #print 'done'

        extractor = FeatureExtractor(word_gap=True, count_dict=False, phrase_count=True, word_features=False,
                                     combo=True, pos=True, entity_type=True, bag_of_words=False)
        write_scores(csv_writer, clf, extractor, -1, 'non-counting')
        print 'done'

        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=True, word_features=False,
                                     combo=True, pos=True, entity_type=True, bag_of_words=False)
        write_scores(csv_writer, clf, extractor, -1, 'no gap')
        print 'done'

        extractor = FeatureExtractor(word_gap=True, count_dict=False, phrase_count=True, word_features=False,
                                     combo=True, pos=True, entity_type=False, bag_of_words=False)
        write_scores(csv_writer, clf, extractor, -1, 'no type')
        print 'done'

        extractor = FeatureExtractor(word_gap=True, count_dict=False, phrase_count=False, word_features=False,
                                     combo=True, pos=True, entity_type=True, bag_of_words=False)
        write_scores(csv_writer, clf, extractor, -1, 'no phrase count')
        print 'done'

        extractor = FeatureExtractor(word_gap=True, count_dict=False, phrase_count=True, word_features=False,
                                     combo=True, pos=False, entity_type=True, bag_of_words=False)
        write_scores(csv_writer, clf, extractor, -1, 'no pos')
        print 'done'

        extractor = FeatureExtractor(word_gap=True, count_dict=False, phrase_count=True, word_features=False,
                                     combo=False, pos=True, entity_type=True, bag_of_words=False)
        write_scores(csv_writer, clf, extractor, -1, 'no combo')
        print 'done'
        '''
Example #8
def learning_method_comparison(splits, repeats):
    """
    Plot learning curves to compare accuracy of different learning methods
    """
    clf = build_pipeline()
    # set up extractor using desired features
    extractor = FeatureExtractor(word_gap=True,
                                 count_dict=True,
                                 phrase_count=True,
                                 word_features=0)

    # want to have original records AND data
    records = load_records(eu_adr_only=False)
    orig_data, orig_labels = extractor.generate_features(records)

    # TODO what is the deal here???
    # this needs to match whatever percentage is being used for testing
    samples_per_split = (0.8 / splits) * len(records)

    # if using density sampling only want to calculate similarities once
    sim = pickle.load(open('pickles/similarities_all.p', 'rb'))

    r_scores = np.zeros(shape=(repeats, splits, 3, 2))
    u_scores = np.zeros(shape=(repeats, splits, 3, 2))
    d_scores = np.zeros(shape=(repeats, splits, 3, 2))

    r_accuracy = np.zeros(shape=(repeats, splits))
    u_accuracy = np.zeros(shape=(repeats, splits))
    d_accuracy = np.zeros(shape=(repeats, splits))

    # run test with same starting conditions
    for i in xrange(repeats):
        print i
        # going to split the data here, then pass identical indices to the different learning methods
        all_indices = np.arange(len(records))

        # seed the shuffle here so can repeat experiment for different numbers of splits
        np.random.seed(2 * i)
        np.random.shuffle(all_indices)

        # take off 20% for testing
        test_indices = all_indices[:len(records) / 5]
        train_indices = all_indices[len(records) / 5:]

        # split the data here using cross validator and return
        r_scores[i], r_accuracy[i] = random_sampling(clf, extractor, records,
                                                     orig_data, orig_labels,
                                                     train_indices,
                                                     test_indices, splits)
        u_scores[i], u_accuracy[i] = uncertainty_sampling(
            clf, extractor, records, orig_data, orig_labels, train_indices,
            test_indices, splits)
        d_scores[i], d_accuracy[i] = density_sampling(clf, extractor, records,
                                                      orig_data, orig_labels,
                                                      train_indices,
                                                      test_indices, sim,
                                                      splits)

    # create array of scores to pass to plotter
    scores = [['Accuracy'], ['Precision'], ['Recall'], ['F-Score']]
    # accuracy scores
    scores[0].append(r_accuracy.mean(axis=0, dtype=np.float64))
    scores[0].append(u_accuracy.mean(axis=0, dtype=np.float64))
    scores[0].append(d_accuracy.mean(axis=0, dtype=np.float64))

    # average over the repeats
    r_scores = r_scores.mean(axis=0, dtype=np.float64)
    u_scores = u_scores.mean(axis=0, dtype=np.float64)
    d_scores = d_scores.mean(axis=0, dtype=np.float64)
    # then true and false
    r_scores = r_scores.mean(axis=2, dtype=np.float64)
    u_scores = u_scores.mean(axis=2, dtype=np.float64)
    d_scores = d_scores.mean(axis=2, dtype=np.float64)

    # using numpy slicing to select correct scores
    for i in xrange(3):
        scores[i + 1].append(r_scores[:, i])
        scores[i + 1].append(u_scores[:, i])
        scores[i + 1].append(d_scores[:, i])

    for i in xrange(4):
        draw_learning_comparison(splits, scores[i][1], scores[i][2],
                                 scores[i][3], samples_per_split, repeats,
                                 scores[i][0])
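
The score arrays above are shaped (repeats, splits, 3, 2): three rows for precision/recall/F-score and two columns for the 'true' and 'false' classes. A small sketch (shapes assumed from the code above) of how the averaging collapses that tensor into one curve per metric:

import numpy as np

# shape (repeats, splits, 3 metrics, 2 classes), as allocated above
repeats, splits = 2, 5
r_scores = np.zeros(shape=(repeats, splits, 3, 2))

per_split = r_scores.mean(axis=0).mean(axis=2)  # average repeats, then classes -> (splits, 3)
precision_curve = per_split[:, 0]               # what scores[1] receives
recall_curve = per_split[:, 1]                  # scores[2]
fscore_curve = per_split[:, 2]                  # scores[3]
print(precision_curve.shape)                    # (5,) == (splits,)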
Example #9
def build_pipeline(bag_of_words, orig_only):
    """
    Set up classifier and extractor here to avoid repetition
    """
    if bag_of_words == 1:
        # BAG OF WORDS FEATURES
        clf = Pipeline([('vectoriser', DictVectorizer()),
                        ('normaliser', preprocessing.Normalizer(norm='l2')),
                        ('svm', LinearSVC(dual=True, C=1))])
        # set up extractor using desired features
        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=False,
                                     pos=False,
                                     combo=True,
                                     entity_type=True,
                                     word_features=False,
                                     bag_of_words=True,
                                     bigrams=True)
        if orig_only:
            sim = pickle.load(
                open('pickles/orig_no_accents_similarities_bag_of_words.p',
                     'rb'))
        else:
            sim = pickle.load(open('pickles/similarities_bag_of_words.p',
                                   'rb'))

    elif bag_of_words == 2:
        # ACTIVELY GENERATED WORD FEATURES
        clf = Pipeline([('vectoriser', DictVectorizer(sparse=False)),
                        ('normaliser', preprocessing.Normalizer(norm='l2')),
                        ('svm', LinearSVC(dual=True, C=1))])
        # set up extractor using desired features
        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=True,
                                     pos=True,
                                     combo=True,
                                     entity_type=True,
                                     word_features=True,
                                     bag_of_words=False,
                                     bigrams=False,
                                     after=False)
        if orig_only:
            sim = pickle.load(
                open('pickles/orig_no_accents_similarities_features_only.p',
                     'rb'))
        else:
            sim = pickle.load(
                open('pickles/similarities_features_only.p', 'rb'))

    elif bag_of_words == 3:
        # NON-WORD FEATURES RBF
        clf = Pipeline([('vectoriser', DictVectorizer(sparse=False)),
                        ('normaliser', preprocessing.Normalizer(norm='l2')),
                        ('svm',
                         SVC(kernel='rbf', gamma=100, cache_size=2000, C=10))])
        #('svm', SVC(kernel='poly', coef0=1, degree=3, gamma=2, cache_size=2000, C=1))])
        #('svm', LinearSVC(dual=True, C=1))])
        # set up extractor using desired features
        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=True,
                                     pos=True,
                                     combo=True,
                                     entity_type=True,
                                     word_features=False,
                                     bag_of_words=False,
                                     bigrams=False)
        if orig_only:
            sim = pickle.load(
                open('pickles/orig_no_accents_similarities_features_only.p',
                     'rb'))
        else:
            sim = pickle.load(
                open('pickles/similarities_features_only.p', 'rb'))

    else:
        # NON-WORD FEATURES
        clf = Pipeline([
            ('vectoriser', DictVectorizer(sparse=False)),
            ('normaliser', preprocessing.Normalizer(norm='l2')),
            #('svm', SVC(kernel='rbf', gamma=100, cache_size=2000, C=10))])
            ('svm', LinearSVC(dual=True, C=1))
        ])
        # set up extractor using desired features
        extractor = FeatureExtractor(word_gap=False,
                                     count_dict=False,
                                     phrase_count=True,
                                     pos=True,
                                     combo=True,
                                     entity_type=True,
                                     word_features=False,
                                     bag_of_words=False,
                                     bigrams=False)
        if orig_only:
            sim = pickle.load(
                open('pickles/orig_no_accents_similarities_features_only.p',
                     'rb'))
        else:
            sim = pickle.load(
                open('pickles/similarities_features_only.p', 'rb'))

    return clf, extractor, sim
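
For reference, a minimal assumed call: the integer selects the variant (1 = bag of words + bigrams with a linear SVM, 2 = actively generated word features, 3 = non-word features with an RBF SVM, anything else = non-word features with a linear SVM), and orig_only only changes which similarities pickle is loaded.

# Assumed usage: build the RBF variant against the original-data-only similarities.
clf, extractor, sim = build_pipeline(bag_of_words=3, orig_only=True)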
Example #10
def learning_method_comparison(splits, repeats):
    """
    Plot learning curves to compare accuracy of different learning methods
    """
    clf = build_pipeline()
    # set up extractor using desired features
    extractor = FeatureExtractor(word_gap=True, count_dict=True, phrase_count=True, word_features=5)

    # want to have original records AND data
    records = load_records(eu_adr_only=False, orig_only=False)
    #print 'total records', len(records)

    # TODO what is the deal here???
    # this needs to match whatever percentage is being used for testing
    #samples_per_split = (0.8/splits) * len(records)
    samples_per_split = 4 * len(records)/(5 * splits)
    #print 'samples per split', samples_per_split

    # if using density sampling only want to calculate similarities once
    sim = pickle.load(open('pickles/similarities_all.p', 'rb'))

    r_scores = np.zeros(shape=(repeats, splits, 3, 2))
    u_scores = np.zeros(shape=(repeats, splits, 3, 2))
    d_scores = np.zeros(shape=(repeats, splits, 3, 2))

    r_accuracy = np.zeros(shape=(repeats, splits))
    u_accuracy = np.zeros(shape=(repeats, splits))
    d_accuracy = np.zeros(shape=(repeats, splits))

    # loop number of times to generate average scores
    for i in xrange(repeats):
        print i
        # going to split the data here, then pass identical indices to the different learning methods
        all_indices = np.arange(len(records))

        # seed the shuffle here so can repeat experiment for different numbers of splits
        np.random.seed(5*i)
        np.random.shuffle(all_indices)

        # take off 20% for testing
        # want the training set to be of a nice fixed size, rounding errors go into test set
        # this means that learning curves will always start and finish at same points
        test_indices = all_indices[4*(len(records)/5):]
        #print 'testing', len(test_indices)
        train_indices = all_indices[:4*(len(records)/5)]
        #print 'training', len(train_indices)

        # now use same test and train indices to generate scores for each learning method
        u_scores[i], u_accuracy[i] = uncertainty_sampling(clf, extractor, records, train_indices, test_indices, splits)
        r_scores[i], r_accuracy[i] = random_sampling(clf, extractor, records, train_indices, test_indices, splits)
        d_scores[i], d_accuracy[i] = density_sampling(clf, extractor, records, train_indices, test_indices, sim,
                                                      splits)

    # create array of scores to pass to plotter
    scores = [['Accuracy'], ['Precision'], ['Recall'], ['F-Score']]
    # accuracy scores
    scores[0].append(r_accuracy.mean(axis=0, dtype=np.float64))
    scores[0].append(u_accuracy.mean(axis=0, dtype=np.float64))
    scores[0].append(d_accuracy.mean(axis=0, dtype=np.float64))

    # average over the repeats
    r_scores = r_scores.mean(axis=0, dtype=np.float64)
    u_scores = u_scores.mean(axis=0, dtype=np.float64)
    d_scores = d_scores.mean(axis=0, dtype=np.float64)
    # then true and false
    r_scores = r_scores.mean(axis=2, dtype=np.float64)
    u_scores = u_scores.mean(axis=2, dtype=np.float64)
    d_scores = d_scores.mean(axis=2, dtype=np.float64)

    # using numpy slicing to select correct scores
    for i in xrange(3):
        scores[i+1].append(r_scores[:, i])
        scores[i+1].append(u_scores[:, i])
        scores[i+1].append(d_scores[:, i])

    for i in xrange(4):
        draw_learning_comparison(splits, scores[i][1], scores[i][2], scores[i][3], samples_per_split, repeats,
                                 scores[i][0])