def pickle_similarities():
    """
    Compute pairwise similarities over ALL records and pickle them to disk.

    Writes the similarity structure returned by get_similarities() to
    'pickles/similarities_all.p'.

    Returns:
        None (side effect: writes the pickle file).
    """
    # TODO this is kind of wrong since the similarities will change as the word features are generated per split
    records = load_records()

    # set up extractor using desired features
    extractor = FeatureExtractor(word_gap=True, count_dict=True, phrase_count=True, word_features=5)
    extractor.create_dictionaries(records, how_many=5)

    data, _ = extractor.generate_features(records)
    # NOTE(review): `vec` is not defined in this function — presumably a
    # module-level vectoriser (e.g. DictVectorizer); confirm it exists at import time.
    data = vec.fit_transform(data).toarray()
    similarities = get_similarities(data)

    # Use a context manager so the file handle is closed (and the write
    # flushed) even if pickle.dump raises — the original leaked the handle.
    with open('pickles/similarities_all.p', 'wb') as f:
        pickle.dump(similarities, f)
# Example #2
# 0
def build_pipeline(which, train):
    """
    Construct the classifier pipeline and its matching feature extractor.

    Centralised here to avoid repeating the set-up in each experiment.

    Args:
        which: one of 'bag_of_words', 'word_features', or anything else
               (falls through to the default phrase/POS feature set).
        train: training records, used only to build the word-feature
               dictionaries when which == 'word_features'.

    Returns:
        (clf, extractor) — an sklearn Pipeline and the configured
        FeatureExtractor.
    """
    if which == 'bag_of_words':
        # Sparse vectoriser is fine here since bag-of-words features are sparse.
        steps = [
            ('vectoriser', DictVectorizer()),
            #('scaler', preprocessing.StandardScaler(with_mean=False)),
            ('normaliser', preprocessing.Normalizer(norm='l2')),
            ('svm', LinearSVC(dual=True, C=1)),
        ]
        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=True, pos=False, combo=True,
                                     entity_type=True, word_features=False, bag_of_words=True, bigrams=True)
        return Pipeline(steps), extractor

    if which == 'word_features':
        steps = [
            ('vectoriser', DictVectorizer(sparse=False)),
            #('scaler', preprocessing.StandardScaler(with_mean=False)),
            ('normaliser', preprocessing.Normalizer()),
            # Earlier kernel experiments, kept for reference:
            #('svm', SVC(kernel='poly', coef0=1, degree=2, gamma=10, C=1, cache_size=2000)),
            #('svm', SVC(kernel='rbf', gamma=1, cache_size=1000, C=1)),
            #('svm', SVC(kernel='linear', cache_size=1000, C=1)),
            ('svm', LinearSVC(dual=True, C=1)),
        ]
        extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=True, word_features=True,
                                     combo=True, pos=True, entity_type=True, bag_of_words=False, bigrams=False)
        # Word features need per-split dictionaries built from the training data.
        extractor.create_dictionaries(train, how_many=5)
        return Pipeline(steps), extractor

    # Default: phrase/POS/entity features without word features.
    steps = [
        ('vectoriser', DictVectorizer(sparse=False)),
        #('scaler', preprocessing.StandardScaler(with_mean=False)),
        ('normaliser', preprocessing.Normalizer()),
        # Earlier kernel experiments, kept for reference:
        #('svm', SVC(kernel='poly', coef0=1, degree=3, gamma=1, C=1, cache_size=2000)),
        #('svm', SVC(kernel='rbf', gamma=100, cache_size=1000, C=10)),
        #('svm', SVC(kernel='linear', cache_size=1000, C=1)),
        ('svm', LinearSVC(dual=True, C=1)),
    ]
    extractor = FeatureExtractor(word_gap=False, count_dict=False, phrase_count=True, word_features=False,
                                 combo=True, pos=True, entity_type=True, bag_of_words=False, bigrams=False)
    return Pipeline(steps), extractor