# Example 1
import re
from data_helper.names import keys, get_filename
from data_helper.features import Condition, FeatureException
import numpy as np
from operator import itemgetter
import settings
import matplotlib.pyplot as plt
from util.files import Exporter

# Aggregate centered price offsets per vehicle condition across all cluster
# files whose key contains exactly two underscores, then compute the mean
# offset for each condition.
#
# print(...) and dict.items() are used instead of Python-2-only forms so this
# script runs unchanged under both Python 2 and Python 3.
conditions = {}
for key in keys():
    # Keys with exactly two '_' separators identify the cluster files we want.
    if len(re.findall('_', key)) == 2:
        fname = get_filename(key)
        try:
            condition_stats = Condition(
                import_dir=settings.CLUSTER_DIR
            ).load(fname, remove_outliers=True).make_condition_stats()
        except FeatureException as e:
            # Best-effort: report the bad file and keep processing the rest.
            print(str(e))
        else:
            stats = condition_stats.condition_stats('usa')
            for condition, val in stats.items():
                try:
                    offset = val['prices_centered']
                except KeyError:
                    # No centered-price data for this condition; skip it.
                    continue
                if condition in conditions:
                    conditions[condition] = conditions[condition] + offset
                else:
                    conditions[condition] = offset

# One [condition, mean offset] row per accumulated condition.
results = []
for condition, offset in conditions.items():
    results.append([float(condition), np.average(offset)])
# Example 2
def test(FID, show_price_pairs=default_show_price_pairs, retrain=True):
    """Train a price model for the file identified by FID and report fit quality.

    The dataset is one-hot vectorized, split 70/30 into train/test sets,
    fitted via grid search, optionally re-fitted after dropping the
    worst-predicted training rows, and finally scored with 5-fold
    cross-validated r^2 plus a held-out evaluation.

    FID -- feature id passed to get_filename() to locate the data file.
    show_price_pairs -- forwarded to predict.run() on the held-out set to
        control printing of predicted/actual price pairs.
    retrain -- when True, drop the worst-performing training samples as
        outliers and retrain before scoring.

    NOTE: print(...) is used instead of the Python-2 print statement so the
    function runs unchanged under both Python 2 and Python 3.
    """
    dataset, price_mmr_state_condition_vin = get_dataset(
        get_filename(FID),
        remove_initial_outliers=False,
        extra_continuous_exclusions=EXTRA_CONTINUOUS_EXCLUSIONS,
        extra_categorical_exclusions=EXTRA_CATEGORICAL_EXCLUSIONS,
        expand_odometer=True,
        capture_condition=True,
        capture_state=True)

    # One-hot encode the dict records into a dense numeric matrix.
    vec = DictVectorizer()
    vectors = vec.fit_transform(dataset).toarray()
    features = vec.get_feature_names()

    # Hold out 30% of the data for final evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        vectors, price_mmr_state_condition_vin, test_size=0.3)

    # The regression target is the price, stored first in each y tuple.
    price_train = [p[0] for p in y_train]

    # Rule of thumb for a confident fit: n_samples > n_features ** 2.
    # The feature count never changes, so compute the threshold once.
    min_sample_size = len(features) ** 2
    sample_size = len(X_train)
    print('training set: %s samples, %s features^2' % (sample_size, min_sample_size))

    # Fit on the full training set, then predict it back so the
    # worst-modelled samples can be identified.
    best_clf = train(X_train, price_train)
    predicted_y, _, _ = predict.run(best_clf, X_train, y_train, show_prices=False)

    if retrain:
        # Treat the worst performers as outliers, drop them, and refit.
        X_train, y_train = filter_worst(X_train, y_train, predicted_y)
        price_train = [p[0] for p in y_train]
        best_clf = train(X_train, price_train)
        sample_size = len(X_train)
        print('\ntraining set (outliers removed): %s samples, %s features^2' % (sample_size, min_sample_size))

    if sample_size > min_sample_size:
        print('sufficient sample to feature ratio')
    else:
        print('WARN - insufficient sample to feature ratio to be highly confident of fit')

    # 5-fold cross-validated r^2 on the held-out portion.
    price_test = [p[0] for p in y_test]
    scores = cross_val_score(best_clf, X_test, price_test, cv=5, scoring='r2')

    print('Grid Search Results')
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    _, score, explained_variance = predict.run(best_clf,
        X_test, y_test, offset_state=False, offset_condition=False, show_prices=show_price_pairs)

    print('held out test sample results:')
    print('test r^2 score: %s' % score)
    print('test explained variance: %s' % explained_variance)
    print('\n')