Example #1
    def test_fit_and_transform(self):
        augmenter = FeatureAugmenter(column_value="val", column_id="id", column_sort="sort",
                                     column_kind="kind",
                                     kind_to_fc_parameters=self.kind_to_fc_parameters)

        # Fit should do nothing
        returned_df = augmenter.fit()
        six.assertCountEqual(self, returned_df.__dict__, augmenter.__dict__)
        self.assertRaises(RuntimeError, augmenter.transform, None)

        augmenter.set_timeseries_container(self.test_df)

        # Add features to all time series
        X_with_index = pd.DataFrame([{"feature_1": 1}]*2, index=[10, 500])
        X_transformed = augmenter.transform(X_with_index)

        # Require same shape
        for i in X_transformed.index:
            self.assertIn(i, X_with_index.index)

        for i in X_with_index.index:
            self.assertIn(i, X_transformed.index)

        self.assertEqual(X_transformed.shape, (2, 3))

        # Preserve old features
        six.assertCountEqual(self, list(X_transformed.columns), ["feature_1", "a__length", "b__length"])

        # Features are not allowed to be NaN
        for index, row in X_transformed.iterrows():
            print((index, row))
            self.assertFalse(np.isnan(row["a__length"]))
            self.assertFalse(np.isnan(row["b__length"]))
Example #2
    def test_add_features_to_only_a_part(self):
        augmenter = FeatureAugmenter(column_value="val",
                                     column_id="id",
                                     column_sort="sort",
                                     column_kind="kind",
                                     settings=self.settings)

        augmenter.set_timeseries_container(self.test_df)

        X_with_not_all_ids = pd.DataFrame([{"feature_1": 1}], index=[1])
        X_transformed = augmenter.transform(X_with_not_all_ids)

        for i in X_transformed.index:
            self.assertIn(i, X_with_not_all_ids.index)

        for i in X_with_not_all_ids.index:
            self.assertIn(i, X_transformed.index)

        self.assertEqual(X_transformed.shape, (1, 3))
        self.assertEqual(X_transformed.index, [1])

        # Features are not allowed to be NaN
        for index, row in X_transformed.iterrows():
            print(index, row)
            self.assertFalse(np.isnan(row["a__length"]))
            self.assertFalse(np.isnan(row["b__length"]))
Example #3
    def test_add_features_to_only_a_part(self):
        augmenter = FeatureAugmenter(column_value="val", column_id="id", column_sort="sort",
                                     column_kind="kind",
                                     kind_to_fc_parameters=self.kind_to_fc_parameters,
                                     n_jobs=0,
                                     disable_progressbar=True)

        augmenter.set_timeseries_container(self.test_df)

        X_with_not_all_ids = pd.DataFrame([{"feature_1": 1}], index=[10])
        X_transformed = augmenter.transform(X_with_not_all_ids)

        for i in X_transformed.index:
            self.assertIn(i, X_with_not_all_ids.index)

        for i in X_with_not_all_ids.index:
            self.assertIn(i, X_transformed.index)

        self.assertEqual(X_transformed.shape, (1, 3))
        self.assertEqual(X_transformed.index, [10])

        # Features are not allowed to be NaN
        for index, row in X_transformed.iterrows():
            print((index, row))
            self.assertFalse(np.isnan(row["a__length"]))
            self.assertFalse(np.isnan(row["b__length"]))
Example #4
    def test_no_ids_present(self):
        augmenter = FeatureAugmenter(
            column_value="val",
            column_id="id",
            column_sort="sort",
            column_kind="kind",
            kind_to_fc_parameters=self.kind_to_fc_parameters,
            n_jobs=0,
            disable_progressbar=True)

        augmenter.set_timeseries_container(self.test_df)

        X_with_not_all_ids = pd.DataFrame([{"feature_1": 1}], index=[-999])
        self.assertRaisesRegex(AttributeError,
                               r"The ids of the time series container",
                               augmenter.transform, X_with_not_all_ids)
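
Outside of unit tests, FeatureAugmenter is designed to sit inside an sklearn Pipeline, with the raw series handed to the augmenter step separately from the design matrix. A minimal runnable sketch of that pattern follows; all data below is made up for illustration, and the feature set is restricted to parameterless features so the toy series produce no NaNs.

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from tsfresh.transformers import FeatureAugmenter

# Toy time series container: two series of kind "a" with ids 1 and 2.
df_ts = pd.DataFrame({
    "id":   [1, 1, 2, 2],
    "kind": ["a", "a", "a", "a"],
    "sort": [0, 1, 0, 1],
    "val":  [1.0, 2.0, 3.0, 4.0],
})
X = pd.DataFrame(index=[1, 2])       # the design matrix only carries the ids
y = pd.Series([0, 1], index=[1, 2])

pipeline = Pipeline([
    ("augmenter", FeatureAugmenter(default_fc_parameters={"length": None, "mean": None},
                                   column_value="val", column_id="id",
                                   column_sort="sort", column_kind="kind")),
    ("classifier", DecisionTreeClassifier()),
])

# The augmenter pulls the raw series from its timeseries_container at transform time.
pipeline.set_params(augmenter__timeseries_container=df_ts)
pipeline.fit(X, y)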
Example #5
def run(filename='data/clean_data.csv', city_regions_file='data/CityRegions.csv',
        load_from_file=True, grid_search=False, baseline=False):
    if city_regions_file is None:
        temp = [['Abiline', 'Texas', 'South'], ['West Jordon', 'Utah', 'West'],
                ['Yonkers', 'New York', 'Northeast']]
        city_regions = pd.DataFrame(temp, columns=['City', 'State', 'Region'])
    else:
        city_regions = pd.read_csv(city_regions_file, header=0).reset_index(drop=True)

    FEATURE_EXTRACTION = 'data/data_with_features.csv'
    if not os.path.isfile(FEATURE_EXTRACTION):
        df = pd.read_csv(filename, header=0)
        df.dropna(inplace=True)

        X_labels = ['City', 'State', 'dt', 'AverageTemperature', 'CityIndex']
        df = df[X_labels]
        #city_state = df[['City', 'State']]
        # CityIndex was precomputed because multiple cities share the same name:
        #df['CityIndex'] = city_state.apply(number_cities, axis=1)
        #df.to_csv('data/clean_data.csv', index=False)

        orig_cities = city_regions[['City', 'State']]
        print("Total cities:", len(orig_cities))
        y_regions = city_regions['Region'].apply(number_regions)

        feature_extraction_settings = FeatureExtractionSettings()
        feature_extraction_settings.IMPUTE = impute
        feat_extractor = FeatureAugmenter(feature_extraction_settings,
                                          column_id='CityIndex', column_sort='dt', column_value='AverageTemperature')

        empty_df = pd.DataFrame(index=y_regions.index)
        feat_extractor.set_timeseries_container(df)
        output = feat_extractor.fit_transform(empty_df, y_regions)
        output['City'] = city_regions['City']
        output['State'] = city_regions['State']
        output['Region'] = city_regions['Region']

        output.to_csv(FEATURE_EXTRACTION, index=False)
    else:
        output = pd.read_csv(FEATURE_EXTRACTION)

    output = output.drop(['City', 'State', 'Region'], axis=1)

    if baseline:
        output = output['AverageTemperature__mean'].to_frame()

    train, test, validation = split_data(output, city_regions)

    """
    aug = FeatureAugmenter(feature_extraction_settings, column_id='CityIndex',
                    column_sort='dt', column_value='AverageTemperature',
                    timeseries_container=train['df'])
    output = aug.fit_transform(train['X'], train['y'])
    output['City_Name'] = train['city_names']
    output.to_csv('data/features_from_tsfresh.csv', index=False)
    """
    if load_from_file:
        clf = joblib.load('./model.joblib.pkl')
    else:
        clf = DecisionTreeClassifier(criterion='entropy', max_features=None,
                                     min_samples_split=0.1, max_depth=50, class_weight=None)
        # feat_extractor = RelevantFeatureAugmenter(column_id='CityIndex', column_sort='dt', column_value='AverageTemperature')

        # for the fit on the training set, we set the fresh__timeseries_container to `df_train`
        if grid_search and not baseline:
            grid = {'max_features': [2, 10, 20, 30, 50, 100, 200, None],
                    'max_depth': [1, 25, 50, 100],
                    'class_weight': [None, 'balanced'],
                    'min_samples_split': [0.1, 0.25, 0.75, 1.0]}
            scorer = metrics.make_scorer(metrics.accuracy_score)
            clf = GridSearchCV(clf, grid, scoring=scorer, n_jobs=multiprocessing.cpu_count())

        clf.fit(train['X'], train['y'])
        # pipeline.set_params(augmenter__timeseries_container=train['df'])
        # pipeline.fit(train['X'], train['y'])

        y_pred = pd.Series(clf.predict(train['X']))
        y_true = pd.Series(np.array(train['y']))
        result = train['city_names']
        result.reset_index(drop=True, inplace=True)
        result['Orig'] = y_true
        result['Pred'] = y_pred
        result['Correct'] = (y_true == y_pred)
        result.to_csv('data/results_train.csv', index=False)
        

        if grid_search and not baseline:
            print "Best Parameters found from grid search: "
            print clf.best_params_

        print "train accuracy ", accuracy_score(y_true, y_pred)
        cm_train = confusion_matrix(y_true, y_pred)
        print "Confusion matrix for training\n", cm_train
        # for the predict on the test set, we set the fresh__timeseries_container to `df_test`
        joblib.dump(clf, './model.joblib.pkl')
    #### ENDIF

    y_pred = pd.Series(clf.predict(test['X']))
    y_true = pd.Series(np.array(test['y']))
    result = test['city_names']
    result.reset_index(drop=True, inplace=True)
    result['Orig'] = y_true
    result['Pred'] = y_pred
    result['Correct'] = (y_true == y_pred)
    result.to_csv('data/results_test.csv', index=False)
    
    print "test accuracy ", accuracy_score(y_true, y_pred)
    cm_test = confusion_matrix(y_true, y_pred)
    print "Confusion matrix for testing\n", cm_test

    class_names = ['Northeast', 'Midwest', 'West', 'South']
    if not load_from_file:
        plot_confusion_matrix(cm_train, class_names)
        plt.tight_layout()
        plt.savefig('train_cm.png')
    plt.figure()  # plt.hold() was removed in matplotlib 3.0; open a new figure instead
    plot_confusion_matrix(cm_test, class_names)
    plt.tight_layout()
    plt.savefig('test_cm.png')

    if not load_from_file and not grid_search:
        features = output.columns.values
        importances = clf.feature_importances_
        with open("tree_viz.dot", "w") as f:
            tree.export_graphviz(clf, out_file=f)
        top_n = 20
        ndx = np.argsort(importances)[::-1]
        sorted_features = features[ndx][:top_n]
        sorted_importances = importances[ndx][:top_n]
        print('%80s & %s' % ('Feature', 'Importance'))
        for feat, imp in zip(sorted_features, sorted_importances):
            # feat[20:] strips the leading 'AverageTemperature__' (20 characters)
            print('%s & %.2f \\\\' % (feat[20:], imp))

    y_pred = pd.Series(clf.predict(validation['X']))
    y_true = pd.Series(np.array(validation['y']))
    result = validation['city_names']
    result.reset_index(drop=True, inplace=True)
    result['Orig'] = y_true
    result['Pred'] = y_pred
    result['Correct'] = (y_true == y_pred)
    result.to_csv('data/results_val.csv', index=False)
    

    print "validation accuracy ", accuracy_score(y_true, y_pred)
    cm_val = confusion_matrix(y_true, y_pred)
    print "Confusion matrix for validation\n", cm_val
    print "done"

    plt.figure()  # class_names is unchanged from above; plt.hold() was removed in matplotlib 3.0
    plot_confusion_matrix(cm_val, class_names)
    plt.tight_layout()
    plt.savefig('val_cm.png')
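
As an aside, run() above targets the old pre-0.11 tsfresh API (a positional FeatureExtractionSettings object whose IMPUTE attribute is set by hand). Under the current API, the extractor configuration would look roughly like this sketch, assuming tsfresh >= 0.11, where impute_function is the keyword that replaces the old IMPUTE attribute:

from tsfresh.transformers import FeatureAugmenter
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.utilities.dataframe_functions import impute

# Rough modern equivalent of the FeatureExtractionSettings-based setup in run().
feat_extractor = FeatureAugmenter(
    default_fc_parameters=ComprehensiveFCParameters(),
    column_id='CityIndex',
    column_sort='dt',
    column_value='AverageTemperature',
    impute_function=impute,
)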