Example #1
0
 def run(self):
     """Featurize the ingested training data and write it to this task's output.

     Loads the CSV found at ``TrainDataIngestion().output().path``, passes it
     through ``FeatureBuilder.featurize`` and saves the result (without the
     index) at ``self.output().path``. The resulting column labels are
     printed along the way.
     """
     source_path = TrainDataIngestion().output().path
     raw_frame = pd.read_csv(source_path)
     featurized = FeatureBuilder(raw_frame).featurize()

     # Progress output: a fixed banner followed by the featurized columns.
     print("In Data Pre Processing")
     print(featurized.columns)

     target_path = self.output().path
     featurized.to_csv(target_path, index=False)
 def run(self):
     """Featurize the aggregated training data and write it to this task's output.

     Loads the CSV found at ``AggregateTrainData().output().path``, passes it
     through ``FeatureBuilder.featurize`` and saves the result (without the
     index) at ``self.output().path``. The resulting column labels are
     printed along the way.
     """
     aggregated_path = AggregateTrainData().output().path
     aggregated_frame = pandas.read_csv(aggregated_path)
     featurized = FeatureBuilder(aggregated_frame).featurize()

     # Progress output: a fixed banner followed by the featurized columns.
     print("In Data Pre Processing")
     print(featurized.columns)

     destination = self.output().path
     featurized.to_csv(destination, index=False)
def test():
    """Score the persisted sales model on the test CSV and print the result.

    Merges ``/tmp/test.csv`` with the store metadata CSV on the ``Store``
    column, adds a zero-filled ``Sales`` column, featurizes the merged frame
    with ``FeatureBuilder`` and prints whatever ``test_model_ridge`` returns
    for the model loaded from ``Train().output().path``.
    """
    df_test = pandas.read_csv("/tmp/test.csv")
    df_stores = pandas.read_csv("/tmp/rossman_sales_train_stores.csv")
    df = pandas.merge(df_test, df_stores, on='Store')
    # The test set has no target; presumably FeatureBuilder expects the
    # column to exist, so it is filled with zeros -- confirm against
    # FeatureBuilder.
    df["Sales"] = 0
    sales_model = joblib.load(Train().output().path)
    fb = FeatureBuilder(df)
    df = fb.featurize()
    # Single-argument print(...) calls behave the same on Python 2 and 3,
    # unlike the print statement, which is a SyntaxError on Python 3.
    print(df.columns)
    print(test_model_ridge(df, sales_model))
Example #4
0
def run(args):
    """Build a ``FeatureBuilder`` from parsed arguments and fetch its features.

    Args:
        args: Object exposing ``output_file``, ``resource_file``,
            ``target_file``, ``social_media_type`` and ``model_type``
            attributes.

    Nothing is returned; the fetched features and the (empty) label list
    are only bound to locals in the visible portion of this function.
    """
    # Attributes are read in the same order as before.
    output_file, resource_file, target_file = (
        args.output_file,
        args.resource_file,
        args.target_file,
    )
    social_media_type, model_type = args.social_media_type, args.model_type

    # get training data
    builder = FeatureBuilder(
        resource_file,
        target_file,
        output_file,
        social_media_type,
        model_type,
    )
    # may need to further flatten this out, currently 3 dimensional
    features = builder.get_features()
    # Get from real wtwd dataset
    labels = []
def predict_sales(json_data=None):
    """Predict sales for the request payload, or report that none was given.

    Args:
        json_data: Payload handed to ``get_dataframe``; ``None`` means no
            request data.

    Returns:
        The string ``"No Data"`` when ``json_data`` is ``None``; otherwise
        the first element of what ``predict`` returns for the featurized
        request joined with the store data on ``Store``.
    """
    if json_data is None:
        return "No Data"

    request_frame = get_dataframe(json_data)
    # Store metadata is looked up relative to the current working directory.
    stores_path = os.path.join(os.getcwd(), "data", "store.csv")
    stores_frame = pandas.read_csv(stores_path)
    merged_frame = pandas.merge(request_frame, stores_frame, on='Store')

    builder = FeatureBuilder(merged_frame, training=False)
    features = builder.featurize()
    predictions = predict(obj["model"], obj["training_features"], features)
    return predictions[0]
Example #6
0
 def predict(self, fecha, x_original, modelos):
     """Forecast the ridership class of every station for the date ``fecha``.

     One placeholder row per distinct (linea, estacion) pair found in
     ``x_original`` is stacked under the history, the combined frame is
     featurized, and the last ``n`` feature rows are scored by three binary
     classifiers whose outputs are merged by ``pred_final``.

     Args:
         fecha: Value stored as-is in the ``fecha`` column of the
             placeholder rows and of the result.
         x_original: DataFrame with at least ``linea`` and ``estacion``
             columns. Presumably it also carries ``fecha``, ``ano`` and
             ``afluencia``, matching the placeholder rows -- confirm
             against the caller.
         modelos: Object indexable at 0, 1 and 2 whose items expose
             ``predict_proba``. The local names suggest low / normal /
             high ridership models, in that order.

     Returns:
         DataFrame with columns ``fecha``, ``linea``, ``estacion`` and
         ``pronostico_afluencia`` (the output of ``pred_final``).
     """
     estaciones = x_original[['linea', 'estacion']].drop_duplicates()
     n = estaciones.shape[0]
     # Placeholder rows for the date being forecast: 'ano' is hard-coded to
     # 2020 and 'afluencia' is zero-filled.
     temp = pd.DataFrame({'fecha': [fecha] * n,
                          'ano': [2020] * n,
                          'linea': estaciones['linea'],
                          'estacion': estaciones['estacion'],
                          'afluencia': [0] * n})
     # DataFrame.append was removed in pandas 2.0; pd.concat performs the
     # same row-wise stacking and keeps the original index labels.
     x_mat = FeatureBuilder().featurize(pd.concat([x_original, temp]))
     variables_a_eliminar = ['fecha', 'ano', 'afluencia']
     x_mat = x_mat.drop(variables_a_eliminar, axis=1)
     # Assumes featurize keeps row order, so the last n rows are the
     # placeholders -- TODO confirm against FeatureBuilder.
     x_nuevas = x_mat.tail(n)

     # Decision threshold on the positive-class probability, one per model
     # (index 0, 1, 2).
     umbrales = (0.56, 0.50, 0.50)
     resultados = []
     for indice, umbral in enumerate(umbrales):
         # Copy so the probabilities do not alias predict_proba's output.
         prob = modelos[indice].predict_proba(x_nuevas)[:, 1].copy()
         pred = np.where(prob >= umbral, 1, 0)
         resultados.append((pred, prob))
     (pred_bajo, prob_bajo), (pred_normal, prob_normal), \
         (pred_alto, prob_alto) = resultados

     pred_f = pred_final(pred_bajo, prob_bajo,
                         pred_normal, prob_normal,
                         pred_alto, prob_alto)
     resultado = pd.DataFrame({'fecha': [fecha] * n,
                               'linea': estaciones['linea'],
                               'estacion': estaciones['estacion'],
                               'pronostico_afluencia': pred_f})

     return resultado
Example #7
0
    def test_stacker_historic_fit_rf(self):
        """Fit a stacker with historical features and check the prediction count.

        Five 'c-rf' base models feed an 'xgb' meta model. The feature
        builder receives two plain and two history-based feature functions;
        after fitting, predicting the last 1000 rows must yield 1000 rows.
        """
        n_samples = 5000
        n_features = 15
        num_classes = 10
        base_model_count = 5
        models = ['c-rf'] * base_model_count
        params = ['test_params_rf'] * base_model_count

        X, y = make_classification(
            n_samples=n_samples,
            n_features=n_features,
            n_informative=5,
            n_redundant=3,
            n_classes=num_classes,
        )

        # One column per generated feature, named feature_0 .. feature_{k-1}.
        feature_columns = {}
        for column_index in range(X.shape[1]):
            feature_columns['feature_' + str(column_index)] = X[:, column_index]
        df = pd.DataFrame(feature_columns)
        df['ID'] = np.random.randint(0, 100, n_samples)
        historical_df = df[:2500]

        def create_f1(df):
            # Doubles feature_2 element by element.
            doubled = df.feature_2.apply(lambda value: value * 2)
            return doubled.values

        def create_f2(df):
            def ratio(value):
                return np.sin(value) / np.cos(value * 2)

            return df.feature_3.apply(ratio).values

        def create_historic_f1(df, historical=historical_df):
            # Per-ID mean of feature_1 in the history; unseen IDs map to 0.
            per_id = historical[['ID', 'feature_1']].groupby('ID').mean()
            lookup = per_id.to_dict()['feature_1']
            return df.ID.apply(lambda key: lookup.get(key, 0))

        def create_historic_f2(df, historical=historical_df):
            # Per-ID median of feature_5 in the history; unseen IDs map to 0.
            per_id = historical[['ID', 'feature_5']].groupby('ID').median()
            lookup = per_id.to_dict()['feature_5']
            return df.ID.apply(lambda key: lookup.get(key, 0))

        fb = FeatureBuilder(
            [create_f1, create_f2],
            [create_historic_f1, create_historic_f2],
        )
        meta_clf = Stacker(
            'xgb',
            models,
            10,
            fb,
            meta_model_params='test_params',
            base_model_params=params,
        )
        meta_clf.fit(X, y, df)

        predictions = meta_clf.predict(
            X[-1000:], df=df[-1000:], historical_df=df)
        self.assertTrue(predictions.shape[0] == 1000)
    def test_init(self):
        """Check the lengths and values of the FeatureBuilder outputs.

        Uses a 10-row random frame whose first five rows act as history.
        Both feature kinds must have one entry per row; the plain feature
        must equal ``a * 2`` everywhere, and the historical feature must
        differ somewhere from the same statistic computed on the full frame.
        """
        col_a = np.random.randint(0, 3, 10)
        col_b = np.random.randn(10)
        col_c = np.random.randn(10)
        df = pd.DataFrame({'a': col_a, 'b': col_b, 'c': col_c})
        historical_df = df[:5]

        def create_f1(df):
            doubled = df.a.apply(lambda value: value * 2)
            return doubled.values

        def create_f2(df, historical=historical_df):
            # Mean of b per value of a in the history; unseen values map to 0.
            lookup = historical[['a', 'b']].groupby('a').mean().to_dict()['b']
            return df.a.apply(lambda key: lookup.get(key, 0))

        fb = FeatureBuilder([create_f1], [create_f2])

        # Same statistic as create_f2, but computed on the whole frame.
        full_lookup = df[['a', 'b']].groupby('a').mean().to_dict()['b']
        whole_dict_values = df.a.apply(
            lambda key: full_lookup.get(key, 0)).values
        row_count = len(df)

        historical_out = fb.create_historical_features(df, historical_df)
        self.assertTrue(len(historical_out) == row_count)

        plain_out = fb.create_non_historical_features(df)
        self.assertTrue(len(plain_out) == row_count)

        plain_matches = fb.create_non_historical_features(df) == df.a.values * 2
        self.assertTrue(plain_matches.sum() == row_count)

        historical_matches = (
            fb.create_historical_features(df, historical_df) == whole_dict_values)
        self.assertTrue(historical_matches.sum() < row_count)
Example #9
0
 def test_init_rf(self):
     """Check the meta model of a stacker built over five 'c-rf' base models.

     The meta model must be named 'xgb', carry ``num_class`` 10 in its
     params, and have ``num_rounds`` between 10 and 75 inclusive.
     """
     base_model_count = 5
     models = ['c-rf'] * base_model_count
     params = ['test_params_rf'] * base_model_count
     builder = FeatureBuilder([], [])
     meta_clf = Stacker(
         'xgb',
         models,
         10,
         builder,
         meta_model_params='test_params',
         base_model_params=params,
     )

     self.assertEqual(meta_clf.meta_model.name, 'xgb')
     self.assertEqual(meta_clf.meta_model.params['num_class'], 10)
     # Lower and upper bound on the number of boosting rounds.
     self.assertTrue(meta_clf.meta_model.num_rounds >= 10)
     self.assertTrue(meta_clf.meta_model.num_rounds <= 75)
Example #10
0
 def test_base_predictions_xgb(self):
     """Check the shape of the base-model predictions for five 'xgb' models.

     The prediction matrix must have one row per sample and
     ``num_classes`` columns per base model.
     """
     n_samples = 5000
     n_features = 15
     num_classes = 10
     base_model_count = 5
     models = ['xgb'] * base_model_count
     params = ['test_params'] * base_model_count

     builder = FeatureBuilder([], [])
     meta_clf = Stacker(
         'xgb',
         models,
         10,
         builder,
         meta_model_params='test_params',
         base_model_params=params,
     )
     self.assertEqual(meta_clf.meta_model.name, 'xgb')

     X, y = make_classification(
         n_samples=n_samples,
         n_features=n_features,
         n_informative=5,
         n_redundant=3,
         n_classes=num_classes,
     )
     meta_prediction = meta_clf.generate_base_model_predictions(X, y)

     row_count = meta_prediction.shape[0]
     self.assertTrue(row_count == n_samples)
     column_count = meta_prediction.shape[1]
     self.assertTrue(column_count == num_classes * len(models))
Example #11
0
 def __init__(self):
     """Create the feature builder and immediately run ``self.train()``."""
     # Built with no arguments; stored on the instance as ``fb``.
     self.fb = FeatureBuilder()
     # NOTE(review): ``train`` is defined outside this view; presumably it
     # relies on ``self.fb`` being set first -- confirm before reordering.
     self.train()
 def run(self):
     """Featurize the aggregated training data and write it to this task's output.

     Reads the CSV at ``AggregateTrainData().output().path``, runs it through
     ``FeatureBuilder.featurize`` and writes the result (without the index)
     to ``self.output().path``. A banner and the resulting column labels are
     printed along the way.
     """
     fb = FeatureBuilder(pandas.read_csv(AggregateTrainData().output().path))
     dataframe = fb.featurize()
     # Single-argument print(...) calls behave the same on Python 2 and 3,
     # unlike the print statement, which is a SyntaxError on Python 3.
     print("In Data Pre Processing")
     print(dataframe.columns)
     dataframe.to_csv(self.output().path, index=False)