def run(self):
    fb = FeatureBuilder(pd.read_csv(TrainDataIngestion().output().path))
    dataframe = fb.featurize()
    print("In Data Pre Processing")
    print(dataframe.columns)
    dataframe.to_csv(self.output().path, index=False)

def run(self):
    fb = FeatureBuilder(pandas.read_csv(AggregateTrainData().output().path))
    dataframe = fb.featurize()
    print("In Data Pre Processing")
    print(dataframe.columns)
    dataframe.to_csv(self.output().path, index=False)

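# The two run() methods above read an upstream task's output via
# SomeTask().output().path and write their own through self.output(), which
# matches Luigi's Task contract. A minimal sketch of how such a step might be
# wired up, assuming Luigi; the class name and output path are hypothetical:
import luigi
import pandas as pd

class DataPreProcessing(luigi.Task):  # hypothetical task name
    def requires(self):
        return AggregateTrainData()  # upstream task from the snippet above

    def output(self):
        return luigi.LocalTarget("/tmp/preprocessed_train.csv")  # assumed path

    def run(self):
        fb = FeatureBuilder(pd.read_csv(self.input().path))
        fb.featurize().to_csv(self.output().path, index=False)
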
def test():
    df_test = pandas.read_csv("/tmp/test.csv")
    df_stores = pandas.read_csv("/tmp/rossman_sales_train_stores.csv")
    df = pandas.merge(df_test, df_stores, on='Store')
    df["Sales"] = 0  # placeholder target column so FeatureBuilder can run
    sales_model = joblib.load(Train().output().path)
    fb = FeatureBuilder(df)
    df = fb.featurize()
    print(df.columns)
    print(test_model_ridge(df, sales_model))

def run(args):
    output_file = args.output_file
    resource_file = args.resource_file
    target_file = args.target_file
    social_media_type = args.social_media_type
    model_type = args.model_type
    feature_builder = FeatureBuilder(resource_file, target_file, output_file,
                                     social_media_type, model_type)
    # Get training data; may need to further flatten this out,
    # currently 3-dimensional.
    features = feature_builder.get_features()
    labels = []  # Get from real wtwd dataset

def predict_sales(json_data=None):
    if json_data is not None:
        df_to_predict = get_dataframe(json_data)
        df_stores_data = pandas.read_csv(
            os.path.join(os.getcwd(), "data", "store.csv"))
        df_aggregated_to_predict = pandas.merge(df_to_predict, df_stores_data,
                                                on='Store')
        fb = FeatureBuilder(df_aggregated_to_predict, training=False)
        dataframe = fb.featurize()
        result = predict(obj["model"], obj["training_features"], dataframe)
        return result[0]
    return "No Data"

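# A minimal, hypothetical call to predict_sales. The payload shape is an
# assumption: whatever get_dataframe returns must include a 'Store' column so
# the merge succeeds, and `obj` is assumed to be a module-level dict holding
# the trained model and its training-feature list.
sample_json = [{"Store": 1, "DayOfWeek": 6, "Date": "2015-08-01", "Open": 1}]
print(predict_sales(json_data=sample_json))
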
def predict(self, fecha, x_original, modelos):
    # One row per (line, station) pair for the requested date.
    estaciones = x_original[['linea', 'estacion']].drop_duplicates()
    n = estaciones.shape[0]
    temp = pd.DataFrame({'fecha': [fecha] * n,
                         'ano': [2020] * n,
                         'linea': estaciones['linea'],
                         'estacion': estaciones['estacion'],
                         'afluencia': [0] * n})
    # Featurize history plus the new rows, then keep only the new rows.
    x_mat = FeatureBuilder().featurize(pd.concat([x_original, temp]))
    variables_a_eliminar = ['fecha', 'ano', 'afluencia']
    x_mat = x_mat.drop(variables_a_eliminar, axis=1)
    x_nuevas = x_mat.tail(n)

    # Score the "low", "normal" and "high" ridership models, each with
    # its own probability threshold.
    prob_bajo = modelos[0].predict_proba(x_nuevas)[:, 1]
    pred_bajo = np.where(prob_bajo >= 0.56, 1, 0)
    prob_normal = modelos[1].predict_proba(x_nuevas)[:, 1]
    pred_normal = np.where(prob_normal >= 0.50, 1, 0)
    prob_alto = modelos[2].predict_proba(x_nuevas)[:, 1]
    pred_alto = np.where(prob_alto >= 0.50, 1, 0)

    pred_f = pred_final(pred_bajo, prob_bajo, pred_normal, prob_normal,
                        pred_alto, prob_alto)
    resultado = pd.DataFrame({'fecha': [fecha] * n,
                              'linea': estaciones['linea'],
                              'estacion': estaciones['estacion'],
                              'pronostico_afluencia': pred_f})
    return resultado

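# Sketch of calling the predict method above, assuming the containing object
# holds three fitted binary classifiers (low/normal/high ridership), each
# exposing predict_proba. All names below are illustrative only.
modelos = [modelo_bajo, modelo_normal, modelo_alto]  # hypothetical fitted models
resultado = pronosticador.predict('2020-03-15', x_historico, modelos)
print(resultado.head())
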
def test_stacker_historic_fit_rf(self):
    n_samples = 5000
    n_features = 15
    num_classes = 10
    models = ['c-rf' for x in range(5)]
    params = ['test_params_rf' for x in range(5)]
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=5,
                               n_redundant=3,
                               n_classes=num_classes)
    df = pd.DataFrame(
        {'feature_' + str(i): X[:, i] for i in range(X.shape[1])})
    df['ID'] = np.random.randint(0, 100, n_samples)
    historical_df = df[:2500]

    def create_f1(df):
        squares = df.feature_2.apply(lambda x: x * 2)
        return squares.values

    def create_f2(df):
        some_feature = df.feature_3.apply(
            lambda x: np.sin(x) / np.cos(x * 2))
        return some_feature.values

    def create_historic_f1(df, historical=historical_df):
        value_dict = historical[['ID', 'feature_1'
                                 ]].groupby('ID').mean().to_dict()
        historical_feature = df.ID.apply(
            lambda x: value_dict['feature_1'].get(x, 0))
        return historical_feature

    def create_historic_f2(df, historical=historical_df):
        value_dict = historical[['ID', 'feature_5'
                                 ]].groupby('ID').median().to_dict()
        historical_feature = df.ID.apply(
            lambda x: value_dict['feature_5'].get(x, 0))
        return historical_feature

    non_historical_features = [create_f1, create_f2]
    historical_features = [create_historic_f1, create_historic_f2]
    fb = FeatureBuilder(non_historical_features, historical_features)
    meta_clf = Stacker('xgb', models, 10, fb,
                       meta_model_params='test_params',
                       base_model_params=params)
    meta_clf.fit(X, y, df)
    self.assertTrue(
        meta_clf.predict(X[-1000:], df=df[-1000:],
                         historical_df=df).shape[0] == 1000)

def test_init(self):
    df = pd.DataFrame({
        'a': np.random.randint(0, 3, 10),
        'b': np.random.randn(10),
        'c': np.random.randn(10)
    })
    historical_df = df[:5]

    def create_f1(df):
        squares = df.a.apply(lambda x: x * 2)
        return squares.values

    def create_f2(df, historical=historical_df):
        value_dict = historical[['a', 'b']].groupby('a').mean().to_dict()
        historical_feature = df.a.apply(
            lambda x: value_dict['b'].get(x, 0))
        return historical_feature

    non_historical_features = [create_f1]
    historical_features = [create_f2]
    fb = FeatureBuilder(non_historical_features, historical_features)
    whole_dict = df[['a', 'b']].groupby('a').mean().to_dict()
    whole_dict_values = df.a.apply(
        lambda x: whole_dict['b'].get(x, 0)).values
    self.assertTrue(
        len(fb.create_historical_features(df, historical_df)) == len(df))
    self.assertTrue(len(fb.create_non_historical_features(df)) == len(df))
    self.assertTrue((fb.create_non_historical_features(df) ==
                     df.a.values * 2).sum() == len(df))
    self.assertTrue((fb.create_historical_features(df, historical_df) ==
                     whole_dict_values).sum() < len(df))

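# The pattern inside create_f2 above, shown standalone: aggregate on the
# historical slice only, then map those aggregates onto any dataframe, with
# 0 as the fallback for unseen keys. That fallback is what the last assertion
# relies on: keys absent from the slice diverge from the whole-frame
# aggregate. Values below are illustrative only.
import pandas as pd

df = pd.DataFrame({'a': [0, 0, 1, 2], 'b': [1.0, 3.0, 5.0, 7.0]})
historical = df[:2]  # only key 0 appears in the historical slice
value_dict = historical[['a', 'b']].groupby('a').mean().to_dict()
feature = df.a.apply(lambda x: value_dict['b'].get(x, 0))
print(feature.tolist())  # [2.0, 2.0, 0, 0] -- keys 1 and 2 fall back to 0
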
def test_init_rf(self):
    models = ['c-rf' for x in range(5)]
    params = ['test_params_rf' for x in range(5)]
    feature_builder = FeatureBuilder([], [])
    meta_clf = Stacker('xgb', models, 10, feature_builder,
                       meta_model_params='test_params',
                       base_model_params=params)
    self.assertEqual(meta_clf.meta_model.name, 'xgb')
    self.assertEqual(meta_clf.meta_model.params['num_class'], 10)
    self.assertTrue(meta_clf.meta_model.num_rounds >= 10)
    self.assertTrue(meta_clf.meta_model.num_rounds <= 75)

def test_base_predictions_xgb(self):
    n_samples = 5000
    n_features = 15
    num_classes = 10
    models = ['xgb' for x in range(5)]
    params = ['test_params' for x in range(5)]
    fb = FeatureBuilder([], [])
    meta_clf = Stacker('xgb', models, 10, fb,
                       meta_model_params='test_params',
                       base_model_params=params)
    self.assertEqual(meta_clf.meta_model.name, 'xgb')
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=5,
                               n_redundant=3,
                               n_classes=num_classes)
    meta_prediction = meta_clf.generate_base_model_predictions(X, y)
    self.assertTrue(meta_prediction.shape[0] == n_samples)
    self.assertTrue(meta_prediction.shape[1] == num_classes * len(models))

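# Why the final shape assertion holds: each of the five base models emits one
# probability per class, and the stacker lays those blocks side by side, so
# the meta-feature matrix is (n_samples, num_classes * len(models)). A minimal
# numpy sketch of that layout, assuming column-wise stacking (the Stacker's
# internals are not shown in these snippets):
import numpy as np

base_preds = [np.random.rand(5000, 10) for _ in range(5)]  # 5 models, 10 classes each
meta_features = np.hstack(base_preds)
print(meta_features.shape)  # (5000, 50)
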
def __init__(self):
    self.fb = FeatureBuilder()
    self.train()