def preprocess_income_data(label_path, edge_path, x_path, y_path, edge_list_path):
    """Build and persist the income dataset from raw label and edge files.

    The label file and the edge list are loaded, joined with pandas' default
    ``merge`` (inner join on the column names the two frames share, here
    ``out_id``), passed to ``preprocess_data``, and the three results are
    written out through ``utils``.

    Args:
        label_path: Space-separated label file. Its columns are renamed to
            ``out_id`` and ``mean_income``, so it must have exactly two.
        edge_path: Comma-separated edge list. Its columns are renamed to
            ``out_id`` and ``in_id``, so it must have exactly two.
        x_path: Destination handed to ``utils.persist_data`` for ``X``.
        y_path: Destination handed to ``utils.persist_data`` for ``y``.
        edge_list_path: Destination handed to ``utils.persist_edgelist``.
    """
    # NOTE(review): read_csv treats the first row as a header by default and
    # the assignment to ``.columns`` then overwrites it. If the raw files have
    # no header row, the first record is silently dropped -- confirm.
    targets = pd.read_csv(label_path, sep=' ')
    targets.columns = ['out_id', 'mean_income']
    # Single-argument print() calls emit the same text on Python 2 and 3;
    # the former ``print a, b`` statements are a SyntaxError on Python 3.
    print('{} {}'.format('target labels of shape: ', targets.shape))

    edges = pd.read_csv(edge_path)
    edges.columns = ['out_id', 'in_id']
    print('{} {}'.format('edge list of shape: ', edges.shape))

    all_data = edges.merge(targets)
    print('{} {}'.format('all data of shape: ', all_data.shape))

    X, y, edge_list = preprocess_data(all_data)
    utils.persist_edgelist(edge_list, edge_list_path)
    utils.persist_data(x_path, y_path, X, y)
def weatherbit(self):
    """Join cleaned weatherbit features with SMA energy and persist the result.

    Both tables are loaded with ``read_data``, their indexes are named
    ``datetime`` and they are merged on that name. Rows whose ``energy``
    value is at or above the 0.99 quantile are dropped before the frame is
    passed to ``self.to_DataFrame`` and stored under ``self.table``.
    """
    target = 'energy'

    weather = read_data('tsdb_weatherbit_cleaned')
    energy = read_data('tsdb_sma_energy')
    # Give both indexes the same name so the merge below can key on it.
    for frame in (weather, energy):
        frame.index.name = 'datetime'

    joined = weather.merge(energy, on='datetime')

    # Keep only rows strictly below the 99th percentile of the target.
    cutoff = joined[target].quantile(0.99)
    below_cutoff = joined[target] < cutoff
    trimmed = joined[below_cutoff]

    result = self.to_DataFrame(trimmed, self.features, target)
    persist_data(self.table, result)
def mers_cams_data(self):
    """Join CAMS, MERRA and Fronius energy data and persist the result.

    The three tables are loaded with ``read_data`` and their indexes are
    named ``datetime``. The energy table is resampled to ``'1H'`` means, then
    merged with the other two on ``datetime``. A ``consumption`` column is
    computed as ``FromGenToConsumer + FromGridToConsumer``; rows at or above
    its 0.99 quantile are dropped before the frame is passed to
    ``self.to_DataFrame`` and stored under ``self.table``.
    """
    target = 'consumption'

    energy = read_data('tsdb_fronius_energy')
    cams = read_data('tsdb_cams')
    merra = read_data('tsdb_merra')
    # Give every index the same name so the merges below can key on it.
    for frame in (energy, cams, merra):
        frame.index.name = 'datetime'

    hourly_energy = energy.resample('1H').agg('mean')

    joined = cams.merge(hourly_energy, on='datetime')
    joined = joined.merge(merra, on='datetime')

    # presumably both source columns come from the Fronius energy table --
    # TODO confirm against the table schema
    from_generator = joined['FromGenToConsumer']
    from_grid = joined['FromGridToConsumer']
    joined['consumption'] = from_generator + from_grid

    # Keep only rows strictly below the 99th percentile of the target.
    cutoff = joined[target].quantile(0.99)
    below_cutoff = joined[target] < cutoff
    trimmed = joined[below_cutoff]

    result = self.to_DataFrame(trimmed, self.features, target)
    persist_data(self.table, result)
def preprocess_income_data(
        data_dir='local_resources/Socio_economic_classification_data/income_dataset'):
    """Build and persist the income dataset stored under ``data_dir``.

    Reads ``users-income`` (space-separated labels) and ``users_friends.csv``
    (edge list) from ``data_dir``, joins them with pandas' default ``merge``
    (inner join on the column names the two frames share, here ``fan_id``),
    passes the joined frame to ``preprocess_data`` and writes the results back
    into ``data_dir`` as ``income.edgelist``, ``X.p`` and ``y.p``.

    Args:
        data_dir: Directory holding the raw files and receiving the outputs.
            The default reproduces the paths that were previously hard-coded,
            so existing zero-argument callers behave as before.
    """
    def in_data_dir(filename):
        # '/'-joined rather than os.path.join so the default yields exactly
        # the path strings this function used before.
        return '{}/{}'.format(data_dir, filename)

    # NOTE(review): read_csv treats the first row as a header by default and
    # the assignment to ``.columns`` then overwrites it. If the raw files have
    # no header row, the first record is silently dropped -- confirm.
    targets = pd.read_csv(in_data_dir('users-income'), sep=' ')
    targets.columns = ['fan_id', 'mean_income']
    # Single-argument print() calls emit the same text on Python 2 and 3;
    # the former ``print a, b`` statements are a SyntaxError on Python 3.
    print('{} {}'.format('target labels of shape: ', targets.shape))

    edges = pd.read_csv(in_data_dir('users_friends.csv'))
    edges.columns = ['fan_id', 'star_id']
    print('{} {}'.format('edge list of shape: ', edges.shape))

    all_data = edges.merge(targets)
    print('{} {}'.format('all data of shape: ', all_data.shape))

    X, y, edge_list = preprocess_data(all_data)
    utils.persist_edgelist(edge_list, in_data_dir('income.edgelist'))
    utils.persist_data(in_data_dir('X.p'), in_data_dir('y.p'), X, y)
def save_response(resp):
    """Persist ``resp`` to the ``tsdb_sma_energy`` table via ``persist_data``."""
    destination = 'tsdb_sma_energy'
    persist_data(destination, resp)
def persist_predictions(self, target, predictions):
    """Store ``predictions`` in the table named ``tsdb_<target>_predictions``.

    The table name is built by string concatenation, so ``target`` must be a
    string. Prints ``persisted`` once ``persist_data`` returns.
    """
    prefix, suffix = 'tsdb_', '_predictions'
    destination = prefix + target + suffix
    persist_data(destination, predictions)
    print("persisted")
def persist_data(self, table, dataframe):
    """Forward ``table`` and ``dataframe`` to the ``persist_data`` helper.

    The helper's return value is discarded; this method returns ``None``.
    """
    # NOTE(review): inside a class body this bare name resolves to the
    # module-level ``persist_data`` function, not to this method -- confirm
    # that helper is in scope where this method is defined.
    persist_data(table, dataframe)