def run(self):
    pool = utils.load_data(self.input()[0].path)
    tss = utils.load_data(self.input()[1].path)
    final = self._fill_na(pool, tss)
    final.to_csv(self.output().path, index=False)
def run(self):
    fitted_valids = utils.load_data(self.input()[0][1].path)
    fitted_tests = utils.load_data(self.input()[1][1].path)
    final = pd.concat([fitted_valids, fitted_tests])
    final.to_csv(self.output().path, index=False)
def run(self):
    valids = utils.load_data(self.input()[0][0].path)
    valids.to_csv(self.output()[0].path, index=False)
    tests = utils.load_data(self.input()[1][0].path)
    tests.to_csv(self.output()[1].path, index=False)
def _source(self):
    self.resource_file = settings.resources[self.resource].absolute().as_posix()
    if 'trajectories' in self.resource:
        self.flat_func = self.travel_time_features
        self.trajectories = utils.load_data(self.input()[0].path)
    else:
        raise Exception('Not finished')
    self.links = utils.load_data(self.input()[1].path)
def run(self):
    self._source()
    df0 = utils.load_data(self.input()[0].path)
    df1 = utils.load_data(self.input()[1].path)
    final = pd.merge(df0, df1, how='left', on=self.oncols)
    # drop rows whose ARIMA value column (tsv) is null
    final = final[~final.tsv.isnull()]
    final.to_csv(self.output().path, index=False)
def _flat_trajectories(self):
    df = utils.load_data(self.resource_file)
    df.travel_time = pd.to_numeric(df.travel_time)
    df.vehicle_id = pd.to_numeric(df.vehicle_id)
    df.starting_time = pd.to_datetime(df.starting_time)
    # flatten the source data into one row per trajectory
    flatted = []
    for index, data in df.iterrows():
        # total cost is the sum of the per-link costs encoded in travel_seq
        total_cost = sum(
            float(x.split('#')[-1]) for x in data.travel_seq.split(';'))
        # bin the starting time into a 20-minute time window
        time_window_start = datetime(
            *data.starting_time.timetuple()[:4],
            math.floor(data.starting_time.minute / 20) * 20)
        time_window_end = time_window_start + timedelta(minutes=20)
        flatted.append([
            data.intersection_id, data.tollgate_id, time_window_start,
            time_window_end, data.travel_time, data.vehicle_id, total_cost,
            data.travel_time - total_cost
        ])
    flatted = pd.DataFrame(
        flatted,
        columns=('intersection_id', 'tollgate_id', 'time_window_start',
                 'time_window_end', 'travel_time', 'vehicle_id', 'cost',
                 'extracost'))
    return flatted
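# Quick worked example of the 20-minute binning used above (illustrative values
# only): a trip starting at 08:47:12 falls in the window [08:40:00, 09:00:00).
from datetime import datetime, timedelta
import math

start = datetime(2016, 7, 19, 8, 47, 12)
window_start = datetime(*start.timetuple()[:4],
                        math.floor(start.minute / 20) * 20)
window_end = window_start + timedelta(minutes=20)
assert window_start == datetime(2016, 7, 19, 8, 40)
assert window_end == datetime(2016, 7, 19, 9, 0)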
def run(self):
    self._source()
    pool = utils.load_data(self.input().path)
    features = self._gen_gbdt_features(pool)
    features.to_csv(self.output().path, index=False)
def run(self):
    self._source()
    df = utils.load_data(self.input().path)
    final = pd.DataFrame([])
    for keys, data in df.groupby(self.key_cols):
        meta_cols = data[self.meta_cols]
        processed_df = self._process(data.drop(self.meta_cols, axis=1))
        # reset the indexes, otherwise the column-wise concat will not align
        meta_cols.reset_index(drop=True, inplace=True)
        processed_df.reset_index(drop=True, inplace=True)
        g_final = pd.concat([meta_cols, processed_df], axis=1)
        final = pd.concat([final, g_final])
    final.to_csv(self.output().path, index=False)
def run(self):
    self.valids = utils.load_data(self.input()[0][0].path)
    self.tests = utils.load_data(self.input()[0][1].path)
    self.valids_real = utils.load_data(self.input()[1].path)
    utils.valid_submssion(self.task, self.tests)
    (df, mape) = self._calculate_mape()
    fig = self._output_plots(df)
    with open(self.output().path, 'w') as f:
        f.write('MAPE for {0} = {1}\n[[file:{2}][Lines compare]]\n'.format(
            self.task, mape, fig))
    logger.info('===== MAPE = {0} for {1} ======'.format(mape, self.task))
def run(self):
    (kcols, vcol, on_cols) = self._source()
    svr_valids = utils.load_data(self.input()[0][0].path)
    svr_tests = utils.load_data(self.input()[0][1].path)
    gbr_valids = utils.load_data(self.input()[1][0].path)
    gbr_tests = utils.load_data(self.input()[1][1].path)
    valids = pd.merge(svr_valids, gbr_valids, how='left', on=on_cols)
    tests = pd.merge(svr_tests, gbr_tests, how='left', on=on_cols)
    # blend the two models with a 2:1 weighting (SVR : GBR)
    valids[vcol] = (valids[vcol + '_x'] * 2 + valids[vcol + '_y'] * 1) / 3.0
    tests[vcol] = (tests[vcol + '_x'] * 2 + tests[vcol + '_y'] * 1) / 3.0
    to_drops = [vcol + '_x', vcol + '_y']
    valids.drop(to_drops, axis=1, inplace=True)
    tests.drop(to_drops, axis=1, inplace=True)
    valids.to_csv(self.output()[0].path, index=False)
    tests.to_csv(self.output()[1].path, index=False)
def run(self):
    self._source()
    features = utils.load_data(self.input().path)
    if not self.select:
        features.to_csv(self.output().path, index=False)
        return None
    # drop useless columns identified via the SVR (traj) or RF (vol) runs
    features.drop(self.dropcols, axis=1, inplace=True)
    print(features.columns)
    features.to_csv(self.output().path, index=False)
def _ts_features(self):
    pool1 = utils.load_data(self.input()[0].path)  # train
    pool2 = utils.load_data(self.input()[1].path)  # valids
    pool3 = self._tests_metas()  # tests
    pool = pd.concat([pool1, pool2, pool3])
    pool.reset_index(drop=True, inplace=True)
    pool['minutes_since_0'] = pool.time_window_start.map(
        lambda x: x.hour * 60 + x.minute)
    pool['minutes_diff_13'] = pool.time_window_start.map(
        lambda x: abs(x.hour - 13) * 60 + x.minute)
    pool['before_holiday'] = pool.time_window_start.map(utils.before_holiday)
    pool['after_holiday'] = pool.time_window_start.map(utils.after_holiday)
    pool['start_holiday'] = pool.time_window_start.map(utils.start_holiday)
    pool['end_holiday'] = pool.time_window_start.map(utils.end_holiday)
    pool['holiday_len'] = pool.time_window_start.map(utils.holiday_len)
    # note: despite its name, this flags windows starting after 13:00
    pool['is_am'] = pool.time_window_start.map(lambda x: x.hour > 13)
    return pool
def run(self):
    pool = utils.load_data(self.input().path)
    filter_ts = self._source()[1]
    # empty frame with the pool's columns and dtypes
    final = pd.DataFrame(columns=pool.columns).astype(pool.dtypes)
    for i in self.metas:
        data = filter_ts(pool, *i)
        final = pd.concat([final, data])
    final.to_csv(self.output().path, index=False)
def run(self):
    (metas, tsfunc, genfunc, key1, key2, vcol) = self._source()
    dates = utils.get_meta('dates')
    fitted_cols = [key1, key2, 'time_window_start', 'tsv']
    forecasts, fitted = {}, pd.DataFrame([], columns=fitted_cols)
    pool = utils.load_data(self.input().path)
    for meta in metas:
        ts = tsfunc(pool, meta)
        to_forecast_days = datetime(*dates[-1]).date() - ts.index[-1].date()
        (forecast, fit) = utils.fit_arima(ts, to_forecast_days.days)
        forecasts[str(meta)] = forecast[-1 * len(dates):]
        assert len(forecasts[str(meta)]) == len(dates), 'Code is wrong!'
        # append the fitted (in-sample) values
        tmp = pd.DataFrame(fit, columns=['tsv'])
        tmp['time_window_start'] = tmp.index
        tmp[key1] = meta[0]
        tmp[key2] = meta[1]
        fitted = pd.concat([fitted, tmp[fitted_cols]])
    # output the predict csv & the fitted csv:
    # the predict csv follows the submit sample (with time_window);
    # the fitted csv has no time_window, but has time_window_start
    final = []
    for meta in metas:
        forecast = forecasts[str(meta)]
        for i in range(len(dates)):
            (year, month, day) = dates[i]
            final.append([*meta, year, month, day, forecast[i]])
    finaldf = genfunc(final)
    finaldf.to_csv(self.output()[0].path, index=False)
    # output the fitted values, merging the predicted values in as well
    finaldf['tsv'] = finaldf[vcol]
    finaldf['time_window_start'] = finaldf.time_window.map(
        lambda x: datetime.strptime(x.split(',')[0][1:], '%Y-%m-%d %H:%M:%S'))
    finaldf.drop(['time_window', vcol], axis=1, inplace=True)
    fitted = pd.concat([fitted, finaldf])
    fitted.to_csv(self.output()[1].path, index=False)
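# Hypothetical sketch of a fit_arima-style helper (the project's utils.fit_arima
# is not shown here; the ARIMA order and statsmodels API below are assumptions).
# It returns (out-of-sample forecast, in-sample fitted values), matching how the
# two values are unpacked in the task above.
from statsmodels.tsa.arima.model import ARIMA

def fit_arima_sketch(ts, steps, order=(1, 1, 1)):
    result = ARIMA(ts, order=order).fit()
    forecast = result.forecast(steps=steps)  # future values, one per step
    fitted = result.fittedvalues             # in-sample fit, indexed like ts
    return forecast, fitted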
def run(self):
    self._source()
    df = utils.load_data(self.input().path)
    # drop rows whose time windows are not in the valids or tests periods;
    # those rows were only needed to compute the rolling features
    features = self._remove_not_in_times(df)
    # generate the polynomial ("ploy") features
    features = self.ploy_func(features)
    features.to_csv(self.output().path, index=False)
def run(self):
    links = utils.load_data(settings.Data.Train.links)
    links['capacity'] = links.length * links.width
    links['tan'] = links.length / links.width
    links['intop_cnt'] = links.in_top.map(lambda x: len(str(x).split(',')))
    links['outtop_cnt'] = links.out_top.map(lambda x: len(str(x).split(',')))
    links['io_link_ratio'] = links.intop_cnt / links.outtop_cnt
    links['in_cap_ratio'], links['out_cap_ratio'] = np.nan, np.nan
    links['in_lane_ratio'], links['out_lane_ratio'] = np.nan, np.nan
    links['io_cap_ratio'], links['io_lane_ratio'] = np.nan, np.nan
    for index, data in links.iterrows():
        intop = [] if pd.isnull(data.in_top) else data.in_top.split(',')
        outop = [] if pd.isnull(data.out_top) else data.out_top.split(',')
        # total capacity and lanes of the in-links
        in_caps, in_lanes = 0, 0
        for item in intop:
            in_caps += links[links.link_id == int(item)].capacity.iloc[0]
            in_lanes += links[links.link_id == int(item)].lanes.iloc[0]
        # total capacity and lanes of the out-links
        out_caps, out_lanes = 0, 0
        for item in outop:
            out_caps += links[links.link_id == int(item)].capacity.iloc[0]
            out_lanes += links[links.link_id == int(item)].lanes.iloc[0]
        # avoid division by zero
        in_caps = in_caps or 1
        out_caps = out_caps or 1
        in_lanes = in_lanes or 1
        out_lanes = out_lanes or 1
        links.at[index, 'in_cap_ratio'] = float(in_caps / data.capacity)
        links.at[index, 'out_cap_ratio'] = float(data.capacity / out_caps)
        links.at[index, 'io_cap_ratio'] = float(in_caps / out_caps)
        links.at[index, 'in_lane_ratio'] = float(in_lanes / data.lanes)
        links.at[index, 'out_lane_ratio'] = float(data.lanes / out_lanes)
        links.at[index, 'io_lane_ratio'] = float(in_lanes / out_lanes)
    links.drop(['in_top', 'out_top', 'lane_width'], axis=1, inplace=True)
    links.to_csv(self.output().path, index=False)
def run(self):
    (times_cols, kcols, vcol, submit_cols) = self._source()
    pool = utils.load_data(self.input().path)
    predicts = pd.DataFrame([], columns=pool.columns)
    for keys, df in pool.groupby(kcols):
        # everything from 2016-10-18 on is to be predicted
        test = df[df.time_window_start >= pd.Timestamp(2016, 10, 18)]
        train = df[df.time_window_start < pd.Timestamp(2016, 10, 18)]
        # drop columns that are entirely null in the training slice
        useless_cols = [
            x for x in train.columns if pd.isnull(train[x]).all()
        ]
        train_x = train.drop([*times_cols, *kcols, vcol, *useless_cols], axis=1)
        train_y = train[vcol]
        regor = self._algorithm().fit(train_x, train_y)
        test_x = test.drop([*times_cols, *kcols, vcol, *useless_cols], axis=1)
        test_y = regor.predict(test_x)
        test[vcol] = test_y
        predicts = pd.concat([predicts, test])
    valids = self._fetch(predicts, utils.get_meta('valids_times'), kcols)
    tests = self._fetch(predicts, utils.get_meta('tests_times'), kcols)
    valids = utils.merge_time_window(valids[[*times_cols, *kcols, vcol]])
    tests = utils.merge_time_window(tests[[*times_cols, *kcols, vcol]])
    # rearrange the columns into the submission order
    valids = valids[submit_cols]
    tests = tests[submit_cols]
    valids.to_csv(self.output()[0].path, index=False)
    tests.to_csv(self.output()[1].path, index=False)
def run(self):
    self._source()
    pool = utils.load_data(self.input().path)
    final = pd.DataFrame()
    for keys, df in pool.groupby(self.kcols):
        train_ori = df[df.time_window_start < pd.Timestamp(2016, 10, 18)]
        tests_ori = df[df.time_window_start >= pd.Timestamp(2016, 10, 18)]
        valid_ori = tests_ori[~pd.isnull(tests_ori[self.vcol])]
        train = train_ori.drop([*self.timecols, *self.kcols], axis=1)
        valid = valid_ori.drop([*self.timecols, *self.kcols], axis=1)
        # move the target column to the last position
        fcols = train.columns.tolist()
        fcols.remove(self.vcol)
        train = train.reindex(columns=[*fcols, self.vcol])
        valid = valid.reindex(columns=[*fcols, self.vcol])
        # remove columns that are entirely null
        useless_cols = [
            x for x in train.columns if pd.isnull(train[x]).all()
        ]
        train.drop(useless_cols, axis=1, inplace=True)
        valid.drop(useless_cols, axis=1, inplace=True)
        # genetic-algorithm instance selection: keep only the selected rows
        ga = GA(train.values, valid.values, SVR(), iter=10, r_sample=0.5,
                verbose=True, r_keep_best=0.01)
        (sample, gene, varies) = ga.select_instance()
        final = pd.concat([final, train_ori[gene], tests_ori])
    final.to_csv(self.output().path, index=False)
def get_batch(batch_size, num_steps, name=None):
    """Returns:
    A pair of Tensors, each shaped [batch_size, num_steps]. The second element
    of the tuple is the same data time-shifted to the right by one.
    """
    data = utils.load_data()
    with tf.name_scope(name, "Input", [data, batch_size, num_steps]):
        data = tf.convert_to_tensor(data, name="data", dtype=tf.int32)
        data_len = tf.size(data)
        batch_len = data_len // batch_size
        data = tf.reshape(data[0:batch_size * batch_len],
                          [batch_size, batch_len])
        epoch_size = (batch_len - 1) // num_steps
        i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
        x = tf.slice(data, [0, i * num_steps], [batch_size, num_steps])
        y = tf.slice(data, [0, i * num_steps + 1], [batch_size, num_steps])
        return x, y
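# A minimal usage sketch, assuming TF 1.x and that utils.load_data() returns a
# flat sequence of integer token ids. range_input_producer is queue-based, so
# the queue runners must be started before evaluating the batch tensors.
x, y = get_batch(batch_size=20, num_steps=35)
with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    inputs, targets = sess.run([x, y])  # each (20, 35); targets shifted by one
    coord.request_stop()
    coord.join(threads)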
def run(self):
    self._source()
    pool = utils.load_data(self.input().path)
    final = pd.DataFrame()
    for keys, df in pool.groupby(self.kcols):
        train_ori = df[df.time_window_start < pd.Timestamp(2016, 10, 18)]
        tests_ori = df[df.time_window_start >= pd.Timestamp(2016, 10, 18)]
        valid_ori = tests_ori[~pd.isnull(tests_ori[self.vcol])]
        train = train_ori.drop([*self.timecols, *self.kcols], axis=1)
        valid = valid_ori.drop([*self.timecols, *self.kcols], axis=1)
        # move the target column to the last position
        fcols = train.columns.tolist()
        fcols.remove(self.vcol)
        train = train.reindex(columns=[*fcols, self.vcol])
        valid = valid.reindex(columns=[*fcols, self.vcol])
        # genetic-algorithm feature selection
        ga = GA(train.values, valid.values, SVR(), iter=5, r_sample=0.5,
                verbose=True, r_keep_best=0.01)
        (sample, gene, varies) = ga.select_feature()
        print(train.columns[gene])
        # blank out the deselected feature columns in the training rows
        useless_cols = train.columns[~gene]
        train_ori[useless_cols] = np.nan
        final = pd.concat([final, train_ori, tests_ori])
    final.to_csv(self.output().path, index=False)
def run(self):
    self._source()
    rawdata = utils.load_data(self.input().path)
    roll_features = self._rolling_time(self.roll_cols, rawdata)
    pool = pd.merge(rawdata, roll_features, how='left', on=self.roll_ons)
    # the rolling features leave some NAs behind;
    # fill them with the previous (historical) values
    to_fill_cols = set(roll_features.columns) - set(
        ['time_window_start', self.key1, self.key2])
    pool = self._fill_na_with_previous(pool, list(to_fill_cols))
    pool.drop(self.extra_cols, axis=1, inplace=True)
    print(pool.shape)
    # drop rows that still have NAs in any column except the target
    non_na_cols = list(set(pool.columns) - {self.vcol})
    pool.dropna(axis=0, how='any', subset=non_na_cols, inplace=True)
    print(pool.shape)
    pool.to_csv(self.output().path, index=False)
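# Hypothetical sketch of the "fill NA with previous values" idea used above
# (not the project's actual _fill_na_with_previous helper): forward-fill each
# rolling column within its (key1, key2) group after sorting by time.
def fill_na_with_previous_sketch(df, cols, key1, key2):
    df = df.sort_values('time_window_start')
    df[cols] = df.groupby([key1, key2])[cols].ffill()
    return df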
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from glob import glob
from collections import UserDict
from IPython.display import Image
from sklearn.preprocessing import MinMaxScaler

from common.utils import load_data, mape, TimeSeriesTensor, create_evaluation_df

pd.options.display.float_format = '{:,.2f}'.format
np.set_printoptions(precision=2)
warnings.filterwarnings("ignore")

data_dir = 'data/'
energy = load_data(data_dir)
energy.head()

valid_start_dt = '2014-09-01 00:00:00'
test_start_dt = '2014-11-01 00:00:00'

# plot the train / validation / test split of the load series
energy[energy.index < valid_start_dt][['load']].rename(columns={'load': 'train'}) \
    .join(energy[(energy.index >= valid_start_dt) & (energy.index < test_start_dt)][['load']]
          .rename(columns={'load': 'validation'}), how='outer') \
    .join(energy[test_start_dt:][['load']].rename(columns={'load': 'test'}), how='outer') \
    .plot(y=['train', 'validation', 'test'], figsize=(15, 8), fontsize=12)
plt.xlabel('timestamp', fontsize=12)
plt.ylabel('load', fontsize=12)
plt.show()

T = 10  # learn from the previous 10 steps
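# A minimal sketch of the "learn from the previous T steps" idea, built by hand
# with shifted columns; the notebook itself uses the TimeSeriesTensor helper
# imported from common.utils for this. Reuses energy, valid_start_dt and T from
# the cell above.
lagged = energy[energy.index < valid_start_dt][['load']].copy()
for lag in range(1, T + 1):
    lagged['load_lag{}'.format(lag)] = lagged['load'].shift(lag)
lagged = lagged.dropna()  # drop the first T rows that lack a full history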
# preset settings
if len(sys.argv) == 3 and sys.argv[1] == '--model':
    args['common']['model'] = sys.argv[2]
model = args['common']['model']
data_path = Path('../data/project')
args['common']['cuda'] = args['common']['cuda'] and torch.cuda.is_available()
args['common']['device'] = torch.device(
    "cuda" if args['common']['cuda'] else "cpu")
print("Using GPU" if args['common']['cuda'] else "Using CPU")

# start loading data
X_train_val, y_train_val, X_test, y_test, person_train_val, person_test = load_data(
    data_path)

# standardize the dataset
scaler = StandardScaler()
X_train_val = scaler.fit_transform(
    X_train_val.reshape(-1, X_train_val.shape[-1])).reshape(X_train_val.shape)
# only transform here: the scaler was already fit on the (larger) training set
X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(
    X_test.shape)

# upsample the data
if args['common']['scale'] != 1:
    scale = args['common']['scale']
    X_train_val = Tensor(X_train_val)
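# Why the reshape in the standardization step above: StandardScaler works on
# 2-D (samples, features) input, so the 3-D array is flattened along its last
# axis, scaled per feature, then restored to its original shape. The sizes here
# are made up for illustration only.
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.random.randn(8, 100, 22)  # hypothetical (samples, timesteps, channels)
sc = StandardScaler().fit(X.reshape(-1, X.shape[-1]))
X_scaled = sc.transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)
assert X_scaled.shape == X.shape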
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from collections import UserDict
from IPython.display import Image

# %matplotlib inline
from common.utils import load_data, mape, TimeSeriesTensor, create_evaluation_df

pd.options.display.float_format = '{:,.2f}'.format
np.set_printoptions(precision=2)
warnings.filterwarnings("ignore")

energy = load_data('data/')
energy.head()

valid_start_dt = '2014-09-01 00:00:00'
test_start_dt = '2014-11-01 00:00:00'

T = 6
HORIZON = 3

train = energy.copy()[energy.index < valid_start_dt][['load', 'temp']]

from sklearn.preprocessing import MinMaxScaler

y_scaler = MinMaxScaler()
y_scaler.fit(train[['load']])
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

if __name__ == '__main__':
    time_step_lag = 6
    HORIZON = 1

    data_dir = 'data/'
    multi_time_series = load_data(data_dir)
    print(multi_time_series.head())

    valid_start_dt = '2011-09-01 00:00:00'
    test_start_dt = '2011-11-01 00:00:00'

    train_inputs, valid_inputs, test_inputs, y_scaler = split_train_validation_test(
        multi_time_series,
        valid_start_time=valid_start_dt,
        test_start_time=test_start_dt,
        time_step_lag=time_step_lag,
        horizon=HORIZON,
        features=["load", "imf1", "imf2"],
        target=["load", "imf1", "imf2"])

    X_train = train_inputs['X']
                    type=str,
                    default='cpu',
                    help='You can choose cpu or cudnn.')
parser.add_argument('--device', '-d',
                    type=int,
                    default=0,
                    help='You can choose the device id when you use cudnn.')
args = parser.parse_args()

if args.context == 'cudnn':
    from nnabla.ext_utils import get_extension_context
    ctx = get_extension_context('cudnn', device_id=args.device)
    nn.set_default_context(ctx)

train_data = load_data('./ptb/train.txt', with_bos=True)
train_data = with_padding(train_data, padding_type='post')
valid_data = load_data('./ptb/valid.txt', with_bos=True)
valid_data = with_padding(valid_data, padding_type='post')

vocab_size = len(w2i)
sentence_length = 60
embedding_size = 128
hidden_size = 128
batch_size = 32
max_epoch = 100

x_train = train_data[:, :sentence_length].astype(np.int32)
y_train = train_data[:, 1:sentence_length - 1].astype(np.int32)