Example #1
File: filler.py Project: Marcnuth/KDD2017
    def run(self):

        pool = utils.load_data(self.input()[0].path)
        tss = utils.load_data(self.input()[1].path)

        final = self._fill_na(pool, tss)
        final.to_csv(self.output().path, index=False)
Example #2
    def run(self):

        fitted_valids = utils.load_data(self.input()[0][1].path)
        fitted_tests = utils.load_data(self.input()[1][1].path)

        final = pd.concat([fitted_valids, fitted_tests])
        final.to_csv(self.output().path, index=False)
Example #3
    def run(self):

        valids = utils.load_data(self.input()[0][0].path)
        valids.to_csv(self.output()[0].path, index=False)

        tests = utils.load_data(self.input()[1][0].path)
        tests.to_csv(self.output()[1].path, index=False)
Example #4
File: flats.py Project: Marcnuth/KDD2017
    def _source(self):
        self.resource_file = settings.resources[
            self.resource].absolute().as_posix()
        if 'trajectories' in self.resource:
            self.flat_func = self.travel_time_features
            self.trajectories = utils.load_data(self.input()[0].path)
        else:
            raise Exception('Not finished')

        self.links = utils.load_data(self.input()[1].path)
Example #5
    def run(self):
        self._source()

        df0 = utils.load_data(self.input()[0].path)

        df1 = utils.load_data(self.input()[1].path)

        final = pd.merge(df0, df1, how='left', on=self.oncols)

        # exclude rows whose col:tsv (from ARIMA) value is null
        final = final[~final.tsv.isnull()]
        final.to_csv(self.output().path, index=False)
Example #6
File: flats.py Project: Marcnuth/KDD2017
    def _flat_trajectories(self):
        df = utils.load_data(self.resource_file)
        df.travel_time = pd.to_numeric(df.travel_time)
        df.vehicle_id = pd.to_numeric(df.vehicle_id)
        df.starting_time = pd.to_datetime(df.starting_time)

        # flatten the source data
        flatted = []
        for row in df.iterrows():
            (index, data) = row
            total_cost = reduce(
                lambda x, y: x + y,
                map(lambda x: float(x.split('#')[-1]),
                    data.travel_seq.split(';')))

            time_window_start = datetime(
                *data.starting_time.timetuple()[:4],
                math.floor(data.starting_time.minute / 20) * 20)
            time_window_end = time_window_start + timedelta(minutes=20)
            flatted.append([
                data.intersection_id, data.tollgate_id, time_window_start,
                time_window_end, data.travel_time, data.vehicle_id, total_cost,
                data.travel_time - total_cost
            ])

        flatted = pd.DataFrame(flatted,
                               columns=('intersection_id', 'tollgate_id',
                                        'time_window_start', 'time_window_end',
                                        'travel_time', 'vehicle_id', 'cost',
                                        'extracost'))

        return flatted
Example #7
    def run(self):
        self._source()

        pool = utils.load_data(self.input().path)

        features = self._gen_gbdt_features(pool)
        features.to_csv(self.output().path, index=False)
Example #8
    def run(self):
        self._source()

        df = utils.load_data(self.input().path)

        final = pd.DataFrame([])
        for g in df.groupby(self.key_cols):

            (keys, data) = g

            meta_cols = data[self.meta_cols]
            processed_df = self._process(data.drop(self.meta_cols, axis=1))

            # reset the index, otherwise the concat will not work
            meta_cols.reset_index(drop=True, inplace=True)
            processed_df.reset_index(drop=True, inplace=True)
            g_final = pd.concat([meta_cols, processed_df], axis=1)

            final = pd.concat([final, g_final])

        #features = feature_selection.VarianceThreshold().fit_transform(features)

        #print(ployed_df.head())
        #print(meta_df.head())
        #print(final.head())

        final.to_csv(self.output().path, index=False)
Example #9
    def run(self):

        self.valids = utils.load_data(self.input()[0][0].path)
        self.tests = utils.load_data(self.input()[0][1].path)
        self.valids_real = utils.load_data(self.input()[1].path)

        utils.valid_submssion(self.task, self.tests)

        (df, mape) = self._calculate_mape()
        fig = self._output_plots(df)

        with open(self.output().path, 'w') as f:
            f.write('MAPE for {0} = {1}\n[[file:{2}][Lines compare]]\n'.format(
                self.task, mape, fig
            ))

        logger.info('===== MAPE = {0} for {1} ======'.format(mape, self.task))
Example #10
    def run(self):
        (kcols, vcol, on_cols) = self._source()

        svr_valids = utils.load_data(self.input()[0][0].path)
        svr_tests = utils.load_data(self.input()[0][1].path)
        gbr_valids = utils.load_data(self.input()[1][0].path)
        gbr_tests = utils.load_data(self.input()[1][1].path)

        valids = pd.merge(svr_valids, gbr_valids, how='left', on=on_cols)
        tests = pd.merge(svr_tests, gbr_tests, how='left', on=on_cols)

        valids[vcol] = (valids[vcol + '_x'] * 2 +
                        valids[vcol + '_y'] * 1) / 3.0
        tests[vcol] = (tests[vcol + '_x'] * 2 + tests[vcol + '_y'] * 1) / 3.0

        to_drops = [vcol + '_x', vcol + '_y']
        valids.drop(to_drops, axis=1, inplace=True)
        tests.drop(to_drops, axis=1, inplace=True)

        valids.to_csv(self.output()[0].path, index=False)
        tests.to_csv(self.output()[1].path, index=False)
Example #11
    def run(self):
        self._source()

        features = utils.load_data(self.input().path)
        if not self.select:
            features.to_csv(self.output().path, index=False)
            return None

        # drop useless cols via the SVR(traj) or RF(vol)
        features.drop(self.dropcols, axis=1, inplace=True)
        print(features.columns)
        features.to_csv(self.output().path, index=False)
Example #12
    def _ts_features(self):

        pool1 = utils.load_data(self.input()[0].path)  # train
        pool2 = utils.load_data(self.input()[1].path)  # valids
        pool3 = self._tests_metas()  # tests
        pool = pd.concat([pool1, pool2, pool3])
        pool.reset_index(drop=True, inplace=True)

        pool['minutes_since_0'] = pool.time_window_start.map(
            lambda x: x.hour * 60 + x.minute)
        pool['minutes_diff_13'] = pool.time_window_start.map(
            lambda x: abs(x.hour - 13) * 60 + x.minute)
        pool['before_holiday'] = pool.time_window_start.map(
            utils.before_holiday)
        pool['after_holiday'] = pool.time_window_start.map(utils.after_holiday)
        pool['start_holiday'] = pool.time_window_start.map(utils.start_holiday)
        pool['end_holiday'] = pool.time_window_start.map(utils.end_holiday)
        pool['holiday_len'] = pool.time_window_start.map(utils.holiday_len)
        pool['is_am'] = pool.time_window_start.map(lambda x: x.hour > 13)

        return pool
Example #13
    def run(self):

        pool = utils.load_data(self.input().path)
        filter_ts = self._source()[1]

        final = pd.DataFrame([], columns=pool.columns, dtype=pool.dtypes)
        for i in self.metas:
            #logger.info(i)
            data = filter_ts(pool, *i)
            final = pd.concat([final, data])

        final.to_csv(self.output().path, index=False)
Example #14
    def run(self):
        (metas, tsfunc, genfunc, key1, key2, vcol) = self._source()

        dates = utils.get_meta('dates')

        fitted_cols = [key1, key2, 'time_window_start', 'tsv']
        forecasts, fitted = {}, pd.DataFrame([], columns=fitted_cols)

        pool = utils.load_data(self.input().path)
        #logger.info(pool)
        for meta in metas:
            ts = tsfunc(pool, meta)

            to_forecast_dates = pd.datetime(*dates[-1]).date() - ts.index[-1].date()
            (forecast, fit) = utils.fit_arima(ts, to_forecast_dates.days)

            forecasts[str(meta)] = forecast[-1 * len(dates):]
            assert len(forecasts[str(meta)]) == len(dates), 'Code is wrong!'

            # append fitted values
            tmp = pd.DataFrame(fit, columns=['tsv'])
            tmp['time_window_start'] = tmp.index
            tmp[key1] = meta[0]
            tmp[key2] = meta[1]

            fitted = pd.concat([fitted, tmp[fitted_cols]])

        # output the prediction csv & the fitted csv
        # the prediction csv should follow the submit sample (with time_window)
        # the fitted csv has no time_window, but has time_window_start
        final = []
        for meta in metas:
            forecast = forecasts[str(meta)]
            for i in range(len(dates)):
                #logger.info(meta)
                #logger.info(dates[i])
                #logger.info(forecast)
                (year, month, day) = dates[i]
                final.append([*meta, year, month, day, forecast[i]])

        finaldf = genfunc(final)
        finaldf.to_csv(self.output()[0].path, index=False)

        # output fitted values, merge the predicts values into here
        finaldf['tsv'] = finaldf[vcol]
        finaldf['time_window_start'] = finaldf.time_window.map(
            lambda x: datetime.strptime(x.split(',')[0][1:], '%Y-%m-%d %H:%M:%S')
        )
        finaldf.drop(['time_window', vcol], axis=1, inplace=True)

        fitted = pd.concat([fitted, finaldf])
        fitted.to_csv(self.output()[1].path, index=False)
Example #15
    def run(self):
        self._source()

        df = utils.load_data(self.input().path)

        # remove rows whose times are not within the valids or tests windows;
        # those out-of-window rows were only needed for the rolling features
        features = self._remove_not_in_times(df)

        # generate some polynomial ("ploy") features
        features = self.ploy_func(features)

        features.to_csv(self.output().path, index=False)
Example #16
File: flats.py Project: Marcnuth/KDD2017
    def run(self):
        links = utils.load_data(settings.Data.Train.links)
        links['capacity'] = links.length * links.width
        links['tan'] = links.length / links.width

        links['intop_cnt'] = links.in_top.map(lambda x: len(str(x).split(',')))
        links['outtop_cnt'] = links.out_top.map(
            lambda x: len(str(x).split(',')))
        links['io_link_ratio'] = links.intop_cnt / links.outtop_cnt

        links['in_cap_ratio'], links['out_cap_ratio'] = np.nan, np.nan
        links['in_lane_ratio'], links['out_lane_ratio'] = np.nan, np.nan
        links['io_cap_ratio'], links['io_lane_ratio'] = np.nan, np.nan
        for row in links.iterrows():
            (index, data) = row

            intop = [] if pd.isnull(data.in_top) else data.in_top.split(',')
            outop = [] if pd.isnull(data.out_top) else data.out_top.split(',')

            in_caps, in_lanes = 0, 0
            for item in intop:
                in_caps += links[links.link_id == int(item)].capacity.iloc[0]
                in_lanes += links[links.link_id == int(item)].lanes.iloc[0]

            out_caps, out_lanes = 0, 0
            for item in outop:
                out_caps += links[links.link_id == int(item)].capacity.iloc[0]
                out_lanes += links[links.link_id == int(item)].lanes.iloc[0]

            # avoid zero denominators in the ratios below
            in_caps = in_caps or 1
            out_caps = out_caps or 1
            in_lanes = in_lanes or 1
            out_lanes = out_lanes or 1

            links.set_value(index, 'in_cap_ratio',
                            float(in_caps / data.capacity))
            links.set_value(index, 'out_cap_ratio',
                            float(data.capacity / out_caps))
            links.set_value(index, 'io_cap_ratio', float(in_caps / out_caps))

            links.set_value(index, 'in_lane_ratio',
                            float(in_lanes / data.lanes))
            links.set_value(index, 'out_lane_ratio',
                            float(data.lanes / out_lanes))
            links.set_value(index, 'io_lane_ratio',
                            float(in_lanes / out_lanes))

        links.drop(['in_top', 'out_top', 'lane_width'], axis=1, inplace=True)
        links.to_csv(self.output().path, index=False)
Example #17
    def run(self):

        (times_cols, kcols, vcol, submit_cols) = self._source()

        pool = utils.load_data(self.input().path)
        predicts = pd.DataFrame([], columns=pool.columns)

        for g in pool.groupby(kcols):
            (keys, df) = g

            test = df[df.time_window_start >= pd.datetime(2016, 10, 18)]
            train = df[df.time_window_start < pd.datetime(2016, 10, 18)]

            useless_cols = [
                x for x in train.columns if pd.isnull(train[x]).all()
            ]

            train_x = train.drop([*times_cols, *kcols, vcol, *useless_cols],
                                 axis=1)
            train_y = train[vcol]

            regor = self._algorithm().fit(train_x, train_y)
            #if self.algorithm.lower() == 'svr':
            #    print(regor.best_params_)
            #print(regor.estimators_.tolist()[0])

            test_x = test.drop([*times_cols, *kcols, vcol, *useless_cols],
                               axis=1)
            test_y = regor.predict(test_x)
            test[vcol] = test_y
            #print(test_x.head())
            #print(test_y)
            #print(test.head())

            predicts = pd.concat([predicts, test])

        valids = self._fetch(predicts, utils.get_meta('valids_times'), kcols)
        tests = self._fetch(predicts, utils.get_meta('tests_times'), kcols)

        valids = utils.merge_time_window(valids[[*times_cols, *kcols, vcol]])
        tests = utils.merge_time_window(tests[[*times_cols, *kcols, vcol]])

        # rearrange columns
        valids = valids[submit_cols]
        tests = tests[submit_cols]

        valids.to_csv(self.output()[0].path, index=False)
        tests.to_csv(self.output()[1].path, index=False)
Example #18
    def run(self):
        self._source()

        pool = utils.load_data(self.input().path)

        final = pd.DataFrame()
        for g in pool.groupby(self.kcols):
            (keys, df) = g

            train_ori = df[df.time_window_start < pd.datetime(2016, 10, 18)]
            tests_ori = df[df.time_window_start >= pd.datetime(2016, 10, 18)]
            valid_ori = tests_ori[~pd.isnull(tests_ori[self.vcol])]

            train = train_ori.drop([*self.timecols, *self.kcols], axis=1)
            valid = valid_ori.drop([*self.timecols, *self.kcols], axis=1)

            fcols = train.columns.tolist()
            fcols.remove(self.vcol)

            train = train.reindex_axis([*fcols, self.vcol], axis=1)
            valid = valid.reindex_axis([*fcols, self.vcol], axis=1)

            # remove null columns
            useless_cols = [
                x for x in train.columns if pd.isnull(train[x]).all()
            ]
            train.drop(useless_cols, axis=1, inplace=True)
            valid.drop(useless_cols, axis=1, inplace=True)

            ga = GA(train.values,
                    valid.values,
                    SVR(),
                    iter=10,
                    r_sample=0.5,
                    verbose=True,
                    r_keep_best=0.01)
            (sample, gene, varies) = ga.select_instance()

            #sns.tsplot(varies)
            #plt.show()
            #assert None

            final = pd.concat([final, train_ori[gene], tests_ori])

        #print(final)
        #assert None
        final.to_csv(self.output().path, index=False)
Example #19
def get_batch(batch_size, num_steps, name=None):
    """Returns:
    A pair of Tensors, each shaped [batch_size, num_steps]. The second element
    of the tuple is the same data time-shifted to the right by one."""
    data = utils.load_data()

    with tf.name_scope(name, "Input", [data, batch_size, num_steps]):
        data = tf.convert_to_tensor(data, name="data", dtype=tf.int32)

        data_len = tf.size(data)
        batch_len = data_len // batch_size
        data = tf.reshape(data[0:batch_size * batch_len],
                          [batch_size, batch_len])

        epoch_size = (batch_len - 1) // num_steps

        i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
        x = tf.slice(data, [0, i * num_steps], [batch_size, num_steps])
        y = tf.slice(data, [0, i * num_steps + 1], [batch_size, num_steps])
        return x, y
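A minimal usage sketch for get_batch above (my own illustration, not part of the source project, assuming the same TF1.x graph mode and that utils.load_data() returns a flat list of token ids): tf.train.range_input_producer feeds indices through queue runners, so the returned tensors would be consumed roughly as follows, with batch_size and num_steps chosen by the caller.

import tensorflow as tf

x, y = get_batch(batch_size=20, num_steps=35)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # each fetched array is shaped [batch_size, num_steps];
    # targets is inputs shifted right by one step
    inputs, targets = sess.run([x, y])
    coord.request_stop()
    coord.join(threads)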
Example #20
    def run(self):
        self._source()

        pool = utils.load_data(self.input().path)

        final = pd.DataFrame()
        for g in pool.groupby(self.kcols):
            (keys, df) = g

            train_ori = df[df.time_window_start < pd.datetime(2016, 10, 18)]
            tests_ori = df[df.time_window_start >= pd.datetime(2016, 10, 18)]
            valid_ori = tests_ori[~pd.isnull(tests_ori[self.vcol])]

            train = train_ori.drop([*self.timecols, *self.kcols], axis=1)
            valid = valid_ori.drop([*self.timecols, *self.kcols], axis=1)

            fcols = train.columns.tolist()
            fcols.remove(self.vcol)

            train = train.reindex_axis([*fcols, self.vcol], axis=1)
            valid = valid.reindex_axis([*fcols, self.vcol], axis=1)

            ga = GA(train.values,
                    valid.values,
                    SVR(),
                    iter=5,
                    r_sample=0.5,
                    verbose=True,
                    r_keep_best=0.01)
            (sample, gene, varies) = ga.select_feature()
            print(train.columns[gene])

            useless_cols = train.columns[~gene]
            train_ori[useless_cols] = np.nan

            final = pd.concat([final, train_ori, tests_ori])

        final.to_csv(self.output().path, index=False)
Example #21
    def run(self):
        self._source()

        rawdata = utils.load_data(self.input().path)

        roll_features = self._rolling_time(self.roll_cols, rawdata)
        #roll_features.to_csv('tmp.csv', index=False)
        pool = pd.merge(rawdata, roll_features, how='left', on=self.roll_ons)

        # after generating the rolling features, some NAs exist;
        # we need to fill them with the previous (historical) values
        to_fill_cols = set(roll_features.columns) - set([
            'time_window_start', self.key1, self.key2])
        pool = self._fill_na_with_previous(pool, list(to_fill_cols))

        pool.drop(self.extra_cols, axis=1, inplace=True)

        print(pool.shape)
        non_na_cols = set(pool.columns) - set({self.vcol})
        pool.dropna(axis=0, how='any', subset=non_na_cols, inplace=True)
        print(pool.shape)

        pool.to_csv(self.output().path, index=False)
Example #22
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from glob import glob

from collections import UserDict
from IPython.display import Image
from sklearn.preprocessing import MinMaxScaler

from common.utils import load_data, mape, TimeSeriesTensor, create_evaluation_df

pd.options.display.float_format = '{:,.2f}'.format
np.set_printoptions(precision=2)
warnings.filterwarnings("ignore")

data_dir = 'data/'
energy = load_data(data_dir)
energy.head()

valid_start_dt = '2014-09-01 00:00:00'
test_start_dt = '2014-11-01 00:00:00'

energy[energy.index < valid_start_dt][['load']].rename(columns={'load':'train'}) \
    .join(energy[(energy.index >=valid_start_dt) & (energy.index < test_start_dt)][['load']] \
          .rename(columns={'load':'validation'}), how='outer') \
    .join(energy[test_start_dt:][['load']].rename(columns={'load':'test'}), how='outer') \
    .plot(y=['train', 'validation', 'test'], figsize=(15, 8), fontsize=12)
plt.xlabel('timestamp', fontsize=12)
plt.ylabel('load', fontsize=12)
plt.show()

T = 10  ## learn from previous 10 steps
Example #23
    # preset settings
    if len(sys.argv) == 3 and sys.argv[1] == '--model':
        args['common']['model'] = sys.argv[2]

    model = args['common']['model']
    data_path = Path('../data/project')

    args['common'][
        'cuda'] = args['common']['cuda'] and torch.cuda.is_available()
    args['common']['device'] = torch.device(
        "cuda" if args['common']['cuda'] else "cpu")

    print("Using GPU" if args['common']['cuda'] else "Using CPU")

    # start loading data
    X_train_val, y_train_val, X_test, y_test, person_train_val, person_test = load_data(
        data_path)

    # standardize the dataset
    scaler = StandardScaler()

    X_train_val = scaler.fit_transform(
        X_train_val.reshape(-1,
                            X_train_val.shape[-1])).reshape(X_train_val.shape)
    # note that we only use transform here because the training dataset is larger
    X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(
        X_test.shape)

    # upsample data
    if args['common']['scale'] != 1:
        scale = args['common']['scale']
        X_train_val = Tensor(X_train_val)
Example #24
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from collections import UserDict
from IPython.display import Image
# %matplotlib inline

from common.utils import load_data, mape, TimeSeriesTensor, create_evaluation_df

pd.options.display.float_format = '{:,.2f}'.format
np.set_printoptions(precision=2)
warnings.filterwarnings("ignore")

energy = load_data('data/')
energy.head()

valid_start_dt = '2014-09-01 00:00:00'
test_start_dt = '2014-11-01 00:00:00'

T = 6
HORIZON = 3

train = energy.copy()[energy.index < valid_start_dt][['load', 'temp']]

from sklearn.preprocessing import MinMaxScaler

y_scaler = MinMaxScaler()
y_scaler.fit(train[['load']])
Example #25
from keras.callbacks import ModelCheckpoint

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

if __name__ == '__main__':

    time_step_lag = 6
    HORIZON = 1

    data_dir = 'data/'
    multi_time_series = load_data(data_dir)
    print(multi_time_series.head())

    valid_start_dt = '2011-09-01 00:00:00'
    test_start_dt = '2011-11-01 00:00:00'

    train_inputs, valid_inputs, test_inputs, y_scaler = split_train_validation_test(
        multi_time_series,
        valid_start_time=valid_start_dt,
        test_start_time=test_start_dt,
        time_step_lag=time_step_lag,
        horizon=HORIZON,
        features=["load", "imf1", "imf2"],
        target=["load", "imf1", "imf2"])

    X_train = train_inputs['X']
Example #26
                    type=str,
                    default='cpu',
                    help='You can choose cpu or cudnn.')
parser.add_argument('--device',
                    '-d',
                    type=int,
                    default=0,
                    help='You can choose the device id when you use cudnn.')
args = parser.parse_args()

if args.context == 'cudnn':
    from nnabla.ext_utils import get_extension_context
    ctx = get_extension_context('cudnn', device_id=args.device)
    nn.set_default_context(ctx)

train_data = load_data('./ptb/train.txt', with_bos=True)
train_data = with_padding(train_data, padding_type='post')

valid_data = load_data('./ptb/valid.txt', with_bos=True)
valid_data = with_padding(valid_data, padding_type='post')

vocab_size = len(w2i)
sentence_length = 60
embedding_size = 128
hidden_size = 128
batch_size = 32
max_epoch = 100

x_train = train_data[:, :sentence_length].astype(np.int32)
y_train = train_data[:, 1:sentence_length - 1].astype(np.int32)