예제 #1
0
def dataprep(data_file, forward, file_data, file_truth):
    """Build standardised feature and target tables from a raw price CSV.

    The CSV is read with its ``date`` column as the index, the columns that
    ``wr.get_cols`` matches for the instruments with incomplete history are
    dropped, remaining gaps are filled forward then backward, and only the
    columns matched for ``bidopen``/``bidhigh``/``bidlow`` are kept. The
    result is passed through ``wr.p_diff`` (percentage difference, per the
    project helper's name) and standardised column-wise. The target table is
    the rolling sum of that differenced data over ``forward`` rows,
    standardised the same way.

    Args:
        data_file: Path of the input CSV; must contain a ``date`` column.
        forward: Window length, in rows, of the rolling sum that forms the
            target table.
        file_data: Directory that receives ``d_msd.csv``, ``d_mean.csv`` and
            ``d_std.csv``.
        file_truth: Directory that receives ``t_msd.csv``, ``t_mean.csv`` and
            ``t_std.csv``.

    Returns:
        None. All results are written to disk as CSV files.
    """
    from DataProcess import wrangle as wr

    # Raw dataset covering all instruments.
    data = pd.read_csv(data_file, index_col='date')

    # Several instruments have incomplete datasets for the last 3 years.
    bad_instruments = [
        'FRA40', 'CHN50', 'US2000', 'USOil', 'SOYF', 'WHEATF', 'CORNF',
        'EMBasket', 'JPYBasket', 'BTC/USD', 'BCH/USD', 'ETH/USD', 'LTC/USD',
        'XRP/USD', 'CryptoMajor', 'USEquities'
    ]
    bad_cols = wr.get_cols(data, bad_instruments)
    data = data.drop(bad_cols, axis=1)

    # Fill the gaps that are left (from market shutdown over the weekend):
    # forward first, then backward for any leading gaps. ffill()/bfill()
    # replace the deprecated fillna(method=...) spelling.
    data = data.ffill().bfill()

    good_cols = wr.get_cols(data, ['bidopen', 'bidhigh', 'bidlow'])
    data = data[good_cols]

    # Percentage difference of the data.
    data = wr.p_diff(data)

    # Column statistics of the features (saved so they can be reused later).
    d_mean = data.mean()
    d_std = data.std()

    # Target: rolling sum of the differenced data over `forward` rows. The
    # first `forward - 1` rows are NaN because the window is incomplete.
    truth = data.rolling(forward).sum()
    t_mean = truth.mean()
    t_std = truth.std()

    # Standardise both tables to zero mean and unit standard deviation.
    d_msd = (data - d_mean) / d_std
    t_msd = (truth - t_mean) / t_std

    d_msd.to_csv(file_data + '/d_msd.csv')
    t_msd.to_csv(file_truth + '/t_msd.csv')

    d_mean.to_csv(file_data + '/d_mean.csv')
    d_std.to_csv(file_data + '/d_std.csv')
    t_mean.to_csv(file_truth + '/t_mean.csv')
    t_std.to_csv(file_truth + '/t_std.csv')
예제 #2
0
    def __init__(self):
        """Set the configuration lists and load one price table per instrument.

        For every entry in ``self.instruments`` the CSV file named after the
        instrument (with ``/`` replaced by ``_``) is read from the data
        directory with ``date`` as the index, reduced to the columns that
        ``wr.get_cols`` matches for ``self.colums``, and appended to
        ``self.datafile`` in instrument order.
        """
        self.instruments = ['EUR/USD']
        self.colums = ['bidclose', 'bidhigh', 'bidlow', 'tickqty']
        # Not used in this method; kept for the rest of the class.
        self.spacings = [5, 30, 240]
        self.backwards = [30, 30, 30]

        self.datafile = []
        data_dir = 'PyTorch/data/finance1m/'
        for instrument in self.instruments:
            frame = pd.read_csv(data_dir + instrument.replace('/', '_'),
                                index_col='date')
            # Reuse the configured column list (passed as a copy) instead of
            # repeating the literal, so the two cannot drift apart.
            wanted = wr.get_cols(frame, list(self.colums))
            self.datafile.append(frame[wanted])
예제 #3
0
# Path of the truth table; not read in this section of the script.
file_truth = 'PyTorch/data/linear/t_msd.csv'

backwards = 32
forward = 12

# collect data
file_data = 'all_data_223k_3y_m5.csv'
dat = pd.read_csv(file_data, index_col='date')

# Instruments whose columns are removed before any further processing.
bad_instruments = [
    'FRA40', 'CHN50', 'US2000', 'USOil', 'SOYF', 'WHEATF', 'CORNF', 'EMBasket',
    'JPYBasket', 'BTC/USD', 'BCH/USD', 'ETH/USD', 'LTC/USD', 'XRP/USD',
    'CryptoMajor', 'USEquities'
]
bad_cols = wr.get_cols(dat, bad_instruments)

# Clean up the data and fill the gaps that are left (from market shutdown
# over the weekend): forward first, then backward for any leading gaps.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
dat = dat.drop(bad_cols, axis=1)
dat = dat.ffill().bfill()

good_cols = wr.get_cols(dat, ['bidopen', 'bidhigh', 'bidlow'])
dat = dat[good_cols]

# First difference of every column; the first row is dropped because it has
# no predecessor and is therefore NaN.
dat = dat.diff()[1:]

# Target: rolling sum of the differences over `forward` rows.
tru = dat.rolling(forward).sum()

# Scale both tables by their per-column standard deviation (no mean shift).
dat, tru = dat / dat.std(), tru / tru.std()

#d_mean = dat.mean()
예제 #4
0
    # Fragment of a larger function: `now`, `spacing`, `time`, `pd` and `wr`
    # are defined outside the visible lines.
    # Each entry of file_data is a file name under data_dir, read as CSV below.
    data_dir = 'PyTorch/data/finance1m/'
    file_data = ['EUR_USD', 'GBP_JPY', 'AUD_CAD'
                 ]  #['EUR_USD','EUR_GBP','GBP_USD','EUR_JPY','GBP_JPY']
    # Start from the first file and join the second one on the 'date' index,
    # passing each file name as the suffix for overlapping column names.
    dat = pd.read_csv(data_dir + file_data[0], index_col='date')
    dat = dat.join(pd.read_csv(data_dir + file_data[1], index_col='date'),
                   lsuffix=file_data[0],
                   rsuffix=file_data[1])
    # NOTE(review): this loop starts at index 1, so file_data[1] is read and
    # joined a second time after the explicit join above -- confirm whether
    # range(2, len(file_data)) was intended.
    for i in range(1, len(file_data)):
        dat = dat.join(pd.read_csv(data_dir + file_data[i], index_col='date'),
                       lsuffix='',
                       rsuffix=file_data[i])

    # Progress message with the seconds elapsed since `now`.
    print('dropping data', time.time() - now)
    # Drop unnecessary columns: keep only those wr.get_cols returns for the
    # bid open/high/low names.
    good_cols = wr.get_cols(
        dat, ['bidopen', 'bidhigh', 'bidlow'
              ])  #,'tick'])#['EUR/USD'])  # ['bidopen'])# , 'tick'
    dat = dat[good_cols]

    print('cleaning data', time.time() - now)
    # Clean up the data and fill the gaps that are left (from market shutdown
    # over the weekend): forward fill first, then backward fill.
    # NOTE(review): fillna(method=...) is deprecated in recent pandas releases;
    # DataFrame.ffill() / DataFrame.bfill() are the replacements.
    dat = dat.fillna(method='ffill')
    dat = dat.fillna(method='bfill')

    print('manipulating data', time.time() - now)
    # Difference and manipulate: the row-to-row difference is divided by the
    # current row's value (not the previous one), and the first row is dropped.
    dat = (dat.diff() / dat)[1:]
    # Rolling sum over `spacing` rows, then drop the first `spacing` rows.
    dat = dat.rolling(spacing).sum()[spacing:]
    # Scale every column by its standard deviation (no mean shift).
    dat = dat / dat.std()
    # Independent copy of the scaled table, kept under the name `tru`.
    tru = dat.copy()