def dataprep(data_file, forward, file_data, file_truth):
    """Build normalised input and target CSV files from a raw price CSV.

    Reads ``data_file``, drops the columns of the instruments listed in
    ``bad_instruments``, fills the remaining gaps, keeps the bid
    open/high/low columns, transforms them with ``wr.p_diff`` and writes
    the standardised inputs, the standardised rolling-sum targets and the
    mean/std used for each to disk.

    Args:
        data_file: CSV to read; its ``date`` column becomes the index.
        forward: Window handed to ``DataFrame.rolling`` when the inputs are
            summed into the target.
        file_data: Directory prefix (a ``str``; ``'/<name>.csv'`` is
            appended to it) receiving ``d_msd.csv``, ``d_mean.csv`` and
            ``d_std.csv``.
        file_truth: Directory prefix receiving ``t_msd.csv``,
            ``t_mean.csv`` and ``t_std.csv``.

    Returns:
        None. The results are only written to disk.
    """
    from DataProcess import wrangle as wr

    # Original author's note: this is the full dataset for 72 instruments.
    data = pd.read_csv(data_file, index_col='date')

    # Original author's note: these instruments have incomplete data for
    # the last 3 years, so their columns are removed.
    bad_instruments = [
        'FRA40', 'CHN50', 'US2000', 'USOil', 'SOYF', 'WHEATF', 'CORNF',
        'EMBasket', 'JPYBasket', 'BTC/USD', 'BCH/USD', 'ETH/USD', 'LTC/USD',
        'XRP/USD', 'CryptoMajor', 'USEquities'
    ]
    # NOTE(review): wr.get_cols presumably returns the column labels that
    # match any of the given names - confirm against DataProcess.wrangle.
    bad_cols = wr.get_cols(data, bad_instruments)
    data = data.drop(bad_cols, axis=1)

    # Fill the gaps that are left (original note: market shutdown over the
    # weekend). Forward-fill first, then back-fill whatever is still missing
    # at the start. ffill()/bfill() replace the deprecated
    # fillna(method=...) spelling and behave the same.
    data = data.ffill().bfill()

    good_cols = wr.get_cols(data, ['bidopen', 'bidhigh', 'bidlow'])
    data = data[good_cols]

    # Original note: percentage difference of the data.
    data = wr.p_diff(data)

    # Statistics for mean/std normalisation (original note: used by the
    # generator). They are saved so the scaling can be reproduced later.
    d_mean = data.mean()
    d_std = data.std()

    # The target is the rolling sum of the inputs over `forward` rows; with
    # an integer window the rows before the window is full are NaN.
    truth = data.rolling(forward).sum()
    t_mean = truth.mean()
    t_std = truth.std()

    d_msd = (data - d_mean) / d_std
    t_msd = (truth - t_mean) / t_std

    d_msd.to_csv(file_data + '/d_msd.csv')
    t_msd.to_csv(file_truth + '/t_msd.csv')
    d_mean.to_csv(file_data + '/d_mean.csv')
    d_std.to_csv(file_data + '/d_std.csv')
    t_mean.to_csv(file_truth + '/t_mean.csv')
    t_std.to_csv(file_truth + '/t_std.csv')
def __init__(self):
    """Set the configuration lists and load one DataFrame per instrument.

    For every entry of ``self.instruments`` the file
    ``PyTorch/data/finance1m/<instrument with '/' replaced by '_'>`` is read
    with its ``date`` column as index, reduced to the columns that
    ``wr.get_cols`` returns for ``self.colums``, and appended to
    ``self.datafile`` in instrument order.
    """
    self.instruments = ['EUR/USD']
    # The attribute name keeps its original spelling because code outside
    # this method may refer to it.
    self.colums = ['bidclose', 'bidhigh', 'bidlow', 'tickqty']
    self.spacings = [5, 30, 240]
    self.backwards = [30, 30, 30]

    data_dir = 'PyTorch/data/finance1m/'
    self.datafile = []
    for instrument in self.instruments:
        frame = pd.read_csv(data_dir + instrument.replace('/', '_'),
                            index_col='date')
        # Use self.colums rather than a second literal copy of the same
        # list, so the loaded columns cannot drift from the configuration.
        # NOTE(review): wr.get_cols presumably returns the matching column
        # labels of `frame` - confirm against its definition.
        self.datafile.append(frame[wr.get_cols(frame, self.colums)])
# NOTE(review): this first file_data value is overwritten a few lines below
# before anything reads it; it is kept only so the bindings stay unchanged.
file_data = 'PyTorch/data/linear/d_msd.csv'
file_truth = 'PyTorch/data/linear/t_msd.csv'
backwards = 32
forward = 12

# Collect the raw data; the 'date' column becomes the index.
file_data = 'all_data_223k_3y_m5.csv'
dat = pd.read_csv(file_data, index_col='date')

# Instruments whose columns are removed before any processing.
bad_instruments = [
    'FRA40', 'CHN50', 'US2000', 'USOil', 'SOYF', 'WHEATF', 'CORNF',
    'EMBasket', 'JPYBasket', 'BTC/USD', 'BCH/USD', 'ETH/USD', 'LTC/USD',
    'XRP/USD', 'CryptoMajor', 'USEquities'
]
# NOTE(review): wr.get_cols presumably returns the column labels matching
# any of the given names - confirm against its definition.
bad_cols = wr.get_cols(dat, bad_instruments)
dat = dat.drop(bad_cols, axis=1)

# Fill the gaps that are left (original note: market shutdown over the
# weekend): forward-fill, then back-fill what is still missing at the start.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
dat = dat.ffill().bfill()

# Keep only the bid open/high/low columns.
good_cols = wr.get_cols(dat, ['bidopen', 'bidhigh', 'bidlow'])
dat = dat[good_cols]

# First difference; the leading row is dropped because diff() leaves it NaN.
dat = dat.diff()[1:]

# The target is the rolling sum of the differences over `forward` rows.
tru = dat.rolling(forward).sum()

# Scale inputs and targets by their own per-column standard deviation.
# No mean is subtracted here.
dat, tru = dat / dat.std(), tru / tru.std()
data_dir = 'PyTorch/data/finance1m/'
file_data = ['EUR_USD', 'GBP_JPY', 'AUD_CAD']

# Load every instrument exactly once and suffix its columns with the
# instrument name before joining on the 'date' index (left join, as before).
# The previous code joined file_data[1] explicitly and then again in a loop
# that started at index 1. That left a second, unsuffixed copy of that
# instrument's columns in the frame.
# NOTE(review): the joined frame therefore no longer carries that duplicate
# column group - check any downstream code that depends on the column count.
dat = pd.read_csv(data_dir + file_data[0],
                  index_col='date').add_suffix(file_data[0])
for instrument in file_data[1:]:
    dat = dat.join(
        pd.read_csv(data_dir + instrument,
                    index_col='date').add_suffix(instrument))

print('dropping data', time.time() - now)
# Drop unnecessary columns: keep only the bid open/high/low columns.
# NOTE(review): wr.get_cols presumably returns the column labels matching
# any of the given names - confirm against its definition.
good_cols = wr.get_cols(dat, ['bidopen', 'bidhigh', 'bidlow'])
dat = dat[good_cols]

print('cleaning data', time.time() - now)
# Fill the gaps that are left (original note: market shutdown over the
# weekend): forward-fill, then back-fill what is still missing at the start.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
dat = dat.ffill().bfill()

print('manipulating data', time.time() - now)
# Difference relative to the current value; the first row is dropped because
# diff() leaves it NaN.
dat = (dat.diff() / dat)[1:]
# Sum the relative differences over `spacing` rows and discard the first
# `spacing` rows of the result.
dat = dat.rolling(spacing).sum()[spacing:]
# Scale each column by its standard deviation.
dat = dat / dat.std()
tru = dat.copy()