def gen_args(bucket=None, gcs_dir=None):
    args = [
        "--output-dir=./output_test",
        "--model-dir=./output_test/checkpoint",
        "--shuffle-batch=False",
    ]
    if bucket is not None:
        args.append("--bucket=" + bucket)
    if gcs_dir is not None:
        args.append("--gcs-dir=" + gcs_dir)
    return get_args(args)
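# The get_args() helper that gen_args wraps is not shown in this excerpt. The sketch below
# is a hypothetical, minimal argparse version covering only the flags gen_args forwards;
# flag names, defaults, and the argv-list signature are assumptions, not the project's
# actual definitions.
import argparse

def get_args(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", default="./output")
    parser.add_argument("--model-dir", default="./output/checkpoint")
    parser.add_argument("--shuffle-batch", default="True")
    parser.add_argument("--bucket", default=None)
    parser.add_argument("--gcs-dir", default=None)
    # argv=None falls back to sys.argv[1:], so the same parser works for CLI use
    return parser.parse_args(argv)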
def get_historical_agg_trades(base, quote):
    args = get_args()
    with_parquet = args.parquet
    symbol = base + quote
    start_at = args.start_at
    limit = args.limit
    params = {'symbol': symbol, 'fromId': start_at, 'limit': limit}
    return all_trade_to_csv(base=base,
                            quote=quote,
                            params=params,
                            with_parquet=with_parquet)
def get_historical_candlesticks(base, quote):
    args = get_args()
    with_parquet = args.parquet
    symbol = base + quote
    interval = args.interval
    start_at = args.start_at
    limit = args.limit
    params = {
        'symbol': symbol,
        'interval': interval,
        'startTime': start_at,
        'limit': limit
    }
    return all_candle_to_csv(base=base,
                             quote=quote,
                             params=params,
                             interval=interval,
                             with_parquet=with_parquet)
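# get_args() for this data-collection script is not shown either. Below is a hypothetical
# argparse sketch reconstructed from the attributes accessed throughout the script
# (args.parquet, args.upload, args.interval, args.dtype, args.pairs, args.start_at,
# args.limit, args.timeout, args.check_trade); flag names and defaults are assumptions.
import argparse

def get_args():
    parser = argparse.ArgumentParser(description='Download Binance trade/candle history.')
    parser.add_argument('--pairs', default='all', help="'all' or comma-separated BASE-QUOTE pairs")
    parser.add_argument('--dtype', default='candle', choices=['candle', 'trade'])
    parser.add_argument('--interval', default='1m')
    parser.add_argument('--start_at', type=int, default=0)
    parser.add_argument('--limit', type=int, default=1000)
    parser.add_argument('--timeout', type=int, default=30)
    parser.add_argument('--parquet', action='store_true')
    parser.add_argument('--upload', action='store_true')
    parser.add_argument('--check_trade', default=None)
    return parser.parse_args()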
def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    attack_param = {
        "ord": np.inf,
        "epsilon": 8. / 255.,
        "alpha": 2. / 255.,
        "num_iter": 20,
        "restart": 1
    }

    args = get_args()
    logger = metaLogger(args)
    logging.basicConfig(filename=args.j_dir + "/log/log.txt",
                        format='%(asctime)s %(message)s',
                        level=logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    seed_everything(args.seed)
    train_loader, test_loader = load_dataset(args.dataset, args.batch_size)
    model = get_model(args, device)
    opt, lr_scheduler = get_optim(model, args)

    # resume from the rotating checkpoint if one exists
    ckpt_epoch = 0
    ckpt_dir = args.j_dir + "/" + str(args.j_id) + "/"
    ckpt_location = os.path.join(ckpt_dir, "custome_ckpt_" + logger.ckpt_status + ".pth")
    if os.path.exists(ckpt_location):
        ckpt = torch.load(ckpt_location)
        model.load_state_dict(ckpt["state_dict"])
        opt.load_state_dict(ckpt["optimizer"])
        ckpt_epoch = ckpt["epoch"]
        if lr_scheduler:
            lr_scheduler.load_state_dict(ckpt["lr_scheduler"])
        print("LOADED CHECKPOINT")

    for _epoch in range(ckpt_epoch, args.epoch):
        train_log = train(args, _epoch, logger, train_loader, model, opt, device)
        test_log = test_clean(test_loader, model, device)
        adv_log = test_adv(test_loader, model, pgd_rand, attack_param, device)

        logger.add_scalar("pgd20/acc", adv_log[0], _epoch + 1)
        logger.add_scalar("pgd20/loss", adv_log[1], _epoch + 1)
        logger.add_scalar("test/acc", test_log[0], _epoch + 1)
        logger.add_scalar("test/loss", test_log[1], _epoch + 1)
        logging.info("Test set: Loss: {loss:.6f}\t"
                     "Accuracy: {acc:.2f}".format(loss=test_log[1], acc=test_log[0]))
        logging.info("PGD20: Loss: {loss:.6f}\t"
                     "Accuracy: {acc:.2f}".format(loss=adv_log[1], acc=adv_log[0]))

        if lr_scheduler:
            lr_scheduler.step()

        if (_epoch + 1) % args.ckpt_freq == 0:
            rotateCheckpoint(ckpt_dir, "custome_ckpt", model, opt, _epoch, lr_scheduler)

        logger.save_log()

    logger.close()
    torch.save(model.state_dict(), args.j_dir + "/model/model.pt")
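# rotateCheckpoint() is not defined in this excerpt. The sketch below is a hypothetical,
# minimal version inferred from the resume logic above, which expects a dict with the keys
# "state_dict", "optimizer", "epoch" and "lr_scheduler" saved under f"{prefix}_<suffix>.pth".
# The two-slot rotation rule and file naming are assumptions (the real code appears to
# track the active slot via logger.ckpt_status).
import os
import torch

def rotateCheckpoint(ckpt_dir, prefix, model, opt, epoch, lr_scheduler=None):
    ckpt = {
        "state_dict": model.state_dict(),
        "optimizer": opt.state_dict(),
        "epoch": epoch + 1,  # epoch to resume from
        "lr_scheduler": lr_scheduler.state_dict() if lr_scheduler else None,
    }
    # alternate between two slots so an interrupted save never clobbers the last good copy
    suffix = str(epoch % 2)
    torch.save(ckpt, os.path.join(ckpt_dir, f"{prefix}_{suffix}.pth"))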
def main():
    """Main loop; loop over all currency pairs that exist on the exchange.
    """
    args = get_args()
    print(args)

    with_parquet = args.parquet
    upload_parquet = args.upload
    interval = args.interval
    data_type = args.dtype
    pairs = "".join(args.pairs.split())  # remove whitespace

    if pairs == 'all':
        # get all pairs currently available
        all_symbols = pd.DataFrame(
            requests.get(f'{API_BASE}exchangeInfo').json()['symbols'])
        all_pairs = [
            tuple(x)
            for x in all_symbols[['baseAsset', 'quoteAsset']].to_records(index=False)
        ]
    else:
        all_pairs = [tuple(pair.split('-')) for pair in pairs.split(',')]

    #all_pairs = [('BTC', 'USDT')]
    #all_pairs = [('DF', 'ETH')]

    # randomising order helps during testing and doesn't make any difference in production
    random.shuffle(all_pairs)

    # make sure data folders exist
    os.makedirs('data', exist_ok=True)
    os.makedirs('compressed', exist_ok=True)

    # do a full update on all pairs
    n_count = len(all_pairs)
    for n, pair in enumerate(all_pairs, 1):
        base, quote = pair

        # default params for klines
        symbol = base + quote

        if data_type == 'candle':
            new_lines = get_historical_candlesticks(base, quote)
        elif data_type == 'trade':
            new_lines = get_historical_agg_trades(base, quote)

        if new_lines > 0:
            print(
                f'{datetime.now()} {n}/{n_count} Wrote {new_lines} new lines to file for {data_type}_{base}-{quote}_interval-{interval}'
            )
        else:
            print(
                f'{datetime.now()} {n}/{n_count} Already up to date with {data_type}_{base}-{quote}_interval-{interval}'
            )

    # clean the data folder and upload a new version of the dataset to kaggle
    try:
        os.remove('compressed/.DS_Store')
    except FileNotFoundError:
        pass

    if with_parquet and upload_parquet:
        write_metadata(n_count)
        yesterday = date.today() - timedelta(days=1)
        subprocess.run([
            'kaggle', 'datasets', 'version', '-p', 'compressed/', '-m',
            f'full update of all {n_count} pairs up to {str(yesterday)}'
        ])
        os.remove('compressed/dataset-metadata.json')
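# write_metadata() is not shown in this excerpt. A minimal hypothetical sketch, assuming it
# records the pair count in the module-level METADATA dict (which all_trade_to_csv appends
# per-file entries to) and dumps it as the dataset-metadata.json that the
# `kaggle datasets version` call above expects; the 'subtitle' field and exact layout are
# assumptions.
import json

def write_metadata(n_count):
    METADATA['subtitle'] = f'Full history for all {n_count} cryptocurrency pairs'  # hypothetical field
    with open('compressed/dataset-metadata.json', 'w') as file:
        json.dump(METADATA, file, indent=4)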
def all_trade_to_csv(base, quote, params=None, with_parquet=False):
    """Collect a list of aggregate trade batches with all trades of a trading pair,
    concat into a dataframe and write it to CSV.
    """
    args = get_args()
    filepath = f'data/trade_{base}-{quote}.csv'
    api_path = 'aggTrades'

    # see if there is any data saved on disk already
    try:
        if params['fromId'] == 0:
            batches = [pd.read_csv(filepath)]
            last_id = batches[-1]['a'].max()
            params['fromId'] = last_id + 1
        else:
            last_id = params['fromId']
            params['fromId'] = last_id + 1
            batches = [pd.DataFrame([])]  # clear
            # if we already have data, start from last_id
    except FileNotFoundError:
        batches = [pd.DataFrame([])]
        last_id = params['fromId']

    old_lines = len(batches[-1].index)

    # gather all trades available, starting from the last id loaded from disk or the provided fromId
    # stop if the id that comes back from the api is the same as the last one
    previous_id = -1
    while previous_id != last_id:
        # stop if we have caught up with the existing data
        if previous_id >= last_id and previous_id > 0:
            break

        previous_id = last_id

        new_batch = get_batch(params=params, api_path=api_path, timeout=args.timeout)

        # requesting trades from the future returns empty
        # also stop in case response code was not 200
        if new_batch.empty:
            break

        last_id = new_batch['a'].max()
        print(last_id, previous_id)
        timestamp = new_batch['T'].max()

        # update fromId to continue from the last id
        params['fromId'] = last_id + 1

        batches.append(new_batch)
        last_datetime = datetime.fromtimestamp(timestamp / 1000)

        covering_spaces = 20 * ' '
        print(datetime.now(), base, quote, str(last_datetime) + covering_spaces,
              end='\r', flush=True)

        # if the data gets huge, flush the collected batches to disk periodically
        # compute size @TODO get field not hardcoded
        lines = len(batches) * params['limit']
        if lines >= 5000:
            df = pp.prepare_df(batches, field='a')
            pp.append_to_csv(df, filepath)
            # reset
            batches.clear()

    if len(batches) > 1:
        df = pp.prepare_df(batches, field='a')

        if with_parquet:
            # write clean version of csv to parquet
            parquet_name = f'{base}-{quote}.parquet'
            full_path = f'compressed/{parquet_name}'
            pp.write_raw_to_parquet(df, full_path)
            METADATA['data'].append({
                'description':
                f'All aggregate trade history for the pair {base} and {quote}. Counts {df.index.size} records.',
                'name': parquet_name,
                'totalBytes': os.stat(full_path).st_size,
                'columns': []
            })

    # in the case that new data was gathered write it to disk
    if len(batches) > 1:
        pp.append_to_csv(df, filepath)
        #df.to_csv(filepath, index=False)
        return len(df.index) - old_lines

    return 0
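# get_batch() is shared with the candlestick path but not shown here. A minimal hypothetical
# sketch, assuming it issues a GET against f'{API_BASE}{api_path}' (API_BASE being the
# module-level constant used in main()) with the given params and returns the rows as a
# DataFrame, or an empty DataFrame on a non-200 response or timeout, which the
# `if new_batch.empty: break` check above relies on. Retry behaviour and column handling
# are assumptions.
from datetime import datetime

import pandas as pd
import requests

def get_batch(params, api_path='klines', timeout=30):
    try:
        response = requests.get(f'{API_BASE}{api_path}', params=params, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f'{datetime.now()} request failed: {e}')
        return pd.DataFrame([])

    if response.status_code == 200:
        return pd.DataFrame(response.json())

    print(f'{datetime.now()} got non-200 response: {response.status_code}')
    return pd.DataFrame([])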
if __name__ == '__main__':
    args = get_args()
    if args.check_trade is not None:
        pp.check_trade_index(args.check_trade)
    else:
        main()
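# Example invocations (hypothetical: the script name and flag spellings are inferred from
# the attributes accessed on args above; check the script's own get_args() for the real ones):
#
#   python download.py --pairs all --dtype candle --interval 1m --parquet --upload
#   python download.py --pairs BTC-USDT,ETH-USDT --dtype trade
#   python download.py --check_trade BTC-USDT   # argument form for check_trade is a guess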