def test_set_brute_force(
    data_dic, start, time_range_obs=30,
    time_range_test=360, obs_freq=10, prediction_freq=30
):
    '''
    Builds the feature and label DataFrames by stepping forward from
    `start` in steps of `obs_freq` minutes until `start` plus
    `time_range_obs` days (inclusive).

    time_range_obs is in days
    time_range_test is in minutes
    obs_freq is in minutes
    prediction_freq is in minutes

    For every timestamp, get_label is asked for a label using
    prediction_freq; timestamps for which it returns None are skipped.

    Returns a tuple (df_X, df_Y):
      - df_X has the columns listed in COLS, one row per labelled
        timestamp, filled with the output of build_row
      - df_Y has the columns 'time' and 'label'
    '''
    # Time range for the observations
    start_td = data_fetching_utils.time_in_datetime(start)
    end_td = start_td + timedelta(days=time_range_obs)
    end = data_fetching_utils.time_in_string(end_td)
    n_obs = data_fetching_utils.calculate_observations(start, end, obs_freq)

    # Time range for the train X cols
    # NOTE(review): n_train is not used anywhere below in this function;
    # the computation is kept untouched so the calls made stay the same.
    end_obs_td = start_td + timedelta(minutes=time_range_test)
    end_obs = data_fetching_utils.time_in_string(end_obs_td)
    n_train = data_fetching_utils.calculate_observations(
        start, end_obs, obs_freq
    )

    # Initializing dfs
    df_X = pd.DataFrame(columns=COLS)
    df_Y = pd.DataFrame(columns=['time', 'label'])

    # Building up the df
    print('\nStarting to build up the df...')
    current = start_td
    while current <= end_td:
        current_str = data_fetching_utils.time_in_string(current)
        label = get_label(data_dic, current_str, prediction_freq)

        if label is not None:  # only append if label is not None
            i_X = len(df_X)
            df_X.loc[i_X] = build_row(data_dic, current_str)

            i_Y = len(df_Y)
            df_Y.loc[i_Y] = [current_str, label]

            # Progress is reported only right after a row was added, so
            # i_X is always bound here and the same message is not
            # printed again while label-less timestamps are skipped.
            if i_X and i_X % 500 == 0:
                print('Progress: ' + str(round(i_X/n_obs*100)) + '%')

        current = current + timedelta(minutes=obs_freq)

    return df_X, df_Y
def subset_for_testing(data_dic, start, freq=10, time_range=360):
    '''
    Collects the observations stored in the data dict from the start
    time up to start + time_range (forward-looking, both ends included),
    one every freq minutes.

    freq and time_range are defined in minutes

    Returns the list of observations in chronological order, or None as
    soon as one of the expected times is missing from the dict.
    '''
    first = data_fetching_utils.time_in_datetime(start)
    last = first + timedelta(minutes=time_range)

    observations = []
    cursor = first
    while cursor <= last:
        key = data_fetching_utils.time_in_string(cursor)
        if key not in data_dic:
            # a single gap invalidates the whole subset
            return None
        observations.append(data_dic[key])
        cursor += timedelta(minutes=freq)

    return observations
def subset_for_training(data_dic, end, freq=10, time_range=60):
    '''
    Collects the observations stored in the data dict from the end time
    back to end - time_range (backwards, both ends included), one every
    freq minutes.

    freq and time_range are defined in minutes

    When a time is missing from the dict, the most recent earlier
    observation on the same freq grid is used in its place.

    Returns the list of observations, newest first.

    Raises ValueError if freq is not positive, and KeyError if a time is
    missing and the dict holds no earlier observation to fall back on
    (the search used to loop forever in both situations).
    '''
    if freq <= 0:
        # a zero or negative step would never move towards start_td
        raise ValueError('freq must be a positive number of minutes')

    end_td = data_fetching_utils.time_in_datetime(end)
    start_td = end_td - timedelta(minutes=time_range)
    step = timedelta(minutes=freq)

    # Earliest time present in the dict; only worked out if a fallback
    # search is actually needed, since it has to parse every key.
    earliest = None

    data_return = []
    current = end_td
    while current >= start_td:
        current_str = data_fetching_utils.time_in_string(current)

        if current_str in data_dic:
            data_return.append(data_dic[current_str])
        else:
            # we take the data of the most recent previous obs
            if earliest is None:
                if not data_dic:
                    raise KeyError(
                        'No data available to replace ' + current_str
                    )
                earliest = min(
                    data_fetching_utils.time_in_datetime(key)
                    for key in data_dic
                )

            candidate = current - step
            while True:
                if candidate < earliest:
                    # nothing older is stored, so stop searching
                    raise KeyError(
                        'No earlier data available to replace '
                        + current_str
                    )
                candidate_str = data_fetching_utils.time_in_string(candidate)
                if candidate_str in data_dic:
                    data_return.append(data_dic[candidate_str])
                    break
                candidate = candidate - step

        current = current - step

    return data_return
def price_increased(data_dic, time_now, minutes):
    '''
    Compares the closing price stored for time_now with the closing
    price stored for the time `minutes` minutes earlier.

    Returns 1 if the current 'price_close' is strictly higher than the
    earlier one, 0 otherwise.
    '''
    now_td = data_fetching_utils.time_in_datetime(time_now)
    earlier_td = now_td - timedelta(minutes=minutes)
    earlier_key = data_fetching_utils.time_in_string(earlier_td)

    current_close = data_dic[time_now]['price_close']
    earlier_close = data_dic[earlier_key]['price_close']

    return 1 if current_close > earlier_close else 0
def latest_time(data_dic):
    '''
    Evaluates every key of the input data dictionary and returns the
    latest time, formatted back into a string with
    data_fetching_utils.time_in_string.

    Every key is parsed with data_fetching_utils.time_in_datetime and
    the parsed values are compared with each other.

    For an empty dictionary there is no latest time; None is handed to
    time_in_string in that case, exactly as before, so the outcome then
    depends on that helper.
    '''
    latest = max(
        (data_fetching_utils.time_in_datetime(key) for key in data_dic),
        default=None,
    )
    return data_fetching_utils.time_in_string(latest)
def get_label(data_dic, time, time_range=30):
    '''
    Produces the binary label for the observation at `time`: whether the
    closing price is higher time_range minutes later.

    time_range is in minutes

    Returns 1 if the later 'price_close' is strictly higher than the one
    at `time`, 0 if it is not, and None when either of the two times is
    missing from the data dict.
    '''
    later_td = (
        data_fetching_utils.time_in_datetime(time)
        + timedelta(minutes=time_range)
    )
    later = data_fetching_utils.time_in_string(later_td)

    # both ends of the window are needed to compare prices
    if time not in data_dic or later not in data_dic:
        return None

    price_at_start = data_dic[time]['price_close']
    price_at_end = data_dic[later]['price_close']
    return 1 if price_at_end > price_at_start else 0
import pickle
import sys
from datetime import datetime

# The helper modules are imported from ../utils, so that directory has to
# be on the import path before the two imports below run.
sys.path.insert(1, '../utils')
import data_fetching_utils as dfu
import feature_engineering_utils as feu

# Load the data dictionary stored so far.
# NOTE(review): unpickling can execute code embedded in the file, so this
# path should only ever point at a file produced by this project.
data_dic_path = '../../data/working/total_data.txt'
with open(data_dic_path, 'rb') as f:
    data_dic = pickle.load(f)

# Fetch window: from the latest time already stored up to the current time.
now = datetime.now()
end = dfu.time_in_string(now)
start = feu.latest_time(data_dic)

freq = 10  # in minutes
crypto = 'BTC'
period = str(freq) + 'MIN'

latest_data = dfu.get_data(
    crypto=crypto,
    period=period,
    start=start,
    end=end,
)

# The output name carries the asset, the frequency and the first ten
# characters of the start and end time strings.
file = (
    '../../data/raw/data_' + crypto + '_' + str(freq)
    + 'min_' + start[:10] + '_' + end[:10] + '.txt'
)

print('\nSaving data...')
with open(file, 'wb') as f:
    pickle.dump(latest_data, f)
print('\nData saved in', file)