def __load_dir(dir_path):
    """
    Load all the data in `dir_path`, filling in an empty entry for each date on which
    no transaction happened.
    :return data: (list) e.g. [            # transactions spanning many days
        ['bond_a', 'bond_b', ...],         # transactions that happened on one day
        ['bond_a', 'bond_b', ...],
        ...
    ]
    """
    data = []

    # load the sorted list of file names, e.g. "doc_2015-01-02.json"
    date_list = os.listdir(dir_path)
    date_list.sort()

    # build a dict of file names so we can quickly check whether any transaction happened on a date
    date_dict = utils.list_2_dict(date_list)

    # find the start and end dates of all the transactions
    start_date = date_list[0][len('doc_'):-len('.json')]
    end_date = date_list[-1][len('doc_'):-len('.json')]

    # convert the dates to timestamps; add one day (86400 s) so that the end date itself is covered
    cur_timestamp = utils.date_2_timestamp(start_date)
    end_timestamp = utils.date_2_timestamp(end_date) + 86400

    # traverse every date between the start and end date, skipping holidays
    while cur_timestamp < end_timestamp:
        _date = utils.timestamp_2_date(cur_timestamp)
        file_name = f'doc_{_date}.json'

        # check whether any transaction happened on this date
        if file_name in date_dict:
            file_path = os.path.join(dir_path, file_name)

            # drop non-string values (e.g. nan) and empty strings from the doc
            tmp_doc = [x for x in utils.load_json(file_path) if isinstance(x, str) and x != '']
            data.append(tmp_doc)

        # if it is a holiday, skip it
        elif date.is_holiday(_date):
            pass

        # no transaction happened on this date
        else:
            data.append([])

        # move to the next day
        cur_timestamp += 86400

    return data
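# A minimal usage sketch for __load_dir (hypothetical paths; assumes the directory
# contains files named "doc_<date>.json", each holding a JSON list of bond-id strings):
#
#   docs = __load_dir(os.path.join(path.PREDICTION_DATE_DIR, 'bonds_by_dealer', '2015', 'dealer_1'))
#   print(len(docs))      # number of non-holiday days in the covered range
#   print(docs[0][:5])    # bond ids traded on the first day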
def __generate_date_structure(len_bonds, start_date='2015-01-02', end_date='2015-12-31',
                              with_day_off=True, buy_sell_plan=2):
    """
    :return:
        date_matrix: np.array, shape (date_num, len_bonds + 3) or (date_num, len_bonds * 2 + 3);
            every row holds the full token index sequence, so a mask for the input is needed.
            "+ 3" is because the last 3 tokens are <start>, <end> and <day-off>;
            <pad> uses all zeros and thus needs no place in the vocabulary.
        date_mask: np.array with the same shape as date_matrix, initialized to zeros except
            for the <day-off> position on day-off rows; the mask is updated while traversing
            the transaction history.
        dict_date_2_input_m_index: dict that maps a date to its row index in date_matrix.
    """
    dict_date_2_input_m_index = {}

    # convert the boundary dates to timestamps
    start_timestamp = utils.date_2_timestamp(start_date)
    end_timestamp = utils.date_2_timestamp(end_date, True)

    # temporary variables; l records a 1 for each trading day and a 0 for each day off
    l = []
    cur_timestamp = start_timestamp

    # generate dict_date_2_input_m_index and count the dates
    while cur_timestamp <= end_timestamp:
        _date = utils.timestamp_2_date(cur_timestamp)
        cur_timestamp += 86400

        if date.is_holiday(_date):
            if with_day_off:
                l.append(0)
            continue

        dict_date_2_input_m_index[_date] = len(l)
        l.append(1)

    # calculate the shape
    date_num = len(l)
    extra_token_num = 3 if with_day_off else 2
    len_tokens = len_bonds * 2 + extra_token_num if buy_sell_plan == 2 else len_bonds + extra_token_num

    # each row of date_matrix is the token index sequence 0 .. len_tokens - 1;
    # date_mask starts as all zeros
    date_matrix = np.ones((date_num, 1)) * np.arange(len_tokens)
    date_mask = np.zeros((date_num, len_tokens))

    # set the <day-off> position (the last token) to 1 on day-off rows
    date_mask[np.where(np.array(l) == 0)[0], -1] = 1

    return date_matrix, date_mask, dict_date_2_input_m_index
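# A quick shape check for __generate_date_structure (illustrative; the exact
# date_num depends on date.is_holiday):
#
#   m, mask, d2i = __generate_date_structure(len_bonds=100, buy_sell_plan=2)
#   print(m.shape, mask.shape)    # e.g. (364, 203): 100 * 2 bond tokens + <start>/<end>/<day-off>
#   print(d2i['2015-01-02'])      # 0, the row index of the first trading day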
for i, v in enumerate(data):
    if i % 20 == 0:
        progress = float(i) / length * 100.
        print('\rprogress: %.2f%% ' % progress, end='')

    bond_id = v['bond_id']
    offering_date = v['offering_date']
    report_dealer_index = str(v['report_dealer_index'])
    contra_party_index = str(v['contra_party_index'])
    date = v['date']
    volume = v['volume']

    # only keep bonds whose offering date falls in the 2000s
    if str(offering_date)[0] != '2':
        continue

    # skip bonds offered on or after the boundary date
    offering_timestamp = utils.date_2_timestamp(str(offering_date).split(' ')[0])
    if offering_timestamp >= bound_timestamp:
        continue

    # skip transactions in which neither party is among the selected dealers
    if report_dealer_index not in dict_first_num_dealers and contra_party_index not in dict_first_num_dealers:
        continue

    # check whether this bond already appears in any dealer's no_below dictionary
    in_bond = False
    for _dealer in tmp_dealers:
        no_below_0_val = _dealer[3]
        no_below_dict_0 = no_below_0_val[-1]
        if no_below_dict_0.doc2idx([bond_id])[0] != -1:
            in_bond = True
            break

    if in_bond:
tmp_data = utils.load_json(os.path.join(dir_path, file_name))

# group the transaction dates by bond id
d_bonds = {}
for i, v in enumerate(tmp_data):
    bond_id = v[0]
    if bond_id not in d_bonds:
        d_bonds[bond_id] = []
    d_bonds[bond_id].append(v[2])

l_bonds = []
for bond_id, date_list in d_bonds.items():
    date_list.sort()

    # strip the time part (the last 9 characters, e.g. " 00:00:00") to keep only the date
    first_date = date_list[0][:-9]
    last_date = date_list[-1][:-9]

    first_date_timestamp = utils.date_2_timestamp(first_date)
    last_date_timestamp = utils.date_2_timestamp(last_date)

    # number of days between the first and the last transaction, inclusive
    interval = int((last_date_timestamp - first_date_timestamp) / 86400)
    if interval > 0:
        interval += 1

    # number of distinct dates on which the bond was traded
    l_dates = list(set(map(lambda x: x[:-9], date_list)))
    count = len(l_dates)
    if interval == 0:
        count = 0

    count_divide_interval = min(float(count) / interval, 1.0) if interval > 0 else 0
    l_bonds.append([bond_id, interval, count_divide_interval, count])
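# A small worked example of the statistics above (hypothetical dates): for
# date_list = ['2015-03-02 00:00:00', '2015-03-02 00:00:00', '2015-03-06 00:00:00'],
# first_date = '2015-03-02' and last_date = '2015-03-06', so interval = 4 + 1 = 5 days,
# count = 2 distinct trading dates, and count_divide_interval = min(2 / 5, 1.0) = 0.4.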
def __init__(self, input_days=20, force_refresh=False):
    self.__input_days = input_days
    self.__output_days = input_days // 10
    self.__input_mode = 0

    # assemble the path of the cached data
    data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
    subset = os.path.split(data_subset)[1]
    year = load.YEAR_FOR_TEMPORAL_INPUT
    volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
    no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
    data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT

    data_all_root_dir = os.path.join(data_subset, year, volume_level)
    all_level = os.path.split(data_all_root_dir)[1]

    self.__data_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'temporal_input_interval_output_for_pretrain_same_volume_{all_level}_{subset}_{year}_'
        f'{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl'
    )

    # use the cache if it exists and no refresh is forced
    if os.path.isfile(self.__data_pkl_path) and not force_refresh:
        self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
            utils.load_pkl(self.__data_pkl_path)

    else:
        print(f'\nStart loading data from {data_all_root_dir} ...')

        train_start_timestamp = utils.date_2_timestamp('2015-01-02')
        train_end_timestamp = utils.date_2_timestamp('2015-10-14', True)
        test_start_timestamp = utils.date_2_timestamp('2015-10-14')
        test_end_timestamp = utils.date_2_timestamp('2015-12-31', True)

        data_all_pkl_path = os.path.join(path.PATH_TMP_DIR,
                                         f'all_doc_list_for_pretrain_{subset}_{year}_{all_level}.pkl')
        if os.path.isfile(data_all_pkl_path):
            train_all_doc_list, test_all_doc_list = utils.load_pkl(data_all_pkl_path)
        else:
            train_all_doc_list = self.__load_dir_all(data_all_root_dir, train_start_timestamp,
                                                     train_end_timestamp, 'train')
            test_all_doc_list = self.__load_dir_all(data_all_root_dir, test_start_timestamp,
                                                    test_end_timestamp, 'test')

            # train_all_doc_list = []
            # test_all_doc_list = []
            # for _volume in os.listdir(data_all_root_dir):
            #     sub_all_root_dir = os.path.join(data_all_root_dir, _volume)
            #     sub_train_all_doc_list = self.__load_dir_all(sub_all_root_dir, train_start_timestamp,
            #                                                  train_end_timestamp, 'train')
            #     sub_test_all_doc_list = self.__load_dir_all(sub_all_root_dir, test_start_timestamp,
            #                                                 test_end_timestamp, 'test')
            #
            #     train_all_doc_list += sub_train_all_doc_list
            #     test_all_doc_list += sub_test_all_doc_list

            utils.write_pkl(data_all_pkl_path, [train_all_doc_list, test_all_doc_list])

        print('Finish loading \n\nStart processing data ... ')

        # flatten the per-directory doc lists
        train_all_docs = []
        for v in train_all_doc_list:
            train_all_docs += v

        test_all_docs = []
        for v in test_all_doc_list:
            test_all_docs += v

        del train_all_doc_list
        del test_all_doc_list

        # generate the dictionary which maps bond ids to indices;
        # note the no_below threshold is hard-coded to 150 here, not the configured no_below
        self.dict, self.voc_size = self.__gen_dict(train_all_docs, 150)
        # self.dict, self.voc_size = self.__gen_dict(train_all_doc_list, no_below)
        print(self.voc_size)
        # print(self.voc_size_all)

        print('Finish generating dict\n\nStart converting input output ...')

        # convert the doc lists to trainable interval-summed one-hot vectors
        self.__train_X = self.__convert_input(train_all_docs, self.dict, self.voc_size, 'allow_unknown')
        self.__train_y = self.__convert_output(train_all_docs)
        self.__test_X = self.__convert_input(test_all_docs, self.dict, self.voc_size, 'allow_unknown')
        self.__test_y = self.__convert_output(test_all_docs)

        # self.__train_X = 0.
        # self.__test_X = 0.
        # self.__train_y = 0.
        # self.__test_y = 0.
        # for doc_list in train_all_doc_list:
        #     self.__train_X += self.__convert_input(doc_list, self.dict, self.voc_size, 'allow_unknown')
        #     self.__train_y += self.__convert_output(doc_list)
        # self.__train_X /= len(train_all_doc_list)
        #
        # for doc_list in test_all_doc_list:
        #     self.__test_X += self.__convert_input(doc_list, self.dict, self.voc_size, 'allow_unknown')
        #     self.__test_y += self.__convert_output(doc_list)
        # self.__test_X /= len(test_all_doc_list)

        # self.__train_all_X = np.array(
        #     list(map(lambda x: self.__convert_input(x, self.dict_all, self.voc_size_all, 'allow_unknown'),
        #              train_all_doc_list)))
        # self.__train_all_y = np.array(list(map(self.__convert_output, train_all_doc_list)))
        # self.__test_all_X = np.array(
        #     list(map(lambda x: self.__convert_input(x, self.dict_all, self.voc_size_all, 'allow_unknown'),
        #              test_all_doc_list)))
        # self.__test_all_y = np.array(list(map(self.__convert_output, test_all_doc_list)))
        #
        # self.__train_all_X = np.mean(self.__train_all_X, axis=0)
        # self.__train_all_y = np.mean(self.__train_all_y, axis=0)
        # self.__test_all_X = np.mean(self.__test_all_X, axis=0)
        # self.__test_all_y = np.mean(self.__test_all_y, axis=0)

        # # convert doc list to trainable interval-summed one-hot vector
        # self.__train_X = self.__convert_input(train_doc_list, self.dict, self.voc_size, 'allow_unknown')
        # self.__train_y = self.__convert_output(train_doc_list)
        # self.__test_X = self.__convert_input(test_doc_list, self.dict, self.voc_size, 'allow_unknown')
        # self.__test_y = self.__convert_output(test_doc_list)

        # self.__train_X = np.vstack([self.__train_X.transpose([2, 0, 1]), self.__train_all_X.transpose([2, 0, 1])])
        # self.__test_X = np.vstack([self.__test_X.transpose([2, 0, 1]), self.__test_all_X.transpose([2, 0, 1])])
        # self.__train_X = self.__train_X.transpose([1, 2, 0])
        # self.__test_X = self.__test_X.transpose([1, 2, 0])

        print('Finish processing ')

        # cache data for faster processing next time
        utils.write_pkl(self.__data_pkl_path, [
            self.__train_X, self.__train_y, self.__test_X, self.__test_y,
            self.dict, self.voc_size
        ])

    self.__gen_topics_mask()
    self.__statistic()
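# A hypothetical usage sketch for this loader (the enclosing class name is not shown
# in this excerpt, so `Loader` below is a placeholder):
#
#   loader = Loader(input_days=20)                      # reuses the cached pkl if present
#   loader = Loader(input_days=20, force_refresh=True)  # rebuilds and overwrites the cache
#   print(loader.voc_size)                              # vocabulary size from __gen_dict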
force_no_direction = True
dir_path = os.path.join(path.PREDICTION_DATE_DIR, dir_name, '2015', f'dealer_{dealer_index}')

name = dir_name.replace('bonds_by_dealer_', '').replace('bonds_by_dealer', '')
name = name_map[name]

save_dir = os.path.join(path.ROOT_DIR, 'runtime', 'test')
if not os.path.isdir(save_dir):
    os.mkdir(save_dir)

# load the map from bond id to offering date
path_dict_bond_id_offering_date = os.path.join(path.DATA_ROOT_DIR, 'dict_bond_id_offering_date.json')
dict_bond_id_offering_date = utils.load_json(path_dict_bond_id_offering_date)

# collect the bonds that should be filtered out: bonds with malformed offering dates
# and bonds offered on or after the boundary date
remove_bond_list = []
bound_date = '2014-06-01'
bound_timestamp = utils.date_2_timestamp(bound_date)

for bond_id, offering_date in dict_bond_id_offering_date.items():
    if offering_date[:2] == '19':
        continue
    elif offering_date[:1] != '2':
        remove_bond_list.append(bond_id)
        continue

    # some offering dates use '00' as a placeholder day; map it to the 1st
    offering_date = offering_date.replace('-00', '-01')
    offering_timestamp = utils.date_2_timestamp(offering_date)
    if offering_timestamp >= bound_timestamp:
        remove_bond_list.append(bond_id)

data = __load_dir(dir_path)
dates, voc_size, original_bond_size = __process(data, remove_bond_list, dir_name, no_below, force_no_direction)
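# How the offering-date filter above behaves on a few hypothetical entries:
#   '1998-05-12'  -> kept (1900s dates are skipped by the first branch)
#   'n/a'         -> removed (does not start with '2')
#   '2013-04-00'  -> normalized to '2013-04-01', before 2014-06-01 -> kept
#   '2014-08-15'  -> on/after 2014-06-01 -> removed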
    'offering_date': x[15],
    'report_dealer_index': int(x[10]),
    'contra_party_index': int(x[11]),
    'date': x[9],
    'volume': float(x[3]),
}, data))

print('finish converting\n\nstart traversing data ...')

d_new_bonds = {}
d_dealers = {}
total_volume = 0.
total_transaction_count = len(data)

bound_timestamp = utils.date_2_timestamp('2014-06-01')

length = len(data)
for i, v in enumerate(data):
    if i % 20 == 0:
        progress = float(i) / length * 100.
        print('\rprogress: %.2f%% ' % progress, end='')

    bond_id = v['bond_id']
    offering_date = v['offering_date']
    report_dealer_index = str(v['report_dealer_index'])
    contra_party_index = str(v['contra_party_index'])
    date = v['date']
    volume = v['volume']

    # only keep bonds whose offering date falls in the 2000s
    if str(offering_date)[0] != '2':