def __init__(self, input_days=20):
    self.__input_days = input_days
    self.__output_days = input_days // 10

    # get the path of the cache data
    data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
    subset = os.path.split(data_subset)[1]
    year = load.YEAR_FOR_TEMPORAL_INPUT
    volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
    no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
    data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT
    self.__data_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'temporal_input_interval_output_emb_{subset}_{year}_{volume_level}_{data_index}'
        f'_no_below_{no_below}_input_days_{input_days}.pkl')
    self.__test_emb_pkl_path = os.path.join(path.PATH_TMP_DIR, 'emb_test_data_w_3_no_below_1000.pkl')

    if os.path.isfile(self.__data_pkl_path):
        self.__train_X, self.__train_y, self.__test_X, self.__test_y, \
            self.emb_dict, self.emb_voc_size, self.dict, self.voc_size = \
            utils.load_pkl(self.__data_pkl_path)
    else:
        print('\nStart loading embedding model ... ')
        self.__load_emb_model()
        print('Finish loading embedding model ')

        _, _, self.emb_dict, self.emb_voc_size = utils.load_pkl(self.__test_emb_pkl_path)

        data_root_dir = os.path.join(data_subset, year, volume_level, data_index)
        print(f'\nStart loading data from {data_root_dir} ...')

        # load doc list
        train_doc_list = self.__load_dir(os.path.join(data_root_dir, 'train'))
        test_doc_list = self.__load_dir(os.path.join(data_root_dir, 'test'))
        print('Finish loading \n\nStart generating dict for output ... ')

        # generate the dictionary which maps the bond_id to index
        self.dict, self.voc_size = self.__gen_dict(train_doc_list)
        print('Finish generating\n\nStart converting data ...')

        # convert doc list to trainable interval summed one-hot vectors
        self.__train_X = self.__convert_input(train_doc_list)
        self.__train_y = self.__convert_output(train_doc_list)
        self.__test_X = self.__convert_input(test_doc_list)
        self.__test_y = self.__convert_output(test_doc_list)
        print('Finish processing ')

        # cache data for faster processing next time
        utils.write_pkl(self.__data_pkl_path, [
            self.__train_X, self.__train_y, self.__test_X, self.__test_y,
            self.emb_dict, self.emb_voc_size, self.dict, self.voc_size
        ])

    self.__statistic()
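# ---------------------------------------------------------------------------
# Note: the loaders in this section lean on utils.load_pkl / utils.write_pkl to
# cache preprocessed data. Those helpers are not shown here; the following is a
# minimal sketch of what they typically look like (an assumption, not the
# project's actual implementation).
import os
import pickle


def load_pkl(file_path):
    """ deserialize a Python object from a pickle file """
    with open(file_path, 'rb') as f:
        return pickle.load(f)


def write_pkl(file_path, obj):
    """ serialize a Python object to a pickle file, creating parent dirs if needed """
    dir_name = os.path.dirname(file_path)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(obj, f)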
def __load(self):
    data_queue = []
    max_queue_size = min(self.size(), self.queue_size)
    max_buffer_size = min(self.size(), self.buffer_size)

    while self.__running:
        while len(data_queue) < max_queue_size:
            file_path = self.__file_list[self.__cur_index]
            self.__cur_index = (self.__cur_index + 1) % self.__len_files

            batch_src, batch_tar = load_pkl(file_path)

            # preprocess data
            batch_x, batch_y, batch_lan_x, batch_lan_y, batch_pos_y = utils.pipeline(
                self.__encoder_pl, batch_src, batch_tar,
                {**self.__data_params, 'tokenizer': self.__tokenizer},
                verbose=False
            )

            data_queue += list(zip(batch_x, batch_y, batch_lan_x, batch_lan_y, batch_pos_y))

        if len(self.__data) < max_buffer_size:
            # note: reseeding with a constant before every shuffle makes each
            # same-length refill use the identical permutation
            random.seed(42)
            random.shuffle(data_queue)
            self.__data += data_queue
            data_queue = []

        time.sleep(0.1)

    print('Stop thread for loading data ')
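# ---------------------------------------------------------------------------
# A self-contained sketch of the producer/consumer pattern that __load
# implements: a daemon thread keeps a shared buffer topped up while the main
# thread consumes from it. The names (DemoLoader, fetch) are hypothetical and
# not part of the original codebase.
import threading
import time


class DemoLoader:
    def __init__(self, items, buffer_size=8):
        self._items = items
        self._buffer = []
        self._buffer_size = buffer_size
        self._running = True
        self._thread = threading.Thread(target=self._load, daemon=True)
        self._thread.start()

    def _load(self):
        i = 0
        while self._running:
            if len(self._buffer) < self._buffer_size:
                # wrap around the item list, like __cur_index above
                self._buffer.append(self._items[i % len(self._items)])
                i += 1
            time.sleep(0.01)

    def fetch(self):
        # block until the producer thread has filled the buffer
        while not self._buffer:
            time.sleep(0.01)
        return self._buffer.pop(0)

    def stop(self):
        self._running = False
        self._thread.join()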
def __load_train(self):
    """ load train data """
    print('\nStart loading train data')

    if os.path.isfile(self.__emb_pkl_path):
        self.__train_X, self.__train_y, self.dict, self.voc_size = load_pkl(self.__emb_pkl_path)
    else:
        print('loading doc list ...')

        # load the doc_list
        emb_json_path = os.path.join(path.PATH_TMP_DIR, 'emb_data.json')
        if os.path.isfile(emb_json_path):
            docs = load_json(emb_json_path)
        else:
            path_list = self.__get_path_list()
            docs = self.__load_docs(path_list, emb_json_path)

        print('generating dictionary ...')

        # generate the dictionary which maps the bond_id to index
        self.dict, self.voc_size = self.__gen_dict(docs)

        print('converting docs to trainable data format ...')

        # convert the doc list to trainable data format
        self.__train_X, self.__train_y = self.__convert(docs, self.__emb_pkl_path)

    print('Finish loading train data')
def __load_test(self):
    """ load test data """
    print('\nStart loading test data ...')

    if os.path.isfile(self.__test_emb_pkl_path):
        self.__test_X, self.__test_y, _, _ = load_pkl(self.__test_emb_pkl_path)
    else:
        print('loading test doc list ...')

        # load the doc_list
        emb_json_path = os.path.join(path.PATH_TMP_DIR, 'emb_test_data.json')
        if os.path.isfile(emb_json_path):
            docs = load_json(emb_json_path)
        else:
            path_list = self.__get_path_list('test')
            docs = self.__load_docs(path_list, emb_json_path)

        print('converting test docs to trainable test data format ...')

        # convert the doc list to trainable data format
        self.__test_X, self.__test_y = self.__convert(docs, self.__test_emb_pkl_path)

    print('Finish loading test data')
def __init__(self):
    # load the data
    self.train_loader = self.Loader(*self.train_preprocess_dirs)
    self.val_loader = self.Loader(*self.val_preprocess_dirs)

    # get the generator of the dataset
    self.train_data = self.train_loader.generator(self.M.pos_emb, self.M.train_params['batch_size'])
    self.val_data = self.val_loader.generator(self.M.pos_emb, self.M.train_params['batch_size'])

    # get the data size
    self.train_size = self.train_loader.size()
    self.val_size = self.val_loader.size()

    # get an example of a batch
    self.train_example_x, self.train_example_y = self.train_loader.batch_example(self.M.pos_emb)
    self.train_batch = self.train_loader.batch_data(self.M.pos_emb)
    self.val_batch = self.val_loader.batch_data(self.M.pos_emb)

    # load the tokenizer
    self.tokenizer = load_pkl(get_file_path(data_dir, 'tokenizer', self.tokenizer_dir, 'tokenizer.pkl'))
    self.vocab_size = self.tokenizer.vocab_size

    # show some statistics for the dataset
    print(f'vocab_size: {self.vocab_size}\n')
    self.train_stats = self.train_loader.show_statistics(self.M.pos_emb)
    self.val_stats = self.val_loader.show_statistics(self.M.pos_emb)
def __init__(self, start_ratio=0.0, end_ratio=0.98, _sample_rate=1.0, data_params={},
             tokenizer_pl=[], encoder_pl=[], _tokenizer_dir='cdlm', _dataset='cdlm'):
    # initialize variables
    self.__data_params = data_params
    self.__tokenizer_pl = tokenizer_pl
    self.__encoder_pl = encoder_pl
    self.__sample_rate = _sample_rate
    self.__tokenizer_path = os.path.join(create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
    self.__processed_dir_path = create_dir(data_dir, 'preprocessed', _dataset)

    # load data from files
    # zh_en_dict = load_json(filtered_pos_union_en_zh_dict_path)
    zh_en_dict = load_json(filtered_pos_union_zh_en_dict_path)
    zh_en_list = list(filter(lambda x: 'translation' in x[1] and x[1]['translation'], zh_en_dict.items()))
    zh_en_list = list(map(lambda x: [[x[0]] * len(x[1]['translation']), x[1]['translation']], zh_en_list))

    # data = reduce(lambda x, y: [x[0] + y[0], x[1] + y[1]], zh_en_list)
    zh_data = []
    en_data = []

    length = len(zh_en_list)
    for i, val in enumerate(zh_en_list):
        if i % 50 == 0:
            progress = float(i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        zh_data += val[0]
        en_data += val[1]

    data = list(zip(zh_data, en_data))

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # get the tokenizer
    if os.path.isfile(self.__tokenizer_path):
        self.__tokenizer = load_pkl(self.__tokenizer_path)
    else:
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
        self.get_tokenizer()

    # get the data set (train or validation or test)
    data = self.__split_data(data, start_ratio, end_ratio)

    self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
def __init__(self, tokenizer_dir, un_preprocess_dirs, data_params={}, pretrain_params={}, encoder_pl=[]):
    # initialize variables
    self.__data_params = data_params
    self.__pretrain_params = pretrain_params
    self.__encoder_pl = encoder_pl
    self.__dirs = un_preprocess_dirs
    self.__running = True
    self.__cur_index = 0
    self.__data = []
    self.__file_list = []
    self.__tokenizer = load_pkl(get_file_path(data_dir, 'tokenizer', tokenizer_dir, 'tokenizer.pkl'))

    # get the list of all files
    for dir_name in self.__dirs:
        _dir_path = create_dir(data_dir, 'un_preprocessed', dir_name)
        self.__file_list += list(map(lambda x: os.path.join(_dir_path, x), os.listdir(_dir_path)))
    self.__len_files = len(self.__file_list)

    random.seed(self.RANDOM_STATE)
    random.shuffle(self.__file_list)

    self.start()
def __load(self):
    data_queue = []
    max_queue_size = min(self.size(), self.queue_size)
    max_buffer_size = min(self.size(), self.buffer_size)

    while self.__running:
        while len(data_queue) < max_queue_size:
            file_path = self.__file_list[self.__cur_index]
            self.__cur_index = (self.__cur_index + 1) % self.__len_files

            batch_x, batch_y, batch_lan_x, batch_lan_y, batch_pos_y = load_pkl(file_path)
            data_queue += list(zip(batch_x, batch_y, batch_lan_x, batch_lan_y, batch_pos_y))

        if len(self.__data) < max_buffer_size:
            random.seed(42)
            random.shuffle(data_queue)
            self.__data += data_queue
            data_queue = []

        time.sleep(0.1)

    print('Stop thread for loading data ')
def __init__(self, input_days=20, force_refresh=False):
    self.__input_days = input_days
    self.__output_days = input_days // 10

    # get the path of the cache data
    data_subset = load.DATA_SUBSET_FOR_DEALER_PRED
    subset = os.path.split(data_subset)[1]
    year = load.YEAR_FOR_DEALER_PRED
    volume_level = load.VOLUME_LEVEL_FOR_DEALER_PRED
    no_below = load.NO_BELOW_FOR_DEALER_PRED
    data_index = load.DATA_INDEX_FOR_DEALER_PRED
    self.__data_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'temporal_input_interval_output_{subset}_{year}_{volume_level}_{data_index}'
        f'_no_below_{no_below}_input_days_{input_days}.pkl'
    )

    if os.path.isfile(self.__data_pkl_path) and not force_refresh:
        self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
            utils.load_pkl(self.__data_pkl_path)
    else:
        data_root_dir = os.path.join(data_subset, year, volume_level, data_index)
        print(f'\nStart loading data from {data_root_dir} ...')

        # load doc list
        train_doc_list = self.__load_dir(os.path.join(data_root_dir, 'train'))
        test_doc_list = self.__load_dir(os.path.join(data_root_dir, 'test'))
        print('Finish loading \n\nStart processing data ... ')

        # generate the dictionary which maps the bond_id to index
        self.dict, self.voc_size = self.__gen_dict(train_doc_list, no_below)

        # convert doc list to trainable interval summed one-hot vectors
        self.__train_X = self.__convert_input(train_doc_list)
        self.__train_y = self.__convert_output(train_doc_list)
        self.__test_X = self.__convert_input(test_doc_list)
        self.__test_y = self.__convert_output(test_doc_list)
        print('Finish processing ')

        # cache data for faster processing next time
        utils.write_pkl(self.__data_pkl_path, [
            self.__train_X, self.__train_y, self.__test_X, self.__test_y,
            self.dict, self.voc_size
        ])

    self.__gen_topics_mask()
    self.__statistic()
def __preprocess(self):
    """ preprocess the data into lists of token indices """
    print('\nProcessing data ... ')

    if self.tokenizer_dir:
        self.__src_tokenizer = load_pkl(get_file_path(data_dir, 'tokenizer', self.tokenizer_dir, 'tokenizer.pkl'))
        self.__tar_tokenizer = self.__src_tokenizer
    elif self.M.checkpoint_params['load_model']:
        load_model_params = self.M.checkpoint_params['load_model']
        tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                            load_model_params[0], load_model_params[1], 'tokenizer.pkl')
        self.__src_tokenizer = self.__tar_tokenizer = read_cache(tokenizer_path)
    else:
        self.__src_tokenizer = utils.pipeline(
            self.M.tokenizer_pl,
            self.__tokenizer_data_src,
            self.__tokenizer_data_tar,
            self.M.data_params,
        )
        self.__tar_tokenizer = self.__src_tokenizer

    del self.__tokenizer_data_src
    del self.__tokenizer_data_tar

    params = {
        **self.M.data_params,
        'tokenizer': self.__src_tokenizer,
        'src_tokenizer': self.__src_tokenizer,
        'tar_tokenizer': self.__tar_tokenizer,
    }
    self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
        self.M.encode_pipeline, self.__train_src, self.__train_tar, params)
    self.__val_src_encode, self.__val_tar_encode, _, _ = utils.pipeline(
        self.M.encode_pipeline, self.__val_src, self.__val_tar, params)
    self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
        self.M.encode_pipeline, self.__test_src, self.__test_tar, params)

    # get the vocabulary sizes
    self.__src_vocab_size = self.__src_tokenizer.vocab_size
    self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

    print('\nFinish preprocessing ')
def __init__(self, _is_train, _sample_rate=1.0, data_params={}, tokenizer_pl=[],
             encoder_pl=[], _tokenizer_dir='cdlm', _dataset='cdlm'):
    # initialize variables
    self.__data_params = data_params
    self.__tokenizer_pl = tokenizer_pl
    self.__encoder_pl = encoder_pl
    self.__sample_rate = _sample_rate
    self.__tokenizer_path = os.path.join(create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
    self.__processed_dir_path = create_dir(data_dir, 'preprocessed', _dataset)

    # initialize the wmt news loader
    start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

    # initialize the news commentary loader
    start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_news_commentary_loader = zh_en_news_commentary.Loader(start_ratio, end_ratio, 0.2)

    # load the data
    zh_data, en_data = zh_en_wmt_loader.data()
    zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

    # combine the data
    zh_data += zh_data_2
    en_data += en_data_2
    data = list(zip(zh_data, en_data))

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # get the tokenizer
    if os.path.isfile(self.__tokenizer_path):
        self.__tokenizer = load_pkl(self.__tokenizer_path)
    else:
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
        self.get_tokenizer()

    self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
def __load(self):
    while self.__running:
        while len(self.__X) < self.__buffer_size:
            file_path = self.__file_list[self.__cur_index]
            self.__cur_index = (self.__cur_index + 1) % self.__len_files

            X_mask, Y = utils.load_pkl(file_path)
            self.__X = np.vstack([self.__X, X_mask]) if len(self.__X) else X_mask
            self.__y = np.vstack([self.__y, Y]) if len(self.__y) else Y

        time.sleep(1)

    print('Stop thread for loading data ')
def __init__(self, input_days=20):
    self.__input_days = input_days
    self.__output_days = input_days // 10

    # get the path of the cache data
    subset = os.path.split(load.DATA_SUBSET)[1]
    self.__data_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'interval_input_output_{subset}_{load.YEAR}_{load.VOLUME_LEVEL}_{load.DATA_INDEX}'
        f'_no_below_{load.NO_BELOW}_input_days_{input_days}.pkl'
    )

    if os.path.isfile(self.__data_pkl_path):
        self.__train_data, self.__test_data, self.dict, self.voc_size = utils.load_pkl(self.__data_pkl_path)
    else:
        data_root_dir = os.path.join(load.DATA_SUBSET, load.YEAR, load.VOLUME_LEVEL, load.DATA_INDEX)
        print(f'\nStart loading data from {data_root_dir} ...')

        # load doc list
        train_doc_list = self.__load_dir(os.path.join(data_root_dir, 'train'))
        test_doc_list = self.__load_dir(os.path.join(data_root_dir, 'test'))
        print('Finish loading \n\nStart processing data ... ')

        # generate the dictionary which maps the bond_id to index
        self.dict, self.voc_size = self.__gen_dict(train_doc_list)

        # convert doc list to trainable interval summed one-hot vectors
        self.__train_data = self.__convert(train_doc_list)
        self.__test_data = self.__convert(test_doc_list)
        print('Finish processing ')

        # cache data for faster processing next time
        utils.write_pkl(self.__data_pkl_path, [
            self.__train_data, self.__test_data, self.dict, self.voc_size
        ])

    self.__gen_topics_mask()
    self.__statistic()
def __init__(self, start_ratio=0.0, end_ratio=0.98, _sample_rate=1.0, data_params={},
             tokenizer_pl=[], encoder_pl=[], _tokenizer_dir='cdlm', _dataset='cdlm'):
    # initialize variables
    self.__data_params = data_params
    self.__tokenizer_pl = tokenizer_pl
    self.__encoder_pl = encoder_pl
    self.__sample_rate = _sample_rate
    self.__tokenizer_path = os.path.join(create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
    self.__processed_dir_path = create_dir(data_dir, 'preprocessed', _dataset)

    # load data from files
    data = news_commentary.zh_en()
    data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # get the tokenizer
    if os.path.isfile(self.__tokenizer_path):
        self.__tokenizer = load_pkl(self.__tokenizer_path)
    else:
        tmp_data = reduce(lambda x, y: x + y, data)
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*tmp_data))
        self.get_tokenizer()

    # get the data set (train or validation or test)
    data = self.__split_data(data, start_ratio, end_ratio)
    data = reduce(lambda x, y: x + y, data)

    self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
def all(self):
    if self.__has_load_all:
        return self.__X, self.__y

    print(f'Loading all data from {self.__dir_path} ...')

    for i, file_path in enumerate(self.__file_list):
        if i % 2 == 0:
            progress = float(i + 1) / self.__len_files * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        X_mask, Y = utils.load_pkl(file_path)
        self.__X = np.vstack([self.__X, X_mask]) if len(self.__X) else X_mask
        self.__y = np.vstack([self.__y, Y]) if len(self.__y) else Y

    # note: this is a plain string, not a %-format, so a single '%' is correct
    print('\rprogress: 100.0%\nFinish loading ')
    self.__has_load_all = True

    # add statistics to the log
    self.__statistic()
    return self.__X, self.__y
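# ---------------------------------------------------------------------------
# Calling np.vstack inside the loop above copies the accumulated array on
# every iteration (quadratic overall). An equivalent, usually faster pattern
# is to collect the chunks in a list and stack once at the end. A sketch,
# where load_chunk stands in for utils.load_pkl (the helper name load_all is
# hypothetical):
import numpy as np


def load_all(file_list, load_chunk):
    """ load_chunk(path) -> (X_mask, Y); returns the stacked X and y arrays """
    xs, ys = [], []
    for file_path in file_list:
        X_mask, Y = load_chunk(file_path)
        xs.append(X_mask)
        ys.append(Y)
    # a single vstack allocates the final arrays only once
    return np.vstack(xs), np.vstack(ys)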
def __init__(self, input_days=20, force_refresh=False):
    self.__input_days = input_days
    self.__output_days = input_days // 10

    # get the path of the cache data
    data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
    subset = os.path.split(data_subset)[1]
    year = load.YEAR_FOR_TEMPORAL_INPUT
    volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
    no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
    data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT
    self.__data_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'temporal_input_interval_output_has_pretrain_same_volume_{subset}_{year}_{volume_level}'
        f'_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl'
    )

    data_all_root_dir = os.path.join(data_subset, year, volume_level)
    all_level = os.path.split(data_all_root_dir)[1]
    self.__pretrain_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'temporal_input_interval_output_for_pretrain_same_volume_{all_level}_{subset}_{year}'
        f'_{volume_level}_dealer_83_no_below_{no_below}_input_days_{input_days}.pkl'
    )
    _, _, _, _, self.dict_pretrain, self.voc_size_pretrain = utils.load_pkl(self.__pretrain_pkl_path)

    if os.path.isfile(self.__data_pkl_path) and not force_refresh:
        self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
            utils.load_pkl(self.__data_pkl_path)
    else:
        data_root_dir = os.path.join(data_subset, year, volume_level, data_index)
        print(f'\nStart loading data from {data_root_dir} ...')

        # load doc list
        train_doc_list = self.__load_dir(os.path.join(data_root_dir, 'train'))
        test_doc_list = self.__load_dir(os.path.join(data_root_dir, 'test'))
        print('Finish loading \n\nStart processing data ... ')

        # generate the dictionary which maps the bond_id to index
        self.dict, self.voc_size = self.__gen_dict(train_doc_list, no_below)

        # convert doc list to trainable interval summed one-hot vectors
        self.__train_X = self.__convert_input(train_doc_list, self.dict, self.voc_size)
        self.__train_y = self.__convert_output(train_doc_list)
        self.__test_X = self.__convert_input(test_doc_list, self.dict, self.voc_size)
        self.__test_y = self.__convert_output(test_doc_list)

        # encode the same docs with the pretrain dictionary
        self.__train_X_pretrain = self.__convert_input(train_doc_list, self.dict_pretrain, self.voc_size_pretrain)
        self.__test_X_pretrain = self.__convert_input(test_doc_list, self.dict_pretrain, self.voc_size_pretrain)

        # run the pretrained LSTM to get its representations
        o_lstm = LSTM('2020_01_12_18_49_46', 'lstm_for_pretrain_with_same_volume', 2007)
        o_lstm.compile(0.001)
        o_lstm.load_model(
            r'D:\Github\bond_prediction\runtime\models\lstm_for_pretrain_with_same_volume\2020_01_12_18_49_46\lstm_for_pretrain_with_same_volume.030-0.0596.hdf5',
            np.zeros([1, 20, 2007]), np.zeros([1, 20, 2007]))

        self.__train_X_pretrain = o_lstm.predict(self.__train_X_pretrain)
        self.__test_X_pretrain = o_lstm.predict(self.__test_X_pretrain)

        # repeat the pretrain vector across the 20 time steps
        self.__train_X_pretrain = np.array([self.__train_X_pretrain for _ in range(20)]).transpose([1, 0, 2])
        self.__test_X_pretrain = np.array([self.__test_X_pretrain for _ in range(20)]).transpose([1, 0, 2])

        # concatenate the pretrain features onto the inputs along the feature axis
        self.__train_X = np.vstack([
            self.__train_X.transpose([2, 0, 1]),
            self.__train_X_pretrain.transpose([2, 0, 1])
        ]).transpose([1, 2, 0])
        self.__test_X = np.vstack([
            self.__test_X.transpose([2, 0, 1]),
            self.__test_X_pretrain.transpose([2, 0, 1])
        ]).transpose([1, 2, 0])

        print('Finish processing ')

        # cache data for faster processing next time
        utils.write_pkl(self.__data_pkl_path, [
            self.__train_X, self.__train_y, self.__test_X, self.__test_y,
            self.dict, self.voc_size
        ])

    self.__gen_topics_mask()
    self.__statistic()
def gen_inputs(group_file_path, group_index, input_time_steps_list, output_time_steps_list,
               with_day_off=True, buy_sell_plan=2, use_volume=False, save_path='',
               split_ratio=0.9, is_train=True):
    d_dealer_index_2_group_label = utils.load_json(group_file_path)
    # d_dealer_index_2_trace_list = utils.load_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'))
    d_dealer_for_gen_input = utils.load_pkl(
        os.path.join(path.ROOT_DIR, 'runtime', 'd_dealer_for_gen_input_with_no_below_50_25_10.pkl'))

    tmp_list = [
        dealer_index
        for dealer_index, group_label in d_dealer_index_2_group_label.items()
        if group_label == group_index
    ]
    print(f'len_group_member: {len(tmp_list)}')

    # get the total trace list
    train_trace_list = []
    test_trace_list = []
    for dealer_index, val in d_dealer_for_gen_input.items():
        if dealer_index not in d_dealer_index_2_group_label or \
                d_dealer_index_2_group_label[dealer_index] != group_index:
            continue

        trace_list = val['trace_list']
        trace_list.sort(key=lambda x: x[-1])

        num_samples = len(trace_list) - max(input_time_steps_list) - max(output_time_steps_list)
        split_index = int(num_samples * split_ratio + max(input_time_steps_list))

        train_trace_list += trace_list[:split_index]
        test_trace_list += trace_list[split_index:]
        # print(dealer_index, len(trace_list), trace_list)

    train_trace_list.sort(key=lambda x: x[-1])

    # get the dictionary
    train_doc_list = [list(map(lambda x: x[0], train_trace_list))]
    dictionary = corpora.Dictionary(train_doc_list)
    len_bonds = len(dictionary)
    print(f'total bond num (group {group_index}): {len_bonds}')

    X = []
    X_mask = []
    Y = []

    for dealer_index, val in d_dealer_for_gen_input.items():
        if dealer_index not in d_dealer_index_2_group_label or \
                d_dealer_index_2_group_label[dealer_index] != group_index:
            continue

        # filter bonds that only appear in the test set
        trace_list = val['trace_list']
        num_samples = len(trace_list) - max(input_time_steps_list) - max(output_time_steps_list)
        split_index = int(num_samples * split_ratio + max(input_time_steps_list))
        if is_train:
            trace_list = trace_list[:split_index]
        else:
            trace_list = trace_list[split_index:]
        trace_list = [v for v in trace_list if dictionary.doc2idx([v[0]])[0] != -1]
        trace_list.sort(key=lambda x: x[-1])

        start_date = trace_list[0][-1]
        end_date = trace_list[-1][-1]

        # format the data in the date structure
        date_matrix, date_mask, dict_date_2_input_m_index = __generate_date_structure(
            len_bonds, start_date, end_date, with_day_off, buy_sell_plan)

        # according to the transaction history, fill the data into the date structure
        for i, trace in enumerate(trace_list):
            bond_id = trace[0]
            volume = trace[1]
            _date = trace[-1]
            trace_type = trace[2]

            bond_index = dictionary.doc2idx([bond_id])[0]
            value = 1 if not use_volume else np.log10(volume)

            if _date not in dict_date_2_input_m_index:
                continue

            date_mask = __change_mask(buy_sell_plan, date_mask, bond_index, len_bonds,
                                      dict_date_2_input_m_index, _date, trace_type, value)

        # sample the data
        longest_input_length = max(input_time_steps_list) + 2
        for input_time_steps in input_time_steps_list:
            for output_time_steps in output_time_steps_list:
                # input_list = __sample_list(date_matrix, input_time_steps, 0, output_time_steps,
                #                            __token(date_matrix.shape[-1]), __token(date_matrix.shape[-1]))
                input_mask_list = __sample_list(
                    date_mask, input_time_steps, 0, output_time_steps,
                    __start_token_mask(date_mask.shape[1:], with_day_off),
                    __end_token_mask(date_mask.shape[1:], with_day_off),
                    longest_len=longest_input_length)

                convert_fn = __convert_2_zero_one if buy_sell_plan in [0, 2] else None
                output_list = __sample_list(date_mask, output_time_steps, input_time_steps, 0,
                                            convert_fn=convert_fn)

                if len(input_mask_list) != len(output_list):
                    continue

                # ...
                # X += input_list
                X_mask += input_mask_list
                Y += output_list

        # d_dealer_index_2_trace_list_ordered[dealer_index] = [date_matrix, date_mask]

    if save_path:
        del d_dealer_for_gen_input
        del d_dealer_index_2_group_label
        del X

        X_mask = np.asarray(X_mask, dtype=np.int32)
        Y = np.asarray(Y, dtype=np.int32)

        print('\n------------------------------')
        # print(len(X))
        print(X_mask.shape)
        print(Y.shape)

        # write the samples to disk in shards of 2000
        len_X = len(X_mask)
        num_files = int(np.ceil(len_X / 2000.))
        for i in range(num_files):
            start_index = i * 2000
            end_index = (i + 1) * 2000
            utils.write_pkl(save_path + f'_{i}.pkl',
                            [X_mask[start_index:end_index], Y[start_index:end_index]])

    # return d_dealer_index_2_trace_list_ordered
    return X_mask, Y
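# ---------------------------------------------------------------------------
# gen_inputs writes its samples as shards named save_path + f'_{i}.pkl'. A
# minimal sketch of reading such shards back into single arrays; the helper
# name read_shards is hypothetical, and load_pkl is assumed to behave like
# utils.load_pkl above.
import glob
import os

import numpy as np


def read_shards(save_path, load_pkl):
    """ load every `<save_path>_<i>.pkl` shard and stack the pieces """
    paths = glob.glob(save_path + '_*.pkl')
    # sort numerically by shard index so that '_10' comes after '_9'
    paths.sort(key=lambda p: int(os.path.splitext(p)[0].rsplit('_', 1)[1]))

    xs, ys = [], []
    for file_path in paths:
        X_mask, Y = load_pkl(file_path)
        xs.append(X_mask)
        ys.append(Y)
    return np.vstack(xs), np.vstack(ys)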
def gen_group_according_to(file_path):
    print('loading data ...')
    dict_dealer_index_2_group = utils.load_json(file_path)

    data, d_dealers, total_volume, total_transaction_count, bound_timestamp, d_new_bonds = utils.load_pkl(
        os.path.join(path.ROOT_DIR, 'runtime', 'tmp123.pkl'))
    utils.write_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'), d_dealers)
    # d_dealers = utils.load_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'))

    labels = set(dict_dealer_index_2_group.values())
    group_list = [{} for _ in range(len(labels))]

    print('traversing data ...')

    length = len(d_dealers)
    cur = 0
    for dealer_index, trace_list in d_dealers.items():
        # show progress
        if cur % 5 == 0:
            progress = float(cur + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')
        cur += 1

        if dealer_index not in dict_dealer_index_2_group:
            continue

        group_index = dict_dealer_index_2_group[dealer_index]
        group_list[group_index][dealer_index] = trace_list

    print('\rprogress: 100.0% \nsaving data ...')

    plan_name = os.path.splitext(os.path.split(file_path)[1])[0] + '.json'
    group_path = os.path.join(path.DATA_ROOT_DIR, 'groups', plan_name)
    utils.write_json(group_path, group_list)
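# ---------------------------------------------------------------------------
# gen_group_according_to assumes the group labels are exactly 0..n-1 so that
# they can index group_list. A sketch of the same grouping built on a
# defaultdict, which drops that assumption (illustrative only; group_by_label
# is a hypothetical name, not the project's code):
from collections import defaultdict


def group_by_label(d_dealers, dealer_2_group):
    """ map each group label to the dict of its dealers' trace lists """
    groups = defaultdict(dict)
    for dealer_index, trace_list in d_dealers.items():
        if dealer_index in dealer_2_group:
            groups[dealer_2_group[dealer_index]][dealer_index] = trace_list
    return dict(groups)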
def __init__(self, input_days=20, force_refresh=False):
    self.__input_days = input_days
    self.__output_days = input_days // 10
    self.__input_mode = 0

    # get the path of the cache data
    data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
    subset = os.path.split(data_subset)[1]
    year = load.YEAR_FOR_TEMPORAL_INPUT
    volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
    no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
    data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT

    data_all_root_dir = os.path.join(data_subset, year, volume_level)
    all_level = os.path.split(data_all_root_dir)[1]
    self.__data_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'temporal_input_interval_output_for_pretrain_same_volume_{all_level}_{subset}_{year}'
        f'_{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl'
    )

    if os.path.isfile(self.__data_pkl_path) and not force_refresh:
        self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
            utils.load_pkl(self.__data_pkl_path)
    else:
        print(f'\nStart loading data from {data_all_root_dir} ...')

        train_start_timestamp = utils.date_2_timestamp('2015-01-02')
        train_end_timestamp = utils.date_2_timestamp('2015-10-14', True)
        test_start_timestamp = utils.date_2_timestamp('2015-10-14')
        test_end_timestamp = utils.date_2_timestamp('2015-12-31', True)

        data_all_pkl_path = os.path.join(
            path.PATH_TMP_DIR, f'all_doc_list_for_pretrain_{subset}_{year}_{all_level}.pkl')
        if os.path.isfile(data_all_pkl_path):
            train_all_doc_list, test_all_doc_list = utils.load_pkl(data_all_pkl_path)
        else:
            train_all_doc_list = self.__load_dir_all(
                data_all_root_dir, train_start_timestamp, train_end_timestamp, 'train')
            test_all_doc_list = self.__load_dir_all(
                data_all_root_dir, test_start_timestamp, test_end_timestamp, 'test')

            # alternative (unused): aggregate the per-volume subdirectories instead
            # train_all_doc_list = []
            # test_all_doc_list = []
            # for _volume in os.listdir(data_all_root_dir):
            #     sub_all_root_dir = os.path.join(data_all_root_dir, _volume)
            #     train_all_doc_list += self.__load_dir_all(sub_all_root_dir, train_start_timestamp,
            #                                               train_end_timestamp, 'train')
            #     test_all_doc_list += self.__load_dir_all(sub_all_root_dir, test_start_timestamp,
            #                                              test_end_timestamp, 'test')

            utils.write_pkl(data_all_pkl_path, [train_all_doc_list, test_all_doc_list])

        print('Finish loading \n\nStart processing data ... ')

        # flatten the per-dealer doc lists
        train_all_docs = []
        for v in train_all_doc_list:
            train_all_docs += v
        test_all_docs = []
        for v in test_all_doc_list:
            test_all_docs += v
        del train_all_doc_list
        del test_all_doc_list

        # generate the dictionary which maps the bond_id to index
        self.dict, self.voc_size = self.__gen_dict(train_all_docs, 150)
        print(self.voc_size)

        print('Finish generating dict\n\nStart converting input output ...')

        # convert doc list to trainable interval summed one-hot vectors
        # (an unused alternative averaged the inputs over the per-dealer doc
        # lists instead of converting the flattened docs directly)
        self.__train_X = self.__convert_input(train_all_docs, self.dict, self.voc_size, 'allow_unknown')
        self.__train_y = self.__convert_output(train_all_docs)
        self.__test_X = self.__convert_input(test_all_docs, self.dict, self.voc_size, 'allow_unknown')
        self.__test_y = self.__convert_output(test_all_docs)

        print('Finish processing ')

        # cache data for faster processing next time
        utils.write_pkl(self.__data_pkl_path, [
            self.__train_X, self.__train_y, self.__test_X, self.__test_y,
            self.dict, self.voc_size
        ])

    self.__gen_topics_mask()
    self.__statistic()
# _idx = total_dictionary.doc2idx([bond_id])[0]
# if tmp_date not in dict_date_2_input_index or _idx == -1:
#     continue
# tmp_inputs[dict_date_2_input_index[tmp_date]][_idx] = 1
# d_dealer_index_2_input[dealer_index] = tmp_inputs
#
# for v in origin_l_dealers:
#     dealer_index = v[0]
#     v.append(d_dealer_index_2_input[dealer_index])

# --------------------------------

print('Loading variables ...')
# utils.write_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_origin_l_dealers.pkl'), origin_l_dealers)
origin_l_dealers = utils.load_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_origin_l_dealers.pkl'))

print('Converting ...')

# drop entries whose count field (v[-3]) is 5 or less, and keep at most 240
new_l_dealers = []
for v in origin_l_dealers:
    if v[-3] <= 5:
        continue
    new_l_dealers.append(v)
origin_l_dealers = new_l_dealers[:240]

l_dealers = list(map(
    lambda x:
    # [x[0], np.log(x[4]), np.log10(x[5] + 1.1), x[6], x[7], x[8]],
    [
import os

import numpy as np

from config import path
from lib import utils

path_d_issue_id_offering_date = os.path.join(path.DATA_ROOT_DIR, 'dict_issue_id_offering_date.json')
dict_issue_id_offering_date = utils.load_json(path_d_issue_id_offering_date)

path_pkl_2015 = os.path.join(path.TRACE_DIR, 'finra_trace_2015.pkl')
data = utils.load_pkl(path_pkl_2015)

print('\nstart converting data ...')
data = np.array(data)
data = list(map(lambda x: {'bond_id': x[0], 'issue_id': x[16]}, data))
print('finish converting ')

dict_skip_bond = {}
dict_bond_id_offering_date = {}

for v in data:
    bond_id = v['bond_id']
    issue_id = str(int(v['issue_id']))

    if issue_id not in dict_issue_id_offering_date:
        dict_skip_bond[bond_id] = True
        print('------------------------')
        print(bond_id, issue_id)
        continue

    offering_date = dict_issue_id_offering_date[issue_id]
    # record the offering date for this bond
    dict_bond_id_offering_date[bond_id] = offering_date