def _read_10_days_dep(cls, train_dir, test_dir, auction, norm_type, setup):
  """For Setup 2, read `train_7`, `test_7`, `test_8` and `test_9` in order.

  `train_7` holds days 1-7 and `test_7` .. `test_9` hold days 8-10, so
  together the four files cover all 10 trading days.
  """
  assert setup == 2
  train_paths = cls._get_data_file_path_list(
    train_dir, True, auction, norm_type)
  test_paths = cls._get_data_file_path_list(
    test_dir, False, auction, norm_type)

  # Read data from .txt files
  features, targets = [], {}
  horizons = [10, 20, 30, 50, 100]
  for h in horizons: targets[h] = []
  dim = 144
  # For Setup 2, train_set = train_7, test_set = test_[789]
  data_paths = [train_paths[6]] + test_paths[-3:]
  for i, path in enumerate(data_paths):
    # Create day slot for features and targets
    features.append([])
    for h in horizons: targets[h].append([])
    console.show_status('Reading data from `{}` ...'.format(
      os.path.basename(path)))
    with open(path, 'r') as f: lines = f.readlines()
    # Sanity check: one row per feature plus one row per horizon
    assert len(lines) == dim + len(horizons)
    # Parse data (one whitespace-separated column per event block)
    data = [[s for s in line.split(' ') if s] for line in lines]
    total = len(data[0])
    assert all(len(d) == total for d in data)
    bar = ProgressBar(total)
    # Put data appropriately
    for j, str_list in enumerate(zip(*data)):
      col = np.array(str_list, dtype=float)
      features[-1].append(col[:dim])
      for k, h in enumerate(horizons): targets[h][-1].append(col[dim + k])
      # Refresh progress bar
      bar.show(j + 1)
    # Stack list
    features[-1] = np.stack(features[-1], axis=0)
    for k, h in enumerate(horizons):
      # Shift labels from {1, 2, 3} to {0, 1, 2}
      targets[h][-1] = np.array(
        np.stack(targets[h][-1], axis=0), dtype=np.int64) - 1
    console.show_status('Successfully read {} event blocks'.format(total))

  # Sanity check and return
  total = sum([len(x) for x in features])
  console.show_status('{} event blocks read in total.'.format(total))
  if not auction: assert total == 394337
  else: assert total == 458125
  return features, targets
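
# --- Illustrative sketch (not part of the original module) ------------------
# A minimal, self-contained rehearsal of the parsing step above. The layout
# is taken from the method itself: each FI-2010 .txt file stores one row per
# feature (144 rows) plus one row per prediction horizon (5 rows), with one
# whitespace-separated column per event block. The toy sizes and values
# below are made up for illustration.
import numpy as np

_dim, _horizons = 3, [10, 20]                    # real values: 144 features, 5 horizons
_toy_lines = ['1.0 2.0 3.0', '4.0 5.0 6.0',      # feature rows
              '7.0 8.0 9.0',
              '1 2 3', '3 2 1']                  # label rows, classes in {1, 2, 3}
_data = [[s for s in line.split(' ') if s] for line in _toy_lines]
_cols = [np.array(c, dtype=float) for c in zip(*_data)]   # one column per event
_features = np.stack([c[:_dim] for c in _cols], axis=0)   # shape [num_events, dim]
_labels = {h: np.array([c[_dim + k] for c in _cols], dtype=np.int64) - 1
           for k, h in enumerate(_horizons)}              # shift classes to {0, 1, 2}
assert _features.shape == (3, 3) and list(_labels[10]) == [0, 1, 2]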
def evaluate(self, fetches, data, batch_size=None, postprocessor=None,
             verbose=False, num_steps=None, suppress_n_to_one=False):
  """Evaluate tensors based on data.

  TODO: note that if num_steps != -1, outputs from the same sequence may be
        partitioned. E.g., if single_fetch, outputs will be
          [array_1_1, ..., array_1_k1, array_2_1, ..., array_2_k2, ...]
          |-------- input_1 ----------|------------ input_2 ----------|
        This is OK for seq2seq validation, but the outputs need to be
        post-processed in tasks like sequence classification
        (currently forbidden).

  :param fetches: a (tuple/list of) tf.Tensor(s) to be evaluated
  :param data: data used for evaluation
  :param batch_size: if not specified (None by default), batch_size will be
                     assigned accordingly. If assigned with a positive
                     integer, evaluation will be performed batch by batch.
  :param postprocessor: post-processor for outputs
  :return: commonly a (list of) tf.Tensor(s), each of which has the same
           batch size as the provided data
  """
  # Sanity check for fetches
  checker.check_fetchable(fetches)
  single_fetch = not isinstance(fetches, (tuple, list))
  # Wrap fetches into a list if necessary
  if single_fetch: fetches = [fetches]
  if num_steps is None: num_steps = hub.val_num_steps
  if batch_size is None: batch_size = data.size

  # Get outputs (sometimes fetches may contain operations which yield None)
  outputs = [[] for op in fetches if not isinstance(op, tf.Operation)]

  if verbose:
    bar = ProgressBar(data.get_round_length(batch_size, num_steps))
    console.show_status('Evaluating on {} ...'.format(data.name))

  for cursor, data_batch in enumerate(
      self.get_data_batches(data, batch_size, num_steps)):
    data_batch = self._sanity_check_before_use(data_batch)
    # Get batch outputs
    # for FNN, batch_outputs = [np_array_1, np_array_2, ...],
    #          where each np_array_k has the same batch_size
    # for RNN, batch_outputs = [[s1_1, s1_2, ..., s1_N],      <= fetches[0]
    #                           [s2_1, s2_2, ..., s2_N], ...] <= fetches[1]
    #          where N is the batch_size and each sk_i is a numpy array
    batch_outputs = self._evaluate_batch(
      fetches, data_batch, num_steps=num_steps,
      suppress_n_to_one=suppress_n_to_one)
    assert isinstance(batch_outputs, list)
    assert len(batch_outputs) == len(outputs)
    # Add batch_outputs to outputs accordingly
    for i, batch_output in enumerate(batch_outputs):
      assert isinstance(outputs[i], list)
      output_is_a_batch = fetches[i].shape.as_list()[0] is None
      if self.input_type is InputTypes.RNN_BATCH and output_is_a_batch:
        # batch_output is [s1_1, s1_2, ..., s1_N]
        assert isinstance(batch_output, list)
        outputs[i] = outputs[i] + batch_output
      else:
        # batch_output is a numpy array of length batch_size
        outputs[i].append(batch_output)
    # Show progress bar if necessary
    if verbose: bar.show(cursor + 1)

  # Merge outputs if necessary
  if self.input_type is InputTypes.BATCH:
    outputs = [np.concatenate(array_list, axis=0) for array_list in outputs]

  # Post-process and return
  if postprocessor is not None:
    assert callable(postprocessor)
    outputs = postprocessor(outputs)
  assert isinstance(outputs, list)
  if single_fetch: outputs = outputs[0]
  return outputs
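
# --- Illustrative sketch (not part of the original module) ------------------
# A minimal rehearsal of the accumulate-then-merge logic above for the
# feed-forward (InputTypes.BATCH) path only: per-batch numpy arrays are
# appended to one slot per fetch, then concatenated so each merged output
# has the same batch dimension as the evaluated data. The two hypothetical
# batches below stand in for `_evaluate_batch` results with two fetches.
import numpy as np

_outputs = [[], []]                                       # one slot per fetch
for _batch_outputs in ([np.ones((4, 3)), np.zeros(4)],    # full batch of 4
                       [np.ones((2, 3)), np.zeros(2)]):   # remainder batch of 2
  for _i, _batch_output in enumerate(_batch_outputs):
    _outputs[_i].append(_batch_output)
_outputs = [np.concatenate(_lst, axis=0) for _lst in _outputs]
assert _outputs[0].shape == (6, 3) and _outputs[1].shape == (6,)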
def _check_raw_lob(cls, data_dir, auction, lob_list, raise_err=False):
  console.show_status('Checking LOB list ...')
  # Sanity check
  assert isinstance(auction, bool) and len(lob_list) == 2
  for lob in lob_list:
    assert isinstance(lob, np.ndarray) and lob.shape[1] == 40

  # Calculate stats for normalization
  lob_1_9 = lob_list[0]
  mu, sigma = np.mean(lob_1_9, axis=0), np.std(lob_1_9, axis=0)
  x_min, x_max = np.min(lob_1_9, axis=0), np.max(lob_1_9, axis=0)
  x_deno = x_max - x_min

  # Load z-score data
  zscore_set = cls.load_as_tframe_data(
    data_dir, auction=auction, norm_type='zscore', setup=9,
    file_slices=(slice(8, 9), slice(8, 9)))
  assert isinstance(zscore_set, SequenceSet)
  zs_all = np.concatenate(
    [array[:, :40] for array in zscore_set.data_dict['raw_data']], axis=0)
  # Load min-max data
  mm_set = cls.load_as_tframe_data(
    data_dir, auction=auction, norm_type='minmax', setup=9,
    file_slices=(slice(8, 9), slice(8, 9)))
  mm_all = np.concatenate(
    [array[:, :40] for array in mm_set.data_dict['raw_data']], axis=0)

  # Generate z-score data from the raw LOB for validation
  lob_all = np.concatenate(lob_list, axis=0)
  lob_zs_all = (lob_all - mu) / sigma

  # Check error
  max_err = 1e-4
  delta_all = np.abs(lob_zs_all - zs_all)
  if np.max(delta_all) < max_err:
    console.show_info('LOB list is correct.')
    return True
  if raise_err: raise AssertionError

  # Correct LOB using the min-max data
  console.show_status('Correcting LOB list ...')
  V_errs, P_errs = 0, 0
  bar = ProgressBar(total=len(lob_all))
  for i, j in np.argwhere(delta_all > max_err):
    # Even columns hold prices, odd columns hold volumes
    price_err = j % 2 == 0
    V_errs, P_errs = V_errs + 1 - price_err, P_errs + price_err
    # Find the correct value by de-normalizing both the z-score entry and
    # the min-max entry; the two must agree on the raw value
    val_zs = zs_all[i][j] * sigma[j] + mu[j]
    val_mm = mm_all[i][j] * x_deno[j] + x_min[j]
    zs_mm_err = abs(val_zs - val_mm)
    if zs_mm_err > 0.1: raise AssertionError(
      'In LOB[{}, {}] val_zs = {} while val_mm = {}'.format(
        i, j, val_zs, val_mm))
    correct_val = val_mm
    # Volumes are integers, so round the corrected value
    if not price_err: correct_val = np.round(val_mm)
    cor_mm_err = abs(correct_val - val_mm)
    if cor_mm_err > 1e-3: raise AssertionError(
      'In LOB[{}, {}] cor_val = {} while val_mm = {}'.format(
        i, j, correct_val, val_mm))
    # Correct value in lob_all
    lob_all[i, j] = correct_val
    bar.show(i)

  # Show status after correction
  console.show_status(
    '{} price errors and {} volume errors have been corrected'.format(
      P_errs, V_errs))
  # Split the corrected block back into per-day arrays and re-check
  new_lob_list = []
  for s in [len(array) for array in lob_list]:
    day_block, lob_all = np.split(lob_all, [s])
    new_lob_list.append(day_block)
  assert cls._check_raw_lob(data_dir, auction, new_lob_list, True)
  # TODO: write the corrected arrays back, e.g.
  #       for i in range(10): lob_list[i] = new_lob_list[i]
  assert False
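
# --- Illustrative sketch (not part of the original module) ------------------
# The correction above hinges on an identity: de-normalizing a z-score entry
# (x = z * sigma + mu) and de-normalizing the matching min-max entry
# (x = m * (x_max - x_min) + x_min) must recover the same raw LOB value, so
# any large disagreement flags a corrupted entry. The numbers below are toy
# values, not real LOB data.
import numpy as np

_raw = np.array([10.0, 12.5, 11.0, 13.5])
_mu, _sigma = _raw.mean(), _raw.std()
_x_min, _x_max = _raw.min(), _raw.max()
_val_zs = ((_raw - _mu) / _sigma) * _sigma + _mu                  # invert z-score
_val_mm = ((_raw - _x_min) / (_x_max - _x_min)) * (_x_max - _x_min) + _x_min
assert np.all(np.abs(_val_zs - _val_mm) < 1e-9)                   # both recover _raw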
def _read_train_test(cls, train_dir, test_dir, auction, norm_type,
                     file_slices=None):
  """This method is mainly intended for reading DecPre data, from which the
     raw LOB data can later be restored."""
  train_paths = cls._get_data_file_path_list(
    train_dir, True, auction, norm_type)
  test_paths = cls._get_data_file_path_list(
    test_dir, False, auction, norm_type)

  # Read data from .txt files
  features, targets = [], {}
  horizons = [10, 20, 30, 50, 100]
  for h in horizons: targets[h] = []
  dim = 144
  # Select day files: by default take the first train file and all 9 test
  # files, which together cover the 10 trading days
  if file_slices is None:
    train_slice, test_slice = slice(0, 1), slice(0, 9)
  else:
    checker.check_type(file_slices, slice)
    assert len(file_slices) == 2
    train_slice, test_slice = file_slices
  data_paths = train_paths[train_slice] + test_paths[test_slice]
  for i, path in enumerate(data_paths):
    # Create day slot for features and targets
    features.append([])
    for h in horizons: targets[h].append([])
    console.show_status('Reading data from `{}` ...'.format(
      os.path.basename(path)))
    with open(path, 'r') as f: lines = f.readlines()
    # Sanity check: one row per feature plus one row per horizon
    assert len(lines) == dim + len(horizons)
    # Parse data (one whitespace-separated column per event block)
    data = [[s for s in line.split(' ') if s] for line in lines]
    total = len(data[0])
    assert all(len(d) == total for d in data)
    bar = ProgressBar(total)
    # Put data appropriately
    for j, str_list in enumerate(zip(*data)):
      col = np.array(str_list, dtype=float)
      features[-1].append(col[:dim])
      for k, h in enumerate(horizons): targets[h][-1].append(col[dim + k])
      # Refresh progress bar
      bar.show(j + 1)
    # Stack list
    features[-1] = np.stack(features[-1], axis=0)
    for k, h in enumerate(horizons):
      # Shift labels from {1, 2, 3} to {0, 1, 2}
      targets[h][-1] = np.array(
        np.stack(targets[h][-1], axis=0), dtype=np.int64) - 1
    console.show_status('Successfully read {} event blocks'.format(total))

  # Sanity check and return
  total = sum([len(x) for x in features])
  console.show_status('{} event blocks read in total.'.format(total))
  if not auction: assert total == 394337
  else: assert total == 458125
  return features, targets
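
# --- Illustrative sketch (not part of the original module) ------------------
# A minimal rehearsal of the `file_slices` convention above, using
# hypothetical path lists: the first slice picks train files, the second
# picks test files. The default (None) takes the first train file and all
# nine test files; `_check_raw_lob` passes (slice(8, 9), slice(8, 9)) to
# read only the ninth train file and the ninth test file.
_train_paths = ['train_{}'.format(i) for i in range(1, 10)]
_test_paths = ['test_{}'.format(i) for i in range(1, 10)]
_default = _train_paths[slice(0, 1)] + _test_paths[slice(0, 9)]
assert _default == ['train_1'] + _test_paths              # 10 day files in order
_setup_9 = _train_paths[slice(8, 9)] + _test_paths[slice(8, 9)]
assert _setup_9 == ['train_9', 'test_9']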