Example #1
 def _read_10_days_dep(cls, train_dir, test_dir, auction, norm_type, setup):
     """Read train_1, test_1, ... test_9 in order."""
     assert setup == 2
     train_paths = cls._get_data_file_path_list(train_dir, True, auction,
                                                norm_type)
     test_paths = cls._get_data_file_path_list(test_dir, False, auction,
                                               norm_type)
     # Read data from .txt files
     features, targets = [], {}
     horizons = [10, 20, 30, 50, 100]
     for h in horizons:
         targets[h] = []
     dim = 144
     # For Setup 2, train_set = train_7, test_set = test_[789]
     data_paths = [train_paths[6]] + test_paths[-3:]
     for i, path in enumerate(data_paths):
         # Create day slot for features and targets
         features.append([])
         for h in horizons:
             targets[h].append([])
         console.show_status("Reading data from `{}` ...".format(
             os.path.basename(path)))
         with open(path, "r") as f:
             lines = f.readlines()
         # Sanity check
         assert len(lines) == dim + len(horizons)
         # Parse data
         data = [[s for s in line.split(" ") if s] for line in lines]
         total = len(data[0])
         assert all(len(d) == total for d in data)
         bar = ProgressBar(total)
         # Put data appropriately
         for j, str_list in enumerate(zip(*data)):
             col = np.array(str_list, dtype=np.float64)  # np.float is removed in NumPy >= 1.24
             features[-1].append(col[:dim])
             for k, h in enumerate(horizons):
                 targets[h][-1].append(col[dim + k])
             # Refresh progress bar
             bar.show(j + 1)
         # Stack list
         features[-1] = np.stack(features[-1], axis=0)
         for k, h in enumerate(horizons):
             targets[h][-1] = (np.array(np.stack(targets[h][-1], axis=0),
                                        dtype=np.int64) - 1)
         console.show_status(
             "Successfully read {} event blocks".format(total))
     # Sanity check and return
     total = sum([len(x) for x in features])
     console.show_status("Totally {} event blocks read.".format(total))
     if not auction:
         assert total == 394337
     else:
         assert total == 458125
     return features, targets
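
A minimal usage sketch (hypothetical: the owning class name `FI2010Loader` and the directory paths are assumptions, not part of the source):

    # Hypothetical usage; `FI2010Loader` and the paths are assumptions.
    features, targets = FI2010Loader._read_10_days_dep(
        train_dir='data/fi2010/train', test_dir='data/fi2010/test',
        auction=False, norm_type='zscore', setup=2)
    # features: one (num_events, 144) array per day (train_7, test_7..test_9)
    # targets[h]: matching zero-based label arrays for prediction horizon h
    print(features[0].shape, targets[10][0][:5])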
Example #2
    def evaluate(self,
                 fetches,
                 data,
                 batch_size=None,
                 postprocessor=None,
                 verbose=False,
                 num_steps=None,
                 suppress_n_to_one=False):
        """
    Evaluate tensors based on data
    TODO: note that if num_steps != -1, outputs from a same sequence may be
          partitioned. e.g., if single_fetch, outputs will be
          [array_1_1, ..., array_1_k1, array_2_1, ..., array_2_k2, ...]
         |-------- input_1 ----------|------------ input_2 ----------|
         it's OK for seq2seq validation, but need to be post-proceeded in
         tasks like sequence classification (currently forbidden)

    :param fetches: a (tuple/list of) tf.Tensor(s) to be evaluated
    :param data: data used for evaluation
    :param batch_size: if not specified (None by default), batch_size will be
                       assigned accordingly. If assigned with a positive
                       integer, evaluation will be performed batch by batch.
    :param postprocessor: post-processor for outputs
    :return: commonly a (list of) tf.Tensor(s), each of which has the
             same batch size with the provided data
    """
        # Sanity check for fetches
        checker.check_fetchable(fetches)
        single_fetch = not isinstance(fetches, (tuple, list))
        # Wrap fetches into a list if necessary
        if single_fetch: fetches = [fetches]
        if num_steps is None: num_steps = hub.val_num_steps
        if batch_size is None: batch_size = data.size

        # Get outputs (sometimes fetches may contain operations which yield None)
        outputs = [[] for op in fetches if not isinstance(op, tf.Operation)]

        if verbose:
            bar = ProgressBar(data.get_round_length(batch_size, num_steps))
            console.show_status('Evaluating on {} ...'.format(data.name))

        for cursor, data_batch in enumerate(
                self.get_data_batches(data, batch_size, num_steps)):
            data_batch = self._sanity_check_before_use(data_batch)
            # Get batch outputs          fetches[0]  fetches[1]
            #  for FNN, batch_outputs = [np_array_1, np_array_2, ...]
            #           each np_array_k has the same batch_size
            #  for RNN, batch_outputs = [[s1_1, s1_2, ..., s1_N],       <= fetches[0]
            #                            [s2_1, s2_2, ..., s2_N], ...]  <= fetches[1]
            #           N is the batch_size, and each sk_i is a numpy array
            batch_outputs = self._evaluate_batch(
                fetches,
                data_batch,
                num_steps=num_steps,
                suppress_n_to_one=suppress_n_to_one)
            assert isinstance(batch_outputs, list)
            assert len(batch_outputs) == len(outputs)

            # Add batch_outputs to outputs accordingly
            for i, batch_output in enumerate(batch_outputs):
                assert isinstance(outputs[i], list)
                output_is_a_batch = fetches[i].shape.as_list()[0] is None
                if self.input_type is InputTypes.RNN_BATCH and output_is_a_batch:
                    # batch_output is [s1_1, s1_2, ..., s1_N]
                    assert isinstance(batch_output, list)
                    outputs[i] = outputs[i] + batch_output
                else:
                    # batch_output is a numpy array of length batch_size
                    outputs[i].append(batch_output)

            # Show progress bar if necessary
            if verbose: bar.show(cursor + 1)

        # Merge outputs if necessary
        if self.input_type is InputTypes.BATCH:
            outputs = [
                np.concatenate(array_list, axis=0) for array_list in outputs
            ]

        # Post-process and return
        if postprocessor is not None:
            assert callable(postprocessor)
            outputs = postprocessor(outputs)

        assert isinstance(outputs, list)
        if single_fetch: outputs = outputs[0]
        return outputs
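
A usage sketch, assuming a trained `model` exposing this method, a `logits` tensor, and a `val_set` data object (all hypothetical names):

    # Hypothetical usage; `model`, `logits` and `val_set` are assumptions.
    import numpy as np
    import tensorflow as tf

    # Single fetch: returns one array covering the whole validation set
    probs = model.evaluate(tf.nn.softmax(logits), val_set,
                           batch_size=128, verbose=True)
    # With a postprocessor: it receives the full list of outputs and
    # should itself return a list
    preds = model.evaluate(
        [tf.nn.softmax(logits)], val_set, batch_size=128,
        postprocessor=lambda outs: [np.argmax(o, axis=-1) for o in outs])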
Example #3
 def _check_raw_lob(cls, data_dir, auction, lob_list, raise_err=False):
     console.show_status('Checking LOB list ...')
     # Sanity check
     assert isinstance(auction, bool) and len(lob_list) == 2
     for lob in lob_list:
         assert isinstance(lob, np.ndarray) and lob.shape[1] == 40
     # Calculate stats for normalization
     lob_1_9 = lob_list[0]
     mu, sigma = np.mean(lob_1_9, axis=0), np.std(lob_1_9, axis=0)
     x_min, x_max = np.min(lob_1_9, axis=0), np.max(lob_1_9, axis=0)
     x_deno = x_max - x_min
     # Load z-score data
     zscore_set = cls.load_as_tframe_data(data_dir,
                                          auction=auction,
                                          norm_type='zscore',
                                          setup=9,
                                          file_slices=(slice(8, 9),
                                                       slice(8, 9)))
     assert isinstance(zscore_set, SequenceSet)
     zs_all = np.concatenate(
         [array[:, :40] for array in zscore_set.data_dict['raw_data']],
         axis=0)
     # Load min-max data
     mm_set = cls.load_as_tframe_data(data_dir,
                                      auction=auction,  # match the z-score load
                                      norm_type='minmax',
                                      setup=9,
                                      file_slices=(slice(8, 9),
                                                   slice(8, 9)))
     mm_all = np.concatenate(
         [array[:, :40] for array in mm_set.data_dict['raw_data']], axis=0)
     # Generate lob -> zscore data for validation
     lob_all = np.concatenate(lob_list, axis=0)
     lob_zs_all = (lob_all - mu) / sigma
     # Check error
     max_err = 1e-4
     delta_all = np.abs(lob_zs_all - zs_all)
     if np.max(delta_all) < max_err:
         console.show_info('LOB list is correct.')
         return True
     if raise_err: raise AssertionError
     # Correct LOB using the z-score and min-max references
     console.show_status('Correcting LOB list ...')
     V_errs, P_errs = 0, 0
     bar = ProgressBar(total=len(lob_all))
     for i, j in np.argwhere(delta_all > max_err):
         price_err = j % 2 == 0
         V_errs, P_errs = V_errs + 1 - price_err, P_errs + price_err
         # Find correct value
         val_zs = zs_all[i][j] * sigma[j] + mu[j]
         val_mm = mm_all[i][j] * x_deno[j] + x_min[j]
         zs_mm_err = abs(val_zs - val_mm)
         if zs_mm_err > 0.1:
             raise AssertionError(
                 'In LOB[{}, {}] val_zs = {} while val_mm = {}'.format(
                     i, j, val_zs, val_mm))
         correct_val = val_mm
         if not price_err:
             # Volume columns hold integer counts, so round the recovered value
             correct_val = np.round(val_mm)
             cor_mm_err = abs(correct_val - val_mm)
             if cor_mm_err > 1e-3:
                 raise AssertionError(
                     'In LOB[{}, {}] cor_val = {} while val_mm = {}'.format(
                         i, j, correct_val, val_mm))
         # Correct value in lob_all
         lob_all[i, j] = correct_val
         bar.show(i)
     # Show status after correction
     console.show_status(
         '{} price errors and {} volume errors have been corrected'.format(
             P_errs, V_errs))
     new_lob_list = []
     for s in [len(array) for array in lob_list]:
         day_block, lob_all = np.split(lob_all, [s])
         new_lob_list.append(day_block)
     assert cls._check_raw_lob(data_dir, auction, new_lob_list, True)
     # TODO: write the corrected data back into lob_list
     # for i in range(10): lob_list[i] = new_lob_list[i]
     assert False
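
The correction step relies on recovering the raw value independently from the z-score and min-max encodings and checking that the two agree. A self-contained sketch of that identity on synthetic data:

    # Self-contained sketch of the cross-check above (synthetic data).
    import numpy as np

    raw = np.random.rand(1000, 40) * 100    # stand-in for raw LOB columns
    mu, sigma = raw.mean(axis=0), raw.std(axis=0)
    x_min, x_max = raw.min(axis=0), raw.max(axis=0)
    zs = (raw - mu) / sigma                 # z-score encoding
    mm = (raw - x_min) / (x_max - x_min)    # min-max encoding
    # Invert both encodings; the recovered values must agree.
    val_zs = zs * sigma + mu
    val_mm = mm * (x_max - x_min) + x_min
    assert np.max(np.abs(val_zs - val_mm)) < 1e-8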
Example #4
 def _read_train_test(cls,
                      train_dir,
                      test_dir,
                      auction,
                      norm_type,
                      file_slices=None):
     """This method is better used for reading DecPre data for further restoring
 """
     train_paths = cls._get_data_file_path_list(train_dir, True, auction,
                                                norm_type)
     test_paths = cls._get_data_file_path_list(test_dir, False, auction,
                                               norm_type)
     # Read data from .txt files
     features, targets = [], {}
     horizons = [10, 20, 30, 50, 100]
     for h in horizons:
         targets[h] = []
     dim = 144
     if file_slices is None:
         train_slice, test_slice = slice(0, 1), slice(0, 9)
     else:
         checker.check_type(file_slices, slice)
         assert len(file_slices) == 2
         train_slice, test_slice = file_slices
     data_paths = train_paths[train_slice] + test_paths[test_slice]
     for i, path in enumerate(data_paths):
         # Create day slot for features and targets
         features.append([])
         for h in horizons:
             targets[h].append([])
         console.show_status('Reading data from `{}` ...'.format(
             os.path.basename(path)))
         with open(path, 'r') as f:
             lines = f.readlines()
         # Sanity check
         assert len(lines) == dim + len(horizons)
         # Parse data
         data = [[s for s in line.split(' ') if s] for line in lines]
         total = len(data[0])
         assert all(len(d) == total for d in data)
         bar = ProgressBar(total)
         # Put data appropriately
         for j, str_list in enumerate(zip(*data)):
             col = np.array(str_list, dtype=np.float64)  # np.float is removed in NumPy >= 1.24
             features[-1].append(col[:dim])
             for k, h in enumerate(horizons):
                 targets[h][-1].append(col[dim + k])
             # Refresh progress bar
             bar.show(j + 1)
         # Stack list
         features[-1] = np.stack(features[-1], axis=0)
         for k, h in enumerate(horizons):
             targets[h][-1] = np.array(np.stack(targets[h][-1], axis=0),
                                       dtype=np.int64) - 1
         console.show_status(
             'Successfully read {} event blocks'.format(total))
     # Sanity check and return
     total = sum([len(x) for x in features])
     console.show_status('{} event blocks read in total.'.format(total))
     if not auction: assert total == 394337
     else: assert total == 458125
     return features, targets
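
A usage sketch (hypothetical class name and paths; with the default `file_slices`, the method reads train_1 plus test_1 through test_9, i.e. the full 10 days the final assertion expects):

    # Hypothetical usage; `FI2010Loader` and the paths are assumptions.
    features, targets = FI2010Loader._read_train_test(
        'data/fi2010/train', 'data/fi2010/test',
        auction=False, norm_type='decpre')
    assert len(features) == 10                    # one array per day
    assert sorted(targets) == [10, 20, 30, 50, 100]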