Example #1
    def __init__(self,
                 signals,
                 responses=None,
                 data_dict=None,
                 summ_dict=None,
                 n_to_one=False,
                 name='signal_set1',
                 converter=None,
                 **kwargs):
        # Check signals
        signal_dict, fs = self._check_signals(signals, responses)
        data_dict = {} if data_dict is None else data_dict
        data_dict.update(signal_dict)
        kwargs.update({pedia.sampling_frequency: fs})

        # Call parent's constructor
        SequenceSet.__init__(self,
                             data_dict=data_dict,
                             summ_dict=summ_dict,
                             n_to_one=n_to_one,
                             name=name,
                             **kwargs)
        # Attributes
        if converter is not None:
            assert callable(converter)
            self.converter = converter
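A note on the `converter` hook above: any callable passes the assert, so a conversion function can be attached at construction time. A minimal sketch of what such a callable might look like (the function name and the `SignalSet` call are assumptions for illustration, not from the original source):

import numpy as np

def to_float32(batch):
    # Hypothetical converter: cast features before they are fed to a model.
    return np.asarray(batch, dtype=np.float32)

# signal_set = SignalSet(signals, converter=to_float32)  # hypothetical call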
Example #2
 def load_as_tframe_data(cls,
                         data_dir,
                         file_name=None,
                         permute=False,
                         permute_mark='alpha',
                         **kwargs):
     # Check file name
     if file_name is None:
         file_name = cls._get_file_name(permute, permute_mark) + '.tfds'
     data_path = os.path.join(data_dir, file_name)
     if os.path.exists(data_path): return SequenceSet.load(data_path)
     # If data does not exist, create a new data set
     console.show_status('Creating data ...')
     images, labels = MNIST.load_as_numpy_arrays(data_dir)
     # images (70000, 784, 1), np.float64
     images = images.reshape(images.shape[0], -1, 1) / 255.
     # permute images if necessary
     if permute:
         images = np.swapaxes(images, 0, 1)
         images = np.random.permutation(images)
         images = np.swapaxes(images, 0, 1)
     # labels (70000, 10), np.float64
     labels = convert_to_one_hot(labels, 10)
     # Wrap data into a Sequence Set
     features = [image for image in images]
     targets = [label for label in labels]
     data_set = SequenceSet(features,
                            summ_dict={'targets': targets},
                            n_to_one=True,
                            name='pMNIST')
     console.show_status('Saving data set ...')
     data_set.save(data_path)
     console.show_status('Data set saved to `{}`'.format(data_path))
     return data_set
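The swapaxes / permutation / swapaxes sequence above applies one fixed pixel permutation to every image, which is exactly what pMNIST requires. A self-contained sketch with toy data (shapes and values are assumptions for illustration) showing that the permutation is shared across images:

import numpy as np

# Toy batch: 3 "images" of 4 "pixels"; image i holds values 4*i .. 4*i + 3.
images = np.arange(12, dtype=float).reshape(3, 4, 1)   # (n_images, n_pixels, 1)
swapped = np.swapaxes(images, 0, 1)                    # (n_pixels, n_images, 1)
permuted = np.random.permutation(swapped)              # shuffles the pixel axis only
result = np.swapaxes(permuted, 0, 1)                   # back to (n_images, n_pixels, 1)
# The same reordering is applied to every image, so elementwise offsets survive:
assert np.allclose(result[1] - result[0], 4.0)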
Example #3
    def divide(cls, lob_set, k_list, first_name, second_name):
        assert isinstance(lob_set, SequenceSet) and lob_set.size <= 5
        if isinstance(k_list, int):
            k_list = [k_list] * lob_set.size
        first_features, second_features = [], []
        first_targets, second_targets = [], []
        # Separate each stock
        len_per_day_per_stock = lob_set[cls.LEN_PER_DAY_PER_STOCK]
        assert len(len_per_day_per_stock) == lob_set.size
        for stock, (k, lob, move) in enumerate(
                zip(k_list, lob_set.features, lob_set.targets)):
            lengths = len_per_day_per_stock[stock]
            L = sum(lengths[:k])
            if k != 0:
                first_features.append(lob[:L])
                first_targets.append(move[:L])
            if k != len(lengths):
                second_features.append(lob[L:])
                second_targets.append(move[L:])
        # Wrap data sets and return
        first_properties = {
            cls.LEN_PER_DAY_PER_STOCK:
            [s[:k] for k, s in zip(k_list, len_per_day_per_stock) if k != 0]
        }
        first_set = SequenceSet(first_features,
                                first_targets,
                                name=first_name,
                                **first_properties)
        second_properties = {
            cls.LEN_PER_DAY_PER_STOCK: [
                s[k:] for k, s in zip(k_list, len_per_day_per_stock)
                if k != len(s)
            ]
        }
        second_set = SequenceSet(second_features,
                                 second_targets,
                                 name=second_name,
                                 **second_properties)

        for seq_set in [first_set, second_set]:
            assert np.sum(seq_set.structure) == np.sum(
                np.concatenate(seq_set[cls.LEN_PER_DAY_PER_STOCK]))

        return first_set, second_set
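The split point in `divide` is a prefix sum over per-day lengths: the first `k` days of each stock go to the first set, the rest to the second. A worked example with hypothetical numbers:

# One stock traded on 4 days, split after k = 2 days.
lengths = [100, 120, 90, 110]                 # hypothetical per-day sequence lengths
k = 2
L = sum(lengths[:k])                          # 220 time steps -> first set (lob[:L])
assert L == 220 and sum(lengths) - L == 200   # remaining 200 -> second set (lob[L:])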
Example #4
  def load_as_tframe_data(cls, data_dir, train_size=1000, test_size=200,
                          file_name=None, unique_=True, cheat=True,
                          local_binary=True, multiple=1, rule=None):
    assert rule in ('lstm97', 'pau19', None)

    # Check file_name
    if file_name is None:
      file_name = cls._get_file_name(
        train_size, test_size, unique_, cheat, local_binary, multiple, rule)
    data_path = os.path.join(data_dir, file_name)
    if os.path.exists(data_path): return SequenceSet.load(data_path)
    # If data does not exist, create a new one
    console.show_status('Making data ...')

    if rule == 'pau19':
      erg_list = ReberGrammar.make_strings(
        train_size + test_size, True, embedded=True, multiple=multiple,
        verbose=True)
    elif rule == 'lstm97':
      train_list = ReberGrammar.make_strings(
        train_size, False, embedded=True, verbose=True, multiple=multiple)
      test_list = ReberGrammar.make_strings(
        test_size, False, embedded=True, exclusive=train_list, verbose=True,
        multiple=multiple)
      erg_list = train_list + test_list
    else:
      erg_list = ReberGrammar.make_strings(
        train_size + test_size, unique_, embedded=True, verbose=True,
        multiple=multiple)

    # Wrap erg into a DataSet
    features = [erg.one_hot for erg in erg_list]
    val_targets = [erg.local_binary if local_binary else erg.transfer_prob
                   for erg in erg_list]
    targets = ([erg.observed_prob for erg in erg_list]
               if not cheat else val_targets)
    # targets = [erg.transfer_prob for erg in erg_list]
    data_set = SequenceSet(
      features, targets, data_dict={'val_targets': val_targets},
      erg_list=tuple(erg_list), name='Embedded Reber Grammar')
    console.show_status('Saving data set ...')
    data_set.save(data_path)
    console.show_status('Data set saved to {}'.format(data_path))
    return data_set
Example #5
    def load_as_tframe_data(cls,
                            data_dir,
                            auction=False,
                            norm_type="zscore",
                            setup=None,
                            file_slices=None,
                            **kwargs):
        # Confirm type of normalization
        nt_lower = norm_type.lower()
        # 'Zscore' for directory names and 'ZScore' for file names
        if nt_lower in ["1", "zscore"]:
            type_id, norm_type = 1, "Zscore"
        elif nt_lower in ["2", "minmax"]:
            type_id, norm_type = 2, "MinMax"
        elif nt_lower in ["3", "decpre"]:
            type_id, norm_type = 3, "DecPre"
        else:
            raise KeyError(
                "Unknown type of normalization `{}`".format(norm_type))
        # Load directly if dataset exists
        data_path = cls._get_data_path(data_dir, auction, norm_type, setup)
        if os.path.exists(data_path):
            return SequenceSet.load(data_path)
        # If dataset does not exist, create from raw data
        console.show_status("Creating `{}` from raw data ...".format(
            os.path.basename(data_path)))
        # Load raw data
        features, targets = cls._load_raw_data(data_dir,
                                               auction,
                                               norm_type,
                                               type_id,
                                               file_slices=file_slices)

        # Wrap raw data into tframe Sequence set
        data_dict = {"raw_data": features}
        data_dict.update(targets)
        seq_set = SequenceSet(data_dict=data_dict, name=cls.DATA_NAME)
        # Save Sequence set
        seq_set.save(data_path)
        console.show_status("Sequence set saved to `{}`".format(data_path))
        # Return
        return seq_set
Example #6
 def extract_seq_set(cls, raw_set, horizon):
     assert isinstance(raw_set, SequenceSet)
     assert horizon in [10, 20, 30, 50, 100]
     seq_set = SequenceSet(
         features=[
             array[:, :40] for array in raw_set.data_dict['raw_data']
         ],
         targets=raw_set.data_dict[horizon],
         name=raw_set.name,
     )
     return seq_set
Example #7
 def _get_one_data_set(cls, size, N, T, var_x, noisy, var_y):
   features, targets = [], []
   for _ in range(size):
     number = np.random.choice([-1, 1])
     x, y = engine(number, N, T, var_x, noisy, var_y)
     features.append(x)
     targets.append(y)
   # Wrap data into a SequenceSet
   data_set = SequenceSet(
     features, summ_dict={'targets': targets}, n_to_one=True,
     name='Noisy Sequences' if noisy else 'Noise-free Sequences',
     N=N, T=T, var_x=var_x, noisy=noisy, var_y=var_y)
   return data_set
Example #8
    def load_as_tframe_data(cls, data_dir, num_words=10000, **kwargs):
        # Load directly if data set exists
        data_path = cls._get_data_path(data_dir, num_words)
        if os.path.exists(data_path): return SequenceSet.load(data_path)
        # If data does not exist, create from raw data
        console.show_status('Creating data sets ...')
        (train_data, train_labels), (test_data, test_labels) = \
            cls._load_raw_data(data_dir, num_words=num_words)
        data_list = list(train_data) + list(test_data)
        features = [np.array(cmt).reshape([-1, 1]) for cmt in data_list]

        targets = list(np.concatenate((train_labels, test_labels)))

        data_set = SequenceSet(features,
                               summ_dict={'targets': targets},
                               n_to_one=True,
                               name='IMDB')
        console.show_status('Saving data set ...')
        data_set.save(data_path)
        console.show_status('Data set saved to `{}`'.format(data_path))
        return data_set
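Each comment is a variable-length list of word indices; reshaping to `[-1, 1]` turns it into a (T, 1) sequence with one token per time step. A minimal sketch with a hypothetical comment:

import numpy as np

comment = [12, 7, 431, 9]                      # hypothetical word indices
feature = np.array(comment).reshape([-1, 1])   # one token per time step
assert feature.shape == (4, 1)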
Example #9
 def load_as_tframe_data(cls, data_dir, size=2560, file_name=None, N=3, T=100,
                         var_x=0.2, add_noise=False, var_y=0.1, prefix=''):
   # Check file_name
   if file_name is None:
     file_name = cls._get_file_name(size, N, T, var_x, add_noise, var_y)
     file_name = prefix + file_name + '.tfds'
   data_path = os.path.join(data_dir, file_name)
   if os.path.exists(data_path): return SequenceSet.load(data_path)
   # If data does not exist, create a new data set
   console.show_status('Creating data ...')
   data_set = cls._get_one_data_set(size, N, T, var_x, add_noise, var_y)
   console.show_status('Saving data set ...')
   data_set.save(data_path)
   console.show_status('Data set saved to `{}`'.format(data_path))
   return data_set
Example #10
    def load_as_tframe_data(cls,
                            data_dir,
                            file_name=None,
                            rgb=True,
                            permute=False,
                            permute_mark='alpha',
                            **kwargs):
        assert rgb and not permute  # only the RGB, non-permuted variant is supported
        # Check file name
        if file_name is None:
            file_name = cls._get_file_name(rgb, permute,
                                           permute_mark) + '.tfds'
        data_path = os.path.join(data_dir, file_name)
        if os.path.exists(data_path): return SequenceSet.load(data_path)

        # If data does not exist, create a new data set
        console.show_status('Creating data ...')
        images, labels = CIFAR10.load_as_numpy_arrays(data_dir)
        # images (60000, 32, 32, 3), np.float64
        images = images.reshape(60000, 32 * 32, 3 if rgb else 1) / 255.
        # permute images if necessary
        if permute: raise NotImplementedError

        # labels (60000, 10), np.int32
        labels = convert_to_one_hot(labels, 10)
        # Wrap data into a Sequence Set
        features = [image for image in images]
        targets = [label for label in labels]
        data_set = SequenceSet(features,
                               summ_dict={'targets': targets},
                               n_to_one=True,
                               name='sCIFAR10')
        console.show_status('Saving data set ...')
        data_set.save(data_path)
        console.show_status('Data set saved to `{}`'.format(data_path))
        return data_set
Example #11
 def _synthesize(cls, size, L, N, fixed_length, verbose=False):
     features, targets = [], []
     for i in range(size):
         x, y = engine(L, N, fixed_length)
         features.append(x)
         targets.append(y)
         if verbose:
             console.clear_line()
             console.print_progress(i + 1, size)
     # Wrap data into a SequenceSet
     data_set = SequenceSet(features,
                            summ_dict={'targets': targets},
                            n_to_one=True,
                            name='TemporalOrder')
     return data_set
Example #12
def get_balanced_seq_set(seq_set, sections=None, name='Balanced Set', M=None):
    assert isinstance(seq_set, SequenceSet)
    if sections is None:
        if M is None: M = min(seq_set.structure)
        sections = [int(np.ceil(s / M)) for s in seq_set.structure]
    assert isinstance(sections, list)
    features, targets = [], []
    for x, y, s in zip(seq_set.features, seq_set.targets, sections):
        if s == 1:
            features.append(x)
            targets.append(y)
            continue
        L = int(len(x) / s)
        indices = [(i + 1) * L for i in range(s - 1)]
        features += np.split(x, indices)
        targets += np.split(y, indices)
    balanced_set = SequenceSet(features, targets, name=name)
    assert len(balanced_set.structure) == sum(sections)
    assert sum(balanced_set.structure) == sum(seq_set.structure)
    return balanced_set
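The splitting arithmetic above cuts a sequence into `s` chunks of length `L = len(x) // s`, with the last chunk absorbing the remainder. A small demonstration of how the `np.split` indices are built:

import numpy as np

x = np.arange(10).reshape(10, 1)                # toy sequence of length 10
s = 3
L = int(len(x) / s)                             # chunk length 3
indices = [(i + 1) * L for i in range(s - 1)]   # split points [3, 6]
chunks = np.split(x, indices)                   # lengths 3, 3, 4
assert [len(c) for c in chunks] == [3, 3, 4]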
Example #13
 def load_as_tframe_data(cls,
                         data_dir,
                         size=1000,
                         L=100,
                         N=3,
                         fixed_length=True,
                         file_name=None,
                         prefix='',
                         **kwargs):
     # Check file_name
     if file_name is None:
         file_name = cls._get_file_name(size, L, N, fixed_length)
         file_name = prefix + file_name + '.tfds'
     data_path = os.path.join(data_dir, file_name)
     if os.path.exists(data_path): return SequenceSet.load(data_path)
     # If data does not exist, create a new data set
     console.show_status('Creating data ...')
     data_set = cls._synthesize(size, L, N, fixed_length, verbose=True)
     console.show_status('Saving data set ...')
     data_set.save(data_path)
     console.show_status('Data set saved to `{}`'.format(data_path))
     return data_set
Example #14
 def load_as_tframe_data(cls,
                         data_dir,
                         size=10000,
                         T=150,
                         fixed_length=True,
                         file_name=None,
                         prefix='',
                         **kwargs):
     """In IRNN15: `..., we noticed that both LSTMs and RNNs started to
     struggle when T is around 150.`"""
     # Check file_name
     if file_name is None:
         file_name = cls._get_file_name(size, T, fixed_length)
         file_name = prefix + file_name + '.tfds'
     data_path = os.path.join(data_dir, file_name)
     if os.path.exists(data_path): return SequenceSet.load(data_path)
     # If data does not exist, create a new data set
     console.show_status('Creating data ...')
     data_set = cls._synthesize(size, T, fixed_length, verbose=True)
     console.show_status('Saving data set ...')
     data_set.save(data_path)
     console.show_status('Data set saved to `{}`'.format(data_path))
     return data_set
Example #15
    def load_raw_LOBs(cls, data_dir, auction=False):
        # Load directly if dataset exists
        data_path = cls._get_data_path(data_dir, auction=auction)
        if os.path.exists(data_path): return SequenceSet.load(data_path)
        # Otherwise restore raw LOBs from decimal precision data
        dp_set = cls.load_as_tframe_data(
            data_dir, auction=auction, norm_type='decpre', setup=9,
            file_slices=(slice(8, 9), slice(8, 9)))
        # Extract first 40 dimensions in dp_set.data_dict['raw_data']
        dp_lob_list = [array[:, :40] for array in dp_set.data_dict['raw_data']]
        # Set parameters for restoration
        p_coef, v_coef = 10000, 100000
        coefs = np.array([p_coef, v_coef] * 20).reshape(1, 40)
        lob_list = [array * coefs for array in dp_lob_list]
        # Check targets
        cls._check_targets(data_dir, auction, dp_set.data_dict)
        # Check lob list
        cls._check_raw_lob(data_dir, auction, lob_list, raise_err=True)

        # Separate sequences for each stock
        # i  0 1 2 3 4 5 6 7
        # --------------------
        #    1 1 0 0 0 1 1 1        := x
        #      1 1 0 0 0 1 1 1
        # d  x 0 1 0 0 1 0 0 x      x[0:2], x[2:5], x[5:8]
        # --------------------
        # j    0 1 2 3 4 5 6
        #        *     *
        # |x[1:] - x[:-1]| reveals cliffs
        LOBs = [[] for _ in range(5)]
        horizons = [10, 20, 30, 50, 100]
        targets = {h: [[] for _ in range(5)] for h in horizons}
        for j, lobs in enumerate(lob_list):
            # Find cliff indices
            max_delta = 300 if auction else 200
            indices = cls._get_cliff_indices(lobs,
                                             auction,
                                             max_delta=max_delta)
            # Fill LOBs
            from_i = 0
            for stock in range(5):
                to_i = (indices[stock] + 1) if stock < 4 else len(lobs)
                slc = slice(from_i, to_i)
                LOBs[stock].append(lobs[slc])
                for h in horizons:
                    targets[h][stock].append(dp_set.data_dict[h][j][slc])
                if stock != 4: from_i = indices[stock] + 1
        # Generate new data_dict
        data_dict = {
            h: [np.concatenate(tgt_list) for tgt_list in tgt_lists]
            for h, tgt_lists in targets.items()
        }
        data_dict['raw_data'] = [np.concatenate(lb_list) for lb_list in LOBs]
        # Instantiate a new seq_set
        seq_set = SequenceSet(data_dict=data_dict,
                              name='FI-2010-LOBs',
                              **{
                                  cls.LEN_PER_DAY_PER_STOCK:
                                  cls._get_len_per_day_per_stock(
                                      data_dir, auction)
                              })
        # Sanity check (394337)
        assert sum(seq_set.structure) == sum(cls.DAY_LENGTH[auction])
        # Save and return
        seq_set.save(filename=data_path)
        console.show_status('{} saved to `{}`'.format(seq_set.name, data_path))
        return seq_set
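The cliff-detection idea sketched in the comments of `load_raw_LOBs` (large jumps in |x[1:] - x[:-1]| mark boundaries between stocks) can be illustrated on toy data; the series and the threshold below are assumptions for illustration only:

import numpy as np

x = np.array([1.0, 1.1, 5.0, 5.2, 5.1, 9.0, 9.3])   # toy 1-D "LOB" series
deltas = np.abs(x[1:] - x[:-1])                      # [0.1, 3.9, 0.2, 0.1, 3.9, 0.3]
cliffs = np.where(deltas > 1.0)[0]                   # cliff indices [1, 4]
segments = np.split(x, cliffs + 1)                   # x[:2], x[2:5], x[5:]
assert [len(s) for s in segments] == [2, 3, 2]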