def __init__(self, signals, responses=None, data_dict=None, summ_dict=None,
             n_to_one=False, name='signal_set1', converter=None, **kwargs):
  # Check signals
  signal_dict, fs = self._check_signals(signals, responses)
  data_dict = {} if data_dict is None else data_dict
  data_dict.update(signal_dict)
  kwargs.update({pedia.sampling_frequency: fs})
  # Call parent's constructor
  SequenceSet.__init__(self, data_dict=data_dict, summ_dict=summ_dict,
                       n_to_one=n_to_one, name=name, **kwargs)
  # Attributes
  if converter is not None: assert callable(converter)
  self.converter = converter
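# A hedged usage sketch for the constructor above. The `signals` / `responses`
# data and the behavior of `_check_signals` are not shown here, so the call
# below is illustrative only; the point is that `converter`, when given, must
# be callable and is stored on the instance for later use.
#
#   signal_set = SignalSet(signals, responses=responses, name='speech_set',
#                          converter=lambda signal: signal[::2])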
@classmethod
def load_as_tframe_data(cls, data_dir, file_name=None, permute=False,
                        permute_mark='alpha', **kwargs):
  # Check file name
  if file_name is None:
    file_name = cls._get_file_name(permute, permute_mark) + '.tfds'
  data_path = os.path.join(data_dir, file_name)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data does not exist, create a new data set
  console.show_status('Creating data ...')
  images, labels = MNIST.load_as_numpy_arrays(data_dir)
  # images (70000, 784, 1), np.float64
  images = images.reshape(images.shape[0], -1, 1) / 255.
  # Permute pixel order (the same reordering for every image) if necessary
  if permute:
    images = np.swapaxes(images, 0, 1)
    images = np.random.permutation(images)
    images = np.swapaxes(images, 0, 1)
  # labels (70000, 10), np.float64
  labels = convert_to_one_hot(labels, 10)

  # Wrap data into a SequenceSet
  features = [image for image in images]
  targets = [label for label in labels]
  data_set = SequenceSet(features, summ_dict={'targets': targets},
                         n_to_one=True, name='pMNIST')
  console.show_status('Saving data set ...')
  data_set.save(data_path)
  console.show_status('Data set saved to `{}`'.format(data_path))
  return data_set
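# Hedged side note (not part of the original module): a self-contained NumPy
# check of the permutation trick above. Swapping axes 0 and 1 makes the pixel
# axis leading, so np.random.permutation shuffles pixels while applying the
# same reordering to every image, which is what permuted MNIST requires.
def _demo_pixel_permutation():
  import numpy as np
  images = np.arange(10).reshape(2, 5, 1)  # 2 toy images with 5 `pixels`
  shuffled = np.swapaxes(np.random.permutation(np.swapaxes(images, 0, 1)), 0, 1)
  # Image 0 stores the permutation itself, so it can index image 1
  assert np.array_equal(shuffled[1, :, 0], images[1, shuffled[0, :, 0], 0])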
@classmethod
def divide(cls, lob_set, k_list, first_name, second_name):
  assert isinstance(lob_set, SequenceSet) and lob_set.size <= 5
  if isinstance(k_list, int): k_list = [k_list] * lob_set.size
  first_features, second_features = [], []
  first_targets, second_targets = [], []
  # Separate each stock
  len_per_day_per_stock = lob_set[cls.LEN_PER_DAY_PER_STOCK]
  assert len(len_per_day_per_stock) == lob_set.size
  for stock, (k, lob, move) in enumerate(
      zip(k_list, lob_set.features, lob_set.targets)):
    lengths = len_per_day_per_stock[stock]
    L = sum(lengths[:k])
    if k != 0:
      first_features.append(lob[:L])
      first_targets.append(move[:L])
    if k != len(lengths):
      second_features.append(lob[L:])
      second_targets.append(move[L:])
  # Wrap data sets and return
  first_properties = {
    cls.LEN_PER_DAY_PER_STOCK: [
      s[:k] for k, s in zip(k_list, len_per_day_per_stock) if k != 0]}
  first_set = SequenceSet(first_features, first_targets, name=first_name,
                          **first_properties)
  second_properties = {
    cls.LEN_PER_DAY_PER_STOCK: [
      s[k:] for k, s in zip(k_list, len_per_day_per_stock) if k != len(s)]}
  second_set = SequenceSet(second_features, second_targets, name=second_name,
                           **second_properties)
  for seq_set in [first_set, second_set]:
    assert np.sum(seq_set.structure) == np.sum(
      np.concatenate(seq_set[cls.LEN_PER_DAY_PER_STOCK]))
  return first_set, second_set
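# Hedged illustration (names are ad hoc, not part of the API above): how `k`
# splits one stock's sequence by trading days in `divide`. With per-day
# lengths [3, 4, 2] and k=2, the first L = 3 + 4 = 7 ticks form the first
# part and the remaining 2 ticks the second, mirroring the slicing above.
def _demo_divide_one_stock():
  import numpy as np
  lengths, k = [3, 4, 2], 2
  lob = np.arange(sum(lengths)).reshape(-1, 1)  # 9 ticks, 1 feature
  L = sum(lengths[:k])
  first, second = lob[:L], lob[L:]
  assert len(first) == 7 and len(second) == 2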
@classmethod
def load_as_tframe_data(cls, data_dir, train_size=1000, test_size=200,
                        file_name=None, unique_=True, cheat=True,
                        local_binary=True, multiple=1, rule=None):
  assert rule in ('lstm97', 'pau19', None)
  # Check file_name
  if file_name is None:
    file_name = cls._get_file_name(
      train_size, test_size, unique_, cheat, local_binary, multiple, rule)
  data_path = os.path.join(data_dir, file_name)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data does not exist, create a new one
  console.show_status('Making data ...')
  if rule == 'pau19':
    erg_list = ReberGrammar.make_strings(
      train_size + test_size, True, embedded=True, multiple=multiple,
      verbose=True)
  elif rule == 'lstm97':
    train_list = ReberGrammar.make_strings(
      train_size, False, embedded=True, verbose=True, multiple=multiple)
    test_list = ReberGrammar.make_strings(
      test_size, False, embedded=True, exclusive=train_list, verbose=True,
      multiple=multiple)
    erg_list = train_list + test_list
  else:
    erg_list = ReberGrammar.make_strings(
      train_size + test_size, unique_, embedded=True, verbose=True,
      multiple=multiple)

  # Wrap erg_list into a SequenceSet
  features = [erg.one_hot for erg in erg_list]
  val_targets = [erg.local_binary if local_binary else erg.transfer_prob
                 for erg in erg_list]
  targets = ([erg.observed_prob for erg in erg_list]
             if not cheat else val_targets)
  data_set = SequenceSet(
    features, targets, data_dict={'val_targets': val_targets},
    erg_list=tuple(erg_list), name='Embedded Reber Grammar')
  console.show_status('Saving data set ...')
  data_set.save(data_path)
  console.show_status('Data set saved to `{}`'.format(data_path))
  return data_set
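# Note on `rule`, inferred from the branches above: 'lstm97' draws train and
# test strings separately, with the test list built to exclude strings that
# appear in the train list; 'pau19' draws a single pool of
# train_size + test_size strings with the uniqueness flag set; None falls
# back to the caller's `unique_` flag. A hedged usage sketch (the class name
# `ERG` and the directory are illustrative assumptions):
#
#   data_set = ERG.load_as_tframe_data('data/erg', train_size=256,
#                                      test_size=256, rule='lstm97')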
@classmethod
def load_as_tframe_data(cls, data_dir, auction=False, norm_type="zscore",
                        setup=None, file_slices=None, **kwargs):
  # Confirm type of normalization
  nt_lower = norm_type.lower()
  # 'Zscore' for directory names and 'ZScore' for file names
  if nt_lower in ["1", "zscore"]: type_id, norm_type = 1, "Zscore"
  elif nt_lower in ["2", "minmax"]: type_id, norm_type = 2, "MinMax"
  elif nt_lower in ["3", "decpre"]: type_id, norm_type = 3, "DecPre"
  else: raise KeyError(
    "Unknown type of normalization `{}`".format(norm_type))

  # Load directly if data set exists
  data_path = cls._get_data_path(data_dir, auction, norm_type, setup)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data set does not exist, create it from raw data
  console.show_status("Creating `{}` from raw data ...".format(
    os.path.basename(data_path)))
  # Load raw data
  features, targets = cls._load_raw_data(
    data_dir, auction, norm_type, type_id, file_slices=file_slices)
  # Wrap raw data into a tframe SequenceSet
  data_dict = {"raw_data": features}
  data_dict.update(targets)
  seq_set = SequenceSet(data_dict=data_dict, name=cls.DATA_NAME)
  # Save SequenceSet and return
  seq_set.save(data_path)
  console.show_status("Sequence set saved to `{}`".format(data_path))
  return seq_set
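# A hedged usage sketch for the loader above. All of '1', 'zscore' and
# 'ZScore' select the same z-score-normalized set; the class name `FI2010`
# and the data directory are assumptions, not part of the code above:
#
#   seq_set = FI2010.load_as_tframe_data('data/fi2010', auction=False,
#                                        norm_type='zscore', setup=9)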
@classmethod
def extract_seq_set(cls, raw_set, horizon):
  assert isinstance(raw_set, SequenceSet) and horizon in [10, 20, 30, 50, 100]
  seq_set = SequenceSet(
    features=[array[:, :40] for array in raw_set.data_dict['raw_data']],
    targets=raw_set.data_dict[horizon], name=raw_set.name)
  return seq_set
@classmethod
def _get_one_data_set(cls, size, N, T, var_x, noisy, var_y):
  features, targets = [], []
  for _ in range(size):
    number = np.random.choice([-1, 1])
    x, y = engine(number, N, T, var_x, noisy, var_y)
    features.append(x)
    targets.append(y)
  # Wrap data into a SequenceSet
  data_set = SequenceSet(
    features, summ_dict={'targets': targets}, n_to_one=True,
    name='Noisy Sequences' if noisy else 'Noise-free Sequences',
    N=N, T=T, var_x=var_x, noisy=noisy, var_y=var_y)
  return data_set
@classmethod
def load_as_tframe_data(cls, data_dir, num_words=10000, **kwargs):
  # Load directly if data set exists
  data_path = cls._get_data_path(data_dir, num_words)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data does not exist, create from raw data
  console.show_status('Creating data sets ...')
  (train_data, train_labels), (test_data, test_labels) = cls._load_raw_data(
    data_dir, num_words=num_words)
  data_list = list(train_data) + list(test_data)
  features = [np.array(cmt).reshape([-1, 1]) for cmt in data_list]
  targets = list(np.concatenate((train_labels, test_labels)))
  data_set = SequenceSet(features, summ_dict={'targets': targets},
                         n_to_one=True, name='IMDB')
  console.show_status('Saving data set ...')
  data_set.save(data_path)
  console.show_status('Data set saved to `{}`'.format(data_path))
  return data_set
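# Hedged illustration (standalone, names ad hoc): each raw IMDB comment is a
# variable-length list of word indices, reshaped above into a column vector
# so that every sequence step carries exactly one token.
def _demo_comment_reshape():
  import numpy as np
  cmt = [12, 7, 3052]                    # a toy word-index list
  arr = np.array(cmt).reshape([-1, 1])
  assert arr.shape == (3, 1)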
@classmethod
def load_as_tframe_data(cls, data_dir, size=2560, file_name=None, N=3, T=100,
                        var_x=0.2, add_noise=False, var_y=0.1, prefix=''):
  # Check file_name
  if file_name is None:
    file_name = cls._get_file_name(size, N, T, var_x, add_noise, var_y)
    file_name = prefix + file_name + '.tfds'
  data_path = os.path.join(data_dir, file_name)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data does not exist, create a new data set
  console.show_status('Creating data ...')
  data_set = cls._get_one_data_set(size, N, T, var_x, add_noise, var_y)
  console.show_status('Saving data set ...')
  data_set.save(data_path)
  console.show_status('Data set saved to `{}`'.format(data_path))
  return data_set
@classmethod
def load_as_tframe_data(cls, data_dir, file_name=None, rgb=True,
                        permute=False, permute_mark='alpha', **kwargs):
  assert rgb and not permute
  # Check file name
  if file_name is None:
    file_name = cls._get_file_name(rgb, permute, permute_mark) + '.tfds'
  data_path = os.path.join(data_dir, file_name)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data does not exist, create a new data set
  console.show_status('Creating data ...')
  images, labels = CIFAR10.load_as_numpy_arrays(data_dir)
  # images (60000, 32, 32, 3), np.float64
  images = images.reshape(60000, 32 * 32, 3 if rgb else 1) / 255.
  # Permute pixels if necessary (unreachable given the assertion above)
  if permute: raise NotImplementedError
  # labels (60000, 10), np.int32
  labels = convert_to_one_hot(labels, 10)

  # Wrap data into a SequenceSet
  features = [image for image in images]
  targets = [label for label in labels]
  data_set = SequenceSet(features, summ_dict={'targets': targets},
                         n_to_one=True, name='sCIFAR10')
  console.show_status('Saving data set ...')
  data_set.save(data_path)
  console.show_status('Data set saved to `{}`'.format(data_path))
  return data_set
@classmethod
def _synthesize(cls, size, L, N, fixed_length, verbose=False):
  features, targets = [], []
  for i in range(size):
    x, y = engine(L, N, fixed_length)
    features.append(x)
    targets.append(y)
    if verbose:
      console.clear_line()
      console.print_progress(i + 1, size)
  # Wrap data into a SequenceSet
  data_set = SequenceSet(features, summ_dict={'targets': targets},
                         n_to_one=True, name='TemporalOrder')
  return data_set
def get_balanced_seq_set(seq_set, sections=None, name='Balanced Set', M=None):
  assert isinstance(seq_set, SequenceSet)
  if sections is None:
    if M is None: M = min(seq_set.structure)
    sections = [int(np.ceil(s / M)) for s in seq_set.structure]
  assert isinstance(sections, list)
  features, targets = [], []
  for x, y, s in zip(seq_set.features, seq_set.targets, sections):
    if s == 1:
      features.append(x)
      targets.append(y)
      continue
    L = int(len(x) / s)
    indices = [(i + 1) * L for i in range(s - 1)]
    features += np.split(x, indices)
    targets += np.split(y, indices)
  balanced_set = SequenceSet(features, targets, name=name)
  assert len(balanced_set.structure) == sum(sections)
  assert sum(balanced_set.structure) == sum(seq_set.structure)
  return balanced_set
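# Hedged illustration of the splitting scheme above: a length-10 sequence
# with s=3 sections is cut at indices [3, 6], yielding chunks of lengths
# 3, 3 and 4, so no time step is lost.
def _demo_balanced_split():
  import numpy as np
  x, s = np.arange(10), 3
  L = int(len(x) / s)
  indices = [(i + 1) * L for i in range(s - 1)]
  chunks = np.split(x, indices)
  assert [len(c) for c in chunks] == [3, 3, 4]
  assert sum(len(c) for c in chunks) == len(x)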
@classmethod
def load_as_tframe_data(cls, data_dir, size=1000, L=100, N=3,
                        fixed_length=True, file_name=None, prefix='',
                        **kwargs):
  # Check file_name
  if file_name is None:
    file_name = cls._get_file_name(size, L, N, fixed_length)
    file_name = prefix + file_name + '.tfds'
  data_path = os.path.join(data_dir, file_name)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data does not exist, create a new data set
  console.show_status('Creating data ...')
  data_set = cls._synthesize(size, L, N, fixed_length, verbose=True)
  console.show_status('Saving data set ...')
  data_set.save(data_path)
  console.show_status('Data set saved to `{}`'.format(data_path))
  return data_set
@classmethod
def load_as_tframe_data(cls, data_dir, size=10000, T=150, fixed_length=True,
                        file_name=None, prefix='', **kwargs):
  """In IRNN15: `..., we noticed that both LSTMs and RNNs started to
  struggle when T is around 150.`"""
  # Check file_name
  if file_name is None:
    file_name = cls._get_file_name(size, T, fixed_length)
    file_name = prefix + file_name + '.tfds'
  data_path = os.path.join(data_dir, file_name)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data does not exist, create a new data set
  console.show_status('Creating data ...')
  data_set = cls._synthesize(size, T, fixed_length, verbose=True)
  console.show_status('Saving data set ...')
  data_set.save(data_path)
  console.show_status('Data set saved to `{}`'.format(data_path))
  return data_set
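# Design note: every `load_as_tframe_data` above follows the same
# cache-or-create pattern, restated in the hedged sketch below
# (`create_fn` is an ad-hoc stand-in for the per-dataset synthesis step):
def _load_or_create(data_path, create_fn):
  import os
  # Load from disk when the .tfds file exists, otherwise build and save
  if os.path.exists(data_path): return SequenceSet.load(data_path)
  data_set = create_fn()
  data_set.save(data_path)
  return data_set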
@classmethod
def load_raw_LOBs(cls, data_dir, auction=False):
  # Load directly if data set exists
  data_path = cls._get_data_path(data_dir, auction=auction)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # Otherwise restore raw LOBs from the decimal-precision data
  dp_set = cls.load_as_tframe_data(
    data_dir, auction=auction, norm_type='decpre', setup=9,
    file_slices=(slice(8, 9), slice(8, 9)))
  # Extract the first 40 dimensions of dp_set.raw_data
  dp_lob_list = [array[:, :40] for array in dp_set.data_dict['raw_data']]
  # Set coefficients for restoration (price and volume columns alternate)
  p_coef, v_coef = 10000, 100000
  coefs = np.array([p_coef, v_coef] * 20).reshape(1, 40)
  lob_list = [array * coefs for array in dp_lob_list]
  # Check targets
  cls._check_targets(data_dir, auction, dp_set.data_dict)
  # Check lob list
  cls._check_raw_lob(data_dir, auction, lob_list, raise_err=True)

  # Separate sequences for each stock
  # i      0 1 2 3 4 5 6 7
  #      --------------------
  #        1 1 0 0 0 1 1 1      := x
  #        1 1 0 0 0 1 1 1
  # d    x 0 1 0 0 1 0 0 x      x[0:2], x[2:5], x[5:8]
  #      --------------------
  # j      0 1 2 3 4 5 6
  #          *     *
  # |x[1:] - x[:-1]| reveals cliffs
  LOBs = [[] for _ in range(5)]
  horizons = [10, 20, 30, 50, 100]
  targets = {h: [[] for _ in range(5)] for h in horizons}
  for j, lobs in enumerate(lob_list):
    # Find cliff indices
    max_delta = 300 if auction else 200
    indices = cls._get_cliff_indices(lobs, auction, max_delta=max_delta)
    # Fill LOBs
    from_i = 0
    for stock in range(5):
      to_i = (indices[stock] + 1) if stock < 4 else len(lobs)
      slc = slice(from_i, to_i)
      LOBs[stock].append(lobs[slc])
      for h in horizons: targets[h][stock].append(dp_set.data_dict[h][j][slc])
      if stock != 4: from_i = indices[stock] + 1

  # Generate new data_dict
  data_dict = {h: [np.concatenate(tgt_list) for tgt_list in tgt_lists]
               for h, tgt_lists in targets.items()}
  data_dict['raw_data'] = [np.concatenate(lb_list) for lb_list in LOBs]
  # Initiate a new seq_set
  seq_set = SequenceSet(data_dict=data_dict, name='FI-2010-LOBs', **{
    cls.LEN_PER_DAY_PER_STOCK:
      cls._get_len_per_day_per_stock(data_dir, auction)})
  # Sanity check (394337 ticks in total)
  assert sum(seq_set.structure) == sum(cls.DAY_LENGTH[auction])
  # Save and return
  seq_set.save(filename=data_path)
  console.show_status('{} saved to `{}`'.format(seq_set.name, data_path))
  return seq_set
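# Hedged illustration of the cliff detection sketched in the comment above:
# spikes in |x[1:] - x[:-1]| mark the boundaries between stocks, and each
# segment runs up to (and including) the index just after a cliff.
def _demo_cliff_indices():
  import numpy as np
  x = np.array([1., 1., 0., 0., 0., 1., 1., 1.])
  d = np.abs(x[1:] - x[:-1])
  cliffs = np.argwhere(d > 0.5).ravel()
  assert list(cliffs) == [1, 4]          # the j-positions marked `*` above
  # The per-stock segments are then x[0:2], x[2:5], x[5:8]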