# NOTE: the classmethods below are excerpts from separate data-set classes;
# they share the following imports (module paths follow the typical tframe
# layout; MNIST, CIFAR10, ReberGrammar and convert_to_one_hot come from
# their respective data/task modules):
import os

import numpy as np

from tframe import console
from tframe.data.sequences.seq_set import SequenceSet


@classmethod
def load_as_tframe_data(cls, data_dir, file_name=None, permute=False,
                        permute_mark='alpha', **kwargs):
  # Check file name
  if file_name is None:
    file_name = cls._get_file_name(permute, permute_mark) + '.tfds'
  data_path = os.path.join(data_dir, file_name)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data does not exist, create a new data set
  console.show_status('Creating data ...')
  images, labels = MNIST.load_as_numpy_arrays(data_dir)

  # Flatten each image into a pixel sequence and scale to [0, 1];
  # images: (70000, 784, 1), np.float64
  images = images.reshape(images.shape[0], -1, 1) / 255.

  # Permute pixels if necessary: np.random.permutation only shuffles along
  # axis 0, so the pixel axis is swapped in first, which applies the same
  # pixel permutation to every image
  if permute:
    images = np.swapaxes(images, 0, 1)
    images = np.random.permutation(images)
    images = np.swapaxes(images, 0, 1)

  # labels: (70000, 10), np.float64
  labels = convert_to_one_hot(labels, 10)

  # Wrap data into a SequenceSet
  features = [image for image in images]
  targets = [label for label in labels]
  data_set = SequenceSet(features, summ_dict={'targets': targets},
                         n_to_one=True, name='pMNIST')
  console.show_status('Saving data set ...')
  data_set.save(data_path)
  console.show_status('Data set saved to `{}`'.format(data_path))
  return data_set
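
# A minimal sketch (not from the source) of the permutation trick used
# above: np.random.permutation only shuffles along axis 0, so swapping the
# batch and pixel axes first applies the SAME pixel permutation to every
# image. All names and shapes below are illustrative.
def _demo_pixel_permutation():
  batch = np.arange(12).reshape(3, 4, 1)      # 3 'images' of 4 'pixels'
  swapped = np.swapaxes(batch, 0, 1)          # (4, 3, 1): pixels on axis 0
  shuffled = np.random.permutation(swapped)   # shuffle pixel order once
  return np.swapaxes(shuffled, 0, 1)          # (3, 4, 1): every image gets
                                              # the same new pixel order
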
@classmethod
def load_as_tframe_data(cls, data_dir, train_size=1000, test_size=200,
                        file_name=None, unique_=True, cheat=True,
                        local_binary=True, multiple=1, rule=None):
  assert rule in ('lstm97', 'pau19', None)
  # Check file name
  if file_name is None:
    file_name = cls._get_file_name(
      train_size, test_size, unique_, cheat, local_binary, multiple, rule)
  data_path = os.path.join(data_dir, file_name)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data does not exist, create a new one
  console.show_status('Making data ...')
  if rule == 'pau19':
    erg_list = ReberGrammar.make_strings(
      train_size + test_size, True, embedded=True, multiple=multiple,
      verbose=True)
  elif rule == 'lstm97':
    # Generate test strings guaranteed not to appear in train_list
    train_list = ReberGrammar.make_strings(
      train_size, False, embedded=True, verbose=True, multiple=multiple)
    test_list = ReberGrammar.make_strings(
      test_size, False, embedded=True, exclusive=train_list, verbose=True,
      multiple=multiple)
    erg_list = train_list + test_list
  else:
    erg_list = ReberGrammar.make_strings(
      train_size + test_size, unique_, embedded=True, verbose=True,
      multiple=multiple)

  # Wrap erg_list into a SequenceSet
  features = [erg.one_hot for erg in erg_list]
  val_targets = [erg.local_binary if local_binary else erg.transfer_prob
                 for erg in erg_list]
  targets = ([erg.observed_prob for erg in erg_list]
             if not cheat else val_targets)
  data_set = SequenceSet(
    features, targets, data_dict={'val_targets': val_targets},
    erg_list=tuple(erg_list), name='Embedded Reber Grammar')
  console.show_status('Saving data set ...')
  data_set.save(data_path)
  console.show_status('Data set saved to `{}`'.format(data_path))
  return data_set
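
# Usage sketch (hedged): `ERG` stands for whatever class hosts the loader
# above. Under rule='lstm97' (presumably the LSTM'97 setup) the test strings
# are generated with exclusive=train_list, i.e. disjoint from the training
# strings; rule='pau19' appears to draw one pool of unique strings instead.
#
#   data_set = ERG.load_as_tframe_data(
#     'data/erg', train_size=256, test_size=256, rule='lstm97')
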
@classmethod
def load_as_tframe_data(cls, data_dir, auction=False, norm_type='zscore',
                        setup=None, file_slices=None, **kwargs):
  # Confirm type of normalization
  nt_lower = norm_type.lower()
  # 'Zscore' is used in directory names and 'ZScore' in file names
  if nt_lower in ['1', 'zscore']: type_id, norm_type = 1, 'Zscore'
  elif nt_lower in ['2', 'minmax']: type_id, norm_type = 2, 'MinMax'
  elif nt_lower in ['3', 'decpre']: type_id, norm_type = 3, 'DecPre'
  else: raise KeyError('Unknown type of normalization `{}`'.format(norm_type))

  # Load directly if data set exists
  data_path = cls._get_data_path(data_dir, auction, norm_type, setup)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data set does not exist, create it from raw data
  console.show_status('Creating `{}` from raw data ...'.format(
    os.path.basename(data_path)))

  # Load raw data
  features, targets = cls._load_raw_data(
    data_dir, auction, norm_type, type_id, file_slices=file_slices)

  # Wrap raw data into a tframe SequenceSet
  data_dict = {'raw_data': features}
  data_dict.update(targets)
  seq_set = SequenceSet(data_dict=data_dict, name=cls.DATA_NAME)

  # Save sequence set and return
  seq_set.save(data_path)
  console.show_status('Sequence set saved to `{}`'.format(data_path))
  return seq_set
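
# Usage sketch (hedged): `FI2010` is a hypothetical name for the hosting
# class, and setup=9 is just an example value (it is the one used by
# load_raw_LOBs below). Any of '1'/'zscore', '2'/'minmax', '3'/'decpre'
# selects the corresponding normalization.
#
#   seq_set = FI2010.load_as_tframe_data(
#     'data/fi2010', auction=False, norm_type='zscore', setup=9)
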
@classmethod
def load_as_tframe_data(cls, data_dir, num_words=10000, **kwargs):
  # Load directly if data set exists
  data_path = cls._get_data_path(data_dir, num_words)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data does not exist, create from raw data
  console.show_status('Creating data sets ...')
  (train_data, train_labels), (test_data, test_labels) = cls._load_raw_data(
    data_dir, num_words=num_words)
  data_list = list(train_data) + list(test_data)
  # Reshape each comment into a word-index sequence of shape (seq_len, 1)
  features = [np.array(cmt).reshape([-1, 1]) for cmt in data_list]
  targets = list(np.concatenate((train_labels, test_labels)))
  data_set = SequenceSet(features, summ_dict={'targets': targets},
                         n_to_one=True, name='IMDB')
  console.show_status('Saving data set ...')
  data_set.save(data_path)
  console.show_status('Data set saved to `{}`'.format(data_path))
  return data_set
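
# Usage sketch (hedged): `IMDB` is assumed to be the name of the hosting
# class. Each feature is a word-index sequence of shape (seq_len, 1) and
# each target a single sentiment label, hence n_to_one=True.
#
#   data_set = IMDB.load_as_tframe_data('data/imdb', num_words=10000)
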
@classmethod
def load_as_tframe_data(cls, data_dir, file_name=None, rgb=True,
                        permute=False, permute_mark='alpha', **kwargs):
  # Only the non-permuted RGB variant is currently supported
  assert rgb and not permute
  # Check file name
  if file_name is None:
    file_name = cls._get_file_name(rgb, permute, permute_mark) + '.tfds'
  data_path = os.path.join(data_dir, file_name)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # If data does not exist, create a new data set
  console.show_status('Creating data ...')
  images, labels = CIFAR10.load_as_numpy_arrays(data_dir)

  # Flatten images (60000, 32, 32, 3), np.float64, into pixel sequences
  # and scale to [0, 1]
  images = images.reshape(60000, 32 * 32, 3 if rgb else 1) / 255.

  # Permute pixels if necessary
  if permute: raise NotImplementedError

  # labels: (60000, 10), np.int32
  labels = convert_to_one_hot(labels, 10)

  # Wrap data into a SequenceSet
  features = [image for image in images]
  targets = [label for label in labels]
  data_set = SequenceSet(features, summ_dict={'targets': targets},
                         n_to_one=True, name='sCIFAR10')
  console.show_status('Saving data set ...')
  data_set.save(data_path)
  console.show_status('Data set saved to `{}`'.format(data_path))
  return data_set
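
# Usage sketch (hedged): `sCIFAR10` is a hypothetical name for the hosting
# class. Each feature is a sequence of 1024 RGB pixels, shape (1024, 3).
#
#   data_set = sCIFAR10.load_as_tframe_data('data/cifar10')
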
@classmethod
def load_raw_LOBs(cls, data_dir, auction=False):
  # Load directly if data set exists
  data_path = cls._get_data_path(data_dir, auction=auction)
  if os.path.exists(data_path): return SequenceSet.load(data_path)

  # Otherwise restore raw LOBs from the decimal-precision (DecPre) data
  dp_set = cls.load_as_tframe_data(
    data_dir, auction=auction, norm_type='decpre', setup=9,
    file_slices=(slice(8, 9), slice(8, 9)))

  # Extract the first 40 dimensions in dp_set.raw_data
  dp_lob_list = [array[:, :40] for array in dp_set.data_dict['raw_data']]

  # Undo the DecPre scaling; prices and volumes alternate along the last axis
  p_coef, v_coef = 10000, 100000
  coefs = np.array([p_coef, v_coef] * 20).reshape(1, 40)
  lob_list = [array * coefs for array in dp_lob_list]

  # Check targets
  cls._check_targets(data_dir, auction, dp_set.data_dict)
  # Check LOB list
  cls._check_raw_lob(data_dir, auction, lob_list, raise_err=True)

  # Separate the sequence of each day into 5 per-stock sequences;
  # |x[1:] - x[:-1]| reveals the cliffs between stocks, e.g.
  #   i:  0 1 2 3 4 5 6 7
  #   x:  1 1 0 0 0 1 1 1
  #   d:    0 1 0 0 1 0 0      d[j] = |x[j+1] - x[j]|
  #   j:    0 1 2 3 4 5 6
  #           *     *          cliffs at j = 1, 4 => x[0:2], x[2:5], x[5:8]
  LOBs = [[] for _ in range(5)]
  horizons = [10, 20, 30, 50, 100]
  targets = {h: [[] for _ in range(5)] for h in horizons}
  for j, lobs in enumerate(lob_list):
    # Find cliff indices
    max_delta = 300 if auction else 200
    indices = cls._get_cliff_indices(lobs, auction, max_delta=max_delta)
    # Fill LOBs
    from_i = 0
    for stock in range(5):
      to_i = (indices[stock] + 1) if stock < 4 else len(lobs)
      slc = slice(from_i, to_i)
      LOBs[stock].append(lobs[slc])
      for h in horizons:
        targets[h][stock].append(dp_set.data_dict[h][j][slc])
      if stock != 4: from_i = indices[stock] + 1

  # Generate new data_dict
  data_dict = {h: [np.concatenate(tgt_list) for tgt_list in tgt_lists]
               for h, tgt_lists in targets.items()}
  data_dict['raw_data'] = [np.concatenate(lb_list) for lb_list in LOBs]

  # Initiate a new seq_set
  seq_set = SequenceSet(data_dict=data_dict, name='FI-2010-LOBs', **{
    cls.LEN_PER_DAY_PER_STOCK:
      cls._get_len_per_day_per_stock(data_dir, auction)})

  # Sanity check (394337)
  assert sum(seq_set.structure) == sum(cls.DAY_LENGTH[auction])

  # Save and return
  seq_set.save(filename=data_path)
  console.show_status('{} saved to `{}`'.format(seq_set.name, data_path))
  return seq_set
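
# A minimal sketch (not from the source) of the cliff detection illustrated
# above; the real `_get_cliff_indices` also thresholds jumps against
# `max_delta`, while this toy version splits wherever consecutive values
# differ.
def _demo_cliff_split():
  x = np.array([1, 1, 0, 0, 0, 1, 1, 1], dtype=float)
  d = np.abs(x[1:] - x[:-1])           # nonzero exactly at the cliffs
  cliffs = np.where(d > 0)[0]          # -> array([1, 4])
  return np.split(x, cliffs + 1)       # -> [x[0:2], x[2:5], x[5:8]]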