def train_rnn_noncluster(file_name,
                         hidden_size,
                         n_layers=1,
                         bidirectional=False,
                         classifier=False,
                         idx_label=None,
                         n_epochs_max=2000,
                         train_ratio=0.8,
                         batch_size=128,
                         n_workers=4,
                         root_dir=ROOT_DIR,
                         lr=0.001,
                         betas=(0.9, 0.999)):
    '''
    NOTE: to be deprecated. only the classifier is supported; requesting a
    regressor (classifier=False, which is the default) raises NotImplementedError

    trains the recurrent neural network classifier given a file that contains data.
    this data can be either scat transformed or pure simulated data

    inputs
    ------
    file_name: string type name of file. the extension is dropped and '.pt' is used instead.
        a name containing 'scat' is treated as scat transformed data
    hidden_size: int, size of the hidden state
    n_layers: number of recurrent layers
    bidirectional: if True, becomes a bidirectional LSTM
    classifier: boolean indicating whether it's a classifier or regressor. must be True
    idx_label: must be None. kept in the signature for the regressor that is not implemented yet
    n_epochs_max: maximum number of epochs to run. can terminate with ctrl + c to move on to
        next neural network training.
    train_ratio: float indicating ratio for training data. should be between 0 and 1
    batch_size: size of batch for computing gradient
    n_workers: how many subprocesses to use for data loading. 0 means that the data will be
        loaded in the main process.
    root_dir: string type root directory name
    lr - float type learning rate
    betas - tuple of floats indicating betas arguments in Adam optimizer

    outputs
    -------
    None: meta data is initialised through _init_meta() and training is delegated to
        _train_rnn_noncluster(), which receive the meta file name and root directory

    raises
    ------
    NotImplementedError: if classifier is False
    AssertionError: if idx_label is given or hidden_size is not an int
    '''
    # validate before touching the (potentially large) data file
    if not classifier:
        raise NotImplementedError(
            "Training regressor for non-cluster version has not been implemented yet"
        )
    assert (
        idx_label is None
    ), "Invalid idx_label input: should not be given for training classifier"
    assert (isinstance(
        hidden_size,
        int)), "Invalid format of hidden_size given. Should be type int"

    file_name, _ = os.path.splitext(file_name)
    file_path = os.path.join(root_dir, file_name + '.pt')
    transformed = 'scat' in file_name
    samples = torch.load(file_path)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # next free running index among the existing meta files of this data file
    # (presumably match_filename returns the captured index strings - verify in cu)
    nums = cu.match_filename(r'{}_meta_rnn_([0-9]+).pt'.format(file_name),
                             root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0
    file_name_meta = '{}_meta_rnn_{}.pt'.format(file_name, idx)

    # data shape: (n_param_1, n_param_2,..., n_param_N, n_samples_total, n_channels, (n_nodes), data_len)
    data, labels = samples['data'], samples['labels']
    # the number of dimensions that do not correspond to the batch dimension is 4 if scat transformed.
    # Otherwise, it's 3
    n_none_param_dims = 4 if transformed else 3
    n_samples_total = data.shape[-n_none_param_dims]
    n_data_total = np.prod(data.shape[:-(n_none_param_dims - 1)])

    index = _train_test_split(n_data_total, train_ratio)
    # the split is stored under 'val' instead of 'test'
    index['val'] = index.pop('test')

    # reshape data. output is shaped (n_data_total, n_channels * (n_scat_nodes), data_len).
    # (n_scat_nodes) means 1 if data not transformed
    data = np.reshape(data, (n_data_total, -1, data.shape[-1]))
    input_size = data.shape[-2]

    # every combination of the label values is one class
    labels = np.array(list(product(*labels)),
                      dtype='float32')  # shaped (n_conditions, n_labels)
    label_to_idx = {
        tuple(condition): idx_condition
        for idx_condition, condition in enumerate(labels)
    }
    n_conditions = len(label_to_idx)

    # initialize meta data and save it to a file
    meta = {
        'file_name': file_name_meta,
        'root_dir': root_dir,
        'input_size': input_size,
        'hidden_size': hidden_size,
        'n_layers': n_layers,
        'bidirectional': bidirectional,
        'classifier': classifier,
        'n_epochs_max': n_epochs_max,
        'train_ratio': train_ratio,
        'batch_size': batch_size,
        'n_workers': n_workers,
        'index': index,
        'device': device,
        'labels': samples['labels'],
        'label_names': samples['label_names'],
        'epoch': [],
        'weights': None,
        'elapsed': [],
        'loss': {
            'train': [],
            'val': []
        },
        'criterion': 'cross_entropy_mean',
        'label_to_idx': label_to_idx
    }
    _init_meta(**meta)

    labels = np.arange(n_conditions)  # shaped (n_conditions,)
    # shaped (n_conditions * n_samples_total,)
    # which, for example, looks like [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4]
    # for n_samples_total being 3 and n_conditions being 5
    labels = np.repeat(labels, n_samples_total)
    dataset = TimeSeriesDataset(data, labels, transform=ToTensor())

    # train the neural network for classification
    print("Beginning training of {}:".format(', '.join(
        samples['label_names'])))
    _train_rnn_noncluster(dataset,
                          index,
                          hidden_size=hidden_size,
                          n_layers=n_layers,
                          bidirectional=bidirectional,
                          classifier=classifier,
                          n_epochs_max=n_epochs_max,
                          batch_size=batch_size,
                          n_workers=n_workers,
                          device=device,
                          file_name=file_name_meta,
                          root_dir=root_dir,
                          lr=lr,
                          betas=betas)
def train_rnn(file_name,
              hidden_size,
              n_layers=1,
              bidirectional=False,
              classifier=False,
              idx_label=None,
              n_epochs_max=2000,
              train_ratio=0.8,
              batch_size=128,
              n_workers=4,
              root_dir=ROOT_DIR,
              lr=0.001,
              betas=(0.9, 0.999),
              opt_level="O0",
              seed=42,
              log_interval=10):
    '''
    trains the recurrent neural network given a file that contains data.
    this data can be either scat transformed or pure simulated data

    inputs
    ------
    file_name: string type name of file. the extension is dropped and '.pt' is used instead
    hidden_size: int, size of the hidden state
    n_layers: number of recurrent layers
    bidirectional: if True, becomes a bidirectional LSTM
    classifier: boolean indicating whether it's a classifier or regressor.
    idx_label - int representing which label (entry of 'labels' / 'label_names' in the file)
        to train the regressor for. required when classifier is False, ignored otherwise
    n_epochs_max: maximum number of epochs to run. can terminate with ctrl + c to move on to
        next neural network training.
    train_ratio: float indicating ratio for training data. should be between 0 and 1
    batch_size: size of batch for computing gradient
    n_workers: how many subprocesses to use for data loading. 0 means that the data will be
        loaded in the main process.
    root_dir: string type root directory name
    lr - float type learning rate
    betas - tuple of floats indicating betas arguments in Adam optimizer
    opt_level - optimization level
    seed - random seed
    log_interval - how many batches to wait before logging training status

    outputs
    -------
    None: meta data is initialised through _init_meta() and training is delegated to
        _train_rnn(), which receive the meta file name and root directory

    raises
    ------
    AssertionError: if hidden_size is not an int, or if classifier is False and idx_label
        is not an integer
    '''
    # NOTE: regression means you train on data whose parameters are sampled continuously and test also for data whose parameters are sampled continuously, whereas
    # classifier means you train on data on the grid and test on the grid.
    # TODO: pass the dataset as an argument to _train_rnn() not with the index but the dataset being a dictionary with keys 'train' and 'val'

    # validated on every process (not only the root one) so that no process
    # walks into the collective broadcast below while another has already aborted
    assert (isinstance(
        hidden_size,
        int)), "Invalid format of hidden_size given. Should be type int"
    if not classifier:
        assert (
            isinstance(idx_label, (int, np.integer))
        ), "Invalid idx_label input: int type idx_label required for training regressor"

    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    # NOTE(review): this is the local rank while the broadcast below uses global
    # root_rank=0 - presumably equivalent on a single node; confirm for multi-node runs
    root_process = hvd.local_rank() == 0

    file_name, _ = os.path.splitext(file_name)
    file_path = os.path.join(root_dir, file_name + '.pt')
    samples = torch.load(file_path)

    # shape of data: (n_data_total, n_channels, (n_nodes), data_len)
    data, labels, label_names = samples['data'], samples['labels'], samples[
        'label_names']
    n_data_total = len(data)

    if classifier:
        label, label_name = None, None
        # FIXME: the '.' in front of 'pt' is an unescaped regex wildcard
        meta_regex = r'{}_meta_rnn_([0-9]+).pt'.format(file_name)
    else:
        label = labels[idx_label]
        label_name = label_names[idx_label]
        meta_regex = r'{}_meta_rnn_([0-9]+)_{}.pt'.format(
            file_name, label_name)

    # the root process picks the next free running index and shares it with the others
    idx = 0
    if root_process:
        nums = cu.match_filename(meta_regex, root_dir=root_dir)
        nums = [int(num) for num in nums]
        idx = max(nums) + 1 if nums else 0
    idx = hvd.broadcast(torch.tensor(idx),
                        root_rank=0,
                        name='idx_file_meta').item()
    if classifier:
        file_name_meta = '{}_meta_rnn_{}.pt'.format(file_name, idx)
    else:
        file_name_meta = '{}_meta_rnn_{}_{}.pt'.format(file_name, idx,
                                                       label_name)

    index = _train_test_split(n_data_total, train_ratio)
    # the split is stored under 'val' instead of 'test'
    index['val'] = index.pop('test')

    # reshape data. output is shaped (n_data_total, n_channels * (n_scat_nodes), data_len).
    # (n_scat_nodes) means 1 if data not transformed
    if isinstance(data, np.ndarray):
        data = np.reshape(data, (n_data_total, -1, data.shape[-1]))
    elif isinstance(data, list):
        # list entries are reshaped one by one, each keeping its own last dimension
        data = [
            np.reshape(data_slice, (-1, data_slice.shape[-1]))
            for data_slice in data
        ]
    else:
        raise ValueError("Invalid type of data given")
    input_size = data[0].shape[0]

    # initialize meta data and save it to a file
    meta = {
        'file_name': file_name_meta,
        'root_dir': root_dir,
        'input_size': input_size,
        'hidden_size': hidden_size,
        'n_layers': n_layers,
        'bidirectional': bidirectional,
        'classifier': classifier,
        'n_epochs_max': n_epochs_max,
        'train_ratio': train_ratio,
        'batch_size': batch_size,
        'n_workers': n_workers,
        'index': index,
        'epoch': [],
        'weights': None,
        'elapsed': [],
        'loss': {
            'train': [],
            'val': []
        },
        'criterion': 'cross_entropy_mean' if classifier else 'rmse',
        'labels': labels if classifier else label,
        'label_names': label_names if classifier else label_name
    }
    if classifier and 'labels_lut' in samples:
        meta.update({'labels_lut': samples['labels_lut']})

    # done for all processes to ensure data gets loaded after initializing file
    _init_meta(**meta)

    # the classifier trains on all labels, the regressor on the selected one only
    targets = labels if classifier else label
    dataset = TimeSeriesDataset(data, targets, transform=ToTensor())

    if root_process:
        if classifier:
            print("Training classifier for {}:".format(
                ', '.join(label_names)))
        else:
            print("Training regressor for {}:".format(label_name))

    _train_rnn(dataset,
               index,
               hidden_size=hidden_size,
               n_layers=n_layers,
               bidirectional=bidirectional,
               classifier=classifier,
               n_epochs_max=n_epochs_max,
               batch_size=batch_size,
               n_workers=n_workers,
               file_name=file_name_meta,
               root_dir=root_dir,
               lr=lr,
               betas=betas,
               opt_level=opt_level,
               seed=seed,
               log_interval=log_interval)
'(tbd_0_meta_rnn_[0-9]+_diff_coef_ratios.pt)' ] idx_file_start = 0 # None or 0 to start from beginning idx_file_end = 5 # None for going to end epoch_len = 200 # only consider files that went through 2000 epochs #plt.style.use('dark_background') fontsize_label = 14 fontsize_title = 18 fig_w = 12 fig_h = 8 file_names = [] for file_name_regex in file_name_regexs: file_names += cu.match_filename(file_name_regex, root_dir) file_paths = [os.path.join(root_dir, file_name) for file_name in file_names] file_paths_tmp = [] plt.close('all') for file_path in file_paths: meta = torch.load(file_path) if len(meta['epoch']) == epoch_len: file_paths_tmp.append(file_path) file_paths = file_paths_tmp[idx_file_start:idx_file_end] n_files = len(file_paths) figs = [] axs = [] for file_path in file_paths:
def train_nn(file_name,
             n_nodes_hidden,
             classifier=False,
             n_epochs_max=2000,
             train_ratio=0.8,
             batch_size=128,
             n_workers=4,
             root_dir=ROOT_DIR,
             lr=0.001,
             betas=(0.9, 0.999)):
    '''
    trains the neural network given a file that contains data.
    this data can be either scat transformed or pure simulated data
    NOTE: requires refactoring to run on cluster

    inputs
    ------
    file_name: string type name of file. the extension is dropped and '.pt' is used instead.
        a name containing 'scat' is treated as scat transformed data
    n_nodes_hidden: list type, where values are nodes (list of nodes) in the hidden layers for
        classification (regression). For regression of multiple labels, the number of lists
        should match with the number of labels to predict. For regression of a single label
        a flat list is accepted and wrapped automatically
    classifier: boolean indicating whether it's a classifier or regressor.
    n_epochs_max: int, maximum number of epochs to run. can terminate with ctrl + c to move on
        to next neural network training.
    train_ratio: float indicating ratio for training data. should be between 0 and 1
    batch_size: size of batch for computing gradient
    n_workers: how many subprocesses to use for data loading. 0 means that the data will be
        loaded in the main process.
    root_dir: string type root directory name
    lr - float type learning rate
    betas - tuple of floats indicating betas arguments in Adam optimizer

    outputs
    -------
    None: meta data is initialised through _init_meta() and training is delegated to
        _train_nn(), which receive the meta file name and root directory

    raises
    ------
    AssertionError: if n_nodes_hidden does not have the format described above
    '''
    # NOTE(review): lr and betas are accepted but not forwarded to _train_nn() -
    # confirm whether _train_nn() takes them before wiring them through
    file_name, _ = os.path.splitext(file_name)
    file_path = os.path.join(root_dir, file_name + '.pt')
    transformed = 'scat' in file_name
    samples = torch.load(file_path)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # next free running index among the existing meta files of this data file
    nums = cu.match_filename(r'{}_meta_nn_([0-9]+).pt'.format(file_name),
                             root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0
    file_name_meta = '{}_meta_nn_{}.pt'.format(file_name, idx)

    # data shape: (n_param_1, n_param_2,..., n_param_N, n_samples_total, n_channels, (n_nodes), data_len)
    data, labels, label_names = samples['data'], samples['labels'], samples[
        'label_names']
    # the number of dimensions that do not correspond to the batch dimension is 4 if scat transformed.
    # Otherwise, it's 3
    n_none_param_dims = 4 if transformed else 3
    n_samples_total = data.shape[-n_none_param_dims]
    n_data_total = np.prod(data.shape[:-(n_none_param_dims - 1)])
    n_labels = len(label_names)  # number of labels to predict

    assert (isinstance(
        n_nodes_hidden,
        list)), "Invalid format of nodes given. Should be type list"
    if not classifier:
        # a flat (possibly empty) list is shorthand for the single label case
        if n_labels == 1 and not (n_nodes_hidden
                                  and isinstance(n_nodes_hidden[0], list)):
            n_nodes_hidden = [n_nodes_hidden]
        assert (
            len(n_nodes_hidden) == n_labels
        ), "Invalid format of nodes given. Should be n_labels number of lists"
        assert(all([isinstance(n_nodes_hidden_label, list) for n_nodes_hidden_label in n_nodes_hidden])),\
            "Invalid format of nodes given. Should provide list of {} lists".format(n_labels)

    index = _train_test_split(n_data_total, train_ratio)
    # the split is stored under 'val' instead of 'test'
    index['val'] = index.pop('test')

    # reshape data. output is shaped (n_data_total, n_channels * (n_scat_nodes) * data_len).
    # (n_scat_nodes) means 1 if data not transformed
    data = np.reshape(data, (n_data_total, -1))

    # initialize meta data and save it to a file.
    # 'n_nodes' depends on the kind of network and is added in the branches below
    meta = {
        'file_name': file_name_meta,
        'root_dir': root_dir,
        'classifier': classifier,
        'n_epochs_max': n_epochs_max,
        'train_ratio': train_ratio,
        'batch_size': batch_size,
        'n_workers': n_workers,
        'index': index,
        'device': device,
        'labels': samples['labels'],
        'label_names': samples['label_names']
    }

    labels = np.array(list(product(*labels)),
                      dtype='float32')  # shaped (n_conditions, n_labels)
    if classifier:
        # every combination of the label values is one class
        label_to_idx = {
            tuple(condition): idx_condition
            for idx_condition, condition in enumerate(labels)
        }
        n_conditions = len(label_to_idx)
        # layer sizes: flattened input, hidden layers, one output per class
        n_nodes = [data.shape[-1]] + n_nodes_hidden + [n_conditions]
        meta.update({
            'epoch': [],
            'weights': None,
            'elapsed': [],
            'loss': {
                'train': [],
                'val': []
            },
            'criterion': 'cross_entropy_mean',
            'label_to_idx': label_to_idx,
            'n_nodes': n_nodes
        })
        _init_meta(**meta)

        labels = np.arange(n_conditions)  # shaped (n_conditions,)
        # shaped (n_conditions * n_samples_total,)
        # which, for example, looks like [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4]
        # for n_samples_total being 3 and n_conditions being 5
        labels = np.repeat(labels, n_samples_total)
        dataset = TimeSeriesDataset(data, labels, transform=ToTensor())

        # train the neural network for classification
        print("Beginning training of {}:".format(', '.join(
            samples['label_names'])))
        _train_nn(dataset,
                  index,
                  n_nodes_hidden=n_nodes_hidden,
                  classifier=classifier,
                  n_epochs_max=n_epochs_max,
                  batch_size=batch_size,
                  device=device,
                  n_workers=n_workers,
                  file_name=file_name_meta,
                  root_dir=root_dir)
    else:
        # one network per label: flattened input, hidden layers, single output
        n_nodes = [[data.shape[-1]] + n_nodes_hidden_label + [1]
                   for n_nodes_hidden_label in n_nodes_hidden]
        meta.update({
            'epoch': [[] for _ in range(n_labels)],
            'weights': [None for _ in range(n_labels)],
            'elapsed': [[] for _ in range(n_labels)],
            'loss': [{
                'train': [],
                'val': []
            } for _ in range(n_labels)],
            'criterion': 'rmse',
            'n_nodes': n_nodes
        })
        _init_meta(**meta)

        # following is shaped (n_labels, n_conditions)
        labels = labels.swapaxes(0, 1)
        # following is shaped (n_labels, n_data_total)
        labels = np.tile(labels[:, :, np.newaxis],
                         [1, 1, n_samples_total]).reshape(
                             [n_labels, n_data_total])
        for idx_label in range(n_labels):
            dataset = TimeSeriesDataset(data,
                                        labels[idx_label],
                                        transform=ToTensor())
            # train the neural network for the given idx_label
            print("Beginning training of {}:".format(
                samples['label_names'][idx_label]))
            _train_nn(dataset,
                      index,
                      n_nodes_hidden=n_nodes_hidden[idx_label],
                      classifier=classifier,
                      n_epochs_max=n_epochs_max,
                      batch_size=batch_size,
                      device=device,
                      n_workers=n_workers,
                      idx_label=idx_label,
                      file_name=file_name_meta,
                      root_dir=root_dir)
def sim_two_beads(data_len,
                  gammas,
                  k_ratios,
                  diff_coef_ratios,
                  dt,
                  n_data=1,
                  n_steps_initial=10000,
                  save_file=False,
                  root_dir=ROOT_DIR,
                  dtype='float32'):
    '''
    returns ensemble of two bead simulation trajectories, simulated for every
    combination of the given gammas, k_ratios and diff_coef_ratios values.

    inputs:
    -------
    - data_len: int, length of each process
    - gammas: numeric or list-like, drag coefficient values
    - k_ratios: numeric or list-like, ratios of spring constants
    - diff_coef_ratios: numeric or list-like, ratios of diffusion coefficients
    - dt: float, time step between data points
    - n_data: int, number of processes in ensemble for each combination of parameters
    - n_steps_initial: number of steps to take in Langevin equation for simulating initial positions
    - save_file: boolean, whether or not to save the file. If True, file name is returned.
        Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (samples): dict, returned if save_file is False. its key-value pairs are the following:
        'data': ndarray shaped (n_gammas * n_k_ratios * n_diff_coef_ratios * n_data, 2, data_len)
            which is an ensemble of two beads trajectories. the 2nd dimension is for the number of channels.
        'labels': ndarray whose values are indices into 'labels_lut', one per trajectory
        'label_names': list whose elements are string type, ['gammas', 'k_ratios', 'diff_coef_ratios']
        'labels_lut': list of (gamma, k_ratio, diff_coef_ratio) tuples, indexed by the values in 'labels'
        'dt': float type dt
        'n_steps_initial': int type n_steps_initial
    - (file_name): string type file name of the simulated data ('tbd_<index>.pt' inside root_dir).
        returned if save_file is True

    FIXME: check the code to see if the dimensions are not mixed up
    '''
    # atleast_1d turns any scalar (python or numpy) into a length 1 array
    gammas = np.atleast_1d(np.array(gammas, dtype=dtype))
    k_ratios = np.atleast_1d(np.array(k_ratios, dtype=dtype))
    diff_coef_ratios = np.atleast_1d(np.array(diff_coef_ratios, dtype=dtype))

    n_gammas = len(gammas)
    n_k_ratios = len(k_ratios)
    n_diff_coef_ratios = len(diff_coef_ratios)

    # warn before allocating when the output is going to be large
    file_size_est = data_len * n_gammas * n_diff_coef_ratios * n_k_ratios * n_data * 2 * np.dtype(
        dtype).itemsize
    file_size_est_gb = file_size_est / 1.e9
    if file_size_est_gb > 2.:
        warnings.warn("Generating file with size roughly {:.2f} GB".format(
            file_size_est_gb),
                      category=BytesWarning)

    processes = np.empty((n_gammas, n_k_ratios, n_diff_coef_ratios, n_data, 2,
                          data_len),
                         dtype=dtype)
    for idx0, gamma in enumerate(gammas):
        for idx1, k in enumerate(k_ratios):
            for idx2, diff_coef in enumerate(diff_coef_ratios):
                force_matrix = np.array([[-(1 + k), k], [k, -(1 + k)]])
                diffusion_matrix = np.array([[diff_coef, 0], [0, 1]])
                prefactor1 = force_matrix * dt
                prefactor2 = np.sqrt(2 * diffusion_matrix * dt)

                # burn-in: run n_steps_initial steps from the origin to get the initial positions
                rand_nums = np.random.normal(0, 1,
                                             [n_steps_initial, 2, n_data])
                x = np.zeros((2, n_data))
                for idx in range(n_steps_initial):
                    x = x + np.matmul(prefactor1, x) + np.matmul(
                        prefactor2, rand_nums[idx])
                processes[idx0, idx1, idx2, :, :, 0] = x.T

                # the recorded trajectory continues from the initial positions that
                # were simulated with this very set of parameters
                rand_nums = np.random.normal(0, 1, [data_len - 1, 2, n_data])
                for idx in range(data_len - 1):
                    x = x + np.matmul(prefactor1, x) + np.matmul(
                        prefactor2, rand_nums[idx])
                    processes[idx0, idx1, idx2, :, :, idx + 1] = x.T
        # all trajectories of this gamma are scaled by 1 / gamma
        processes[idx0] = processes[idx0] / gamma

    # reshape data
    n_data_total = n_gammas * n_k_ratios * n_diff_coef_ratios * n_data
    processes = np.reshape(
        processes,
        (n_data_total, 2, data_len))  # shaped (n_data_total, 2, data_len)

    # reshape labels
    labels = [gammas, k_ratios, diff_coef_ratios]
    labels = np.array(list(product(*labels)),
                      dtype=dtype)  # shaped (n_conditions, n_labels)
    labels_lut = [tuple(condition) for condition in labels]
    n_conditions = len(labels_lut)
    labels = np.arange(n_conditions)  # shaped (n_conditions,)
    labels = np.repeat(labels, n_data)  # shaped (n_conditions * n_data,)

    samples = {
        'data': processes,
        'labels': labels,
        'label_names': ['gammas', 'k_ratios', 'diff_coef_ratios'],
        'dt': dt,
        'n_steps_initial': n_steps_initial,
        'labels_lut': labels_lut
    }

    if not save_file:
        return samples

    # save under the next free running index
    nums = cu.match_filename(r'tbd_([0-9]+).pt', root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0
    file_name = 'tbd_{}.pt'.format(idx)
    file_path = os.path.join(root_dir, file_name)
    torch.save(samples, file_path)
    return file_name
def sim_two_beads_sample(data_len,
                         gammas,
                         k_ratios,
                         diff_coef_ratios,
                         dt,
                         n_data=1,
                         n_steps_initial=10000,
                         save_file=False,
                         root_dir=ROOT_DIR,
                         dtype='float32'):
    '''
    returns ensemble of two bead simulation trajectories whose parameters are
    drawn uniformly at random from the given ranges, one draw per trajectory.

    inputs:
    -------
    - data_len: int, length of each process
    - gammas: numeric or length 2 list-like representing low, high values of the drag coefficient values
    - k_ratios: numeric or length 2 list-like representing low, high values of the ratios of spring constants
    - diff_coef_ratios: numeric or length 2 list-like representing low, high values of the ratios of diffusion coefficients
    - dt: float, time step between data points
    - n_data: int, number of processes in ensemble
    - n_steps_initial: number of steps to take in Langevin equation for simulating initial positions
    - save_file: boolean, whether or not to save the file. If True, file name is returned.
        Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (samples): dict, returned if save_file is False. its key-value pairs are the following:
        'data': ndarray shaped (n_data, 2, data_len) which is an ensemble of two beads trajectories.
            the 2nd dimension is for the number of channels.
        'labels': list of three ndarrays, each shaped (n_data,), holding the sampled
            gammas, k_ratios, diff_coef_ratios values
        'label_names': list whose elements are string type, ['gammas', 'k_ratios', 'diff_coef_ratios']
        'dt': float type dt
        'n_steps_initial': int type n_steps_initial
    - (file_name): string type file name of the simulated data ('tbd_<index>.pt' inside root_dir).
        returned if save_file is True

    FIXME: check the code to see if actual simulation part is not mixed up with initial condition simulation
    '''
    # a plain number stands for the degenerate range [value, value]
    ranges = []
    for name, bounds in (('gammas', gammas), ('k_ratios', k_ratios),
                         ('diff_coef_ratios', diff_coef_ratios)):
        if isinstance(bounds, (int, float)):
            bounds = np.array([bounds, bounds], dtype=dtype)
        ranges.append((name, bounds))

    for name, bounds in ranges:
        assert (
            len(bounds) == 2
        ), "Invalid {} given: should be numeric or length 2 list-like format".format(
            name)

    (gamma_low, gamma_high), (k_ratio_low, k_ratio_high), (
        diff_coef_ratio_low,
        diff_coef_ratio_high) = [bounds for _, bounds in ranges]

    # one uniform draw per trajectory and parameter
    gamma_samples = (gamma_high - gamma_low) * np.random.random(
        n_data, ) + gamma_low
    k_ratio_samples = (k_ratio_high - k_ratio_low) * np.random.random(
        n_data, ) + k_ratio_low
    diff_coef_ratio_samples = (diff_coef_ratio_high - diff_coef_ratio_low
                               ) * np.random.random(n_data, ) + diff_coef_ratio_low

    # each row holds the (gamma, k_ratio, diff_coef_ratio) of one trajectory
    param_rows = np.stack(
        [gamma_samples, k_ratio_samples, diff_coef_ratio_samples], axis=1)
    trajectories = [
        sim_two_beads(data_len,
                      gammas=row[0],
                      k_ratios=row[1],
                      diff_coef_ratios=row[2],
                      dt=dt,
                      n_data=1,
                      n_steps_initial=n_steps_initial,
                      save_file=False,
                      dtype=dtype)['data'] for row in param_rows
    ]
    processes = np.concatenate(trajectories,
                               axis=0)  # shaped (n_data, 2, data_len)

    samples = {
        'data': processes,
        'labels': [gamma_samples, k_ratio_samples, diff_coef_ratio_samples],
        'label_names': ['gammas', 'k_ratios', 'diff_coef_ratios'],
        'dt': dt,
        'n_steps_initial': n_steps_initial
    }

    if save_file:
        # save under the next free running index
        taken = [
            int(num)
            for num in cu.match_filename(r'tbd_([0-9]+).pt', root_dir=root_dir)
        ]
        idx = max(taken) + 1 if taken else 0
        file_name = 'tbd_{}.pt'.format(idx)
        torch.save(samples, os.path.join(root_dir, file_name))
        return file_name

    return samples
def sim_poisson_sample(data_len,
                       lams,
                       dt,
                       n_data=1,
                       save_file=False,
                       root_dir=ROOT_DIR,
                       dtype='float32'):
    '''
    returns ensemble of poisson processes whose lambda values are drawn
    uniformly at random from the given range, one draw per process.

    inputs:
    -------
    - data_len: int, length of each process
    - lams: numeric or length 2 list-like representing low, high values of expectation per interval value
    - dt: time step between data points
    - n_data: int, number of processes in ensemble
    - save_file: boolean, whether or not to save the file. If True, file name is returned.
        Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (samples): dict, returned if save_file is False. its key-value pairs are the following:
        'data': ndarray which is the 'data' of the individual sim_poisson() calls concatenated
            along the first axis (expected shape (n_data, 1, data_len), the singleton dimension
            being the number of channels - confirm against sim_poisson)
        'labels': list holding one ndarray shaped (n_data,) with the sampled lams values
        'label_names': list whose elements are string type, ['lams']
        'dt': float type dt
    - (file_name): string type file name of the simulated data ('pos_<index>.pt' inside root_dir).
        returned if save_file is True

    REVIEW: confirm this method of using fixed time step generates identical statistics to that of Gillespie algorithm
    '''
    # a plain number stands for the degenerate range [value, value]
    if isinstance(lams, (int, float)):
        lams = np.array([lams, lams], dtype=dtype)
    assert (
        len(lams) == 2
    ), "Invalid lams given: should be numeric or length 2 list-like format"

    lam_low, lam_high = lams
    # one uniform draw per process
    lam_samples = (lam_high - lam_low) * np.random.random(n_data, ) + lam_low

    trajectories = [
        sim_poisson(data_len,
                    lams=lam_sample,
                    dt=dt,
                    n_data=1,
                    save_file=False,
                    dtype=dtype)['data'] for lam_sample in lam_samples
    ]
    processes = np.concatenate(trajectories, axis=0)

    samples = {
        'data': processes,
        'labels': [lam_samples],
        'label_names': ['lams'],
        'dt': dt
    }

    if save_file:
        # save under the next free running index
        taken = [
            int(num)
            for num in cu.match_filename(r'pos_([0-9]+).pt', root_dir=root_dir)
        ]
        idx = max(taken) + 1 if taken else 0
        file_name = 'pos_{}.pt'.format(idx)
        torch.save(samples, os.path.join(root_dir, file_name))
        return file_name

    return samples
def sim_one_bead(data_len,
                 ks,
                 diff_coefs,
                 dt,
                 n_data=1,
                 n_steps_initial=10000,
                 save_file=False,
                 root_dir=ROOT_DIR,
                 dtype='float32'):
    '''
    returns ensemble of one bead simulation trajectories, simulated for every
    combination of the given ks and diff_coefs values. as there is only one
    heat bath, this is a passive trajectory

    inputs:
    -------
    - data_len: int, length of each process
    - ks: numeric or list or ndarray, spring constant
    - diff_coefs: numeric or list or ndarray, diffusion coefficient
    - dt: time step between data points
    - n_data: number of processes in ensemble for each combination of parameters
    - n_steps_initial: number of steps to take in Langevin equation for simulating initial positions
    - save_file: boolean, whether or not to save the file. If True, file name is returned.
        Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (samples): dict, returned if save_file is False. its key-value pairs are the following:
        'data': ndarray shaped (n_ks * n_diff_coefs * n_data, 1, data_len) which is an ensemble
            of one bead trajectories. the singleton dimension is for the number of channels.
        'labels': ndarray whose values are indices into 'labels_lut', one per trajectory
        'label_names': list whose elements are string type, ['ks', 'diff_coefs']
        'labels_lut': list of (k, diff_coef) tuples, indexed by the values in 'labels'
        'dt': float type dt
        'n_steps_initial': int type n_steps_initial
    - (file_name): string type file name of the simulated data ('obd_<index>.pt' inside root_dir).
        returned if save_file is True

    FIXME: check the code to see if actual simulation part is not mixed up with initial condition simulation
    '''
    # atleast_1d turns any scalar (python or numpy) into a length 1 array
    ks = np.atleast_1d(np.array(ks, dtype=dtype))
    diff_coefs = np.atleast_1d(np.array(diff_coefs, dtype=dtype))

    n_ks = len(ks)
    n_diff_coefs = len(diff_coefs)

    # warn before allocating when the output is going to be large
    file_size_est = data_len * n_diff_coefs * n_ks * n_data * np.dtype(
        dtype).itemsize
    file_size_est_gb = file_size_est / 1.e9
    if file_size_est_gb > 2.:
        warnings.warn("Generating file with size roughly {:.2f} GB".format(
            file_size_est_gb),
                      category=BytesWarning)

    processes = np.empty((n_ks, n_diff_coefs, n_data, data_len), dtype=dtype)

    # first pass: run n_steps_initial steps from the origin to get the initial positions
    for idx0, k in enumerate(ks):
        for idx1, diff_coef in enumerate(diff_coefs):
            prefactor1 = k * dt
            prefactor2 = np.sqrt(2 * diff_coef * dt)
            rand_nums = np.random.normal(0, 1, [n_steps_initial, n_data])
            x0 = np.zeros(n_data)
            for idx in range(n_steps_initial):
                x0 = x0 - prefactor1 * x0 + prefactor2 * rand_nums[idx]
            processes[idx0, idx1, :, 0] = x0

    # second pass: continue every trajectory from its own stored initial positions
    for idx0, k in enumerate(ks):
        for idx1, diff_coef in enumerate(diff_coefs):
            x = processes[idx0, idx1, :, 0]
            prefactor1 = k * dt
            prefactor2 = np.sqrt(2 * diff_coef * dt)
            rand_nums = np.random.normal(0, 1, [data_len - 1, n_data])
            for idx in range(data_len - 1):
                x = x - prefactor1 * x + prefactor2 * rand_nums[idx]
                processes[idx0, idx1, :, idx + 1] = x

    # add the channel dimension
    processes = np.expand_dims(processes, axis=-2)

    # reshape data
    n_data_total = n_ks * n_diff_coefs * n_data
    processes = np.reshape(
        processes,
        (n_data_total, 1, data_len))  # shaped (n_data_total, 1, data_len)

    # reshape labels
    labels = [ks, diff_coefs]
    labels = np.array(list(product(*labels)),
                      dtype=dtype)  # shaped (n_conditions, n_labels)
    labels_lut = [tuple(condition) for condition in labels]
    n_conditions = len(labels_lut)
    labels = np.arange(n_conditions)  # shaped (n_conditions,)
    labels = np.repeat(labels, n_data)  # shaped (n_conditions * n_data,)

    samples = {
        'data': processes,
        'labels': labels,
        'label_names': ['ks', 'diff_coefs'],
        'dt': dt,
        'n_steps_initial': n_steps_initial,
        'labels_lut': labels_lut
    }

    if not save_file:
        return samples

    # save under the next free running index
    nums = cu.match_filename(r'obd_([0-9]+).pt', root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0
    file_name = 'obd_{}.pt'.format(idx)
    file_path = os.path.join(root_dir, file_name)
    torch.save(samples, file_path)
    return file_name
def sim_brownian(data_len,
                 diff_coefs,
                 dt,
                 n_data=1,
                 save_file=False,
                 root_dir=ROOT_DIR,
                 dtype='float32'):
    '''
    returns ensemble of brownian trajectories, one group per diffusion
    coefficient

    inputs:
    -------
    - data_len: int, length of each process
    - diff_coefs: numeric (python or numpy scalar) or list or ndarray,
        diffusion coefficient(s)
    - dt: float, time step between data points
    - n_data: int, number of processes simulated per diffusion coefficient
    - save_file: boolean, whether or not to save the file. If True, file name
        is returned. Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (samples): dict whose key-value pairs are the following.
        returned if save_file is False
        'data': ndarray shaped (n_diff_coefs * n_data, 1, data_len), ordered
            so that all trajectories of the first diffusion coefficient come
            first. the singleton dimension is for the number of channels.
        'labels': ndarray shaped (n_diff_coefs * n_data,) of int indices
            into 'labels_lut'
        'label_names': list whose elements are string type, ['diff_coefs']
        'labels_lut': list of 1-tuples; entry i holds the diffusion
            coefficient that label index i stands for
        'dt': float type dt
    - (file_name): string type file name of the simulated data.
        returned if save_file is True
    '''
    # atleast_1d lets python scalars, numpy scalars and sequences all follow
    # the same code path
    diff_coefs = np.atleast_1d(np.array(diff_coefs, dtype=dtype))
    n_diff_coefs = len(diff_coefs)

    file_size_est = data_len * n_diff_coefs * n_data * np.dtype(
        dtype).itemsize
    file_size_est_gb = file_size_est / 1.e9
    if file_size_est_gb > 2.:
        # the keyword is lowercase `category`; `Category=` raises TypeError
        # instead of emitting the warning. stacklevel=2 points at the caller.
        warnings.warn("Generating file with size roughly {:.2f} GB".format(
            file_size_est_gb), category=BytesWarning, stacklevel=2)

    concat_list = []
    for diff_coef in diff_coefs:
        # order of the two random draws is kept as-is so that seeded runs
        # stay reproducible
        increments = np.sqrt(2 * diff_coef * dt) * np.random.normal(
            0, 1, [n_data, data_len - 1])
        # starting position of every trajectory is a standard normal draw
        x0 = np.random.normal(0, 1, [n_data, 1])
        increments = np.concatenate([x0, increments], axis=1)
        # trajectory = running sum of (start position, increments...)
        processes = increments.cumsum(axis=1)
        concat_list.append(processes.astype(dtype))

    # (n_diff_coefs, n_data, data_len) -> (n_data_total, 1, data_len)
    n_data_total = n_diff_coefs * n_data
    processes = np.stack(concat_list, axis=0)
    processes = np.reshape(processes, (n_data_total, 1, data_len))

    # look-up table of label values, then integer labels pointing into it
    labels = np.array(list(product(diff_coefs)),
                      dtype=dtype)  # shaped (n_conditions, 1)
    labels_lut = [tuple(condition) for condition in labels]
    n_conditions = len(labels_lut)
    labels = np.arange(n_conditions)  # shaped (n_conditions,)
    labels = np.repeat(labels, n_data)  # shaped (n_conditions * n_data,)

    samples = {
        'data': processes,
        'labels': labels,
        'label_names': ['diff_coefs'],
        'dt': dt,
        'labels_lut': labels_lut
    }
    if not save_file:
        return samples

    # next free index among the existing brw_<idx>.pt files in root_dir
    nums = cu.match_filename(r'brw_([0-9]+).pt', root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0
    file_name = 'brw_{}.pt'.format(idx)
    file_path = os.path.join(root_dir, file_name)
    torch.save(samples, file_path)
    return file_name
def sim_poisson(data_len,
                lams,
                dt,
                n_data=1,
                save_file=False,
                root_dir=ROOT_DIR,
                dtype='float32'):
    '''
    returns ensemble of poisson (counting) processes, one group per rate

    inputs:
    -------
    - data_len: int, length of each process
    - lams: numeric (python or numpy scalar) or list or ndarray, event
        rate(s). the expected count per time step is lam * dt
    - dt: time step between data points
    - n_data: number of processes simulated per rate
    - save_file: boolean, whether or not to save the file. If True, file name
        is returned. Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (samples): dict whose key-value pairs are the following.
        returned if save_file is False
        'data': ndarray shaped (n_lams * n_data, 1, data_len), ordered so
            that all trajectories of the first rate come first. the
            singleton dimension is for the number of channels.
        'labels': ndarray shaped (n_lams * n_data,) of int indices into
            'labels_lut'
        'label_names': list whose elements are string type, ['lams']
        'labels_lut': list of 1-tuples; entry i holds the rate that label
            index i stands for
        'dt': float type dt
    - (file_name): string type file name of the simulated data.
        returned if save_file is True

    REVIEW: confirm this method of using fixed time step generates identical
    statistics to that of Gillespie algorithm
    '''
    # atleast_1d lets python scalars, numpy scalars and sequences all follow
    # the same code path
    lams = np.atleast_1d(np.array(lams, dtype=dtype))
    n_lams = len(lams)

    file_size_est = data_len * n_lams * n_data * np.dtype(dtype).itemsize
    file_size_est_gb = file_size_est / 1.e9
    if file_size_est_gb > 2.:
        # the keyword is lowercase `category`; `Category=` raises TypeError
        # instead of emitting the warning. stacklevel=2 points at the caller.
        warnings.warn("Generating file with size roughly {:.2f} GB".format(
            file_size_est_gb), category=BytesWarning, stacklevel=2)

    concat_list = []
    for lam in lams:
        # independent poisson counts per step; the process is their running sum
        increments = np.random.poisson(lam * dt, size=[n_data, data_len])
        processes = increments.cumsum(axis=1)
        concat_list.append(processes.astype(dtype))

    # (n_lams, n_data, data_len) -> (n_data_total, 1, data_len)
    n_data_total = n_lams * n_data
    processes = np.stack(concat_list, axis=0)
    processes = np.reshape(processes, (n_data_total, 1, data_len))

    # look-up table of label values, then integer labels pointing into it
    labels = np.array(list(product(lams)),
                      dtype=dtype)  # shaped (n_conditions, 1)
    labels_lut = [tuple(condition) for condition in labels]
    n_conditions = len(labels_lut)
    labels = np.arange(n_conditions)  # shaped (n_conditions,)
    labels = np.repeat(labels, n_data)  # shaped (n_conditions * n_data,)

    samples = {
        'data': processes,
        'labels': labels,
        'label_names': ['lams'],
        'dt': dt,
        'labels_lut': labels_lut
    }
    if not save_file:
        return samples

    # next free index among the existing pos_<idx>.pt files in root_dir
    nums = cu.match_filename(r'pos_([0-9]+).pt', root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0
    file_name = 'pos_{}.pt'.format(idx)
    file_path = os.path.join(root_dir, file_name)
    torch.save(samples, file_path)
    return file_name
def sim_brownian_sample(data_len,
                        diff_coefs,
                        dt,
                        n_data=1,
                        save_file=False,
                        root_dir=ROOT_DIR,
                        dtype='float32'):
    '''
    simulates brownian trajectories whose diffusion coefficients are drawn
    uniformly at random from a given interval, one coefficient per trajectory

    inputs:
    -------
    - data_len: int, length of each process
    - diff_coefs: numeric, or length 2 list-like holding the (low, high)
        bounds of the diffusion coefficient. a single number is used as both
        bounds
    - dt: float, time step between data points
    - n_data: int, number of processes in ensemble
    - save_file: boolean, whether or not to save the file. If True, file name
        is returned. Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (samples): dict whose key-value pairs are the following.
        returned if save_file is False
        'data': per-trajectory 'data' arrays from sim_brownian concatenated
            along axis 0 (one trajectory per drawn coefficient)
        'labels': single-element list holding the array of drawn diffusion
            coefficients
        'label_names': list whose elements are string type, ['diff_coefs']
        'dt': float type dt
    - (file_name): string type file name of the simulated data.
        returned if save_file is True
    '''
    # a lone number means a degenerate interval [value, value]
    if isinstance(diff_coefs, (int, float)):
        diff_coefs = np.array([diff_coefs, diff_coefs], dtype=dtype)
    assert (
        len(diff_coefs) == 2
    ), "Invalid diff_coefs given: should be numeric or length 2 list-like format"

    lower, upper = diff_coefs
    # one uniform draw in [lower, upper) per trajectory
    diff_coef_samples = (upper - lower) * np.random.random(n_data, ) + lower

    # every trajectory is simulated on its own since each has its own
    # coefficient
    tracks = [
        sim_brownian(data_len,
                     diff_coefs=drawn,
                     dt=dt,
                     n_data=1,
                     save_file=False,
                     dtype=dtype)['data'] for drawn in diff_coef_samples
    ]
    stacked = np.concatenate(tracks, axis=0)

    samples = {
        'data': stacked,
        'labels': [diff_coef_samples],
        'label_names': ['diff_coefs'],
        'dt': dt
    }

    if save_file:
        # next free index among the existing brw_<idx>.pt files in root_dir
        taken = [
            int(num)
            for num in cu.match_filename(r'brw_([0-9]+).pt', root_dir=root_dir)
        ]
        next_idx = max(taken) + 1 if taken else 0
        file_name = 'brw_{}.pt'.format(next_idx)
        torch.save(samples, os.path.join(root_dir, file_name))
        return file_name

    return samples
] for file_name_meta in file_names_meta: assert(len(file_name_meta) == 1), "Invalid number of files. Should be only 1 trained model for each case" file_names_meta = [file_name_meta[0] for file_name_meta in file_names_meta] file_names_meta = [ cu.match_filename('(tbd_4_meta_rnn_[0-9]+_k_ratios.pt)', root_dir), cu.match_filename('(tbd_4_scat_0_meta_rnn_[0-9]+_k_ratios.pt)', root_dir), ] for file_name_meta in file_names_meta: assert(len(file_name_meta) == 1), "Invalid number of files. Should be only 1 trained model for each case" file_names_meta = [file_name_meta[0] for file_name_meta in file_names_meta] """ file_names_meta = [ cu.match_filename('(tbd_4_meta_rnn_[0-9]+_diff_coef_ratios.pt)', root_dir), cu.match_filename('(tbd_4_scat_0_meta_rnn_[0-9]+_diff_coef_ratios.pt)', root_dir), ] for file_name_meta in file_names_meta: assert(len(file_name_meta) == 1), "Invalid number of files. Should be only 1 trained model for each case" file_names_meta = [file_name_meta[0] for file_name_meta in file_names_meta] #file_names_meta = ['data_meta_rnn_1.pt', 'data_scat_0_meta_rnn_1.pt'] # IRFP #file_names_meta = ['data_meta_rnn_11.pt', 'data_scat_0_meta_rnn_11.pt'] # OR, provide file names and paths using regular expression #file_paths_meta = glob.glob(os.path.join(root_dir, 'tbd_0_scat_meta_rnn_*.pt'))
except: print("exception occurred during scat transformation for n_data:{} with parameters avg_len:{}, n_filter_octave:{}".format(n_data, avg_len, n_filter_octave)) # simulate data for testing performance print("simulating data for evaluation for randomly sampled labels") k_ratios_test = (k_ratios_test_high - k_ratios_test_low) * np.random.random(n_data_test,) + k_ratios_test_low diff_coef_ratios_test = (diff_coef_ratios_test_high - diff_coef_ratios_test_low) * np.random.random(n_data_test,) + diff_coef_ratios_test_low k_ratios_diff_coef_ratios_test = np.stack([k_ratios_test, diff_coef_ratios_test], axis=1) data_tests = [] for k_ratio_test, diff_coef_ratio_test in k_ratios_diff_coef_ratios_test: data_test = siu.sim_two_beads(data_len, k_ratios=k_ratio_test, diff_coef_ratios=diff_coef_ratio_test, dt=dt, n_data=1, n_steps_initial=10000, save_file=False) data_tests.append(data_test) processes = np.concatenate(data_tests, axis=2) # shaped (1, 1, n_data_test, n_channels, data_len) samples = {'data':processes, 'labels':k_ratios_diff_coef_ratios_test, 'label_names':'k_ratios_diff_coef_ratios', 'dt':dt, 'n_steps_initial':10000} nums = cu.match_filename(r'tbd_([0-9]+).pt', root_dir=root_dir) nums = [int(num) for num in nums] idx = max(nums) + 1 if nums else 0 file_name_test = 'tbd_{}.pt'.format(idx) file_path_test = os.path.join(root_dir, file_name_test) torch.save(samples, file_path_test) # scat transforming test data file_names_scat_test = [] for avg_len in avg_lens: for n_filter_octave in n_filter_octaves: try: print("scat transforming n_data_test:{} with parameters avg_len:{}, n_filter_octave:{}".format(n_data_test, avg_len, n_filter_octave)) file_name_scat_test = scu.scat_transform(file_name_test, avg_len, log_transform=False, n_filter_octave=n_filter_octave, save_file=True, root_dir=root_dir) file_names_scat_test.append(file_name_scat_test)
def scat_transform(file_name,
                   avg_len,
                   log_transform=False,
                   n_filter_octave=[1, 1],
                   filter_format='fourier_truncated',
                   save_file=True,
                   root_dir=ROOT_DIR):
    '''
    performs invariant scattering transform for a given file

    inputs:
    -------
    - file_name: str type file name (extension is ignored, '.pt' is assumed)
    - avg_len: window length of scaling function in scat transform
    - log_transform: boolean whether to apply logarithm on scat transform
        results. applied to both ndarray and list type data
    - n_filter_octave: number of filters when halving the frequency.
        1 indicates dyadic filter bank
    - filter_format: 'fourier_multires', 'fourier_truncated', 'fourier'
    - save_file: boolean whether to save the results into a file or return
        as a dictionary. If True, file name is returned. Otherwise, data is
        returned
    - root_dir: str type directory name

    outputs:
    --------
    - (samples_out): dict whose key-value pairs are the following.
        Returned if save_file is False
        'data': scat transformed data. for ndarray input, the stacked
            transform of the whole array (per the original author's note,
            shaped (n_data, n_channels, n_scat_nodes, data_len)). for list
            input, a list with one stacked transform per track
        whatever other key-values were in the given file (deep-copied)
        'avg_len', 'log_transform', 'n_filter_octave', 'filter_format':
            hyperparameters used for performing the scat transform
        'file_name': name (without extension) of the source file
    - (file_name_scat): string type file name of the transformed data,
        '<file_name>_scat_<idx>.pt'. returned if save_file is True

    raises:
    -------
    - AssertionError if ndarray data is not rank 3 or any track of list data
        is not rank 2
    - ValueError if data is neither ndarray nor list
    '''
    file_name, _ = os.path.splitext(file_name)
    file_path = os.path.join(root_dir, file_name + '.pt')
    # NOTE: torch.load unpickles the file; only use on trusted files
    samples = torch.load(file_path)
    data = samples['data']

    # the default list is shared across calls; store a copy so that the
    # returned/saved metadata never aliases it (copy.copy also accepts ints)
    n_filter_octave = copy.copy(n_filter_octave)

    if isinstance(data, np.ndarray):
        assert(len(data.shape) == 3),\
            "Invalid data shape given. If type is ndarray, should be rank 3"
        data_len = data.shape[-1]
        # perform scattering transform on the whole array at once
        scat = ScatNet(data_len, avg_len, n_filter_octave=n_filter_octave,
                       filter_format=filter_format)
        S = scat.transform(data)
        if log_transform:
            S = log_scat(S)
        data_scat = stack_scat(S)
    elif isinstance(data, list):
        # validate every track, not only the first one
        assert(all(len(track.shape) == 2 for track in data)),\
            "Invalid data shape given. If type is list, elements should be rank 2 ndarrays"
        data_scat = []
        for track in data:
            # tracks may differ in length, so each needs its own ScatNet
            track_len = track.shape[1]
            scat = ScatNet(track_len, avg_len,
                           n_filter_octave=n_filter_octave,
                           filter_format=filter_format)
            # add a leading batch axis so a track looks like rank 3 data
            S = scat.transform(track[np.newaxis, :, :])
            # previously skipped here although 'log_transform' was still
            # recorded in the output metadata
            if log_transform:
                S = log_scat(S)
            # drop the batch axis again; per the original author's note the
            # result is (n_channels, n_nodes, track_scat_len)
            data_scat.append(stack_scat(S)[0])
    else:
        raise ValueError(
            "Invalid data given. Type should be either ndarray or list")

    # copy the metadata only; deep-copying the (possibly huge) raw data just
    # to delete it afterwards is wasted time and memory
    samples_out = copy.deepcopy(
        {key: val for key, val in samples.items() if key != 'data'})
    samples_out.update({
        'data': data_scat,
        'avg_len': avg_len,
        'log_transform': log_transform,
        'n_filter_octave': n_filter_octave,
        'filter_format': filter_format,
        'file_name': file_name
    })
    if not save_file:
        return samples_out

    # next free index among the existing <file_name>_scat_<idx>.pt files
    nums = cu.match_filename(r'{}_scat_([0-9]+).pt'.format(file_name),
                             root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0
    file_name_scat = '{}_scat_{}.pt'.format(file_name, idx)
    file_path_scat = os.path.join(root_dir, file_name_scat)
    torch.save(samples_out, file_path_scat)
    return file_name_scat
elif isinstance(samples['data'], list): assert(len(samples['data'][0].shape) == 2),\ "Invalid data shape given. If type is list, elements should be rank 2 ndarrays" data = [] for track in samples['data']: data.append(np.diff(track, n=1, axis=-1)) samples_out['data'] = data else: raise ValueError( "Invalid data given. Type should be either ndarray or list") file_name_no_ext, _ = os.path.splitext(file_name) file_name_out = file_name_no_ext + '_disp.pt' file_path_out = os.path.join(root_dir, file_name_out) torch.save(samples_out, file_path_out) file_names_scat = cu.match_filename( r'({}_scat_[0-9]+.pt)'.format(file_name_no_ext), root_dir=root_dir) for file_name_scat in file_names_scat: file_path_scat = os.path.join(root_dir, file_name_scat) samples_scat = torch.load(file_path_scat) avg_len = samples_scat['avg_len'] n_filter_octave = samples_scat['n_filter_octave'] # perform scat transform and append to list print( "scat transforming {} with parameters avg_len:{}, n_filter_octave:{}" .format(file_name_out, avg_len, n_filter_octave)) file_name_scat_out = scu.scat_transform( file_name_out, avg_len, log_transform=False, n_filter_octave=n_filter_octave,