def datagen(self, epoch=0, print_out=1, test=0):
    """Yield normalized, neighbour-augmented frame batches, one per data file.

    Args:
        epoch: Not used by this method; kept so existing callers can keep
            passing it.
        print_out: If truthy, print each file name before it is loaded.
        test: If falsy, iterate a random sample of ``self.train_ind`` of size
            ``int(self.sampling_density * len(self.train_ind))``; otherwise
            iterate ``self.test_ind`` as given.

    Yields:
        ``(files[f_ind], xt_all, yt_all)`` for every selected file index.
        ``xt_all`` and ``yt_all`` hold the sampled frames stacked along a new
        leading axis; both are an empty 1-D array when no frame was sampled.

    Raises:
        SystemExit: If ``self.nbr_type`` is neither ``'relative'`` nor
            ``'invariant'`` (detected while processing a sampled frame).
    """
    files = self.files

    # Choose which files to visit: a random subset for training, the fixed
    # test indices otherwise.
    if not test:
        order = random.sample(
            list(self.train_ind),
            int(self.sampling_density * len(self.train_ind)))
    else:
        order = self.test_ind

    for f_ind in order:
        if print_out:
            print(files[f_ind], '\n')

        (X, nbrs, resnums) = helper.get_data_arrays(files[f_ind])

        # Normalize the location coordinates (features 0:3, divided by 320)
        # and the bond lengths (features 8:, divided by 10).
        coords = X[:, :, :, 0:3] / 320.
        bonds = X[:, :, :, 8:] / 10.
        if self.type_feature:
            # Keep the type encoding (features 3:8) unscaled in between.
            Xnorm = np.concatenate([coords, X[:, :, :, 3:8], bonds], axis=3)
        else:
            # Only the location coordinates and bond lengths per molecule.
            Xnorm = np.concatenate([coords, bonds], axis=3)

        # Frames are sub-sampled with the same density in train and test mode.
        num_frames = X.shape[0]
        num_active_frames = random.sample(
            range(num_frames), int(self.sampling_density * num_frames))
        print('Datagen on the following frames', num_active_frames)

        # Collect per-frame arrays in lists and stack once at the end;
        # growing an array with np.append re-copies the whole batch per frame.
        xt_frames = []
        yt_frames = []
        for i in num_active_frames:
            if self.nbr_type == 'relative':
                xt = helper.append_nbrs_relative(
                    Xnorm[i], nbrs[i], self.molecular_nbrs)
            elif self.nbr_type == 'invariant':
                xt = helper.append_nbrs_invariant(
                    Xnorm[i], nbrs[i], self.molecular_nbrs)
            else:
                # Same exception type as the former print + exit(), but with
                # a non-zero exit status and without relying on the site
                # builtin ``exit``.
                raise SystemExit('Invalid nbr_type')

            # The target is the un-reshaped input unless the fully
            # convolutional network is used. No explicit copy is needed:
            # np.stack below always allocates fresh output arrays.
            yt = xt
            if self.conv_net:
                xt = xt.reshape(xt.shape[0], 1, xt.shape[1], 1)
                if self.full_conv_net:
                    yt = xt

            xt_frames.append(xt)
            yt_frames.append(yt)

        if xt_frames:
            xt_all = np.stack(xt_frames, axis=0)
            yt_all = np.stack(yt_frames, axis=0)
        else:
            # Preserve the original result when no frame was sampled.
            xt_all = np.array([])
            yt_all = np.array([])

        yield files[f_ind], xt_all, yt_all

    return
def run(GP):
    """Build the molecular autoencoder, save it as JSON and optionally train it.

    Args:
        GP: Mapping of run parameters (seed, paths, model and training
            options). It is also expanded into ``candle.ArgumentStruct``.

    Returns:
        ``(frame_loss, frame_mse)`` as returned by
        ``hf.Candle_Molecular_Train(...).train_ac()`` when ``GP['train_bool']``
        is truthy, otherwise two empty lists.

    Raises:
        SystemExit: If ``GP['home_dir']`` is not a directory or
            ``GP['nbr_type']`` is neither ``'relative'`` nor ``'invariant'``.
        ValueError: If ``GP['loss']`` is neither ``'mse'`` nor ``'custom'``.
    """
    # Set the seed (a falsy seed, including 0, selects a random one).
    if GP['seed']:
        np.random.seed(GP['seed'])
    else:
        np.random.seed(np.random.randint(10000))

    # Set paths. Exit with a message (non-zero status) instead of status 0.
    if not os.path.isdir(GP['home_dir']):
        sys.exit('Keras home directory not set')
    sys.path.append(GP['home_dir'])

    # Set up logging.
    args = candle.ArgumentStruct(**GP)
    candle.verify_path(args.save_path)
    prefix = args.save_path
    logfile = args.logfile if args.logfile else prefix + '.log'
    candle.set_up_logger(logfile, logger, False)
    logger.info('Params: {}'.format(GP))

    import p2b1 as hf
    reload(hf)
    # Return value is not used below; the call is kept as in the original.
    maps = hf.autoencoder_preprocess()

    from keras.callbacks import LearningRateScheduler
    from keras import callbacks as keras_callbacks

    batch_size = GP['batch_size']
    learning_rate = GP['learning_rate']
    kerasDefaults = candle.keras_default_config()

    ##### Read Data ########
    import helper

    (data_files, fields) = p2b1.get_list_of_data_files(GP)

    # Define datagenerator (not used further in this function).
    datagen = hf.ImageNoiseDataGenerator(corruption_level=GP['noise_factor'])

    # Count the total number of frames over all data files.
    num_samples = 0
    for f in data_files:
        # Separate the different arrays from the data
        (X, nbrs, resnums) = helper.get_data_arrays(f)
        num_samples += X.shape[0]

    # The first file determines the data dimensions used below.
    (X, nbrs, resnums) = helper.get_data_arrays(data_files[0])
    print('\nData chunk shape: ', X.shape)

    molecular_hidden_layers = GP['molecular_num_hidden']
    if not molecular_hidden_layers:
        X_train = hf.get_data(X, case=GP['case'])
        input_dim = X_train.shape[1]
    else:
        # computing input dimension for outer AE
        input_dim = X.shape[1] * molecular_hidden_layers[-1]

    print('\nState AE input/output dimension: ', input_dim)

    # Get data dimension for molecular autoencoder.
    # Builtin int replaces np.int, which was only an alias and has been
    # removed from recent NumPy releases.
    molecular_nbrs = int(GP['molecular_nbrs'])
    num_molecules = X.shape[1]
    num_beads = X.shape[2]

    if GP['nbr_type'] == 'relative':
        # relative x, y, z positions
        num_loc_features = 3
        loc_feat_vect = ['rel_x', 'rel_y', 'rel_z']
    elif GP['nbr_type'] == 'invariant':
        # relative distance and angle
        num_loc_features = 2
        loc_feat_vect = ['rel_dist', 'rel_angle']
    else:
        sys.exit('Invalid nbr_type!!')

    if not GP['type_bool']:
        # only consider molecular location coordinates
        num_type_features = 0
        type_feat_vect = []
    else:
        num_type_features = 5
        type_feat_vect = list(fields.keys())[3:8]

    num_features = num_loc_features + num_type_features + num_beads
    dim = np.prod([num_beads, num_features, molecular_nbrs + 1])
    bead_kernel_size = num_features
    molecular_input_dim = dim
    mol_kernel_size = num_beads

    feature_vector = loc_feat_vect + type_feat_vect + list(fields.keys())[8:]

    print('\nMolecular AE input/output dimension: ', molecular_input_dim)

    print(
        '\nData Format:\n[Frames (%s), Molecules (%s), Beads (%s), %s (%s)]'
        % (num_samples, num_molecules, num_beads, feature_vector,
           num_features))

    ### Define Model, Solver and Compile ##########
    print('\nDefine the model and compile')
    opt = candle.build_optimizer(GP['optimizer'], learning_rate, kerasDefaults)

    ######## Define Molecular Model, Solver and Compile #########
    molecular_nonlinearity = GP['molecular_nonlinearity']

    len_molecular_hidden_layers = len(molecular_hidden_layers)
    conv_bool = GP['conv_bool']
    full_conv_bool = GP['full_conv_bool']
    if conv_bool:
        molecular_model, molecular_encoder = AE_models.conv_dense_mol_auto(
            bead_k_size=bead_kernel_size,
            mol_k_size=mol_kernel_size,
            weights_path=None,
            input_shape=(1, molecular_input_dim, 1),
            nonlinearity=molecular_nonlinearity,
            hidden_layers=molecular_hidden_layers,
            l2_reg=GP['l2_reg'],
            drop=float(GP['drop_prob']))
    elif full_conv_bool:
        molecular_model, molecular_encoder = AE_models.full_conv_mol_auto(
            bead_k_size=bead_kernel_size,
            mol_k_size=mol_kernel_size,
            weights_path=None,
            input_shape=(1, molecular_input_dim, 1),
            nonlinearity=molecular_nonlinearity,
            hidden_layers=molecular_hidden_layers,
            l2_reg=GP['l2_reg'],
            drop=float(GP['drop_prob']))
    else:
        molecular_model, molecular_encoder = AE_models.dense_auto(
            weights_path=None,
            input_shape=(molecular_input_dim, ),
            nonlinearity=molecular_nonlinearity,
            hidden_layers=molecular_hidden_layers,
            l2_reg=GP['l2_reg'],
            drop=float(GP['drop_prob']))

    if GP['loss'] == 'mse':
        loss_func = 'mse'
    elif GP['loss'] == 'custom':
        loss_func = helper.combined_loss
    else:
        # Previously loss_func stayed unbound and compile() failed with an
        # unrelated UnboundLocalError.
        raise ValueError(
            "Unsupported loss %r; expected 'mse' or 'custom'" % (GP['loss'], ))

    molecular_model.compile(
        optimizer=opt,
        loss=loss_func,
        metrics=['mean_squared_error', 'mean_absolute_error'])
    print('\nModel Summary: \n')
    molecular_model.summary()

    ##### set up callbacks and cooling for the molecular_model ##########
    lr_drop = 0.5
    mb_epochs = GP['epochs']
    initial_lrate = GP['learning_rate']
    epochs_drop = 1 + int(np.floor(mb_epochs / 3))

    def step_decay(epoch):
        # Reads the enclosing function's locals as a closure; the former
        # ``global`` declaration pointed at names that exist only in run().
        return initial_lrate * np.power(
            lr_drop, np.floor((1 + epoch) / epochs_drop))

    # The scheduler is created but not attached to the callback list below.
    lr_scheduler = LearningRateScheduler(step_decay)
    history = keras_callbacks.History()

    history_logger = candle.LoggingCallback(logger.debug)
    candleRemoteMonitor = candle.CandleRemoteMonitor(params=GP)
    timeoutMonitor = candle.TerminateOnTimeOut(TIMEOUT)
    callback_list = [
        history, history_logger, candleRemoteMonitor, timeoutMonitor
    ]

    #### Save the Model to disk
    if GP['save_path'] is not None:
        save_path = GP['save_path']
        if not os.path.exists(save_path):
            os.makedirs(save_path)
    else:
        save_path = '.'

    model_json = molecular_model.to_json()
    with open(save_path + '/model.json', "w") as json_file:
        json_file.write(model_json)

    encoder_json = molecular_encoder.to_json()
    with open(save_path + '/encoder.json', "w") as json_file:
        json_file.write(encoder_json)

    print('Saved model to disk')

    #### Train the Model
    if GP['train_bool']:
        ct = hf.Candle_Molecular_Train(
            molecular_model,
            molecular_encoder,
            data_files,
            mb_epochs,
            callback_list,
            batch_size=batch_size,
            nbr_type=GP['nbr_type'],
            save_path=GP['save_path'],
            len_molecular_hidden_layers=len_molecular_hidden_layers,
            molecular_nbrs=molecular_nbrs,
            conv_bool=conv_bool,
            full_conv_bool=full_conv_bool,
            type_bool=GP['type_bool'],
            sampling_density=GP['sampling_density'])
        frame_loss, frame_mse = ct.train_ac()
    else:
        frame_mse = []
        frame_loss = []

    return frame_loss, frame_mse
def run(GP):
    """Build the molecular autoencoder, save it as JSON and optionally train it.

    Args:
        GP: Mapping of run parameters (seed flag, paths, model and training
            options).

    Returns:
        ``(frame_loss, frame_mse)``. These are the values returned by
        ``hf.Candle_Molecular_Train(...).train_ac()`` when ``GP['train_bool']``
        is truthy and ``GP['cool']`` is false; otherwise two empty lists.

    Raises:
        SystemExit: If ``GP['home_dir']`` is not a directory.
    """
    # Set the seed.
    # NOTE(review): a truthy GP['seed'] always seeds with the constant 7, not
    # with the configured value - confirm this is intended.
    if GP['seed']:
        np.random.seed(7)
    else:
        np.random.seed(np.random.randint(10000))

    # Set paths. Exit with a message (non-zero status) instead of status 0.
    if not os.path.isdir(GP['home_dir']):
        sys.exit('Keras home directory not set')
    sys.path.append(GP['home_dir'])

    import p2b1_mol_AE as hf
    reload(hf)

    import keras_model_utils as KEU
    reload(KEU)
    reload(p2ck)
    reload(p2ck.optimizers)
    # Return value is not used below; the call is kept as in the original.
    maps = hf.autoencoder_preprocess()

    from keras.callbacks import LearningRateScheduler
    from keras import callbacks as keras_callbacks

    # NOTE(review): batch_size is read here but training below passes a
    # hard-coded batch_size=32 - confirm which one is intended.
    batch_size = GP['batch_size']
    learning_rate = GP['learning_rate']
    kerasDefaults = p2c.keras_default_config()

    ##### Read Data ########
    # Read from local directory
    import helper
    (data_files, fields) = helper.get_local_files(
        '/p/gscratchr/brainusr/datasets/cancer/pilot2/3k_run16_10us.35fs-DPPC.20-DIPC.60-CHOL.20.dir/'
    )

    # Define datagenerator (only used by the cooling branch below).
    datagen = hf.ImageNoiseDataGenerator(corruption_level=GP['noise_factor'])

    # Count the total number of frames over all data files.
    num_samples = 0
    for f in data_files:
        # Separate the different arrays from the data
        (X, nbrs, resnums) = helper.get_data_arrays(f)
        num_samples += X.shape[0]

    # The first file determines the data dimensions used below.
    (X, nbrs, resnums) = helper.get_data_arrays(data_files[0])
    print(X.shape)

    molecular_hidden_layers = GP['molecular_num_hidden']
    if not molecular_hidden_layers:
        X_train = hf.get_data(X, case=GP['case'])
        input_dim = X_train.shape[1]
    else:
        # computing input dimension for outer AE
        input_dim = X.shape[1] * molecular_hidden_layers[-1]

    print('The input dimension to the State AE is ', input_dim)

    # Get data dimension for molecular autoencoder.
    # Builtin int replaces np.int, which was only an alias and has been
    # removed from recent NumPy releases.
    molecular_nbrs = int(GP['molecular_nbrs'])

    if not GP['type_bool']:
        # only consider molecular location coordinates
        dim = np.prod([X.shape[2], X.shape[3] - 5, molecular_nbrs + 1])
        molecular_input_dim = dim
        bead_kernel_size = X.shape[3] - 5
        mol_kernel_size = 12
    else:
        dim = np.prod(X.shape[2:] + (molecular_nbrs + 1, ))
        molecular_input_dim = dim
        bead_kernel_size = X.shape[3]
        mol_kernel_size = 12

    print('The input/output dimension to the Moelecular AE is ',
          molecular_input_dim)

    print(
        'Data Format:\n [Frames (%s), Molecules (%s), Beads (%s), %s (%s)]' %
        (num_samples, X.shape[1], X.shape[2], fields.keys(), X.shape[3]))

    ### Define Model, Solver and Compile ##########
    print('Define the model and compile')
    opt = p2ck.build_optimizer(GP['optimizer'], learning_rate, kerasDefaults)

    ######## Define Molecular Model, Solver and Compile #########
    molecular_nonlinearity = GP['molecular_nonlinearity']

    len_molecular_hidden_layers = len(molecular_hidden_layers)
    conv_bool = GP['conv_bool']
    if conv_bool:
        print('Molecular kernel size: ', mol_kernel_size)
        molecular_model, molecular_encoder = hf.conv_dense_mol_auto(
            bead_k_size=bead_kernel_size,
            mol_k_size=mol_kernel_size,
            weights_path=None,
            input_shape=(1, molecular_input_dim, 1),
            nonlinearity=molecular_nonlinearity,
            hidden_layers=molecular_hidden_layers,
            l2_reg=GP['weight_decay'],
            drop=GP['drop_prob'])
    else:
        molecular_model = hf.dense_auto(
            weights_path=None,
            input_shape=(molecular_input_dim, ),
            nonlinearity=molecular_nonlinearity,
            hidden_layers=molecular_hidden_layers,
            l2_reg=GP['weight_decay'])
        # The dense path never bound molecular_encoder, so training failed
        # with an UnboundLocalError.
        # NOTE(review): None is a placeholder - confirm whether
        # hf.dense_auto can also provide an encoder.
        molecular_encoder = None

    molecular_model.compile(
        optimizer=opt,
        loss=helper.combined_loss,
        metrics=['mean_squared_error', 'mean_absolute_error'])
    molecular_model.summary()

    ##### set up callbacks and cooling for the molecular_model ##########
    lr_drop = 0.5
    mb_epochs = GP['molecular_epochs']
    initial_lrate = GP['learning_rate']
    epochs_drop = 1 + int(np.floor(mb_epochs / 3))

    def step_decay(epoch):
        # Reads the enclosing function's locals as a closure; the former
        # ``global`` declaration pointed at names that exist only in run().
        return initial_lrate * np.power(
            lr_drop, np.floor((1 + epoch) / epochs_drop))

    # The scheduler is created but not attached to the callback list below.
    lr_scheduler = LearningRateScheduler(step_decay)
    history = keras_callbacks.History()

    candleRemoteMonitor = CandleRemoteMonitor(params=GP)
    timeoutMonitor = TerminateOnTimeOut(TIMEOUT)
    callback_list = [history, candleRemoteMonitor, timeoutMonitor]

    #### Save the Model to disk (skipped when no save path is configured)
    if GP['save_path'] is not None:
        if not os.path.exists(GP['save_path']):
            os.makedirs(GP['save_path'])

        model_json = molecular_model.to_json()
        with open(GP['save_path'] + '/model.json', "w") as json_file:
            json_file.write(model_json)
        print('Saved model to disk')

    #### Train the Model
    # Defaults so the return statement is always defined, matching the
    # "no training" result of returning empty lists.
    frame_loss, frame_mse = [], []
    if GP['train_bool']:
        if not str2bool(GP['cool']):
            ct = hf.Candle_Molecular_Train(
                molecular_model,
                molecular_encoder,
                data_files,
                mb_epochs,
                callback_list,
                batch_size=32,
                case=GP['case'],
                save_path=GP['save_path'],
                len_molecular_hidden_layers=len_molecular_hidden_layers,
                molecular_nbrs=molecular_nbrs,
                conv_bool=conv_bool,
                type_bool=GP['type_bool'])
            frame_loss, frame_mse = ct.train_ac()
        else:
            effec_epochs = GP['epochs'] // 3
            # NOTE(review): ``model`` is not defined in this function; this
            # branch only works if a module-level ``model`` exists - confirm.
            ct = hf.Candle_Train(datagen,
                                 model,
                                 data_files,
                                 effec_epochs,
                                 case=GP['case'])
            # NOTE(review): the losses collected here are not returned; the
            # function returns the empty defaults in this branch.
            loss = []
            for i in range(3):
                lr = GP['learning_rate'] / 10**i
                ct.model.optimizer.lr.set_value(lr)
                if i > 0:
                    ct.print_data = False
                    print('Cooling Learning Rate by factor of 10...')
                loss.extend(ct.train_ac())

    return frame_loss, frame_mse