Beispiel #1
0
    def datagen(self, epoch=0, print_out=1, test=0):
        """Yield one stacked batch of input/target arrays per data file.

        For every selected file the arrays are loaded through
        ``helper.get_data_arrays``, the last axis is rescaled, a random subset
        of frames is drawn and neighbour features are appended to each drawn
        frame via ``helper.append_nbrs_relative`` or
        ``helper.append_nbrs_invariant`` (chosen by ``self.nbr_type``).

        Args:
            epoch: Unused; kept so existing callers keep working.
            print_out: When truthy, print each file name before loading it.
            test: When falsy, iterate over a random sample of
                ``self.train_ind`` whose size is ``self.sampling_density``
                times its length; otherwise iterate over ``self.test_ind``.

        Yields:
            ``(file, xt_all, yt_all)`` where ``xt_all`` and ``yt_all`` stack
            the per-frame inputs and targets along a new leading axis. Both
            are empty 1-D arrays when no frame was sampled for the file.

        Raises:
            SystemExit: ``self.nbr_type`` is neither ``'relative'`` nor
                ``'invariant'`` (raised via ``exit()`` on the first frame).
        """
        files = self.files

        # Train on a random subset of the training files; test on all of the
        # test files.
        if not test:
            order = random.sample(
                list(self.train_ind),
                int(self.sampling_density * len(self.train_ind)))
        else:
            order = self.test_ind

        for f_ind in order:
            if print_out:
                print(files[f_ind], '\n')

            X, nbrs, _ = helper.get_data_arrays(files[f_ind])

            # Rescale along the last axis: columns 0:3 are divided by 320 and
            # columns 8: by 10 (earlier comments describe these as location
            # coordinates and bond lengths -- not verifiable from this
            # method). Columns 3:8 are passed through unscaled and only kept
            # when self.type_feature is set.
            scaled_head = X[:, :, :, 0:3] / 320.
            scaled_tail = X[:, :, :, 8:] / 10.
            if self.type_feature:
                parts = [scaled_head, X[:, :, :, 3:8], scaled_tail]
            else:
                parts = [scaled_head, scaled_tail]
            Xnorm = np.concatenate(parts, axis=3)

            num_frames = X.shape[0]
            active_frames = random.sample(
                range(num_frames), int(self.sampling_density * num_frames))

            print('Datagen on the following frames', active_frames)

            # Collect per-frame arrays in lists and stack once at the end;
            # growing an array with np.append copies the whole batch on every
            # frame.
            xt_frames = []
            yt_frames = []

            for i in active_frames:
                if self.nbr_type == 'relative':
                    xt = helper.append_nbrs_relative(
                        Xnorm[i], nbrs[i], self.molecular_nbrs)
                elif self.nbr_type == 'invariant':
                    xt = helper.append_nbrs_invariant(
                        Xnorm[i], nbrs[i], self.molecular_nbrs)
                else:
                    print('Invalid nbr_type')
                    exit()

                # The target is the un-reshaped input, except for the fully
                # convolutional net, whose target has the reshaped layout.
                yt = xt.copy()
                if self.conv_net:
                    xt = xt.reshape(xt.shape[0], 1, xt.shape[1], 1)
                    if self.full_conv_net:
                        yt = xt.copy()

                xt_frames.append(xt)
                yt_frames.append(yt)

            if xt_frames:
                xt_all = np.stack(xt_frames, axis=0)
                yt_all = np.stack(yt_frames, axis=0)
            else:
                # Nothing sampled: keep the empty placeholders callers have
                # always received in this case.
                xt_all = np.array([])
                yt_all = np.array([])

            yield files[f_ind], xt_all, yt_all
def run(GP):
    """Build, compile, save and optionally train the molecular autoencoder.

    Args:
        GP: Parameter dictionary. Keys read here: ``seed``, ``home_dir``,
            ``save_path``, ``logfile``, ``batch_size``, ``learning_rate``,
            ``noise_factor``, ``molecular_num_hidden``, ``case``,
            ``molecular_nbrs``, ``nbr_type``, ``type_bool``, ``optimizer``,
            ``molecular_nonlinearity``, ``conv_bool``, ``full_conv_bool``,
            ``l2_reg``, ``drop_prob``, ``loss``, ``epochs``, ``train_bool``
            and ``sampling_density``.

    Returns:
        ``(frame_loss, frame_mse)`` as returned by
        ``Candle_Molecular_Train.train_ac()``, or two empty lists when
        ``GP['train_bool']`` is falsy.

    Raises:
        SystemExit: ``GP['home_dir']`` is not a directory, or
            ``GP['nbr_type']`` is neither ``'relative'`` nor ``'invariant'``.
        ValueError: ``GP['loss']`` is neither ``'mse'`` nor ``'custom'``.
    """
    # set the seed
    if GP['seed']:
        np.random.seed(GP['seed'])
    else:
        np.random.seed(np.random.randint(10000))

    # Set paths
    if not os.path.isdir(GP['home_dir']):
        print('Keras home directory not set')
        sys.exit(0)
    sys.path.append(GP['home_dir'])

    # Set up logging
    args = candle.ArgumentStruct(**GP)
    candle.verify_path(args.save_path)
    prefix = args.save_path
    logfile = args.logfile if args.logfile else prefix + '.log'
    candle.set_up_logger(logfile, logger, False)
    logger.info('Params: {}'.format(GP))

    import p2b1 as hf
    reload(hf)

    # Return value is not used here; the call is kept in case it has side
    # effects.
    hf.autoencoder_preprocess()

    from keras.callbacks import LearningRateScheduler
    from keras import callbacks as keras_callbacks

    batch_size = GP['batch_size']
    learning_rate = GP['learning_rate']
    kerasDefaults = candle.keras_default_config()

    ##### Read Data ########
    import helper
    # `hf` is the p2b1 module imported above, so this does not depend on a
    # separate module-level `p2b1` name being in scope.
    (data_files, fields) = hf.get_list_of_data_files(GP)

    # Result unused in this function; call kept in case construction matters.
    hf.ImageNoiseDataGenerator(corruption_level=GP['noise_factor'])

    # Total number of frames over all files (only used for the summary print).
    num_samples = sum(
        helper.get_data_arrays(f)[0].shape[0] for f in data_files)

    # The first file provides the array shape used for all dimensions below.
    (X, nbrs, resnums) = helper.get_data_arrays(data_files[0])
    print('\nData chunk shape: ', X.shape)

    molecular_hidden_layers = GP['molecular_num_hidden']
    if not molecular_hidden_layers:
        X_train = hf.get_data(X, case=GP['case'])
        input_dim = X_train.shape[1]
    else:
        # computing input dimension for outer AE
        input_dim = X.shape[1] * molecular_hidden_layers[-1]

    print('\nState AE input/output dimension: ', input_dim)

    # get data dimension for molecular autoencoder
    # Built-in int: the np.int alias was removed in NumPy 1.24.
    molecular_nbrs = int(GP['molecular_nbrs'])
    num_molecules = X.shape[1]
    num_beads = X.shape[2]

    if GP['nbr_type'] == 'relative':
        # relative x, y, z positions
        num_loc_features = 3
        loc_feat_vect = ['rel_x', 'rel_y', 'rel_z']
    elif GP['nbr_type'] == 'invariant':
        # relative distance and angle
        num_loc_features = 2
        loc_feat_vect = ['rel_dist', 'rel_angle']
    else:
        print('Invalid nbr_type!!')
        exit()

    field_names = list(fields.keys())
    if not GP['type_bool']:
        # only consider molecular location coordinates
        num_type_features = 0
        type_feat_vect = []
    else:
        num_type_features = 5
        type_feat_vect = field_names[3:8]

    num_features = num_loc_features + num_type_features + num_beads
    molecular_input_dim = np.prod(
        [num_beads, num_features, molecular_nbrs + 1])
    bead_kernel_size = num_features
    mol_kernel_size = num_beads

    feature_vector = loc_feat_vect + type_feat_vect + field_names[8:]

    print('\nMolecular AE input/output dimension: ', molecular_input_dim)

    print(
        '\nData Format:\n[Frames (%s), Molecules (%s), Beads (%s), %s (%s)]' %
        (num_samples, num_molecules, num_beads, feature_vector, num_features))

    ### Define Model, Solver and Compile ##########
    print('\nDefine the model and compile')
    opt = candle.build_optimizer(GP['optimizer'], learning_rate, kerasDefaults)

    ######## Define Molecular Model, Solver and Compile #########
    molecular_nonlinearity = GP['molecular_nonlinearity']

    len_molecular_hidden_layers = len(molecular_hidden_layers)
    conv_bool = GP['conv_bool']
    full_conv_bool = GP['full_conv_bool']
    if conv_bool:
        molecular_model, molecular_encoder = AE_models.conv_dense_mol_auto(
            bead_k_size=bead_kernel_size,
            mol_k_size=mol_kernel_size,
            weights_path=None,
            input_shape=(1, molecular_input_dim, 1),
            nonlinearity=molecular_nonlinearity,
            hidden_layers=molecular_hidden_layers,
            l2_reg=GP['l2_reg'],
            drop=float(GP['drop_prob']))
    elif full_conv_bool:
        molecular_model, molecular_encoder = AE_models.full_conv_mol_auto(
            bead_k_size=bead_kernel_size,
            mol_k_size=mol_kernel_size,
            weights_path=None,
            input_shape=(1, molecular_input_dim, 1),
            nonlinearity=molecular_nonlinearity,
            hidden_layers=molecular_hidden_layers,
            l2_reg=GP['l2_reg'],
            drop=float(GP['drop_prob']))
    else:
        molecular_model, molecular_encoder = AE_models.dense_auto(
            weights_path=None,
            input_shape=(molecular_input_dim, ),
            nonlinearity=molecular_nonlinearity,
            hidden_layers=molecular_hidden_layers,
            l2_reg=GP['l2_reg'],
            drop=float(GP['drop_prob']))

    if GP['loss'] == 'mse':
        loss_func = 'mse'
    elif GP['loss'] == 'custom':
        loss_func = helper.combined_loss
    else:
        # Previously this fell through and failed later with an unbound
        # `loss_func`; fail here with an explicit message instead.
        raise ValueError(
            "Unsupported GP['loss']: %r (expected 'mse' or 'custom')" %
            (GP['loss'], ))

    molecular_model.compile(
        optimizer=opt,
        loss=loss_func,
        metrics=['mean_squared_error', 'mean_absolute_error'])
    print('\nModel Summary: \n')
    molecular_model.summary()

    ##### set up callbacks and cooling for the molecular_model ##########
    drop = 0.5
    mb_epochs = GP['epochs']
    initial_lrate = GP['learning_rate']
    epochs_drop = 1 + int(np.floor(mb_epochs / 3))

    def step_decay(epoch):
        # Closure over run()'s locals (they are not module globals, so a
        # `global` declaration here would raise NameError when called).
        # Halves the rate every `epochs_drop` epochs.
        return initial_lrate * np.power(
            drop, np.floor((1 + epoch) / epochs_drop))

    # Built but not added to the callback list below, as in the original.
    lr_scheduler = LearningRateScheduler(step_decay)
    history = keras_callbacks.History()

    history_logger = candle.LoggingCallback(logger.debug)
    candleRemoteMonitor = candle.CandleRemoteMonitor(params=GP)
    timeoutMonitor = candle.TerminateOnTimeOut(TIMEOUT)
    callback_list = [
        history, history_logger, candleRemoteMonitor, timeoutMonitor
    ]

    #### Save the Model to disk
    if GP['save_path'] is not None:
        save_path = GP['save_path']
        if not os.path.exists(save_path):
            os.makedirs(save_path)
    else:
        save_path = '.'

    with open(os.path.join(save_path, 'model.json'), "w") as json_file:
        json_file.write(molecular_model.to_json())

    with open(os.path.join(save_path, 'encoder.json'), "w") as json_file:
        json_file.write(molecular_encoder.to_json())

    print('Saved model to disk')

    #### Train the Model
    if not GP['train_bool']:
        return [], []

    ct = hf.Candle_Molecular_Train(
        molecular_model,
        molecular_encoder,
        data_files,
        mb_epochs,
        callback_list,
        batch_size=batch_size,
        nbr_type=GP['nbr_type'],
        save_path=GP['save_path'],
        len_molecular_hidden_layers=len_molecular_hidden_layers,
        molecular_nbrs=molecular_nbrs,
        conv_bool=conv_bool,
        full_conv_bool=full_conv_bool,
        type_bool=GP['type_bool'],
        sampling_density=GP['sampling_density'])
    frame_loss, frame_mse = ct.train_ac()

    return frame_loss, frame_mse
Beispiel #3
0
def run(GP):
    """Build, compile, save and optionally train the molecular autoencoder.

    Args:
        GP: Parameter dictionary. Keys read here: ``seed``, ``home_dir``,
            ``batch_size``, ``learning_rate``, ``noise_factor``,
            ``molecular_num_hidden``, ``case``, ``molecular_nbrs``,
            ``type_bool``, ``optimizer``, ``molecular_nonlinearity``,
            ``conv_bool``, ``weight_decay``, ``drop_prob``,
            ``molecular_epochs``, ``save_path``, ``train_bool``, ``cool``
            and ``epochs``.

    Returns:
        ``(frame_loss, frame_mse)`` from
        ``Candle_Molecular_Train.train_ac()`` on the non-cooling training
        path; two empty lists when training is disabled or the cooling path
        is taken.

    Raises:
        SystemExit: ``GP['home_dir']`` is not a directory.
    """
    # set the seed
    # NOTE(review): GP['seed'] is only tested for truthiness; the seed value
    # actually used is the literal 7.
    if GP['seed']:
        np.random.seed(7)
    else:
        np.random.seed(np.random.randint(10000))

    # Set paths
    if not os.path.isdir(GP['home_dir']):
        print('Keras home directory not set')
        sys.exit(0)
    sys.path.append(GP['home_dir'])

    import p2b1_mol_AE as hf
    reload(hf)

    import keras_model_utils as KEU
    reload(KEU)
    reload(p2ck)
    reload(p2ck.optimizers)
    # Return value is not used here; the call is kept in case it has side
    # effects.
    hf.autoencoder_preprocess()

    from keras.callbacks import LearningRateScheduler
    from keras import callbacks as keras_callbacks

    # NOTE(review): GP['batch_size'] is read, but the trainer below is given
    # the literal batch_size=32 -- confirm which one is intended.
    batch_size = GP['batch_size']
    learning_rate = GP['learning_rate']
    kerasDefaults = p2c.keras_default_config()

    ##### Read Data ########
    # Read from local directory
    import helper
    (data_files, fields) = helper.get_local_files(
        '/p/gscratchr/brainusr/datasets/cancer/pilot2/3k_run16_10us.35fs-DPPC.20-DIPC.60-CHOL.20.dir/'
    )

    # Define datagenerator (only consumed by the cooling branch below)
    datagen = hf.ImageNoiseDataGenerator(corruption_level=GP['noise_factor'])

    # Total number of frames over all files (only used for the summary print).
    num_samples = sum(
        helper.get_data_arrays(f)[0].shape[0] for f in data_files)

    # The first file provides the array shape used for all dimensions below.
    (X, nbrs, resnums) = helper.get_data_arrays(data_files[0])
    print(X.shape)

    molecular_hidden_layers = GP['molecular_num_hidden']
    if not molecular_hidden_layers:
        X_train = hf.get_data(X, case=GP['case'])
        input_dim = X_train.shape[1]
    else:
        # computing input dimension for outer AE
        input_dim = X.shape[1] * molecular_hidden_layers[-1]

    print('The input dimension to the State AE is ', input_dim)

    # get data dimension for molecular autoencoder
    # Built-in int: the np.int alias was removed in NumPy 1.24.
    molecular_nbrs = int(GP['molecular_nbrs'])
    if not GP['type_bool']:
        # only consider molecular location coordinates: the last axis is
        # taken with 5 fewer entries
        bead_kernel_size = X.shape[3] - 5
        molecular_input_dim = np.prod(
            [X.shape[2], bead_kernel_size, molecular_nbrs + 1])
    else:
        bead_kernel_size = X.shape[3]
        molecular_input_dim = np.prod(X.shape[2:] + (molecular_nbrs + 1, ))
    mol_kernel_size = 12  # fixed in both cases

    print('The input/output dimension to the Moelecular AE is ',
          molecular_input_dim)

    print(
        'Data Format:\n  [Frames (%s), Molecules (%s), Beads (%s), %s (%s)]' %
        (num_samples, X.shape[1], X.shape[2], fields.keys(), X.shape[3]))

    ### Define Model, Solver and Compile ##########
    print('Define the model and compile')
    opt = p2ck.build_optimizer(GP['optimizer'], learning_rate, kerasDefaults)

    ######## Define Molecular Model, Solver and Compile #########
    molecular_nonlinearity = GP['molecular_nonlinearity']

    len_molecular_hidden_layers = len(molecular_hidden_layers)
    conv_bool = GP['conv_bool']
    if conv_bool:
        print('Molecular kernel size: ', mol_kernel_size)
        molecular_model, molecular_encoder = hf.conv_dense_mol_auto(
            bead_k_size=bead_kernel_size,
            mol_k_size=mol_kernel_size,
            weights_path=None,
            input_shape=(1, molecular_input_dim, 1),
            nonlinearity=molecular_nonlinearity,
            hidden_layers=molecular_hidden_layers,
            l2_reg=GP['weight_decay'],
            drop=GP['drop_prob'])
    else:
        # NOTE(review): this path never binds `molecular_encoder`, so the
        # Candle_Molecular_Train call below fails with an unbound local when
        # conv_bool is falsy. Whether hf.dense_auto also returns an encoder
        # cannot be determined from here -- confirm and unpack if so.
        molecular_model = hf.dense_auto(weights_path=None,
                                        input_shape=(molecular_input_dim, ),
                                        nonlinearity=molecular_nonlinearity,
                                        hidden_layers=molecular_hidden_layers,
                                        l2_reg=GP['weight_decay'])

    molecular_model.compile(
        optimizer=opt,
        loss=helper.combined_loss,
        metrics=['mean_squared_error', 'mean_absolute_error'])
    molecular_model.summary()

    ##### set up callbacks and cooling for the molecular_model ##########
    drop = 0.5
    mb_epochs = GP['molecular_epochs']
    initial_lrate = GP['learning_rate']
    epochs_drop = 1 + int(np.floor(mb_epochs / 3))

    def step_decay(epoch):
        # Closure over run()'s locals (they are not module globals, so a
        # `global` declaration here would raise NameError when called).
        # Halves the rate every `epochs_drop` epochs.
        return initial_lrate * np.power(
            drop, np.floor((1 + epoch) / epochs_drop))

    # Built but not added to the callback list below, as in the original.
    lr_scheduler = LearningRateScheduler(step_decay)
    history = keras_callbacks.History()

    candleRemoteMonitor = CandleRemoteMonitor(params=GP)
    timeoutMonitor = TerminateOnTimeOut(TIMEOUT)
    callback_list = [history, candleRemoteMonitor, timeoutMonitor]

    #### Save the Model to disk
    if GP['save_path'] is not None:
        if not os.path.exists(GP['save_path']):
            os.makedirs(GP['save_path'])

        with open(GP['save_path'] + '/model.json', "w") as json_file:
            json_file.write(molecular_model.to_json())
        print('Saved model to disk')

    #### Train the Model
    # Defaults so the final return is well-defined when training is disabled
    # or the cooling branch runs (previously an unbound-local error).
    frame_loss = []
    frame_mse = []

    if GP['train_bool']:
        if not str2bool(GP['cool']):
            ct = hf.Candle_Molecular_Train(
                molecular_model,
                molecular_encoder,
                data_files,
                mb_epochs,
                callback_list,
                batch_size=32,
                case=GP['case'],
                save_path=GP['save_path'],
                len_molecular_hidden_layers=len_molecular_hidden_layers,
                molecular_nbrs=molecular_nbrs,
                conv_bool=conv_bool,
                type_bool=GP['type_bool'])
            frame_loss, frame_mse = ct.train_ac()
        else:
            # Three training rounds, dividing the learning rate by 10 each
            # time.
            # NOTE(review): `model` is not defined in this function, and the
            # values collected in `loss` are not returned -- this branch
            # needs confirming against the intended trainer API.
            effec_epochs = GP['epochs'] // 3
            ct = hf.Candle_Train(datagen,
                                 model,
                                 data_files,
                                 effec_epochs,
                                 case=GP['case'])
            loss = []
            for i in range(3):
                lr = GP['learning_rate'] / 10**i
                ct.model.optimizer.lr.set_value(lr)
                if i > 0:
                    ct.print_data = False
                    print('Cooling Learning Rate by factor of 10...')
                loss.extend(ct.train_ac())

    return frame_loss, frame_mse