Example #1
def main(sim_path, out):

    # Define wildcard path to contact matrix data
    cm_filepath = os.path.join(sim_path, 'output-cm-*.h5')

    # Collect contact matrix file names sorted by sim_id
    cm_files = sorted(glob(cm_filepath))

    if not cm_files:
        raise FileNotFoundError(f'No h5 files found; recheck the input path {sim_path}')

    with ExitStack() as stack:
        # Open all h5 files and register them with the exit stack
        open_cm_files = [stack.enter_context(open_h5(file)) for file in cm_files]

        # Collect the contact_maps dataset from each open h5 file
        cm_data = [file['contact_maps'] for file in open_cm_files]
        
        # Compress all .h5 files into one in cvae format 
        cvae_input = cm_to_cvae(cm_data)

        # Path for the aggregated CVAE input file
        cvae_input_path = os.path.join(out, 'cvae-input.h5')

        # Create and open the contact map aggregation output file
        cvae_input_file = stack.enter_context(h5py.File(cvae_input_path, 'w'))

        # Write aggregated contact map dataset to file
        cvae_input_file.create_dataset('contact_maps', data=cvae_input)
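
The open_h5 helper used throughout these examples is not shown in this listing. A minimal sketch, assuming it is just a thin wrapper around h5py.File that defaults to read-only mode (the defaults below are an assumption, not the project's actual implementation):

import h5py

def open_h5(path, mode='r', **kwargs):
    # Hypothetical sketch: thin wrapper around h5py.File, defaulting to
    # read-only access; the real project helper may set extra options.
    return h5py.File(path, mode, **kwargs)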
Example #2
def main(input_path, out_path):

    # Read the full contact matrix dataset into memory
    with open_h5(input_path) as f:
        contact_maps = np.array(f['contact_maps'][:])

    sparse_contact_maps_from_matrices(contact_maps, out_path)
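
sparse_contact_maps_from_matrices is not shown in this listing. A minimal sketch, assuming it stores each binary contact matrix as COO row/col index arrays in the layout that the sparse branch of Example #7 reads back (the function name and details are assumptions):

import h5py
import numpy as np

def sparse_contact_maps_from_matrices_sketch(contact_maps, out_path):
    # Hypothetical sketch: keep only the nonzero row/col indices of each
    # contact matrix; the values are implicitly 1 and can be rebuilt later.
    rows, cols = [], []
    for cm in contact_maps:
        coo_rows, coo_cols = np.nonzero(np.squeeze(cm))
        rows.append(coo_rows.astype('int16'))
        cols.append(coo_cols.astype('int16'))

    vlen_int = h5py.vlen_dtype(np.dtype('int16'))
    with h5py.File(out_path, 'w') as out_file:
        group = out_file.create_group('contact_maps')
        row_dset = group.create_dataset('row', (len(rows),), dtype=vlen_int)
        col_dset = group.create_dataset('col', (len(cols),), dtype=vlen_int)
        for i, (r, c) in enumerate(zip(rows, cols)):
            row_dset[i] = r
            col_dset[i] = c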
Example #3
        def __init__(self, path, split_ptc=0.8, split='train', squeeze=False):
            with open_h5(path) as input_file:
                # Access contact matrix data from h5 file
                data = np.array(input_file['contact_maps'])

            # Train/validation split index (split_ptc=0.8 gives an 80-20 split)
            split_ind = int(split_ptc * len(data))

            if split == 'train':
                self.data = data[:split_ind]
            elif split == 'valid':
                self.data = data[split_ind:]
            else:
                raise ValueError(f'Parameter split={split} is invalid.')

            # TODO: in future contact map Dataset, pass in device to precompute
            #       the operation


            # TODO: this reshape code may not be the best solution. revisit
            num_residues = self.data.shape[2]
            assert num_residues == 22

            if squeeze:
                shape = (-1, num_residues, num_residues)
            else:
                shape = (-1, 1, num_residues, num_residues)

            self.data = torch.from_numpy(self.data.reshape(shape)).to(torch.float32)
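
The class this __init__ belongs to is not named in the excerpt. Assuming it is a torch Dataset subclass (called ContactMapDataset below purely for illustration), typical usage with a DataLoader would look like:

from torch.utils.data import DataLoader

# Hypothetical usage; ContactMapDataset stands in for the unnamed class above
# and cvae-input.h5 is the aggregated file produced in Example #1.
train_set = ContactMapDataset('cvae-input.h5', split_ptc=0.8, split='train')
valid_set = ContactMapDataset('cvae-input.h5', split_ptc=0.8, split='valid')

train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
for batch in train_loader:
    # With squeeze=False each batch has shape (B, 1, 22, 22).
    pass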
Example #4
    def __init__(self,
                 path,
                 out_dir,
                 squeeze,
                 sample_interval=20,
                 batch_size=128,
                 writer=None):
        """
        Parameters
        ----------
        path : str
            Path to h5 file containing contact matrices.

        out_dir : str
            Directory to store output plots.

        squeeze : bool
            If True, data is reshaped to (H, W); if False, data
            is reshaped to (1, H, W).

        sample_interval : int
            Plot every sample_interval'th point in the data set.

        batch_size : int
            Batch size to load raw contact matrices into memory.
            Batches are loaded into memory, encoded to a latent
            dimension and then collected in a np.ndarray. The
            np.ndarray is then passed to the TSNE algorithm.

            NOTE: Not a learning hyperparameter, simply needs to
                  be small enough to load batch into memory.

        writer : torch.utils.tensorboard.SummaryWriter, optional
            TensorBoard summary writer; defaults to None.
        """

        os.makedirs(out_dir, exist_ok=True)

        # Open h5 file. Python's garbage collector closes the
        # file when the instance is destroyed.
        h5_file = open_h5(path)

        self.dset = h5_file['contact_maps']
        self.out_dir = out_dir
        self.sample_interval = sample_interval
        self.batch_size = batch_size
        self.writer = writer
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        if squeeze:
            self.shape = (self.dset.shape[1], self.dset.shape[2])
        else:
            self.shape = (1, self.dset.shape[1], self.dset.shape[2])

        self._init_plot(h5_file)
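
The batch_size docstring describes encoding raw contact matrices in small batches and collecting the latent vectors before running t-SNE. A minimal sketch of that loop, assuming a PyTorch encoder module that maps a batch of matrices to latent vectors (the encoder interface and function name are assumptions):

import numpy as np
import torch
from sklearn.manifold import TSNE

def encode_for_tsne(dset, encoder, batch_size, device):
    # Hypothetical sketch: encode the h5 dataset batch by batch so only
    # batch_size matrices are resident in memory at once, then run t-SNE
    # on the collected latent vectors.
    embeddings = []
    for start in range(0, len(dset), batch_size):
        batch = torch.from_numpy(np.asarray(dset[start:start + batch_size]))
        batch = batch.to(device, dtype=torch.float32)
        with torch.no_grad():
            embeddings.append(encoder(batch).cpu().numpy())
    embeddings = np.concatenate(embeddings)
    return TSNE(n_components=2).fit_transform(embeddings)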
Example #5
def generate_embeddings(encoder_hparams_path, encoder_weight_path, cm_path):
    encoder_hparams = EncoderHyperparams.load(encoder_hparams_path)

    with open_h5(cm_path) as file:

        # Access contact matrix data from h5 file
        data = file['contact_maps']

        # Get shape of an individual contact matrix
        # (ignore total number of matrices)
        input_shape = data.shape[1:]

        encoder = EncoderConvolution2D(input_shape=input_shape,
                                       hyperparameters=encoder_hparams)

        # Load best model weights
        encoder.load_weights(encoder_weight_path)

        # Create contact matrix embeddings
        cm_embeddings, *_ = encoder.embed(data)

    return cm_embeddings
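
A hypothetical call to generate_embeddings, using the file naming scheme from the training script in Example #6 with model_id 0; all paths are placeholders:

# Hypothetical usage; the paths are placeholders, not project defaults.
cm_embeddings = generate_embeddings(
    encoder_hparams_path='encoder-hparams-0.pkl',
    encoder_weight_path='encoder-weight-0.h5',
    cm_path='cvae-input.h5')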
Example #6
def main(input_path, out_path, model_id, gpu, epochs, batch_size, latent_dim):

    # Set CUDA environment variables
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)

    with open_h5(input_path) as input_file:

        # Access contact matrix data from h5 file
        data = np.array(input_file['contact_maps'])

    # Shuffle data before train validation split
    np.random.shuffle(data)

    # 80-20 train validation split index
    split = int(0.8 * len(data))

    # Partition input data into 80-20 train valid split
    train, valid = data[:split], data[split:]

    # Get shape of an individual contact matrix
    # (ignore total number of matrices)
    input_shape = train.shape[1:]

    # Set model hyperparameters for encoder and decoder
    shared_hparams = {'num_conv_layers': 4,
                      'filters': [64, 64, 64, 64],
                      'kernels': [3, 3, 3, 3],
                      'strides': [1, 2, 1, 1],
                      'num_affine_layers': 1,
                      'affine_widths': [128],
                      'latent_dim': latent_dim
                     }

    affine_dropouts = [0]

    encoder_hparams = EncoderHyperparams(affine_dropouts=affine_dropouts,
                                         **shared_hparams)
    decoder_hparams = DecoderHyperparams(**shared_hparams)

    encoder = EncoderConvolution2D(input_shape=input_shape,
                                   hyperparameters=encoder_hparams)

    # Get shape attributes of the last encoder layer to define the decoder
    encode_conv_shape, num_conv_params = encoder.get_final_conv_params()

    decoder = DecoderConvolution2D(output_shape=input_shape,
                                   enc_conv_params=num_conv_params,
                                   enc_conv_shape=encode_conv_shape,
                                   hyperparameters=decoder_hparams)

    optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

    cvae = VAE(input_shape=input_shape,
               encoder=encoder,
               decoder=decoder,
               optimizer=optimizer)

    # Define callbacks to report model performance for analysis
    embed_callback = EmbeddingCallback(train, cvae)
    loss_callback = LossHistory()

    cvae.train(data=train, validation_data=valid,
               batch_size=batch_size, epochs=epochs,
               callbacks=[embed_callback, loss_callback])

    # Define file paths to store model performance and weights
    ae_weight_path = os.path.join(out_path, f'ae-weight-{model_id}.h5')
    encoder_weight_path = os.path.join(out_path, f'encoder-weight-{model_id}.h5')
    encoder_hparams_path = os.path.join(out_path, f'encoder-hparams-{model_id}.pkl')
    decoder_hparams_path = os.path.join(out_path, f'decoder-hparams-{model_id}.pkl')
    embed_path = os.path.join(out_path, f'embed-{model_id}.npy')
    idx_path = os.path.join(out_path, f'embed-idx-{model_id}.npy')
    loss_path = os.path.join(out_path, f'loss-{model_id}.npy')
    val_loss_path = os.path.join(out_path, f'val-loss-{model_id}.npy')


    # Save weights, hyperparameters, and model performance.
    # Save encoder weights separately so the full model doesn't need to be
    # loaded during the outlier detection stage.
    cvae.save_weights(ae_weight_path)
    encoder.save_weights(encoder_weight_path)
    encoder_hparams.save(encoder_hparams_path)
    decoder_hparams.save(decoder_hparams_path)
    embed_callback.save(embed_path=embed_path, idx_path=idx_path)
    loss_callback.save(loss_path=loss_path, val_loss_path=val_loss_path)
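
A hypothetical invocation of this training entry point; in the original project the arguments are presumably supplied by a command-line wrapper, and the values below are placeholders:

# Hypothetical usage; argument values are placeholders, not project defaults.
main(input_path='cvae-input.h5',
     out_path='./model_output',
     model_id=0,
     gpu=0,
     epochs=100,
     batch_size=128,
     latent_dim=3)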
Example #7
    def __init__(self,
                 path,
                 input_shape,
                 split_ptc=0.8,
                 split='train',
                 sparse=False,
                 gpu=None):
        """
        Parameters
        ----------
        path : str
            Path to h5 file containing contact matrices.

        input_shape : tuple
            Shape of contact matrices, either (H, W) or (1, H, W).

        split_ptc : float
            Percentage of total data to be used as training set.

        split : str
            Either 'train' or 'valid', specifies whether this
            dataset returns train or validation data.

        sparse : bool
            If True, process data in sparse row/col COO format. The stored
            data contains no values because they are all 1's and are
            reconstructed on the fly. If False, input data is a dense tensor.

        gpu : int or None
            If None, data is placed on the default GPU when CUDA is
            available, otherwise on the CPU. If an int, data is placed
            on the specified GPU.
        """
        if split not in ('train', 'valid'):
            raise ValueError("Parameter split must be 'train' or 'valid'.")
        if split_ptc < 0 or split_ptc > 1:
            raise ValueError(
                'Parameter split_ptc must satisfy 0 <= split_ptc <= 1.')

        # Open h5 file. Python's garbage collector closes the
        # file when the instance is destroyed.
        h5_file = open_h5(path)

        if sparse:
            group = h5_file['contact_maps']
            self.row_dset = group.get('row')
            self.col_dset = group.get('col')
            self.len = len(self.row_dset)
        else:
            # contact_maps dset has shape (N, W, H, 1)
            self.dset = h5_file['contact_maps']
            self.len = len(self.dset)

        # Train/validation split index
        self.split_ind = int(split_ptc * self.len)
        self.split = split
        self.sparse = sparse
        self.shape = input_shape

        if gpu is None:
            self.device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = torch.device(f'cuda:{gpu}')
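
The sparse docstring states that the stored COO data carries no values because they are all 1's and are rebuilt on the fly. A minimal sketch of that reconstruction, assuming one row/col index pair array per sample (the function name and its exact placement in __getitem__ are assumptions):

import numpy as np
import torch

def coo_to_dense(rows, cols, shape, device):
    # Hypothetical sketch: scatter 1's at the stored (row, col) positions
    # to rebuild a dense contact matrix of the requested shape.
    cm = torch.zeros(shape[-2:], dtype=torch.float32, device=device)
    cm[torch.as_tensor(np.asarray(rows), dtype=torch.long),
       torch.as_tensor(np.asarray(cols), dtype=torch.long)] = 1.0
    return cm.reshape(shape)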