Code example #1
def cmv_dataset(dataset_path: str = None, task_definition: TaskDefinition = None,
                cross_validation_fold: int = 0, n_worker_processes: int = 4, batch_size: int = 4,
                inputformat: str = 'NCL', keep_dataset_in_ram: bool = True,
                sample_n_sequences: int = int(1e4), verbose: bool = True) \
        -> Tuple[TaskDefinition, DataLoader, DataLoader, DataLoader, DataLoader]:
    """Get data loaders for category "real-world immunosequencing data"
     
     Get data loaders for training set and training-, validation-, and test-set in evaluation mode
     (=no random subsampling) for datasets of category "real-world immunosequencing data".
     This is a pre-processed version of the CMV dataset [1]_.
     
    Parameters
    ----------
    dataset_path: str
        Directory to store the dataset in. If the metadata file or hdf5 container do not exist there, they will be
        downloaded. Defaults to the `datasets/CMV` directory within the DeepRC package.
    task_definition: TaskDefinition
        TaskDefinition object containing the tasks to train the DeepRC model on. See `deeprc/examples/` for examples.
    cross_validation_fold : int
        Specify the fold of the cross-validation the dataloaders should be computed for.
    n_worker_processes : int
        Number of background processes to use for converting dataset to hdf5 container and trainingset dataloader.
    batch_size : int
        Number of repertoires per minibatch during training.
    inputformat : 'NCL' or 'LNC'
        Format of the input feature array;
        'NCL' -> (batchsize, channels, seq.length);
        'LNC' -> (seq.length, batchsize, channels).
    keep_dataset_in_ram : bool
        It is faster to load the full hdf5 file into the RAM instead of keeping it on the disk.
        If False, the hdf5 file will be read from the disk and consume less RAM.
    sample_n_sequences : int or None
        Optional: Random sub-sampling of `sample_n_sequences` sequences per repertoire.
        Number of sequences per repertoire might be smaller than `sample_n_sequences` if repertoire is smaller or
        random indices have been drawn multiple times.
        If None, all sequences will be loaded for each repertoire.
    verbose : bool
        Activate verbose mode
    
    Returns
    -------
    task_definition: TaskDefinition
        TaskDefinition object containing the tasks to train the DeepRC model on. See `deeprc/examples/` for examples.
    trainingset_dataloader: torch.utils.data.DataLoader
        Dataloader for trainingset with active `sample_n_sequences` (=random subsampling/dropout of repertoire
        sequences)
    trainingset_eval_dataloader: torch.utils.data.DataLoader
        Dataloader for trainingset with deactivated `sample_n_sequences`
    validationset_eval_dataloader: torch.utils.data.DataLoader
        Dataloader for validationset with deactivated `sample_n_sequences`
    testset_eval_dataloader: torch.utils.data.DataLoader
        Dataloader for testset with deactivated `sample_n_sequences`
    
    References
    ----------
    .. [1] Emerson, R. O., DeWitt, W. S., Vignali, M., Gravley, J., Hu, J. K., Osborne, E. J., Desmarais, C.,
       Klinger, M., Carlson, C. S., Hansen, J. A., et al. Immunosequencing identifies signatures of cytomegalovirus
       exposure history and HLA-mediated effects on the T cell repertoire. Nature Genetics, 49(5):659, 2017.
    """
    if dataset_path is None:
        dataset_path = os.path.join(os.path.dirname(deeprc.__file__),
                                    'datasets', 'CMV')
    os.makedirs(dataset_path, exist_ok=True)
    metadata_file = os.path.join(dataset_path, 'CMV_metadata.tsv')
    repertoiresdata_file = os.path.join(dataset_path,
                                        'CMV_repertoiresdata.hdf5')

    # Download metadata file
    if not os.path.exists(metadata_file):
        user_confirmation(
            f"File {metadata_file} not found. It will be downloaded now. Continue?",
            'y', 'n')
        url_get("https://ml.jku.at/research/DeepRC/datasets/CMV_data/metadata/cmv_emerson_2017.tsv",
                metadata_file)

    # Download repertoire file
    if not os.path.exists(repertoiresdata_file):
        user_confirmation(
            f"File {repertoiresdata_file} not found. It will be downloaded now. Continue?",
            'y', 'n')
        url_get("https://ml.jku.at/research/DeepRC/datasets/CMV_data/hdf5/cmv_emerson_2017.hdf5",
                repertoiresdata_file)

    # Get file for dataset splits
    split_file = os.path.join(os.path.dirname(deeprc.__file__), 'datasets',
                              'splits_used_in_paper', 'CMV_splits.pkl')
    with open(split_file, 'rb') as sfh:
        split_inds = pkl.load(sfh)

    # Get task_definition
    if task_definition is None:
        task_definition = TaskDefinition(targets=[
            BinaryTarget(column_name='Known CMV status', true_class_value='+')
        ])

    # Create data loaders
    trainingset_dataloader, trainingset_eval_dataloader, validationset_eval_dataloader, testset_eval_dataloader = \
        make_dataloaders(task_definition=task_definition, metadata_file=metadata_file,
                         repertoiresdata_path=repertoiresdata_file, split_inds=split_inds,
                         cross_validation_fold=cross_validation_fold, n_worker_processes=n_worker_processes,
                         batch_size=batch_size, inputformat=inputformat, keep_dataset_in_ram=keep_dataset_in_ram,
                         sample_n_sequences=sample_n_sequences, sequence_counts_scaling_fn=log_sequence_count_scaling,
                         metadata_file_id_column='Subject ID', verbose=verbose)
    return (task_definition, trainingset_dataloader,
            trainingset_eval_dataloader, validationset_eval_dataloader,
            testset_eval_dataloader)
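
For orientation, a minimal usage sketch of `cmv_dataset` (the call values are illustrative; the exact contents of each minibatch depend on `make_dataloaders`):

# Illustrative call: first cross-validation fold, default paths and default task definition.
(task_definition, trainingset_dataloader, trainingset_eval_dataloader,
 validationset_eval_dataloader, testset_eval_dataloader) = cmv_dataset(cross_validation_fold=0)
# trainingset_dataloader randomly subsamples `sample_n_sequences` sequences per repertoire;
# the *_eval_dataloader variants load all sequences of each repertoire.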
Code example #2
# Note: this excerpt begins mid-call; the lines down to `MulticlassTarget(` are
# reconstructed, and the column name is a hypothetical placeholder.
task_definition = TaskDefinition(targets=[
    MulticlassTarget(
            column_name='disease_type',  # hypothetical metadata column for this multiclass task
            possible_target_values=['type_1', 'type_2', 'type_3', 'type_4', 'type_5'],
            class_weights=[1., 1., 1., 1., 1.],
            task_weight=aux_task_weight
    )
]).to(device=device)


#
# Get dataset
#
# Get data loaders for training set and training-, validation-, and test-set in evaluation mode (=no random subsampling)
trainingset, trainingset_eval, validationset_eval, testset_eval = make_dataloaders(
        task_definition=task_definition,
        metadata_file="../datasets/example_dataset/metadata.tsv",
        repertoiresdata_path="../datasets/example_dataset/repertoires",
        metadata_file_id_column='ID',
        sequence_column='amino_acid',
        sequence_counts_column='templates',
        sample_n_sequences=args.sample_n_sequences,
        sequence_counts_scaling_fn=no_sequence_count_scaling  # Alternative: deeprc.dataset_readers.log_sequence_count_scaling
)
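
# The `sequence_counts_scaling_fn` hook above makes count handling pluggable. Below is a
# hedged sketch of a custom scaling function, assuming the hook receives a NumPy array of
# per-sequence counts and returns float features of the same shape (mirroring
# no_sequence_count_scaling / log_sequence_count_scaling); `sqrt_sequence_count_scaling`
# itself is hypothetical, not part of DeepRC.
import numpy as np

def sqrt_sequence_count_scaling(sequence_counts: np.ndarray) -> np.ndarray:
    # Square-root compression of raw per-sequence counts, clipped at 0
    return np.sqrt(np.maximum(np.asarray(sequence_counts, dtype=np.float32), 0.))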


#
# Create DeepRC Network
#
# Create sequence embedding network (for CNN, kernel_size and n_kernels are important hyper-parameters)
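# n_input_features=20+3: presumably 20 one-hot amino acid features plus 3 positional features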
sequence_embedding_network = SequenceEmbeddingCNN(n_input_features=20+3, kernel_size=args.kernel_size,
                                                  n_kernels=args.n_kernels, n_layers=1)
# Create attention network
attention_network = AttentionNetwork(n_input_features=args.n_kernels, n_layers=2, n_units=32)
# Create output network
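# The excerpt breaks off here. A hedged sketch of the remaining wiring, following the
# pattern of the DeepRC examples (`OutputNetwork`, `DeepRC`, and
# `task_definition.get_n_output_features()` are assumed library names; hyper-parameter
# values such as max_seq_len=30 are illustrative):
output_network = OutputNetwork(n_input_features=args.n_kernels,
                               n_output_features=task_definition.get_n_output_features(),
                               n_layers=1, n_units=32)
# Combine the three sub-networks into the full DeepRC model
model = DeepRC(max_seq_len=30, sequence_embedding_network=sequence_embedding_network,
               attention_network=attention_network, output_network=output_network,
               device=device).to(device=device)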
Code example #3
def cmv_implanted_dataset(dataset_path: str = None, dataset_id: int = 0, task_definition: TaskDefinition = None,
                          cross_validation_fold: int = 0, n_worker_processes: int = 4, batch_size: int = 4,
                          inputformat: str = 'NCL', keep_dataset_in_ram: bool = True,
                          sample_n_sequences: int = int(1e4), verbose: bool = True) \
        -> Tuple[TaskDefinition, DataLoader, DataLoader, DataLoader, DataLoader]:
    """Get data loaders for category "real-world immunosequencing data with implanted signals".
     
     Get data loaders for training set and training-, validation-, and test-set in evaluation mode
     (=no random subsampling) for datasets of category "real-world immunosequencing data with implanted signals".
     
    Parameters
    ----------
    dataset_path: str
        Directory to store the dataset in. If the metadata file or hdf5 container do not exist there, they will be
        downloaded. Defaults to the `datasets/CMV_with_implanted_signals` directory within the DeepRC package.
    dataset_id: int
        ID of dataset.
        0 = "One Motif 1%", 1 = "One 0.1%", 2 = "Multi 1%", 3 = "Multi 0.1%"
    task_definition: TaskDefinition
        TaskDefinition object containing the tasks to train the DeepRC model on. See `deeprc/examples/` for examples.
    cross_validation_fold : int
        Specify the fold of the cross-validation the dataloaders should be computed for.
    n_worker_processes : int
        Number of background processes to use for converting dataset to hdf5 container and trainingset dataloader.
    batch_size : int
        Number of repertoires per minibatch during training.
    inputformat : 'NCL' or 'LNC'
        Format of the input feature array;
        'NCL' -> (batchsize, channels, seq.length);
        'LNC' -> (seq.length, batchsize, channels).
    keep_dataset_in_ram : bool
        It is faster to load the full hdf5 file into the RAM instead of keeping it on the disk.
        If False, the hdf5 file will be read from the disk and consume less RAM.
    sample_n_sequences : int or None
        Optional: Random sub-sampling of `sample_n_sequences` sequences per repertoire.
        Number of sequences per repertoire might be smaller than `sample_n_sequences` if repertoire is smaller or
        random indices have been drawn multiple times.
        If None, all sequences will be loaded for each repertoire.
    verbose : bool
        Activate verbose mode
    
    Returns
    -------
    task_definition: TaskDefinition
        TaskDefinition object containing the tasks to train the DeepRC model on. See `deeprc/examples/` for examples.
    trainingset_dataloader: torch.utils.data.DataLoader
        Dataloader for trainingset with active `sample_n_sequences` (=random subsampling/dropout of repertoire
        sequences)
    trainingset_eval_dataloader: torch.utils.data.DataLoader
        Dataloader for trainingset with deactivated `sample_n_sequences`
    validationset_eval_dataloader: torch.utils.data.DataLoader
        Dataloader for validationset with deactivated `sample_n_sequences`
    testset_eval_dataloader: torch.utils.data.DataLoader
        Dataloader for testset with deactivated `sample_n_sequences`
    """
    if dataset_path is None:
        dataset_path = os.path.join(os.path.dirname(deeprc.__file__),
                                    'datasets', 'CMV_with_implanted_signals')
    os.makedirs(dataset_path, exist_ok=True)
    metadata_file = os.path.join(
        dataset_path, f'CMV_with_implanted_signals_{dataset_id}_metadata.tsv')
    repertoiresdata_file = os.path.join(
        dataset_path,
        f'CMV_with_implanted_signals_{dataset_id}_repertoiresdata.hdf5')

    # Download metadata file
    if not os.path.exists(metadata_file):
        user_confirmation(
            f"File {metadata_file} not found. It will be downloaded now. Continue?",
            'y', 'n')
        # url_get(f"https://ml.jku.at/research/DeepRC/datasets/CMV_data_with_implanted_signals/metadata/implanted_signals_{dataset_id}.csv",
        #         metadata_file)
        url_get(
            f"https://cloud.ml.jku.at/s/KQDAdHjHpdn3pzg/download?path=/datasets/CMV_data_with_implanted_signals/metadata&files=implanted_signals_{dataset_id}.tsv",
            metadata_file)

    # Download repertoire file
    if not os.path.exists(repertoiresdata_file):
        user_confirmation(
            f"File {repertoiresdata_file} not found. It will be downloaded now. Continue?",
            'y', 'n')
        url_get(
            f"https://ml.jku.at/research/DeepRC/datasets/CMV_data_with_implanted_signals/hdf5/implanted_signals_{dataset_id}.hdf5",
            repertoiresdata_file)

    # Get file for dataset splits
    split_file = os.path.join(os.path.dirname(deeprc.__file__), 'datasets',
                              'splits_used_in_paper',
                              'CMV_with_implanted_signals.pkl')
    with open(split_file, 'rb') as sfh:
        split_inds = pkl.load(sfh)

    # Get task_definition
    if task_definition is None:
        task_definition = TaskDefinition(targets=[
            BinaryTarget(column_name='status', true_class_value='True')
        ])

    # Create data loaders
    trainingset_dataloader, trainingset_eval_dataloader, validationset_eval_dataloader, testset_eval_dataloader = \
        make_dataloaders(task_definition=task_definition, metadata_file=metadata_file,
                         repertoiresdata_path=repertoiresdata_file, split_inds=split_inds,
                         cross_validation_fold=cross_validation_fold, n_worker_processes=n_worker_processes,
                         batch_size=batch_size, inputformat=inputformat, keep_dataset_in_ram=keep_dataset_in_ram,
                         sample_n_sequences=sample_n_sequences, sequence_counts_scaling_fn=no_sequence_count_scaling,
                         verbose=verbose)
    return (task_definition, trainingset_dataloader,
            trainingset_eval_dataloader, validationset_eval_dataloader,
            testset_eval_dataloader)
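
As with `cmv_dataset`, a minimal hedged usage sketch (the values are illustrative; dataset_id=2 corresponds to "Multi 1%" per the docstring):

# Illustrative call: implanted-signal dataset "Multi 1%" (dataset_id=2), first fold.
(task_definition, trainingset_dataloader, trainingset_eval_dataloader,
 validationset_eval_dataloader, testset_eval_dataloader) = cmv_implanted_dataset(
    dataset_id=2, cross_validation_fold=0)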