Exemple #1
0
def generate_data_model(data_type, verbose=False):
    """Build a test-mode data model plus the matching sample info.

    Arguments
    ---------
    data_type: training data type, forwarded to both loaders
    verbose: forwarded unchanged to both loaders

    Returns
    -------
    (model, sample_info): the TCGADataModel instance (constructed with
    test=True) and whatever du.load_sample_info returns for data_type
    """
    # the model is constructed first, then the sample info is loaded
    model = TCGADataModel(training_data=data_type, test=True, verbose=verbose)
    sample_info = du.load_sample_info(train_data_type=data_type,
                                      verbose=verbose)
    return model, sample_info
Exemple #2
0
def data_model(data_type):
    """Build a debug/test-mode data model plus the matching sample info.

    Returns a (model, sample_info) pair: the TCGADataModel instance
    (constructed with debug=True and test=True) and whatever
    du.load_sample_info returns for data_type.
    """
    # passing arguments to fixtures (like data_type here), then using them
    # in tests isn't widely documented in pytest, but seems to work
    # see, e.g. https://stackoverflow.com/a/60148972
    model = TCGADataModel(training_data=data_type, debug=True, test=True)
    sample_info = du.load_sample_info(train_data_type=data_type)
    return model, sample_info
Exemple #3
0
    # create results dir and subdir for experiment if they don't exist
    experiment_dir = Path(io_args.results_dir, 'gene').resolve()
    experiment_dir.mkdir(parents=True, exist_ok=True)

    # save model options for this experiment
    # (hyperparameters, preprocessing info, etc)
    fu.save_model_options(experiment_dir, model_options)

    # create empty log file if it doesn't exist
    log_columns = ['gene', 'titration_ratio', 'shuffle_labels', 'skip_reason']

    tcga_data = TCGADataModel(
        seed=model_options.seed,
        subset_mad_genes=model_options.subset_mad_genes,
        training_data=model_options.training_data,
        overlap_data_types=model_options.overlap_data_types,
        sample_info_df=sample_info_df,
        verbose=io_args.verbose,
        debug=model_options.debug)
    genes_df = tcga_data.load_gene_set(io_args.gene_set)

    # we want to run mutation prediction experiments:
    # - for true labels and shuffled labels
    #   (shuffled labels acts as our lower baseline)
    # - for all genes in the given gene set
    for shuffle_labels in (False, True):

        print('shuffle_labels: {}'.format(shuffle_labels))

        outer_progress = tqdm(genes_df.iterrows(),
                              total=genes_df.shape[0],
    experiment_dir = Path(io_args.results_dir, 'cancer_type').resolve()
    experiment_dir.mkdir(parents=True, exist_ok=True)

    # save model options for this experiment
    # (hyperparameters, preprocessing info, etc)
    fu.save_model_options(experiment_dir, model_options)

    # create empty error log file if it doesn't exist
    log_columns = [
        'cancer_type', 'training_data', 'shuffle_labels', 'skip_reason'
    ]

    # load data matrix for the specified data type
    tcga_data = TCGADataModel(seed=model_options.seed,
                              subset_mad_genes=model_options.subset_mad_genes,
                              training_data=model_options.training_data,
                              sample_info_df=sample_info_df,
                              verbose=io_args.verbose,
                              debug=model_options.debug)

    # we want to run cancer type classification experiments:
    # - for true labels and shuffled labels
    #   (shuffled labels acts as our lower baseline)
    # - for all cancer types in the given list of TCGA cancers
    for shuffle_labels in (False, True):

        print('shuffle_labels: {}'.format(shuffle_labels))

        progress = tqdm(io_args.cancer_types,
                        total=len(io_args.cancer_types),
                        ncols=100,
                        file=sys.stdout)
    # create results dir and subdir for experiment if they don't exist
    experiment_dir = Path(io_args.results_dir, 'gene').resolve()
    experiment_dir.mkdir(parents=True, exist_ok=True)

    # save model options for this experiment
    # (hyperparameters, preprocessing info, etc)
    fu.save_model_options(experiment_dir, model_options)

    # create empty log file if it doesn't exist
    log_columns = ['gene', 'training_data', 'shuffle_labels', 'skip_reason']

    tcga_data = TCGADataModel(
        seed=model_options.seed,
        training_data=model_options.training_data,
        overlap_data_types=model_options.overlap_data_types,
        sample_info_df=sample_info_df,
        verbose=io_args.verbose,
        debug=model_options.debug)
    genes_df = tcga_data.load_gene_set(io_args.gene_set)

    # we want to run mutation prediction experiments:
    # - for true labels and shuffled labels
    #   (shuffled labels acts as our lower baseline)
    # - for all genes in the given gene set
    for shuffle_labels in (False, True):

        print('shuffle_labels: {}'.format(shuffle_labels))

        progress = tqdm(genes_df.iterrows(),
                        total=genes_df.shape[0],
    experiment_dir = Path(io_args.results_dir, 'gene').resolve()
    experiment_dir.mkdir(parents=True, exist_ok=True)

    # save model options for this experiment
    # (hyperparameters, preprocessing info, etc)
    fu.save_model_options(experiment_dir, model_options)

    # create empty log file if it doesn't exist
    log_columns = ['gene', 'training_data', 'shuffle_labels', 'skip_reason']

    tcga_data = TCGADataModel(
        seed=model_options.seed,
        subset_mad_genes=model_options.subset_mad_genes,
        training_data=model_options.training_data,
        overlap_data_types=model_options.overlap_data_types,
        # standardize all data types
        standardize_input=[True] * len(model_options.training_data),
        n_dim=model_options.n_dim,
        sample_info_df=sample_info_df,
        verbose=io_args.verbose,
        debug=model_options.debug)
    genes_df = tcga_data.load_gene_set(io_args.gene_set)

    # we want to run mutation prediction experiments:
    # - for true labels and shuffled labels
    #   (shuffled labels acts as our lower baseline)
    # - for all genes in the given gene set
    for shuffle_labels in (False, True):

        print('shuffle_labels: {}'.format(shuffle_labels))
Exemple #7
0
    # (hyperparameters, preprocessing info, etc)
    fu.save_model_options(experiment_dir, model_options)

    # create empty log file if it doesn't exist
    log_columns = ['gene', 'training_data', 'skip_reason']
    if io_args.log_file.exists() and io_args.log_file.is_file():
        log_df = pd.read_csv(io_args.log_file, sep='\t')
    else:
        log_df = pd.DataFrame(columns=log_columns)
        log_df.to_csv(io_args.log_file, sep='\t')

    tcga_data = TCGADataModel(
        seed=model_options.seed,
        subset_mad_genes=model_options.subset_mad_genes,
        training_data=model_options.training_data,
        overlap_data_types=model_options.overlap_data_types,
        load_compressed_data=True,
        standardize_input=True,
        n_dim=N_DIM,
        sample_info_df=sample_info_df,
        verbose=io_args.verbose)
    genes_df = tcga_data.load_gene_set('vogelstein')

    progress = tqdm(genes_df.iterrows(),
                    total=genes_df.shape[0],
                    ncols=100,
                    file=sys.stdout)

    all_genes = []
    all_preds = []
    sample_list = None
    # (hyperparameters, preprocessing info, etc)
    fu.save_model_options(experiment_dir, model_options,
                          classify=model_options.classify)

    # create empty log file if it doesn't exist
    log_columns = [
        'training_data',
        'shuffle_labels',
        'skip_reason'
    ]
    log_df = None

    tcga_data = TCGADataModel(seed=model_options.seed,
                              subset_mad_genes=model_options.subset_mad_genes,
                              training_data=model_options.training_data,
                              load_compressed_data=model_options.use_compressed,
                              n_dim=model_options.n_dim,
                              sample_info_df=sample_info_df,
                              verbose=io_args.verbose,
                              debug=model_options.debug)

    # we want to run purity prediction experiments for true labels and
    # shuffled labels (the latter as a lower baseline)
    progress = tqdm([False, True],
                    ncols=100,
                    file=sys.stdout)
    for shuffle_labels in progress:
        progress.set_description('shuffle labels: {}'.format(shuffle_labels))

        try:
            output_dir = fu.make_output_dir(experiment_dir, '')
            check_file = fu.check_output_file(output_dir,
Exemple #9
0
def calculate_gene_count(overlap_data_types, seeds, num_folds):
    """Check, per seed and gene, whether every CV fold contains both labels.

    For each seed a TCGADataModel is built and the 'vogelstein' gene set is
    loaded; each gene is then processed and its stratified CV folds are
    inspected. Progress/diagnostic counts are printed to stderr.

    Arguments
    ---------
    overlap_data_types: forwarded to TCGADataModel
    seeds: iterable of seeds; each is used for the model and the CV split
    num_folds: number of CV folds to inspect for each gene

    Returns
    -------
    list of (gene, seed, valid, reason) tuples. reason is 'N/A' when valid,
    otherwise one of 'no_valid_cancer_types', 'one_train_class',
    'one_test_class'. Genes for which process_data_for_gene raises KeyError
    are skipped and do not appear in the list.
    """
    def _label_counts(status):
        # (nonzero labels, remaining labels) for a status column
        nonzero = np.count_nonzero(status)
        return nonzero, len(status) - nonzero

    results = []
    sample_info = du.load_sample_info('expression')

    for seed in seeds:
        tcga_data = TCGADataModel(seed=seed,
                                  overlap_data_types=overlap_data_types)
        gene_table = tcga_data.load_gene_set('vogelstein')

        for _ix, gene_series in gene_table.iterrows():
            gene = gene_series.gene
            print(gene, file=sys.stderr)

            try:
                tcga_data.process_data_for_gene(gene,
                                                gene_series.classification,
                                                None)
            except KeyError:
                # gene could not be processed; leave it out of the results
                continue

            print(*_label_counts(tcga_data.y_df.status), file=sys.stderr)

            # no samples left means no valid cancer types: record the gene
            # as invalid and move on to the next one
            if tcga_data.X_df.shape[0] == 0:
                results.append((gene, seed, False, 'no_valid_cancer_types'))
                continue

            # subset features to speed up CV
            tcga_data.X_df = tcga_data.X_df.iloc[:, :50]

            # walk the CV folds and stop at the first one that is missing
            # a 0 or 1 label in either the train or the test split
            verdict = (True, 'N/A')
            for fold_no in range(num_folds):
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        'ignore',
                        message='The least populated class in y')
                    X_train, X_test, _ = cv.split_stratified(
                        tcga_data.X_df,
                        sample_info,
                        num_folds=num_folds,
                        fold_no=fold_no,
                        seed=seed)

                train_labels = tcga_data.y_df.reindex(X_train.index)
                test_labels = tcga_data.y_df.reindex(X_test.index)

                # 1/0 label counts for each split
                train_counts = _label_counts(train_labels.status)
                test_counts = _label_counts(test_labels.status)
                print(fold_no, *train_counts, *test_counts, file=sys.stderr)

                if 0 in train_counts:
                    verdict = (False, 'one_train_class')
                    break
                if 0 in test_counts:
                    verdict = (False, 'one_test_class')
                    break

            results.append((gene, seed) + verdict)

    return results
Exemple #10
0
    fu.save_model_options(experiment_dir, model_options)

    # create empty log file if it doesn't exist
    log_columns = [
        'cancer_type',
        'training_data',
        'shuffle_labels',
        'skip_reason'
    ]
    log_df = None

    tcga_data = TCGADataModel(seed=model_options.seed,
                              subset_mad_genes=model_options.subset_mad_genes,
                              training_data=model_options.training_data,
                              overlap_data_types=model_options.overlap_data_types,
                              load_compressed_data=model_options.use_compressed,
                              n_dim=model_options.n_dim,
                              sample_info_df=sample_info_df,
                              verbose=io_args.verbose,
                              debug=model_options.debug)

    # we want to run MSI prediction experiments for true labels and
    # shuffled labels (the latter as a lower baseline)
    for shuffle_labels in (False, True):

        print('shuffle labels: {}'.format(shuffle_labels))

        progress = tqdm(io_args.cancer_types,
                        total=len(io_args.cancer_types),
                        ncols=100,
                        file=sys.stdout)
    # save model options for this experiment
    # (hyperparameters, preprocessing info, etc)
    fu.save_model_options(experiment_dir, model_options, 'survival')

    # create empty log file if it doesn't exist
    log_columns = [
        'cancer_type', 'training_data', 'shuffle_labels', 'skip_reason'
    ]

    tcga_data = TCGADataModel(
        seed=model_options.seed,
        subset_mad_genes=model_options.subset_mad_genes,
        training_data=model_options.training_data,
        overlap_data_types=model_options.overlap_data_types,
        load_compressed_data=(model_options.n_dim is not None),
        standardize_input=(model_options.n_dim is not None
                           and model_options.training_data
                           in cfg.standardize_data_types),
        n_dim=model_options.n_dim,
        sample_info_df=sample_info_df,
        verbose=io_args.verbose,
        debug=model_options.debug)

    # we want to run survival prediction experiments:
    # - for true labels and shuffled labels
    #   (shuffled labels acts as our lower baseline)
    # - for all cancer types provided
    for shuffle_labels in (False, True):

        print('shuffle_labels: {}'.format(shuffle_labels))