Exemple #1
0
            # Set up the output directory for this gene and have tcga_data
            # prepare the gene's data. Both handlers below write one row to the
            # log file and then `continue` to the next iteration of the
            # enclosing loop (the loop header is outside this excerpt).
            try:
                gene_dir = fu.make_output_dir(experiment_dir, gene)
                tcga_data.process_data_for_gene(
                    gene,
                    classification,
                    gene_dir,
                )
            except ResultsFileExistsError:
                # this happens if cross-validation for this gene has already been
                # run (i.e. the results file already exists)
                # NOTE(review): which of the two calls in the try body raises
                # this is not visible here -- confirm against fu / tcga_data.
                if io_args.verbose:
                    print(
                        'Skipping because results file exists already: gene {}'
                        .format(gene),
                        file=sys.stderr)
                # the log row is written whether or not verbose is set; its
                # second value is the literal 'N/A'
                log_df = fu.generate_log_df(
                    log_columns, [gene, 'N/A', shuffle_labels, 'file_exists'])
                fu.write_log_file(log_df, io_args.log_file)
                continue
            except KeyError:
                # this can happen if the given gene isn't in the mutation data
                # (unlike the file-exists case, this message is printed even
                # when io_args.verbose is false)
                print('Gene {} not found in mutation data, skipping'.format(
                    gene),
                      file=sys.stderr)
                log_df = fu.generate_log_df(
                    log_columns,
                    [gene, 'N/A', shuffle_labels, 'gene_not_found'])
                fu.write_log_file(log_df, io_args.log_file)
                continue

            # Feature count: model_options.subset_mad_genes plus the number of
            # nonzero entries of ~tcga_data.gene_features.
            # NOTE(review): presumably gene_features is a boolean mask, so the
            # second term counts the features that are not gene features --
            # confirm against the tcga_data class.
            num_feats = (model_options.subset_mad_genes +
                         np.count_nonzero(~tcga_data.gene_features))
                cancer_type_dir = fu.make_output_dir(experiment_dir,
                                                     cancer_type)
                check_file = fu.check_output_file(cancer_type_dir, cancer_type,
                                                  shuffle_labels,
                                                  model_options)
                tcga_data.process_data_for_cancer_type(cancer_type,
                                                       cancer_type_dir)
            except ResultsFileExistsError:
                # this happens if cross-validation for this cancer type has
                # already been run (i.e. the results file already exists)
                if io_args.verbose:
                    print('Skipping because results file exists already: '
                          'cancer type {}'.format(cancer_type),
                          file=sys.stderr)
                log_df = fu.generate_log_df(log_columns, [
                    cancer_type, model_options.training_data, shuffle_labels,
                    'file_exists'
                ])
                fu.write_log_file(log_df, io_args.log_file)
                continue

            try:
                # for now, don't standardize methylation data
                standardize_columns = (model_options.training_data
                                       in cfg.standardize_data_types)
                results = run_cv_stratified(
                    tcga_data, 'cancer_type', cancer_type,
                    model_options.training_data, sample_info_df,
                    model_options.num_folds, True, shuffle_labels,
                    standardize_columns, io_args.output_preds)
                # only save results if no exceptions
                fu.save_results(cancer_type_dir, check_file, results,
Exemple #3
0
        # Per-gene setup inside the enclosing loop (loop header not visible in
        # this excerpt): reset the log row, read the gene name and its
        # classification from the current gene_series record, and show the
        # gene in the progress description.
        log_df = None
        gene = gene_series.gene
        classification = gene_series.classification
        progress.set_description('gene: {}'.format(gene))

        # Prepare the data for this gene. Here the third argument is
        # experiment_dir, and filter_cancer_types is explicitly turned off.
        try:
            tcga_data.process_data_for_gene(gene,
                                            classification,
                                            experiment_dir,
                                            filter_cancer_types=False)
        except KeyError:
            # this can happen if the given gene isn't in the mutation data
            print('Gene {} not found in mutation data, skipping'.format(gene),
                  file=sys.stderr)
            # this log row carries three values (gene, training data type,
            # status) and is followed by a skip to the next loop iteration
            log_df = fu.generate_log_df(
                log_columns,
                [gene, model_options.training_data, 'gene_not_found'])
            fu.write_log_file(log_df, io_args.log_file)
            continue

        try:
            standardize_columns = (model_options.training_data
                                   in cfg.standardize_data_types)
            results = run_cv_stratified(tcga_data,
                                        'gene',
                                        gene,
                                        model_options.training_data,
                                        sample_info_df,
                                        model_options.num_folds,
                                        predictor='classify',
                                        shuffle_labels=False,
                    classification,
                    gene_dir,
                    batch_correction=model_options.batch_correction,
                    bc_cancer_type=model_options.bc_cancer_type,
                    drop_target=model_options.drop_target,
                    only_target=model_options.only_target)
            except ResultsFileExistsError:
                # this happens if cross-validation for this gene has already been
                # run (i.e. the results file already exists)
                if io_args.verbose:
                    print(
                        'Skipping because results file exists already: gene {}'
                        .format(gene),
                        file=sys.stderr)
                log_df = fu.generate_log_df(log_columns, [
                    gene, model_options.training_data, shuffle_labels,
                    'file_exists'
                ])
                fu.write_log_file(log_df, io_args.log_file)
                continue
            except KeyError:
                # this can happen if the given gene isn't in the mutation data
                print('Gene {} not found in mutation data, skipping'.format(
                    gene),
                      file=sys.stderr)
                log_df = fu.generate_log_df(log_columns, [
                    gene, model_options.training_data, shuffle_labels,
                    'gene_not_found'
                ])
                fu.write_log_file(log_df, io_args.log_file)
                continue
            # Survival setup for this cancer type: obtain check_file from
            # fu.check_output_file, then have tcga_data prepare the survival
            # data. If the results already exist, write a 'file_exists' log
            # row and skip to the next iteration of the enclosing loop.
            try:
                check_file = fu.check_output_file(experiment_dir, cancer_type,
                                                  shuffle_labels,
                                                  model_options, 'survival')
                tcga_data.process_survival_data(experiment_dir, cancer_type)
            except ResultsFileExistsError:
                # this happens if cross-validation for this cancer type has already been
                # run (i.e. the results file already exists)
                # NOTE(review): presumably raised by fu.check_output_file --
                # confirm; the raising call is not visible in this excerpt.
                if io_args.verbose:
                    print(
                        'Skipping because results file exists already: cancer type {}'
                        .format(cancer_type),
                        file=sys.stderr)
                # the log row is written whether or not verbose is set
                log_df = fu.generate_log_df(log_columns, [
                    cancer_type, model_options.training_data, shuffle_labels,
                    'file_exists'
                ])
                fu.write_log_file(log_df, io_args.log_file)
                continue

            try:
                # for now, don't standardize methylation data
                # also, always standardize PCA components, for all data types
                standardize_columns = ((model_options.n_dim is not None)
                                       or (model_options.training_data
                                           in cfg.standardize_data_types))
                results = run_cv_stratified(
                    tcga_data,
                    'survival',
                    cancer_type,
                    model_options.training_data,