log_df = pd.read_csv(io_args.log_file, sep='\t') else: log_df = pd.DataFrame(columns=log_columns) log_df.to_csv(io_args.log_file, sep='\t') tcga_data = TCGADataModel( seed=model_options.seed, subset_mad_genes=model_options.subset_mad_genes, training_data=model_options.training_data, overlap_data_types=model_options.overlap_data_types, load_compressed_data=True, standardize_input=True, n_dim=N_DIM, sample_info_df=sample_info_df, verbose=io_args.verbose) genes_df = tcga_data.load_gene_set('vogelstein') progress = tqdm(genes_df.iterrows(), total=genes_df.shape[0], ncols=100, file=sys.stdout) all_genes = [] all_preds = [] sample_list = None for gene_idx, gene_series in progress: log_df = None gene = gene_series.gene classification = gene_series.classification progress.set_description('gene: {}'.format(gene))
# save model options for this experiment # (hyperparameters, preprocessing info, etc) fu.save_model_options(experiment_dir, model_options) # create empty log file if it doesn't exist log_columns = ['gene', 'titration_ratio', 'shuffle_labels', 'skip_reason'] tcga_data = TCGADataModel( seed=model_options.seed, subset_mad_genes=model_options.subset_mad_genes, training_data=model_options.training_data, overlap_data_types=model_options.overlap_data_types, sample_info_df=sample_info_df, verbose=io_args.verbose, debug=model_options.debug) genes_df = tcga_data.load_gene_set(io_args.gene_set) # we want to run mutation prediction experiments: # - for true labels and shuffled labels # (shuffled labels acts as our lower baseline) # - for all genes in the given gene set for shuffle_labels in (False, True): print('shuffle_labels: {}'.format(shuffle_labels)) outer_progress = tqdm(genes_df.iterrows(), total=genes_df.shape[0], ncols=100, file=sys.stdout) for gene_idx, gene_series in outer_progress:
def _count_binary_labels(status):
    """Return ``(n_nonzero, n_zero)`` for a column of status labels."""
    n_nonzero = np.count_nonzero(status)
    return n_nonzero, len(status) - n_nonzero


def calculate_gene_count(overlap_data_types, seeds, num_folds):
    """Check, for each seed and gene, whether every CV fold has both labels.

    For every seed a ``TCGADataModel`` is built with the given
    ``overlap_data_types`` and the 'vogelstein' gene set is loaded. Each gene
    is processed and split into ``num_folds`` stratified folds; the gene/seed
    pair is marked invalid as soon as one fold's train or test labels contain
    only a single class.

    Arguments
    ---------
    overlap_data_types: passed through to ``TCGADataModel``
    seeds: iterable of random seeds to evaluate
    num_folds: number of cross-validation folds to check

    Returns
    -------
    list of ``(gene, seed, is_valid, reason)`` tuples, where ``reason`` is
    one of 'N/A', 'no_valid_cancer_types', 'one_train_class' or
    'one_test_class'. Genes for which processing raises ``KeyError`` are
    skipped and do not appear in the list.

    Progress/diagnostic counts are printed to stderr.
    """
    results = []
    sample_info_df = du.load_sample_info('expression')

    for seed in seeds:
        tcga_data = TCGADataModel(seed=seed,
                                  overlap_data_types=overlap_data_types)
        genes_df = tcga_data.load_gene_set('vogelstein')

        for _, gene_row in genes_df.iterrows():
            gene = gene_row.gene
            print(gene, file=sys.stderr)

            try:
                tcga_data.process_data_for_gene(gene,
                                                gene_row.classification,
                                                None)
            except KeyError:
                # gene could not be processed; leave it out of the results
                continue
            else:
                n_pos, n_neg = _count_binary_labels(tcga_data.y_df.status)
                print(n_pos, n_neg, file=sys.stderr)

            # no samples left means no valid cancer types: record that and
            # move on to the next gene
            if tcga_data.X_df.shape[0] == 0:
                results.append((gene, seed, False, 'no_valid_cancer_types'))
                continue

            # only the sample index matters for the split, so keep just the
            # first 50 feature columns to make CV faster
            tcga_data.X_df = tcga_data.X_df.iloc[:, :50]

            # walk the CV folds; stop at the first fold whose train or test
            # labels are missing one of the two classes
            outcome = (True, 'N/A')
            for fold_no in range(num_folds):
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        'ignore',
                        message='The least populated class in y'
                    )
                    X_train, X_test, _ = cv.split_stratified(
                        tcga_data.X_df,
                        sample_info_df,
                        num_folds=num_folds,
                        fold_no=fold_no,
                        seed=seed
                    )

                train_labels = tcga_data.y_df.reindex(X_train.index)
                test_labels = tcga_data.y_df.reindex(X_test.index)

                train_pos, train_neg = _count_binary_labels(train_labels.status)
                test_pos, test_neg = _count_binary_labels(test_labels.status)
                print(fold_no, train_pos, train_neg, test_pos, test_neg,
                      file=sys.stderr)

                if train_pos == 0 or train_neg == 0:
                    outcome = (False, 'one_train_class')
                    break
                if test_pos == 0 or test_neg == 0:
                    outcome = (False, 'one_test_class')
                    break

            is_valid, reason = outcome
            results.append((gene, seed, is_valid, reason))

    return results