def main(args):
    """Run the AdaFDR feature exploration on one GTEx data set.

    Args:
        args: Parsed command-line arguments. Two attributes are read:
            ``args.output_folder`` (suffix appended to the result
            directory name) and ``args.data_name`` (forwarded to
            ``dl.load_GTEx``).
    """
    # Set up parameters.
    alpha = 0.01
    # Set up the output folder.
    output_folder = (os.path.realpath('..')
                     + '/result_gtex_feature_explore/result_'
                     + args.output_folder)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    else:
        # Reuse the existing folder but remove the files of a previous run.
        for f in os.listdir(output_folder):
            os.remove(os.path.join(output_folder, f))
    # Load the data (the last returned value is not needed here).
    p, x, n_full, cate_name, _ = dl.load_GTEx(args.data_name,
                                              if_impute=False)
    # feature_explore
    md.adafdr_explore(p, x, alpha=alpha, n_full=n_full, vis_dim=None,
                      cate_name=cate_name, output_folder=output_folder,
                      h=None)
def main(args):
    """Run BH, SBH and AdaFDR (fast and full mode) on one GTEx data set.

    The rejection masks (``p < threshold``) of every method are collected
    in a dictionary and pickled to ``output_datafile``; progress and
    summary numbers are written to ``result.log`` in the output folder.

    Args:
        args: Parsed command-line arguments. Two attributes are read:
            ``args.output_folder`` (suffix of the result directory and
            pickle file names) and ``args.data_name`` (forwarded to
            ``dl.load_GTEx``).
    """
    # Set up parameters.
    alpha = 0.01
    n_itr = 1500
    # Set up the output folder.
    output_folder = os.path.realpath('..') + '/results/result_' + args.output_folder
    output_datafile = ('/data3/martin/gtex_data/results/result_'
                       + args.output_folder + '.pickle')
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    else:
        # Reuse the existing folder but remove the files of a previous run.
        for f in os.listdir(output_folder):
            os.remove(os.path.join(output_folder, f))
    # Load the data (the last returned value is not needed here).
    p, x, n_full, cate_name, _ = dl.load_GTEx(args.data_name)
    # Logger.
    logging.basicConfig(level=logging.INFO, format='%(module)s:: %(message)s',
                        filename=output_folder+'/result.log', filemode='w')
    logger = logging.getLogger()
    result_dic = {}
    # An overview of the data
    logger.info('# p: %s' % str(p[0:2]))
    logger.info('# x: %s' % str(x[0:2, :]))
    # Report the baseline methods.
    n_rej, t_rej = md.bh_test(p, alpha=alpha, n_full=n_full, verbose=False)
    logger.info('## BH, n_rej=%d, t_rej=%0.5f' % (n_rej, t_rej))
    result_dic['bh'] = {'h_hat': p < t_rej}
    n_rej, t_rej, pi0_hat = md.sbh_test(p, alpha=alpha, n_full=n_full,
                                        verbose=False)
    result_dic['sbh'] = {'h_hat': p < t_rej}
    logger.info('## SBH, n_rej=%d, t_rej=%0.5f, pi0_hat=%0.3f'
                % (n_rej, t_rej, pi0_hat))
    # Analysis
    md.adafdr_explore(p, x, alpha=alpha, n_full=n_full, vis_dim=None,
                      cate_name=cate_name, output_folder=output_folder,
                      h=None)
    # Fast mode (writes into its own '<output_folder>_fast' directory).
    output_folder_fast = output_folder + '_fast'
    if not os.path.exists(output_folder_fast):
        os.makedirs(output_folder_fast)
    else:
        for f in os.listdir(output_folder_fast):
            os.remove(os.path.join(output_folder_fast, f))
    logger.info('# p: %s' % str(p[0:2]))
    logger.info('# x: %s' % str(x[0:2, :]))
    start_time = time.time()
    res = md.adafdr_test(p, x, K=5, alpha=alpha, h=None, n_full=n_full,
                         n_itr=n_itr, verbose=True,
                         output_folder=output_folder_fast, random_state=0,
                         fast_mode=True)
    n_rej = res['n_rej']
    t_rej = res['threshold']
    result_dic['nfdr (fast)'] = {'h_hat': p < t_rej}
    logger.info('## nfdr2 (fast mode), n_rej1=%d, n_rej2=%d, n_rej_total=%d'
                % (n_rej[0], n_rej[1], n_rej[0] + n_rej[1]))
    logger.info('## Total time (fast mode): %0.1fs'
                % (time.time() - start_time))
    # Full mode.
    logger.info('# p: %s' % str(p[0:2]))
    logger.info('# x: %s' % str(x[0:2, :]))
    start_time = time.time()
    res = md.adafdr_test(p, x, K=5, alpha=alpha, h=None, n_full=n_full,
                         n_itr=n_itr, verbose=True,
                         output_folder=output_folder, random_state=0,
                         fast_mode=False, single_core=False)
    n_rej = res['n_rej']
    t_rej = res['threshold']
    result_dic['nfdr'] = {'h_hat': p < t_rej}
    logger.info('## nfdr2, n_rej1=%d, n_rej2=%d, n_rej_total=%d'
                % (n_rej[0], n_rej[1], n_rej[0] + n_rej[1]))
    logger.info('## Total time: %0.1fs' % (time.time() - start_time))
    # Store the result; the context manager closes the file even if
    # pickling fails.
    with open(output_datafile, 'wb') as fil:
        pickle.dump(result_dic, fil)
def _load_snp_maps(fil_path):
    """Read the comma-separated SNP table and build both lookup directions.

    Column 0 is used as the key of ``snp_sym2id`` and column 1 as its
    value; ``snp_id2sym`` is the reverse mapping.

    Returns:
        tuple: ``(snp_sym2id, snp_id2sym)``.
    """
    snp_data = np.loadtxt(fil_path, delimiter=',', dtype=str)
    snp_sym2id = {}
    snp_id2sym = {}
    for row in snp_data:
        snp_sym2id[row[0]] = row[1]
        snp_id2sym[row[1]] = row[0]
    return snp_sym2id, snp_id2sym


def _load_gene_maps(fil_path):
    """Parse the gene annotation file into id<->name dictionaries.

    Lines starting with '#' are skipped. For every other line the ninth
    tab-separated field is split on single spaces; token 1 is taken as the
    gene id and token 9 as the gene name (quotes and semicolons removed).

    Returns:
        tuple: ``(gene_sym2id, gene_id2sym)``.
    """
    gene_sym2id = {}
    gene_id2sym = {}
    with open(fil_path, "r") as fil_open:
        for line in fil_open:
            if line[0] == '#':
                continue
            attributes = line.strip().split('\t')[8].strip().split(' ')
            gene_id = attributes[1].replace('"', '').replace(';', '')
            gene_name = attributes[9].replace('"', '').replace(';', '')
            gene_id2sym[gene_id] = gene_name
            gene_sym2id[gene_name] = gene_id
    return gene_sym2id, gene_id2sym


def _load_muther(fil_path):
    """Read the MuTHER table into a ``'<col1>-<col2>' -> float(col3)`` dict.

    Rows whose column 3 contains the letter 'n' or 'N' are not stored and
    are counted instead.

    Returns:
        tuple: ``(MuTHER_dic, count_na)``.
    """
    data_muther = np.loadtxt(fil_path, delimiter=',', dtype=str)
    MuTHER_dic = {}
    count_na = 0
    for row in data_muther:
        gene_sym, snp_id = row[[1, 2]]
        if ('n' in row[3]) or ('N' in row[3]):
            count_na = count_na + 1
        else:
            MuTHER_dic[gene_sym + '-' + snp_id] = float(row[3])
    return MuTHER_dic, count_na


def main():
    """Compare SBH and nfdr discoveries on GTEx adipose data against MuTHER.

    For every data set in ``data_list`` the stored rejection masks are
    loaded, the discovered names are standardized, split into the shared
    set and the two method-specific sets, and the MuTHER p-values of each
    set are looked up. Counts and p-values are pickled per data set, and a
    plain-text record of the run is written to ``f_output``.
    """
    f_output = '/home/martin/NeuralFDR2/result_downstream_v1/recs_adipose'
    # The context manager closes the record file even when a step fails.
    with open(f_output, 'w') as fil_rec:
        # Load reference
        fil_rec.write('# Load snp and gene\n')
        # snp names
        snp_sym2id, snp_id2sym = _load_snp_maps(
            '/data3/martin/gtex_data/gtex_utils/snp_feat.txt')
        # gene names
        gene_sym2id, gene_id2sym = _load_gene_maps(
            '/data3/martin/gtex_data/gtex_utils/'
            'gencode.v19.genes.patched_contigs.gtf')
        # Load MuTHER data
        fil_rec.write('# Load MuTHER\n')
        file_muther_path = ('/data3/martin/gtex_data/MuTHER/'
                            + 'MuTHER_cis_results_chrall.txt')
        MuTHER_dic, count_na = _load_muther(file_muther_path)
        fil_rec.write('# MuTHER_dic count_na=%d\n' % count_na)
        # Process GTEx data
        fil_rec.write('# Process GTEx\n')
        data_list = ['Adipose_Subcutaneous', 'Adipose_Visceral_Omentum']
        output_folder = '/home/martin/NeuralFDR2/result_downstream_v1'
        # Maps the name used in the result file to the name expected by
        # dl.load_GTEx.
        load_gtex_data_name_dic = {
            'Adipose_Subcutaneous': 'Adipose_Subcutaneous',
            'Adipose_Subcutaneous_chr21': 'Adipose_Subcutaneous-chr21',
            'Adipose_Visceral_Omentum': 'Adipose_Visceral_Omentum',
            'Adipose_Visceral_Omentum_chr21': 'Adipose_Visceral_Omentum-chr21',
            'Cells_EBV-transformed_lymphocytes':
                'Cells_EBV-transformed_lymphocytes',
        }
        for data_name in data_list:
            # Load results
            fil_rec.write('\n' + data_name + '\n')
            res_GTEx_path = ('/data3/martin/gtex_data/results/'
                             + 'result_GTEx_%s.pickle' % data_name)
            # NOTE: pickle.load can execute arbitrary code; only load result
            # files that were produced by this pipeline.
            with open(res_GTEx_path, 'rb') as fil:
                result_dic = pickle.load(fil)
            h_hat_sbh = result_dic['sbh']['h_hat']
            h_hat_nfdr = result_dic['nfdr']['h_hat']
            fil_rec.write('# D_sbh=%d\n' % np.sum(h_hat_sbh))
            fil_rec.write('# D_nfdr=%d, D_overlap=%d\n'
                          % (np.sum(h_hat_nfdr),
                             np.sum(h_hat_sbh*h_hat_nfdr)))
            p_gtex, _, _, _, cis_name = dl.load_GTEx(
                load_gtex_data_name_dic[data_name])
            GTEx_dic = {}
            for i_cis_name, name in enumerate(cis_name):
                GTEx_dic[name] = p_gtex[i_cis_name]
            cis_name_sbh = cis_name[h_hat_sbh]
            cis_name_nfdr = cis_name[h_hat_nfdr]
            # Standardize the name.
            cis_nfdr_standard = standardize_cis_name(cis_name_nfdr,
                                                     gene_id2sym, snp_sym2id)
            cis_sbh_standard = standardize_cis_name(cis_name_sbh,
                                                    gene_id2sym, snp_sym2id)
            # Look at the difference.
            cis_nfdr_standard = set(cis_nfdr_standard)
            cis_sbh_standard = set(cis_sbh_standard)
            cis_intersect = cis_nfdr_standard & cis_sbh_standard
            cis_sbh = cis_sbh_standard - cis_intersect
            cis_nfdr = cis_nfdr_standard - cis_intersect
            # Compute the corresponding p-values
            p_MuTHER_intersect = get_MuTHER_p_value(
                cis_intersect, MuTHER_dic, GTEx_dic, gene_sym2id, snp_id2sym,
                fil_rec)
            p_MuTHER_sbh = get_MuTHER_p_value(
                cis_sbh, MuTHER_dic, GTEx_dic, gene_sym2id, snp_id2sym,
                fil_rec)
            p_MuTHER_nfdr = get_MuTHER_p_value(
                cis_nfdr, MuTHER_dic, GTEx_dic, gene_sym2id, snp_id2sym,
                fil_rec)
            # Save results: four objects are pickled back to back into the
            # same file (counts first, then the three filtered arrays); only
            # rows whose first column is below 1 are kept.
            n_counts = np.array([len(cis_intersect), len(cis_sbh),
                                 len(cis_nfdr)])
            out_path = output_folder + '/p_overlap_%s.pickle' % data_name
            with open(out_path, 'wb') as fil:
                pickle.dump(n_counts, fil)
                pickle.dump(
                    p_MuTHER_intersect[p_MuTHER_intersect[:, 0] < 1, :], fil)
                pickle.dump(p_MuTHER_sbh[p_MuTHER_sbh[:, 0] < 1, :], fil)
                pickle.dump(p_MuTHER_nfdr[p_MuTHER_nfdr[:, 0] < 1, :], fil)
def main(args):
    """Run BH, SBH and AdaFDR on one GTEx data set, one covariate at a time.

    AdaFDR (fast and full mode) is run five times: once for each of the
    first four columns of ``x`` on its own and once with all of ``x``.
    The rejection masks (``p < threshold``) are collected in a dictionary
    and pickled to ``output_datafile``; summary numbers are written to
    ``result.log`` in the output folder.

    Args:
        args: Parsed command-line arguments. Two attributes are read:
            ``args.output_folder`` (suffix of the result directory and
            pickle file names) and ``args.data_name`` (forwarded to
            ``dl.load_GTEx``).
    """
    # Set up parameters.
    alpha = 0.01
    n_itr = 1500
    # Set up the output folder.
    output_folder = (os.path.realpath('..') + '/results/result_univariate_'
                     + args.output_folder)
    output_datafile = ('/data3/martin/gtex_data/results_uni_covariate/result_'
                       + args.output_folder + '.pickle')
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    else:
        # Reuse the existing folder but remove the files of a previous run.
        for f in os.listdir(output_folder):
            os.remove(os.path.join(output_folder, f))
    # Load the data (the last returned value is not needed here).
    p, x, n_full, cate_name, _ = dl.load_GTEx(args.data_name)
    # Logger.
    logging.basicConfig(level=logging.INFO, format='%(message)s',
                        filename=output_folder+'/result.log', filemode='w')
    logger = logging.getLogger()
    result_dic = {}
    # An overview of the data
    logger.info('# p: %s' % str(p[0:2]))
    logger.info('# x: %s' % str(x[0:2, :]))
    # Report the baseline methods.
    n_rej, t_rej = md.bh_test(p, alpha=alpha, n_full=n_full, verbose=False)
    logger.info('## BH, n_rej=%d, t_rej=%0.5f' % (n_rej, t_rej))
    result_dic['bh'] = {'h_hat': p < t_rej}
    n_rej, t_rej, pi0_hat = md.sbh_test(p, alpha=alpha, n_full=n_full,
                                        verbose=False)
    result_dic['sbh'] = {'h_hat': p < t_rej}
    logger.info('## SBH, n_rej=%d, t_rej=%0.5f, pi0_hat=%0.3f\n'
                % (n_rej, t_rej, pi0_hat))
    # Analysis
    md.adafdr_explore(p, x, alpha=alpha, n_full=n_full, vis_dim=None,
                      cate_name=cate_name, output_folder=output_folder,
                      h=None)
    # Four covariates separately, then all of them together.
    cov_list = ['exp', 'maf', 'dist', 'chromotin', 'all']
    for i_cov, cov_name in enumerate(cov_list):
        logger.info('Covariate: %s' % cov_name)
        if i_cov < 4:
            # Keep a 2-D array with a single column.
            temp_x = x[:, i_cov].reshape([-1, 1])
        else:
            temp_x = x
        # Fast mode (no per-run output folder is passed).
        logger.info('# p: %s' % str(p[0:2]))
        logger.info('# x: %s' % str(temp_x[0:2, :]))
        start_time = time.time()
        res = md.adafdr_test(p, temp_x, K=5, alpha=alpha, h=None,
                             n_full=n_full, n_itr=n_itr, verbose=True,
                             output_folder=None, random_state=0,
                             fast_mode=True)
        n_rej = res['n_rej']
        t_rej = res['threshold']
        result_dic['nfdr (fast)_%d' % i_cov] = {'h_hat': p < t_rej}
        logger.info(
            '## AdaFDR (fast mode), feature=%d, n_rej1=%d, n_rej2=%d, n_rej_total=%d'
            % (i_cov, n_rej[0], n_rej[1], n_rej[0] + n_rej[1]))
        logger.info('## Total time (fast mode): %0.1fs'
                    % (time.time() - start_time))
        # Full mode.
        logger.info('# p: %s' % str(p[0:2]))
        logger.info('# x: %s' % str(temp_x[0:2, :]))
        start_time = time.time()
        res = md.adafdr_test(p, temp_x, K=5, alpha=alpha, h=None,
                             n_full=n_full, n_itr=n_itr, verbose=True,
                             output_folder=None, random_state=0,
                             fast_mode=False, single_core=False)
        n_rej = res['n_rej']
        t_rej = res['threshold']
        result_dic['nfdr_%d' % i_cov] = {'h_hat': p < t_rej}
        logger.info('## nfdr2, n_rej1=%d, n_rej2=%d, n_rej_total=%d'
                    % (n_rej[0], n_rej[1], n_rej[0] + n_rej[1]))
        logger.info('## Total time: %0.1fs' % (time.time() - start_time))
        logger.info(' ')
    # Store the result; the context manager closes the file even if
    # pickling fails.
    with open(output_datafile, 'wb') as fil:
        pickle.dump(result_dic, fil)