def main(args):
    """Run ``md.adafdr_explore`` on a GTEx dataset, writing into a clean folder.

    The output folder is ``<realpath('..')>/result_gtex_feature_explore/
    result_<args.output_folder>``. It is created when missing; otherwise every
    entry already in it is deleted before the run.

    Args:
        args: Object exposing ``output_folder`` (suffix of the result folder
            name) and ``data_name`` (forwarded to ``dl.load_GTEx``).
    """
    # Set up parameters.
    alpha = 0.01
    # Set up the output folder.
    output_folder = os.path.realpath('..') + '/result_gtex_feature_explore/result_' \
                    + args.output_folder
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    else:
        # Clear leftovers from a previous run. os.remove raises on
        # directories, so the folder is assumed to contain only files.
        for f in os.listdir(output_folder):
            os.remove(os.path.join(output_folder, f))
    # Load the data (the last returned value, cis_name, is not needed here).
    p, x, n_full, cate_name, _ = dl.load_GTEx(args.data_name,
                                              if_impute=False)
    # feature_explore
    md.adafdr_explore(p, x, alpha=alpha, n_full=n_full, vis_dim=None,
                      cate_name=cate_name, output_folder=output_folder, h=None)
Ejemplo n.º 2
0
def main(args):
    """Run BH, SBH and AdaFDR (fast and full mode) on a GTEx dataset.

    For every method the mask ``p < threshold`` is stored under ``'h_hat'`` in
    a result dictionary (keys ``'bh'``, ``'sbh'``, ``'nfdr (fast)'``,
    ``'nfdr'``), which is pickled to
    ``/data3/martin/gtex_data/results/result_<args.output_folder>.pickle``.
    Progress is logged to ``result.log`` inside the output folder.

    Args:
        args: Object exposing ``output_folder`` (suffix of the result folder
            and pickle names) and ``data_name`` (forwarded to
            ``dl.load_GTEx``).
    """

    def _reset_folder(folder):
        # Create `folder` if missing; otherwise delete every entry in it.
        # os.remove raises on directories, so only files are expected there.
        if not os.path.exists(folder):
            os.makedirs(folder)
        else:
            for f in os.listdir(folder):
                os.remove(os.path.join(folder, f))

    # Set up parameters.
    alpha = 0.01
    n_itr = 1500
    # Set up the output folder.
    output_folder = os.path.realpath(
        '..') + '/results/result_' + args.output_folder
    output_datafile = '/data3/martin/gtex_data/results/result_' + args.output_folder + '.pickle'
    _reset_folder(output_folder)
    # Load the data (the last returned value, cis_name, is not needed here).
    p, x, n_full, cate_name, _ = dl.load_GTEx(args.data_name)
    # Logger: configured after the folder was cleared so result.log survives.
    logging.basicConfig(level=logging.INFO, format='%(module)s:: %(message)s',
                        filename=output_folder + '/result.log', filemode='w')
    logger = logging.getLogger()
    result_dic = {}
    # An overview of the data
    logger.info('# p: %s', str(p[0:2]))
    logger.info('# x: %s', str(x[0:2, :]))
    # Report the baseline methods.
    n_rej, t_rej = md.bh_test(p, alpha=alpha, n_full=n_full, verbose=False)
    logger.info('## BH, n_rej=%d, t_rej=%0.5f', n_rej, t_rej)
    result_dic['bh'] = {'h_hat': p < t_rej}
    n_rej, t_rej, pi0_hat = md.sbh_test(p,
                                        alpha=alpha,
                                        n_full=n_full,
                                        verbose=False)
    result_dic['sbh'] = {'h_hat': p < t_rej}
    logger.info('## SBH, n_rej=%d, t_rej=%0.5f, pi0_hat=%0.3f',
                n_rej, t_rej, pi0_hat)
    # Analysis
    md.adafdr_explore(p, x, alpha=alpha, n_full=n_full, vis_dim=None,
                      cate_name=cate_name, output_folder=output_folder, h=None)
    # Fast mode (writes into its own sibling folder).
    output_folder_fast = output_folder + '_fast'
    _reset_folder(output_folder_fast)
    logger.info('# p: %s', str(p[0:2]))
    logger.info('# x: %s', str(x[0:2, :]))
    start_time = time.time()
    res = md.adafdr_test(p, x, K=5, alpha=alpha, h=None, n_full=n_full,
                         n_itr=n_itr, verbose=True,
                         output_folder=output_folder_fast, random_state=0,
                         fast_mode=True)
    n_rej = res['n_rej']
    t_rej = res['threshold']
    result_dic['nfdr (fast)'] = {'h_hat': p < t_rej}
    logger.info('## nfdr2 (fast mode), n_rej1=%d, n_rej2=%d, n_rej_total=%d',
                n_rej[0], n_rej[1], n_rej[0] + n_rej[1])
    logger.info('## Total time (fast mode): %0.1fs',
                time.time() - start_time)
    # Full mode.
    logger.info('# p: %s', str(p[0:2]))
    logger.info('# x: %s', str(x[0:2, :]))
    start_time = time.time()
    res = md.adafdr_test(p, x, K=5, alpha=alpha, h=None, n_full=n_full,
                         n_itr=n_itr, verbose=True,
                         output_folder=output_folder, random_state=0,
                         fast_mode=False, single_core=False)
    n_rej = res['n_rej']
    t_rej = res['threshold']
    result_dic['nfdr'] = {'h_hat': p < t_rej}
    logger.info('## nfdr2, n_rej1=%d, n_rej2=%d, n_rej_total=%d',
                n_rej[0], n_rej[1], n_rej[0] + n_rej[1])
    logger.info('## Total time: %0.1fs', time.time() - start_time)
    # Store the result; the context manager closes the file even if the
    # dump fails.
    with open(output_datafile, 'wb') as fil:
        pickle.dump(result_dic, fil)
Ejemplo n.º 3
0
def main():
    """Compare SBH and NFDR discoveries on GTEx adipose data against MuTHER.

    For each dataset in ``data_list`` the pickled ``h_hat`` masks of the
    ``'sbh'`` and ``'nfdr'`` results are loaded, the selected cis names are
    standardized via ``standardize_cis_name``, and split into three sets:
    found by both methods, only by SBH, only by NFDR. For each set
    ``get_MuTHER_p_value`` is called, and the set sizes plus the returned rows
    whose first column is below 1 are pickled to
    ``<output_folder>/p_overlap_<data_name>.pickle``. A plain-text record of
    the run is written to ``f_output``.

    All input and output paths are hard-coded.
    """
    # Load reference
    f_output = '/home/martin/NeuralFDR2/result_downstream_v1/recs_adipose'
    # The record file stays open for the whole run; `with` guarantees it is
    # closed (and flushed) even when a later step raises.
    with open(f_output, 'w') as fil_rec:
        fil_rec.write('# Load snp and gene\n')
        # snp names: column 0 is mapped to column 1 and back.
        fil_path = '/data3/martin/gtex_data/gtex_utils/snp_feat.txt'
        snp_data = np.loadtxt(fil_path, delimiter=',', dtype=str)
        snp_sym2id = {}
        snp_id2sym = {}
        for snp_sym, snp_ident in zip(snp_data[:, 0], snp_data[:, 1]):
            snp_sym2id[snp_sym] = snp_ident
            snp_id2sym[snp_ident] = snp_sym
        # gene names
        fil_path = '/data3/martin/gtex_data/gtex_utils/gencode.v19.genes.patched_contigs.gtf'
        gene_sym2id = {}
        gene_id2sym = {}
        with open(fil_path, "r") as fil_open:
            for line in fil_open:
                # Lines starting with '#' are skipped as header/comment lines.
                if line.startswith('#'):
                    continue
                fields = line.strip().split('\t')
                # The 9th tab-separated field is split on spaces; token 1 is
                # used as the gene id and token 9 as the gene name, with
                # quotes and semicolons stripped.
                attrs = fields[8].strip().split(' ')
                gene_id = attrs[1].replace('"', '').replace(';', '')
                gene_name = attrs[9].replace('"', '').replace(';', '')
                gene_id2sym[gene_id] = gene_name
                gene_sym2id[gene_name] = gene_id
        # Load MuTHER data
        fil_rec.write('# Load MuTHER\n')
        file_muther_path = '/data3/martin/gtex_data/MuTHER/' + 'MuTHER_cis_results_chrall.txt'
        data_muther = np.loadtxt(file_muther_path, delimiter=',', dtype=str)
        MuTHER_dic = {}
        count_na = 0
        for gene_sym, snp_id, p_str in zip(data_muther[:, 1],
                                           data_muther[:, 2],
                                           data_muther[:, 3]):
            # Values containing 'n' or 'N' are counted as missing and skipped
            # (presumably NA/nan markers -- confirm against the data file).
            if ('n' in p_str) or ('N' in p_str):
                count_na = count_na + 1
            else:
                MuTHER_dic[gene_sym + '-' + snp_id] = float(p_str)
        fil_rec.write('# MuTHER_dic count_na=%d\n' % count_na)
        # Process GTEx data
        fil_rec.write('# Process GTEx\n')
        data_list = ['Adipose_Subcutaneous', 'Adipose_Visceral_Omentum']
        output_folder = '/home/martin/NeuralFDR2/result_downstream_v1'
        # Maps a result-file dataset name to the name given to dl.load_GTEx.
        # It also covers names not currently in data_list (chr21 subsets,
        # lymphocytes), so data_list can be changed without editing the map.
        load_gtex_data_name_dic = {
            'Adipose_Subcutaneous': 'Adipose_Subcutaneous',
            'Adipose_Subcutaneous_chr21': 'Adipose_Subcutaneous-chr21',
            'Adipose_Visceral_Omentum': 'Adipose_Visceral_Omentum',
            'Adipose_Visceral_Omentum_chr21': 'Adipose_Visceral_Omentum-chr21',
            'Cells_EBV-transformed_lymphocytes':
            'Cells_EBV-transformed_lymphocytes'
        }
        for data_name in data_list:
            # Load results
            fil_rec.write('\n' + data_name + '\n')
            res_GTEx_path = '/data3/martin/gtex_data/results/' + \
                            'result_GTEx_%s.pickle' % data_name
            # NOTE: pickle.load executes arbitrary code from the file; only
            # point this at result files produced by a trusted run.
            with open(res_GTEx_path, 'rb') as fil:
                result_dic = pickle.load(fil)
            h_hat_sbh = result_dic['sbh']['h_hat']
            h_hat_nfdr = result_dic['nfdr']['h_hat']
            fil_rec.write('# D_sbh=%d\n' % np.sum(h_hat_sbh))
            fil_rec.write('# D_nfdr=%d, D_overlap=%d\n'
                          % (np.sum(h_hat_nfdr), np.sum(h_hat_sbh * h_hat_nfdr)))
            p_gtex, _, _, _, cis_name = dl.load_GTEx(
                load_gtex_data_name_dic[data_name])
            # Map every cis name to its GTEx p-value.
            GTEx_dic = {}
            for i_cis, name in enumerate(cis_name):
                GTEx_dic[name] = p_gtex[i_cis]
            cis_name_sbh = cis_name[h_hat_sbh]
            cis_name_nfdr = cis_name[h_hat_nfdr]
            # Standardize the name.
            cis_nfdr_standard = standardize_cis_name(cis_name_nfdr, gene_id2sym,
                                                     snp_sym2id)
            cis_sbh_standard = standardize_cis_name(cis_name_sbh, gene_id2sym,
                                                    snp_sym2id)
            # Look at the difference: shared discoveries vs. method-specific.
            cis_nfdr_standard = set(cis_nfdr_standard)
            cis_sbh_standard = set(cis_sbh_standard)
            cis_intersect = cis_nfdr_standard & cis_sbh_standard
            cis_sbh = cis_sbh_standard - cis_intersect
            cis_nfdr = cis_nfdr_standard - cis_intersect
            # Compute the corresponding p-values
            p_MuTHER_intersect = get_MuTHER_p_value(cis_intersect, MuTHER_dic,
                                                    GTEx_dic, gene_sym2id,
                                                    snp_id2sym, fil_rec)
            p_MuTHER_sbh = get_MuTHER_p_value(cis_sbh, MuTHER_dic, GTEx_dic,
                                              gene_sym2id, snp_id2sym, fil_rec)
            p_MuTHER_nfdr = get_MuTHER_p_value(cis_nfdr, MuTHER_dic, GTEx_dic,
                                               gene_sym2id, snp_id2sym, fil_rec)
            # Save results: four consecutive pickles in one file (counts, then
            # the three arrays restricted to rows with first column < 1).
            n_counts = np.array([len(cis_intersect), len(cis_sbh), len(cis_nfdr)])
            with open(output_folder + '/p_overlap_%s.pickle' % data_name, 'wb') as fil:
                pickle.dump(n_counts, fil)
                pickle.dump(p_MuTHER_intersect[p_MuTHER_intersect[:, 0] < 1, :], fil)
                pickle.dump(p_MuTHER_sbh[p_MuTHER_sbh[:, 0] < 1, :], fil)
                pickle.dump(p_MuTHER_nfdr[p_MuTHER_nfdr[:, 0] < 1, :], fil)
def main(args):
    """Run BH, SBH and AdaFDR per covariate on a GTEx dataset.

    After the BH/SBH baselines and ``md.adafdr_explore``, ``md.adafdr_test``
    is run in fast and full mode five times: once for each of the first four
    columns of ``x`` on its own, and once with all of ``x``. The mask
    ``p < threshold`` of each run is stored under ``'h_hat'`` in a result
    dictionary (keys ``'bh'``, ``'sbh'``, ``'nfdr (fast)_<i>'``,
    ``'nfdr_<i>'``), which is pickled to
    ``/data3/martin/gtex_data/results_uni_covariate/
    result_<args.output_folder>.pickle``. Progress is logged to ``result.log``
    inside the output folder.

    Args:
        args: Object exposing ``output_folder`` (suffix of the result folder
            and pickle names) and ``data_name`` (forwarded to
            ``dl.load_GTEx``).
    """
    # Set up parameters.
    alpha = 0.01
    n_itr = 1500
    # Set up the output folder.
    output_folder = os.path.realpath(
        '..') + '/results/result_univariate_' + args.output_folder
    output_datafile = '/data3/martin/gtex_data/results_uni_covariate/result_' + \
                      args.output_folder + '.pickle'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    else:
        # Clear leftovers from a previous run. os.remove raises on
        # directories, so the folder is assumed to contain only files.
        for f in os.listdir(output_folder):
            os.remove(os.path.join(output_folder, f))
    # Load the data (the last returned value, cis_name, is not needed here).
    p, x, n_full, cate_name, _ = dl.load_GTEx(args.data_name)
    # Logger: configured after the folder was cleared so result.log survives.
    logging.basicConfig(level=logging.INFO, format='%(message)s',
                        filename=output_folder + '/result.log', filemode='w')
    logger = logging.getLogger()
    result_dic = {}
    # An overview of the data
    logger.info('# p: %s', str(p[0:2]))
    logger.info('# x: %s', str(x[0:2, :]))
    # Report the baseline methods.
    n_rej, t_rej = md.bh_test(p, alpha=alpha, n_full=n_full, verbose=False)
    logger.info('## BH, n_rej=%d, t_rej=%0.5f', n_rej, t_rej)
    result_dic['bh'] = {'h_hat': p < t_rej}
    n_rej, t_rej, pi0_hat = md.sbh_test(p,
                                        alpha=alpha,
                                        n_full=n_full,
                                        verbose=False)
    result_dic['sbh'] = {'h_hat': p < t_rej}
    logger.info('## SBH, n_rej=%d, t_rej=%0.5f, pi0_hat=%0.3f\n',
                n_rej, t_rej, pi0_hat)
    # Analysis
    md.adafdr_explore(p, x, alpha=alpha, n_full=n_full, vis_dim=None,
                      cate_name=cate_name, output_folder=output_folder, h=None)
    # Four covariates separately, then all of them together.
    cov_list = ['exp', 'maf', 'dist', 'chromotin', 'all']
    for i_cov, cov_name in enumerate(cov_list):
        logger.info('Covariate: %s', cov_name)
        if i_cov < 4:
            # Single covariate: keep it 2-D with one column.
            temp_x = x[:, i_cov].reshape([-1, 1])
        else:
            temp_x = x
        # Fast mode (no output folder is passed for per-covariate runs).
        logger.info('# p: %s', str(p[0:2]))
        logger.info('# x: %s', str(temp_x[0:2, :]))
        start_time = time.time()
        res = md.adafdr_test(p,
                             temp_x,
                             K=5,
                             alpha=alpha,
                             h=None,
                             n_full=n_full,
                             n_itr=n_itr,
                             verbose=True,
                             output_folder=None,
                             random_state=0,
                             fast_mode=True)
        n_rej = res['n_rej']
        t_rej = res['threshold']
        result_dic['nfdr (fast)_%d' % i_cov] = {'h_hat': p < t_rej}
        logger.info(
            '## AdaFDR (fast mode), feature=%d, n_rej1=%d, n_rej2=%d, n_rej_total=%d',
            i_cov, n_rej[0], n_rej[1], n_rej[0] + n_rej[1])
        logger.info('## Total time (fast mode): %0.1fs',
                    time.time() - start_time)
        # Full mode.
        logger.info('# p: %s', str(p[0:2]))
        logger.info('# x: %s', str(temp_x[0:2, :]))
        start_time = time.time()
        res = md.adafdr_test(p, temp_x, K=5, alpha=alpha, h=None,
                             n_full=n_full, n_itr=n_itr, verbose=True,
                             output_folder=None, random_state=0,
                             fast_mode=False, single_core=False)
        n_rej = res['n_rej']
        t_rej = res['threshold']
        result_dic['nfdr_%d' % i_cov] = {'h_hat': p < t_rej}
        logger.info('## nfdr2, n_rej1=%d, n_rej2=%d, n_rej_total=%d',
                    n_rej[0], n_rej[1], n_rej[0] + n_rej[1])
        logger.info('## Total time: %0.1fs', time.time() - start_time)
        logger.info(' ')
    # Store the result; the context manager closes the file even if the
    # dump fails.
    with open(output_datafile, 'wb') as fil:
        pickle.dump(result_dic, fil)