Esempio n. 1
0
def test_adafdr_retest():
    """Check that adafdr_retest agrees with a direct adafdr_test run.

    A result obtained at alpha=0.1 is re-thresholded to alpha=0.02 with
    ``md.adafdr_retest`` and compared with ``md.adafdr_test`` run directly
    at alpha=0.02. Fewer than 10 decisions are allowed to differ.
    """
    target_alpha = 0.02
    p, x, _h, _n_full, _ = dl.load_2d_bump_slope(n_sample=20000)
    # Reference fit at the looser level, then a direct fit at the target level.
    res = md.adafdr_test(p, x, alpha=0.1, single_core=True)
    res_direct = md.adafdr_test(p, x, alpha=target_alpha, single_core=True)
    res_retest = md.adafdr_retest(res, alpha=target_alpha)
    decision_direct = res_direct['decision']
    decision_retest = res_retest['decision']
    print('adafdr_test discoveries at alpha=0.02:', np.sum(decision_direct))
    print('adafdr_retest discoveries at alpha=0.02:', np.sum(decision_retest))
    n_diff = np.sum(decision_direct != decision_retest)
    print('# diff', n_diff)
    assert n_diff < 10
Esempio n. 2
0
def test_adafdr_test():
    """Run adafdr_test on the 2d bump-slope data and check its output.

    The test requires more than 700 rejections and a false discovery
    proportion (computed against the labels ``h``) below 0.12.
    """
    p, x, h, n_full, _ = dl.load_2d_bump_slope(n_sample=20000)
    test_kwargs = dict(K=2, alpha=0.1, h=None, n_full=n_full,
                       n_itr=50, verbose=False, random_state=0,
                       fast_mode=False, single_core=True)
    res = md.adafdr_test(p, x, **test_kwargs)
    threshold = res['threshold']
    # Hypotheses whose p-value falls strictly below the threshold.
    rejected = p < threshold
    n_rej = np.sum(rejected)
    # Share of rejections whose label is 0.
    FDP = np.sum(rejected * (h == 0)) / n_rej
    print('n_rej', n_rej)
    assert n_rej > 700
    print('FDP', FDP)
    assert FDP < 0.12
Esempio n. 3
0
def main(args):
    """Run BH, SBH and AdaFDR (fast and full mode) on one GTEx data set.

    Args:
        args: Parsed command-line arguments. ``args.output_folder`` is used
            to name the result folder and the pickle file, and
            ``args.data_name`` is passed to ``dl.load_GTEx``.

    Side effects:
        Creates (or empties) the result folder and its ``_fast`` sibling,
        writes ``result.log`` into the result folder, and pickles a dict
        mapping method name to ``{'h_hat': p < threshold}`` into the
        hard-coded data file path.
    """
    def _reset_folder(folder):
        # Create the folder, or delete every entry directly inside it.
        if not os.path.exists(folder):
            os.makedirs(folder)
            return
        for entry in os.listdir(folder):
            os.remove(os.path.join(folder, entry))

    # Set up parameters.
    alpha = 0.01
    n_itr = 1500
    # Set up the output folder.
    output_folder = os.path.realpath(
        '..') + '/results/result_' + args.output_folder
    output_datafile = '/data3/martin/gtex_data/results/result_' + args.output_folder + '.pickle'
    _reset_folder(output_folder)
    # Load the data.
    p, x, n_full, cate_name, _ = dl.load_GTEx(args.data_name)
    # Logger.
    logging.basicConfig(level=logging.INFO, format='%(module)s:: %(message)s',
                        filename=output_folder+'/result.log', filemode='w')
    logger = logging.getLogger()
    result_dic = {}
    # An overview of the data
    logger.info('# p: %s' % str(p[0:2]))
    logger.info('# x: %s' % str(x[0:2, :]))
    # Report the baseline methods.
    n_rej, t_rej = md.bh_test(p, alpha=alpha, n_full=n_full, verbose=False)
    logger.info('## BH, n_rej=%d, t_rej=%0.5f' % (n_rej, t_rej))
    result_dic['bh'] = {'h_hat': p < t_rej}
    n_rej, t_rej, pi0_hat = md.sbh_test(p,
                                        alpha=alpha,
                                        n_full=n_full,
                                        verbose=False)
    result_dic['sbh'] = {'h_hat': p < t_rej}
    logger.info('## SBH, n_rej=%d, t_rej=%0.5f, pi0_hat=%0.3f' %
                (n_rej, t_rej, pi0_hat))
    # Analysis
    md.adafdr_explore(p, x, alpha=alpha, n_full=n_full, vis_dim=None,
                      cate_name=cate_name, output_folder=output_folder, h=None)
    # Fast mode.
    output_folder_fast = output_folder + '_fast'
    _reset_folder(output_folder_fast)
    logger.info('# p: %s' % str(p[0:2]))
    logger.info('# x: %s' % str(x[0:2, :]))
    start_time = time.time()
    res = md.adafdr_test(p, x, K=5, alpha=alpha, h=None, n_full=n_full,
                         n_itr=n_itr, verbose=True,
                         output_folder=output_folder_fast, random_state=0,
                         fast_mode=True)
    n_rej = res['n_rej']
    t_rej = res['threshold']
    result_dic['nfdr (fast)'] = {'h_hat': p < t_rej}
    logger.info('## nfdr2 (fast mode), n_rej1=%d, n_rej2=%d, n_rej_total=%d' %
                (n_rej[0], n_rej[1], n_rej[0] + n_rej[1]))
    logger.info('## Total time (fast mode): %0.1fs' %
                (time.time() - start_time))
    # Full mode.
    logger.info('# p: %s' % str(p[0:2]))
    logger.info('# x: %s' % str(x[0:2, :]))
    start_time = time.time()
    res = md.adafdr_test(p, x, K=5, alpha=alpha, h=None, n_full=n_full,
                         n_itr=n_itr, verbose=True,
                         output_folder=output_folder, random_state=0,
                         fast_mode=False, single_core=False)
    n_rej = res['n_rej']
    t_rej = res['threshold']
    result_dic['nfdr'] = {'h_hat': p < t_rej}
    logger.info('## nfdr2, n_rej1=%d, n_rej2=%d, n_rej_total=%d' %
                (n_rej[0], n_rej[1], n_rej[0] + n_rej[1]))
    logger.info('## Total time: %0.1fs' % (time.time() - start_time))
    # Store the result; the context manager closes the file even if the
    # dump raises.
    with open(output_datafile, 'wb') as fil:
        pickle.dump(result_dic, fil)
Esempio n. 4
0
def main(args):
    """Run BH, SBH and AdaFDR (fast and full mode) on a small data set.

    Args:
        args: Parsed command-line arguments. ``args.output_folder`` names
            the result folder. ``args.data_loader`` selects the data: if it
            contains ``'GTEx'`` the full GTEx data is loaded and only the
            first three covariate columns are kept; otherwise it is the name
            of a zero-argument loader function in ``dl``.

    Side effects:
        Creates (or empties) the result folder and its ``_fast`` sibling and
        writes ``result.log`` into the result folder.
    """
    def _reset_folder(folder):
        # Create the folder, or delete every entry directly inside it.
        if not os.path.exists(folder):
            os.makedirs(folder)
            return
        for entry in os.listdir(folder):
            os.remove(os.path.join(folder, entry))

    # Set up parameters.
    alpha = 0.1
    n_itr = 1500
    # Set up the output folder.
    output_folder = os.path.realpath(
        '..') + '/result_small_data/result_' + args.output_folder
    _reset_folder(output_folder)
    # Load the data.
    if 'GTEx' in args.data_loader:
        p, x, n_full, cate_name = dl.load_GTEx_full(verbose=True)
        x = x[:, 0:3]
    else:
        # Look the loader up by name instead of eval-ing a string built from
        # a command-line argument, so only attributes of ``dl`` can be called.
        loader = getattr(dl, args.data_loader)
        p, _, x = loader()
        n_full = p.shape[0]
    # NOTE(review): this reset also discards the cate_name returned by
    # dl.load_GTEx_full above -- confirm that is intended.
    cate_name = {}
    # Logger.
    logging.basicConfig(level=logging.INFO, format='%(module)s:: %(message)s',
                        filename=output_folder+'/result.log', filemode='w')
    logger = logging.getLogger()
    # An overview of the data
    logger.info('# p: %s' % str(p[0:2]))
    logger.info('# x: %s' % str(x[0:2, :]))
    # Report the baseline methods.
    n_rej, t_rej = md.bh_test(p, alpha=alpha, n_full=n_full, verbose=False)
    logger.info('## BH, n_rej=%d, t_rej=%0.5f' % (n_rej, t_rej))
    n_rej, t_rej, pi0_hat = md.sbh_test(p,
                                        alpha=alpha,
                                        n_full=n_full,
                                        verbose=False)
    logger.info('## SBH, n_rej=%d, t_rej=%0.5f, pi0_hat=%0.3f' %
                (n_rej, t_rej, pi0_hat))
    # Analysis
    md.adafdr_explore(p, x, alpha=alpha, n_full=n_full, vis_dim=None,
                      cate_name=cate_name, output_folder=output_folder, h=None)
    # Fast mode.
    output_folder_fast = output_folder + '_fast'
    _reset_folder(output_folder_fast)
    logger.info('# p: %s' % str(p[0:2]))
    logger.info('# x: %s' % str(x[0:2, :]))
    start_time = time.time()
    res = md.adafdr_test(p, x, K=5, alpha=alpha, h=None, n_full=n_full,
                         n_itr=n_itr, verbose=True,
                         output_folder=output_folder_fast, random_state=0,
                         fast_mode=True)
    n_rej = res['n_rej']
    logger.info('## AdaFDR (fast mode), n_rej1=%d, n_rej2=%d, n_rej_total=%d' %
                (n_rej[0], n_rej[1], n_rej[0] + n_rej[1]))
    logger.info('## Total time (fast mode): %0.1fs' %
                (time.time() - start_time))
    # Full mode.
    logger.info('# p: %s' % str(p[0:2]))
    logger.info('# x: %s' % str(x[0:2, :]))
    start_time = time.time()
    res = md.adafdr_test(p, x, K=5, alpha=alpha, h=None, n_full=n_full,
                         n_itr=n_itr, verbose=True,
                         output_folder=output_folder, random_state=0,
                         fast_mode=False, single_core=False)
    n_rej = res['n_rej']
    logger.info('## AdaFDR, n_rej1=%d, n_rej2=%d, n_rej_total=%d' %
                (n_rej[0], n_rej[1], n_rej[0] + n_rej[1]))
    logger.info('## Total time: %0.1fs' % (time.time() - start_time))
Esempio n. 5
0
def main(args):
    """Run BH, SBH and AdaFDR on every simulation file in a folder.

    Args:
        args: Parsed command-line arguments. ``args.input_folder`` is the
            folder of simulation files (entries starting with ``'.'`` are
            skipped), ``args.data_name`` names the output folder, and
            ``args.alpha`` is a single nominal level, or None to use
            0.05, 0.1, 0.15 and 0.2.

    Side effects:
        Creates (or empties) ``./temp_result/res_<data_name>``, writes a
        text log ``result.log`` there, and pickles a dict mapping method
        name to a list of ``[fdp, power, alpha, filename]`` records into
        ``result.pickle``.
    """
    # Set up the parameters.
    input_folder = args.input_folder
    output_folder = './temp_result/res_' + args.data_name
    if args.alpha is not None:
        alpha_list = [args.alpha]
    else:
        alpha_list = [0.05, 0.1, 0.15, 0.2]
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    else:
        for entry in os.listdir(output_folder):
            os.remove(os.path.join(output_folder, entry))
    print('input_folder: %s'%input_folder)
    print('output_folder: %s'%output_folder)
    print('alpha_list: %s'%alpha_list)
    result_dic = {'bh': [], 'sbh': [], 'adafdr-fast': [], 'adafdr': []}
    # The log file is closed by the context manager even if a method raises.
    with open(output_folder+'/result.log', 'w') as f_write:
        # Process all data in the folder
        for filename in os.listdir(input_folder):
            if filename[0] == '.':
                continue
            file_path = input_folder + '/' + filename
            p, x, h = dl.load_simulation_data(file_path)
            for alpha in alpha_list:
                print('# Processing %s with alpha=%0.2f'%(filename, alpha))
                f_write.write('# Processing %s with alpha=%0.2f\n'%(filename, alpha))
                # BH result
                n_rej, t_rej = md.bh_test(p, alpha=alpha, verbose=False)
                fdp, power = get_fdp_and_power(h, p<=t_rej)
                result_dic['bh'].append([fdp, power, alpha, filename])
                f_write.write('## BH discoveries: %d, threshold=%0.3f\n'%(n_rej,t_rej))
                # SBH result
                n_rej, t_rej, pi0_hat = md.sbh_test(p, alpha=alpha, verbose=False)
                fdp, power = get_fdp_and_power(h, p<=t_rej)
                result_dic['sbh'].append([fdp, power, alpha, filename])
                f_write.write('## SBH discoveries: %d, threshold=%0.3f, pi0_hat=%0.3f\n'
                              %(n_rej, t_rej, pi0_hat))
                # AdaFDR-fast result
                start_time = time.time()
                res = md.adafdr_test(p, x, alpha=alpha, fast_mode=True)
                n_rej = res['n_rej']
                t_rej = res['threshold']
                fdp, power = get_fdp_and_power(h, p<=t_rej)
                result_dic['adafdr-fast'].append([fdp, power, alpha, filename])
                f_write.write('## AdaFDR-fast discoveries: fold_1=%d, fold_2=%d, total=%d\n'
                              %(n_rej[0],n_rej[1],n_rej[0]+n_rej[1]))
                # Terminate the line so the next record does not run on from it.
                f_write.write('## Time: %0.1fs\n'%(time.time()-start_time))
                # AdaFDR result
                start_time = time.time()
                res = md.adafdr_test(p, x, alpha=alpha, fast_mode=False)
                n_rej = res['n_rej']
                t_rej = res['threshold']
                fdp, power = get_fdp_and_power(h, p<=t_rej)
                result_dic['adafdr'].append([fdp, power, alpha, filename])
                f_write.write('## AdaFDR discoveries: fold_1=%d, fold_2=%d, total=%d\n'
                              %(n_rej[0],n_rej[1],n_rej[0]+n_rej[1]))
                f_write.write('## Time: %0.1fs'%(time.time()-start_time))
                f_write.write('\n')
    # Store the result
    with open(output_folder+'/result.pickle', 'wb') as fil:
        pickle.dump(result_dic, fil)
Esempio n. 6
0
def main(args):
    """Benchmark BH, SBH and AdaFDR on every simulation file in a folder.

    Args:
        args: Parsed command-line arguments. ``args.output_folder`` names
            the result folder and ``args.input_folder`` is the folder of
            simulation files (entries starting with ``'.'`` are skipped).
            If the input folder path contains ``'speed'``, ``'ntest'`` or
            ``'prop_alt'`` only alpha=0.1 is run; otherwise 0.05, 0.1, 0.15
            and 0.2 are run.

    Side effects:
        Creates (or empties) the result folder, writes ``result.log`` there
        and writes ``result_dic.pickle``. The pickle file holds two objects
        dumped back to back: first ``result_dic`` (method -> alpha -> list
        of ``[h, p <= threshold]``), then ``time_dic`` (method -> alpha ->
        file name -> seconds).
    """
    # Set up parameters.
    n_itr = 1500
    # Set up the output folder.
    output_folder = os.path.realpath('..') + '/result_simulation/result_' + args.output_folder
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    else:
        for entry in os.listdir(output_folder):
            os.remove(os.path.join(output_folder, entry))
    # Get logger.
    logging.basicConfig(level=logging.INFO, format='%(message)s',
                        filename=output_folder+'/result.log', filemode='w')
    logger = logging.getLogger()
    # Run method in all data in the folder
    file_list = os.listdir(args.input_folder)
    if any(tag in args.input_folder for tag in ('speed', 'ntest', 'prop_alt')):
        alpha_list = [0.1]
    else:
        alpha_list = [0.05, 0.1, 0.15, 0.2]
    result_dic = {'bh': {}, 'sbh': {}, 'nfdr (fast)': {}, 'nfdr': {}}
    time_dic = {'nfdr (fast)': {}, 'nfdr': {}}
    for alpha in alpha_list:
        for method_results in result_dic.values():
            method_results[alpha] = []
        for method_times in time_dic.values():
            method_times[alpha] = {}
        for filename_short in file_list:
            if filename_short[0] == '.':
                continue
            print('# Processing %s with alpha=%0.2f'%(filename_short, alpha))
            logger.info('# Processing %s with alpha=%0.2f'%(filename_short, alpha))
            filename = args.input_folder + '/' + filename_short
            p, x, h = dl.load_simulation_data(filename)
            n_full = p.shape[0]
            # Report the baseline.
            n_rej, t_rej = md.bh_test(p, alpha=alpha, n_full=n_full, verbose=False)
            result_dic['bh'][alpha].append([h, p<=t_rej])
            logger.info('## BH, n_rej=%d, t_rej=%0.5f'%(n_rej,t_rej))
            n_rej, t_rej, pi0_hat = md.sbh_test(p, alpha=alpha, n_full=n_full, verbose=False)
            result_dic['sbh'][alpha].append([h, p<=t_rej])
            logger.info('## SBH, n_rej=%d, t_rej=%0.5f, pi0_hat=%0.3f'%(n_rej, t_rej, pi0_hat))
            # Fast mode: train and test are run as two separate steps.
            start_time = time.time()
            res_train = md.adafdr_train(p, x, K=5, alpha=alpha, h=h, n_full=n_full,
                                        n_itr=n_itr, verbose=False,
                                        output_folder=None, random_state=0,
                                        fast_mode=True)
            res = md.adafdr_test(res_train, alpha, n_full=n_full,
                                 output_folder=None)
            n_rej = res['n_rej']
            t_rej = res['threshold']
            time_dic['nfdr (fast)'][alpha][filename_short] = time.time()-start_time
            result_dic['nfdr (fast)'][alpha].append([h, p<=t_rej])
            logger.info('## nfdr2 (fast mode), n_rej1=%d, n_rej2=%d, n_rej_total=%d'
                        %(n_rej[0],n_rej[1],n_rej[0]+n_rej[1]))
            logger.info('## Total time (fast mode): %0.1fs'%(time.time()-start_time))
            # Full mode: same two-step procedure without fast_mode.
            start_time = time.time()
            res_train = md.adafdr_train(p, x, K=5, alpha=alpha, h=h, n_full=n_full,
                                        n_itr=n_itr, verbose=False,
                                        output_folder=None, random_state=0,
                                        fast_mode=False, single_core=False)
            res = md.adafdr_test(res_train, alpha, n_full=n_full,
                                 output_folder=None)
            n_rej = res['n_rej']
            t_rej = res['threshold']
            time_dic['nfdr'][alpha][filename_short] = time.time()-start_time
            result_dic['nfdr'][alpha].append([h, p<=t_rej])
            logger.info('## nfdr2, n_rej1=%d, n_rej2=%d, n_rej_total=%d'
                        %(n_rej[0],n_rej[1],n_rej[0]+n_rej[1]))
            logger.info('## Total time: %0.1fs'%(time.time()-start_time))
            logger.info('\n')
    # Store the result; readers must call pickle.load twice on this file.
    with open(output_folder+'/result_dic.pickle', 'wb') as fil:
        pickle.dump(result_dic, fil)
        pickle.dump(time_dic, fil)
def main(args):
    """Run AdaFDR on GTEx data with each covariate alone and with all of them.

    Args:
        args: Parsed command-line arguments. ``args.output_folder`` is used
            to name the result folder and the pickle file, and
            ``args.data_name`` is passed to ``dl.load_GTEx``.

    Side effects:
        Creates (or empties) the result folder, writes ``result.log`` there,
        and pickles a dict mapping method name to
        ``{'h_hat': p < threshold}`` into the hard-coded data file path.
        AdaFDR entries are keyed ``'nfdr (fast)_<i>'`` and ``'nfdr_<i>'``
        where ``<i>`` is the index into the covariate list.
    """
    # Set up parameters.
    alpha = 0.01
    n_itr = 1500
    # Set up the output folder.
    output_folder = os.path.realpath(
        '..') + '/results/result_univariate_' + args.output_folder
    output_datafile = '/data3/martin/gtex_data/results_uni_covariate/result_' +\
                       args.output_folder + '.pickle'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    else:
        for entry in os.listdir(output_folder):
            os.remove(os.path.join(output_folder, entry))
    # Load the data.
    p, x, n_full, cate_name, _ = dl.load_GTEx(args.data_name)
    # Logger.
    logging.basicConfig(level=logging.INFO, format='%(message)s',
                        filename=output_folder+'/result.log', filemode='w')
    logger = logging.getLogger()
    result_dic = {}
    # An overview of the data
    logger.info('# p: %s' % str(p[0:2]))
    logger.info('# x: %s' % str(x[0:2, :]))
    # Report the baseline methods.
    n_rej, t_rej = md.bh_test(p, alpha=alpha, n_full=n_full, verbose=False)
    logger.info('## BH, n_rej=%d, t_rej=%0.5f' % (n_rej, t_rej))
    result_dic['bh'] = {'h_hat': p < t_rej}
    n_rej, t_rej, pi0_hat = md.sbh_test(p,
                                        alpha=alpha,
                                        n_full=n_full,
                                        verbose=False)
    result_dic['sbh'] = {'h_hat': p < t_rej}
    logger.info('## SBH, n_rej=%d, t_rej=%0.5f, pi0_hat=%0.3f\n' %
                (n_rej, t_rej, pi0_hat))
    # Analysis
    md.adafdr_explore(p, x, alpha=alpha, n_full=n_full, vis_dim=None,
                      cate_name=cate_name, output_folder=output_folder, h=None)
    # Four covariates separately, then all of them together.
    cov_list = ['exp', 'maf', 'dist', 'chromotin', 'all']
    for i_cov, cov_name in enumerate(cov_list):
        logger.info('Covariate: %s' % cov_name)
        if cov_name == 'all':
            temp_x = x
        else:
            # Keep a single covariate as a one-column 2d array.
            temp_x = x[:, i_cov].reshape([-1, 1])
        # Fast mode (no per-run output folder).
        logger.info('# p: %s' % str(p[0:2]))
        logger.info('# x: %s' % str(temp_x[0:2, :]))
        start_time = time.time()
        res = md.adafdr_test(p, temp_x, K=5, alpha=alpha, h=None,
                             n_full=n_full, n_itr=n_itr, verbose=True,
                             output_folder=None, random_state=0,
                             fast_mode=True)
        n_rej = res['n_rej']
        t_rej = res['threshold']
        result_dic['nfdr (fast)_%d' % i_cov] = {'h_hat': p < t_rej}
        logger.info(
            '## AdaFDR (fast mode), feature=%d, n_rej1=%d, n_rej2=%d, n_rej_total=%d'
            % (i_cov, n_rej[0], n_rej[1], n_rej[0] + n_rej[1]))
        logger.info('## Total time (fast mode): %0.1fs' %
                    (time.time() - start_time))
        # Full mode.
        logger.info('# p: %s' % str(p[0:2]))
        logger.info('# x: %s' % str(temp_x[0:2, :]))
        start_time = time.time()
        res = md.adafdr_test(p, temp_x, K=5, alpha=alpha, h=None,
                             n_full=n_full, n_itr=n_itr, verbose=True,
                             output_folder=None, random_state=0,
                             fast_mode=False, single_core=False)
        n_rej = res['n_rej']
        t_rej = res['threshold']
        result_dic['nfdr_%d' % i_cov] = {'h_hat': p < t_rej}
        logger.info('## nfdr2, n_rej1=%d, n_rej2=%d, n_rej_total=%d' %
                    (n_rej[0], n_rej[1], n_rej[0] + n_rej[1]))
        logger.info('## Total time: %0.1fs' % (time.time() - start_time))
        logger.info(' ')
    # Store the result; the context manager closes the file even if the
    # dump raises.
    with open(output_datafile, 'wb') as fil:
        pickle.dump(result_dic, fil)