Beispiel #1
0
 def setUpClass(cls):
     # Run estimate_rg twice over the N_REP simulated sumstats files: once
     # with the parser defaults and once with the intercept options filled
     # in, then cache both results on the class for the tests to read.
     opts = parser.parse_args('')
     sumstats_paths = [DIR + '/simulate_test/sumstats/' + str(rep)
                       for rep in xrange(N_REP)]
     opts.rg = ','.join(sumstats_paths)
     opts.ref_ld = DIR + '/simulate_test/ldscore/twold_onefile'
     opts.w_ld = DIR + '/simulate_test/ldscore/w'
     opts.out = DIR + '/simulate_test/1'
     default_fit = s.estimate_rg(opts, log)

     # Second pass: one '0' per replicate for intercept_gencov and one '1'
     # per replicate for intercept_h2.
     opts.intercept_gencov = ','.join(['0'] * N_REP)
     opts.intercept_h2 = ','.join(['1'] * N_REP)
     fixed_fit = s.estimate_rg(opts, log)

     cls.rg = default_fit
     cls.rg_noint = fixed_fit
Beispiel #2
0
 def test_no_check_alleles(self):
     # The first estimate_rg result must be the same whether or not
     # no_check_alleles is switched on. args.rg lists the same sumstats
     # path twice.
     opts = parser.parse_args('')
     opts.out = DIR + '/simulate_test/1'
     opts.w_ld = DIR + '/simulate_test/ldscore/w'
     opts.ref_ld = DIR + '/simulate_test/ldscore/oneld_onefile'
     opts.rg = ','.join([DIR + '/simulate_test/sumstats/1'] * 2)

     baseline = s.estimate_rg(opts, log)[0]
     opts.no_check_alleles = True
     flagged = s.estimate_rg(opts, log)[0]

     assert_equal(baseline.rg_ratio, flagged.rg_ratio)
     assert_almost_equal(baseline.rg_jknife, flagged.rg_jknife)
     assert_equal(baseline.rg_se, flagged.rg_se)
Beispiel #3
0
 def test_no_check_alleles(self):
     # Run estimate_rg on the same sumstats path listed twice, first with
     # the parser defaults and then with no_check_alleles=True, and check
     # that the first result of each run agrees.
     opts = parser.parse_args('')
     same_file_twice = [DIR + '/simulate_test/sumstats/1'] * 2
     opts.rg = ','.join(same_file_twice)
     opts.ref_ld = DIR + '/simulate_test/ldscore/oneld_onefile'
     opts.w_ld = DIR + '/simulate_test/ldscore/w'
     opts.out = DIR + '/simulate_test/1'

     before = s.estimate_rg(opts, log)[0]
     opts.no_check_alleles = True
     after = s.estimate_rg(opts, log)[0]

     assert_equal(before.rg_ratio, after.rg_ratio)
     assert_almost_equal(before.rg_jknife, after.rg_jknife)
     assert_equal(before.rg_se, after.rg_se)
Beispiel #4
0
 def setUpClass(cls):
     # Fit rg across the N_REP simulated sumstats files twice and store
     # both results on the class: cls.rg from the run with parser defaults,
     # cls.rg_noint from the run with intercept_gencov / intercept_h2 set.
     opts = parser.parse_args('')
     opts.out = DIR + '/simulate_test/1'
     opts.w_ld = DIR + '/simulate_test/ldscore/w'
     opts.ref_ld = DIR + '/simulate_test/ldscore/twold_onefile'
     opts.rg = ','.join(
         [DIR + '/simulate_test/sumstats/' + str(idx)
          for idx in xrange(N_REP)])
     first_run = s.estimate_rg(opts, log)

     # One comma-separated entry per replicate for each intercept option.
     opts.intercept_gencov = ','.join(['0'] * N_REP)
     opts.intercept_h2 = ','.join(['1'] * N_REP)
     second_run = s.estimate_rg(opts, log)

     cls.rg = first_run
     cls.rg_noint = second_run
Beispiel #5
0
 def test_twostep_rg(self):
     # make sure two step isn't going crazy: the first result with
     # two_step=999 must match the one with two_step=99999 to within 1e-5.
     opts = parser.parse_args('')
     opts.out = DIR + '/simulate_test/rg'
     opts.w_ld = DIR + '/simulate_test/ldscore/w'
     opts.ref_ld_chr = DIR + '/simulate_test/ldscore/oneld_onefile'
     opts.rg = ','.join([DIR + '/simulate_test/sumstats/1'] * 2)

     opts.two_step = 999
     low_cutoff = s.estimate_rg(opts, log)[0]
     opts.two_step = 99999
     high_cutoff = s.estimate_rg(opts, log)[0]

     assert_allclose(low_cutoff.rg_ratio, high_cutoff.rg_ratio, atol=1e-5)
     assert_allclose(low_cutoff.gencov.tot, high_cutoff.gencov.tot,
                     atol=1e-5)
Beispiel #6
0
 def test_twostep_rg(self):
     # make sure two step isn't going crazy
     # (compare rg_ratio and gencov.tot between two_step=999 and
     # two_step=99999 with an absolute tolerance of 1e-5)
     opts = parser.parse_args('')
     same_file_twice = [DIR + '/simulate_test/sumstats/1'] * 2
     opts.rg = ','.join(same_file_twice)
     opts.ref_ld_chr = DIR + '/simulate_test/ldscore/oneld_onefile'
     opts.w_ld = DIR + '/simulate_test/ldscore/w'
     opts.out = DIR + '/simulate_test/rg'

     opts.two_step = 999
     fit_999 = s.estimate_rg(opts, log)[0]
     opts.two_step = 99999
     fit_99999 = s.estimate_rg(opts, log)[0]

     assert_allclose(fit_999.rg_ratio, fit_99999.rg_ratio, atol=1e-5)
     assert_allclose(fit_999.gencov.tot, fit_99999.gencov.tot, atol=1e-5)
Beispiel #7
0
 def test_rg_M(self):
     # Passing args.M read from the .l2.M_5_50 file must give (almost) the
     # same rg_ratio and rg_se as the run where args.M is left at the
     # parser default; the M strings '1,2' and 'foo_bar' must make
     # estimate_rg raise ValueError.
     args = parser.parse_args('')
     args.ref_ld = DIR + '/simulate_test/ldscore/oneld_onefile'
     args.w_ld = DIR + '/simulate_test/ldscore/w'
     args.rg = ','.join(
         [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)])
     args.out = DIR + '/simulate_test/1'
     x = s.estimate_rg(args, log)[0]
     # Read M through a context manager so the file handle is closed
     # instead of being left for the garbage collector.
     m_path = DIR + '/simulate_test/ldscore/oneld_onefile.l2.M_5_50'
     with open(m_path, 'rb') as m_file:
         args.M = m_file.read().rstrip('\n')
     y = s.estimate_rg(args, log)[0]
     assert_array_almost_equal(x.rg_ratio, y.rg_ratio)
     assert_array_almost_equal(x.rg_se, y.rg_se)
     for bad_m in ('1,2', 'foo_bar'):
         args.M = bad_m
         assert_raises(ValueError, s.estimate_rg, args, log)
Beispiel #8
0
 def test_rg_M(self):
     # An explicit args.M taken from the .l2.M_5_50 file should reproduce
     # the rg_ratio / rg_se of the run without args.M set, and the M
     # strings '1,2' and 'foo_bar' should be rejected with ValueError.
     args = parser.parse_args('')
     args.ref_ld = DIR + '/simulate_test/ldscore/oneld_onefile'
     args.w_ld = DIR + '/simulate_test/ldscore/w'
     args.rg = ','.join(
         [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)])
     args.out = DIR + '/simulate_test/1'
     x = s.estimate_rg(args, log)[0]
     # The original left this file object unclosed; the with-block
     # releases the handle as soon as the contents are read.
     with open(DIR + '/simulate_test/ldscore/oneld_onefile.l2.M_5_50',
               'rb') as m_file:
         args.M = m_file.read().rstrip('\n')
     y = s.estimate_rg(args, log)[0]
     assert_array_almost_equal(x.rg_ratio, y.rg_ratio)
     assert_array_almost_equal(x.rg_se, y.rg_se)
     args.M = '1,2'
     assert_raises(ValueError, s.estimate_rg, args, log)
     args.M = 'foo_bar'
     assert_raises(ValueError, s.estimate_rg, args, log)
Beispiel #9
0
 def test_rg_ref_ld(self):
     # Compare the first estimate_rg result across three ways of pointing
     # at the reference LD scores: one combined file via ref_ld_chr, two
     # comma-separated files via ref_ld, and the same two via ref_ld_chr.
     opts = parser.parse_args('')
     opts.ref_ld_chr = DIR + '/simulate_test/ldscore/twold_onefile'
     opts.w_ld = DIR + '/simulate_test/ldscore/w'
     opts.rg = ','.join([DIR + '/simulate_test/sumstats/1'] * 2)
     opts.out = DIR + '/simulate_test/1'
     opts.print_cov = True  # right now just check no runtime errors
     opts.print_delete_vals = True
     one_file = s.estimate_rg(opts, log)[0]

     two_files = DIR + '/simulate_test/ldscore/twold_firstfile,' + \
         DIR + '/simulate_test/ldscore/twold_secondfile'
     # NOTE(review): ref_ld_chr is still populated when ref_ld is assigned
     # here, so the next two runs see both options set -- confirm which one
     # estimate_rg gives precedence to.
     opts.ref_ld = two_files
     via_ref_ld = s.estimate_rg(opts, log)[0]
     opts.ref_ld_chr = two_files
     via_ref_ld_chr = s.estimate_rg(opts, log)[0]

     assert_almost_equal(one_file.rg_ratio, via_ref_ld.rg_ratio)
     assert_almost_equal(via_ref_ld.rg_jknife, via_ref_ld_chr.rg_jknife)
     assert_almost_equal(one_file.rg_se, via_ref_ld.rg_se)
Beispiel #10
0
 def test_rg_ref_ld(self):
     # Three estimate_rg runs that differ only in how the reference LD
     # scores are supplied; the first results are checked against each
     # other with assert_almost_equal.
     opts = parser.parse_args('')
     opts.out = DIR + '/simulate_test/1'
     opts.w_ld = DIR + '/simulate_test/ldscore/w'
     opts.rg = ','.join([DIR + '/simulate_test/sumstats/1'] * 2)
     opts.print_cov = True  # right now just check no runtime errors
     opts.print_delete_vals = True

     # Run 1: single combined file through ref_ld_chr.
     opts.ref_ld_chr = DIR + '/simulate_test/ldscore/twold_onefile'
     combined = s.estimate_rg(opts, log)[0]

     # Run 2: two comma-separated files through ref_ld.
     # NOTE(review): ref_ld_chr from run 1 is not cleared before ref_ld is
     # set -- verify how estimate_rg resolves both being present.
     opts.ref_ld = DIR + '/simulate_test/ldscore/twold_firstfile,' + \
         DIR + '/simulate_test/ldscore/twold_secondfile'
     split_plain = s.estimate_rg(opts, log)[0]

     # Run 3: the same two files through ref_ld_chr (ref_ld stays set).
     opts.ref_ld_chr = DIR + '/simulate_test/ldscore/twold_firstfile,' + \
         DIR + '/simulate_test/ldscore/twold_secondfile'
     split_chr = s.estimate_rg(opts, log)[0]

     assert_almost_equal(combined.rg_ratio, split_plain.rg_ratio)
     assert_almost_equal(split_plain.rg_jknife, split_chr.rg_jknife)
     assert_almost_equal(combined.rg_se, split_plain.rg_se)
Beispiel #11
0
            if not args.overlap_annot or args.not_M_5_50:
                if args.frqfile is not None or args.frqfile_chr is not None:
                    log.log(
                        'The frequency file is unnecessary and is being ignored.'
                    )
                    args.frqfile = None
                    args.frqfile_chr = None
            if args.overlap_annot and not args.not_M_5_50:
                if not ((args.frqfile and args.ref_ld) or
                        (args.frqfile_chr and args.ref_ld_chr)):
                    raise ValueError(
                        'Must set either --frqfile and --ref-ld or --frqfile-chr and --ref-ld-chr'
                    )

            if args.rg:
                sumstats.estimate_rg(args, log)
            elif args.h2:
                sumstats.estimate_h2(args, log)
            elif args.h2_cts:
                sumstats.cell_type_specific(args, log)

            # bad flags
        else:
            print header
            print 'Error: no analysis selected.'
            print 'ldsc.py -h describes options.'
    except Exception:
        ex_type, ex, tb = sys.exc_info()
        log.log(traceback.format_exc(ex))
        raise
    finally:
Beispiel #12
0
def ldsc_rg_pair(args, **kwargs):
    """
    Copy two sumstats files locally with gsutil, run sumstats.estimate_rg
    on the pair, and return the rg summary table as a pandas DataFrame
    annotated with phenotype names and sample sizes.

    Args is a list with elements:
    - args[0] = phenotype name
    - args[1] = phenotype description
    - args[2] = file name for phenotype 1
    - args[3] = file name for phenotype 2
    - args[4] = N for phenotype 1
    - args[5] = N_cases for phenotype 1
    - args[6] = N_controls for phenotype 1
    - args[7] = N for phenotype 2
    - args[8] = N_cases for phenotype 2
    - args[9] = N_controls for phenotype 2

    Keyword args (each one falls back to the module-level global of the
    same name when it is not passed, which is where the previous
    implementation always read them from):
    wd              prefix for the local copies and for the ldsc `out` path
    gs_sumstat_dir  prefix of the gsutil source paths
    ld_ref_panel    value passed to ldsc as both ref_ld_chr and w_ld_chr

    Returns the DataFrame parsed from sumstats._get_rg_table, with the
    h2 columns renamed to ph2_*, ph1_* h2 columns added from each result's
    `hsq1` attribute ('NA' when missing), and phenotype / sample-size
    columns added.

    Raises NameError if a setting is neither passed nor defined globally.
    """

    def _setting(key):
        # Explicit keyword wins; otherwise use the module global, keeping
        # NameError as the failure mode of an undefined setting.
        if key in kwargs:
            return kwargs[key]
        try:
            return globals()[key]
        except KeyError:
            raise NameError("global name '%s' is not defined" % key)

    wd = _setting('wd')
    gs_sumstat_dir = _setting('gs_sumstat_dir')
    ld_ref_panel = _setting('ld_ref_panel')

    # handle args
    phname = str(args[0])
    phdesc = str(args[1])
    f1 = str(args[2])
    f2 = str(args[3])
    n1 = int(args[4])
    ncas1 = float(args[5])
    ncon1 = float(args[6])
    n2 = int(args[7])
    ncas2 = float(args[8])
    ncon2 = float(args[9])

    # log
    print("Starting phenotype: " + phname)

    # download sumstats for phens (gsutil exit status is not checked)
    local_paths = []
    for fname in (f1, f2):
        local_path = wd + '/' + fname
        subprocess.call(
            ['gsutil', 'cp', gs_sumstat_dir + '/' + fname, local_path])
        local_paths.append(local_path)

    # list of files
    rg_file_list = ','.join(local_paths)

    # list of names
    rg_name_list = [f1, f2]

    # dummy output name
    out_prefix = wd + '/' + 'rg.summary'

    # args for ldsc
    args_ldsc_rg = Namespace(out=out_prefix,
                             bfile=None,
                             l2=None,
                             extract=None,
                             keep=None,
                             ld_wind_snps=None,
                             ld_wind_kb=None,
                             ld_wind_cm=None,
                             print_snps=None,
                             annot=None,
                             thin_annot=False,
                             cts_bin=None,
                             cts_break=None,
                             cts_names=None,
                             per_allele=False,
                             pq_exp=None,
                             no_print_annot=False,
                             maf=None,
                             h2=None,
                             rg=rg_file_list,
                             ref_ld=None,
                             ref_ld_chr=ld_ref_panel,
                             w_ld=None,
                             w_ld_chr=ld_ref_panel,
                             overlap_annot=False,
                             no_intercept=False,
                             intercept_h2=None,
                             intercept_gencov=None,
                             M=None,
                             two_step=99999,
                             chisq_max=99999,
                             print_cov=False,
                             print_delete_vals=False,
                             chunk_size=50,
                             pickle=False,
                             invert_anyway=False,
                             yes_really=False,
                             n_blocks=200,
                             not_M_5_50=False,
                             return_silly_things=False,
                             no_check_alleles=False,
                             print_coefficients=False,
                             samp_prev=None,
                             pop_prev=None,
                             frqfile=None,
                             h2_cts=None,
                             frqfile_chr=None,
                             print_all_cts=False)

    # run rg
    rg_results = sumstats.estimate_rg(args_ldsc_rg, Logger_to_Logging())

    # get basic rg summary table
    rg_tab_txt = sumstats._get_rg_table(rg_name_list, rg_results,
                                        args_ldsc_rg)
    rg_df = pd.read_csv(StringIO(rg_tab_txt), delim_whitespace=True)

    print(rg_df)

    # rename h2, int columns so we can add h2/int for phenotype 1
    rg_df.rename(
        {
            'h2_obs': 'ph2_h2_obs',
            'h2_obs_se': 'ph2_h2_obs_se',
            'h2_int': 'ph2_h2_int',
            'h2_int_se': 'ph2_h2_int_se'
        },
        axis='columns',
        inplace=True)

    # add h2/int for phenotype 1; any missing attribute becomes 'NA'
    hsq1_fits = [getattr(res, 'hsq1', 'NA') for res in rg_results]
    for column, attr in (('ph1_h2_int', 'intercept'),
                         ('ph1_h2_int_se', 'intercept_se'),
                         ('ph1_h2_obs', 'tot'),
                         ('ph1_h2_obs_se', 'tot_se')):
        rg_df[column] = [getattr(fit, attr, 'NA') for fit in hsq1_fits]

    # add phenotype info
    rg_df.insert(0, 'description', phdesc)
    rg_df.insert(0, 'phenotype', phname)

    # add sample size info
    rg_df['ph1_n'] = n1
    rg_df['ph1_n_case'] = ncas1
    rg_df['ph1_n_control'] = ncon1
    rg_df['ph2_n'] = n2
    rg_df['ph2_n_case'] = ncas2
    rg_df['ph2_n_control'] = ncon2

    print('#########')
    print('rg: ' + str(rg_df['rg']))
    print('#########')

    return rg_df
Beispiel #13
0
            if args.w_ld and args.w_ld_chr:
                raise ValueError('Cannot set both --w-ld and --w-ld-chr.')
            if (args.samp_prev is not None) != (args.pop_prev is not None):
                raise ValueError('Must set both or neither of --samp-prev and --pop-prev.')

            if not args.overlap_annot or args.not_M_5_50:
                if args.frqfile is not None or args.frqfile_chr is not None:
                    log.log('The frequency file is unnecessary and is being ignored.')
                    args.frqfile = None
                    args.frqfile_chr = None
            if args.overlap_annot and not args.not_M_5_50:
                if not ((args.frqfile and args.ref_ld) or (args.frqfile_chr and args.ref_ld_chr)):
                    raise ValueError ('Must set either --frqfile and --ref-ld or --frqfile-chr and --ref-ld-chr')

            if args.rg:
                sumstats.estimate_rg(args, log)
            elif args.h2:
                sumstats.estimate_h2(args, log)

            # bad flags
        else:
            print header
            print 'Error: no analysis selected.'
            print 'ldsc.py -h describes options.'
    except Exception:
        ex_type, ex, tb = sys.exc_info()
        log.log( traceback.format_exc(ex) )
        raise
    finally:
        log.log('Analysis finished at {T}'.format(T=time.ctime()) )
        time_elapsed = round(time.time()-start_time,2)
Beispiel #14
0
def ldsc_rg_target(ph_list, **kwargs):
    """
    Copy the '<ph>.ukbb.sumstats.gz' file of every phenotype in ph_list
    locally with gsutil, run sumstats.estimate_rg with the target sumstats
    listed first, and return the rg summary table as a pandas DataFrame.

    Keyword args (each one falls back to the module-level global of the
    same name when it is not passed, which is where the previous
    implementation always read them from):
    wd              prefix for the local copies and for the ldsc `out` path
    gs_sumstat_dir  prefix of the gsutil source paths
    ld_ref_panel    value passed to ldsc as both ref_ld_chr and w_ld_chr
    target_name     first entry of the name list given to _get_rg_table
    loc_target_ss   path placed first in the ldsc `rg` file list

    Returns the DataFrame parsed from sumstats._get_rg_table.

    Raises NameError if a setting is neither passed nor defined globally.
    """

    def _setting(key):
        # Explicit keyword wins; otherwise use the module global, keeping
        # NameError as the failure mode of an undefined setting.
        if key in kwargs:
            return kwargs[key]
        try:
            return globals()[key]
        except KeyError:
            raise NameError("global name '%s' is not defined" % key)

    wd = _setting('wd')
    gs_sumstat_dir = _setting('gs_sumstat_dir')
    ld_ref_panel = _setting('ld_ref_panel')
    target_name = _setting('target_name')
    loc_target_ss = _setting('loc_target_ss')

    # log
    print("Starting phenotypes: ")
    print(ph_list)

    # download sumstats for phens (gsutil exit status is not checked),
    # collecting the local paths and display names in the same pass
    ukb_local_paths = []
    ukb_name_list = []
    for ph in ph_list:
        fname = str(ph) + '.ukbb.sumstats.gz'
        loc_ss_path = wd + '/' + fname
        subprocess.call(
            ['gsutil', 'cp', gs_sumstat_dir + '/' + fname, loc_ss_path])
        ukb_local_paths.append(loc_ss_path)
        ukb_name_list.append(str(ph) + ".ukbb")

    # list of files: target first, then the comma-joined UKBB files
    rg_file_list = ','.join([loc_target_ss, ','.join(ukb_local_paths)])

    # list of names
    rg_name_list = [target_name] + ukb_name_list

    # dummy output name
    out_prefix = wd + '/' + 'rg.summary'

    # args for ldsc
    args_ldsc_rg = Namespace(out=out_prefix,
                             bfile=None,
                             l2=None,
                             extract=None,
                             keep=None,
                             ld_wind_snps=None,
                             ld_wind_kb=None,
                             ld_wind_cm=None,
                             print_snps=None,
                             annot=None,
                             thin_annot=False,
                             cts_bin=None,
                             cts_break=None,
                             cts_names=None,
                             per_allele=False,
                             pq_exp=None,
                             no_print_annot=False,
                             maf=None,
                             h2=None,
                             rg=rg_file_list,
                             ref_ld=None,
                             ref_ld_chr=ld_ref_panel,
                             w_ld=None,
                             w_ld_chr=ld_ref_panel,
                             overlap_annot=False,
                             no_intercept=False,
                             intercept_h2=None,
                             intercept_gencov=None,
                             M=None,
                             two_step=None,
                             chisq_max=None,
                             print_cov=False,
                             print_delete_vals=False,
                             chunk_size=50,
                             pickle=False,
                             invert_anyway=False,
                             yes_really=False,
                             n_blocks=200,
                             not_M_5_50=False,
                             return_silly_things=False,
                             no_check_alleles=False,
                             print_coefficients=False,
                             samp_prev=None,
                             pop_prev=None,
                             frqfile=None,
                             h2_cts=None,
                             frqfile_chr=None,
                             print_all_cts=False)

    # run rg
    rg_results = sumstats.estimate_rg(args_ldsc_rg, Logger_to_Logging())

    # format output
    rg_tab_txt = sumstats._get_rg_table(rg_name_list, rg_results,
                                        args_ldsc_rg)
    rg_df = pd.read_csv(StringIO(rg_tab_txt), delim_whitespace=True)

    return rg_df