def setUpClass(cls):
    args = parser.parse_args('')
    args.ref_ld = DIR + '/simulate_test/ldscore/twold_onefile'
    args.w_ld = DIR + '/simulate_test/ldscore/w'
    args.rg = ','.join(
        (DIR + '/simulate_test/sumstats/' + str(i) for i in xrange(N_REP)))
    args.out = DIR + '/simulate_test/1'
    x = s.estimate_rg(args, log)
    args.intercept_gencov = ','.join(('0' for _ in xrange(N_REP)))
    args.intercept_h2 = ','.join(('1' for _ in xrange(N_REP)))
    y = s.estimate_rg(args, log)
    cls.rg = x
    cls.rg_noint = y
def test_no_check_alleles(self):
    args = parser.parse_args('')
    args.ref_ld = DIR + '/simulate_test/ldscore/oneld_onefile'
    args.w_ld = DIR + '/simulate_test/ldscore/w'
    args.rg = ','.join(
        [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)])
    args.out = DIR + '/simulate_test/1'
    x = s.estimate_rg(args, log)[0]
    args.no_check_alleles = True
    y = s.estimate_rg(args, log)[0]
    assert_equal(x.rg_ratio, y.rg_ratio)
    assert_almost_equal(x.rg_jknife, y.rg_jknife)
    assert_equal(x.rg_se, y.rg_se)
def test_twostep_rg(self):
    # make sure two step isn't going crazy
    args = parser.parse_args('')
    args.ref_ld_chr = DIR + '/simulate_test/ldscore/oneld_onefile'
    args.w_ld = DIR + '/simulate_test/ldscore/w'
    args.rg = ','.join(
        [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)])
    args.out = DIR + '/simulate_test/rg'
    args.two_step = 999
    x = s.estimate_rg(args, log)[0]
    args.two_step = 99999
    y = s.estimate_rg(args, log)[0]
    assert_allclose(x.rg_ratio, y.rg_ratio, atol=1e-5)
    assert_allclose(x.gencov.tot, y.gencov.tot, atol=1e-5)
def test_rg_M(self):
    args = parser.parse_args('')
    args.ref_ld = DIR + '/simulate_test/ldscore/oneld_onefile'
    args.w_ld = DIR + '/simulate_test/ldscore/w'
    args.rg = ','.join(
        [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)])
    args.out = DIR + '/simulate_test/1'
    x = s.estimate_rg(args, log)[0]
    args.M = open(
        DIR + '/simulate_test/ldscore/oneld_onefile.l2.M_5_50',
        'rb').read().rstrip('\n')
    y = s.estimate_rg(args, log)[0]
    assert_array_almost_equal(x.rg_ratio, y.rg_ratio)
    assert_array_almost_equal(x.rg_se, y.rg_se)
    args.M = '1,2'
    assert_raises(ValueError, s.estimate_rg, args, log)
    args.M = 'foo_bar'
    assert_raises(ValueError, s.estimate_rg, args, log)
def test_rg_ref_ld(self):
    args = parser.parse_args('')
    args.ref_ld_chr = DIR + '/simulate_test/ldscore/twold_onefile'
    args.w_ld = DIR + '/simulate_test/ldscore/w'
    args.rg = ','.join(
        [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)])
    args.out = DIR + '/simulate_test/1'
    args.print_cov = True  # right now just check no runtime errors
    args.print_delete_vals = True
    x = s.estimate_rg(args, log)[0]
    args.ref_ld = DIR + '/simulate_test/ldscore/twold_firstfile,' + \
        DIR + '/simulate_test/ldscore/twold_secondfile'
    y = s.estimate_rg(args, log)[0]
    args.ref_ld_chr = DIR + '/simulate_test/ldscore/twold_firstfile,' + \
        DIR + '/simulate_test/ldscore/twold_secondfile'
    z = s.estimate_rg(args, log)[0]
    assert_almost_equal(x.rg_ratio, y.rg_ratio)
    assert_almost_equal(y.rg_jknife, z.rg_jknife)
    assert_almost_equal(x.rg_se, y.rg_se)
    if not args.overlap_annot or args.not_M_5_50:
        if args.frqfile is not None or args.frqfile_chr is not None:
            log.log('The frequency file is unnecessary and is being ignored.')
            args.frqfile = None
            args.frqfile_chr = None
    if args.overlap_annot and not args.not_M_5_50:
        if not ((args.frqfile and args.ref_ld) or
                (args.frqfile_chr and args.ref_ld_chr)):
            raise ValueError(
                'Must set either --frqfile and --ref-ld or --frqfile-chr and --ref-ld-chr')

    if args.rg:
        sumstats.estimate_rg(args, log)
    elif args.h2:
        sumstats.estimate_h2(args, log)
    elif args.h2_cts:
        sumstats.cell_type_specific(args, log)
    # bad flags
    else:
        print header
        print 'Error: no analysis selected.'
        print 'ldsc.py -h describes options.'
except Exception:
    ex_type, ex, tb = sys.exc_info()
    log.log(traceback.format_exc(ex))
    raise
finally:
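# Minimal sketch (not part of ldsc.py) of a flag combination that satisfies the
# checks above and reaches the --rg branch. It assumes the same `parser`,
# `sumstats`, and `log` objects used elsewhere in this document are in scope;
# the sumstats files and LD score directory are hypothetical placeholders.
def example_rg_dispatch():
    args = parser.parse_args('')
    args.rg = 'pheno1.sumstats.gz,pheno2.sumstats.gz'  # hypothetical inputs
    args.ref_ld_chr = 'eur_w_ld_chr/'                  # hypothetical LD scores
    args.w_ld_chr = 'eur_w_ld_chr/'
    args.out = 'pheno1_pheno2_rg'
    # --overlap-annot is False by default, so no frequency file is required
    # and any --frqfile setting would simply be ignored by the check above.
    return sumstats.estimate_rg(args, log)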
def ldsc_rg_pair(args, **kwargs):
    """
    Args is a list with elements:
    - args[0] = phenotype name
    - args[1] = phenotype description
    - args[2] = file name for phenotype 1
    - args[3] = file name for phenotype 2
    - args[4] = N for phenotype 1
    - args[5] = N_cases for phenotype 1
    - args[6] = N_controls for phenotype 1
    - args[7] = N for phenotype 2
    - args[8] = N_cases for phenotype 2
    - args[9] = N_controls for phenotype 2

    Assumes keyword args for:
    wd
    gs_sumstat_dir
    ld_ref_panel
    """
    # handle args
    phname = str(args[0])
    phdesc = str(args[1])
    f1 = str(args[2])
    f2 = str(args[3])
    n1 = int(args[4])
    ncas1 = float(args[5])
    ncon1 = float(args[6])
    n2 = int(args[7])
    ncas2 = float(args[8])
    ncon2 = float(args[9])

    # log
    print "Starting phenotype: " + str(phname)

    # download sumstats for phens
    gs_ss_path1 = gs_sumstat_dir + '/' + str(f1)
    loc_ss_path1 = wd + '/' + str(f1)
    subprocess.call(['gsutil', 'cp', gs_ss_path1, loc_ss_path1])
    gs_ss_path2 = gs_sumstat_dir + '/' + str(f2)
    loc_ss_path2 = wd + '/' + str(f2)
    subprocess.call(['gsutil', 'cp', gs_ss_path2, loc_ss_path2])

    # list of files
    rg_file_list = ','.join([loc_ss_path1, loc_ss_path2])

    # list of names
    rg_name_list = [str(f1), str(f2)]

    # dummy output name
    rg_out = wd + '/' + 'rg.summary'

    # args for ldsc
    args_ldsc_rg = Namespace(out=rg_out, bfile=None, l2=None, extract=None,
                             keep=None, ld_wind_snps=None, ld_wind_kb=None,
                             ld_wind_cm=None, print_snps=None, annot=None,
                             thin_annot=False, cts_bin=None, cts_break=None,
                             cts_names=None, per_allele=False, pq_exp=None,
                             no_print_annot=False, maf=None, h2=None,
                             rg=rg_file_list, ref_ld=None,
                             ref_ld_chr=ld_ref_panel, w_ld=None,
                             w_ld_chr=ld_ref_panel, overlap_annot=False,
                             no_intercept=False, intercept_h2=None,
                             intercept_gencov=None, M=None, two_step=99999,
                             chisq_max=99999, print_cov=False,
                             print_delete_vals=False, chunk_size=50,
                             pickle=False, invert_anyway=False,
                             yes_really=False, n_blocks=200, not_M_5_50=False,
                             return_silly_things=False, no_check_alleles=False,
                             print_coefficients=False, samp_prev=None,
                             pop_prev=None, frqfile=None, h2_cts=None,
                             frqfile_chr=None, print_all_cts=False)

    # run rg
    rg_out = sumstats.estimate_rg(args_ldsc_rg, Logger_to_Logging())

    # get basic rg summary table
    rg_tab_txt = sumstats._get_rg_table(rg_name_list, rg_out, args_ldsc_rg)
    rg_df = pd.read_csv(StringIO(rg_tab_txt), delim_whitespace=True)
    print(rg_df)

    # rename h2, int columns so we can add h2/int for phenotype 1
    rg_df.rename({'h2_obs': 'ph2_h2_obs',
                  'h2_obs_se': 'ph2_h2_obs_se',
                  'h2_int': 'ph2_h2_int',
                  'h2_int_se': 'ph2_h2_int_se'},
                 axis='columns', inplace=True)

    # add h2/int for phenotype 1
    t = lambda attr: lambda obj: getattr(obj, attr, 'NA')
    rg_df['ph1_h2_int'] = map(t('intercept'), map(t('hsq1'), rg_out))
    rg_df['ph1_h2_int_se'] = map(t('intercept_se'), map(t('hsq1'), rg_out))
    rg_df['ph1_h2_obs'] = map(t('tot'), map(t('hsq1'), rg_out))
    rg_df['ph1_h2_obs_se'] = map(t('tot_se'), map(t('hsq1'), rg_out))

    # add phenotype info
    rg_df.insert(0, 'description', str(phdesc))
    rg_df.insert(0, 'phenotype', str(phname))

    # add sample size info
    rg_df['ph1_n'] = n1
    rg_df['ph1_n_case'] = ncas1
    rg_df['ph1_n_control'] = ncon1
    rg_df['ph2_n'] = n2
    rg_df['ph2_n_case'] = ncas2
    rg_df['ph2_n_control'] = ncon2

    print '#########'
    print 'rg: ' + str(rg_df['rg'])
    print '#########'

    return rg_df
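# Hedged usage sketch for ldsc_rg_pair. All names, paths, and sample sizes
# below are hypothetical placeholders. Note that, as written, the function
# body refers to wd, gs_sumstat_dir, and ld_ref_panel directly rather than
# reading them from **kwargs, so they are assumed to already be defined in the
# worker's scope (e.g. as module globals pushed to parallel engines).
wd = '/home/ldsc/scratch'                    # local working directory
gs_sumstat_dir = 'gs://my-bucket/sumstats'   # GCS folder holding the sumstats
ld_ref_panel = '/home/ldsc/eur_w_ld_chr/'    # LD score reference panel prefix

example_row = ['PHEN1',                 # args[0]: phenotype name
               'Example phenotype',     # args[1]: phenotype description
               'phen1.sumstats.gz',     # args[2]: sumstats file, phenotype 1
               'phen2.sumstats.gz',     # args[3]: sumstats file, phenotype 2
               100000, 40000, 60000,    # args[4:7]: N, N_cases, N_controls (phenotype 1)
               90000, 30000, 60000]     # args[7:10]: N, N_cases, N_controls (phenotype 2)
rg_summary = ldsc_rg_pair(example_row)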
    if args.w_ld and args.w_ld_chr:
        raise ValueError('Cannot set both --w-ld and --w-ld-chr.')
    if (args.samp_prev is not None) != (args.pop_prev is not None):
        raise ValueError(
            'Must set both or neither of --samp-prev and --pop-prev.')

    if not args.overlap_annot or args.not_M_5_50:
        if args.frqfile is not None or args.frqfile_chr is not None:
            log.log('The frequency file is unnecessary and is being ignored.')
            args.frqfile = None
            args.frqfile_chr = None
    if args.overlap_annot and not args.not_M_5_50:
        if not ((args.frqfile and args.ref_ld) or
                (args.frqfile_chr and args.ref_ld_chr)):
            raise ValueError(
                'Must set either --frqfile and --ref-ld or --frqfile-chr and --ref-ld-chr')

    if args.rg:
        sumstats.estimate_rg(args, log)
    elif args.h2:
        sumstats.estimate_h2(args, log)
    # bad flags
    else:
        print header
        print 'Error: no analysis selected.'
        print 'ldsc.py -h describes options.'
except Exception:
    ex_type, ex, tb = sys.exc_info()
    log.log(traceback.format_exc(ex))
    raise
finally:
    log.log('Analysis finished at {T}'.format(T=time.ctime()))
    time_elapsed = round(time.time() - start_time, 2)
def ldsc_rg_target(ph_list, **kwargs):
    """
    Assumes keyword args for:
    wd
    gs_sumstat_dir
    ld_ref_panel
    target_name
    """
    # log
    print "Starting phenotypes: "
    print ph_list

    # download sumstats for phens
    for ph in ph_list:
        gs_ss_path = gs_sumstat_dir + '/' + str(ph) + '.ukbb.sumstats.gz'
        loc_ss_path = wd + '/' + str(ph) + '.ukbb.sumstats.gz'
        subprocess.call(['gsutil', 'cp', gs_ss_path, loc_ss_path])

    # list of files
    ukb_loc_list = ','.join(
        [wd + '/' + str(x) + ".ukbb.sumstats.gz" for x in ph_list])
    rg_file_list = ','.join([loc_target_ss, ukb_loc_list])

    # list of names
    ukb_name_list = [str(x) + ".ukbb" for x in ph_list]
    rg_name_list = [target_name] + ukb_name_list

    # dummy output name
    rg_out = wd + '/' + 'rg.summary'

    # args for ldsc
    args_ldsc_rg = Namespace(out=rg_out, bfile=None, l2=None, extract=None,
                             keep=None, ld_wind_snps=None, ld_wind_kb=None,
                             ld_wind_cm=None, print_snps=None, annot=None,
                             thin_annot=False, cts_bin=None, cts_break=None,
                             cts_names=None, per_allele=False, pq_exp=None,
                             no_print_annot=False, maf=None, h2=None,
                             rg=rg_file_list, ref_ld=None,
                             ref_ld_chr=ld_ref_panel, w_ld=None,
                             w_ld_chr=ld_ref_panel, overlap_annot=False,
                             no_intercept=False, intercept_h2=None,
                             intercept_gencov=None, M=None, two_step=None,
                             chisq_max=None, print_cov=False,
                             print_delete_vals=False, chunk_size=50,
                             pickle=False, invert_anyway=False,
                             yes_really=False, n_blocks=200, not_M_5_50=False,
                             return_silly_things=False, no_check_alleles=False,
                             print_coefficients=False, samp_prev=None,
                             pop_prev=None, frqfile=None, h2_cts=None,
                             frqfile_chr=None, print_all_cts=False)

    # run rg
    rg_out = sumstats.estimate_rg(args_ldsc_rg, Logger_to_Logging())

    # format output
    rg_tab_txt = sumstats._get_rg_table(rg_name_list, rg_out, args_ldsc_rg)
    rg_df = pd.read_csv(StringIO(rg_tab_txt), delim_whitespace=True)

    return rg_df
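# Hedged usage sketch for ldsc_rg_target; phenotype codes and paths are
# hypothetical placeholders. As with ldsc_rg_pair, the function body uses wd,
# gs_sumstat_dir, ld_ref_panel, target_name, and loc_target_ss directly, so
# they are assumed to be defined in the worker's scope before the call.
wd = '/home/ldsc/scratch'
gs_sumstat_dir = 'gs://my-bucket/ukbb_sumstats'
ld_ref_panel = '/home/ldsc/eur_w_ld_chr/'
target_name = 'my_target_gwas'
loc_target_ss = wd + '/my_target_gwas.sumstats.gz'  # target sumstats, already local

example_phens = ['PHEN1', 'PHEN2']  # hypothetical UKBB phenotype codes
rg_table = ldsc_rg_target(example_phens)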