def test_chisq(self):
    '''Parse the .chisq test fixture and check its columns and their values.'''
    parsed = ps.chisq('test/parse_test/test.chisq')

    # Columns that must match exactly once converted to plain lists.
    exact_columns = (
        ('SNP', ['rs1', 'rs2', 'rs3']),
        ('N', [100, 100, 100]),
        ('INFO', [1, 1, 1]),
    )
    for column, values in exact_columns:
        self.assertEqual(list(parsed[column]), values)

    # MAF is compared element-wise within an absolute tolerance.
    maf_error = np.abs(parsed['MAF'] - [0.5, 0.01, 0.01])
    assert np.all(maf_error < 10e-6)

    # Column names and their order.
    column_names = ['SNP', 'N', 'CHISQ', 'INFO', 'MAF']
    assert np.all(parsed.columns == column_names)
def _column_matrix(frame, columns, n_cols=1):
    '''
    Return frame[columns] as an np.matrix reshaped to (len(frame), n_cols).
    '''
    return np.matrix(frame[columns]).reshape((len(frame), n_cols))


def _ld_matrices(frame, ref_ld_colnames, w_ld_colname, M_annot):
    '''
    Build the inputs shared by every regression: the reference LD Score
    matrix, the regression-weight LD Score column and M as a row matrix.
    '''
    n_annot = len(ref_ld_colnames)
    ref_ld = _column_matrix(frame, ref_ld_colnames, n_annot)
    w_ld = _column_matrix(frame, w_ld_colname)
    M_annot = np.matrix(M_annot).reshape((1, n_annot))
    return ref_ld, w_ld, M_annot


def sumstats(args):
    '''
    Wrapper function for estimating
        1. h2 / partitioned h2
        2. genetic covariance / correlation
        3. LD Score regression intercept

    from reference panel LD and GWAS summary statistics.

    Attributes read from args: out, sumstats_h2, sumstats_intercept,
    sumstats_gencor, ref_ld, ref_ld_chr, regression_snp_ld,
    regression_snp_ld_chr, info_max, info_min, maf, num_blocks and, for
    genetic correlation only, overlap and rho.

    Returns
        - the jk.Hsq result when args.sumstats_intercept is set;
        - [M_annot, jk.Hsq result] when args.sumstats_h2 is set;
        - [M_annot, jk.Gencor result] when args.sumstats_gencor is set.

    Raises ValueError when
        - a required input argument is missing;
        - one of the parsers raises ValueError (logged, then re-raised);
        - a reference LD Score column has zero variance;
        - a reference LD Score column name clashes with another input;
        - no SNPs remain after a merge or a filter.
    '''
    # open output files
    log = logger(args.out + ".log")
    log.log(args)

    # read .chisq or betaprod
    # NOTE(review): the file is chosen with priority h2 > intercept > gencor,
    # whereas the analysis below is chosen with priority intercept > h2 >
    # gencor. Both orders are kept exactly as they were.
    if args.sumstats_h2:
        reader, fname = ps.chisq, args.sumstats_h2
    elif args.sumstats_intercept:
        reader, fname = ps.chisq, args.sumstats_intercept
    elif args.sumstats_gencor:
        reader, fname = ps.betaprod, args.sumstats_gencor
    else:
        # Without this guard the code below fails with UnboundLocalError.
        raise ValueError(
            'No summary statistics given: one of sumstats_h2, '
            'sumstats_intercept or sumstats_gencor is required.')
    try:
        data = reader(fname)
    except ValueError:
        log.log('Error parsing summary statistics.')
        raise  # bare raise keeps the parser's traceback
    log.log('Read summary statistics for {N} SNPs.'.format(N=len(data)))

    # read reference panel LD Scores and .M
    if args.ref_ld:
        ref_args = (args.ref_ld,)
    elif args.ref_ld_chr:
        # 22 is handed to the parsers for the per-chromosome input
        # (presumably the number of chromosomes -- confirm against ps).
        ref_args = (args.ref_ld_chr, 22)
    else:
        raise ValueError(
            'No reference panel LD Scores given: one of ref_ld or '
            'ref_ld_chr is required.')
    try:
        ref_ldscores = ps.ldscore(*ref_args)
        M_annot = ps.M(*ref_args)
    except ValueError:
        log.log('Error parsing reference LD.')
        raise
    # the first column is SNP; every other column is an LD Score
    if np.any(ref_ldscores.iloc[:, 1:].var(axis=0) == 0):
        raise ValueError('Zero-variance LD Score. Possibly an empty column?')
    log.log('Read reference panel LD Scores for {N} SNPs.'.format(
        N=len(ref_ldscores)))

    # read regression SNP LD Scores
    if args.regression_snp_ld:
        w_args = (args.regression_snp_ld,)
    elif args.regression_snp_ld_chr:
        w_args = (args.regression_snp_ld_chr, 22)
    else:
        raise ValueError(
            'No regression SNP LD Scores given: one of regression_snp_ld or '
            'regression_snp_ld_chr is required.')
    try:
        w_ldscores = ps.ldscore(*w_args)
    except ValueError:
        log.log('Error parsing regression SNP LD')
        raise
    # to keep the column names from being the same
    w_ldscores.columns = ['SNP', 'LD_weights']
    log.log('Read LD Scores for {N} SNPs to be retained for regression.'.format(
        N=len(w_ldscores)))

    # merge with reference panel LD Scores
    data = pd.merge(data, ref_ldscores, how="inner", on="SNP")
    log.log('After merging with reference panel LD, {N} SNPs remain.'.format(
        N=len(data)))
    if len(data) == 0:
        raise ValueError(
            'No SNPs retained for analysis after merging with reference '
            'panel LD.')

    # merge with regression SNP LD Scores
    data = pd.merge(data, w_ldscores, how="inner", on="SNP")
    log.log('After merging with regression SNP LD, {N} SNPs remain.'.format(
        N=len(data)))
    if len(data) == 0:
        raise ValueError(
            'No SNPs retained for analysis after merging with regression '
            'SNP LD.')

    # The weight column has to be looked up after the merges, because pandas
    # will modify duplicate column names on merge.
    ref_ld_colnames = ref_ldscores.columns[1:]
    w_ld_colname = data.columns[-1]
    del ref_ldscores
    del w_ldscores

    # A reference LD column whose name is shared with another input has been
    # suffixed by pandas, so it can no longer be selected by its own name.
    # Report that here instead of failing with a KeyError further down.
    renamed = [str(c) for c in ref_ld_colnames if c not in data.columns]
    if renamed:
        raise ValueError(
            'Reference LD Score column(s) {C} clash with a column name in '
            'another input file.'.format(C=', '.join(renamed)))

    err_msg = 'No SNPs retained for analysis after filtering on {C} {P} {F}.'
    log_msg = 'After filtering on {C} {P} {F}, {N} SNPs remain.'
    # betaprod input carries one copy of each column per phenotype
    suffixes = ['1', '2'] if args.sumstats_gencor else ['']
    # (column, printed predicate, threshold, predicate), in a fixed order
    filters = (
        ('INFO', '<', args.info_max, lambda x: x < args.info_max),
        ('INFO', '>', args.info_min, lambda x: x > args.info_min),
        ('MAF', '>', args.maf, lambda x: x > args.maf),
    )
    for base, pred_str, threshold, pred in filters:
        if threshold is None:
            continue  # this filter was not requested
        for suffix in suffixes:
            cname = base + suffix
            data = ps.filter_df(data, cname, pred)
            snp_count = len(data)
            if snp_count == 0:
                raise ValueError(
                    err_msg.format(C=cname, F=threshold, P=pred_str))
            log.log(log_msg.format(
                C=cname, F=threshold, N=snp_count, P=pred_str))

    # LD Score regression intercept
    if args.sumstats_intercept:
        log.log('Estimating LD Score regression intercept')
        # filter out large-effect loci
        max_N = np.max(data['N'])
        max_chisq = max(0.01 * max_N, 20)
        data = data[data['CHISQ'] < max_chisq]
        log_msg = 'After filtering on chi^2 < {C}, {N} SNPs remain.'
        snp_count = len(data)
        if snp_count == 0:
            raise ValueError(log_msg.format(C=max_chisq, N='no'))
        log.log(log_msg.format(C=max_chisq, N=snp_count))

        ref_ld, w_ld, M_annot = _ld_matrices(
            data, ref_ld_colnames, w_ld_colname, M_annot)
        chisq = _column_matrix(data, 'CHISQ')
        N = _column_matrix(data, 'N')
        del data  # drop the merged frame before the regression runs
        h2hat = jk.Hsq(chisq, ref_ld, w_ld, N, M_annot, args.num_blocks)
        log.log(_print_intercept(h2hat))
        return h2hat

    # LD Score regression to estimate h2
    elif args.sumstats_h2:
        log.log('Estimating heritability.')
        ref_ld, w_ld, M_annot = _ld_matrices(
            data, ref_ld_colnames, w_ld_colname, M_annot)
        chisq = _column_matrix(data, 'CHISQ')
        N = _column_matrix(data, 'N')
        del data
        h2hat = jk.Hsq(chisq, ref_ld, w_ld, N, M_annot, args.num_blocks)
        log.log(_print_hsq(h2hat, ref_ld_colnames))
        return [M_annot, h2hat]

    # LD Score regression to estimate genetic correlation
    elif args.sumstats_gencor:
        log.log('Estimating genetic correlation.')
        ref_ld, w_ld, M_annot = _ld_matrices(
            data, ref_ld_colnames, w_ld_colname, M_annot)
        betahat1 = _column_matrix(data, 'BETAHAT1')
        betahat2 = _column_matrix(data, 'BETAHAT2')
        N1 = _column_matrix(data, 'N1')
        N2 = _column_matrix(data, 'N2')
        del data
        gchat = jk.Gencor(betahat1, betahat2, ref_ld, w_ld, N1, N2, M_annot,
                          args.overlap, args.rho, args.num_blocks)

        log.log('\n')
        log.log('Heritability of first phenotype')
        log.log('-------------------------------')
        log.log(_print_hsq(gchat.hsq1, ref_ld_colnames))

        log.log('\n')
        log.log('Heritability of second phenotype')
        log.log('--------------------------------')
        log.log(_print_hsq(gchat.hsq2, ref_ld_colnames))

        log.log('\n')
        log.log('Genetic Covariance')
        log.log('------------------')
        log.log(_print_gencov(gchat.gencov, ref_ld_colnames))

        log.log('\n')
        log.log('Genetic Correlation')
        log.log('-------------------')
        log.log(_print_gencor(gchat))

        return [M_annot, gchat]
def _column_matrix(frame, columns, n_cols=1):
    '''
    Return frame[columns] as an np.matrix reshaped to (len(frame), n_cols).
    '''
    return np.matrix(frame[columns]).reshape((len(frame), n_cols))


def _ld_matrices(frame, ref_ld_colnames, w_ld_colname, M_annot):
    '''
    Build the inputs shared by every regression: the reference LD Score
    matrix, the regression-weight LD Score column and M as a row matrix.
    '''
    n_annot = len(ref_ld_colnames)
    ref_ld = _column_matrix(frame, ref_ld_colnames, n_annot)
    w_ld = _column_matrix(frame, w_ld_colname)
    M_annot = np.matrix(M_annot).reshape((1, n_annot))
    return ref_ld, w_ld, M_annot


def sumstats(args):
    '''
    Wrapper function for estimating
        1. h2 / partitioned h2
        2. genetic covariance / correlation
        3. LD Score regression intercept

    from reference panel LD and GWAS summary statistics.

    Attributes read from args: out, sumstats_h2, sumstats_intercept,
    sumstats_gencor, ref_ld, ref_ld_chr, regression_snp_ld,
    regression_snp_ld_chr, info_max, info_min, maf, num_blocks and, for
    genetic correlation only, overlap and rho.

    Returns
        - the jk.Hsq result when args.sumstats_intercept is set;
        - [M_annot, jk.Hsq result] when args.sumstats_h2 is set;
        - [M_annot, jk.Gencor result] when args.sumstats_gencor is set.

    Raises ValueError when
        - a required input argument is missing;
        - one of the parsers raises ValueError (logged, then re-raised);
        - a reference LD Score column has zero variance;
        - a reference LD Score column name clashes with another input;
        - no SNPs remain after a merge or a filter.
    '''
    # open output files
    log = logger(args.out + ".log")
    log.log(args)

    # read .chisq or betaprod
    # NOTE(review): the file is chosen with priority h2 > intercept > gencor,
    # whereas the analysis below is chosen with priority intercept > h2 >
    # gencor. Both orders are kept exactly as they were.
    if args.sumstats_h2:
        reader, fname = ps.chisq, args.sumstats_h2
    elif args.sumstats_intercept:
        reader, fname = ps.chisq, args.sumstats_intercept
    elif args.sumstats_gencor:
        reader, fname = ps.betaprod, args.sumstats_gencor
    else:
        # Without this guard the code below fails with UnboundLocalError.
        raise ValueError(
            'No summary statistics given: one of sumstats_h2, '
            'sumstats_intercept or sumstats_gencor is required.')
    try:
        data = reader(fname)
    except ValueError:
        log.log('Error parsing summary statistics.')
        raise  # bare raise keeps the parser's traceback
    log.log('Read summary statistics for {N} SNPs.'.format(N=len(data)))

    # read reference panel LD Scores and .M
    if args.ref_ld:
        ref_args = (args.ref_ld,)
    elif args.ref_ld_chr:
        # 22 is handed to the parsers for the per-chromosome input
        # (presumably the number of chromosomes -- confirm against ps).
        ref_args = (args.ref_ld_chr, 22)
    else:
        raise ValueError(
            'No reference panel LD Scores given: one of ref_ld or '
            'ref_ld_chr is required.')
    try:
        ref_ldscores = ps.ldscore(*ref_args)
        M_annot = ps.M(*ref_args)
    except ValueError:
        log.log('Error parsing reference LD.')
        raise
    # the first column is SNP; every other column is an LD Score
    if np.any(ref_ldscores.iloc[:, 1:].var(axis=0) == 0):
        raise ValueError('Zero-variance LD Score. Possibly an empty column?')
    log.log('Read reference panel LD Scores for {N} SNPs.'.format(
        N=len(ref_ldscores)))

    # read regression SNP LD Scores
    if args.regression_snp_ld:
        w_args = (args.regression_snp_ld,)
    elif args.regression_snp_ld_chr:
        w_args = (args.regression_snp_ld_chr, 22)
    else:
        raise ValueError(
            'No regression SNP LD Scores given: one of regression_snp_ld or '
            'regression_snp_ld_chr is required.')
    try:
        w_ldscores = ps.ldscore(*w_args)
    except ValueError:
        log.log('Error parsing regression SNP LD')
        raise
    # to keep the column names from being the same
    w_ldscores.columns = ['SNP', 'LD_weights']
    log.log('Read LD Scores for {N} SNPs to be retained for regression.'.format(
        N=len(w_ldscores)))

    # merge with reference panel LD Scores
    data = pd.merge(data, ref_ldscores, how="inner", on="SNP")
    log.log('After merging with reference panel LD, {N} SNPs remain.'.format(
        N=len(data)))
    if len(data) == 0:
        raise ValueError(
            'No SNPs retained for analysis after merging with reference '
            'panel LD.')

    # merge with regression SNP LD Scores
    data = pd.merge(data, w_ldscores, how="inner", on="SNP")
    log.log('After merging with regression SNP LD, {N} SNPs remain.'.format(
        N=len(data)))
    if len(data) == 0:
        raise ValueError(
            'No SNPs retained for analysis after merging with regression '
            'SNP LD.')

    # The weight column has to be looked up after the merges, because pandas
    # will modify duplicate column names on merge.
    ref_ld_colnames = ref_ldscores.columns[1:]
    w_ld_colname = data.columns[-1]
    del ref_ldscores
    del w_ldscores

    # A reference LD column whose name is shared with another input has been
    # suffixed by pandas, so it can no longer be selected by its own name.
    # Report that here instead of failing with a KeyError further down.
    renamed = [str(c) for c in ref_ld_colnames if c not in data.columns]
    if renamed:
        raise ValueError(
            'Reference LD Score column(s) {C} clash with a column name in '
            'another input file.'.format(C=', '.join(renamed)))

    err_msg = 'No SNPs retained for analysis after filtering on {C} {P} {F}.'
    log_msg = 'After filtering on {C} {P} {F}, {N} SNPs remain.'
    # betaprod input carries one copy of each column per phenotype
    suffixes = ['1', '2'] if args.sumstats_gencor else ['']
    # (column, printed predicate, threshold, predicate), in a fixed order
    filters = (
        ('INFO', '<', args.info_max, lambda x: x < args.info_max),
        ('INFO', '>', args.info_min, lambda x: x > args.info_min),
        ('MAF', '>', args.maf, lambda x: x > args.maf),
    )
    for base, pred_str, threshold, pred in filters:
        if threshold is None:
            continue  # this filter was not requested
        for suffix in suffixes:
            cname = base + suffix
            data = ps.filter_df(data, cname, pred)
            snp_count = len(data)
            if snp_count == 0:
                raise ValueError(
                    err_msg.format(C=cname, F=threshold, P=pred_str))
            log.log(log_msg.format(
                C=cname, F=threshold, N=snp_count, P=pred_str))

    # LD Score regression intercept
    if args.sumstats_intercept:
        log.log('Estimating LD Score regression intercept')
        # filter out large-effect loci
        max_N = np.max(data['N'])
        max_chisq = max(0.01 * max_N, 20)
        data = data[data['CHISQ'] < max_chisq]
        log_msg = 'After filtering on chi^2 < {C}, {N} SNPs remain.'
        snp_count = len(data)
        if snp_count == 0:
            raise ValueError(log_msg.format(C=max_chisq, N='no'))
        log.log(log_msg.format(C=max_chisq, N=snp_count))

        ref_ld, w_ld, M_annot = _ld_matrices(
            data, ref_ld_colnames, w_ld_colname, M_annot)
        chisq = _column_matrix(data, 'CHISQ')
        N = _column_matrix(data, 'N')
        del data  # drop the merged frame before the regression runs
        h2hat = jk.Hsq(chisq, ref_ld, w_ld, N, M_annot, args.num_blocks)
        log.log(_print_intercept(h2hat))
        return h2hat

    # LD Score regression to estimate h2
    elif args.sumstats_h2:
        log.log('Estimating heritability.')
        ref_ld, w_ld, M_annot = _ld_matrices(
            data, ref_ld_colnames, w_ld_colname, M_annot)
        chisq = _column_matrix(data, 'CHISQ')
        N = _column_matrix(data, 'N')
        del data
        h2hat = jk.Hsq(chisq, ref_ld, w_ld, N, M_annot, args.num_blocks)
        log.log(_print_hsq(h2hat, ref_ld_colnames))
        return [M_annot, h2hat]

    # LD Score regression to estimate genetic correlation
    elif args.sumstats_gencor:
        log.log('Estimating genetic correlation.')
        ref_ld, w_ld, M_annot = _ld_matrices(
            data, ref_ld_colnames, w_ld_colname, M_annot)
        betahat1 = _column_matrix(data, 'BETAHAT1')
        betahat2 = _column_matrix(data, 'BETAHAT2')
        N1 = _column_matrix(data, 'N1')
        N2 = _column_matrix(data, 'N2')
        del data
        gchat = jk.Gencor(betahat1, betahat2, ref_ld, w_ld, N1, N2, M_annot,
                          args.overlap, args.rho, args.num_blocks)

        log.log('\n')
        log.log('Heritability of first phenotype')
        log.log('-------------------------------')
        log.log(_print_hsq(gchat.hsq1, ref_ld_colnames))

        log.log('\n')
        log.log('Heritability of second phenotype')
        log.log('--------------------------------')
        log.log(_print_hsq(gchat.hsq2, ref_ld_colnames))

        log.log('\n')
        log.log('Genetic Covariance')
        log.log('------------------')
        log.log(_print_gencov(gchat.gencov, ref_ld_colnames))

        log.log('\n')
        log.log('Genetic Correlation')
        log.log('-------------------')
        log.log(_print_gencor(gchat))

        return [M_annot, gchat]