Ejemplo n.º 1
0
 def test_chisq(self):
     '''Parse the fixture .chisq file and check its columns and values.'''
     parsed = ps.chisq('test/parse_test/test.chisq')

     # Columns whose values must match exactly.
     exact_columns = (
         ('SNP', ['rs1', 'rs2', 'rs3']),
         ('N', [100, 100, 100]),
         ('INFO', [1, 1, 1]),
     )
     for column, expected in exact_columns:
         self.assertEqual(list(parsed[column]), expected)

     # MAF is floating point: compare element-wise within 10e-6 (i.e. 1e-5).
     maf_error = np.abs(parsed['MAF'] - [0.5, 0.01, 0.01])
     assert np.all(maf_error < 10e-6)

     # The parsed frame must expose exactly these columns, in this order.
     expected_header = ['SNP', 'N', 'CHISQ', 'INFO', 'MAF']
     assert np.all(parsed.columns == expected_header)
Ejemplo n.º 2
0
def sumstats(args):
    '''
    Wrapper function for estimating
        1. h2 / partitioned h2
        2. genetic covariance / correlation
        3. LD Score regression intercept

    from reference panel LD and GWAS summary statistics.

    Parameters
    ----------
    args : namespace-like object
        The attributes read here are out, sumstats_h2, sumstats_intercept,
        sumstats_gencor, ref_ld, ref_ld_chr, regression_snp_ld,
        regression_snp_ld_chr, info_max, info_min, maf and num_blocks, plus
        overlap and rho for the genetic correlation analysis.

    Returns
    -------
    The jk.Hsq result if args.sumstats_intercept is set, otherwise
    [M_annot, jk.Hsq result] if args.sumstats_h2 is set, otherwise
    [M_annot, jk.Gencor result] if args.sumstats_gencor is set.

    Raises
    ------
    ValueError
        If a required input (summary statistics, reference panel LD,
        regression SNP LD) is not supplied, if one of the parsers raises
        ValueError, if a reference LD Score column has zero variance, if the
        regression SNP LD table is not SNP plus exactly one LD Score column,
        or if a filter removes every SNP.

    '''

    # open output files
    log = logger(args.out + ".log")
    log.log(args)

    # read .chisq or betaprod
    # Fail early with a clear message instead of hitting an unbound local below.
    if not (args.sumstats_h2 or args.sumstats_intercept
            or args.sumstats_gencor):
        raise ValueError('No summary statistics supplied (expected one of '
                         'sumstats_h2, sumstats_intercept, sumstats_gencor).')
    try:
        if args.sumstats_h2:
            sumstats = ps.chisq(args.sumstats_h2)
        elif args.sumstats_intercept:
            sumstats = ps.chisq(args.sumstats_intercept)
        else:
            sumstats = ps.betaprod(args.sumstats_gencor)
    except ValueError:
        log.log('Error parsing summary statistics.')
        raise  # bare raise keeps the original traceback

    log_msg = 'Read summary statistics for {N} SNPs.'
    log.log(log_msg.format(N=len(sumstats)))

    # read reference panel LD Scores and .M
    if not (args.ref_ld or args.ref_ld_chr):
        raise ValueError('No reference panel LD Scores supplied '
                         '(expected ref_ld or ref_ld_chr).')
    try:
        if args.ref_ld:
            ref_ldscores = ps.ldscore(args.ref_ld)
            M_annot = ps.M(args.ref_ld)
        else:
            # 22 is presumably the number of per-chromosome files -- TODO confirm
            ref_ldscores = ps.ldscore(args.ref_ld_chr, 22)
            M_annot = ps.M(args.ref_ld_chr, 22)
    except ValueError:
        log.log('Error parsing reference LD.')
        raise

    # the first column is the SNP identifier; the rest are LD Scores
    if np.any(ref_ldscores.iloc[:, 1:].var(axis=0) == 0):
        raise ValueError('Zero-variance LD Score. Possibly an empty column?')

    log_msg = 'Read reference panel LD Scores for {N} SNPs.'
    log.log(log_msg.format(N=len(ref_ldscores)))

    # read regression SNP LD Scores
    if not (args.regression_snp_ld or args.regression_snp_ld_chr):
        raise ValueError('No regression SNP LD Scores supplied (expected '
                         'regression_snp_ld or regression_snp_ld_chr).')
    try:
        if args.regression_snp_ld:
            w_ldscores = ps.ldscore(args.regression_snp_ld)
        else:
            w_ldscores = ps.ldscore(args.regression_snp_ld_chr, 22)
    except ValueError:
        log.log('Error parsing regression SNP LD')
        raise

    # The rename below needs exactly two columns; check explicitly so the
    # failure is not an opaque pandas length-mismatch error.
    if len(w_ldscores.columns) != 2:
        raise ValueError('Regression SNP LD Scores must contain the SNP '
                         'column and exactly one LD Score column.')
    # to keep the column names from being the same
    w_ldscores.columns = ['SNP', 'LD_weights']

    log_msg = 'Read LD Scores for {N} SNPs to be retained for regression.'
    log.log(log_msg.format(N=len(w_ldscores)))

    # merge with reference panel LD Scores
    sumstats = pd.merge(sumstats, ref_ldscores, how="inner", on="SNP")
    log_msg = 'After merging with reference panel LD, {N} SNPs remain.'
    log.log(log_msg.format(N=len(sumstats)))

    # merge with regression SNP LD Scores
    sumstats = pd.merge(sumstats, w_ldscores, how="inner", on="SNP")
    log_msg = 'After merging with regression SNP LD, {N} SNPs remain.'
    log.log(log_msg.format(N=len(sumstats)))

    # this has to be here, because pandas will modify duplicate column names on merge
    ### TODO still not quite satisfactory -- what if user accidentally submits
    # the same file for ref_ld and w_ld?
    # maybe don't merge, but just get the row numbers to prevent modification of colnames??
    ref_ld_colnames = ref_ldscores.columns[1:]
    w_ld_colname = sumstats.columns[-1]
    del ref_ldscores
    del w_ldscores

    err_msg = 'No SNPs retained for analysis after filtering on {C} {P} {F}.'
    log_msg = 'After filtering on {C} {P} {F}, {N} SNPs remain.'
    # with two phenotypes the filtered columns carry a 1 / 2 suffix
    suffixes = ['1', '2'] if args.sumstats_gencor else ['']
    # (column, predicate symbol, threshold, predicate), applied in this order
    filters = [
        ('INFO', '<', args.info_max, lambda x: x < args.info_max),
        ('INFO', '>', args.info_min, lambda x: x > args.info_min),
        ('MAF', '>', args.maf, lambda x: x > args.maf),
    ]
    for base_cname, pred_str, threshold, pred in filters:
        if threshold is None:
            continue  # this filter was not requested
        for suffix in suffixes:
            cname = base_cname + suffix
            sumstats = ps.filter_df(sumstats, cname, pred)
            snp_count = len(sumstats)
            if snp_count == 0:
                raise ValueError(
                    err_msg.format(C=cname, F=threshold, P=pred_str))
            log.log(
                log_msg.format(C=cname, F=threshold, N=snp_count, P=pred_str))

    # LD Score regression intercept
    if args.sumstats_intercept:
        log.log('Estimating LD Score regression intercept')
        # filter out large-effect loci
        max_N = np.max(sumstats['N'])
        max_chisq = max(0.01 * max_N, 20)
        sumstats = sumstats[sumstats['CHISQ'] < max_chisq]
        log_msg = 'After filtering on chi^2 < {C}, {N} SNPs remain.'
        snp_count = len(sumstats)
        if snp_count == 0:
            raise ValueError(log_msg.format(C=max_chisq, N='no'))
        log.log(log_msg.format(C=max_chisq, N=snp_count))

        n_annot = len(ref_ld_colnames)
        ref_ld = np.matrix(sumstats[ref_ld_colnames]).reshape(
            (snp_count, n_annot))
        w_ld = np.matrix(sumstats[w_ld_colname]).reshape((snp_count, 1))
        M_annot = np.matrix(M_annot).reshape((1, n_annot))
        chisq = np.matrix(sumstats.CHISQ).reshape((snp_count, 1))
        N = np.matrix(sumstats.N).reshape((snp_count, 1))
        del sumstats

        h2hat = jk.Hsq(chisq, ref_ld, w_ld, N, M_annot, args.num_blocks)
        log.log(_print_intercept(h2hat))
        return h2hat

    # LD Score regression to estimate h2
    elif args.sumstats_h2:
        log.log('Estimating heritability.')

        snp_count = len(sumstats)
        n_annot = len(ref_ld_colnames)
        ref_ld = np.matrix(sumstats[ref_ld_colnames]).reshape(
            (snp_count, n_annot))
        w_ld = np.matrix(sumstats[w_ld_colname]).reshape((snp_count, 1))
        M_annot = np.matrix(M_annot).reshape((1, n_annot))
        chisq = np.matrix(sumstats.CHISQ).reshape((snp_count, 1))
        N = np.matrix(sumstats.N).reshape((snp_count, 1))
        del sumstats

        h2hat = jk.Hsq(chisq, ref_ld, w_ld, N, M_annot, args.num_blocks)
        log.log(_print_hsq(h2hat, ref_ld_colnames))
        return [M_annot, h2hat]

    # LD Score regression to estimate genetic correlation
    elif args.sumstats_gencor:
        log.log('Estimating genetic correlation.')

        snp_count = len(sumstats)
        n_annot = len(ref_ld_colnames)
        ref_ld = np.matrix(sumstats[ref_ld_colnames]).reshape(
            (snp_count, n_annot))
        w_ld = np.matrix(sumstats[w_ld_colname]).reshape((snp_count, 1))
        M_annot = np.matrix(M_annot).reshape((1, n_annot))
        betahat1 = np.matrix(sumstats.BETAHAT1).reshape((snp_count, 1))
        betahat2 = np.matrix(sumstats.BETAHAT2).reshape((snp_count, 1))
        N1 = np.matrix(sumstats.N1).reshape((snp_count, 1))
        N2 = np.matrix(sumstats.N2).reshape((snp_count, 1))
        del sumstats

        gchat = jk.Gencor(betahat1, betahat2, ref_ld, w_ld, N1, N2, M_annot,
                          args.overlap, args.rho, args.num_blocks)

        log.log('\n')
        log.log('Heritability of first phenotype')
        log.log('-------------------------------')
        log.log(_print_hsq(gchat.hsq1, ref_ld_colnames))
        log.log('\n')
        log.log('Heritability of second phenotype')
        log.log('--------------------------------')
        log.log(_print_hsq(gchat.hsq2, ref_ld_colnames))
        log.log('\n')
        log.log('Genetic Covariance')
        log.log('------------------')
        log.log(_print_gencov(gchat.gencov, ref_ld_colnames))
        log.log('\n')
        log.log('Genetic Correlation')
        log.log('-------------------')
        log.log(_print_gencor(gchat))

        return [M_annot, gchat]
Ejemplo n.º 3
0
def sumstats(args):
	'''
	Wrapper function for estimating
		1. h2 / partitioned h2
		2. genetic covariance / correlation
		3. LD Score regression intercept

	from reference panel LD and GWAS summary statistics.

	Parameters
	----------
	args : namespace-like object
		The attributes read here are out, sumstats_h2, sumstats_intercept,
		sumstats_gencor, ref_ld, ref_ld_chr, regression_snp_ld,
		regression_snp_ld_chr, info_max, info_min, maf and num_blocks, plus
		overlap and rho for the genetic correlation analysis.

	Returns
	-------
	The jk.Hsq result if args.sumstats_intercept is set, otherwise
	[M_annot, jk.Hsq result] if args.sumstats_h2 is set, otherwise
	[M_annot, jk.Gencor result] if args.sumstats_gencor is set.

	Raises
	------
	ValueError
		If a required input (summary statistics, reference panel LD,
		regression SNP LD) is not supplied, if one of the parsers raises
		ValueError, if a reference LD Score column has zero variance, if the
		regression SNP LD table is not SNP plus exactly one LD Score column,
		or if a filter removes every SNP.

	'''

	# open output files
	log = logger(args.out + ".log")
	log.log(args)

	# read .chisq or betaprod
	# Fail early with a clear message instead of hitting an unbound local below.
	if not (args.sumstats_h2 or args.sumstats_intercept or args.sumstats_gencor):
		raise ValueError('No summary statistics supplied (expected one of '
			'sumstats_h2, sumstats_intercept, sumstats_gencor).')
	try:
		if args.sumstats_h2:
			sumstats = ps.chisq(args.sumstats_h2)
		elif args.sumstats_intercept:
			sumstats = ps.chisq(args.sumstats_intercept)
		else:
			sumstats = ps.betaprod(args.sumstats_gencor)
	except ValueError:
		log.log('Error parsing summary statistics.')
		raise  # bare raise keeps the original traceback

	log_msg = 'Read summary statistics for {N} SNPs.'
	log.log(log_msg.format(N=len(sumstats)))

	# read reference panel LD Scores and .M
	if not (args.ref_ld or args.ref_ld_chr):
		raise ValueError('No reference panel LD Scores supplied '
			'(expected ref_ld or ref_ld_chr).')
	try:
		if args.ref_ld:
			ref_ldscores = ps.ldscore(args.ref_ld)
			M_annot = ps.M(args.ref_ld)
		else:
			# 22 is presumably the number of per-chromosome files -- TODO confirm
			ref_ldscores = ps.ldscore(args.ref_ld_chr, 22)
			M_annot = ps.M(args.ref_ld_chr, 22)
	except ValueError:
		log.log('Error parsing reference LD.')
		raise

	# the first column is the SNP identifier; the rest are LD Scores
	if np.any(ref_ldscores.iloc[:, 1:].var(axis=0) == 0):
		raise ValueError('Zero-variance LD Score. Possibly an empty column?')

	log_msg = 'Read reference panel LD Scores for {N} SNPs.'
	log.log(log_msg.format(N=len(ref_ldscores)))

	# read regression SNP LD Scores
	if not (args.regression_snp_ld or args.regression_snp_ld_chr):
		raise ValueError('No regression SNP LD Scores supplied (expected '
			'regression_snp_ld or regression_snp_ld_chr).')
	try:
		if args.regression_snp_ld:
			w_ldscores = ps.ldscore(args.regression_snp_ld)
		else:
			w_ldscores = ps.ldscore(args.regression_snp_ld_chr, 22)
	except ValueError:
		log.log('Error parsing regression SNP LD')
		raise

	# The rename below needs exactly two columns; check explicitly so the
	# failure is not an opaque pandas length-mismatch error.
	if len(w_ldscores.columns) != 2:
		raise ValueError('Regression SNP LD Scores must contain the SNP '
			'column and exactly one LD Score column.')
	# to keep the column names from being the same
	w_ldscores.columns = ['SNP', 'LD_weights']

	log_msg = 'Read LD Scores for {N} SNPs to be retained for regression.'
	log.log(log_msg.format(N=len(w_ldscores)))

	# merge with reference panel LD Scores
	sumstats = pd.merge(sumstats, ref_ldscores, how="inner", on="SNP")
	log_msg = 'After merging with reference panel LD, {N} SNPs remain.'
	log.log(log_msg.format(N=len(sumstats)))

	# merge with regression SNP LD Scores
	sumstats = pd.merge(sumstats, w_ldscores, how="inner", on="SNP")
	log_msg = 'After merging with regression SNP LD, {N} SNPs remain.'
	log.log(log_msg.format(N=len(sumstats)))

	# this has to be here, because pandas will modify duplicate column names on merge
	### TODO still not quite satisfactory -- what if user accidentally submits
	# the same file for ref_ld and w_ld?
	# maybe don't merge, but just get the row numbers to prevent modification of colnames??
	ref_ld_colnames = ref_ldscores.columns[1:]
	w_ld_colname = sumstats.columns[-1]
	del ref_ldscores
	del w_ldscores

	err_msg = 'No SNPs retained for analysis after filtering on {C} {P} {F}.'
	log_msg = 'After filtering on {C} {P} {F}, {N} SNPs remain.'
	# with two phenotypes the filtered columns carry a 1 / 2 suffix
	suffixes = ['1', '2'] if args.sumstats_gencor else ['']
	# (column, predicate symbol, threshold, predicate), applied in this order
	filters = [
		('INFO', '<', args.info_max, lambda x: x < args.info_max),
		('INFO', '>', args.info_min, lambda x: x > args.info_min),
		('MAF', '>', args.maf, lambda x: x > args.maf),
	]
	for base_cname, pred_str, threshold, pred in filters:
		if threshold is None:
			continue  # this filter was not requested
		for suffix in suffixes:
			cname = base_cname + suffix
			sumstats = ps.filter_df(sumstats, cname, pred)
			snp_count = len(sumstats)
			if snp_count == 0:
				raise ValueError(err_msg.format(C=cname, F=threshold, P=pred_str))
			log.log(log_msg.format(C=cname, F=threshold, N=snp_count, P=pred_str))

	# LD Score regression intercept
	if args.sumstats_intercept:
		log.log('Estimating LD Score regression intercept')
		# filter out large-effect loci
		max_N = np.max(sumstats['N'])
		max_chisq = max(0.01*max_N, 20)
		sumstats = sumstats[sumstats['CHISQ'] < max_chisq]
		log_msg = 'After filtering on chi^2 < {C}, {N} SNPs remain.'
		snp_count = len(sumstats)
		if snp_count == 0:
			raise ValueError(log_msg.format(C=max_chisq, N='no'))
		log.log(log_msg.format(C=max_chisq, N=snp_count))

		n_annot = len(ref_ld_colnames)
		ref_ld = np.matrix(sumstats[ref_ld_colnames]).reshape((snp_count, n_annot))
		w_ld = np.matrix(sumstats[w_ld_colname]).reshape((snp_count, 1))
		M_annot = np.matrix(M_annot).reshape((1, n_annot))
		chisq = np.matrix(sumstats.CHISQ).reshape((snp_count, 1))
		N = np.matrix(sumstats.N).reshape((snp_count, 1))
		del sumstats

		h2hat = jk.Hsq(chisq, ref_ld, w_ld, N, M_annot, args.num_blocks)
		log.log(_print_intercept(h2hat))
		return h2hat

	# LD Score regression to estimate h2
	elif args.sumstats_h2:
		log.log('Estimating heritability.')

		snp_count = len(sumstats)
		n_annot = len(ref_ld_colnames)
		ref_ld = np.matrix(sumstats[ref_ld_colnames]).reshape((snp_count, n_annot))
		w_ld = np.matrix(sumstats[w_ld_colname]).reshape((snp_count, 1))
		M_annot = np.matrix(M_annot).reshape((1, n_annot))
		chisq = np.matrix(sumstats.CHISQ).reshape((snp_count, 1))
		N = np.matrix(sumstats.N).reshape((snp_count, 1))
		del sumstats

		h2hat = jk.Hsq(chisq, ref_ld, w_ld, N, M_annot, args.num_blocks)
		log.log(_print_hsq(h2hat, ref_ld_colnames))
		return [M_annot, h2hat]

	# LD Score regression to estimate genetic correlation
	elif args.sumstats_gencor:
		log.log('Estimating genetic correlation.')

		snp_count = len(sumstats)
		n_annot = len(ref_ld_colnames)
		ref_ld = np.matrix(sumstats[ref_ld_colnames]).reshape((snp_count, n_annot))
		w_ld = np.matrix(sumstats[w_ld_colname]).reshape((snp_count, 1))
		M_annot = np.matrix(M_annot).reshape((1, n_annot))
		betahat1 = np.matrix(sumstats.BETAHAT1).reshape((snp_count, 1))
		betahat2 = np.matrix(sumstats.BETAHAT2).reshape((snp_count, 1))
		N1 = np.matrix(sumstats.N1).reshape((snp_count, 1))
		N2 = np.matrix(sumstats.N2).reshape((snp_count, 1))
		del sumstats

		gchat = jk.Gencor(betahat1, betahat2, ref_ld, w_ld, N1, N2, M_annot,
			args.overlap, args.rho, args.num_blocks)

		log.log('\n')
		log.log('Heritability of first phenotype')
		log.log('-------------------------------')
		log.log(_print_hsq(gchat.hsq1, ref_ld_colnames))
		log.log('\n')
		log.log('Heritability of second phenotype')
		log.log('--------------------------------')
		log.log(_print_hsq(gchat.hsq2, ref_ld_colnames))
		log.log('\n')
		log.log('Genetic Covariance')
		log.log('------------------')
		log.log(_print_gencov(gchat.gencov, ref_ld_colnames))
		log.log('\n')
		log.log('Genetic Correlation')
		log.log('-------------------')
		log.log(_print_gencor(gchat))

		return [M_annot, gchat]