Esempio n. 1
0
 def setUp(self):
     '''Fit Hsq with and without a fixed intercept on noiseless synthetic data.

     chi^2 is built as 1 + 1e5 * (ld_1 * hsq1 / M_1 + ld_2 * hsq2 / M_2) with
     unit weights, and both fits are stored on the test case. The summaries
     are printed so they show up in the test output.
     '''
     n_snp, n_annot = 400, 2
     self.hsq1 = 0.2
     self.hsq2 = 0.7
     # LD scores are random but always >= 1.
     draws = np.random.normal(size=n_snp * n_annot)
     ld = (np.abs(draws) + 1).reshape((n_snp, n_annot))
     N = np.ones((n_snp, 1)) * 1e5
     self.M = np.ones((1, n_annot)) * 1e7 / 2.0
     signal = (ld[:, 0] * self.hsq1 / self.M[0, 0] +
               ld[:, 1] * self.hsq2 / self.M[0, 1])
     chisq = 1 + 1e5 * signal.reshape((n_snp, 1))
     w_ld = np.ones_like(chisq)
     # intercept=1 fixes the intercept; the second fit leaves it at the default.
     self.hsq_noint = reg.Hsq(
         chisq, ld, w_ld, N, self.M, n_blocks=3, intercept=1)
     self.hsq_int = reg.Hsq(chisq, ld, w_ld, N, self.M, n_blocks=3)
     for fitted in (self.hsq_noint, self.hsq_int):
         print(fitted.summary())
Esempio n. 2
0
    def test_eq_hsq(self):
        '''
        Check that Gencov reproduces Hsq when both traits share the same z-scores.

        The two should agree if z1 = z2, hsq + intercept_hsq are 0 and all
        intermediate rg's are > 0 (Hsq.weights lower-bounds the hsq guess at 0,
        whereas Gencov.weights lower-bounds the rho_g guess at -1). The data
        built below guarantees that every intermediate rho_g guess will be 1.

        '''
        raw_ld = np.random.normal(size=100).reshape((50, 2))
        self.ld = np.abs(raw_ld) + 2
        row_totals = np.sum(self.ld, axis=1)
        self.z1 = (row_totals + 10).reshape((50, 1))
        # The four positional zeros are presumably the hsq / intercept_hsq
        # values the docstring refers to -- confirm against reg.Gencov.
        gencov = reg.Gencov(
            self.z1, self.z1, self.ld, self.w_ld, self.N1, self.N1, self.M,
            0, 0, 0, 0,
            n_blocks=3, intercept_gencov=1)
        hsq = reg.Hsq(
            np.square(self.z1), self.ld, self.w_ld, self.N1, self.M,
            n_blocks=3, intercept=1)
        print(gencov.summary(['asdf', 'asdf']))
        print(hsq.summary(['asdf', 'asdf']))
        # Point estimate, standard error and covariance must all match.
        for attr in ('tot', 'tot_se', 'tot_cov'):
            assert_array_almost_equal(getattr(gencov, attr), getattr(hsq, attr))
Esempio n. 3
0
def cell_type_specific(args, log):
    '''Cell type specific analysis.

    For every (name, LD-score prefix) pair listed in the file given by
    args.ref_ld_chr_cts, regress chi^2 on that cell type's LD scores stacked
    in front of the LD scores read for args.h2_cts, and collect the leading
    coefficient, its standard error and a one-sided P-value. The table is
    sorted by P-value and written to args.out + '.cell_type_results.txt'.

    Parameters
    ----------
    args : namespace-like object
        Parsed options. It is deep-copied first, so the caller's object is
        not modified. Fields read here: intercept_h2, no_intercept, h2_cts,
        n_blocks, chisq_max, ref_ld_chr_cts, not_M_5_50, print_all_cts, out.
    log : object with a log(str) method
        Receives progress messages.

    Raises
    ------
    ValueError
        If any retained SNP has no LD score in one of the cell-type files.
    '''
    args = copy.deepcopy(args)
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        args.intercept_h2 = 1

    # The fifth value returned by _read_ld_sumstats is not needed here.
    M_annot_all_regr, w_ld_cname, ref_ld_cnames_all_regr, sumstats, _ = \
        _read_ld_sumstats(args, log, args.h2_cts)
    _check_ld_condnum(args, log, ref_ld_cnames_all_regr)
    _warn_length(log, sumstats)
    n_snp = len(sumstats)
    n_blocks = min(n_snp, args.n_blocks)
    if args.chisq_max is None:
        chisq_max = max(0.001*sumstats.N.max(), 80)
    else:
        chisq_max = args.chisq_max

    # Drop SNPs whose chi^2 reaches the cutoff before building any arrays.
    ii = np.ravel(sumstats.Z**2 < chisq_max)
    sumstats = sumstats.iloc[ii, :]
    log.log('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
            C=chisq_max, N=np.sum(ii), M=n_snp-np.sum(ii)))
    n_snp = np.sum(ii)
    ref_ld_all_regr = np.array(sumstats[ref_ld_cnames_all_regr]).reshape((len(sumstats), -1))
    chisq = np.array(sumstats.Z**2)
    keep_snps = sumstats[['SNP']]

    def s(x):
        # Column vector with one row per retained SNP.
        return np.array(x).reshape((n_snp, 1))

    # Read the whole cell-type list up front and close the file right away.
    # Blank lines are skipped so a trailing newline does not break unpacking.
    with open(args.ref_ld_chr_cts) as cts_file:
        cts_entries = [line.split() for line in cts_file if line.strip()]

    results_columns = ['Name', 'Coefficient', 'Coefficient_std_error', 'Coefficient_P_value']
    results_data = []
    for (name, ct_ld_chr) in cts_entries:
        ref_ld_cts_allsnps = _read_chr_split_files(ct_ld_chr, None, log,
                                   'cts reference panel LD Score', ps.ldscore_fromlist)
        log.log('Performing regression.')
        # The left merge keeps the retained SNPs in order; unmatched SNPs show up as NaN.
        ref_ld_cts = np.array(pd.merge(keep_snps, ref_ld_cts_allsnps, on='SNP', how='left').iloc[:, 1:])
        if np.any(np.isnan(ref_ld_cts)):
            raise ValueError('Missing some LD scores from cts files. Are you sure all SNPs in ref-ld-chr are also in ref-ld-chr-cts')

        # Cell-type columns go first so their coefficients sit at index 0, 1, ...
        ref_ld = np.hstack([ref_ld_cts, ref_ld_all_regr])
        M_cts = ps.M_fromlist(
                _splitp(ct_ld_chr), _N_CHR, common=(not args.not_M_5_50))
        M_annot = np.hstack([M_cts, M_annot_all_regr])
        hsqhat = reg.Hsq(s(chisq), ref_ld, s(sumstats[w_ld_cname]), s(sumstats.N),
                     M_annot, n_blocks=n_blocks, intercept=args.intercept_h2,
                     twostep=None, old_weights=True)
        coef, coef_se = hsqhat.coef[0], hsqhat.coef_se[0]
        results_data.append((name, coef, coef_se, stats.norm.sf(coef/coef_se)))
        if args.print_all_cts:
            # One extra row per additional comma-separated prefix in this entry.
            for i in range(1, len(ct_ld_chr.split(','))):
                coef, coef_se = hsqhat.coef[i], hsqhat.coef_se[i]
                results_data.append((name+'_'+str(i), coef, coef_se, stats.norm.sf(coef/coef_se)))

    df_results = pd.DataFrame(data=results_data, columns=results_columns)
    df_results.sort_values(by='Coefficient_P_value', inplace=True)
    df_results.to_csv(args.out+'.cell_type_results.txt', sep='\t', index=False)
    log.log('Results printed to '+args.out+'.cell_type_results.txt')
Esempio n. 4
0
 def setUp(self):
     '''Build a four-SNP, single-annotation Hsq fit with the intercept fixed at 1.'''
     shape = (4, 1)
     self.chisq = np.full(shape, 4.0)
     self.ld = np.ones(shape)
     self.w_ld = np.ones(shape)
     self.N = np.full(shape, 9.0)
     # Kept as np.matrix so reg.Hsq receives the same type as before.
     self.M = np.matrix((7))
     self.hsq = reg.Hsq(
         self.chisq,
         self.ld,
         self.w_ld,
         self.N,
         self.M,
         n_blocks=3,
         intercept=1,
     )
Esempio n. 5
0
 def setUp(self):
     '''Build a 17-SNP, two-annotation Hsq fit with the intercept fixed at 1.'''
     n_snp = 17
     shape = (n_snp, 1)
     self.chisq = np.full(shape, 4.0)
     # First LD column is all ones, second is 0..16.
     ones_col = np.ones(shape)
     ramp_col = np.arange(n_snp).reshape(shape)
     self.ld = np.hstack([ones_col, ramp_col]).reshape((n_snp, 2))
     self.w_ld = np.ones(shape)
     self.N = np.full(shape, 9.0)
     # Kept as np.matrix so reg.Hsq receives the same type as before.
     self.M = np.matrix((7, 2))
     self.hsq = reg.Hsq(
         self.chisq,
         self.ld,
         self.w_ld,
         self.N,
         self.M,
         n_blocks=3,
         intercept=1,
     )
Esempio n. 6
0
 def test_summary(self):
     '''Smoke test: summary() must run without raising on several fits.'''
     # Fit created in setUp.
     self.hsq.summary(['asdf'])
     # Add a 0..3 ramp to both inputs (in place) and refit without a fixed intercept.
     ramp = np.arange(4).reshape((4, 1))
     self.ld += ramp
     self.chisq += ramp
     refit = reg.Hsq(
         self.chisq, self.ld, self.w_ld, self.N, self.M, n_blocks=3)
     refit.summary(['asdf'])
     # Exercise the ratio printout for mean chi^2 < 1.
     refit.mean_chisq = 0.5
     refit.summary(['asdf'])
Esempio n. 7
0
 def test_summary(self):
     '''Smoke test: summary() must run without raising on several fits.'''
     names = ('asdf', 'qwer')
     # Fit created in setUp.
     self.hsq.summary(list(names))
     # Random jitter (changed 7/30/2019) avoids inconsistent singular matrix errors.
     jitter = np.random.normal(scale=0.1, size=(17, 2))
     self.ld += jitter
     self.chisq += np.arange(17).reshape((17, 1))
     refit = reg.Hsq(
         self.chisq, self.ld, self.w_ld, self.N, self.M, n_blocks=3)
     refit.summary(list(names))
     # Exercise the ratio printout for mean chi^2 < 1.
     refit.mean_chisq = 0.5
     refit.summary(list(names))
Esempio n. 8
0
def estimate_h2(args, log):
    '''Estimate h2 and partitioned h2.

    Works on a deep copy of args, reads the LD scores and summary statistics
    selected by args.h2, optionally drops SNPs whose chi^2 reaches a cutoff,
    fits reg.Hsq, logs its summary, writes whichever optional outputs were
    requested, and returns the fitted object.
    '''
    args = copy.deepcopy(args)
    if not (args.samp_prev is None or args.pop_prev is None):
        args.samp_prev, args.pop_prev = (
            float(args.samp_prev), float(args.pop_prev))
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        args.intercept_h2 = 1

    ld_inputs = _read_ld_sumstats(args, log, args.h2)
    M_annot, w_ld_cname, ref_ld_cnames, sumstats, novar_cols = ld_inputs
    ref_ld = np.array(sumstats[ref_ld_cnames])
    _check_ld_condnum(args, log, ref_ld_cnames)
    _warn_length(log, sumstats)
    n_snp = len(sumstats)
    n_blocks = min(n_snp, args.n_blocks)
    n_annot = len(ref_ld_cnames)

    # With one annotation: default to the two-step estimator (cutoff 30) unless
    # the intercept is fixed. Otherwise: old weights and a default chi^2 cutoff.
    chisq_max = args.chisq_max
    old_weights = n_annot != 1
    if old_weights:
        if chisq_max is None:
            chisq_max = max(0.001*sumstats.N.max(), 80)
    elif args.two_step is None and args.intercept_h2 is None:
        args.two_step = 30

    def as_column(values):
        # n_snp is looked up at call time, so calls made after the chi^2
        # filter below use the reduced SNP count.
        return np.array(values).reshape((n_snp, 1))

    chisq = as_column(sumstats.Z**2)
    if chisq_max is not None:
        keep = np.ravel(chisq < chisq_max)
        n_kept = np.sum(keep)
        sumstats = sumstats.iloc[keep, :]
        log.log('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
                C=chisq_max, N=n_kept, M=n_snp-n_kept))
        n_snp = n_kept
        ref_ld = np.array(sumstats[ref_ld_cnames])
        chisq = chisq[keep].reshape((n_snp, 1))

    if args.two_step is not None:
        log.log('Using two-step estimator with cutoff at {M}.'.format(M=args.two_step))

    weights = as_column(sumstats[w_ld_cname])
    sample_sizes = as_column(sumstats.N)
    hsqhat = reg.Hsq(chisq, ref_ld, weights, sample_sizes,
                     M_annot, n_blocks=n_blocks, intercept=args.intercept_h2,
                     twostep=args.two_step, old_weights=old_weights)

    if args.print_cov:
        _print_cov(hsqhat, args.out + '.cov', log)
    if args.print_delete_vals:
        _print_delete_values(hsqhat, args.out + '.delete', log)
        _print_part_delete_values(hsqhat, args.out + '.part_delete', log)

    summary_text = hsqhat.summary(
        ref_ld_cnames, P=args.samp_prev, K=args.pop_prev, overlap=args.overlap_annot)
    log.log(summary_text)
    if args.overlap_annot:
        overlap_matrix, M_tot = _read_annot(args, log)
        results_path = args.out+'.results'
        df_results = hsqhat._overlap_output(
            ref_ld_cnames, overlap_matrix, M_annot, M_tot, args.print_coefficients)
        df_results.to_csv(results_path, sep="\t", index=False)
        log.log('Results printed to '+results_path)

    return hsqhat