Python getBlockLefts Examples

Programming Language: Python

Namespace/Package Name: ldscore.ldscore

Method/Function: getBlockLefts

Examples at hotexamples.com: 9

Python getBlockLefts - 9 examples found. These are the top-rated real-world Python examples of ldscore.ldscore.getBlockLefts extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

def test_getBlockLefts():
    """Compare ld.getBlockLefts with the expected result for three fixed cases."""
    # Each case bundles the coordinates, the maximum distance and the
    # value getBlockLefts is expected to return for them.
    cases = (
        {'coords': np.arange(1, 6), 'max_dist': 5,
         'expected': np.zeros(5)},
        {'coords': np.arange(1, 6), 'max_dist': 0,
         'expected': np.arange(0, 5)},
        {'coords': (1, 4, 6, 7, 7, 8), 'max_dist': 2,
         'expected': (0, 1, 1, 2, 2, 2)},
    )
    for case in cases:
        observed = ld.getBlockLefts(case['coords'], case['max_dist'])
        assert np.all(observed == case['expected'])

Example #2

Show file

def test_getBlockLefts():
    """Run ld.getBlockLefts on three fixed cases and assert that all match."""
    # (coordinates, maximum distance, expected return value)
    cases = [
        (np.arange(1, 6), 5, np.zeros(5)),
        (np.arange(1, 6), 0, np.arange(0, 5)),
        ((1, 4, 6, 7, 7, 8), 2, (0, 1, 1, 2, 2, 2)),
    ]
    # Every case is evaluated before the single assertion at the end.
    outcomes = []
    for coor, max_d, ca in cases:
        block_left = ld.getBlockLefts(coor, max_d)
        outcomes.append(np.all(block_left == ca))
    assert np.all(outcomes)

Example #3

Show file

File: ldsc.py Project: josephepowell/ldsc

def ldscore(args):
    '''
    Wrapper function for estimating l1, l1^2, l2 and l4 (+ optionally standard errors) from
    reference panel genotypes.

    Annot format is
    chr snp bp cm <annotations>

    Reads the SNP, individual and genotype files named by args.bin or
    args.bfile, computes the statistic selected by args.l1 / args.l1sq /
    args.l2 / args.l4 (with block jackknife standard errors when args.se is
    set), writes <args.out>.<suffix>.ldscore, gzips it, and writes
    <args.out>.<suffix>.M.

    Raises:
        ValueError: neither --bin nor --bfile given; --annot combined with
            --keep; not exactly one --ld-wind option; a window covering the
            whole chromosome without --yes-really; no statistic selected.
        NotImplementedError: L1 / L1^2 block jackknife, or L4 without the
            jackknife.
    '''
    log = logger(args.out + '.log')

    if args.bin:
        snp_file, snp_obj = args.bin + '.snp', ps.VcfSNPFile
        ind_file, ind_obj = args.bin + '.ind', ps.VcfINDFile
        array_file, array_obj = args.bin + '.bin', ld.VcfBINFile
    elif args.bfile:
        snp_file, snp_obj = args.bfile + '.bim', ps.PlinkBIMFile
        ind_file, ind_obj = args.bfile + '.fam', ps.PlinkFAMFile
        array_file, array_obj = args.bfile + '.bed', ld.PlinkBEDFile
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on snp_obj.
        raise ValueError('Must specify either --bin or --bfile.')

    # read bim/snp
    array_snps = snp_obj(snp_file)
    m = len(array_snps.IDList)
    log.log('Read list of {m} SNPs from {f}'.format(m=m, f=snp_file))

    # read annot
    if args.annot and args.keep:
        raise ValueError('--annot and --keep are currently incompatible.')
    elif args.annot:
        annot = ps.AnnotFile(args.annot)
        # the first four columns are chr / snp / bp / cm (see docstring)
        num_annots, ma = len(annot.df.columns) - 4, len(annot.df)
        log.log("Read {A} annotations for {M} SNPs from {f}".format(
            f=args.annot, A=num_annots, M=ma))
        annot_matrix = np.array(annot.df.iloc[:, 4:])
        annot_colnames = annot.df.columns[4:]
        keep_snps = None
    elif args.keep:
        keep_snps = __filter__(args.keep, 'SNPs', 'include', array_snps)
        annot_matrix, annot_colnames, num_annots = None, None, 1
    else:
        annot_matrix, annot_colnames, keep_snps = None, None, None
        num_annots = 1

    # read fam/ind
    array_indivs = ind_obj(ind_file)
    n = len(array_indivs.IDList)
    log.log('Read list of {n} individuals from {f}'.format(n=n, f=ind_file))
    # read keep_indivs
    if args.extract:
        keep_indivs = __filter__(args.extract, 'individuals', 'include',
                                 array_indivs)
    else:
        keep_indivs = None

    # read genotype array
    log.log('Reading genotypes from {fname}'.format(fname=array_file))
    geno_array = array_obj(array_file,
                           n,
                           array_snps,
                           keep_snps=keep_snps,
                           keep_indivs=keep_indivs,
                           mafMin=args.maf)

    # filter annot_matrix down to only SNPs passing MAF cutoffs
    if annot_matrix is not None:
        annot_keep = geno_array.kept_snps
        annot_matrix = annot_matrix[annot_keep, :]

    # determine block widths
    x = np.array((args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm),
                 dtype=bool)
    if np.sum(x) != 1:
        raise ValueError('Must specify exactly one --ld-wind option')

    if args.ld_wind_snps:
        max_dist = args.ld_wind_snps
        coords = np.array(xrange(geno_array.m))
    elif args.ld_wind_kb:
        max_dist = args.ld_wind_kb * 1000
        coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
    elif args.ld_wind_cm:
        max_dist = args.ld_wind_cm
        coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]

    block_left = ld.getBlockLefts(coords, max_dist)
    # a zero in the last position is treated as a whole-chromosome window
    if block_left[len(block_left) - 1] == 0 and not args.yes_really:
        error_msg = 'Do you really want to compute whole-chomosome LD Score? If so, set the '
        error_msg += '--yes-really flag (warning: it will use a lot of time / memory)'
        raise ValueError(error_msg)

    if args.se:  # block jackknife

        # block size
        if args.block_size:
            jSize = args.block_size
        elif n > 50:
            jSize = 10
        else:
            jSize = 1

        # Force true division: with two ints, Python 2 floors n / jSize
        # before np.ceil sees it, so nothing was ever rounded up.
        jN = int(np.ceil(n / float(jSize)))
        if args.l1:
            raise NotImplementedError(
                'Sorry, havent implemented L1 block jackknife yet.')

        elif args.l1sq:
            raise NotImplementedError(
                'Sorry, havent implemented L1^2 block jackknife yet.')

        elif args.l2:
            col_prefix = "L2"
            file_suffix = "l2.jknife"
            c = "Computing LD Score (L2) and block jackknife standard errors with {n} blocks."

        elif args.l4:
            col_prefix = "L4"
            file_suffix = "l4.jknife"
            c = "Computing L4 and block jackknife standard errors with {n} blocks."

        else:
            # Previously an UnboundLocalError on `c` a few lines below.
            raise ValueError('Must specify one of --l1, --l1sq, --l2 or --l4.')

        # Routed through the logger like every other status message here.
        log.log(c.format(n=jN))
        (lN_est, lN_se) = geno_array.ldScoreBlockJackknife(block_left,
                                                           args.chunk_size,
                                                           jN=jN,
                                                           annot=annot_matrix)
        lN = np.c_[lN_est, lN_se]
        if num_annots == 1:
            ldscore_colnames = [col_prefix, 'SE(' + col_prefix + ')']
        else:
            ldscore_colnames = [a + col_prefix for a in annot_colnames]
            # the right-hand side is built from the estimate names before
            # the list is extended, so each estimate gets one SE column
            ldscore_colnames += ['SE(' + a + ')' for a in ldscore_colnames]

    else:  # not block jackknife
        if args.l1:
            log.log("Estimating L1.")
            lN = geno_array.l1VarBlocks(block_left,
                                        args.chunk_size,
                                        annot=annot_matrix)
            col_prefix = "L1"
            file_suffix = "l1"

        elif args.l1sq:
            log.log("Estimating L1 ^ 2.")
            lN = geno_array.l1sqVarBlocks(block_left,
                                          args.chunk_size,
                                          annot=annot_matrix)
            col_prefix = "L1SQ"
            file_suffix = "l1sq"

        elif args.l2:
            log.log("Estimating LD Score (L2).")
            lN = geno_array.ldScoreVarBlocks(block_left,
                                             args.chunk_size,
                                             annot=annot_matrix)
            col_prefix = "L2"
            file_suffix = "l2"

        elif args.l4:
            # The unreachable l4VarBlocks call that used to follow this
            # raise (and referenced an undefined name) has been removed.
            raise NotImplementedError(
                'Sorry, havent implemented L4 yet. Try the jackknife.')

        else:
            # Previously an UnboundLocalError on col_prefix below.
            raise ValueError('Must specify one of --l1, --l1sq, --l2 or --l4.')

        if num_annots == 1:
            ldscore_colnames = [col_prefix]
        else:
            ldscore_colnames = [a + col_prefix for a in annot_colnames]

    # print .ldscore
    # output columns: CHR, BP, CM, RS, MAF, [LD Scores and optionally SEs]
    out_fname = args.out + '.' + file_suffix + '.ldscore'
    new_colnames = geno_array.colnames + ldscore_colnames
    df = pd.DataFrame(np.c_[geno_array.df, lN])
    df.columns = new_colnames
    log.log("Writing results to {f}.gz".format(f=out_fname))
    df.to_csv(out_fname, sep="\t", header=True, index=False)
    call(['gzip', out_fname])
    # print .M ("with" closes the handle even if a write fails)
    with open(args.out + '.' + file_suffix + '.M', 'wb') as fout_M:
        if num_annots == 1:
            fout_M.write(str(geno_array.m) + '\n')
        else:
            M = np.squeeze(np.sum(annot_matrix, axis=0))
            fout_M.write('\t'.join(map(str, M)) + '\n')

Example #4

Show file

File: ldsc_thin.py Project: jmiao24/GNOVA-2.0

def _ldscore(bfile, annots, gwas_snps):
    '''
    Wrapper function for estimating l1, l1^2, l2 and l4 (+ optionally standard errors) from
    reference panel genotypes.

    Annot format is
    chr snp bp cm <annotations>

    Parameters:
        bfile: prefix of the .bim / .fam / .bed file set.
        annots: None, or a sequence of data frames that is concatenated
            and handed to ps.AnnotFile.
        gwas_snps: passed to __filter_bim__ and subset_annot_file to
            restrict the SNPs that are used.

    Returns:
        A DataFrame whose columns are geno_array.colnames followed by the
        LD Score column(s). The CM and MAF columns are still present.

    Side effects:
        Logs a summary to 'ldsc.log', sets pandas' display.max_rows to 200
        and leaves numpy's divide / invalid error handling set to 'raise'.
    '''
    log = Logger('ldsc.log')

    snp_file, snp_obj = bfile + '.bim', ps.PlinkBIMFile
    ind_file, ind_obj = bfile + '.fam', ps.PlinkFAMFile
    array_file, array_obj = bfile + '.bed', ld.PlinkBEDFile
    # read bim/snp
    array_snps = snp_obj(snp_file)
    # snp list
    m = len(array_snps.IDList)
    log.log('Read list of {m} SNPs from {f}'.format(m=m, f=snp_file))
    if annots is not None:  # read --annot
        try:
            annot = ps.AnnotFile(pd.concat(annots, ignore_index=True))
            # the first four columns are chr / snp / bp / cm (see docstring)
            n_annot, ma = len(annot.df.columns) - 4, len(annot.df)
            log.log("Read {A} annotations for {M} SNPs".format(A=n_annot,
                                                               M=ma))
            annot_colnames = annot.df.columns[4:]
            # take only annot SNPs in intersect
            kept_cols = len(annot.df.columns)
            annot.df = subset_annot_file(annot.df, gwas_snps, kept_cols)
            annot_matrix = np.array(annot.df.iloc[:, 4:])
        except Exception:
            log.log('Error parsing .annot file')
            raise
    else:
        annot_matrix, annot_colnames = None, None
        n_annot = 1

    # keep_snps always comes from the GWAS SNP filter; the earlier
    # per-branch `keep_snps = None` assignments were dead and are gone.
    keep_snps = __filter_bim__(gwas_snps, array_snps, log)

    # read fam
    array_indivs = ind_obj(ind_file)
    n = len(array_indivs.IDList)
    log.log('Read list of {n} individuals from {f}'.format(n=n, f=ind_file))
    # read keep_indivs
    keep_indivs = None

    # read genotype array
    log.log('Reading genotypes from {fname}'.format(fname=array_file))
    geno_array = array_obj(array_file,
                           n,
                           array_snps,
                           log,
                           keep_snps=keep_snps,
                           keep_indivs=keep_indivs,
                           mafMin=None)

    # determine block widths: fixed window of 1 on the CM coordinate
    max_dist = 1
    coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]

    block_left = ld.getBlockLefts(coords, max_dist)

    scale_suffix = ''

    # 50 is the chunk-size argument
    lN = geno_array.ldScoreVarBlocks(block_left, 50, annot=annot_matrix)
    col_prefix = "L2"

    if n_annot == 1:
        ldscore_colnames = [col_prefix + scale_suffix]
    else:
        ldscore_colnames = [
            y + col_prefix + scale_suffix for y in annot_colnames
        ]

    # Assemble the result frame.
    # NOTE(review): the original called df.drop(['CM', 'MAF'], axis=1) here
    # and discarded the result, so nothing was ever dropped. That no-op is
    # removed; the summaries below and the return value still include CM
    # and MAF, exactly as before.
    new_colnames = geno_array.colnames + ldscore_colnames
    df = pd.DataFrame.from_records(np.c_[geno_array.df, lN])
    df.columns = new_colnames

    # print LD Score summary
    # (.iloc replaces .ix, which pandas 1.0 removed; labels on both axes
    # are strings here, so .ix was already selecting by position.)
    pd.set_option('display.max_rows', 200)
    log.log('\nSummary of LD Scores')
    t = df.iloc[:, 4:].describe()
    log.log(t.iloc[1:, :])

    np.seterr(divide='ignore',
              invalid='ignore')  # print NaN instead of weird errors
    # print correlation matrix including all LD Scores and sample MAF
    log.log('')
    log.log('MAF/LD Score Correlation Matrix')
    log.log(df.iloc[:, 4:].corr())

    # print condition number
    if n_annot > 1:  # condition number of a column vector w/ nonzero var is trivially one
        log.log('\nLD Score Matrix Condition Number')
        cond_num = np.linalg.cond(df.iloc[:, 5:])
        log.log(remove_brackets(str(np.matrix(cond_num))))
        if cond_num > 10000:
            log.log('WARNING: ill-conditioned LD Score Matrix!')

    # summarize annot matrix if there is one
    if annot_matrix is not None:
        # covariance matrix
        x = pd.DataFrame(annot_matrix, columns=annot_colnames)
        log.log('\nAnnotation Correlation Matrix')
        log.log(x.corr())

        # column sums
        log.log('\nAnnotation Matrix Column Sums')
        log.log(_remove_dtype(x.sum(axis=0)))

        # row sums
        log.log('\nSummary of Annotation Matrix Row Sums')
        row_sums = x.sum(axis=1).describe()
        log.log(_remove_dtype(row_sums))

    np.seterr(divide='raise', invalid='raise')
    return df

Example #5

Show file

def ldscore(args, log):
    '''
    Wrapper function for estimating l1, l1^2, l2 and l4 (+ optionally standard errors) from
    reference panel genotypes.

    Annot format is
    chr snp bp cm <annotations>

    Reads the PLINK file set named by args.bfile, builds an annotation
    matrix from --annot or --cts-bin (or none), estimates LD Scores and
    writes <args.out>.l2.ldscore (gzipped), <args.out>.l2.M,
    <args.out>.l2.M_5_50 and, for --cts-bin, <args.out>.annot (gzipped).
    A summary of the results is sent to `log`.

    Parameters:
        args: parsed command-line options (attributes read below).
        log: object with a .log(message) method.

    Raises:
        ValueError: missing --bfile, malformed --cts-breaks / --cts-names,
            breaks outside the data range, not exactly one --ld-wind
            option, a whole-chromosome window without --yes-really, a
            multi-column --print-snps file, or no SNPs left after
            --print-snps.

    Side effects:
        Sets pandas' display.max_rows to 200 and leaves numpy's divide /
        invalid error handling set to 'raise'.
    '''

    if args.bfile:
        snp_file, snp_obj = args.bfile + '.bim', ps.PlinkBIMFile
        ind_file, ind_obj = args.bfile + '.fam', ps.PlinkFAMFile
        array_file, array_obj = args.bfile + '.bed', ld.PlinkBEDFile
    else:
        # Previously this fell through to an UnboundLocalError on snp_obj.
        raise ValueError('Must specify --bfile.')

    # read bim/snp
    array_snps = snp_obj(snp_file)
    m = len(array_snps.IDList)
    log.log('Read list of {m} SNPs from {f}'.format(m=m, f=snp_file))
    if args.annot is not None:  # read --annot
        try:
            if args.thin_annot:  # annot file has only annotations
                annot = ps.ThinAnnotFile(args.annot)
                n_annot, ma = len(annot.df.columns), len(annot.df)
                log.log("Read {A} annotations for {M} SNPs from {f}".format(
                    f=args.annot, A=n_annot, M=ma))
                annot_matrix = annot.df.values
                annot_colnames = annot.df.columns
                keep_snps = None
            else:
                annot = ps.AnnotFile(args.annot)
                # the first four columns are chr / snp / bp / cm
                n_annot, ma = len(annot.df.columns) - 4, len(annot.df)
                log.log("Read {A} annotations for {M} SNPs from {f}".format(
                    f=args.annot, A=n_annot, M=ma))
                annot_matrix = np.array(annot.df.iloc[:, 4:])
                annot_colnames = annot.df.columns[4:]
                keep_snps = None
                if np.any(annot.df.SNP.values != array_snps.df.SNP.values):
                    raise ValueError(
                        'The .annot file must contain the same SNPs in the same'
                        ' order as the .bim file.')
        except Exception:
            log.log('Error parsing .annot file')
            raise

    elif args.extract is not None:  # --extract
        keep_snps = __filter__(args.extract, 'SNPs', 'include', array_snps)
        annot_matrix, annot_colnames, n_annot = None, None, 1

    elif args.cts_bin is not None and args.cts_breaks is not None:  # --cts-bin
        cts_fnames = sumstats._splitp(args.cts_bin)  # read filenames
        args.cts_breaks = args.cts_breaks.replace(
            'N', '-')  # replace N with negative sign
        try:  # split on x
            breaks = [[float(x) for x in y.split(',')]
                      for y in args.cts_breaks.split('x')]
        except ValueError as e:
            raise ValueError(
                '--cts-breaks must be a comma-separated list of numbers: ' +
                str(e.args))

        if len(breaks) != len(cts_fnames):
            raise ValueError(
                'Need to specify one set of breaks for each file in --cts-bin.'
            )

        if args.cts_names:
            cts_colnames = [str(x) for x in args.cts_names.split(',')]
            if len(cts_colnames) != len(cts_fnames):
                msg = 'Must specify either no --cts-names or one value for each file in --cts-bin.'
                raise ValueError(msg)

        else:
            cts_colnames = ['ANNOT' + str(i) for i in xrange(len(cts_fnames))]

        log.log('Reading numbers with which to bin SNPs from {F}'.format(
            F=args.cts_bin))

        cts_levs = []
        full_labs = []
        for i, fh in enumerate(cts_fnames):
            vec = ps.read_cts(fh, array_snps.df.SNP.values)

            max_cts = np.max(vec)
            min_cts = np.min(vec)
            cut_breaks = list(breaks[i])
            name_breaks = list(cut_breaks)
            if np.all(cut_breaks >= max_cts) or np.all(cut_breaks <= min_cts):
                raise ValueError(
                    'All breaks lie outside the range of the cts variable.')

            # add an outer bin edge on whichever side the breaks do not
            # already cover; the +/- 1 keeps the extreme value in a bin
            if np.all(cut_breaks <= max_cts):
                name_breaks.append(max_cts)
                cut_breaks.append(max_cts + 1)

            if np.all(cut_breaks >= min_cts):
                name_breaks.append(min_cts)
                cut_breaks.append(min_cts - 1)

            name_breaks.sort()
            cut_breaks.sort()
            n_breaks = len(cut_breaks)
            # so that col names are consistent across chromosomes with different max vals
            name_breaks[0] = 'min'
            name_breaks[-1] = 'max'
            name_breaks = [str(x) for x in name_breaks]
            # Uses j, not i: a Python 2 list comprehension leaks its
            # variable and would overwrite the enclosing loop index.
            labs = [
                name_breaks[j] + '_' + name_breaks[j + 1]
                for j in xrange(n_breaks - 1)
            ]
            cut_vec = pd.Series(pd.cut(vec, bins=cut_breaks, labels=labs))
            cts_levs.append(cut_vec)
            full_labs.append(labs)

        annot_matrix = pd.concat(cts_levs, axis=1)
        annot_matrix.columns = cts_colnames
        # crosstab -- for now we keep empty columns
        annot_matrix = pd.crosstab(
            annot_matrix.index,
            [annot_matrix[i] for i in annot_matrix.columns],
            dropna=False,
            colnames=annot_matrix.columns)

        # add missing columns
        if len(cts_colnames) > 1:
            for x in product(*full_labs):
                if x not in annot_matrix.columns:
                    annot_matrix[x] = 0
        else:
            for x in full_labs[0]:
                if x not in annot_matrix.columns:
                    annot_matrix[x] = 0

        annot_matrix = annot_matrix[sorted(annot_matrix.columns,
                                           key=annot_sort_key)]
        if len(cts_colnames) > 1:
            # flatten multi-index
            annot_colnames = [
                '_'.join([cts_colnames[i] + '_' + b for i, b in enumerate(c)])
                for c in annot_matrix.columns
            ]
        else:
            annot_colnames = [
                cts_colnames[0] + '_' + b for b in annot_matrix.columns
            ]

        annot_matrix = np.matrix(annot_matrix)
        keep_snps = None
        n_annot = len(annot_colnames)
        if np.any(np.sum(annot_matrix, axis=1) == 0):
            # This exception should never be raised. For debugging only.
            raise ValueError(
                'Some SNPs have no annotation in --cts-bin. This is a bug!')

    else:
        annot_matrix, annot_colnames, keep_snps = None, None, None
        n_annot = 1

    # read fam
    array_indivs = ind_obj(ind_file)
    n = len(array_indivs.IDList)
    log.log('Read list of {n} individuals from {f}'.format(n=n, f=ind_file))
    # read keep_indivs
    if args.keep:
        keep_indivs = __filter__(args.keep, 'individuals', 'include',
                                 array_indivs)
    else:
        keep_indivs = None

    # read genotype array
    log.log('Reading genotypes from {fname}'.format(fname=array_file))
    geno_array = array_obj(array_file,
                           n,
                           array_snps,
                           keep_snps=keep_snps,
                           keep_indivs=keep_indivs,
                           mafMin=args.maf)

    # filter annot_matrix down to only SNPs passing MAF cutoffs
    if annot_matrix is not None:
        annot_keep = geno_array.kept_snps
        annot_matrix = annot_matrix[annot_keep, :]

    # determine block widths
    x = np.array((args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm),
                 dtype=bool)
    if np.sum(x) != 1:
        raise ValueError('Must specify exactly one --ld-wind option')

    if args.ld_wind_snps:
        max_dist = args.ld_wind_snps
        coords = np.array(xrange(geno_array.m))
    elif args.ld_wind_kb:
        max_dist = args.ld_wind_kb * 1000
        coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
    elif args.ld_wind_cm:
        max_dist = args.ld_wind_cm
        coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]

    block_left = ld.getBlockLefts(coords, max_dist)
    # a zero in the last position is treated as a whole-chromosome window
    if block_left[len(block_left) - 1] == 0 and not args.yes_really:
        error_msg = 'Do you really want to compute whole-chomosome LD Score? If so, set the '
        error_msg += '--yes-really flag (warning: it will use a lot of time / memory)'
        raise ValueError(error_msg)

    scale_suffix = ''
    if args.pq_exp is not None:
        log.log('Computing LD with pq ^ {S}.'.format(S=args.pq_exp))
        # trailing space added: the two halves used to join as "arenot"
        msg = 'Note that LD Scores with pq raised to a nonzero power are '
        msg += 'not directly comparable to normal LD Scores.'
        log.log(msg)
        scale_suffix = '_S{S}'.format(S=args.pq_exp)
        pq = np.matrix(geno_array.maf * (1 - geno_array.maf)).reshape(
            (geno_array.m, 1))
        pq = np.power(pq, args.pq_exp)

        if annot_matrix is not None:
            annot_matrix = np.multiply(annot_matrix, pq)
        else:
            annot_matrix = pq

    log.log("Estimating LD Score.")
    lN = geno_array.ldScoreVarBlocks(block_left,
                                     args.chunk_size,
                                     annot=annot_matrix)
    col_prefix = "L2"
    file_suffix = "l2"

    if n_annot == 1:
        ldscore_colnames = [col_prefix + scale_suffix]
    else:
        ldscore_colnames = [
            y + col_prefix + scale_suffix for y in annot_colnames
        ]

    # print .ldscore. Output columns: CHR, BP, RS, [LD Scores]
    out_fname = args.out + '.' + file_suffix + '.ldscore'
    new_colnames = geno_array.colnames + ldscore_colnames
    df = pd.DataFrame.from_records(np.c_[geno_array.df, lN])
    df.columns = new_colnames
    if args.print_snps:
        if args.print_snps.endswith('gz'):
            print_snps = pd.read_csv(args.print_snps,
                                     header=None,
                                     compression='gzip')
        elif args.print_snps.endswith('bz2'):
            print_snps = pd.read_csv(args.print_snps,
                                     header=None,
                                     compression='bz2')
        else:
            print_snps = pd.read_csv(args.print_snps, header=None)
        if len(print_snps.columns) > 1:
            raise ValueError(
                '--print-snps must refer to a file with a one column of SNP IDs.'
            )
        log.log(
            'Reading list of {N} SNPs for which to print LD Scores from {F}'.format(
                F=args.print_snps, N=len(print_snps)))

        print_snps.columns = ['SNP']
        # boolean-mask row selection (.loc replaces .ix, removed in pandas 1.0)
        df = df.loc[df.SNP.isin(print_snps.SNP), :]
        if len(df) == 0:
            raise ValueError(
                'After merging with --print-snps, no SNPs remain.')
        else:
            msg = 'After merging with --print-snps, LD Scores for {N} SNPs will be printed.'
            log.log(msg.format(N=len(df)))

    l2_suffix = '.gz'
    log.log("Writing LD Scores for {N} SNPs to {f}.gz".format(f=out_fname,
                                                              N=len(df)))
    df.drop(['CM', 'MAF'], axis=1).to_csv(out_fname,
                                          sep="\t",
                                          header=True,
                                          index=False,
                                          float_format='%.3f')
    call(['gzip', '-f', out_fname])
    if annot_matrix is not None:
        M = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix, axis=0))))
        ii = geno_array.maf > 0.05
        M_5_50 = np.atleast_1d(
            np.squeeze(np.asarray(np.sum(annot_matrix[ii, :], axis=0))))
    else:
        M = [geno_array.m]
        M_5_50 = [np.sum(geno_array.maf > 0.05)]

    # print .M ("with" closes the handle even if the write fails)
    with open(args.out + '.' + file_suffix + '.M', 'wb') as fout_M:
        fout_M.write('\t'.join(map(str, M)) + '\n')

    # print .M_5_50
    with open(args.out + '.' + file_suffix + '.M_5_50', 'wb') as fout_M_5_50:
        fout_M_5_50.write('\t'.join(map(str, M_5_50)) + '\n')

    # print annot matrix
    if (args.cts_bin is not None) and not args.no_print_annot:
        out_fname_annot = args.out + '.annot'
        new_colnames = geno_array.colnames + ldscore_colnames
        annot_df = pd.DataFrame(np.c_[geno_array.df, annot_matrix])
        annot_df.columns = new_colnames
        del annot_df['MAF']
        # the message used to name the .ldscore file instead of the
        # .annot file that is actually written here
        log.log("Writing annot matrix produced by --cts-bin to {F}".format(
            F=out_fname_annot + '.gz'))
        annot_df.to_csv(out_fname_annot, sep="\t", header=True, index=False)
        call(['gzip', '-f', out_fname_annot])

    # print LD Score summary
    # (.iloc replaces .ix; labels on both axes are strings here, so .ix
    # was already selecting by position.)
    pd.set_option('display.max_rows', 200)
    log.log('\nSummary of LD Scores in {F}'.format(F=out_fname + l2_suffix))
    t = df.iloc[:, 4:].describe()
    log.log(t.iloc[1:, :])

    np.seterr(divide='ignore',
              invalid='ignore')  # print NaN instead of weird errors
    # print correlation matrix including all LD Scores and sample MAF
    log.log('')
    log.log('MAF/LD Score Correlation Matrix')
    log.log(df.iloc[:, 4:].corr())

    # print condition number
    if n_annot > 1:  # condition number of a column vector w/ nonzero var is trivially one
        log.log('\nLD Score Matrix Condition Number')
        cond_num = np.linalg.cond(df.iloc[:, 5:])
        log.log(reg.remove_brackets(str(np.matrix(cond_num))))
        if cond_num > 10000:
            log.log('WARNING: ill-conditioned LD Score Matrix!')

    # summarize annot matrix if there is one
    if annot_matrix is not None:
        # covariance matrix
        x = pd.DataFrame(annot_matrix, columns=annot_colnames)
        log.log('\nAnnotation Correlation Matrix')
        log.log(x.corr())

        # column sums
        log.log('\nAnnotation Matrix Column Sums')
        log.log(_remove_dtype(x.sum(axis=0)))

        # row sums
        log.log('\nSummary of Annotation Matrix Row Sums')
        row_sums = x.sum(axis=1).describe()
        log.log(_remove_dtype(row_sums))

    np.seterr(divide='raise', invalid='raise')

Example #6

Show file

def test_getBlockLefts(coords, max_dist, correct):
    """Assert that ld.getBlockLefts(coords, max_dist) equals `correct` element-wise."""
    # NOTE(review): the arguments presumably come from a pytest
    # parametrization that is not visible here.
    block_left = ld.getBlockLefts(coords, max_dist)
    matches = block_left == correct
    assert np.all(matches)

Example #7

Show file

File: test_ldscore.py Project: josephepowell/ldsc

def test_getBlockLefts(coords, max_dist, correct):
	"""Assert that ld.getBlockLefts(coords, max_dist) equals `correct` element-wise."""
	# NOTE(review): the arguments presumably come from a pytest
	# parametrization that is not visible here.
	block_left = ld.getBlockLefts(coords, max_dist)
	matches = block_left == correct
	assert np.all(matches)

Example #8

Show file

File: ldsc.py Project: josephepowell/ldsc

def ldscore(args):
	'''
	Wrapper function for estimating l1, l1^2, l2 and l4 (+ optionally standard errors) from
	reference panel genotypes.

	Annot format is
	chr snp bp cm <annotations>

	Reads the SNP, individual and genotype files named by args.bin or
	args.bfile, computes the statistic selected by args.l1 / args.l1sq /
	args.l2 / args.l4 (with block jackknife standard errors when args.se is
	set), writes <args.out>.<suffix>.ldscore, gzips it, and writes
	<args.out>.<suffix>.M.

	Raises:
		ValueError: neither --bin nor --bfile given; --annot combined with
			--keep; not exactly one --ld-wind option; a window covering the
			whole chromosome without --yes-really; no statistic selected.
		NotImplementedError: L1 / L1^2 block jackknife, or L4 without the
			jackknife.
	'''
	log = logger(args.out+'.log')

	if args.bin:
		snp_file, snp_obj = args.bin+'.snp', ps.VcfSNPFile
		ind_file, ind_obj = args.bin+'.ind', ps.VcfINDFile
		array_file, array_obj = args.bin+'.bin', ld.VcfBINFile
	elif args.bfile:
		snp_file, snp_obj = args.bfile+'.bim', ps.PlinkBIMFile
		ind_file, ind_obj = args.bfile+'.fam', ps.PlinkFAMFile
		array_file, array_obj = args.bfile+'.bed', ld.PlinkBEDFile
	else:
		# Previously this fell through and failed later with an
		# UnboundLocalError on snp_obj.
		raise ValueError('Must specify either --bin or --bfile.')

	# read bim/snp
	array_snps = snp_obj(snp_file)
	m = len(array_snps.IDList)
	log.log('Read list of {m} SNPs from {f}'.format(m=m, f=snp_file))

	# read annot
	if args.annot and args.keep:
		raise ValueError('--annot and --keep are currently incompatible.')
	elif args.annot:
		annot = ps.AnnotFile(args.annot)
		# the first four columns are chr / snp / bp / cm (see docstring)
		num_annots, ma = len(annot.df.columns) - 4, len(annot.df)
		log.log("Read {A} annotations for {M} SNPs from {f}".format(f=args.annot, A=num_annots,
			M=ma))
		annot_matrix = np.array(annot.df.iloc[:,4:])
		annot_colnames = annot.df.columns[4:]
		keep_snps = None
	elif args.keep:
		keep_snps = __filter__(args.keep, 'SNPs', 'include', array_snps)
		annot_matrix, annot_colnames, num_annots = None, None, 1
	else:
		annot_matrix, annot_colnames, keep_snps = None, None, None
		num_annots = 1

	# read fam/ind
	array_indivs = ind_obj(ind_file)
	n = len(array_indivs.IDList)
	log.log('Read list of {n} individuals from {f}'.format(n=n, f=ind_file))
	# read keep_indivs
	if args.extract:
		keep_indivs = __filter__(args.extract, 'individuals', 'include', array_indivs)
	else:
		keep_indivs = None

	# read genotype array
	log.log('Reading genotypes from {fname}'.format(fname=array_file))
	geno_array = array_obj(array_file, n, array_snps, keep_snps=keep_snps,
		keep_indivs=keep_indivs, mafMin=args.maf)

	# filter annot_matrix down to only SNPs passing MAF cutoffs
	if annot_matrix is not None:
		annot_keep = geno_array.kept_snps
		annot_matrix = annot_matrix[annot_keep,:]

	# determine block widths
	x = np.array((args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm), dtype=bool)
	if np.sum(x) != 1:
		raise ValueError('Must specify exactly one --ld-wind option')

	if args.ld_wind_snps:
		max_dist = args.ld_wind_snps
		coords = np.array(xrange(geno_array.m))
	elif args.ld_wind_kb:
		max_dist = args.ld_wind_kb*1000
		coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
	elif args.ld_wind_cm:
		max_dist = args.ld_wind_cm
		coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]

	block_left = ld.getBlockLefts(coords, max_dist)
	# a zero in the last position is treated as a whole-chromosome window
	if block_left[len(block_left)-1] == 0 and not args.yes_really:
		error_msg = 'Do you really want to compute whole-chomosome LD Score? If so, set the '
		error_msg += '--yes-really flag (warning: it will use a lot of time / memory)'
		raise ValueError(error_msg)

	if args.se: # block jackknife

		# block size
		if args.block_size:
			jSize = args.block_size
		elif n > 50:
			jSize = 10
		else:
			jSize = 1

		# Force true division: with two ints, Python 2 floors n / jSize
		# before np.ceil sees it, so nothing was ever rounded up.
		jN = int(np.ceil(n / float(jSize)))
		if args.l1:
			raise NotImplementedError('Sorry, havent implemented L1 block jackknife yet.')

		elif args.l1sq:
			raise NotImplementedError('Sorry, havent implemented L1^2 block jackknife yet.')

		elif args.l2:
			col_prefix = "L2"
			file_suffix = "l2.jknife"
			c = "Computing LD Score (L2) and block jackknife standard errors with {n} blocks."

		elif args.l4:
			col_prefix = "L4"
			file_suffix = "l4.jknife"
			c = "Computing L4 and block jackknife standard errors with {n} blocks."

		else:
			# Previously an UnboundLocalError on `c` a few lines below.
			raise ValueError('Must specify one of --l1, --l1sq, --l2 or --l4.')

		# Routed through the logger like every other status message here.
		log.log(c.format(n=jN))
		(lN_est, lN_se) = geno_array.ldScoreBlockJackknife(block_left, args.chunk_size, jN=jN,
			annot=annot_matrix)
		lN = np.c_[lN_est, lN_se]
		if num_annots == 1:
			ldscore_colnames = [col_prefix, 'SE('+col_prefix+')']
		else:
			ldscore_colnames = [a+col_prefix for a in annot_colnames]
			# the right-hand side is built from the estimate names before
			# the list is extended, so each estimate gets one SE column
			ldscore_colnames += ['SE('+a+')' for a in ldscore_colnames]

	else: # not block jackknife
		if args.l1:
			log.log("Estimating L1.")
			lN = geno_array.l1VarBlocks(block_left, args.chunk_size, annot=annot_matrix)
			col_prefix = "L1"
			file_suffix = "l1"

		elif args.l1sq:
			log.log("Estimating L1 ^ 2.")
			lN = geno_array.l1sqVarBlocks(block_left, args.chunk_size, annot=annot_matrix)
			col_prefix = "L1SQ"
			file_suffix = "l1sq"

		elif args.l2:
			log.log("Estimating LD Score (L2).")
			lN = geno_array.ldScoreVarBlocks(block_left, args.chunk_size, annot=annot_matrix)
			col_prefix = "L2"
			file_suffix = "l2"

		elif args.l4:
			# The unreachable l4VarBlocks call that used to follow this
			# raise (and referenced an undefined name) has been removed.
			raise NotImplementedError('Sorry, havent implemented L4 yet. Try the jackknife.')

		else:
			# Previously an UnboundLocalError on col_prefix below.
			raise ValueError('Must specify one of --l1, --l1sq, --l2 or --l4.')

		if num_annots == 1:
			ldscore_colnames = [col_prefix]
		else:
			ldscore_colnames = [a+col_prefix for a in annot_colnames]

	# print .ldscore
	# output columns: CHR, BP, CM, RS, MAF, [LD Scores and optionally SEs]
	out_fname = args.out + '.' + file_suffix + '.ldscore'
	new_colnames = geno_array.colnames + ldscore_colnames
	df = pd.DataFrame(np.c_[geno_array.df, lN])
	df.columns = new_colnames
	log.log("Writing results to {f}.gz".format(f=out_fname))
	df.to_csv(out_fname, sep="\t", header=True, index=False)
	call(['gzip',out_fname])
	# print .M ("with" closes the handle even if a write fails)
	with open(args.out + '.'+ file_suffix +'.M','wb') as fout_M:
		if num_annots == 1:
			fout_M.write(str(geno_array.m) + '\n')
		else:
			M = np.squeeze(np.sum(annot_matrix, axis=0))
			fout_M.write('\t'.join(map(str,M)) + '\n')

Example #9

Show file

File: ldsc.py Project: Chris1221/ldsc

def ldscore(args, log):
    '''
    Estimate LD Scores (L2) from PLINK-format reference panel genotypes and write
    the results to disk.

    Annot format is
    chr snp bp cm <annotations>

    Parameters
    ----------
    args : object with option attributes. This function reads: bfile, annot,
        extract, cts_bin, cts_breaks, cts_names, keep, maf, ld_wind_snps,
        ld_wind_kb, ld_wind_cm, yes_really, pq_exp, chunk_size, out, print_snps
        and no_print_annot.
    log : object exposing a log(msg) method; progress and summaries go there.

    Output files (prefix is args.out)
    ---------------------------------
    <out>.l2.ldscore.gz : LD Scores (CM and MAF columns are dropped).
    <out>.l2.M          : annotation column sums (or the number of SNPs).
    <out>.l2.M_5_50     : same as .M, restricted to SNPs with MAF > 0.05.
    <out>.annot.gz      : annotation matrix, only when --cts-bin is used and
                          args.no_print_annot is false.

    Raises
    ------
    ValueError : on a missing --bfile, inconsistent --annot / --cts-bin input,
        anything other than exactly one --ld-wind option, a whole-chromosome
        window without --yes-really, or an empty --print-snps intersection.

    Side effects: np.seterr is set to 'ignore' for the summary section and to
    'raise' (divide / invalid) before returning.
    '''

    if args.bfile:
        snp_file, snp_obj = args.bfile+'.bim', ps.PlinkBIMFile
        ind_file, ind_obj = args.bfile+'.fam', ps.PlinkFAMFile
        array_file, array_obj = args.bfile+'.bed', ld.PlinkBEDFile
    else:
        # Without --bfile none of the reader names used below would be bound;
        # fail with a clear message instead of a NameError.
        raise ValueError('Must specify --bfile.')

    # read bim/snp
    array_snps = snp_obj(snp_file)
    m = len(array_snps.IDList)
    log.log('Read list of {m} SNPs from {f}'.format(m=m, f=snp_file))
    if args.annot is not None:  # read --annot
        try:
            annot = ps.AnnotFile(args.annot)
            # the first four columns are chr/snp/bp/cm; the rest are annotations
            n_annot, ma = len(annot.df.columns) - 4, len(annot.df)
            log.log("Read {A} annotations for {M} SNPs from {f}".format(f=args.annot,
                A=n_annot, M=ma))
            annot_matrix = np.array(annot.df.iloc[:,4:])
            annot_colnames = annot.df.columns[4:]
            keep_snps = None
            if np.any(annot.df.SNP.values != array_snps.df.SNP.values):
                raise ValueError('The .annot file must contain the same SNPs in the same'+\
                    ' order as the .bim file.')
        except Exception:
            # log for context, then let the original exception propagate
            log.log('Error parsing .annot file')
            raise

    elif args.extract is not None:  # --extract
        keep_snps = __filter__(args.extract, 'SNPs', 'include', array_snps)
        annot_matrix, annot_colnames, n_annot = None, None, 1

    elif args.cts_bin is not None and args.cts_breaks is not None:  # --cts-bin
        cts_fnames = sumstats._splitp(args.cts_bin)  # read filenames
        args.cts_breaks = args.cts_breaks.replace('N','-')  # replace N with negative sign
        try:  # split on x
            breaks = [[float(x) for x in y.split(',')] for y in args.cts_breaks.split('x')]
        except ValueError as e:
            raise ValueError('--cts-breaks must be a comma-separated list of numbers: '
                +str(e.args))

        if len(breaks) != len(cts_fnames):
            raise ValueError('Need to specify one set of breaks for each file in --cts-bin.')

        if args.cts_names:
            cts_colnames = [str(x) for x in args.cts_names.split(',')]
            if len(cts_colnames) != len(cts_fnames):
                msg = 'Must specify either no --cts-names or one value for each file in --cts-bin.'
                raise ValueError(msg)

        else:
            cts_colnames = ['ANNOT'+str(i) for i in xrange(len(cts_fnames))]

        log.log('Reading numbers with which to bin SNPs from {F}'.format(F=args.cts_bin))

        cts_levs = []
        full_labs = []
        for i, cts_fname in enumerate(cts_fnames):
            vec = ps.read_cts(cts_fname, array_snps.df.SNP.values)

            max_cts = np.max(vec)
            min_cts = np.min(vec)
            # cut_breaks are the bin edges handed to pd.cut; name_breaks are the
            # values used to build the bin labels
            cut_breaks = list(breaks[i])
            name_breaks = list(cut_breaks)
            if np.all(cut_breaks >= max_cts) or np.all(cut_breaks <= min_cts):
                raise ValueError('All breaks lie outside the range of the cts variable.')

            # extend the edges by one unit past the observed max / min so the
            # extreme values fall inside a bin
            if np.all(cut_breaks <= max_cts):
                name_breaks.append(max_cts)
                cut_breaks.append(max_cts+1)

            if np.all(cut_breaks >= min_cts):
                name_breaks.append(min_cts)
                cut_breaks.append(min_cts-1)

            name_breaks.sort()
            cut_breaks.sort()
            n_breaks = len(cut_breaks)
            # so that col names are consistent across chromosomes with different max vals
            name_breaks[0] = 'min'
            name_breaks[-1] = 'max'
            name_breaks = [str(x) for x in name_breaks]
            # separate index name so the enclosing loop variable is not shadowed
            labs = [name_breaks[j]+'_'+name_breaks[j+1] for j in xrange(n_breaks-1)]
            cut_vec = pd.Series(pd.cut(vec, bins=cut_breaks, labels=labs))
            cts_levs.append(cut_vec)
            full_labs.append(labs)

        annot_matrix = pd.concat(cts_levs, axis=1)
        annot_matrix.columns = cts_colnames
        # crosstab -- for now we keep empty columns
        annot_matrix = pd.crosstab(annot_matrix.index,
            [annot_matrix[i] for i in annot_matrix.columns], dropna=False,
            colnames=annot_matrix.columns)

        # add missing columns
        if len(cts_colnames) > 1:
            for x in product(*full_labs):
                if x not in annot_matrix.columns:
                    annot_matrix[x] = 0
        else:
            for x in full_labs[0]:
                if x not in annot_matrix.columns:
                    annot_matrix[x] = 0

        annot_matrix = annot_matrix[sorted(annot_matrix.columns, key=annot_sort_key)]
        if len(cts_colnames) > 1:
            # flatten multi-index
            annot_colnames = ['_'.join([cts_colnames[i]+'_'+b for i,b in enumerate(c)])
                for c in annot_matrix.columns]
        else:
            annot_colnames = [cts_colnames[0]+'_'+b for b in annot_matrix.columns]

        annot_matrix = np.matrix(annot_matrix)
        keep_snps = None
        n_annot = len(annot_colnames)
        if np.any(np.sum(annot_matrix, axis=1) == 0):
            # This exception should never be raised. For debugging only.
            raise ValueError('Some SNPs have no annotation in --cts-bin. This is a bug!')

    else:
        annot_matrix, annot_colnames, keep_snps = None, None, None
        n_annot = 1

    # read fam
    array_indivs = ind_obj(ind_file)
    n = len(array_indivs.IDList)
    log.log('Read list of {n} individuals from {f}'.format(n=n, f=ind_file))
    # read keep_indivs
    if args.keep:
        keep_indivs = __filter__(args.keep, 'individuals', 'include', array_indivs)
    else:
        keep_indivs = None

    # read genotype array
    log.log('Reading genotypes from {fname}'.format(fname=array_file))
    geno_array = array_obj(array_file, n, array_snps, keep_snps=keep_snps,
        keep_indivs=keep_indivs, mafMin=args.maf)

    # filter annot_matrix down to only SNPs passing MAF cutoffs
    if annot_matrix is not None:
        annot_keep = geno_array.kept_snps
        annot_matrix = annot_matrix[annot_keep,:]

    # determine block widths
    x = np.array((args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm), dtype=bool)
    if np.sum(x) != 1:
        raise ValueError('Must specify exactly one --ld-wind option')

    if args.ld_wind_snps:
        max_dist = args.ld_wind_snps
        coords = np.array(xrange(geno_array.m))
    elif args.ld_wind_kb:
        max_dist = args.ld_wind_kb*1000  # kb -> bp
        coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
    elif args.ld_wind_cm:
        max_dist = args.ld_wind_cm
        coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]

    block_left = ld.getBlockLefts(coords, max_dist)
    # a zero in the last entry is treated as "the window spans everything"
    if block_left[-1] == 0 and not args.yes_really:
        error_msg = 'Do you really want to compute whole-chomosome LD Score? If so, set the '
        error_msg += '--yes-really flag (warning: it will use a lot of time / memory)'
        raise ValueError(error_msg)

    scale_suffix = ''
    if args.pq_exp is not None:
        log.log('Computing LD with pq ^ {S}.'.format(S=args.pq_exp))
        # trailing space added: the two halves previously joined as "arenot"
        msg = 'Note that LD Scores with pq raised to a nonzero power are '
        msg += 'not directly comparable to normal LD Scores.'
        log.log(msg)
        scale_suffix = '_S{S}'.format(S=args.pq_exp)
        # per-SNP weight (maf * (1 - maf)) ** pq_exp as an (m, 1) column
        pq = np.matrix(geno_array.maf*(1-geno_array.maf)).reshape((geno_array.m, 1))
        pq = np.power(pq, args.pq_exp)

        if annot_matrix is not None:
            annot_matrix = np.multiply(annot_matrix, pq)
        else:
            annot_matrix = pq

    log.log("Estimating LD Score.")
    lN = geno_array.ldScoreVarBlocks(block_left, args.chunk_size, annot=annot_matrix)
    col_prefix = "L2"; file_suffix = "l2"

    if n_annot == 1:
        ldscore_colnames = [col_prefix+scale_suffix]
    else:
        ldscore_colnames =  [y+col_prefix+scale_suffix for y in annot_colnames]

    # print .ldscore. Output columns: CHR, BP, RS, [LD Scores]
    out_fname = args.out + '.' + file_suffix + '.ldscore'
    new_colnames = geno_array.colnames + ldscore_colnames
    df = pd.DataFrame.from_records(np.c_[geno_array.df, lN])
    df.columns = new_colnames
    if args.print_snps:
        if args.print_snps.endswith('gz'):
            print_snps = pd.read_csv(args.print_snps, header=None, compression='gzip')
        elif args.print_snps.endswith('bz2'):
            print_snps = pd.read_csv(args.print_snps, header=None, compression='bz2')
        else:
            print_snps = pd.read_csv(args.print_snps, header=None)
        if len(print_snps.columns) > 1:
            raise ValueError('--print-snps must refer to a file with a one column of SNP IDs.')
        log.log('Reading list of {N} SNPs for which to print LD Scores from {F}'.format(\
                        F=args.print_snps, N=len(print_snps)))

        print_snps.columns=['SNP']
        # boolean-mask row selection (.loc replaces the removed .ix indexer)
        df = df.loc[df.SNP.isin(print_snps.SNP),:]
        if len(df) == 0:
            raise ValueError('After merging with --print-snps, no SNPs remain.')
        else:
            msg = 'After merging with --print-snps, LD Scores for {N} SNPs will be printed.'
            log.log(msg.format(N=len(df)))

    l2_suffix = '.gz'
    log.log("Writing LD Scores for {N} SNPs to {f}.gz".format(f=out_fname, N=len(df)))
    df.drop(['CM','MAF'], axis=1).to_csv(out_fname, sep="\t", header=True, index=False,
        float_format='%.3f')
    call(['gzip', '-f', out_fname])
    if annot_matrix is not None:
        M = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix, axis=0))))
        ii = geno_array.maf > 0.05
        M_5_50 = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix[ii,:], axis=0))))
    else:
        M = [geno_array.m]
        M_5_50 = [np.sum(geno_array.maf > 0.05)]

    # print .M (context manager closes the file even if the write fails)
    with open(args.out + '.'+ file_suffix +'.M','wb') as fout_M:
        print >>fout_M, '\t'.join(map(str,M))

    # print .M_5_50
    with open(args.out + '.'+ file_suffix +'.M_5_50','wb') as fout_M_5_50:
        print >>fout_M_5_50, '\t'.join(map(str,M_5_50))

    # print annot matrix
    if (args.cts_bin is not None) and not args.no_print_annot:
        out_fname_annot = args.out + '.annot'
        # NOTE(review): the annotation columns are labelled with ldscore_colnames
        # (which carry the L2 / scale suffix) -- confirm this is intended.
        new_colnames = geno_array.colnames + ldscore_colnames
        annot_df = pd.DataFrame(np.c_[geno_array.df, annot_matrix])
        annot_df.columns = new_colnames
        del annot_df['MAF']
        # report the .annot path actually written (was the .ldscore path)
        log.log("Writing annot matrix produced by --cts-bin to {F}".format(
            F=out_fname_annot+'.gz'))
        annot_df.to_csv(out_fname_annot, sep="\t", header=True, index=False)
        call(['gzip', '-f', out_fname_annot])

    # print LD Score summary
    pd.set_option('display.max_rows', 200)
    log.log('\nSummary of LD Scores in {F}'.format(F=out_fname+l2_suffix))
    # columns from position 4 onward; the first describe() row is skipped
    t = df.iloc[:,4:].describe()
    log.log( t.iloc[1:,:] )

    np.seterr(divide='ignore', invalid='ignore')  # print NaN instead of weird errors
    # print correlation matrix including all LD Scores and sample MAF
    log.log('')
    log.log('MAF/LD Score Correlation Matrix')
    log.log( df.iloc[:,4:].corr() )

    # print condition number
    if n_annot > 1: # condition number of a column vector w/ nonzero var is trivially one
        log.log('\nLD Score Matrix Condition Number')
        cond_num = np.linalg.cond(df.iloc[:,5:])
        log.log( reg.remove_brackets(str(np.matrix(cond_num))) )
        if cond_num > 10000:
            log.log('WARNING: ill-conditioned LD Score Matrix!')

    # summarize annot matrix if there is one
    if annot_matrix is not None:
        # correlation matrix
        x = pd.DataFrame(annot_matrix, columns=annot_colnames)
        log.log('\nAnnotation Correlation Matrix')
        log.log( x.corr() )

        # column sums
        log.log('\nAnnotation Matrix Column Sums')
        log.log(_remove_dtype(x.sum(axis=0)))

        # row sums
        log.log('\nSummary of Annotation Matrix Row Sums')
        row_sums = x.sum(axis=1).describe()
        log.log(_remove_dtype(row_sums))

    # restore strict floating-point error handling for whatever runs next
    np.seterr(divide='raise', invalid='raise')