Esempio n. 1
0
# Enlarge a named genomic subset with the SNPs that get_high_ld_snps reports
# for it, then save the enlarged subset as a BED file.
d = Dataset(args.dataset)
A = SnpSubset(d, GenomicSubset(args.subset).bedtool)

# Optionally load precomputed per-range matrices; when given they are used
# below instead of a covariance computed from the genotypes.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# come from a trusted source.
R = None
if args.path_to_R is not None:
    # Pickles are binary data: open with 'rb' (text mode fails on Python 3)
    # and let the context manager close the handle.
    with open(args.path_to_R, 'rb') as r_file:
        R = pickle.load(r_file)

newA = IntRangeSet()
for r in A.expanded_by(0.003).irs.ranges():
    # S holds the subset's SNPs inside this range, as offsets from r[0] so
    # that they can index into the per-range matrix `cov`.
    S = IntRangeSet([a-r[0] for a in A.irs & IntRangeSet(r)])
    print(r, 'analyzing', len(S), 'snps')
    if R is None:
        X = d.get_standardized_genotypes(r)
        cov = X.T.dot(X) / d.N
    else:
        cov = R.ranges_to_arrays[r]
    # Keep adding SNPs until get_high_ld_snps has nothing new to report.
    while True:
        new = get_high_ld_snps(S, cov)
        if len(new) == 0:
            break
        print('\tadding', len(new), 'snps')
        print('\t\tbefore', S)
        S += new
        print('\t\tafter', S)
    # Shift the offsets back to dataset-wide SNP indices before merging.
    newA += IntRangeSet([s+r[0] for s in S])

b = BedTool([interval_from_range(r) for r in newA.ranges()])
print(b)
b.saveas(paths.genome_subsets + args.subset + '.R2ge{:0.2}.bed'.format(args.R2_threshold))
Esempio n. 2
0
class SnpSubset(object):
    """A subset of a dataset's SNPs, held in ``self.irs`` as a set of SNP indices.

    The subset can be built from a bedtool (SNPs whose intersect value is
    non-zero), from an existing index set, or left empty.
    """

    def __init__(self, dataset, bedtool=None, irs=None):
        """Create the subset.

        Args:
            dataset: the dataset the SNP indices refer to.
            bedtool: optional bedtool; when given, the subset is every SNP of
                ``dataset.snp_coords()`` with a non-zero intersect value.
            irs: optional ready-made index set, used only when ``bedtool`` is
                not given. It is stored as-is (not copied).
        """
        self.dataset = dataset
        # Explicit None checks: relying on truthiness would treat an empty
        # `irs` as "not given", and evaluating a BedTool's truthiness may
        # force a count of its intervals (TODO confirm for streamed bedtools).
        if bedtool is not None:
            # use bedtools to create an indicator vector for the snps membership in the subset
            # NOTE(review): the per-SNP value is read from each interval's
            # `name` field -- presumably where `c=True` puts the overlap
            # count for the layout produced by snp_coords(); confirm.
            indicator = dataset.snp_coords().intersect(bedtool, c=True)
            self.irs = IntRangeSet(np.flatnonzero(
                np.array([int(snp.name) for snp in indicator])))
        elif irs is not None:
            self.irs = irs
        else:
            self.irs = IntRangeSet()

    def num_snps(self):
        """Return the number of SNPs in the subset."""
        return len(self.irs)

    def expand_by(self, expansion_in_each_direction, units='Morgans'):
        """Replace ``self.irs`` with the union of buffers around each of its ranges.

        Each range is passed to ``dataset.buffer_around_slice`` together with
        ``expansion_in_each_direction`` and ``units``. ``self.irs`` is rebound
        to a new set rather than modified in place.
        """
        result = IntRangeSet()
        for r in self.irs.ranges():
            result += self.dataset.buffer_around_slice(
                r, expansion_in_each_direction, units=units)
        self.irs = result

    def expanded_by(self, expansion_in_each_direction, units='Morgans'):
        """Return an expanded shallow copy, leaving this subset unchanged.

        The shallow copy is safe because ``expand_by`` rebinds ``irs`` on the
        copy instead of mutating the shared set.
        """
        result = copy.copy(self)
        result.expand_by(expansion_in_each_direction, units=units)
        return result

    # prints subsets in the appropriate format for ldsc
    # all subsets must have the same dataset
    @classmethod
    def print_subsets(cls, outfilename, snpsubsets, names, add_other=False):
        """Write a gzipped, tab-separated annotation table for the subsets.

        The table has the columns CHR, BP, SNP, CM (read from the dataset's
        ``.bim`` file) followed by one 0/1 membership column per subset.

        Args:
            outfilename: output path; must end with ``.gz``.
            snpsubsets: the subsets to write; all must share one dataset.
            names: column names, paired positionally with ``snpsubsets``.
            add_other: when true, append an ``OTHER`` column covering every
                SNP of the dataset not in any given subset.

        Returns:
            None. On invalid input a message is printed and nothing is
            written. The caller's ``snpsubsets`` and ``names`` are not
            modified.
        """
        def snp_info_df(d):
            bfile = d.genotypes_bedfile.filename
            # sep=r'\s+' is the supported spelling of the deprecated
            # delim_whitespace=True.
            return pd.read_csv(bfile + '.bim',
                               sep=r'\s+',
                               usecols=[0, 1, 2, 3],
                               names=['CHR', 'SNP', 'CM', 'BP'])

        # Work on copies so that adding the 'OTHER' entry below does not leak
        # into the lists owned by the caller.
        snpsubsets = list(snpsubsets)
        names = list(names)

        if not snpsubsets:
            print('error: no subsets given')
            return
        # check that all snpsubsets have the same data set
        if len({ss.dataset for ss in snpsubsets}) > 1:
            print('error: all subsets must have the same underlying dataset')
            return
        if not outfilename.endswith('.gz'):
            print('outfilename must end with ".gz". I only write zipped files')
            return

        # get snp info for this dataset
        d = snpsubsets[0].dataset
        df = snp_info_df(d)

        # add the 'other' annotation if necessary
        if add_other:
            union = IntRangeSet()
            for ss in snpsubsets:
                union.update(ss.irs)
            snpsubsets.append(SnpSubset(d, irs=d.all_snps() - union))
            names.append('OTHER')

        # create the pandas dataframe and output it
        for name, ss in zip(names, snpsubsets):
            df[name] = 0
            # read_csv gives a default 0..n-1 index, so label-based .loc
            # selects rows by SNP index (DataFrame.ix no longer exists).
            df.loc[list(ss.irs), name] = 1
        df = df[['CHR', 'BP', 'SNP', 'CM'] + names]
        with gzip.open(outfilename, 'wt') as write_file:
            df.to_csv(write_file, index=False, sep='\t')