# Grow a genomic SNP subset by repeatedly adding the SNPs that
# get_high_ld_snps reports for it, then save the enlarged subset as a BED file.
d = Dataset(args.dataset)
A = SnpSubset(d, GenomicSubset(args.subset).bedtool)

# Optionally load a precomputed object exposing `ranges_to_arrays`, used below
# in place of covariances computed from the genotypes.
# NOTE(review): unpickling runs arbitrary code; only pass trusted files in
# args.path_to_R.
if args.path_to_R is not None:
    # Pickle data is binary: open with 'rb' (text mode fails on Python 3) and
    # use a context manager so the handle is closed promptly.
    with open(args.path_to_R, 'rb') as pickle_file:
        R = pickle.load(pickle_file)
else:
    R = None

newA = IntRangeSet()
# Process each contiguous range of the subset after expanding it by 0.003
# (in expanded_by's default units).
for r in A.expanded_by(0.003).irs.ranges():
    # SNPs of the original subset inside this range, re-indexed relative to
    # the start of the range (r[0]).
    S = IntRangeSet([a - r[0] for a in A.irs & IntRangeSet(r)])
    print(r, 'analyzing', len(S), 'snps')

    if R is None:
        X = d.get_standardized_genotypes(r)
        # presumably the SNP-by-SNP covariance, assuming X has one row per
        # individual and d.N is the sample size -- TODO confirm
        cov = X.T.dot(X) / d.N
    else:
        cov = R.ranges_to_arrays[r]

    # Keep adding SNPs until get_high_ld_snps returns nothing new.
    while True:
        new = get_high_ld_snps(S, cov)
        if len(new) == 0:
            break
        print('\tadding', len(new), 'snps')
        print('\t\tbefore', S)
        S += new
        print('\t\tafter', S)

    # Shift the range-relative indices back to dataset-wide indices.
    newA += IntRangeSet([s + r[0] for s in S])

b = BedTool([interval_from_range(r) for r in newA.ranges()])
print(b)
b.saveas(paths.genome_subsets + args.subset +
         '.R2ge{:0.2}.bed'.format(args.R2_threshold))
class SnpSubset(object):
    """A subset of a dataset's SNPs, stored as a set of SNP indices.

    Attributes:
        dataset: the dataset object the SNP indices refer to.
        irs: an IntRangeSet holding the indices of the SNPs in the subset.
    """

    def __init__(self, dataset, bedtool=None, irs=None):
        """Build the subset from a bedtool, from an existing index set, or empty.

        Args:
            dataset: dataset the subset belongs to; must provide
                ``snp_coords()`` when ``bedtool`` is given.
            bedtool: if truthy, the subset is the set of SNPs whose
                coordinates intersect it.
            irs: if ``bedtool`` is falsy and this is truthy, it is used
                directly (not copied) as the index set.
        """
        self.dataset = dataset
        if bedtool:
            # use bedtools to create an indicator vector for the snps
            # membership in the subset: c=True asks intersect for a per-SNP
            # overlap count.
            # NOTE(review): reading the count from `snp.name` presumably
            # relies on snp_coords() being a 3-column BED so the count lands
            # in the name field -- confirm.
            indicator = dataset.snp_coords().intersect(bedtool, c=True)
            self.irs = IntRangeSet(np.flatnonzero(
                np.array([int(snp.name) for snp in indicator])))
        elif irs:
            self.irs = irs
        else:
            self.irs = IntRangeSet()

    def num_snps(self):
        """Return the number of SNP indices in the subset."""
        return len(self.irs)

    def expand_by(self, expansion_in_each_direction, units='Morgans'):
        """Expand the subset in place.

        Each contiguous range of ``self.irs`` is replaced by whatever
        ``self.dataset.buffer_around_slice`` returns for it, and the union of
        those results becomes the new ``self.irs``.

        Args:
            expansion_in_each_direction: amount passed through to
                ``buffer_around_slice``.
            units: units passed through to ``buffer_around_slice``.
        """
        result = IntRangeSet()
        for r in self.irs.ranges():
            result += self.dataset.buffer_around_slice(
                r, expansion_in_each_direction, units=units)
        # Rebind rather than mutate, so shallow copies sharing the old
        # IntRangeSet (see expanded_by) are unaffected.
        self.irs = result

    def expanded_by(self, expansion_in_each_direction, units='Morgans'):
        """Return an expanded shallow copy, leaving this subset unchanged."""
        result = copy.copy(self)
        result.expand_by(expansion_in_each_direction, units=units)
        return result

    # prints subsets in the appropriate format for ldsc
    # all subsets must have the same dataset
    @classmethod
    def print_subsets(cls, outfilename, snpsubsets, names, add_other=False):
        """Write a gzipped, tab-separated 0/1 annotation table for the subsets.

        The table has the columns CHR, BP, SNP, CM (read from the dataset's
        ``.bim`` file) followed by one indicator column per subset.

        Args:
            outfilename: output path; must end with ``.gz``.
            snpsubsets: subsets to write; all must share one dataset.
            names: column name for each subset, in the same order.
            add_other: if true, append an ``OTHER`` column marking the SNPs
                of ``d.all_snps()`` that are in none of the given subsets.

        Returns:
            None. If the subsets do not share a dataset, or ``outfilename``
            does not end with ``.gz``, a message is printed and nothing is
            written.
        """
        def snp_info_df(d):
            bfile = d.genotypes_bedfile.filename
            # sep=r'\s+' is the supported spelling of the deprecated
            # delim_whitespace=True.
            return pd.read_csv(bfile + '.bim',
                               sep=r'\s+',
                               usecols=[0, 1, 2, 3],
                               names=['CHR', 'SNP', 'CM', 'BP'])

        # Work on copies so that appending the 'OTHER' subset below does not
        # modify the caller's lists (which would duplicate 'OTHER' if the same
        # lists were passed in again).
        snpsubsets = list(snpsubsets)
        names = list(names)

        # check that all snpsubsets have the same data set
        if len({ss.dataset for ss in snpsubsets}) > 1:
            print('error: all subsets must have the same underlying dataset')
            return
        if not outfilename.endswith('.gz'):
            print('outfilename must end with ".gz". I only write zipped files')
            return

        # get snp info for this dataset
        d = snpsubsets[0].dataset
        df = snp_info_df(d)

        # add the 'other' annotation if necessary
        if add_other:
            union = IntRangeSet()
            for ss in snpsubsets:
                union.update(ss.irs)
            snpsubsets.append(SnpSubset(d, irs=d.all_snps() - union))
            names.append('OTHER')

        # create the pandas dataframe and output it
        for name, ss in zip(names, snpsubsets):
            df[name] = 0
            # The frame keeps read_csv's default integer index, so SNP
            # indices are valid row labels; .loc replaces the removed .ix.
            df.loc[list(ss.irs), name] = 1
        df = df[['CHR', 'BP', 'SNP', 'CM'] + names]
        with gzip.open(outfilename, 'wt') as write_file:
            df.to_csv(write_file, index=False, sep='\t')