def get_bim_metadata(bim_file):
    '''Return per-chromosome metadata read from a BIM file.

    bim_file is an open file-like object (or any iterable of lines) holding
    tab-delimited BIM records. Column 0 is read as the integer chromosome
    number, column 1 as the SNP name and column 3 as the integer base-pair
    position. Records are expected to be sorted by chromosome, then by
    base-pair position.

    Returns a tuple (lengths, snp_names):
      (a) lengths - a dictionary of chromosome-to-bp-length. Note: the
          relevant length is measured between the first and last SNPs of the
          chromosome, and may be shorter than the total chromosome length.
          Only chromosomes listed in the module-level CHROMOSOMES are kept.
      (b) snp_names - a util.mdict keyed by chromosome, filled with one
          assignment per SNP name (presumably a multi-valued dictionary that
          accumulates a list of names per chromosome - confirm against
          util.mdict).

    An input without records yields an empty dictionary and an empty mdict
    instead of raising. Blank lines are skipped.
    '''
    snp_names = util.mdict()
    chr_endpoints = {}
    # start = (chrom, bp) of the first SNP of the chromosome being scanned;
    # prev = (chrom, bp) of the most recently read SNP.
    prev = start = None
    for line in csv.reader(bim_file, delimiter='\t', skipinitialspace=True):
        if not line:
            # csv.reader yields [] for a blank line; there is nothing to parse
            continue
        chrom = int(line[0])
        curr = (chrom, int(line[3]))
        snp_names[chrom] = line[1]
        if start is None:
            start = curr
        elif chrom != prev[0]:
            # Chromosome changed: close the previous chromosome's span
            chr_endpoints[start[0]] = prev[1] - start[1]
            start = curr
        prev = curr
    if start is not None:
        # Close the last chromosome (skipped when the file had no records)
        chr_endpoints[start[0]] = prev[1] - start[1]
    lengths = dict((k, chr_endpoints[k]) for k in CHROMOSOMES if k in chr_endpoints)
    return lengths, snp_names
def sub_problem(self, samples, snps=None):
    '''Return a sub-problem that contains a subset 'samples' of the nodes,
    optionally restricted to the SNP subset 'snps' as well.

    samples - list or numpy array of sample indices. Indices smaller than
        self.pedigree.num_genotyped are treated as genotyped samples.
    snps - optional index (array or slice) into the SNP axis. If None, all
        SNPs are kept and self.frames is passed through unchanged.

    Returns a new Problem built from a sub-pedigree, copies of the
    genotype/haplotype data, the restricted error array, the IBD segments
    whose samples all lie within 'samples', and the restricted frames.
    '''
    if isinstance(samples, list):
        samples = np.array(samples)

    # Create pedigree object
    p = self.sub_pedigree(samples)

    # Re-order samples so that all genotyped appear before all non-genotyped
    total_genotyped = self.pedigree.num_genotyped
    genotyped = np.where(samples < total_genotyped)[0]
    num_genotyped = len(genotyped)
    samples = samples[np.concatenate((genotyped, np.where(samples >= total_genotyped)[0]))]

    # Gather the relevant parts of the data arrays (first axis = SNPs)
    g, h = self.data
    g_snp, h_snp, qc = self.genotype.snp, self.haplotype.snp, self.haplotype.qc
    frames = self.frames
    if snps is not None:
        g, h = g[snps, :, :], h[snps, :, :]
        # Keep qc as an (empty) array when it has no data, so that the
        # qc.size test below keeps working
        if qc.size:
            qc = qc[snps, :, :]
        # Each SNP metadata array is restricted from its own source
        g_snp, h_snp = g_snp[snps], h_snp[snps]

        # Restrict frames to snps, convert to new SNP indices
        new_index = dict((v, k) for k, v in enumerate(snps))
        source_frames = self.frames
        frames = util.mdict()
        if source_frames is not None:
            for k, v in source_frames.iteritems():
                for frame in v:
                    # Stored as a list so that the frame can be traversed
                    # more than once
                    frames[k] = [new_index[x] for x in frame if x in new_index]

    # Restrict the sample axis (second axis) to the genotyped sub-pedigree
    # samples and take deep copies
    genotyped = p.sample_index[0:num_genotyped]
    g, h = g[:, genotyped, :].copy(), h[:, genotyped, :].copy()
    if qc.size:
        qc = qc[:, genotyped].copy()
    g_snp, h_snp = g_snp.copy(), h_snp.copy()

    # Build sub-problem object graph
    g = im.factory.GenotypeFactory.new_instance('genotype', g, g_snp)
    h = im.factory.GenotypeFactory.new_instance('haplotype', h, h_snp)
    h.qc = qc

    # Build restricted info object
    error = self.error[:, genotyped].copy() if self.error.size else self.error
    if snps is not None and error.size:
        error = error[snps, :]
    sample_set = set(samples)
    sample_index_map = util.dict_invert(dict(enumerate(p.sample_index)))
    # Keep only the IBD segments all of whose samples are in the sub-problem,
    # translating their sample IDs through sample_index_map
    ibd = im.segment.SegmentSet(
        im.segment.Segment(x.snp,
                           [(sample_index_map[y[0]], y[1]) for y in x.samples],
                           x.bp, error_snps=x.error_snps)
        for x in self.info.ibd
        if sample_set >= set([y[0] for y in x.samples]))
    info = ProblemInfo(p, g,
                       snp=(self.info.snp[snps] if snps is not None else self.info.snp),
                       ibd=ibd)
    return Problem(p, g, haplotype=h, info=info, error=error, frames=frames, lam=self.lam)
def sub_problem_of_snps(self, snps=None):
    '''Return a sub-problem that contains a subset of the SNPs. Assumes at
    least two SNPs.

    snps - optional index (array or slice) into the SNP axis (first axis of
        the data arrays). If None, all SNPs are kept: the data arrays are
        copied in full and self.frames is passed through unchanged.

    Returns a new Problem that shares self.pedigree and holds copies of the
    genotype/haplotype data, qc, poo_phase, hap_type and error arrays.
    '''
    # Gather the relevant parts of the data arrays
    g, h = self.data
    g_snp, h_snp, qc = self.genotype.snp, self.haplotype.snp, self.haplotype.qc
    frames = self.frames
    if snps is not None:
        g, h = g[snps, :, :], h[snps, :, :]
        if qc.size:
            qc = qc[snps, :, :]
        # Each SNP metadata array is restricted from its own source
        g_snp, h_snp = g_snp[snps], h_snp[snps]

        # Restrict frames to snps, convert to new SNP indices
        new_index = dict((v, k) for k, v in enumerate(snps))
        source_frames = self.frames
        frames = util.mdict()
        if source_frames is not None:
            for k, v in source_frames.iteritems():
                for frame in v:
                    # Stored as a list so that the frame can be traversed
                    # more than once
                    frames[k] = [new_index[x] for x in frame if x in new_index]

    # Create deep copies
    g, h = g.copy(), h.copy()
    if qc.size:
        qc = qc.copy()
    g_snp, h_snp = g_snp.copy(), h_snp.copy()

    # Build sub-problem object graph
    g = im.factory.GenotypeFactory.new_instance('genotype', g, g_snp)
    h = im.factory.GenotypeFactory.new_instance('haplotype', h, h_snp)
    h.qc = qc
    if self.haplotype.poo_phase is not None:
        h.poo_phase = self.haplotype.poo_phase.copy()
    hap_type = self.haplotype.hap_type
    if hap_type is not None:
        # Indexing a numpy array with None would insert a new axis instead of
        # selecting everything, so only index when a SNP subset was given
        h.hap_type = (hap_type[snps] if snps is not None else hap_type).copy()

    # Build restricted info object
    error = self.error.copy() if self.error.size else self.error
    if snps is not None and error.size:
        error = error[snps, :]
    ibd = im.segment.SegmentSet(
        im.segment.Segment(x.snp,
                           [(self.sample_index[y[0]], y[1]) for y in x.samples],
                           x.bp, error_snps=x.error_snps)
        for x in self.info.ibd)
    info = ProblemInfo(self.pedigree, g,
                       snp=(self.info.snp[snps] if snps is not None else self.info.snp),
                       ibd=ibd)
    return Problem(self.pedigree, g, haplotype=h, info=info, error=error, frames=frames, lam=self.lam)
def __init__(self, items):
    '''Build the frame container from 'items', an iterable of (key, value)
    pairs. Each pair is assigned, in iteration order, into a fresh
    util.mdict that is stored as self._frames.'''
    self._frames = util.mdict()
    store = self._frames
    for pair in items:
        key, value = pair
        store[key] = value
# Initialize daos = db_gene.snp.snp_db_dao.Daos(url=options.db_url) util.mkdir_if_not_exists(os.path.dirname(options.out_base_name)) # Set genetic distance column in BIM file (read locations from snp db) and save a new copy of it snp_data = np.genfromtxt(input_file, dtype=[ ('chrom', np.uint8), # Chromosome # containing the SNP ('name', np.chararray), # SNP name (e.g., 'rs...') ('dist_cm', np.float), # Genetic position [CENTI-Morgans!!] ('base_pair', np.uint), # Base pair position on chromosome ('allele1', np.chararray), ('allele2', np.chararray) ]) snp_names = snp_data['name'] a = dict((x.name, x) for x in daos.snp_dao.get_snps_iter(snp_names)) # Note: our genetic distance unit is cM snp_data['dist_cm'] = map(lambda x: x if x else 0.0, ((a[x].genetic_pos if a.has_key(x) else None) for x in snp_names)) np.savetxt(options.out_base_name + '.bim.new', snp_data, fmt='%d\t%s\t%f\t%d\t%s\t%s') # For each chromosome in the PLINK file: load LD data, generate frame numbers, save them to a file (c, s), frames = bu.get_bim_metadata(open(input_file, 'rb')), util.mdict() for chrom in s.iterkeys(): for x in db_gene.snp.ld_graph.frames(chrom, s[chrom], daos.snp_dao, daos.ld_dao): frames[chrom] = x with open(options.out_base_name + '.frm', 'wb') as frm_file: db_gene.snp.ld_graph.write_frames(frames, frm_file) except: traceback.print_exc(file=sys.stdout) sys.exit(util.EXIT_FAILURE)
dtype=[ ('chrom', np.uint8), # Chromosome # containing the SNP ('name', np.chararray), # SNP name (e.g., 'rs...') ('dist_cm', np.float), # Genetic position [CENTI-Morgans!!] ('base_pair', np.uint), # Base pair position on chromosome ('allele1', np.chararray), ('allele2', np.chararray) ]) snp_names = snp_data['name'] a = dict((x.name, x) for x in daos.snp_dao.get_snps_iter(snp_names)) # Note: our genetic distance unit is cM snp_data['dist_cm'] = map(lambda x: x if x else 0.0, ((a[x].genetic_pos if a.has_key(x) else None) for x in snp_names)) np.savetxt(options.out_base_name + '.bim.new', snp_data, fmt='%d\t%s\t%f\t%d\t%s\t%s') # For each chromosome in the PLINK file: load LD data, generate frame numbers, save them to a file (c, s), frames = bu.get_bim_metadata(open(input_file, 'rb')), util.mdict() for chrom in s.iterkeys(): for x in db_gene.snp.ld_graph.frames(chrom, s[chrom], daos.snp_dao, daos.ld_dao): frames[chrom] = x with open(options.out_base_name + '.frm', 'wb') as frm_file: db_gene.snp.ld_graph.write_frames(frames, frm_file) except: traceback.print_exc(file=sys.stdout) sys.exit(util.EXIT_FAILURE)