Exemple #1
0
def get_bim_metadata(bim_file):
    '''Return metadata gathered in a single pass over a BIM file.

    bim_file is an iterable of tab-delimited lines (e.g. an open file object). For every row,
    column 0 is parsed with int() as the chromosome number, column 1 is taken as the SNP name and
    column 3 is parsed with int() as the base-pair position. Rows are assumed to be sorted by
    chromosome, then by SNP base-pair location.

    Returns a tuple (lengths, snp_names):
    (a) lengths: a dictionary of chromosome-to-bp-length. Note: the relevant length
        is measured between the first and last SNPs, and may be shorter than the total
        chromosome length. Only chromosomes that appear in CHROMOSOMES are reported.
    (b) snp_names: a util.mdict keyed by chromosome, to which every SNP name is assigned
        (presumably accumulating a list of SNP names per chromosome -- confirm against util.mdict).

    A file without any SNP rows yields an empty dictionary and an empty mdict. Blank rows are
    ignored.'''
    snp_names = util.mdict()
    chr_endpoints = {}
    # start = (chromosome, bp) of the first SNP on the current chromosome;
    # prev = (chromosome, bp) of the most recently read SNP
    prev = start = None
    for line in csv.reader(bim_file, delimiter='\t', skipinitialspace=True):
        if not line:
            # csv.reader yields an empty list for a blank line (e.g. a trailing newline)
            continue
        chrom = int(line[0])
        curr = (chrom, int(line[3]))
        snp_names[chrom] = line[1]
        if start is None:
            start = curr
        elif chrom != prev[0]:
            # Chromosome changed: close the previous chromosome's span, open a new one
            chr_endpoints[start[0]] = prev[1] - start[1]
            start = curr
        prev = curr

    if start is None:
        # No SNP rows at all: nothing to measure
        return {}, snp_names
    chr_endpoints[start[0]] = prev[1] - start[1]  # Last chromosome
    return dict((k, chr_endpoints[k]) for k in CHROMOSOMES
                if k in chr_endpoints), snp_names
Exemple #2
0
def get_bim_metadata(bim_file):
    '''Scan a BIM file once and return per-chromosome metadata.

    bim_file is an iterable of tab-delimited lines (e.g. an open file object). In each row,
    column 0 is converted with int() to the chromosome number, column 1 is used as the SNP name
    and column 3 is converted with int() to the base-pair position. The rows are assumed to be
    sorted by chromosome, then by SNP base-pair location.

    Returns a tuple (lengths, snp_names):
    (a) lengths: a dictionary of chromosome-to-bp-length. Note: the relevant length
        is measured between the first and last SNPs, and may be shorter than the total
        chromosome length. Chromosomes not present in CHROMOSOMES are left out.
    (b) snp_names: a util.mdict keyed by chromosome, to which every SNP name is assigned
        (presumably a chromosome-to-list-of-snp-names mapping -- confirm against util.mdict).

    If the file holds no SNP rows, an empty dictionary and an empty mdict are returned. Blank
    rows are skipped.'''
    snp_names = util.mdict()
    chr_endpoints = {}
    # start: (chromosome, bp) of the first SNP seen on the current chromosome
    # prev:  (chromosome, bp) of the SNP read in the previous iteration
    prev = start = None
    for line in csv.reader(bim_file, delimiter='\t', skipinitialspace=True):
        if not line:
            # Blank lines come back from csv.reader as empty lists; they carry no SNP
            continue
        chrom = int(line[0])
        curr = (chrom, int(line[3]))
        snp_names[chrom] = line[1]
        if start is None:
            start = curr
        elif chrom != prev[0]:
            # A new chromosome begins: record the span of the one that just ended
            chr_endpoints[start[0]] = prev[1] - start[1]
            start = curr
        prev = curr

    if start is None:
        # Empty input: there is no last chromosome to close
        return {}, snp_names
    chr_endpoints[start[0]] = prev[1] - start[1]  # Last chromosome
    return dict((k, chr_endpoints[k]) for k in CHROMOSOMES if k in chr_endpoints), snp_names
Exemple #3
0
    def sub_problem(self, samples, snps=None):
        '''Return a sub-problem that contains a subset 'samples' of the genotyped nodes.

        samples - list or numpy array of sample indices; indices below
                  self.pedigree.num_genotyped are treated as genotyped.
        snps    - optional SNP indices to restrict the data to. If None, all SNPs are kept.

        The returned Problem is built from copies of the restricted genotype, haplotype, QC and
        error arrays, the sub-pedigree of 'samples', and those IBD segments whose samples all
        belong to 'samples' (re-indexed through the sub-pedigree's sample_index).'''
        if isinstance(samples, list): samples = np.array(samples)
        # Create pedigree object
        p = self.sub_pedigree(samples)

        # Re-order samples so that all genotyped appear before all non-genotyped
        genotyped = np.where(samples < self.pedigree.num_genotyped)[0]
        num_genotyped = len(genotyped)
        samples = samples[np.concatenate((genotyped, np.where(samples >= self.pedigree.num_genotyped)[0]))]

        # Create deep copies of relevant parts of data arrays
        g, h = self.data
        g_snp, h_snp, qc = self.genotype.snp, self.haplotype.snp, self.haplotype.qc
        frames = self.frames
        if snps is not None:
            g, h = g[snps, :, :], h[snps, :, :]
            # Leave an empty qc array untouched (never replace it with None): it is still
            # queried through qc.size below
            if qc.size: qc = qc[snps, :, :]
            # Each SNP metadata array is restricted from its own source
            g_snp, h_snp = g_snp[snps], h_snp[snps]
            # Restrict frames to snps, convert to new SNP indices
            orig_snp = dict((v, k) for k, v in enumerate(snps))
            restricted_frames = util.mdict()
            # NOTE(review): assumes self.frames offers iteritems() yielding (key, list-of-frames)
            # pairs, like the util.mdict built here -- confirm against the frames type
            if frames is not None:
                for k, v in frames.iteritems():
                    for frame in v:
                        # Materialize as a list so the frame can be traversed more than once
                        restricted_frames[k] = [orig_snp[x] for x in frame if x in orig_snp]
            frames = restricted_frames

        genotyped = p.sample_index[0:num_genotyped]
        g, h = g[:, genotyped, :].copy(), h[:, genotyped, :].copy()
        if qc.size: qc = qc[:, genotyped, ].copy()
        g_snp, h_snp = g_snp.copy(), h_snp.copy()

        # Build sub-problem object graph
        g = im.factory.GenotypeFactory.new_instance('genotype', g, g_snp)
        h = im.factory.GenotypeFactory.new_instance('haplotype', h, h_snp)
        h.qc = qc
        # Build restricted info object
        error = self.error[:, genotyped].copy() if self.error.size else self.error
        if snps is not None and error.size: error = error[snps, :]
        sample_set = set(samples)
        # Maps an original sample ID to its position in the sub-pedigree's sample_index
        sample_index_map = util.dict_invert(dict(enumerate(p.sample_index)))
        # Keep only IBD segments whose samples are all inside the requested subset
        ibd = im.segment.SegmentSet(
            im.segment.Segment(x.snp, [(sample_index_map[y[0]], y[1]) for y in x.samples],
                               x.bp, error_snps=x.error_snps)
            for x in self.info.ibd if sample_set >= set(y[0] for y in x.samples))
        info = ProblemInfo(p, g, snp=(self.info.snp[snps] if snps is not None else self.info.snp), ibd=ibd)
        return Problem(p, g, haplotype=h, info=info, error=error, frames=frames, lam=self.lam)
Exemple #4
0
    def sub_problem_of_snps(self, snps=None):
        '''Return a sub-problem that contains a subset of the SNPs. Assumes at least two SNPs.

        snps - optional SNP indices to restrict the data to. If None, all SNPs are kept and the
               result is built from copies of the full data arrays.

        The pedigree is shared with this problem; genotype, haplotype, QC, error and the
        haplotype's poo_phase/hap_type arrays (when set) are copied.'''
        # Create deep copies of relevant parts of data arrays
        g, h = self.data
        g_snp, h_snp, qc = self.genotype.snp, self.haplotype.snp, self.haplotype.qc
        frames = self.frames
        if snps is not None:
            g, h = g[snps, :, :], h[snps, :, :]
            if qc.size: qc = qc[snps, :, :]
            # Each SNP metadata array is restricted from its own source
            g_snp, h_snp = g_snp[snps], h_snp[snps]
            # Restrict frames to snps, convert to new SNP indices
            orig_snp = dict((v, k) for k, v in enumerate(snps))
            restricted_frames = util.mdict()
            # NOTE(review): assumes self.frames offers iteritems() yielding (key, list-of-frames)
            # pairs, like the util.mdict built here -- confirm against the frames type
            if frames is not None:
                for k, v in frames.iteritems():
                    for frame in v:
                        # Materialize as a list so the frame can be traversed more than once
                        restricted_frames[k] = [orig_snp[x] for x in frame if x in orig_snp]
            frames = restricted_frames

        g, h = g.copy(), h.copy()
        if qc.size:
            qc = qc.copy()
        g_snp, h_snp = g_snp.copy(), h_snp.copy()

        # Build sub-problem object graph
        g = im.factory.GenotypeFactory.new_instance('genotype', g, g_snp)
        h = im.factory.GenotypeFactory.new_instance('haplotype', h, h_snp)
        h.qc = qc
        if self.haplotype.poo_phase is not None:
            h.poo_phase = self.haplotype.poo_phase.copy()
        if self.haplotype.hap_type is not None:
            hap_type = self.haplotype.hap_type
            # Indexing a numpy array with None would insert a new axis rather than select
            # everything, so only index when a SNP subset was actually requested
            if snps is not None: hap_type = hap_type[snps]
            h.hap_type = hap_type.copy()

        # Build restricted info object
        error = self.error.copy() if self.error.size else self.error
        # Same None-index guard as for hap_type above
        if snps is not None and error.size: error = error[snps, :]
        ibd = im.segment.SegmentSet(
            im.segment.Segment(x.snp, [(self.sample_index[y[0]], y[1]) for y in x.samples],
                               x.bp, error_snps=x.error_snps)
            for x in self.info.ibd)
        info = ProblemInfo(self.pedigree, g, snp=(self.info.snp[snps] if snps is not None else self.info.snp), ibd=ibd)
        return Problem(self.pedigree, g, haplotype=h, info=info, error=error, frames=frames, lam=self.lam)
Exemple #5
0
 def __init__(self, items):
     '''Populate the internal frame container from an iterable of (key, value) pairs.

     Every pair is assigned, in iteration order, into a fresh util.mdict that is stored in
     self._frames (presumably util.mdict collects several values per key -- confirm).'''
     frame_store = util.mdict()
     for key, value in items:
         frame_store[key] = value
     self._frames = frame_store
Exemple #6
0
        # Initialize
        daos = db_gene.snp.snp_db_dao.Daos(url=options.db_url)
        util.mkdir_if_not_exists(os.path.dirname(options.out_base_name))
        
        # Set genetic distance column in BIM file (read locations from snp db) and save a new copy of it
        snp_data = np.genfromtxt(input_file,
                                 dtype=[
                                        ('chrom', np.uint8),  # Chromosome # containing the SNP
                                        ('name', np.chararray),  # SNP name (e.g., 'rs...')
                                        ('dist_cm', np.float),  # Genetic position [CENTI-Morgans!!]
                                        ('base_pair', np.uint),  # Base pair position on chromosome
                                        ('allele1', np.chararray),
                                        ('allele2', np.chararray)
                                        ])
        snp_names = snp_data['name']
        a = dict((x.name, x) for x in daos.snp_dao.get_snps_iter(snp_names))
        # Note: our genetic distance unit is cM 
        snp_data['dist_cm'] = map(lambda x: x if x else 0.0, ((a[x].genetic_pos if a.has_key(x) else None) for x in snp_names))
        np.savetxt(options.out_base_name + '.bim.new', snp_data, fmt='%d\t%s\t%f\t%d\t%s\t%s')

        # For each chromosome in the PLINK file: load LD data, generate frame numbers, save them to a file
        (c, s), frames = bu.get_bim_metadata(open(input_file, 'rb')), util.mdict()
        for chrom in s.iterkeys():
            for x in db_gene.snp.ld_graph.frames(chrom, s[chrom], daos.snp_dao, daos.ld_dao):
                frames[chrom] = x
        with open(options.out_base_name + '.frm', 'wb') as frm_file:
            db_gene.snp.ld_graph.write_frames(frames, frm_file)
    except:
        traceback.print_exc(file=sys.stdout)
        sys.exit(util.EXIT_FAILURE)
Exemple #7
0
 def __init__(self, items):
     '''Build self._frames from 'items', an iterable of (key, value) pairs.

     The pairs are assigned one by one, in iteration order, into a new util.mdict
     (presumably a multi-valued dictionary -- confirm against util.mdict).'''
     collected = util.mdict()
     for key, value in items:
         collected[key] = value
     self._frames = collected
Exemple #8
0
            dtype=[
                ('chrom', np.uint8),  # Chromosome # containing the SNP
                ('name', np.chararray),  # SNP name (e.g., 'rs...')
                ('dist_cm', np.float),  # Genetic position [CENTI-Morgans!!]
                ('base_pair', np.uint),  # Base pair position on chromosome
                ('allele1', np.chararray),
                ('allele2', np.chararray)
            ])
        snp_names = snp_data['name']
        a = dict((x.name, x) for x in daos.snp_dao.get_snps_iter(snp_names))
        # Note: our genetic distance unit is cM
        snp_data['dist_cm'] = map(lambda x: x if x else 0.0,
                                  ((a[x].genetic_pos if a.has_key(x) else None)
                                   for x in snp_names))
        np.savetxt(options.out_base_name + '.bim.new',
                   snp_data,
                   fmt='%d\t%s\t%f\t%d\t%s\t%s')

        # For each chromosome in the PLINK file: load LD data, generate frame numbers, save them to a file
        (c, s), frames = bu.get_bim_metadata(open(input_file,
                                                  'rb')), util.mdict()
        for chrom in s.iterkeys():
            for x in db_gene.snp.ld_graph.frames(chrom, s[chrom], daos.snp_dao,
                                                 daos.ld_dao):
                frames[chrom] = x
        with open(options.out_base_name + '.frm', 'wb') as frm_file:
            db_gene.snp.ld_graph.write_frames(frames, frm_file)
    except:
        traceback.print_exc(file=sys.stdout)
        sys.exit(util.EXIT_FAILURE)