Esempio n. 1
0
 def __init__(self, data, snp, sample_id):
     '''
     Build a genotype set from pre-loaded arrays.

     - data: genotype data array. Its first axis length is stored as the number of SNPs and
       its second axis length as the number of samples.
       NOTE(review): an earlier description listed the axes as (individual x SNP x allele);
       confirm the intended axis order against the callers.
     - snp: SNP metadata record array; its 'base_pair' field is read here
     - sample_id: genotyped individuals' ID set
     '''
     # Raw inputs and the sizes derived from the data array
     self.sample_id = sample_id
     self.data = data
     shape = self.data.shape
     self._num_snps = shape[0]
     self._num_samples = shape[1]
     self._snp_range = None

     # SNP metadata record array and its base-pair column
     self.snp = snp
     self._base_pair = self.snp['base_pair']

     # Base-pair-location to SNP-index map, obtained by passing the
     # index -> base-pair mapping to dict_invert
     index_to_bp = dict(enumerate(self._base_pair))
     self._bp_to_snp = dict_invert(index_to_bp)

     # Search structures for fast base-pair queries: a BST filled in the order given by
     # optimal_insertion_order, plus a list index tree over the same positions
     insertion_order = optimal_insertion_order(self._num_snps)
     self._snp_tree = BinarySearchTree(values=self._base_pair[insertion_order])
     self._snp_index_tree = util.list_index_tree(self._base_pair)

     # Genetic map (allele letters corresponding to 1 and 2 for each SNP, in self.snp order);
     # starts empty
     self.map = []
     # General metadata, for easy handling of CGI data; starts empty
     self.metadata = []

     # Per-sample parent-of-origin phase flags, all initialised to zero
     self.poo_phase = np.zeros(shape=(self._num_samples,), dtype=np.byte)
Esempio n. 2
0
def pipeline_monogenic_validation(work_dir=os.environ['OBER_OUT'] + '/requests/monogenic/work',
                                  index_segments_dir=os.environ['OBER_OUT'] + '/requests/monogenic/work/index_segments',
                                  region_size=100,
                                  theta_affinity=0.95,
                                  theta_weight=0.5,
                                  regenerate_segments=True,
                                  snps=None,  # e.g. np.array([6, 8])
                                  debug=1,
                                  debug_sample=512):
    '''
    Run the monogenic validation pipeline: load the PLINK data set, optionally (re)build the
    segment index around each SNP, impute, and write the results.

    The OBER_OUT environment variable must be set: it is read when the defaults are evaluated
    and again at call time to locate the per-chromosome phasing directories.

    - work_dir: directory containing the 'monogenic.12' PLINK data set; the 'imputed.12'
      PLINK output and 'imputed.12.lgen' are written here
    - index_segments_dir: root of the segment index; one 'chr<N>' sub-directory per chromosome.
      Also passed to the imputation call as the segment location
    - region_size: number of SNPs per index region; a SNP index is rounded down to a multiple
      of this value to obtain its region start
    - theta_affinity, theta_weight: forwarded to index_segments_beagle.main
    - regenerate_segments: if true, extract and index segments for each SNP's region before
      imputing
    - snps: optional index into problem.info.snp restricting the SNPs processed; None = all
    - debug, debug_sample: forwarded to the imputation call

    Returns the second element of the pair returned by im.v.iv.impute_problem. The imputed
    genotypes are also written to standard output.
    '''
    # Load SNPs
    problem = im.io.read_plink(prefix=work_dir + '/monogenic.12', pedigree=im.itu.HUTT_PED, haplotype=None, frames=None)
    # Testing: simulate aligned samples output (hap types should be 2 in the imputed genotype output line)
    problem.haplotype.poo_phase = np.zeros((problem.num_samples,), dtype=np.byte)
    problem.haplotype.poo_phase[np.array([0, 1])] = 1
    problem.haplotype.poo_phase[np.array([2, 3])] = -1

    # Create segments only for the regions around each snp
    if regenerate_segments:
        for row in (problem.info.snp[snps] if snps is not None else problem.info.snp):
            # Find SNP's region (the one containing its base-pair position)
            chrom, bp = row['chrom'], row['base_pair']
            phasing_dir = '%s/phasing/chr%d' % (os.environ['OBER_OUT'], chrom)
            index_segments_chrom_dir = '%s/chr%d' % (index_segments_dir, chrom)
            info_file = '%s/hutt.phased.info.npz' % (phasing_dir,)
            info = im.io.read_info_npz(info_file)
            snp_bp = info.snp['base_pair']
            snp_index = util.nearest_neighbor_in_list_tree(bp, snp_bp, util.list_index_tree(snp_bp))
            # Step back to the left neighbour when the nearest SNP lies to the right of bp.
            # NOTE(review): this yields -1 if bp precedes the first SNP - confirm that cannot occur.
            snp_index = snp_index if snp_bp[snp_index] <= bp else snp_index - 1
            # Floor division is required: '/' is true division under Python 3 (and under
            # 'from __future__ import division'), which would leave start ~= snp_index
            # instead of rounding down to the region boundary.
            start = region_size * (snp_index // region_size)
            stop = start + region_size
            segment_file = '%s/segments-%d-%d.out' % (index_segments_chrom_dir, start, stop)
            if not os.path.exists(segment_file):
                util.mkdir_if_not_exists(index_segments_chrom_dir)
                util.run_command('find-segments-of-snp-range %d %d < %s/segments.out > %s' % (start, stop, phasing_dir, segment_file))

            # Index segments. This loop only runs when regenerate_segments is true, so the
            # region is always re-indexed (the former file-existence checks were unreachable).
            index_segments_beagle.main(segment_file, info_file, segment_file, index_segments_chrom_dir,
                                       snp_index=snp_index, debug=2,
                                       theta_affinity=theta_affinity, theta_weight=theta_weight)

    # Impute using the newly generated segment index
    _, t = im.v.iv.impute_problem(problem, debug=debug, remove_partial_calls=True,
                                  segment_location=index_segments_dir,  # if regenerate_segments else None,
                                  snps=snps, debug_sample=debug_sample)

    # Save the imputed genotypes in PLINK format, print them to stdout, and write the LGEN file
    im.io.write_plink(im.Problem(genotype=t.imputed, pedigree=im.examples.hutt_pedigree(), haplotype=None, frames=None),
                      work_dir + '/imputed.12', save_frames=False, save_haplotype=False)
    im.cgi.io_cgi.write_imputed(t, sys.stdout, poo_phase=problem.haplotype.poo_phase)
    with open(work_dir + '/imputed.12.lgen', 'wb') as f:
        im.cgi.io_cgi.write_imputed_lgen(t, f)
    return t
Esempio n. 3
0
 def _load_chrom(self, chrom):
     '''
     Load index of a new chromosome number, chrom.

     Reads 'chr<chrom>/metadata.npz' under self._index_dir, stores its 'snp' and 'region_size'
     entries, resets the currently-active region, and rebuilds the base-pair lookup structures.
     '''
     # Load index metadata
     self._chrom = chrom
     # For an .npz archive np.load returns a lazily-read file object; the with-block closes the
     # underlying file handle once both entries have been read into memory.
     with np.load('%s/chr%d/metadata.npz' % (self._index_dir, chrom)) as metadata:
         self._snp = metadata['snp']
         self._region_size = metadata['region_size']
     # Currently-active region is [start,stop); reset to an empty region
     self._start = 0
     self._stop = 0
     self._region_num = -1

     # Construct a BST for fast queries of the left-neighboring SNP of a base-pair position
     base_pair = self._snp['base_pair']
     # A single-SNP column is normalised to a 1-D one-element array. .item() extracts the
     # scalar for any one-element array, avoiding the NumPy deprecation of int() on arrays
     # with ndim > 0.
     self._base_pair = np.array([int(base_pair.item())]) if base_pair.size == 1 else base_pair
     self._snp_index_tree = util.list_index_tree(self._base_pair)
Esempio n. 4
0
    def _load_chrom(self, chrom):
        '''
        Load index of a new chromosome number, chrom.

        Reads 'chr<chrom>/metadata.npz' under self._index_dir, stores its 'snp' and
        'region_size' entries, resets the currently-active region, and rebuilds the
        base-pair lookup structures.
        '''
        # Load index metadata
        self._chrom = chrom
        # For an .npz archive np.load returns a lazily-read file object; the with-block
        # closes the underlying file handle once both entries have been read into memory.
        with np.load('%s/chr%d/metadata.npz' %
                     (self._index_dir, chrom)) as metadata:
            self._snp = metadata['snp']
            self._region_size = metadata['region_size']
        # Currently-active region is [start,stop); reset to an empty region
        self._start = 0
        self._stop = 0
        self._region_num = -1

        # Construct a BST for fast queries of the left-neighboring SNP of a base-pair position
        base_pair = self._snp['base_pair']
        # A single-SNP column is normalised to a 1-D one-element array. .item() extracts
        # the scalar for any one-element array, avoiding the NumPy deprecation of int()
        # on arrays with ndim > 0.
        if base_pair.size == 1:
            self._base_pair = np.array([int(base_pair.item())])
        else:
            self._base_pair = base_pair
        self._snp_index_tree = util.list_index_tree(self._base_pair)
Esempio n. 5
0
def pipeline_monogenic_validation(
        work_dir=os.environ['OBER_OUT'] + '/requests/monogenic/work',
        index_segments_dir=os.environ['OBER_OUT'] +
        '/requests/monogenic/work/index_segments',
        region_size=100,
        theta_affinity=0.95,
        theta_weight=0.5,
        regenerate_segments=True,
        snps=None,  # e.g. np.array([6, 8])
        debug=1,
        debug_sample=512):
    '''
    Run the monogenic validation pipeline: load the PLINK data set, optionally (re)build the
    segment index around each SNP, impute, and write the results.

    The OBER_OUT environment variable must be set: it is read when the defaults are evaluated
    and again at call time to locate the per-chromosome phasing directories.

    - work_dir: directory containing the 'monogenic.12' PLINK data set; the 'imputed.12'
      PLINK output and 'imputed.12.lgen' are written here
    - index_segments_dir: root of the segment index; one 'chr<N>' sub-directory per chromosome.
      Also passed to the imputation call as the segment location
    - region_size: number of SNPs per index region; a SNP index is rounded down to a multiple
      of this value to obtain its region start
    - theta_affinity, theta_weight: forwarded to index_segments_beagle.main
    - regenerate_segments: if true, extract and index segments for each SNP's region before
      imputing
    - snps: optional index into problem.info.snp restricting the SNPs processed; None = all
    - debug, debug_sample: forwarded to the imputation call

    Returns the second element of the pair returned by im.v.iv.impute_problem. The imputed
    genotypes are also written to standard output.
    '''
    # Load SNPs
    problem = im.io.read_plink(prefix=work_dir + '/monogenic.12',
                               pedigree=im.itu.HUTT_PED,
                               haplotype=None,
                               frames=None)
    # Testing: simulate aligned samples output (hap types should be 2 in the imputed genotype output line)
    problem.haplotype.poo_phase = np.zeros((problem.num_samples, ),
                                           dtype=np.byte)
    problem.haplotype.poo_phase[np.array([0, 1])] = 1
    problem.haplotype.poo_phase[np.array([2, 3])] = -1

    # Create segments only for the regions around each snp
    if regenerate_segments:
        for row in (problem.info.snp[snps]
                    if snps is not None else problem.info.snp):
            # Find SNP's region (the one containing its base-pair position)
            chrom, bp = row['chrom'], row['base_pair']
            phasing_dir = '%s/phasing/chr%d' % (os.environ['OBER_OUT'], chrom)
            index_segments_chrom_dir = '%s/chr%d' % (index_segments_dir, chrom)
            info_file = '%s/hutt.phased.info.npz' % (phasing_dir, )
            info = im.io.read_info_npz(info_file)
            snp_bp = info.snp['base_pair']
            snp_index = util.nearest_neighbor_in_list_tree(
                bp, snp_bp, util.list_index_tree(snp_bp))
            # Step back to the left neighbour when the nearest SNP lies to the right of bp.
            # NOTE(review): this yields -1 if bp precedes the first SNP - confirm that cannot occur.
            snp_index = snp_index if snp_bp[snp_index] <= bp else snp_index - 1
            # Floor division is required: '/' is true division under Python 3 (and under
            # 'from __future__ import division'), which would leave start ~= snp_index
            # instead of rounding down to the region boundary.
            start = region_size * (snp_index // region_size)
            stop = start + region_size
            segment_file = '%s/segments-%d-%d.out' % (index_segments_chrom_dir,
                                                      start, stop)
            if not os.path.exists(segment_file):
                util.mkdir_if_not_exists(index_segments_chrom_dir)
                util.run_command(
                    'find-segments-of-snp-range %d %d < %s/segments.out > %s' %
                    (start, stop, phasing_dir, segment_file))

            # Index segments. This loop only runs when regenerate_segments is true, so the
            # region is always re-indexed (the former file-existence checks were unreachable).
            index_segments_beagle.main(segment_file,
                                       info_file,
                                       segment_file,
                                       index_segments_chrom_dir,
                                       snp_index=snp_index,
                                       debug=2,
                                       theta_affinity=theta_affinity,
                                       theta_weight=theta_weight)

    # Impute using the newly generated segment index
    _, t = im.v.iv.impute_problem(
        problem,
        debug=debug,
        remove_partial_calls=True,
        segment_location=index_segments_dir,  # if regenerate_segments else None,
        snps=snps,
        debug_sample=debug_sample)

    # Save the imputed genotypes in PLINK format, print them to stdout, and write the LGEN file
    im.io.write_plink(im.Problem(genotype=t.imputed,
                                 pedigree=im.examples.hutt_pedigree(),
                                 haplotype=None,
                                 frames=None),
                      work_dir + '/imputed.12',
                      save_frames=False,
                      save_haplotype=False)
    im.cgi.io_cgi.write_imputed(t,
                                sys.stdout,
                                poo_phase=problem.haplotype.poo_phase)
    with open(work_dir + '/imputed.12.lgen', 'wb') as f:
        im.cgi.io_cgi.write_imputed_lgen(t, f)
    return t