Example #1
File: io.py Project: orenlivne/ober
def write_npz(problem, out_file):
    '''Write problem to NPZ file. out_file may be a file name or an open 
    file descriptor.'''
    p, g, h = problem.pedigree, problem.genotype, problem.haplotype
    if isinstance(out_file, str): util.mkdir_if_not_exists(os.path.dirname(out_file))
    # Wrap every non-ndarray quantity in a numpy array
    np.savez(out_file,
             
             pedigree_nodes=p.graph.nodes(),
             pedigree_graph=np.array([nx.to_edgelist(p.graph)]),
             pedigree_sample_id=p.sample_id,
             pedigree_sex=p.sex,
             pedigree_phenotype=p.phenotype,
             pedigree_node_type=p.node_type,
             pedigree_sample_index=p.sample_index,
             pedigree_num_genotyped=np.array([p.num_genotyped]),
             
             genotype_data=g.data,
             genotype_snp=g.snp,
             genotype_map=g.map,
             
             haplotype_data=h.data,
             haplotype_snp=h.snp,
             haplotype_qc=h.qc,
             haplotype_hap_type=h.hap_type,
             haplotype_poo_phase=h.poo_phase,
             
             error=problem.error,
             frames=np.array([problem.frames]),  # problem.frames.to_array(),
             info=np.array([problem.info]),
             lam=problem.lam)
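
For reference, the scalar fields wrapped in one-element arrays above must be unwrapped again on the way back in. A minimal sketch of the reverse direction with plain numpy (the project's actual reader may differ; the key names follow the savez call above):

import numpy as np

def read_npz_sketch(in_file):
    # np.load on an NPZ archive returns a lazy NpzFile; each key maps to one
    # array passed to np.savez above.
    files = np.load(in_file)
    num_genotyped = files['pedigree_num_genotyped'][0]  # unwrap the 1-element array
    genotype_data = files['genotype_data']
    return num_genotyped, genotype_data
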
Example #2
File: io.py Project: orenlivne/ober
def write_npz(problem, out_file):
    '''Write problem to NPZ file. out_file may be a file name or an open 
    file descriptor.'''
    p, g, h = problem.pedigree, problem.genotype, problem.haplotype
    if isinstance(out_file, str):
        util.mkdir_if_not_exists(os.path.dirname(out_file))
    # Wrap every non-ndarray quantity in a numpy array
    np.savez(
        out_file,
        pedigree_nodes=p.graph.nodes(),
        pedigree_graph=np.array([nx.to_edgelist(p.graph)]),
        pedigree_sample_id=p.sample_id,
        pedigree_sex=p.sex,
        pedigree_phenotype=p.phenotype,
        pedigree_node_type=p.node_type,
        pedigree_sample_index=p.sample_index,
        pedigree_num_genotyped=np.array([p.num_genotyped]),
        genotype_data=g.data,
        genotype_snp=g.snp,
        genotype_map=g.map,
        haplotype_data=h.data,
        haplotype_snp=h.snp,
        haplotype_qc=h.qc,
        haplotype_hap_type=h.hap_type,
        haplotype_poo_phase=h.poo_phase,
        error=problem.error,
        frames=np.array([problem.frames]),  # problem.frames.to_array(),
        info=np.array([problem.info]),
        lam=problem.lam)
Example #3
def __main(args, options):
    '''Main program - accepts an options struct.'''    
    # Parse and validate command-line arguments
    in_file, info_file, segment_file, out_dir = args
    options.out_dir = args[3]  # Useful shortcut

    try:
        if options.num_processes > 1:
            manager = Manager()
            lock = manager.Lock()
        else:
            lock = None
        start = time.time()

        # Load SNP info
        info = im.io.read_info_npz(info_file)
        if options.debug >= 1:
            _writeln('haps %d, snps %d, region size %d snps, processes %d' % \
                      (2 * info.num_samples, info.num_snps, options.region_size, options.num_processes), lock)
        
        # Read list of regions to process from stdin/in_file. If empty, process all regions
        regions = map(int, ([options.snp_index / options.region_size] if options.snp_index is not None else
                            (options.regions if options.regions else
                            (sys.stdin if in_file == '-' else open(in_file, 'rb')).readlines())))
        num_regions = (info.num_snps + options.region_size - 1) / options.region_size
        if not regions:
            regions = range(num_regions)
        _writeln('regions ' + repr(regions) + ' num_regions ' + repr(num_regions) + 
                  ' segment threshold ' + repr(options.min_len) + ' Mbp algorithm ' + options.algorithm + ' margin ' + repr(options.margin), lock)
        
        # Process each SNP region [start,stop) independently
        if options.save:
            util.mkdir_if_not_exists(out_dir)
        
        # Save index metadata, if processing the first region
        if options.save and (options.force_save_metadata or 0 in regions):
            if options.debug >= 1:
                _writeln('Writing metadata to %s/metadata' % (out_dir,), lock)
            np.savez('%s/metadata' % (out_dir,), snp=info.snp, region_size=options.region_size)

        process = _process_region_profile if options.debug >= 2 else _process_region
        if options.num_processes > 1:
            # Multi-process mode. Map phase:build and save regional index files 
            po = Pool(processes=options.num_processes)
            po.map(process, ((info, segment_file, out_dir, options, i, lock) for i in regions if 0 <= i < num_regions))
        else:
            # Single-process mode.
            for i in (i for i in regions if 0 <= i < num_regions):
                process((info, segment_file, out_dir, options, i, None))
            
        # Reduce phase - nothing to do here
        t = time.time() - start
        if options.debug >= 1:
            _writeln('Elapsed Time: %.3f sec (%.3f sec/region)' % (t, t / len(regions)), lock)
        if options.num_processes > 1:
            manager.shutdown()
    except:
        traceback.print_exc(file=sys.stdout)
        sys.exit(util.EXIT_FAILURE)
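
The region bookkeeping above is plain integer arithmetic: a SNP with index s falls in region s // region_size, and the region count is a ceiling division. A small self-contained illustration (hypothetical values; '//' keeps the Python 2 semantics of the '/' used above):

region_size = 100
snp_index = 437
num_snps = 1250

region = snp_index // region_size                          # -> 4
num_regions = (num_snps + region_size - 1) // region_size  # ceil(1250 / 100) -> 13
print('snp %d lies in region %d of %d' % (snp_index, region, num_regions))
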
Example #4
def plot_stats(stats, save_prefix=None, fig_num=1, snp_style="continuous", filter_length=20):
    """Plot imputation statistics for a phased chromosome, validating against the original genotypes."""
    if save_prefix:
        util.mkdir_if_not_exists(os.path.dirname(save_prefix))

    P.figure(fig_num)
    P.clf()
    # P.show()
    stats.plot_vs_snp(snp_style=snp_style, x_axis="cm_cumulative", filter_length=filter_length)
    if save_prefix:
        P.savefig(save_prefix + "-snp-cm-cumulative.png")

    fig_num += 1
    P.figure(fig_num)
    P.clf()
    # P.show()
    stats.plot_vs_snp(snp_style=snp_style, x_axis="bp_cumulative", filter_length=filter_length)
    if save_prefix:
        P.savefig(save_prefix + "-snp-bp-cumulative.png")

    fig_num += 1
    P.figure(fig_num)
    P.clf()
    # P.show()
    stats.plot_vs_snp(snp_style=snp_style, x_axis="cm_edge_dist", filter_length=filter_length)
    if save_prefix:
        P.savefig(save_prefix + "-snp-cm-edge-dist.png")

    fig_num += 1
    P.figure(fig_num)
    P.clf()
    # P.show()
    stats.scatter_snp_concordance(snp_style=snp_style)
    if save_prefix:
        P.savefig(save_prefix + "-snp-concordance.png")

    fig_num += 1
    P.figure(fig_num)
    P.clf()
    # P.show()
    stats.plot_vs_maf(snp_style=snp_style)
    if save_prefix:
        P.savefig(save_prefix + "-maf.png")

    fig_num += 1
    P.figure(fig_num)
    P.clf()
    # P.show()
    stats.plot_vs_sample()
    if save_prefix:
        P.savefig(save_prefix + "-sample.png")

    if save_prefix:
        stats.summary(open(save_prefix + "-stats.txt", "wb"))
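
The function body repeats one figure/clear/plot/save cycle six times. As a design note, the same flow can be driven by a table of (plot callable, filename suffix) pairs; a sketch assumed equivalent to the function above (P is matplotlib.pyplot, as in the original):

def plot_stats_sketch(stats, save_prefix=None, fig_num=1, snp_style="continuous", filter_length=20):
    plots = [
        (lambda: stats.plot_vs_snp(snp_style=snp_style, x_axis="cm_cumulative", filter_length=filter_length), "-snp-cm-cumulative"),
        (lambda: stats.plot_vs_snp(snp_style=snp_style, x_axis="bp_cumulative", filter_length=filter_length), "-snp-bp-cumulative"),
        (lambda: stats.plot_vs_snp(snp_style=snp_style, x_axis="cm_edge_dist", filter_length=filter_length), "-snp-cm-edge-dist"),
        (lambda: stats.scatter_snp_concordance(snp_style=snp_style), "-snp-concordance"),
        (lambda: stats.plot_vs_maf(snp_style=snp_style), "-maf"),
        (lambda: stats.plot_vs_sample(), "-sample"),
    ]
    for draw, suffix in plots:
        P.figure(fig_num)
        P.clf()
        draw()  # each callable issues one of the stats.* plot calls above
        if save_prefix:
            P.savefig(save_prefix + suffix + ".png")
        fig_num += 1
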
Example #5
def pipeline_monogenic_validation(work_dir=os.environ['OBER_OUT'] + '/requests/monogenic/work',
                                  index_segments_dir=os.environ['OBER_OUT'] + '/requests/monogenic/work/index_segments',
                                  region_size=100,
                                  theta_affinity=0.95,
                                  theta_weight=0.5,
                                  regenerate_segments=True,
                                  snps=None,  # np.array([6, 8]),
                                  debug=1,
                                  debug_sample=512):
    # Load SNPs
    problem = im.io.read_plink(prefix=work_dir + '/monogenic.12', pedigree=im.itu.HUTT_PED, haplotype=None, frames=None)
    # Testing: simulate aligned samples output (hap types should be 2 in the imputed genotype output line)
    problem.haplotype.poo_phase = np.zeros((problem.num_samples,), dtype=np.byte)
    problem.haplotype.poo_phase[np.array([0, 1])] = 1
    problem.haplotype.poo_phase[np.array([2, 3])] = -1
    
    # Create segments only for the regions around each snp
    if regenerate_segments:
        for row in (problem.info.snp[snps] if snps is not None else problem.info.snp):
            # Find SNP's region (the one containing its base-pair position) 
            chrom, bp = row['chrom'], row['base_pair']
            phasing_dir = '%s/phasing/chr%d' % (os.environ['OBER_OUT'], chrom)
            index_segments_chrom_dir = '%s/chr%d' % (index_segments_dir, chrom)
            info_file = '%s/hutt.phased.info.npz' % (phasing_dir,)
            info = im.io.read_info_npz(info_file)
            snp_bp = info.snp['base_pair']
            snp_index = util.nearest_neighbor_in_list_tree(bp, snp_bp, util.list_index_tree(snp_bp))
            snp_index = snp_index if snp_bp[snp_index] <= bp else snp_index - 1
            start = region_size * (snp_index / region_size)
            stop = start + region_size
            segment_file = '%s/segments-%d-%d.out' % (index_segments_chrom_dir, start, stop)
            if not os.path.exists(segment_file):
                util.mkdir_if_not_exists(index_segments_chrom_dir)
                util.run_command('find-segments-of-snp-range %d %d < %s/segments.out > %s' % (start, stop, phasing_dir, segment_file)) 
            
            # Index segments
            if regenerate_segments or \
            not os.path.exists('%s/metadata.npz' % (index_segments_chrom_dir,)) or \
            not os.path.exists('%s/region-%d.npz' % (index_segments_chrom_dir, start)):
                index_segments_beagle.main(segment_file, info_file, segment_file, index_segments_chrom_dir,
                                           snp_index=snp_index, debug=2,
                                           theta_affinity=theta_affinity, theta_weight=theta_weight)
    
    # Impute using the newly generated segment index
    _, t = im.v.iv.impute_problem(problem, debug=debug, remove_partial_calls=True,
                                  segment_location=index_segments_dir,  # if regenerate_segments else None,
                                  snps=snps, debug_sample=debug_sample)

    im.io.write_plink(im.Problem(genotype=t.imputed, pedigree=im.examples.hutt_pedigree(), haplotype=None, frames=None),
                      work_dir + '/imputed.12', save_frames=False, save_haplotype=False)
    im.cgi.io_cgi.write_imputed(t, sys.stdout, poo_phase=problem.haplotype.poo_phase)
    with open(work_dir + '/imputed.12.lgen', 'wb') as f:
        im.cgi.io_cgi.write_imputed_lgen(t, f)
    return t
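
The nearest_neighbor_in_list_tree call followed by the '<= bp' correction amounts to locating the last SNP at or before the query base-pair position. Assuming snp_bp is sorted ascending, the same index can be sketched with the standard bisect module (the original clamps differently at the left edge):

import bisect

def snp_at_or_before(snp_bp, bp):
    # Index of the last entry of the sorted list snp_bp that is <= bp.
    i = bisect.bisect_right(snp_bp, bp) - 1
    return max(i, 0)  # clamp when bp precedes the first SNP

snp_bp = [5000, 25000, 90000, 170000]
snp_index = snp_at_or_before(snp_bp, 100000)  # -> 2
start = 100 * (snp_index // 100)              # region start, as computed above
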
Example #6
def plot_stats(stats, save_prefix=None, fig_num=1, snp_style='continuous',
               filter_length=20):
    '''Plot imputation statistics for a phased chromosome, validating against the original genotypes.'''
    if save_prefix: util.mkdir_if_not_exists(os.path.dirname(save_prefix))
     
    P.figure(fig_num)
    P.clf()
    # P.show()
    stats.plot_vs_snp(snp_style=snp_style, x_axis='cm_cumulative', filter_length=filter_length)
    if save_prefix: P.savefig(save_prefix + '-snp-cm-cumulative.png')  

    fig_num += 1
    P.figure(fig_num)
    P.clf()
    # P.show()
    stats.plot_vs_snp(snp_style=snp_style, x_axis='bp_cumulative', filter_length=filter_length)
    if save_prefix: P.savefig(save_prefix + '-snp-bp-cumulative.png')  

    fig_num += 1
    P.figure(fig_num)
    P.clf()
    # P.show()
    stats.plot_vs_snp(snp_style=snp_style, x_axis='cm_edge_dist', filter_length=filter_length)
    if save_prefix: P.savefig(save_prefix + '-snp-cm-edge-dist.png')  

    fig_num += 1
    P.figure(fig_num)
    P.clf()
    # P.show()
    stats.scatter_snp_concordance(snp_style=snp_style)
    if save_prefix: P.savefig(save_prefix + '-snp-concordance.png')

    fig_num += 1
    P.figure(fig_num)
    P.clf()
    # P.show()
    stats.plot_vs_maf(snp_style=snp_style)
    if save_prefix: P.savefig(save_prefix + '-maf.png')  

    fig_num += 1
    P.figure(fig_num)
    P.clf()
    # P.show()
    stats.plot_vs_sample()
    if save_prefix: P.savefig(save_prefix + '-sample.png')

    if save_prefix: stats.summary(open(save_prefix + '-stats.txt', 'wb'))        
Example #7
def __main(args, options):
    '''Main program - accepts an options struct.'''    
    # Parse and validate command-line arguments
    in_file, info_file, segment_file, out_dir = args
    options.out_dir = args[3]  # Useful shortcut

    try:
        # Initialize thread pool
        if options.num_processes > 1:
            manager = Manager()
            lock = manager.Lock()
        else: lock = None
        start = time.time()

        # Load SNP info
        info = im.io.read_info_npz(info_file)
        if options.debug >= 1:
            _writeln('haps %d, snps %d, region size %d snps, processes %d' % \
                      (2 * info.num_samples, info.num_snps, options.region_size, options.num_processes), lock)
        
        # Read list of regions to process from stdin/in_file. If empty, process all regions.
        # If a region index list is read, IT MUST BE CONTIGUOUS! (e.g. [3, 4, 5, 6])
        regions = map(int, ([options.snp_index / options.region_size] if options.snp_index is not None else
                            (options.regions if options.regions else
                            (sys.stdin if in_file == '-' else open(in_file, 'rb')).readlines())))
        num_regions = (info.num_snps + options.region_size - 1) / options.region_size
        if not regions: regions = range(num_regions)
        _writeln('regions ' + repr(regions) + ' num_regions ' + repr(num_regions) + 
                 ' segment threshold ' + repr(options.min_len) + ' Mbp algorithm ' + options.algorithm + ' margin ' + repr(options.margin), lock)
                
        # Save index metadata if first region is processed in this run
        if options.save: util.mkdir_if_not_exists(out_dir)
        if options.save and (options.force_save_metadata or 0 in regions):
            if options.debug >= 1: _writeln('Writing metadata to %s/metadata' % (out_dir,), lock)
            np.savez('%s/metadata' % (out_dir,), snp=info.snp, region_size=options.region_size)

        segments = _SegmentCollection(info, segment_file, regions, options)

        # Map phase: process each SNP independently
        r = segments.region_info
        snps = [(r['region'][0], options.snp_index - r['snp_start'][0])] if options.snp_index is not None else \
            ((region, snp) for region, start_raw, stop_raw in zip(r['region'], r['snp_start'], r['snp_stop'])
             for snp in xrange(start_raw, stop_raw))
        snp_processor = _new_snp_processor(info, segments, options, lock)

        if options.num_processes > 1:
            # Multi-process mode. SNPs are processed in parallel.
            po = Pool(processes=options.num_processes)
            result = po.map(snp_processor.process, ((region, snp) for region, snp in snps))
        else:
            # Single-process mode (sequential)
            result = [snp_processor.process((region, snp)) for region, snp in snps]
        print result
        
        # Reduce phase: organize results in array and save to npz files
        _save_index(segments.region_info, result, options.save, out_dir) 

        t = time.time() - start
        if options.debug >= 1: _writeln('Elapsed Time: %.3f sec (%.3f sec/region)' % (t, t / len(regions)), lock)
        if options.num_processes > 1: manager.shutdown()
    except:
        traceback.print_exc(file=sys.stdout)
        sys.exit(util.EXIT_FAILURE)
Example #8
        default=None,
        help=
        'Identity coefficient file for all sample pairs. Format: id1 id2 lam delta1...delta9. If empty, defaults to plink_set.id'
    )
    options, args = parser.parse_args(sys.argv[1:])
    if len(args) != 3:
        print usage
        sys.exit(util.EXIT_BAD_INPUT_ARGS)
    input, chrom, out = args[0], int(args[1]), args[2]  # @ReservedAssignment
    if chrom < 1 or chrom > 22:
        print usage
        print('\nMust specify a chromosome number in 1..22.')
        sys.exit(util.EXIT_BAD_INPUT_ARGS)

    try:
        util.mkdir_if_not_exists(os.path.dirname(out))
        # Use [cM] as genetic distance unit
        plink_cmd_base = '%s --bfile %s --chr %d --out %s' % (bu.PLINK, input,
                                                              chrom, out)

        if options.recode:
            # First, compute allele frequencies with PLINK
            util.run_command(plink_cmd_base + ' --nonfounders --freq')
            # Convert frequencies file that to a reference allele recoding
            # file (a file containing the list of SNPs and their minor allele letter)
            bu.frq_to_minor_file(out + '.frq', out + '.mnr')
            # Finally, convert binary PLINK to a 12-recoded TPED, where 1=minor allele for each SNP
            util.run_command(
                '%s --transpose --recode12 --reference-allele %s.mnr' %
                (plink_cmd_base, out))
        else:
Example #9
def __main(args, options):
    '''Main program - accepts an options struct.'''
    # Parse and validate command-line arguments
    in_file, info_file, segment_file, out_dir = args
    options.out_dir = args[3]  # Useful shortcut

    try:
        # Initialize thread pool
        if options.num_processes > 1:
            manager = Manager()
            lock = manager.Lock()
        else:
            lock = None
        start = time.time()

        # Load SNP info
        info = im.io.read_info_npz(info_file)
        if options.debug >= 1:
            _writeln('haps %d, snps %d, region size %d snps, processes %d' % \
                      (2 * info.num_samples, info.num_snps, options.region_size, options.num_processes), lock)

        # Read list of regions to process from stdin/in_file. If empty, process all regions.
        # If a region index list is read, IT MUST BE CONTIGUOUS! (e.g. [3, 4, 5, 6])
        regions = map(
            int, ([options.snp_index /
                   options.region_size] if options.snp_index is not None else
                  (options.regions if options.regions else
                   (sys.stdin if in_file == '-' else open(in_file, 'rb')
                    ).readlines())))
        num_regions = (info.num_snps + options.region_size -
                       1) / options.region_size
        if not regions: regions = range(num_regions)
        _writeln(
            'regions ' + repr(regions) + ' num_regions ' + repr(num_regions) +
            ' segment length threshold ' + repr(options.min_len) +
            ' Mbp algorithm ' + options.algorithm + ' margin ' +
            repr(options.margin) + ' affinity threshold ' +
            repr(options.theta_affinity) + ' weight threshold ' +
            repr(options.theta_weight), lock)

        # Save index metadata if first region is processed in this run
        if options.save: util.mkdir_if_not_exists(out_dir)
        if options.save and (options.force_save_metadata or 0 in regions):
            if options.debug >= 1:
                _writeln('Writing metadata to %s/metadata' % (out_dir, ), lock)
            np.savez('%s/metadata' % (out_dir, ),
                     snp=info.snp,
                     region_size=options.region_size)

        segments = _SegmentCollection(info, segment_file, regions, options)
        # print segments

        # Map phase: process each SNP independently
        r = segments.region_info
        snps = [(r['region'][0], options.snp_index)] if options.snp_index is not None else \
            ((region, snp) for region, start_raw, stop_raw in zip(r['region'], r['snp_start'], r['snp_stop'])
             for snp in xrange(start_raw, stop_raw))
        snp_processor = _new_snp_processor(info, segments, options, lock)

        if options.num_processes > 1:
            # Multi-process mode. SNPs are processed in parallel.
            po = Pool(processes=options.num_processes)
            # a = MyObject()
            # result = po.map(process_snp, ((region, snp) for region, snp in snps))
            result = po.imap(snp_processor.process,
                             ((region, snp) for region, snp in snps))
            # result = po.map(process_snp, ((a, region, snp) for region, snp in snps))
        else:
            # Single-process mode (sequential)
            result = [
                snp_processor.process((region, snp)) for region, snp in snps
            ]
        # print result

        # Reduce phase: organize results in array and save to npz files
        _save_index(segments.region_info, result, out_dir, options)

        t = time.time() - start
        if options.debug >= 1:
            _writeln(
                'Elapsed Time: %.3f sec (%.3f sec/region)' %
                (t, t / len(regions)), lock)
        if options.num_processes > 1: manager.shutdown()
    except:
        traceback.print_exc(file=sys.stdout)
        sys.exit(util.EXIT_FAILURE)
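
One detail worth noting in this variant: Pool.map blocks and returns a fully built list, while Pool.imap (used above) returns a lazy iterator that only does work as _save_index consumes it. A minimal stand-alone illustration of the difference, using the standard multiprocessing API:

from multiprocessing import Pool

def square(x):
    return x * x  # worker must be a module-level (picklable) function

if __name__ == '__main__':
    po = Pool(processes=2)
    eager = po.map(square, range(4))   # [0, 1, 4, 9], computed before returning
    lazy = po.imap(square, range(4))   # iterator; results arrive on demand
    print(list(lazy))                  # forces evaluation: prints [0, 1, 4, 9]
    po.close()
    po.join()
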
Example #10
def pipeline_monogenic_validation(
        work_dir=os.environ['OBER_OUT'] + '/requests/monogenic/work',
        index_segments_dir=os.environ['OBER_OUT'] +
    '/requests/monogenic/work/index_segments',
        region_size=100,
        theta_affinity=0.95,
        theta_weight=0.5,
        regenerate_segments=True,
        snps=None,  # np.array([6, 8]),
        debug=1,
        debug_sample=512):
    # Load SNPs
    problem = im.io.read_plink(prefix=work_dir + '/monogenic.12',
                               pedigree=im.itu.HUTT_PED,
                               haplotype=None,
                               frames=None)
    # Testing: simulate aligned samples output (hap types should be 2 in the imputed genotype output line)
    problem.haplotype.poo_phase = np.zeros((problem.num_samples, ),
                                           dtype=np.byte)
    problem.haplotype.poo_phase[np.array([0, 1])] = 1
    problem.haplotype.poo_phase[np.array([2, 3])] = -1

    # Create segments only for the regions around each snp
    if regenerate_segments:
        for row in (problem.info.snp[snps]
                    if snps is not None else problem.info.snp):
            # Find SNP's region (the one containing its base-pair position)
            chrom, bp = row['chrom'], row['base_pair']
            phasing_dir = '%s/phasing/chr%d' % (os.environ['OBER_OUT'], chrom)
            index_segments_chrom_dir = '%s/chr%d' % (index_segments_dir, chrom)
            info_file = '%s/hutt.phased.info.npz' % (phasing_dir, )
            info = im.io.read_info_npz(info_file)
            snp_bp = info.snp['base_pair']
            snp_index = util.nearest_neighbor_in_list_tree(
                bp, snp_bp, util.list_index_tree(snp_bp))
            snp_index = snp_index if snp_bp[snp_index] <= bp else snp_index - 1
            start = region_size * (snp_index / region_size)
            stop = start + region_size
            segment_file = '%s/segments-%d-%d.out' % (index_segments_chrom_dir,
                                                      start, stop)
            if not os.path.exists(segment_file):
                util.mkdir_if_not_exists(index_segments_chrom_dir)
                util.run_command(
                    'find-segments-of-snp-range %d %d < %s/segments.out > %s' %
                    (start, stop, phasing_dir, segment_file))

            # Index segments
            if regenerate_segments or \
            not os.path.exists('%s/metadata.npz' % (index_segments_chrom_dir,)) or \
            not os.path.exists('%s/region-%d.npz' % (index_segments_chrom_dir, start)):
                index_segments_beagle.main(segment_file,
                                           info_file,
                                           segment_file,
                                           index_segments_chrom_dir,
                                           snp_index=snp_index,
                                           debug=2,
                                           theta_affinity=theta_affinity,
                                           theta_weight=theta_weight)

    # Impute using the newly generated segment index
    _, t = im.v.iv.impute_problem(
        problem,
        debug=debug,
        remove_partial_calls=True,
        segment_location=index_segments_dir,  # if regenerate_segments else None,
        snps=snps,
        debug_sample=debug_sample)

    im.io.write_plink(im.Problem(genotype=t.imputed,
                                 pedigree=im.examples.hutt_pedigree(),
                                 haplotype=None,
                                 frames=None),
                      work_dir + '/imputed.12',
                      save_frames=False,
                      save_haplotype=False)
    im.cgi.io_cgi.write_imputed(t,
                                sys.stdout,
                                poo_phase=problem.haplotype.poo_phase)
    with open(work_dir + '/imputed.12.lgen', 'wb') as f:
        im.cgi.io_cgi.write_imputed_lgen(t, f)
    return t
Example #11
 parser.add_option('-r', '--recode'      , action='store_true'  , dest='recode', default=True,
                   help='Recode alleles to 1=minor, 2=major (if False, a random assignment to 1,2 is made)')
 parser.add_option('-i', '--id-coef', type=str, dest='id_coef', default=None,
                   help='Identity coefficient file for all sample pairs. Format: id1 id2 lam delta1...delta9. If empty, defaults to plink_set.id')
 options, args = parser.parse_args(sys.argv[1:])
 if len(args) != 3:
     print usage
     sys.exit(util.EXIT_BAD_INPUT_ARGS)
 input, chrom, out = args[0], int(args[1]), args[2]  # @ReservedAssignment
 if chrom < 1 or chrom > 22:
     print usage
     print('\nMust specify a chromosome number in 1..22.')
     sys.exit(util.EXIT_BAD_INPUT_ARGS)
     
 try:
     util.mkdir_if_not_exists(os.path.dirname(out))
     # Use [cM] as genetic distance unit
     plink_cmd_base = '%s --bfile %s --chr %d --out %s' % (bu.PLINK, input, chrom, out)
     
     if options.recode:
         # First, compute allele frequencies with PLINK  
         util.run_command(plink_cmd_base + ' --nonfounders --freq')
         # Convert frequencies file that to a reference allele recoding
         # file (a file containing the list of SNPs and their minor allele letter)
         bu.frq_to_minor_file(out + '.frq', out + '.mnr') 
         # Finally, convert binary PLINK to a 12-recoded TPED, where 1=minor allele for each SNP                 
         util.run_command('%s --transpose --recode12 --reference-allele %s.mnr' % (plink_cmd_base, out))
     else:
         # No recoding, just convert binary to 2-recoded TPED. PLINK assigns "1" to
         # the first-encountered allele in the file for each SNP.
         util.run_command('%s --transpose --recode12' % (plink_cmd_base,))
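
bu.frq_to_minor_file is not shown on this page. Based on PLINK's .frq format (a header row, then columns CHR SNP A1 A2 MAF NCHROBS, where A1 is the minor allele), a plausible sketch of the conversion it performs would be:

def frq_to_minor_file_sketch(frq_file, mnr_file):
    # Hypothetical reimplementation: write 'SNP-ID minor-allele' per line, the
    # two-column format that --reference-allele expects.
    with open(frq_file) as fin, open(mnr_file, 'w') as fout:
        next(fin)  # skip the .frq header row
        for line in fin:
            fields = line.split()
            fout.write('%s %s\n' % (fields[1], fields[2]))  # SNP, A1 (minor)
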
Example #12
    (options, args) = parser.parse_args(sys.argv[1:])
    if len(args) != 4:
        print usage
        sys.exit(1)

    # in_dir =  'phasing.20121130/split_chr'
    # out_dir = 'phasing.20121130/individual'
    # chrom = 5
    # part = 0
    in_dir, out_dir, chrom, part = args[0], args[1], int(args[2]), int(args[3])
    print 'Running phasing in stages'
    print 'in_dir  = %s' % (in_dir,)
    print 'out_dir = %s' % (out_dir,)
    print 'chrom   = %d' % (chrom,)
    print 'part    = %d' % (part,)

    out_dir = '%s/chr%d' % (out_dir, chrom)
    util.mkdir_if_not_exists(out_dir)
    
    npz_file = '%s/hutt.stage0.npz' % (out_dir,)
    if not os.path.exists(npz_file) and options.stage == 0:
        convert.main(pedigree=itu.HUTT_PED,
                     prefix='%s/chr%d/hutt_chr%d_part%d' % (in_dir, chrom, chrom, part),
                     npz=npz_file, target='npz', debug=True)
    
    for stage in (range(1, 5) if options.stage == 0 else [options.stage]):
        phase.main(pedigree=itu.HUTT_PED,
                          input='%s/hutt.stage%d.npz' % (out_dir, stage - 1),
                          output='%s/hutt.stage%d.npz' % (out_dir, stage),
                          stage=stage, debug=options.debug)
Example #13
def __main(args, options):
    '''Main program - accepts an options struct.'''
    # Parse and validate command-line arguments
    in_file, info_file, segment_file, out_dir = args
    options.out_dir = args[3]  # Useful shortcut

    try:
        if options.num_processes > 1:
            manager = Manager()
            lock = manager.Lock()
        else:
            lock = None
        start = time.time()

        # Load SNP info
        info = im.io.read_info_npz(info_file)
        if options.debug >= 1:
            _writeln('haps %d, snps %d, region size %d snps, processes %d' % \
                      (2 * info.num_samples, info.num_snps, options.region_size, options.num_processes), lock)

        # Read list of regions to process from stdin/in_file. If empty, process all regions
        regions = map(int,
                      ([options.snp_index /
                        options.region_size] if options.snp_index is not None else
                       (options.regions if options.regions else
                        (sys.stdin if in_file == '-' else open(in_file, 'rb')
                         ).readlines())))
        num_regions = (info.num_snps + options.region_size -
                       1) / options.region_size
        if not regions:
            regions = range(num_regions)
        _writeln(
            'regions ' + repr(regions) + ' num_regions ' + repr(num_regions) +
            ' segment threshold ' + repr(options.min_len) + ' Mbp algorithm ' +
            options.algorithm + ' margin ' + repr(options.margin), lock)

        # Process each SNP region [start,stop) independently
        if options.save:
            util.mkdir_if_not_exists(out_dir)

        # Save index metadata, if processing the first region
        if options.save and (options.force_save_metadata or 0 in regions):
            if options.debug >= 1:
                _writeln('Writing metadata to %s/metadata' % (out_dir, ), lock)
            np.savez('%s/metadata' % (out_dir, ),
                     snp=info.snp,
                     region_size=options.region_size)

        process = _process_region_profile if options.debug >= 2 else _process_region
        if options.num_processes > 1:
            # Multi-process mode. Map phase:build and save regional index files
            po = Pool(processes=options.num_processes)
            po.map(process,
                   ((info, segment_file, out_dir, options, i, lock)
                    for i in regions if 0 <= i < num_regions))
        else:
            # Single-process mode.
            for i in (i for i in regions if 0 <= i < num_regions):
                process((info, segment_file, out_dir, options, i, None))

        # Reduce phase - nothing to do here
        t = time.time() - start
        if options.debug >= 1:
            _writeln(
                'Elapsed Time: %.3f sec (%.3f sec/region)' %
                (t, t / len(regions)), lock)
        if options.num_processes > 1:
            manager.shutdown()
    except:
        traceback.print_exc(file=sys.stdout)
        sys.exit(util.EXIT_FAILURE)
Example #14
                      type='str',
                      dest='out_base_name',
                      default=None,
                      help='Output PLINK data set base name')
    options, args = parser.parse_args(sys.argv[1:])
    if len(args) != 1:
        print usage
        sys.exit(util.EXIT_BAD_INPUT_ARGS)
    input_file = args[0]
    if not options.out_base_name:
        options.out_base_name = os.path.splitext(input_file)[0]

    try:
        # Initialize
        daos = db_gene.snp.snp_db_dao.Daos(url=options.db_url)
        util.mkdir_if_not_exists(os.path.dirname(options.out_base_name))

        # Set genetic distance column in BIM file (read locations from snp db) and save a new copy of it
        snp_data = np.genfromtxt(
            input_file,
            dtype=[
                ('chrom', np.uint8),  # Chromosome # containing the SNP
                ('name', np.chararray),  # SNP name (e.g., 'rs...')
                ('dist_cm', np.float),  # Genetic position [CENTI-Morgans!!]
                ('base_pair', np.uint),  # Base pair position on chromosome
                ('allele1', np.chararray),
                ('allele2', np.chararray)
            ])
        snp_names = snp_data['name']
        a = dict((x.name, x) for x in daos.snp_dao.get_snps_iter(snp_names))
        # Note: our genetic distance unit is cM
Example #15
    data_cols = tuple(data[:, i] for i in xrange(data.shape[1]))
    # Create SNP classes
    maf, called_in_both = data[:, maf_col], data[:, called_in_both_col]
    all_snps = SnpClass('all', data_cols,
                        (maf > 0) & (called_in_both > min_called_in_both))
    common_snps = SnpClass('common', data_cols, (maf > maf_threshold) &
                           (called_in_both > min_called_in_both))
    # rare_snps = SnpClass('rare', data_cols, (maf <= maf_threshold) & (called_in_both > min_called_in_both))
    return all_snps, common_snps


def plot_impute2_concordance(
    (all_snps, common_snps), save_dir=None, plot=False, min_info_to_plot=0.9):
    '''Generate plot of impute2 concordance for a single window from a Struct holding
    statistics on all snps, all_snps.'''
    # Useful variables
    lim_threshold = [0., 1.]
    n = 40
    maf_n = 40
    # info_bins = [0, 0.7, 0.8, 0.85, 0.9, 1]
    info_bins = [0, 0.9, 1]

    threshold = np.linspace(lim_threshold[0], lim_threshold[1], n + 1)
    maf_bins = np.linspace(0, 0.5, maf_n + 1)
    k = 0  # Figure counter

    if save_dir: util.mkdir_if_not_exists(save_dir)
    for snp_class in (all_snps, ):  # (all_snps, common_snps, rare_snps):
        #         k += 1
        #         P.figure(k)
Example #16
                   help='Print debugging information')
 parser.add_option('-d', '--db-url'       , type='str'           , dest='db_url', default=db_gene.DEFAULT_URL,
                   help='SNP database URL')
 parser.add_option('-o', '--out'          , type='str'           , dest='out_base_name', default=None,
                   help='Output PLINK data set base name')
 options, args = parser.parse_args(sys.argv[1:])
 if len(args) != 1:
     print usage
     sys.exit(util.EXIT_BAD_INPUT_ARGS)
 input_file = args[0]
 if not options.out_base_name: options.out_base_name = os.path.splitext(input_file)[0]
         
 try:
     # Initialize
     daos = db_gene.snp.snp_db_dao.Daos(url=options.db_url)
     util.mkdir_if_not_exists(os.path.dirname(options.out_base_name))
     
     # Set genetic distance column in BIM file (read locations from snp db) and save a new copy of it
     snp_data = np.genfromtxt(input_file,
                              dtype=[
                                     ('chrom', np.uint8),  # Chromosome # containing the SNP
                                     ('name', np.chararray),  # SNP name (e.g., 'rs...')
                                     ('dist_cm', np.float),  # Genetic position [CENTI-Morgans!!]
                                     ('base_pair', np.uint),  # Base pair position on chromosome
                                     ('allele1', np.chararray),
                                     ('allele2', np.chararray)
                                     ])
     snp_names = snp_data['name']
     a = dict((x.name, x) for x in daos.snp_dao.get_snps_iter(snp_names))
     # Note: our genetic distance unit is cM 
     snp_data['dist_cm'] = map(lambda x: x if x else 0.0, ((a[x].genetic_pos if a.has_key(x) else None) for x in snp_names))
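
The genfromtxt dtype above mirrors the six whitespace-separated columns of a PLINK .bim file. For reference, one hypothetical row and how the fields line up:

line = '22 rs1234567 0.0 16050408 A G'  # chrom, name, dist_cm, base_pair, allele1, allele2
chrom, name, dist_cm, base_pair, allele1, allele2 = line.split()
print('%s at %s bp, %s cM' % (name, base_pair, dist_cm))
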
Example #17
        print usage
        sys.exit(1)

    # in_dir =  'phasing.20121130/split_chr'
    # out_dir = 'phasing.20121130/individual'
    # chrom = 5
    # part = 0
    in_dir, out_dir, chrom, part = args[0], args[1], int(args[2]), int(args[3])
    print "Running phasing in stages"
    print "in_dir  = %s" % (in_dir,)
    print "out_dir = %s" % (out_dir,)
    print "chrom   = %d" % (chrom,)
    print "part    = %d" % (part,)

    out_dir = "%s/chr%d" % (out_dir, chrom)
    util.mkdir_if_not_exists(out_dir)

    npz_file = "%s/hutt.stage0.npz" % (out_dir,)
    if not os.path.exists(npz_file) and options.stage == 0:
        convert.main(
            pedigree=itu.HUTT_PED,
            prefix="%s/chr%d/hutt_chr%d_part%d" % (in_dir, chrom, chrom, part),
            npz=npz_file,
            target="npz",
            debug=True,
        )

    for stage in range(1, 5) if options.stage == 0 else [options.stage]:
        phase.main(
            pedigree=itu.HUTT_PED,
            input="%s/hutt.stage%d.npz" % (out_dir, stage - 1),
Example #18
 parser.add_option('-r', '--recode'      , action='store_true'  , dest='recode', default=False,
                   help='Recode alleles to 1=minor, 2=major (if False, allele coding is kept intact)')
 parser.add_option('-g', '--out-gxn'     , type='str'           , dest='out_gxn', default=bu.ARG_NONE,
                   help="Output directory of GXN files (if not specified, writes to same directory as out-plink-set's)")
 (options, args) = parser.parse_args(sys.argv[1:])
 options.print_times = True
 if options.out_gxn.startswith(bu.ARG_NONE):
     options.out_gxn = None
 if len(args) != 3:
     print usage
     sys.exit(util.EXIT_BAD_INPUT_ARGS)
 
 try:
     # Prepare file names, create directories
     (base_name, pedigree_file, out_base_name) = args
     mkdir_if_not_exists(os.path.dirname(out_base_name))
     if options.out_gxn:
         mkdir_if_not_exists(os.path.dirname(options.out_gxn))
     else:
         options.out_gxn = out_base_name
 
     npz_file = base_name + '.npz'
     
     # Convert plink tped -> npz
     problem = io.read_plink(prefix=base_name, pedigree=pedigree_file, haplotype=None,
                             verbose=options.debug)
 
     # Phase, impute, fill missing
     phaser = phase.build_phasing_pipeline(options)      
     request = phase.run_phasing_chain(phaser, problem)
     stats = request.stats
Example #19
    #-----------------------------
    # Load data - phased run 
    #-----------------------------
    data = np.loadtxt(in_file, usecols=usecols) 
    data_cols = tuple(data[:, i] for i in xrange(data.shape[1]))
    # Create SNP classes
    maf, called_in_both = data[:, maf_col], data[:, called_in_both_col]
    all_snps = SnpClass('all', data_cols, (maf > 0) & (called_in_both > min_called_in_both))
    common_snps = SnpClass('common', data_cols, (maf > maf_threshold) & (called_in_both > min_called_in_both))
    # rare_snps = SnpClass('rare', data_cols, (maf <= maf_threshold) & (called_in_both > min_called_in_both))
    return all_snps, common_snps
    
def plot_impute2_concordance((all_snps, common_snps), save_dir=None, plot=False, min_info_to_plot=0.9):
    '''Generate plot of impute2 concordance for a single window from a Struct holding
    statistics on all snps, all_snps.'''
    # Useful variables
    lim_threshold = [0., 1.]
    n = 40
    maf_n = 40
    # info_bins = [0, 0.7, 0.8, 0.85, 0.9, 1]
    info_bins = [0, 0.9, 1]

    threshold = np.linspace(lim_threshold[0], lim_threshold[1], n + 1)
    maf_bins = np.linspace(0, 0.5, maf_n + 1)
    k = 0  # Figure counter

    if save_dir: util.mkdir_if_not_exists(save_dir)
    for snp_class in (all_snps,):  # (all_snps, common_snps, rare_snps): 
#         k += 1
#         P.figure(k)
Example #20
                      type='int',
                      dest='stop',
                      default=None,
                      help='Ending part number (not inclusive)')
    (options, args) = parser.parse_args(sys.argv[1:])
    if len(args) != 3:
        print usage
        sys.exit(util.EXIT_BAD_INPUT_ARGS)
    if options.start is None or options.stop is None:
        print 'Must specify start and stop'
        print usage
        sys.exit(util.EXIT_BAD_INPUT_ARGS)
    (in_file, part_type, out_file) = args
    part_type = __parse_part_type(part_type)
    num_parts = options.stop - options.start
    mkdir_if_not_exists(os.path.dirname(out_file))

    try:
        # Merge PLINK data sets. If there's one part, nothing to merge, just copy the files.
        part_names = bu.partition_names(in_file,
                                        part_type,
                                        parts=xrange(options.start,
                                                     options.stop)).values()
        first_part_name = part_names[0]
        print 'Reducing, num_parts', num_parts
        if num_parts == 1:
            for ext in EXTENSIONS:
                shutil.copy(first_part_name + '.' + ext, out_file + '.' + ext)
        else:
            # Prepare PLINK merge command input file
            f = tempfile.NamedTemporaryFile(delete=False)
Example #21
 parser.add_option('-s', '--start-part'          , type='int', dest='start', default=None,
                   help='Starting part number (inclusive)')
 parser.add_option('-e', '--stop-part'          , type='int', dest='stop', default=None,
                   help='Ending part number (not inclusive)')
 (options, args) = parser.parse_args(sys.argv[1:])
 if len(args) != 3:
     print usage
     sys.exit(util.EXIT_BAD_INPUT_ARGS)
 if options.start is None or options.stop is None:
     print 'Must specify start and stop'
     print usage
     sys.exit(util.EXIT_BAD_INPUT_ARGS)
 (in_file, part_type, out_file) = args
 part_type = __parse_part_type(part_type)
 num_parts = options.stop - options.start
 mkdir_if_not_exists(os.path.dirname(out_file))
 
 try:
     # Merge PLINK data sets. If there's one part, nothing to merge, just copy the files.
     part_names = bu.partition_names(in_file, part_type,
                                     parts=xrange(options.start, options.stop)).values()
     first_part_name = part_names[0]
     print 'Reducing, num_parts', num_parts
     if num_parts == 1:
         for ext in EXTENSIONS:
             shutil.copy(first_part_name + '.' + ext, out_file + '.' + ext)
     else:
         # Prepare PLINK merge command input file
         f = tempfile.NamedTemporaryFile(delete=False)
         for name in part_names:
             for ext in EXTENSIONS: