def write_npz(problem, out_file):
    '''Write problem to NPZ file. out_file may be a file name or an open file descriptor.

    The pedigree, genotype and haplotype parts of the problem are flattened into keyword
    arrays (prefixed pedigree_, genotype_ and haplotype_), followed by the problem-level
    error, frames, info and lam fields, and handed to np.savez in a single call.

    Parameters:
        problem - object exposing pedigree, genotype, haplotype, error, frames, info and lam.
        out_file - output file name, or an already-open file object to write to.
    '''
    p, g, h = problem.pedigree, problem.genotype, problem.haplotype
    if isinstance(out_file, str):
        # os.path.dirname() returns '' for a bare file name; only ask for the parent
        # directory to be created when the name actually has one.
        out_dir = os.path.dirname(out_file)
        if out_dir:
            util.mkdir_if_not_exists(out_dir)
    # Wrap every non-np-array quantity by a np-array
    np.savez(
        out_file,
        pedigree_nodes=p.graph.nodes(),
        pedigree_graph=np.array([nx.to_edgelist(p.graph)]),
        pedigree_sample_id=p.sample_id,
        pedigree_sex=p.sex,
        pedigree_phenotype=p.phenotype,
        pedigree_node_type=p.node_type,
        pedigree_sample_index=p.sample_index,
        pedigree_num_genotyped=np.array([p.num_genotyped]),
        genotype_data=g.data,
        genotype_snp=g.snp,
        genotype_map=g.map,
        haplotype_data=h.data,
        haplotype_snp=h.snp,
        haplotype_qc=h.qc,
        haplotype_hap_type=h.hap_type,
        haplotype_poo_phase=h.poo_phase,
        error=problem.error,
        frames=np.array([problem.frames]),
        info=np.array([problem.info]),
        lam=problem.lam)
def write_npz(problem, out_file):
    '''Write problem to NPZ file. out_file may be a file name or an open file descriptor.

    The pedigree, genotype and haplotype parts of the problem are flattened into keyword
    arrays (prefixed pedigree_, genotype_ and haplotype_), followed by the problem-level
    error, frames, info and lam fields, and handed to np.savez in a single call.

    Parameters:
        problem - object exposing pedigree, genotype, haplotype, error, frames, info and lam.
        out_file - output file name, or an already-open file object to write to.
    '''
    p, g, h = problem.pedigree, problem.genotype, problem.haplotype
    if isinstance(out_file, str):
        # os.path.dirname() returns '' for a bare file name; only ask for the parent
        # directory to be created when the name actually has one.
        out_dir = os.path.dirname(out_file)
        if out_dir:
            util.mkdir_if_not_exists(out_dir)
    # Wrap every non-np-array quantity by a np-array
    np.savez(
        out_file,
        pedigree_nodes=p.graph.nodes(),
        pedigree_graph=np.array([nx.to_edgelist(p.graph)]),
        pedigree_sample_id=p.sample_id,
        pedigree_sex=p.sex,
        pedigree_phenotype=p.phenotype,
        pedigree_node_type=p.node_type,
        pedigree_sample_index=p.sample_index,
        pedigree_num_genotyped=np.array([p.num_genotyped]),
        genotype_data=g.data,
        genotype_snp=g.snp,
        genotype_map=g.map,
        haplotype_data=h.data,
        haplotype_snp=h.snp,
        haplotype_qc=h.qc,
        haplotype_hap_type=h.hap_type,
        haplotype_poo_phase=h.poo_phase,
        error=problem.error,
        frames=np.array([problem.frames]),
        info=np.array([problem.info]),
        lam=problem.lam)
def __main(args, options):
    '''Main program - accepts an options struct.

    args must hold exactly four items: in_file, info_file, segment_file, out_dir.
    in_file lists the region indices to process, one per line ('-' reads them from stdin);
    it is only read when neither options.snp_index nor options.regions selects the regions.
    Regions outside [0, num_regions) are silently ignored. Any exception raised while
    processing is printed to stdout and the program exits with util.EXIT_FAILURE.
    '''
    # Parse and validate command-line arguments
    in_file, info_file, segment_file, out_dir = args
    options.out_dir = args[3]  # Useful shortcut
    try:
        # A manager-backed lock is only created in multi-process mode; it is handed to
        # _writeln and to every worker task.
        if options.num_processes > 1:
            manager = Manager()
            lock = manager.Lock()
        else:
            lock = None
        start = time.time()

        # Load SNP info
        info = im.io.read_info_npz(info_file)
        if options.debug >= 1:
            _writeln('haps %d, snps %d, region size %d snps, processes %d' %
                     (2 * info.num_samples, info.num_snps, options.region_size, options.num_processes),
                     lock)

        # Read list of regions to process from stdin/in_file. If empty, process all regions
        # NOTE(review): a snp_index of 0 is falsy and therefore falls through to the
        # region list / input file - confirm whether 'is not None' was intended here.
        if options.snp_index:
            region_source = [options.snp_index // options.region_size]
        elif options.regions:
            region_source = options.regions
        elif in_file == '-':
            region_source = sys.stdin.readlines()
        else:
            # Close the region list file as soon as it has been read
            with open(in_file, 'rb') as region_file:
                region_source = region_file.readlines()
        regions = [int(x) for x in region_source]
        # Number of regions = ceil(num_snps / region_size), in integer arithmetic
        num_regions = (info.num_snps + options.region_size - 1) // options.region_size
        if not regions:
            regions = range(num_regions)
        _writeln('regions ' + repr(regions) + ' num_regions ' + repr(num_regions)
                 + ' segment threshold ' + repr(options.min_len) + ' Mbp algorithm '
                 + options.algorithm + ' margin ' + repr(options.margin), lock)

        # Process each SNP region [start,stop) independently
        if options.save:
            util.mkdir_if_not_exists(out_dir)
        # Save index metadata, if processing the first region
        if options.save and (options.force_save_metadata or 0 in regions):
            if options.debug >= 1:
                _writeln('Writing metadata to %s/metadata' % (out_dir,), lock)
            np.savez('%s/metadata' % (out_dir,), snp=info.snp, region_size=options.region_size)

        process = _process_region_profile if options.debug >= 2 else _process_region
        valid_regions = [i for i in regions if i >= 0 and i < num_regions]
        if options.num_processes > 1:
            # Multi-process mode. Map phase:build and save regional index files
            po = Pool(processes=options.num_processes)
            po.map(process, [(info, segment_file, out_dir, options, i, lock) for i in valid_regions])
        else:
            # Single-process mode.
            for i in valid_regions:
                process((info, segment_file, out_dir, options, i, None))

        # Reduce phase - nothing to do here
        t = time.time() - start
        if options.debug >= 1:
            _writeln('Elapsed Time: %.3f sec (%.3f sec/region)' % (t, t / len(regions)), lock)
        if options.num_processes > 1:
            manager.shutdown()
    except:
        # NOTE(review): the bare except also intercepts SystemExit and KeyboardInterrupt;
        # kept as-is so the exit status seen by callers does not change.
        traceback.print_exc(file=sys.stdout)
        sys.exit(util.EXIT_FAILURE)
def plot_stats(stats, save_prefix=None, fig_num=1, snp_style="continuous", filter_length=20):
    """Plot imputation statistics for a phased chromosome, validating against the original genotypes.

    Six figures are drawn in consecutive figure windows starting at fig_num; each window is
    cleared before plotting. When save_prefix is given, every figure is also saved as
    save_prefix + <suffix> and stats.summary() is written to save_prefix + "-stats.txt".

    Parameters:
        stats - object providing plot_vs_snp, scatter_snp_concordance, plot_vs_maf,
            plot_vs_sample and summary.
        save_prefix - path prefix of the output files; nothing is saved if empty or None.
        fig_num - number of the first figure window.
        snp_style - forwarded as snp_style to the SNP-based plots.
        filter_length - forwarded as filter_length to stats.plot_vs_snp.
    """
    if save_prefix:
        # A prefix without a directory part has nothing to create
        save_dir = os.path.dirname(save_prefix)
        if save_dir:
            util.mkdir_if_not_exists(save_dir)

    # (stats method name, keyword arguments, output file suffix), in figure order
    plots = (
        ("plot_vs_snp",
         dict(snp_style=snp_style, x_axis="cm_cumulative", filter_length=filter_length),
         "-snp-cm-cumulative.png"),
        ("plot_vs_snp",
         dict(snp_style=snp_style, x_axis="bp_cumulative", filter_length=filter_length),
         "-snp-bp-cumulative.png"),
        ("plot_vs_snp",
         dict(snp_style=snp_style, x_axis="cm_edge_dist", filter_length=filter_length),
         "-snp-cm-edge-dist.png"),
        ("scatter_snp_concordance", dict(snp_style=snp_style), "-snp-concordance.png"),
        ("plot_vs_maf", dict(snp_style=snp_style), "-maf.png"),
        ("plot_vs_sample", dict(), "-sample.png"),
    )
    for offset, (method, kwargs, suffix) in enumerate(plots):
        P.figure(fig_num + offset)
        P.clf()
        getattr(stats, method)(**kwargs)
        if save_prefix:
            P.savefig(save_prefix + suffix)

    if save_prefix:
        # Close the summary file deterministically instead of leaking the handle
        with open(save_prefix + "-stats.txt", "wb") as summary_file:
            stats.summary(summary_file)
def pipeline_monogenic_validation(work_dir=os.environ['OBER_OUT'] + '/requests/monogenic/work',
                                  index_segments_dir=os.environ['OBER_OUT'] + '/requests/monogenic/work/index_segments',
                                  region_size=100,
                                  theta_affinity=0.95,
                                  theta_weight=0.5,
                                  regenerate_segments=True,
                                  snps=None,
                                  debug=1,
                                  debug_sample=512):
    '''Impute the PLINK data set work_dir/monogenic.12 and write the imputed genotypes.

    When regenerate_segments is set, a segment file and a segment index are first built
    under index_segments_dir/chr<chrom> for the region containing each selected SNP
    (all SNPs of the problem if snps is None). The imputed result is written as a PLINK
    set (work_dir/imputed.12), printed to stdout, and saved as work_dir/imputed.12.lgen.

    Returns the second item returned by im.v.iv.impute_problem.
    '''
    # Load SNPs
    problem = im.io.read_plink(prefix=work_dir + '/monogenic.12',
                               pedigree=im.itu.HUTT_PED,
                               haplotype=None,
                               frames=None)

    # Testing: simulate aligned samples output (hap types should be 2 in the imputed genotype output line)
    problem.haplotype.poo_phase = np.zeros((problem.num_samples,), dtype=np.byte)
    problem.haplotype.poo_phase[np.array([0, 1])] = 1
    problem.haplotype.poo_phase[np.array([2, 3])] = -1

    # Create segments only for the regions around each snp
    if regenerate_segments:
        if snps is not None:
            snp_rows = problem.info.snp[snps]
        else:
            snp_rows = problem.info.snp
        for snp_row in snp_rows:
            # Find SNP's region (the one containing its base-pair position)
            chrom = snp_row['chrom']
            bp = snp_row['base_pair']
            phasing_dir = '%s/phasing/chr%d' % (os.environ['OBER_OUT'], chrom)
            index_segments_chrom_dir = '%s/chr%d' % (index_segments_dir, chrom)
            info_file = '%s/hutt.phased.info.npz' % (phasing_dir,)
            info = im.io.read_info_npz(info_file)
            snp_bp = info.snp['base_pair']
            bp_tree = util.list_index_tree(snp_bp)
            snp_index = util.nearest_neighbor_in_list_tree(bp, snp_bp, bp_tree)
            # Step back one SNP when the nearest neighbour lies to the right of bp
            if not snp_bp[snp_index] <= bp:
                snp_index = snp_index - 1
            start = region_size * (snp_index / region_size)
            stop = start + region_size

            segment_file = '%s/segments-%d-%d.out' % (index_segments_chrom_dir, start, stop)
            if not os.path.exists(segment_file):
                util.mkdir_if_not_exists(index_segments_chrom_dir)
                command = 'find-segments-of-snp-range %d %d < %s/segments.out > %s' % (
                    start, stop, phasing_dir, segment_file)
                util.run_command(command)

            # Index segments
            must_index = (regenerate_segments
                          or not os.path.exists('%s/metadata.npz' % (index_segments_chrom_dir,))
                          or not os.path.exists('%s/region-%d.npz' % (index_segments_chrom_dir, start)))
            if must_index:
                index_segments_beagle.main(segment_file, info_file, segment_file,
                                           index_segments_chrom_dir,
                                           snp_index=snp_index,
                                           debug=2,
                                           theta_affinity=theta_affinity,
                                           theta_weight=theta_weight)

    # Impute using the newly generated segment index
    _, imputation = im.v.iv.impute_problem(problem,
                                           debug=debug,
                                           remove_partial_calls=True,
                                           segment_location=index_segments_dir,
                                           snps=snps,
                                           debug_sample=debug_sample)

    imputed_problem = im.Problem(genotype=imputation.imputed,
                                 pedigree=im.examples.hutt_pedigree(),
                                 haplotype=None,
                                 frames=None)
    im.io.write_plink(imputed_problem, work_dir + '/imputed.12',
                      save_frames=False, save_haplotype=False)
    im.cgi.io_cgi.write_imputed(imputation, sys.stdout, poo_phase=problem.haplotype.poo_phase)
    with open(work_dir + '/imputed.12.lgen', 'wb') as lgen_file:
        im.cgi.io_cgi.write_imputed_lgen(imputation, lgen_file)
    return imputation
def plot_stats(stats, save_prefix=None, fig_num=1, snp_style='continuous', filter_length=20):
    '''Plot imputation statistics for a phased chromosome, validating against the original genotypes.

    Six figures are drawn in consecutive figure windows starting at fig_num; each window is
    cleared before plotting. When save_prefix is given, every figure is also saved as
    save_prefix + <suffix> and stats.summary() is written to save_prefix + "-stats.txt".

    Parameters:
        stats - object providing plot_vs_snp, scatter_snp_concordance, plot_vs_maf,
            plot_vs_sample and summary.
        save_prefix - path prefix of the output files; nothing is saved if empty or None.
        fig_num - number of the first figure window.
        snp_style - forwarded as snp_style to the SNP-based plots.
        filter_length - forwarded as filter_length to stats.plot_vs_snp.
    '''
    if save_prefix:
        # A prefix without a directory part has nothing to create
        save_dir = os.path.dirname(save_prefix)
        if save_dir:
            util.mkdir_if_not_exists(save_dir)

    # (stats method name, keyword arguments, output file suffix), in figure order
    plots = (
        ('plot_vs_snp',
         dict(snp_style=snp_style, x_axis='cm_cumulative', filter_length=filter_length),
         '-snp-cm-cumulative.png'),
        ('plot_vs_snp',
         dict(snp_style=snp_style, x_axis='bp_cumulative', filter_length=filter_length),
         '-snp-bp-cumulative.png'),
        ('plot_vs_snp',
         dict(snp_style=snp_style, x_axis='cm_edge_dist', filter_length=filter_length),
         '-snp-cm-edge-dist.png'),
        ('scatter_snp_concordance', dict(snp_style=snp_style), '-snp-concordance.png'),
        ('plot_vs_maf', dict(snp_style=snp_style), '-maf.png'),
        ('plot_vs_sample', dict(), '-sample.png'),
    )
    for offset, (method, kwargs, suffix) in enumerate(plots):
        P.figure(fig_num + offset)
        P.clf()
        getattr(stats, method)(**kwargs)
        if save_prefix:
            P.savefig(save_prefix + suffix)

    if save_prefix:
        # Close the summary file deterministically instead of leaking the handle
        with open(save_prefix + '-stats.txt', 'wb') as summary_file:
            stats.summary(summary_file)
def __main(args, options):
    '''Main program - accepts an options struct.

    args must hold exactly four items: in_file, info_file, segment_file, out_dir.
    in_file lists the region indices to process, one per line ('-' reads them from stdin);
    it is only read when neither options.snp_index nor options.regions selects the regions.
    Every selected SNP is handed to the SNP processor as a (region, snp) tuple, in parallel
    when options.num_processes > 1, and the collected results go to _save_index. Any
    exception is printed to stdout and the program exits with util.EXIT_FAILURE.
    '''
    # Parse and validate command-line arguments
    in_file, info_file, segment_file, out_dir = args
    options.out_dir = args[3]  # Useful shortcut
    try:
        # A manager-backed lock is only created in multi-process mode; it is handed to
        # _writeln and to the SNP processor.
        if options.num_processes > 1:
            manager = Manager()
            lock = manager.Lock()
        else:
            lock = None
        start = time.time()

        # Load SNP info
        info = im.io.read_info_npz(info_file)
        if options.debug >= 1:
            _writeln('haps %d, snps %d, region size %d snps, processes %d' %
                     (2 * info.num_samples, info.num_snps, options.region_size, options.num_processes),
                     lock)

        # Read list of regions to process from stdin/in_file. If empty, process all regions.
        # If a region index list is read, IT MUST BE CONTIGUOUS! (e.g. [3, 4, 5, 6])
        if options.snp_index is not None:
            region_source = [options.snp_index // options.region_size]
        elif options.regions:
            region_source = options.regions
        elif in_file == '-':
            region_source = sys.stdin.readlines()
        else:
            # Close the region list file as soon as it has been read
            with open(in_file, 'rb') as region_file:
                region_source = region_file.readlines()
        regions = [int(x) for x in region_source]
        # Number of regions = ceil(num_snps / region_size), in integer arithmetic
        num_regions = (info.num_snps + options.region_size - 1) // options.region_size
        if not regions:
            regions = range(num_regions)
        _writeln('regions ' + repr(regions) + ' num_regions ' + repr(num_regions)
                 + ' segment threshold ' + repr(options.min_len) + ' Mbp algorithm '
                 + options.algorithm + ' margin ' + repr(options.margin), lock)

        # Save index metadata if first region is processed in this run
        if options.save:
            util.mkdir_if_not_exists(out_dir)
        if options.save and (options.force_save_metadata or 0 in regions):
            if options.debug >= 1:
                _writeln('Writing metadata to %s/metadata' % (out_dir,), lock)
            np.savez('%s/metadata' % (out_dir,), snp=info.snp, region_size=options.region_size)

        # The segment collection is required below by the region info, the SNP processor
        # and the reduce phase, so it must be built here.
        segments = _SegmentCollection(info, segment_file, regions, options)

        # Map phase: process each SNP independently
        r = segments.region_info
        if options.snp_index is not None:
            # NOTE(review): this passes an index relative to the first region's snp_start,
            # whereas the branch below passes the raw values of xrange(snp_start, snp_stop);
            # confirm which convention snp_processor.process expects.
            snps = [(r['region'][0], options.snp_index - r['snp_start'][0])]
        else:
            snps = ((region, snp)
                    for region, start_raw, stop_raw in zip(r['region'], r['snp_start'], r['snp_stop'])
                    for snp in xrange(start_raw, stop_raw))
        snp_processor = _new_snp_processor(info, segments, options, lock)
        if options.num_processes > 1:
            # Multi-process mode. SNPs are processed in parallel.
            po = Pool(processes=options.num_processes)
            result = po.map(snp_processor.process, ((region, snp) for region, snp in snps))
        else:
            # Single-process mode (sequential). Uses the same single-tuple call shape as
            # the parallel branch so both modes feed the processor identically.
            result = [snp_processor.process((region, snp)) for region, snp in snps]

        # Reduce phase: organize results in array and save to npz files
        _save_index(segments.region_info, result, options.save, out_dir)

        t = time.time() - start
        if options.debug >= 1:
            _writeln('Elapsed Time: %.3f sec (%.3f sec/region)' % (t, t / len(regions)), lock)
        if options.num_processes > 1:
            manager.shutdown()
    except:
        # NOTE(review): the bare except also intercepts SystemExit and KeyboardInterrupt;
        # kept as-is so the exit status seen by callers does not change.
        traceback.print_exc(file=sys.stdout)
        sys.exit(util.EXIT_FAILURE)
default=None, help= 'Identity coefficient file for all sample pairs. Format: id1 id2 lam delta1...delta9. If empty, defaults to plink_set.id' ) options, args = parser.parse_args(sys.argv[1:]) if len(args) != 3: print usage sys.exit(util.EXIT_BAD_INPUT_ARGS) input, chrom, out = args[0], int(args[1]), args[2] # @ReservedAssignment if chrom < 1 or chrom > 22: print usage print('\nMust specify a chromosome number in 1..22.') sys.exit(util.EXIT_BAD_INPUT_ARGS) try: util.mkdir_if_not_exists(os.path.dirname(out)) # Use [cM] as genetic distance unit plink_cmd_base = '%s --bfile %s --chr %d --out %s' % (bu.PLINK, input, chrom, out) if options.recode: # First, compute allele frequencies with PLINK util.run_command(plink_cmd_base + ' --nonfounders --freq') # Convert frequencies file that to a reference allele recoding # file (a file containing the list of SNPs and their minor allele letter) bu.frq_to_minor_file(out + '.frq', out + '.mnr') # Finally, convert binary PLINK to a 12-recoded TPED, where 1=minor allele for each SNP util.run_command( '%s --transpose --recode12 --reference-allele %s.mnr' % (plink_cmd_base, out)) else:
def __main(args, options):
    '''Main program - accepts an options struct.

    args must hold exactly four items: in_file, info_file, segment_file, out_dir.
    in_file lists the region indices to process, one per line ('-' reads them from stdin);
    it is only read when neither options.snp_index nor options.regions selects the regions.
    Every selected SNP is handed to the SNP processor as a (region, snp) tuple, in parallel
    when options.num_processes > 1, and the collected results go to _save_index. Any
    exception is printed to stdout and the program exits with util.EXIT_FAILURE.
    '''
    # Parse and validate command-line arguments
    in_file, info_file, segment_file, out_dir = args
    options.out_dir = args[3]  # Useful shortcut
    try:
        # A manager-backed lock is only created in multi-process mode; it is handed to
        # _writeln and to the SNP processor.
        if options.num_processes > 1:
            manager = Manager()
            lock = manager.Lock()
        else:
            lock = None
        start = time.time()

        # Load SNP info
        info = im.io.read_info_npz(info_file)
        if options.debug >= 1:
            _writeln('haps %d, snps %d, region size %d snps, processes %d' %
                     (2 * info.num_samples, info.num_snps, options.region_size, options.num_processes),
                     lock)

        # Read list of regions to process from stdin/in_file. If empty, process all regions.
        # If a region index list is read, IT MUST BE CONTIGUOUS! (e.g. [3, 4, 5, 6])
        if options.snp_index is not None:
            region_source = [options.snp_index // options.region_size]
        elif options.regions:
            region_source = options.regions
        elif in_file == '-':
            region_source = sys.stdin.readlines()
        else:
            # Close the region list file as soon as it has been read
            with open(in_file, 'rb') as region_file:
                region_source = region_file.readlines()
        regions = [int(x) for x in region_source]
        # Number of regions = ceil(num_snps / region_size), in integer arithmetic
        num_regions = (info.num_snps + options.region_size - 1) // options.region_size
        if not regions:
            regions = range(num_regions)
        _writeln('regions ' + repr(regions) + ' num_regions ' + repr(num_regions)
                 + ' segment length threshold ' + repr(options.min_len)
                 + ' Mbp algorithm ' + options.algorithm
                 + ' margin ' + repr(options.margin)
                 + ' affinity threshold ' + repr(options.theta_affinity)
                 + ' weight threshold ' + repr(options.theta_weight), lock)

        # Save index metadata if first region is processed in this run
        if options.save:
            util.mkdir_if_not_exists(out_dir)
        if options.save and (options.force_save_metadata or 0 in regions):
            if options.debug >= 1:
                _writeln('Writing metadata to %s/metadata' % (out_dir, ), lock)
            np.savez('%s/metadata' % (out_dir, ), snp=info.snp, region_size=options.region_size)

        segments = _SegmentCollection(info, segment_file, regions, options)

        # Map phase: process each SNP independently
        r = segments.region_info
        if options.snp_index is not None:
            snps = [(r['region'][0], options.snp_index)]
        else:
            snps = ((region, snp)
                    for region, start_raw, stop_raw in zip(r['region'], r['snp_start'], r['snp_stop'])
                    for snp in xrange(start_raw, stop_raw))
        snp_processor = _new_snp_processor(info, segments, options, lock)
        # Each item of snps is already a (region, snp) tuple, which is the single
        # argument the processor is called with in both modes.
        if options.num_processes > 1:
            # Multi-process mode. SNPs are processed in parallel.
            po = Pool(processes=options.num_processes)
            result = po.imap(snp_processor.process, snps)
        else:
            # Single-process mode (sequential)
            result = [snp_processor.process(pair) for pair in snps]

        # Reduce phase: organize results in array and save to npz files
        _save_index(segments.region_info, result, out_dir, options)

        t = time.time() - start
        if options.debug >= 1:
            _writeln('Elapsed Time: %.3f sec (%.3f sec/region)' % (t, t / len(regions)), lock)
        if options.num_processes > 1:
            manager.shutdown()
    except:
        # NOTE(review): the bare except also intercepts SystemExit and KeyboardInterrupt;
        # kept as-is so the exit status seen by callers does not change.
        traceback.print_exc(file=sys.stdout)
        sys.exit(util.EXIT_FAILURE)
def pipeline_monogenic_validation(
        work_dir=os.environ['OBER_OUT'] + '/requests/monogenic/work',
        index_segments_dir=os.environ['OBER_OUT'] + '/requests/monogenic/work/index_segments',
        region_size=100,
        theta_affinity=0.95,
        theta_weight=0.5,
        regenerate_segments=True,
        snps=None,
        debug=1,
        debug_sample=512):
    '''Impute the PLINK data set work_dir/monogenic.12 and write the imputed genotypes.

    When regenerate_segments is set, a segment file and a segment index are first built
    under index_segments_dir/chr<chrom> for the region containing each selected SNP
    (all SNPs of the problem if snps is None). The imputed result is written as a PLINK
    set (work_dir/imputed.12), printed to stdout, and saved as work_dir/imputed.12.lgen.

    Returns the second item returned by im.v.iv.impute_problem.
    '''
    # Load SNPs
    problem = im.io.read_plink(prefix=work_dir + '/monogenic.12',
                               pedigree=im.itu.HUTT_PED,
                               haplotype=None,
                               frames=None)

    # Testing: simulate aligned samples output (hap types should be 2 in the imputed genotype output line)
    problem.haplotype.poo_phase = np.zeros((problem.num_samples, ), dtype=np.byte)
    problem.haplotype.poo_phase[np.array([0, 1])] = 1
    problem.haplotype.poo_phase[np.array([2, 3])] = -1

    # Create segments only for the regions around each snp
    if regenerate_segments:
        if snps is not None:
            snp_rows = problem.info.snp[snps]
        else:
            snp_rows = problem.info.snp
        for snp_row in snp_rows:
            # Find SNP's region (the one containing its base-pair position)
            chrom = snp_row['chrom']
            bp = snp_row['base_pair']
            phasing_dir = '%s/phasing/chr%d' % (os.environ['OBER_OUT'], chrom)
            index_segments_chrom_dir = '%s/chr%d' % (index_segments_dir, chrom)
            info_file = '%s/hutt.phased.info.npz' % (phasing_dir, )
            info = im.io.read_info_npz(info_file)
            snp_bp = info.snp['base_pair']
            bp_tree = util.list_index_tree(snp_bp)
            snp_index = util.nearest_neighbor_in_list_tree(bp, snp_bp, bp_tree)
            # Step back one SNP when the nearest neighbour lies to the right of bp
            if not snp_bp[snp_index] <= bp:
                snp_index = snp_index - 1
            start = region_size * (snp_index / region_size)
            stop = start + region_size

            segment_file = '%s/segments-%d-%d.out' % (index_segments_chrom_dir, start, stop)
            if not os.path.exists(segment_file):
                util.mkdir_if_not_exists(index_segments_chrom_dir)
                command = 'find-segments-of-snp-range %d %d < %s/segments.out > %s' % (
                    start, stop, phasing_dir, segment_file)
                util.run_command(command)

            # Index segments
            must_index = (regenerate_segments
                          or not os.path.exists('%s/metadata.npz' % (index_segments_chrom_dir,))
                          or not os.path.exists('%s/region-%d.npz' % (index_segments_chrom_dir, start)))
            if must_index:
                index_segments_beagle.main(segment_file, info_file, segment_file,
                                           index_segments_chrom_dir,
                                           snp_index=snp_index,
                                           debug=2,
                                           theta_affinity=theta_affinity,
                                           theta_weight=theta_weight)

    # Impute using the newly generated segment index
    _, imputation = im.v.iv.impute_problem(problem,
                                           debug=debug,
                                           remove_partial_calls=True,
                                           segment_location=index_segments_dir,
                                           snps=snps,
                                           debug_sample=debug_sample)

    imputed_problem = im.Problem(genotype=imputation.imputed,
                                 pedigree=im.examples.hutt_pedigree(),
                                 haplotype=None,
                                 frames=None)
    im.io.write_plink(imputed_problem, work_dir + '/imputed.12',
                      save_frames=False, save_haplotype=False)
    im.cgi.io_cgi.write_imputed(imputation, sys.stdout, poo_phase=problem.haplotype.poo_phase)
    with open(work_dir + '/imputed.12.lgen', 'wb') as lgen_file:
        im.cgi.io_cgi.write_imputed_lgen(imputation, lgen_file)
    return imputation
parser.add_option('-r', '--recode' , action='store_true' , dest='recode', default=True, help='Recode alleles to 1=minor, 2=major (if False, a random assignment to 1,2 is made)') parser.add_option('-i', '--id-coef', type=str, dest='id_coef', default=None, help='Identity coefficient file for all sample pairs. Format: id1 id2 lam delta1...delta9. If empty, defaults to plink_set.id') options, args = parser.parse_args(sys.argv[1:]) if len(args) != 3: print usage sys.exit(util.EXIT_BAD_INPUT_ARGS) input, chrom, out = args[0], int(args[1]), args[2] # @ReservedAssignment if chrom < 1 or chrom > 22: print usage print('\nMust specify a chromosome number in 1..22.') sys.exit(util.EXIT_BAD_INPUT_ARGS) try: util.mkdir_if_not_exists(os.path.dirname(out)) # Use [cM] as genetic distance unit plink_cmd_base = '%s --bfile %s --chr %d --out %s' % (bu.PLINK, input, chrom, out) if options.recode: # First, compute allele frequencies with PLINK util.run_command(plink_cmd_base + ' --nonfounders --freq') # Convert frequencies file that to a reference allele recoding # file (a file containing the list of SNPs and their minor allele letter) bu.frq_to_minor_file(out + '.frq', out + '.mnr') # Finally, convert binary PLINK to a 12-recoded TPED, where 1=minor allele for each SNP util.run_command('%s --transpose --recode12 --reference-allele %s.mnr' % (plink_cmd_base, out)) else: # No recoding, just convert binary to 2-recoded TPED. PLINK assigns "1" to # the first-encountered allele in the file for each SNP. util.run_command('%s --transpose --recode12' % (plink_cmd_base,))
(options, args) = parser.parse_args(sys.argv[1:]) if len(args) != 4: print usage sys.exit(1) # in_dir = 'phasing.20121130/split_chr' # out_dir = 'phasing.20121130/individual' # chrom = 5 # part = 0 in_dir, out_dir, chrom, part = args[0], args[1], int(args[2]), int(args[3]) print 'Running phasing in stages' print 'in_dir = %s' % (in_dir,) print 'out_dir = %s' % (out_dir,) print 'chrom = %d' % (chrom,) print 'part = %d' % (part,) out_dir = '%s/chr%d' % (out_dir, chrom) util.mkdir_if_not_exists(out_dir) npz_file = '%s/hutt.stage0.npz' % (out_dir,) if not os.path.exists(npz_file) and options.stage == 0: convert.main(pedigree=itu.HUTT_PED, prefix='%s/chr%d/hutt_chr%d_part%d' % (in_dir, chrom, chrom, part), npz=npz_file, target='npz', debug=True) for stage in (range(1, 5) if options.stage == 0 else [options.stage]): phase.main(pedigree=itu.HUTT_PED, input='%s/hutt.stage%d.npz' % (out_dir, stage - 1), output='%s/hutt.stage%d.npz' % (out_dir, stage), stage=stage, debug=options.debug)
def __main(args, options):
    '''Main program - accepts an options struct.

    args must hold exactly four items: in_file, info_file, segment_file, out_dir.
    in_file lists the region indices to process, one per line ('-' reads them from stdin);
    it is only read when neither options.snp_index nor options.regions selects the regions.
    Regions outside [0, num_regions) are silently ignored. Any exception raised while
    processing is printed to stdout and the program exits with util.EXIT_FAILURE.
    '''
    # Parse and validate command-line arguments
    in_file, info_file, segment_file, out_dir = args
    options.out_dir = args[3]  # Useful shortcut
    try:
        # A manager-backed lock is only created in multi-process mode; it is handed to
        # _writeln and to every worker task.
        if options.num_processes > 1:
            manager = Manager()
            lock = manager.Lock()
        else:
            lock = None
        start = time.time()

        # Load SNP info
        info = im.io.read_info_npz(info_file)
        if options.debug >= 1:
            _writeln('haps %d, snps %d, region size %d snps, processes %d' %
                     (2 * info.num_samples, info.num_snps, options.region_size, options.num_processes),
                     lock)

        # Read list of regions to process from stdin/in_file. If empty, process all regions
        # NOTE(review): a snp_index of 0 is falsy and therefore falls through to the
        # region list / input file - confirm whether 'is not None' was intended here.
        if options.snp_index:
            region_source = [options.snp_index // options.region_size]
        elif options.regions:
            region_source = options.regions
        elif in_file == '-':
            region_source = sys.stdin.readlines()
        else:
            # Close the region list file as soon as it has been read
            with open(in_file, 'rb') as region_file:
                region_source = region_file.readlines()
        regions = [int(x) for x in region_source]
        # Number of regions = ceil(num_snps / region_size), in integer arithmetic
        num_regions = (info.num_snps + options.region_size - 1) // options.region_size
        if not regions:
            regions = range(num_regions)
        _writeln('regions ' + repr(regions) + ' num_regions ' + repr(num_regions)
                 + ' segment threshold ' + repr(options.min_len) + ' Mbp algorithm '
                 + options.algorithm + ' margin ' + repr(options.margin), lock)

        # Process each SNP region [start,stop) independently
        if options.save:
            util.mkdir_if_not_exists(out_dir)
        # Save index metadata, if processing the first region
        if options.save and (options.force_save_metadata or 0 in regions):
            if options.debug >= 1:
                _writeln('Writing metadata to %s/metadata' % (out_dir, ), lock)
            np.savez('%s/metadata' % (out_dir, ), snp=info.snp, region_size=options.region_size)

        process = _process_region_profile if options.debug >= 2 else _process_region
        valid_regions = [i for i in regions if i >= 0 and i < num_regions]
        if options.num_processes > 1:
            # Multi-process mode. Map phase:build and save regional index files
            po = Pool(processes=options.num_processes)
            po.map(process, [(info, segment_file, out_dir, options, i, lock) for i in valid_regions])
        else:
            # Single-process mode.
            for i in valid_regions:
                process((info, segment_file, out_dir, options, i, None))

        # Reduce phase - nothing to do here
        t = time.time() - start
        if options.debug >= 1:
            _writeln('Elapsed Time: %.3f sec (%.3f sec/region)' % (t, t / len(regions)), lock)
        if options.num_processes > 1:
            manager.shutdown()
    except:
        # NOTE(review): the bare except also intercepts SystemExit and KeyboardInterrupt;
        # kept as-is so the exit status seen by callers does not change.
        traceback.print_exc(file=sys.stdout)
        sys.exit(util.EXIT_FAILURE)
type='str', dest='out_base_name', default=None, help='Output PLINK data set base name') options, args = parser.parse_args(sys.argv[1:]) if len(args) != 1: print usage sys.exit(util.EXIT_BAD_INPUT_ARGS) input_file = args[0] if not options.out_base_name: options.out_base_name = os.path.splitext(input_file)[0] try: # Initialize daos = db_gene.snp.snp_db_dao.Daos(url=options.db_url) util.mkdir_if_not_exists(os.path.dirname(options.out_base_name)) # Set genetic distance column in BIM file (read locations from snp db) and save a new copy of it snp_data = np.genfromtxt( input_file, dtype=[ ('chrom', np.uint8), # Chromosome # containing the SNP ('name', np.chararray), # SNP name (e.g., 'rs...') ('dist_cm', np.float), # Genetic position [CENTI-Morgans!!] ('base_pair', np.uint), # Base pair position on chromosome ('allele1', np.chararray), ('allele2', np.chararray) ]) snp_names = snp_data['name'] a = dict((x.name, x) for x in daos.snp_dao.get_snps_iter(snp_names)) # Note: our genetic distance unit is cM
data_cols = tuple(data[:, i] for i in xrange(data.shape[1])) # Create SNP classes maf, called_in_both = data[:, maf_col], data[:, called_in_both_col] all_snps = SnpClass('all', data_cols, (maf > 0) & (called_in_both > min_called_in_both)) common_snps = SnpClass('common', data_cols, (maf > maf_threshold) & (called_in_both > min_called_in_both)) # rare_snps = SnpClass('rare', data_cols, (maf <= maf_threshold) & (called_in_both > min_called_in_both)) return all_snps, common_snps def plot_impute2_concordance( (all_snps, common_snps), save_dir=None, plot=False, min_info_to_plot=0.9): '''Generate plot of impute2 concordance for a single window from a Struct holding statistics on all snps, all_snps.''' util.mkdir_if_not_exists(save_dir) # Useful variables lim_threshold = [0., 1.] n = 40 maf_n = 40 # info_bins = [0, 0.7, 0.8, 0.85, 0.9, 1] info_bins = [0, 0.9, 1] threshold = np.linspace(lim_threshold[0], lim_threshold[1], n + 1) maf_bins = np.linspace(0, 0.5, maf_n + 1) k = 0 # Figure counter if save_dir: util.mkdir_if_not_exists(save_dir) for snp_class in (all_snps, ): # (all_snps, common_snps, rare_snps): # k += 1 # P.figure(k)
help='Print debugging information') parser.add_option('-d', '--db-url' , type='str' , dest='db_url', default=db_gene.DEFAULT_URL, help='SNP database URL') parser.add_option('-o', '--out' , type='str' , dest='out_base_name', default=None, help='Output PLINK data set base name') options, args = parser.parse_args(sys.argv[1:]) if len(args) != 1: print usage sys.exit(util.EXIT_BAD_INPUT_ARGS) input_file = args[0] if not options.out_base_name: options.out_base_name = os.path.splitext(input_file)[0] try: # Initialize daos = db_gene.snp.snp_db_dao.Daos(url=options.db_url) util.mkdir_if_not_exists(os.path.dirname(options.out_base_name)) # Set genetic distance column in BIM file (read locations from snp db) and save a new copy of it snp_data = np.genfromtxt(input_file, dtype=[ ('chrom', np.uint8), # Chromosome # containing the SNP ('name', np.chararray), # SNP name (e.g., 'rs...') ('dist_cm', np.float), # Genetic position [CENTI-Morgans!!] ('base_pair', np.uint), # Base pair position on chromosome ('allele1', np.chararray), ('allele2', np.chararray) ]) snp_names = snp_data['name'] a = dict((x.name, x) for x in daos.snp_dao.get_snps_iter(snp_names)) # Note: our genetic distance unit is cM snp_data['dist_cm'] = map(lambda x: x if x else 0.0, ((a[x].genetic_pos if a.has_key(x) else None) for x in snp_names))
print usage sys.exit(1) # in_dir = 'phasing.20121130/split_chr' # out_dir = 'phasing.20121130/individual' # chrom = 5 # part = 0 in_dir, out_dir, chrom, part = args[0], args[1], int(args[2]), int(args[3]) print "Running phasing in stages" print "in_dir = %s" % (in_dir,) print "out_dir = %s" % (out_dir,) print "chrom = %d" % (chrom,) print "part = %d" % (part,) out_dir = "%s/chr%d" % (out_dir, chrom) util.mkdir_if_not_exists(out_dir) npz_file = "%s/hutt.stage0.npz" % (out_dir,) if not os.path.exists(npz_file) and options.stage == 0: convert.main( pedigree=itu.HUTT_PED, prefix="%s/chr%d/hutt_chr%d_part%d" % (in_dir, chrom, chrom, part), npz=npz_file, target="npz", debug=True, ) for stage in range(1, 5) if options.stage == 0 else [options.stage]: phase.main( pedigree=itu.HUTT_PED, input="%s/hutt.stage%d.npz" % (out_dir, stage - 1),
parser.add_option('-r', '--recode' , action='store_true' , dest='recode', default=False, help='Recode alleles to 1=minor, 2=major (if False, allele coding is kept intact)') parser.add_option('-g', '--out-gxn' , type='str' , dest='out_gxn', default=bu.ARG_NONE, help='Output directory of GXN files (if not specified, writes to same directory as out-plink-set''s)') (options, args) = parser.parse_args(sys.argv[1:]) options.print_times = True if options.out_gxn.startswith(bu.ARG_NONE): options.out_gxn = None if len(args) != 3: print usage sys.exit(util.EXIT_BAD_INPUT_ARGS) try: # Prepare file names, create directories (base_name, pedigree_file, out_base_name) = args mkdir_if_not_exists(os.path.dirname(out_base_name)) if options.out_gxn: mkdir_if_not_exists(os.path.dirname(options.out_gxn)) else: options.out_gxn = out_base_name npz_file = base_name + '.npz' # Convert plink tped -> npz problem = io.read_plink(prefix=base_name, pedigree=pedigree_file, haplotype=None, verbose=options.debug) # Phase, impute, fill missing phaser = phase.build_phasing_pipeline(options) request = phase.run_phasing_chain(phaser, problem) stats = request.stats
#----------------------------- # Load data - phased run #----------------------------- data = np.loadtxt(in_file, usecols=usecols) data_cols = tuple(data[:, i] for i in xrange(data.shape[1])) # Create SNP classes maf, called_in_both = data[:, maf_col], data[:, called_in_both_col] all_snps = SnpClass('all', data_cols, (maf > 0) & (called_in_both > min_called_in_both)) common_snps = SnpClass('common', data_cols, (maf > maf_threshold) & (called_in_both > min_called_in_both)) # rare_snps = SnpClass('rare', data_cols, (maf <= maf_threshold) & (called_in_both > min_called_in_both)) return all_snps, common_snps def plot_impute2_concordance((all_snps, common_snps), save_dir=None, plot=False, min_info_to_plot=0.9): '''Generate plot of impute2 concordance for a single window from a Struct holding statistics on all snps, all_snps.''' util.mkdir_if_not_exists(save_dir) # Useful variables lim_threshold = [0., 1.] n = 40 maf_n = 40 # info_bins = [0, 0.7, 0.8, 0.85, 0.9, 1] info_bins = [0, 0.9, 1] threshold = np.linspace(lim_threshold[0], lim_threshold[1], n + 1) maf_bins = np.linspace(0, 0.5, maf_n + 1) k = 0 # Figure counter if save_dir: util.mkdir_if_not_exists(save_dir) for snp_class in (all_snps,): # (all_snps, common_snps, rare_snps): # k += 1 # P.figure(k)
type='int', dest='stop', default=None, help='Ending part number (not inclusive)') (options, args) = parser.parse_args(sys.argv[1:]) if len(args) != 3: print usage sys.exit(util.EXIT_BAD_INPUT_ARGS) if options.start is None or options.stop is None: print 'Must specify start and stop' print usage sys.exit(util.EXIT_BAD_INPUT_ARGS) (in_file, part_type, out_file) = args part_type = __parse_part_type(part_type) num_parts = options.stop - options.start mkdir_if_not_exists(os.path.dirname(out_file)) try: # Merge PLINK data sets. If there's one part, nothing to merge, just copy the files. part_names = bu.partition_names(in_file, part_type, parts=xrange(options.start, options.stop)).values() first_part_name = part_names[0] print 'Reducing, num_parts', num_parts if num_parts == 1: for ext in EXTENSIONS: shutil.copy(first_part_name + '.' + ext, out_file + '.' + ext) else: # Prepare PLINK merge command input file f = tempfile.NamedTemporaryFile(delete=False)
parser.add_option('-s', '--start-part' , type='int', dest='start', default=None, help='Starting part number (inclusive)') parser.add_option('-e', '--stop-part' , type='int', dest='stop', default=None, help='Ending part number (not inclusive)') (options, args) = parser.parse_args(sys.argv[1:]) if len(args) != 3: print usage sys.exit(util.EXIT_BAD_INPUT_ARGS) if options.start is None or options.stop is None: print 'Must specify start and stop' print usage sys.exit(util.EXIT_BAD_INPUT_ARGS) (in_file, part_type, out_file) = args part_type = __parse_part_type(part_type) num_parts = options.stop - options.start mkdir_if_not_exists(os.path.dirname(out_file)) try: # Merge PLINK data sets. If there's one part, nothing to merge, just copy the files. part_names = bu.partition_names(in_file, part_type, parts=xrange(options.start, options.stop)).values() first_part_name = part_names[0] print 'Reducing, num_parts', num_parts if num_parts == 1: for ext in EXTENSIONS: shutil.copy(first_part_name + '.' + ext, out_file + '.' + ext) else: # Prepare PLINK merge command input file f = tempfile.NamedTemporaryFile(delete=False) for name in part_names: for ext in EXTENSIONS: