def main():
    """Count allele-specific reads at SNPs and store them in HDF5 files.

    Parses the command line with ``parse_args()``, opens the SNP table,
    SNP index and (optional) haplotype HDF5 files, and for every chromosome
    returned by ``chromosome.get_all_chromosomes`` accumulates counts from
    all input BAM files via ``add_read_count``.  The per-chromosome arrays
    are written to four output HDF5 files (ref, alt, other and read counts)
    and, when ``args.txt_counts`` is given, handed to ``write_txt_file``.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither "uint8" nor
            "uint16".
        ValueError: if the haplotype file does not yield exactly one sample
            index for ``args.individual``.
    """
    args = parse_args()

    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)
    sys.stderr.write("pytables version: %s\n" % tables.__version__)

    util.check_pysam_version()
    util.check_pytables_version()

    # Reject an unsupported data type before any output file is opened for
    # writing, so a bad argument fails fast and leaves nothing behind.
    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    # disable warnings that come from pytables when chromosome
    # names are like 1, 2, 3 (instead of chr1, chr2, chr3)
    warnings.filterwarnings('ignore', category=tables.NaturalNameWarning)

    # every HDF5 handle opened below, so that the finally clause can
    # close all of them even when an error interrupts processing
    h5_files = []
    txt_counts = None

    def open_h5(path, mode):
        h5f = tables.open_file(path, mode)
        h5_files.append(h5f)
        return h5f

    try:
        snp_tab_h5 = open_h5(args.snp_tab, "r")
        snp_index_h5 = open_h5(args.snp_index, "r")
        hap_h5 = open_h5(args.haplotype, "r") if args.haplotype else None

        ref_count_h5 = open_h5(args.ref_as_counts, "w")
        alt_count_h5 = open_h5(args.alt_as_counts, "w")
        other_count_h5 = open_h5(args.other_as_counts, "w")
        read_count_h5 = open_h5(args.read_counts, "w")
        output_h5 = [ref_count_h5, alt_count_h5,
                     other_count_h5, read_count_h5]

        # initialize every chromosome in output files
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        for chrom in chrom_list:
            for out_file in output_h5:
                create_carray(out_file, chrom, args.data_type)

        # create a txt file that also holds the counts
        if args.txt_counts is not None:
            if os.path.splitext(args.txt_counts)[1] == ".gz":
                txt_counts = gzip.open(args.txt_counts, 'wt+')
            else:
                txt_counts = open(args.txt_counts, 'w+')

        # number of reads seen since the last progress dot
        count = 0

        for chrom in chrom_list:
            sys.stderr.write("%s\n" % chrom.name)

            if args.test_chrom and chrom.name != args.test_chrom:
                sys.stderr.write("skipping because not test chrom\n")
                continue

            warned_pos = {}

            # fetch SNP info for this chromosome
            if chrom.name not in snp_tab_h5.root:
                # no SNPs for this chromosome
                sys.stderr.write("skipping %s because chromosome with this "
                                 "name "
                                 "not found in SNP table\n" % chrom.name)
                continue

            sys.stderr.write("fetching SNPs\n")

            snp_tab = snp_tab_h5.get_node("/%s" % chrom.name)
            snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]

            hap_tab = None
            ind_idx = None
            if hap_h5:
                hap_tab = hap_h5.get_node("/%s" % chrom.name)
                _, sample_indices = snptable.SNPTable().get_h5_sample_indices(
                    hap_h5, chrom, [args.individual])

                if len(sample_indices) != 1:
                    raise ValueError("got sample indices for %d individuals, "
                                     "but expected to get index for one "
                                     "individual (%s)" %
                                     (len(sample_indices), args.individual))
                ind_idx = sample_indices[0]
                sys.stderr.write("index for individual %s is %d\n" %
                                 (args.individual, ind_idx))

            # initialize count arrays for this chromosome to 0
            ref_carray = get_carray(ref_count_h5, chrom)
            alt_carray = get_carray(alt_count_h5, chrom)
            other_carray = get_carray(other_count_h5, chrom)
            read_count_carray = get_carray(read_count_h5, chrom)

            ref_array = np.zeros(chrom.length, dtype)
            alt_array = np.zeros(chrom.length, dtype)
            other_array = np.zeros(chrom.length, dtype)
            read_count_array = np.zeros(chrom.length, dtype)

            # loop over all BAM files, pulling out reads
            # for this chromosome
            for bam_filename in args.bam_filenames:
                sys.stderr.write("reading from file %s\n" % bam_filename)

                samfile = pysam.Samfile(bam_filename, "rb")
                try:
                    for read in get_sam_iter(samfile, chrom):
                        count += 1
                        if count == 10000:
                            sys.stderr.write(".")
                            count = 0

                        add_read_count(read, chrom, ref_array, alt_array,
                                       other_array, read_count_array,
                                       snp_index_array, snp_tab, hap_tab,
                                       warned_pos, max_count, ind_idx)
                finally:
                    # close every BAM handle, also when a read fails
                    samfile.close()

                # terminate the line of progress dots for this file
                sys.stderr.write("\n")

            # Store results once per chromosome, after every BAM file has
            # contributed: the arrays accumulate across files, so writing
            # after each file would emit duplicate, partial text rows.
            ref_carray[:] = ref_array
            alt_carray[:] = alt_array
            other_carray[:] = other_array
            read_count_carray[:] = read_count_array

            # columns are:
            # chrom, pos, ref, alt, genotype, ref_count, alt_count,
            # other_count
            if txt_counts is not None:
                write_txt_file(txt_counts, chrom, snp_tab, hap_tab, ind_idx,
                               ref_array, alt_array, other_array)

        # check if any of the reads contained an unimplemented CIGAR
        if unimplemented_CIGAR[0] > 0:
            sys.stderr.write("WARNING: Encountered " +
                             str(unimplemented_CIGAR[0]) +
                             " instances of CIGAR codes: " +
                             str(unimplemented_CIGAR[1]) +
                             ". Reads with these "
                             "CIGAR codes were skipped because they "
                             "are currently unimplemented.\n")

        # set track statistics; the files themselves are closed below
        sys.stderr.write("setting statistics for each chromosome\n")
        for h5f in output_h5:
            chromstat.set_stats(h5f, chrom_list)
    finally:
        if txt_counts is not None:
            # close the open txt file handle
            txt_counts.close()
        for h5f in h5_files:
            h5f.close()

    sys.stderr.write("done\n")
if (len(keep_cache) + len(discard_cache)) != 0: sys.stderr.write("WARNING: failed to find pairs for %d " "keep reads and %d discard reads on this " "chromosome\n" % (len(keep_cache), len(discard_cache))) read_stats.discard_missing_pair += len(keep_cache) + len(discard_cache) read_stats.write(sys.stderr)
# NOTE(review): the statements above are the tail of a definition whose
# start is outside this view; they are left exactly as found.

# Script entry point: log the invocation and library versions to stderr,
# check the pysam version, parse the two positional arguments and run main().
if __name__ == "__main__":
    # record how the script was invoked and which versions are in use
    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)

    util.check_pysam_version()

    parser = argparse.ArgumentParser()
    # per the help texts: the input is expected to be sorted, the output
    # is not sorted
    parser.add_argument('input_bam', help="input BAM or SAM file (must "
                        "be sorted!)")
    parser.add_argument("output_bam", help="output BAM or SAM file (not "
                        "sorted!)")

    options = parser.parse_args()

    main(options.input_bam, options.output_bam)
def main():
    """Count allele-specific reads at SNPs and store them in HDF5 files.

    Parses the command line with ``parse_args()``, opens the SNP table,
    SNP index and (optional) haplotype HDF5 files, and for every chromosome
    returned by ``chromosome.get_all_chromosomes`` accumulates counts from
    all input BAM files via ``add_read_count``.  The per-chromosome arrays
    are written to four output HDF5 files (ref, alt, other and read counts).
    When ``args.text_counts`` is given, one row per SNP is collected for
    each chromosome and the combined table is written with ``np.savetxt``.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither "uint8" nor
            "uint16".
    """
    args = parse_args()

    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)
    sys.stderr.write("pytables version: %s\n" % tables.__version__)

    util.check_pysam_version()
    util.check_pytables_version()

    # Reject an unsupported data type before any output file is opened for
    # writing, so a bad argument fails fast and leaves nothing behind.
    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    # every HDF5 handle opened below, so that the finally clause can
    # close all of them even when an error interrupts processing
    h5_files = []

    def open_h5(path, mode):
        h5f = tables.open_file(path, mode)
        h5_files.append(h5f)
        return h5f

    try:
        snp_tab_h5 = open_h5(args.snp_tab, "r")
        snp_index_h5 = open_h5(args.snp_index, "r")

        if args.haplotype:
            hap_h5 = open_h5(args.haplotype, "r")
            ind_idx = lookup_individual_index(args.samples, args.individual)
        else:
            hap_h5 = None
            ind_idx = None

        ref_count_h5 = open_h5(args.ref_as_counts, "w")
        alt_count_h5 = open_h5(args.alt_as_counts, "w")
        other_count_h5 = open_h5(args.other_as_counts, "w")
        read_count_h5 = open_h5(args.read_counts, "w")
        output_h5 = [ref_count_h5, alt_count_h5,
                     other_count_h5, read_count_h5]

        # initialize every chromosome in output files
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        for chrom in chrom_list:
            for out_file in output_h5:
                create_carray(out_file, chrom, args.data_type)

        # per-chromosome tables that are later written to a txt file
        collect_txt = args.text_counts is not None
        txt_counts = []

        # number of reads seen since the last progress dot
        count = 0

        for chrom in chrom_list:
            sys.stderr.write("%s\n" % chrom.name)

            warned_pos = {}

            # fetch SNP info for this chromosome
            if chrom.name not in snp_tab_h5.root:
                # no SNPs for this chromosome
                continue

            sys.stderr.write("fetching SNPs\n")

            snp_tab = snp_tab_h5.get_node("/%s" % chrom.name)
            snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]
            if hap_h5:
                hap_tab = hap_h5.get_node("/%s" % chrom.name)
            else:
                hap_tab = None

            # initialize count arrays for this chromosome to 0
            ref_carray = get_carray(ref_count_h5, chrom)
            alt_carray = get_carray(alt_count_h5, chrom)
            other_carray = get_carray(other_count_h5, chrom)
            read_count_carray = get_carray(read_count_h5, chrom)

            ref_array = np.zeros(chrom.length, dtype)
            alt_array = np.zeros(chrom.length, dtype)
            other_array = np.zeros(chrom.length, dtype)
            read_count_array = np.zeros(chrom.length, dtype)

            # loop over all BAM files, pulling out reads
            # for this chromosome
            for bam_filename in args.bam_filenames:
                sys.stderr.write("reading from file %s\n" % bam_filename)

                samfile = pysam.Samfile(bam_filename, "rb")
                try:
                    for read in get_sam_iter(samfile, chrom):
                        count += 1
                        if count == 10000:
                            sys.stderr.write(".")
                            count = 0

                        add_read_count(read, chrom, ref_array, alt_array,
                                       other_array, read_count_array,
                                       snp_index_array, snp_tab, hap_tab,
                                       warned_pos, max_count, ind_idx)
                finally:
                    # close every BAM handle, also when a read fails
                    samfile.close()

                # terminate the line of progress dots for this file
                sys.stderr.write("\n")

            # Store results once per chromosome, after every BAM file has
            # contributed: the arrays accumulate across files, so collecting
            # rows after each file would duplicate them with partial counts.
            ref_carray[:] = ref_array
            alt_carray[:] = alt_array
            other_carray[:] = other_array
            read_count_carray[:] = read_count_array

            # columns are:
            # chrom, pos, ref, alt, genotype, ref_count, alt_count,
            # other_count
            if collect_txt:
                # A separate name is used for the chromosome column: the
                # loop variable `chrom` must keep referring to the
                # chromosome object.
                chrom_col = np.tile(chrom.name, len(snp_tab))
                pos = np.array([snp['pos'] for snp in snp_tab])
                ref = np.array([snp['allele1'] for snp in snp_tab])
                alt = np.array([snp['allele2'] for snp in snp_tab])
                if hap_tab is not None:
                    # NOTE(review): uses haplotype columns 0 and 1 and does
                    # not consult ind_idx -- confirm this is the intended
                    # individual.
                    genotype = np.array([str(hap[0]) + "|" + str(hap[1])
                                         for hap in hap_tab])
                else:
                    # zero-width placeholder: adds no genotype column
                    genotype = np.empty((len(snp_tab), 0))
                # pos - 1 converts the table positions to array indices
                txt_counts.append(
                    np.column_stack((chrom_col, pos, ref, alt, genotype,
                                     ref_array[pos - 1],
                                     alt_array[pos - 1],
                                     other_array[pos - 1]))
                )

        # write the collected np arrays to a txt file
        if collect_txt:
            if txt_counts:
                # vstack combines the per-chromosome arrays row-wise
                table = np.vstack(tuple(txt_counts))
            else:
                # No chromosome contributed rows; np.vstack raises on an
                # empty sequence, so write an empty table instead.
                table = np.empty((0, 1))
            np.savetxt(args.text_counts, table, fmt="%1s", delimiter=" ")

        # set track statistics; the files themselves are closed below
        sys.stderr.write("setting statistics for each chromosome\n")
        for h5f in output_h5:
            chromstat.set_stats(h5f, chrom_list)
    finally:
        for h5f in h5_files:
            h5f.close()

    sys.stderr.write("done\n")
def main():
    """Count allele-specific reads at SNPs and store them in HDF5 files.

    Parses the command line with ``parse_args()``, opens the SNP table,
    SNP index and (optional) haplotype HDF5 files, and for every chromosome
    returned by ``chromosome.get_all_chromosomes`` accumulates counts from
    all input BAM files via ``add_read_count``.  The per-chromosome arrays
    are written to four output HDF5 files (ref, alt, other and read counts).
    When ``args.txt_counts`` is given, one row per SNP is appended to that
    text file (gzip-compressed if the name ends in ".gz") per chromosome.

    If the haplotype file yields no sample index for ``args.individual`` on
    a chromosome, haplotypes are not used for that chromosome.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither "uint8" nor
            "uint16".
    """
    args = parse_args()

    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)
    sys.stderr.write("pytables version: %s\n" % tables.__version__)

    util.check_pysam_version()
    util.check_pytables_version()

    # Reject an unsupported data type before any output file is opened for
    # writing, so a bad argument fails fast and leaves nothing behind.
    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    # every HDF5 handle opened below, so that the finally clause can
    # close all of them even when an error interrupts processing
    h5_files = []
    txt_counts = None

    def open_h5(path, mode):
        h5f = tables.open_file(path, mode)
        h5_files.append(h5f)
        return h5f

    try:
        snp_tab_h5 = open_h5(args.snp_tab, "r")
        snp_index_h5 = open_h5(args.snp_index, "r")
        hap_h5 = open_h5(args.haplotype, "r") if args.haplotype else None

        ref_count_h5 = open_h5(args.ref_as_counts, "w")
        alt_count_h5 = open_h5(args.alt_as_counts, "w")
        other_count_h5 = open_h5(args.other_as_counts, "w")
        read_count_h5 = open_h5(args.read_counts, "w")
        output_h5 = [ref_count_h5, alt_count_h5,
                     other_count_h5, read_count_h5]

        # initialize every chromosome in output files
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        for chrom in chrom_list:
            for out_file in output_h5:
                create_carray(out_file, chrom, args.data_type)

        # create a txt file that also holds the counts
        if args.txt_counts is not None:
            if os.path.splitext(args.txt_counts)[1] == ".gz":
                txt_counts = gzip.open(args.txt_counts, 'a+')
            else:
                txt_counts = open(args.txt_counts, 'a+')

        # number of reads seen since the last progress dot
        count = 0

        for chrom in chrom_list:
            sys.stderr.write("%s\n" % chrom.name)

            warned_pos = {}

            # fetch SNP info for this chromosome
            if chrom.name not in snp_tab_h5.root:
                # no SNPs for this chromosome
                continue

            sys.stderr.write("fetching SNPs\n")

            snp_tab = snp_tab_h5.get_node("/%s" % chrom.name)
            snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]

            hap_tab = None
            ind_idx = None
            if hap_h5:
                sample_indices = snptable.SNPTable().get_h5_sample_indices(
                    hap_h5, chrom, [args.individual])[1]
                if len(sample_indices) != 0:
                    hap_tab = hap_h5.get_node("/%s" % chrom.name)
                    ind_idx = sample_indices[0]
                # otherwise the individual was not found: proceed without
                # haplotypes for this chromosome

            # initialize count arrays for this chromosome to 0
            ref_carray = get_carray(ref_count_h5, chrom)
            alt_carray = get_carray(alt_count_h5, chrom)
            other_carray = get_carray(other_count_h5, chrom)
            read_count_carray = get_carray(read_count_h5, chrom)

            ref_array = np.zeros(chrom.length, dtype)
            alt_array = np.zeros(chrom.length, dtype)
            other_array = np.zeros(chrom.length, dtype)
            read_count_array = np.zeros(chrom.length, dtype)

            # loop over all BAM files, pulling out reads
            # for this chromosome
            for bam_filename in args.bam_filenames:
                sys.stderr.write("reading from file %s\n" % bam_filename)

                samfile = pysam.Samfile(bam_filename, "rb")
                try:
                    for read in get_sam_iter(samfile, chrom):
                        count += 1
                        if count == 10000:
                            sys.stderr.write(".")
                            count = 0

                        add_read_count(read, chrom, ref_array, alt_array,
                                       other_array, read_count_array,
                                       snp_index_array, snp_tab, hap_tab,
                                       warned_pos, max_count, ind_idx)
                finally:
                    # close every BAM handle, also when a read fails
                    samfile.close()

                # terminate the line of progress dots for this file
                sys.stderr.write("\n")

            # Store results once per chromosome, after every BAM file has
            # contributed: the arrays accumulate across files, so writing
            # after each file would emit duplicate, partial text rows.
            ref_carray[:] = ref_array
            alt_carray[:] = alt_array
            other_carray[:] = other_array
            read_count_carray[:] = read_count_array

            # columns are:
            # chrom, pos, ref, alt, genotype, ref_count, alt_count,
            # other_count
            if txt_counts is not None:
                # A separate name is used for the chromosome column: the
                # loop variable `chrom` must keep referring to the
                # chromosome object.
                chrom_col = np.tile(chrom.name, len(snp_tab))
                pos = np.array([snp['pos'] for snp in snp_tab])
                ref = np.array([snp['allele1'] for snp in snp_tab])
                alt = np.array([snp['allele2'] for snp in snp_tab])
                if hap_tab is not None:
                    # NOTE(review): uses haplotype columns 0 and 1 and does
                    # not consult ind_idx -- confirm this is the intended
                    # individual.
                    genotype = np.array(
                        [str(hap[0]) + "|" + str(hap[1]) for hap in hap_tab])
                else:
                    # zero-width placeholder: adds no genotype column
                    genotype = np.empty((len(snp_tab), 0))

                # write an np array to the txt file;
                # pos - 1 converts the table positions to array indices
                np.savetxt(txt_counts,
                           np.column_stack((chrom_col, pos, ref, alt,
                                            genotype,
                                            ref_array[pos - 1],
                                            alt_array[pos - 1],
                                            other_array[pos - 1])),
                           fmt="%1s", delimiter=" ")

        # Report unimplemented CIGAR codes only when some were actually
        # encountered, instead of always warning about "0 instances".
        if unimplemented_CIGAR[0] > 0:
            sys.stderr.write(
                "WARNING: Encountered " + str(unimplemented_CIGAR[0]) +
                " instances of any of the following CIGAR codes: " +
                str(unimplemented_CIGAR[1]) +
                ". The regions of reads with these CIGAR codes were skipped because these CIGAR codes are currently unimplemented.\n"
            )

        # set track statistics; the files themselves are closed below
        sys.stderr.write("setting statistics for each chromosome\n")
        for h5f in output_h5:
            chromstat.set_stats(h5f, chrom_list)
    finally:
        if txt_counts is not None:
            # close the open txt file handle
            txt_counts.close()
        for h5f in h5_files:
            h5f.close()

    sys.stderr.write("done\n")
haplotype_filename=haplotype_filename) filter_reads(files, max_seqs=max_seqs, max_snps=max_snps, samples=samples) files.close()
# NOTE(review): the statements above are the tail of a definition whose
# start is outside this view; they are left exactly as found.

# Script entry point: log the invocation and library versions to stderr,
# run the version checks, parse the options and sample list, then call
# main() with the parsed values.
if __name__ == '__main__':
    # record how the script was invoked and which versions are in use
    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)
    sys.stderr.write("pytables version: %s\n" % tables.__version__)

    util.check_pysam_version()
    util.check_pytables_version()
    util.check_python_version()

    options = parse_options()
    # the raw samples option is converted by parse_samples before use
    samples = parse_samples(options.samples)

    main(options.bam_filename,
         is_paired_end=options.is_paired_end, is_sorted=options.is_sorted,
         max_seqs=options.max_seqs, max_snps=options.max_snps,
         output_dir=options.output_dir,
         snp_dir=options.snp_dir,
         snp_tab_filename=options.snp_tab,
         snp_index_filename=options.snp_index,
         haplotype_filename=options.haplotype,
         samples=samples)