Ejemplo n.º 1
0
    def test_read_indels(self):
        """Check that SNPTable.read_file loads indels as well as a plain SNP."""
        fixture = Data()
        fixture.snp_list = [
            (10, "A", "-"),     # 1bp deletion
            (20, "A", "ATTG"),  # 3bp insertion
            (21, "A", "T"),     # not an indel
            (3, "AAA", "A"),    # 2bp deletion
        ]
        fixture.setup()

        table = snptable.SNPTable()
        table.read_file(fixture.snp_filename)

        # the index is expected to hold 21 entries, with the entry at
        # (position - 1) giving the row of the variant at that position
        assert len(table.snp_index) == 21
        for index_slot, expected_row in ((9, 0), (19, 1), (2, 3)):
            assert table.snp_index[index_slot] == expected_row

        # exactly 4 entries of the index may differ from -1
        populated = np.where(table.snp_index != -1)[0]
        assert populated.shape[0] == 4

        # alleles are compared as bytes; the "-" deletion allele of the
        # first variant is expected to come back as an empty value
        expected_alleles = (
            (0, b"A", b""),
            (1, b"A", b"ATTG"),
            (3, b"AAA", b"A"),
        )
        for row, allele1, allele2 in expected_alleles:
            assert table.snp_allele1[row] == allele1
            assert table.snp_allele2[row] == allele2

        # positions are stored in the order the variants were listed
        for row, expected_pos in enumerate((10, 20, 21, 3)):
            assert table.snp_pos[row] == expected_pos
Ejemplo n.º 2
0
    def test_read_snps(self):
        """Check that SNPTable.read_file loads the default SNP fixture."""
        fixture = Data()
        fixture.setup()

        table = snptable.SNPTable()
        table.read_file(fixture.snp_filename)

        # the index is expected to hold 100 entries, three of which
        # point at rows 0, 1 and 2 of the table
        assert len(table.snp_index) == 100
        for index_slot, expected_row in ((9, 0), (19, 1), (99, 2)):
            assert table.snp_index[index_slot] == expected_row

        # exactly 3 entries of the index may differ from -1
        populated = np.where(table.snp_index != -1)[0]
        assert populated.shape[0] == 3

        # alleles are compared as bytes, row by row
        expected_alleles = ((b"A", b"C"), (b"T", b"G"), (b"A", b"T"))
        for row, (allele1, allele2) in enumerate(expected_alleles):
            assert table.snp_allele1[row] == allele1
            assert table.snp_allele2[row] == allele2

        # positions of the three SNPs
        for row, expected_pos in enumerate((10, 20, 100)):
            assert table.snp_pos[row] == expected_pos
Ejemplo n.º 3
0
    def test_get_overlapping_indel(self):
        """Test that indels can be correctly obtained"""
        fixture = Data()
        fixture.snp_list = [(10, "A", "-")]
        fixture.setup()

        # the SAM file holds a header and one read whose CIGAR is 30M
        with open(fixture.sam_filename, "w") as sam_out:
            fixture.write_sam_header(sam_out)
            fixture.write_sam_read(sam_out, cigar="30M")

        sam_reader = pysam.Samfile(fixture.sam_filename)
        read = next(sam_reader)

        table = snptable.SNPTable()
        table.read_file(fixture.snp_filename)
        overlaps = table.get_overlapping_snps(read)
        snp_idx, snp_read_pos, indel_idx, indel_read_pos = overlaps

        # the single variant is a deletion, so it must be reported as an
        # indel (at read position 10) and not as a SNP
        assert len(snp_idx) == 0
        assert len(indel_idx) == 1
        assert indel_idx[0] == 0
        assert indel_read_pos[0] == 10
Ejemplo n.º 4
0
    def test_get_overlapping_snps_softclip(self):
        """Test that soft-clipped part of read is not used"""
        fixture = Data()
        fixture.setup()

        # the SAM file holds one read that is soft-clipped on its left end
        with open(fixture.sam_filename, "w") as sam_out:
            fixture.write_sam_header(sam_out)
            fixture.write_sam_read(sam_out, cigar="10S20M")

        sam_reader = pysam.Samfile(fixture.sam_filename)
        read = next(sam_reader)

        table = snptable.SNPTable()
        table.read_file(fixture.snp_filename)
        overlaps = table.get_overlapping_snps(read)
        snp_idx, snp_read_pos, indel_idx, indel_read_pos = overlaps

        # two SNPs (rows 0 and 1) are expected, at read positions 20 and 30
        assert len(snp_idx) == 2
        for hit, (expected_row, expected_read_pos) in enumerate(((0, 20),
                                                                 (1, 30))):
            assert snp_idx[hit] == expected_row
        for hit, expected_read_pos in enumerate((20, 30)):
            assert snp_read_pos[hit] == expected_read_pos
Ejemplo n.º 5
0
    def test_get_overlapping_snps_intron(self):
        """Test a read spanning an intron (N in CIGAR string)"""
        fixture = Data()
        fixture.setup()

        # the SAM file holds one read whose CIGAR contains an intron (N)
        with open(fixture.sam_filename, "w") as sam_out:
            fixture.write_sam_header(sam_out)
            fixture.write_sam_read(sam_out, cigar="10M85N20M")

        sam_reader = pysam.Samfile(fixture.sam_filename)
        read = next(sam_reader)

        table = snptable.SNPTable()
        table.read_file(fixture.snp_filename)
        overlaps = table.get_overlapping_snps(read)
        snp_idx, snp_read_pos, indel_idx, indel_read_pos = overlaps

        # SNP rows 0 and 2 are expected: one in each matched segment
        assert len(snp_idx) == 2
        for hit, expected_row in enumerate((0, 2)):
            assert snp_idx[hit] == expected_row

        # the skipped region does not consume read positions
        for hit, expected_read_pos in enumerate((10, 15)):
            assert snp_read_pos[hit] == expected_read_pos
Ejemplo n.º 6
0
    def test_get_overlapping_snps_simple(self):
        """Do a simple test of getting 2 overlapping SNPs
        with a read with 30 matches"""
        fixture = Data()
        fixture.setup()

        # the SAM file holds one read written with the fixture's defaults
        with open(fixture.sam_filename, "w") as sam_out:
            fixture.write_sam_header(sam_out)
            fixture.write_sam_read(sam_out)

        sam_reader = pysam.Samfile(fixture.sam_filename)
        read = next(sam_reader)

        # simplest case: the read consists of one big match segment
        table = snptable.SNPTable()
        table.read_file(fixture.snp_filename)
        overlaps = table.get_overlapping_snps(read)
        snp_idx, snp_read_pos, indel_idx, indel_read_pos = overlaps

        # SNP rows 0 and 1 are expected ...
        assert len(snp_idx) == 2
        for hit, expected_row in enumerate((0, 1)):
            assert snp_idx[hit] == expected_row

        # ... at read positions 10 and 20
        for hit, expected_read_pos in enumerate((10, 20)):
            assert snp_read_pos[hit] == expected_read_pos

        # and no indels at all
        assert len(indel_idx) == 0
        assert len(indel_read_pos) == 0
Ejemplo n.º 7
0
def main():
    """Count allele-specific reads at SNPs and store them per chromosome.

    Command-line arguments come from ``parse_args()``.  For every chromosome
    returned by ``chromosome.get_all_chromosomes(args.chrom)`` the reads of
    all ``args.bam_filenames`` are passed to ``add_read_count`` and the
    accumulated reference / alternate / other / total read counts are
    written to the four output HDF5 files.  When ``args.txt_counts`` is set
    the counts are also written with ``write_txt_file`` to a text file
    (opened through gzip if its name ends in ``.gz``).

    The per-chromosome results are stored once, after all BAM files have
    been read, so that the text file gets each chromosome exactly once even
    when several BAM files are given.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither "uint8" nor
            "uint16".
        ValueError: if the haplotype file does not give exactly one sample
            index for ``args.individual``.
    """
    args = parse_args()

    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)
    sys.stderr.write("pytables version: %s\n" % tables.__version__)

    util.check_pysam_version()
    util.check_pytables_version()

    # disable warnings that come from pytables when chromosome
    # names are like 1, 2, 3 (instead of chr1, chr2, chr3)
    warnings.filterwarnings('ignore', category=tables.NaturalNameWarning)

    snp_tab_h5 = tables.open_file(args.snp_tab, "r")
    snp_index_h5 = tables.open_file(args.snp_index, "r")

    if args.haplotype:
        hap_h5 = tables.open_file(args.haplotype, "r")
    else:
        hap_h5 = None

    ref_count_h5 = tables.open_file(args.ref_as_counts, "w")
    alt_count_h5 = tables.open_file(args.alt_as_counts, "w")
    other_count_h5 = tables.open_file(args.other_as_counts, "w")
    read_count_h5 = tables.open_file(args.read_counts, "w")

    output_h5 = [ref_count_h5, alt_count_h5, other_count_h5, read_count_h5]

    # initialize every chromosome in output files
    chrom_list = chromosome.get_all_chromosomes(args.chrom)

    for chrom in chrom_list:
        for out_file in output_h5:
            create_carray(out_file, chrom, args.data_type)

    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    # create a txt file to also holds the counts
    txt_counts = None
    if args.txt_counts is not None:
        if os.path.splitext(args.txt_counts)[1] == ".gz":
            txt_counts = gzip.open(args.txt_counts, 'wt+')
        else:
            txt_counts = open(args.txt_counts, 'w+')

    # number of reads seen since the last progress dot
    count = 0

    for chrom in chrom_list:
        sys.stderr.write("%s\n" % chrom.name)

        if args.test_chrom:
            if chrom.name != args.test_chrom:
                sys.stderr.write("skipping because not test chrom\n")
                continue

        warned_pos = {}

        # fetch SNP info for this chromosome
        if chrom.name not in snp_tab_h5.root:
            # no SNPs for this chromosome
            sys.stderr.write("skipping %s because chromosome with this name "
                             "not found in SNP table\n" % chrom.name)
            continue

        sys.stderr.write("fetching SNPs\n")

        snp_tab = snp_tab_h5.get_node("/%s" % chrom.name)
        snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]
        if hap_h5:
            hap_tab = hap_h5.get_node("/%s" % chrom.name)
            ind_dict, ind_idx = snptable.SNPTable().get_h5_sample_indices(
                hap_h5, chrom, [args.individual])

            if len(ind_idx) != 1:
                raise ValueError("got sample indices for %d individuals, "
                                 "but expected to get index for one "
                                 "individual (%s)" %
                                 (len(ind_idx), args.individual))
            ind_idx = ind_idx[0]
            sys.stderr.write("index for individual %s is %d\n" %
                             (args.individual, ind_idx))
        else:
            hap_tab = None
            ind_idx = None

        # initialize count arrays for this chromosome to 0
        ref_carray = get_carray(ref_count_h5, chrom)
        alt_carray = get_carray(alt_count_h5, chrom)
        other_carray = get_carray(other_count_h5, chrom)
        read_count_carray = get_carray(read_count_h5, chrom)

        ref_array = np.zeros(chrom.length, dtype)
        alt_array = np.zeros(chrom.length, dtype)
        other_array = np.zeros(chrom.length, dtype)
        read_count_array = np.zeros(chrom.length, dtype)

        # loop over all BAM files, pulling out reads
        # for this chromosome
        for bam_filename in args.bam_filenames:
            sys.stderr.write("reading from file %s\n" % bam_filename)

            samfile = pysam.Samfile(bam_filename, "rb")
            try:
                for read in get_sam_iter(samfile, chrom):
                    count += 1
                    if count == 10000:
                        sys.stderr.write(".")
                        count = 0

                    add_read_count(read, chrom, ref_array, alt_array,
                                   other_array, read_count_array,
                                   snp_index_array, snp_tab, hap_tab,
                                   warned_pos, max_count, ind_idx)
            finally:
                # release the BAM handle even if counting fails
                samfile.close()
            sys.stderr.write("\n")

        # store results for this chromosome, once every BAM file has
        # contributed its reads
        ref_carray[:] = ref_array
        alt_carray[:] = alt_array
        other_carray[:] = other_array
        read_count_carray[:] = read_count_array

        # write the counts for this chromosome to the txt file; columns are:
        # chrom, pos, ref, alt, genotype, ref_count, alt_count, other_count
        if txt_counts is not None:
            write_txt_file(txt_counts, chrom, snp_tab, hap_tab, ind_idx,
                           ref_array, alt_array, other_array)

    if txt_counts is not None:
        # close the open txt file handler
        txt_counts.close()

    # check if any of the reads contained an unimplemented CIGAR
    if unimplemented_CIGAR[0] > 0:
        sys.stderr.write("WARNING: Encountered " +
                         str(unimplemented_CIGAR[0]) +
                         " instances of CIGAR codes: " +
                         str(unimplemented_CIGAR[1]) + ". Reads with these "
                         "CIGAR codes were skipped because they "
                         "are currently unimplemented.\n")

    # set track statistics and close HDF5 files

    sys.stderr.write("setting statistics for each chromosome\n")
    for h5f in output_h5:
        chromstat.set_stats(h5f, chrom_list)
        h5f.close()

    snp_tab_h5.close()
    snp_index_h5.close()
    if hap_h5:
        hap_h5.close()

    sys.stderr.write("done\n")
Ejemplo n.º 8
0
def _variant_code(base, ref_allele, alt_allele):
    """Classify a read base: 0 = reference allele, 1 = alternative, 2 = neither.

    Alleles stored as bytes are decoded first so that they can be compared
    with the text base taken from the read sequence.
    """
    alleles = []
    for allele in (ref_allele, alt_allele):
        if isinstance(allele, bytes):
            allele = allele.decode("utf-8")
        alleles.append(allele)

    if base == alleles[0]:
        return 0
    if base == alleles[1]:
        return 1
    return 2


def main(bedpe_filename, snp_dir=None):
    """Annotate every fragment of a gzipped BEDPE file with overlapping SNPs.

    Each input line must have 18 tab-separated fields; the ones used are the
    chromosome, fragment start/end, the end of read 1 and start of read 2,
    both read sequences and both CIGAR strings.  Fragments whose sequence
    lengths do not match their coordinates (indels) are skipped.  For every
    other fragment the line is printed to stdout with five extra
    comma-separated columns:

    - frag_pos: SNP position within the fragment
    - frag_base: base at the SNP (an IUPAC code from ``iupac`` for the
      unsequenced middle part)
    - snp_pos: chromosome position of the SNP
    - snp_var: 0 reference allele, 1 alternative allele, 2 non-matching,
      3 unknown (SNP lies between the two reads)
    - snp_ind: index of the SNP in the SNP table

    SNPs are read from ``<snp_dir>/<chrom>.snps.txt.gz`` whenever a new
    chromosome starts.

    Raises:
        ValueError: if a chromosome re-appears after another chromosome,
            i.e. the input is not sorted by chromosome.
    """
    endl = os.linesep

    seen_chrom = set()
    snp_chrom = None
    snp_tab = snptable.SNPTable()

    # text mode: the lines are handled with str operations below
    with gzip.open(bedpe_filename, "rt") as bedpe:
        for line in bedpe:
            line = line.rstrip(endl)
            (cur_chrom, c_start, c_end, _, c_strand, _, _, c_iend, c_istart,
             _, _, _, _, _, c_seq1, c_seq2,
             c_cigar1, c_cigar2) = line.split("\t")

            c_start = int(c_start)
            c_istart = int(c_istart)
            c_end = int(c_end)
            c_iend = int(c_iend)

            # sequence length differs from the covered span: the read
            # contains indels, skip the fragment
            if (len(c_seq1) != (c_iend - c_start + 1)
                    or len(c_seq2) != (c_end - c_istart + 1)):
                continue

            if (snp_chrom is None) or (cur_chrom != snp_chrom):
                # this is a new chromosome

                if cur_chrom in seen_chrom:
                    # sanity check that input file is sorted
                    raise ValueError("expected input BEDPE file to be sorted "
                                     "but chromosome %s is repeated\n"
                                     % cur_chrom)
                seen_chrom.add(cur_chrom)
                snp_chrom = cur_chrom
                sys.stderr.write("starting chromosome %s\n" % cur_chrom)

                # read SNPs for next chromosome from text file
                snp_filename = "%s/%s.snps.txt.gz" % (snp_dir, cur_chrom)
                snp_tab.read_file(snp_filename)

                sys.stderr.write("read %d SNPs\n" % snp_tab.n_snp)

            # --- SNPs overlapping read 1 ---
            snp_idx, snp_read_pos, _, _ = \
                snp_tab.get_overlapping_snps_from_bedpe(
                    c_start - 1, cigar2tuple(c_cigar1), len(c_seq1),
                    c_seq1, c_cigar1)
            read_pos = [p - 1 for p in snp_read_pos]
            read_base = [c_seq1[p] for p in read_pos]
            frag_pos = list(read_pos)
            frag_base = list(read_base)
            snp_pos = [snp_tab.snp_pos[i] for i in snp_idx]
            snp_var = [_variant_code(b, snp_tab.snp_allele1[i],
                                     snp_tab.snp_allele2[i])
                       for b, i in zip(read_base, snp_idx)]
            # copy, so that later concatenation never touches the object
            # returned by the SNP table
            snp_ind = list(snp_idx)

            # --- SNPs overlapping read 2 ---
            snp_idx, snp_read_pos, _, _ = \
                snp_tab.get_overlapping_snps_from_bedpe(
                    c_istart - 1, cigar2tuple(c_cigar2), len(c_seq2),
                    c_seq2, c_cigar2)
            read_pos = [p - 1 for p in snp_read_pos]
            read_base = [c_seq2[p] for p in read_pos]
            # shift read-2 positions into fragment coordinates
            frag_pos = frag_pos + [p + (c_istart - c_start) for p in read_pos]
            frag_base = frag_base + read_base
            snp_pos = snp_pos + [snp_tab.snp_pos[i] for i in snp_idx]
            snp_var = snp_var + [_variant_code(b, snp_tab.snp_allele1[i],
                                               snp_tab.snp_allele2[i])
                                 for b, i in zip(read_base, snp_idx)]
            snp_ind = snp_ind + list(snp_idx)

            # --- SNPs in the unsequenced part between the two reads ---
            if c_iend < (c_istart - 1):
                gap_len = c_istart - c_iend - 1
                gap_cigar = str(gap_len) + "M"
                snp_idx, snp_read_pos, _, _ = \
                    snp_tab.get_overlapping_snps_from_bedpe(
                        c_iend + 1, cigar2tuple(gap_cigar), gap_len,
                        "middleSeq", gap_cigar)
                frag_pos = frag_pos + [p + c_iend - c_start + 1
                                       for p in snp_read_pos]
                frag_base = frag_base + [iupac(snp_tab.snp_allele1[i],
                                               snp_tab.snp_allele2[i])
                                         for i in snp_idx]
                snp_pos = snp_pos + [snp_tab.snp_pos[i] for i in snp_idx]
                # the base was not sequenced, so the variant is unknown (3)
                snp_var = snp_var + [3 for i in snp_idx]
                snp_ind = snp_ind + list(snp_idx)

            line = "\t".join([line,
                              print_comma_sep_list(frag_pos),
                              print_comma_sep_list(frag_base),
                              print_comma_sep_list(snp_pos),
                              print_comma_sep_list(snp_var),
                              print_comma_sep_list(snp_ind)])
            print(line)
Ejemplo n.º 9
0
def _allele_as_text(allele):
    """Return *allele* as text, decoding it first when it is a bytes value."""
    if isinstance(allele, bytes):
        return allele.decode("utf-8")
    return allele


def main(bam_filename,
         snp_dir=None,
         snp_tab_filename=None,
         snp_index_filename=None,
         haplotype_filename=None,
         samples=None,
         geno_sample=None):
    """Count, per SNP, the reads matching the ref, alt or another allele.

    The BAM file is read in order; whenever a new chromosome starts the
    counts of the previous one are passed to ``write_results`` (writing to
    stdout) and the SNPs of the new chromosome are loaded, either from the
    HDF5 files (``snp_tab_filename``, ``snp_index_filename``,
    ``haplotype_filename``) or from ``<snp_dir>/<chrom>.snps.txt.gz``.
    Unmapped reads and secondary alignments are ignored.

    Args:
        bam_filename: path of the input BAM file, grouped by chromosome.
        snp_dir: directory holding one ``<chrom>.snps.txt.gz`` per chromosome.
        snp_tab_filename: HDF5 SNP table; requires the next two arguments.
        snp_index_filename: HDF5 SNP index.
        haplotype_filename: HDF5 haplotype file.
        samples: passed through to ``SNPTable.read_h5``.
        geno_sample: sample name passed through to ``write_results``; reset
            to None (with a warning) when no haplotype file is given.

    Raises:
        ValueError: if ``snp_tab_filename`` is given without the index and
            haplotype files, if no SNP source is given at all, or if a
            chromosome re-appears in the BAM file.
    """
    out_f = sys.stdout

    bam = pysam.Samfile(bam_filename)

    cur_chrom = None
    cur_tid = None
    seen_chrom = set()

    snp_tab = snptable.SNPTable()

    # keep track of number of ref matches, non-ref matches, and other
    # for each SNP
    snp_ref_match = None
    snp_alt_match = None
    snp_oth_match = None

    if geno_sample and not haplotype_filename:
        sys.stderr.write("WARNING: cannot obtain genotypes for sample "
                         "%s without --haplotype argument\n" % geno_sample)
        geno_sample = None

    sys.stderr.write("GENOTYPE_SAMPLE: %s\n" % geno_sample)

    if snp_tab_filename:
        if (not snp_index_filename) or (not haplotype_filename):
            raise ValueError("--snp_index and --haplotype must be provided "
                             "if --snp_tab is provided")
        snp_tab_h5 = tables.open_file(snp_tab_filename, "r")
        snp_index_h5 = tables.open_file(snp_index_filename, "r")
        hap_h5 = tables.open_file(haplotype_filename, "r")
    else:
        snp_tab_h5 = None
        snp_index_h5 = None
        hap_h5 = None

    try:
        for read in bam:
            if read.tid == -1:
                # unmapped read: it has no reference sequence to look up
                continue

            if (cur_tid is None) or (read.tid != cur_tid):
                # this is a new chromosome

                if cur_chrom:
                    # write out results from last chromosome
                    write_results(out_f, cur_chrom, snp_tab, snp_ref_match,
                                  snp_alt_match, snp_oth_match, geno_sample)

                cur_chrom = bam.getrname(read.tid)

                if cur_chrom in seen_chrom:
                    # sanity check that input bam file is sorted
                    raise ValueError("expected input BAM file to be sorted "
                                     "but chromosome %s is repeated\n"
                                     % cur_chrom)
                seen_chrom.add(cur_chrom)
                cur_tid = read.tid
                sys.stderr.write("starting chromosome %s\n" % cur_chrom)

                # read SNPs for next chromosome
                if snp_tab_h5:
                    # read SNPs from HDF5 files, reduce to set that are
                    # polymorphic in specified samples
                    snp_tab.read_h5(snp_tab_h5,
                                    snp_index_h5,
                                    hap_h5,
                                    cur_chrom,
                                    samples=samples)
                elif snp_dir:
                    # read SNPs from text file
                    snp_filename = "%s/%s.snps.txt.gz" % (snp_dir, cur_chrom)
                    snp_tab.read_file(snp_filename)
                else:
                    raise ValueError("--snp_dir OR (--snp_tab, --snp_index, "
                                     "and --hap_h5) must be defined")

                sys.stderr.write("read %d SNPs\n" % snp_tab.n_snp)

                # start fresh counters for the new chromosome
                # NOTE(review): int16 counters wrap above 32767 reads per
                # SNP -- confirm this is acceptable for deep data
                snp_ref_match = np.zeros(snp_tab.n_snp, dtype=np.int16)
                snp_alt_match = np.zeros(snp_tab.n_snp, dtype=np.int16)
                snp_oth_match = np.zeros(snp_tab.n_snp, dtype=np.int16)

            if read.is_secondary:
                # this is a secondary alignment (i.e. read was aligned more
                # than once and this has align score that <= best score)
                continue

            # loop over all SNP that overlap this read
            snp_idx, snp_read_pos, \
                indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read)

            for snp_i, read_pos in zip(snp_idx, snp_read_pos):
                # read positions are 1-based, the sequence is 0-based
                read_base = read.query_sequence[read_pos - 1]
                # alleles may be stored as bytes; compare them as text
                ref_allele = _allele_as_text(snp_tab.snp_allele1[snp_i])
                alt_allele = _allele_as_text(snp_tab.snp_allele2[snp_i])

                if ref_allele == read_base:
                    snp_ref_match[snp_i] += 1
                elif alt_allele == read_base:
                    snp_alt_match[snp_i] += 1
                else:
                    snp_oth_match[snp_i] += 1

        if cur_chrom:
            # write results for final chromosome
            write_results(out_f, cur_chrom, snp_tab, snp_ref_match,
                          snp_alt_match, snp_oth_match, geno_sample)
    finally:
        # release the input files whether or not counting succeeded
        bam.close()
        for h5f in (snp_tab_h5, snp_index_h5, hap_h5):
            if h5f is not None:
                h5f.close()
def filter_reads(files,
                 max_seqs=MAX_SEQS_DEFAULT,
                 max_snps=MAX_SNPS_DEFAULT,
                 samples=None):
    """Walk the input BAM and hand usable reads to the process_* helpers.

    Reads are taken from ``files.input_bam``.  Each time a new chromosome
    starts its SNPs are loaded into one shared SNP table, from the HDF5
    files on ``files`` when ``files.snp_tab_h5`` is set and otherwise from
    ``<files.snp_dir>/<chrom>.snps.txt.gz``.  Unpaired reads go to
    ``process_single_read``; properly paired reads are cached by name until
    their mate shows up and then go to ``process_paired_read``.  Every
    discarded read is tallied on a ``ReadStats`` object, which is written to
    stderr at the end.

    Raises:
        ValueError: if a chromosome re-appears after another chromosome,
            i.e. the BAM file is not sorted.
    """
    cur_chrom = None
    cur_tid = None
    seen_chrom = set([])

    snp_tab = snptable.SNPTable()
    stats = ReadStats()
    # first-seen reads waiting for their mate, keyed by read name
    pending_mates = {}
    cache_size = 0
    read_count = 0

    for read in files.input_bam:
        read_count += 1

        # TODO: need to change this to use new pysam API calls
        # but need to check pysam version for backward compatibility
        if read.tid == -1:
            # unmapped read
            stats.discard_unmapped += 1
            continue

        if (cur_tid is None) or (read.tid != cur_tid):
            # this is a new chromosome
            cur_chrom = files.input_bam.getrname(read.tid)

            # whatever is still waiting for a mate will never get one
            n_unpaired = len(pending_mates)
            if n_unpaired != 0:
                sys.stderr.write("WARNING: failed to find pairs for %d "
                                 "reads on this chromosome\n" %
                                 n_unpaired)
                stats.discard_missing_pair += n_unpaired
            pending_mates = {}
            cache_size = 0
            read_count = 0

            if cur_chrom in seen_chrom:
                # sanity check that input bam file is sorted
                raise ValueError("expected input BAM file to be sorted "
                                 "but chromosome %s is repeated\n" % cur_chrom)
            seen_chrom.add(cur_chrom)
            cur_tid = read.tid
            sys.stderr.write("starting chromosome %s\n" % cur_chrom)

            # HDF5 files take precedence over the text files in the SNP dir
            if files.snp_tab_h5:
                sys.stderr.write("reading SNPs from file '%s'\n" %
                                 files.snp_tab_h5.filename)
                snp_tab.read_h5(files.snp_tab_h5, files.snp_index_h5,
                                files.hap_h5, cur_chrom, samples)
            else:
                snp_filename = "%s/%s.snps.txt.gz" % (files.snp_dir, cur_chrom)
                sys.stderr.write("reading SNPs from file '%s'\n" %
                                 snp_filename)
                snp_tab.read_file(snp_filename)

            sys.stderr.write("processing reads\n")

        if read.is_secondary:
            # secondary alignment (read was aligned more than once and
            # this alignment does not have the best score)
            stats.discard_secondary += 1
            continue

        if not read.is_paired:
            process_single_read(read, stats, files, snp_tab, max_seqs,
                                max_snps)
            continue

        if read.mate_is_unmapped:
            # other side of the pair is not mapped; such reads could be
            # processed as single reads but are discarded instead
            stats.discard_mate_unmapped += 1
            continue

        mate_chrom = read.next_reference_name
        if mate_chrom != cur_chrom and mate_chrom != "=":
            # mate mapped to a different chromosome, discard this read
            stats.discard_different_chromosome += 1
            continue

        # from here on the mate is mapped to the same chromosome
        if not read.is_proper_pair:
            stats.discard_improper_pair += 1
            continue

        first_read = pending_mates.pop(read.qname, None)
        if first_read is None:
            # mate not seen yet: park this read until it arrives
            pending_mates[read.qname] = read
            cache_size += 1
            continue

        # the mate was seen earlier and has just been taken off the cache
        cache_size -= 1
        if read.next_reference_start != first_read.reference_start:
            sys.stderr.write("WARNING: read pair positions "
                             "do not match for pair %s\n" %
                             read.qname)
        else:
            process_paired_read(first_read, read, stats, files,
                                snp_tab, max_seqs, max_snps)

    n_unpaired = len(pending_mates)
    if n_unpaired != 0:
        sys.stderr.write("WARNING: failed to find pairs for %d "
                         "reads on this chromosome\n" % n_unpaired)
        stats.discard_missing_pair += n_unpaired

    stats.write(sys.stderr)
Ejemplo n.º 11
0
def main():
    """Count allele-specific reads at SNPs and store them per chromosome.

    Command-line arguments come from ``parse_args()``.  For every chromosome
    returned by ``chromosome.get_all_chromosomes(args.chrom)`` that has a
    node in the SNP table, the reads of all ``args.bam_filenames`` are
    passed to ``add_read_count`` and the accumulated reference / alternate /
    other / total read counts are written to the four output HDF5 files.
    When ``args.txt_counts`` is set, one row per SNP is also appended to
    that text file (opened through gzip if its name ends in ``.gz``).

    The per-chromosome results are stored once, after all BAM files have
    been read, so each chromosome appears exactly once in the text file.

    Raises:
        NotImplementedError: if ``args.data_type`` is neither "uint8" nor
            "uint16".
    """
    args = parse_args()

    sys.stderr.write("command line: %s\n" % " ".join(sys.argv))
    sys.stderr.write("python version: %s\n" % sys.version)
    sys.stderr.write("pysam version: %s\n" % pysam.__version__)
    sys.stderr.write("pytables version: %s\n" % tables.__version__)

    util.check_pysam_version()
    util.check_pytables_version()

    snp_tab_h5 = tables.open_file(args.snp_tab, "r")
    snp_index_h5 = tables.open_file(args.snp_index, "r")

    if args.haplotype:
        hap_h5 = tables.open_file(args.haplotype, "r")
    else:
        hap_h5 = None

    ref_count_h5 = tables.open_file(args.ref_as_counts, "w")
    alt_count_h5 = tables.open_file(args.alt_as_counts, "w")
    other_count_h5 = tables.open_file(args.other_as_counts, "w")
    read_count_h5 = tables.open_file(args.read_counts, "w")

    output_h5 = [ref_count_h5, alt_count_h5, other_count_h5, read_count_h5]

    # initialize every chromosome in output files
    chrom_list = chromosome.get_all_chromosomes(args.chrom)

    for chrom in chrom_list:
        for out_file in output_h5:
            create_carray(out_file, chrom, args.data_type)

    if args.data_type == "uint8":
        max_count = MAX_UINT8_COUNT
        dtype = np.uint8
    elif args.data_type == "uint16":
        max_count = MAX_UINT16_COUNT
        dtype = np.uint16
    else:
        raise NotImplementedError("unsupported datatype %s" % args.data_type)

    # create a txt file to also holds the counts
    txt_counts = None
    if args.txt_counts is not None:
        if os.path.splitext(args.txt_counts)[1] == ".gz":
            txt_counts = gzip.open(args.txt_counts, 'a+')
        else:
            txt_counts = open(args.txt_counts, 'a+')

    # number of reads seen since the last progress dot
    count = 0

    for chrom in chrom_list:
        sys.stderr.write("%s\n" % chrom.name)

        warned_pos = {}

        # fetch SNP info for this chromosome
        if chrom.name not in snp_tab_h5.root:
            # no SNPs for this chromosome
            continue

        sys.stderr.write("fetching SNPs\n")

        snp_tab = snp_tab_h5.get_node("/%s" % chrom.name)
        snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]
        if hap_h5:
            hap_tab = hap_h5.get_node("/%s" % chrom.name)
            ind_idx = snptable.SNPTable().get_h5_sample_indices(
                hap_h5, chrom, [args.individual])[1]
            if len(ind_idx) != 0:
                ind_idx = ind_idx[0]
            else:
                # individual not found: fall back to counting without
                # haplotype information
                hap_tab = None
                ind_idx = None
        else:
            hap_tab = None
            ind_idx = None

        # initialize count arrays for this chromosome to 0
        ref_carray = get_carray(ref_count_h5, chrom)
        alt_carray = get_carray(alt_count_h5, chrom)
        other_carray = get_carray(other_count_h5, chrom)
        read_count_carray = get_carray(read_count_h5, chrom)

        ref_array = np.zeros(chrom.length, dtype)
        alt_array = np.zeros(chrom.length, dtype)
        other_array = np.zeros(chrom.length, dtype)
        read_count_array = np.zeros(chrom.length, dtype)

        # loop over all BAM files, pulling out reads
        # for this chromosome
        for bam_filename in args.bam_filenames:
            sys.stderr.write("reading from file %s\n" % bam_filename)

            samfile = pysam.Samfile(bam_filename, "rb")
            try:
                for read in get_sam_iter(samfile, chrom):
                    count += 1
                    if count == 10000:
                        sys.stderr.write(".")
                        count = 0

                    add_read_count(read, chrom, ref_array, alt_array,
                                   other_array, read_count_array,
                                   snp_index_array, snp_tab, hap_tab,
                                   warned_pos, max_count, ind_idx)
            finally:
                # release the BAM handle even if counting fails
                samfile.close()
            sys.stderr.write("\n")

        # store results for this chromosome, once every BAM file has
        # contributed its reads
        ref_carray[:] = ref_array
        alt_carray[:] = alt_array
        other_carray[:] = other_array
        read_count_carray[:] = read_count_array

        # append one row per SNP to the txt file; columns are:
        # chrom, pos, ref, alt, genotype, ref_count, alt_count, other_count
        if txt_counts is not None:
            # a separate name is used for the column so that the loop
            # variable ``chrom`` keeps referring to the chromosome object
            chrom_col = np.tile(chrom.name, len(snp_tab))
            pos = np.array([snp['pos'] for snp in snp_tab])
            ref = np.array([snp['allele1'] for snp in snp_tab])
            alt = np.array([snp['allele2'] for snp in snp_tab])
            if hap_tab is not None:
                # NOTE(review): always uses the first two haplotype
                # columns, not ind_idx -- confirm this is intended
                genotype = np.array(
                    [str(hap[0]) + "|" + str(hap[1]) for hap in hap_tab])
            else:
                # zero-width placeholder: contributes no column
                genotype = np.empty((len(snp_tab), 0))
            # positions are 1-based, the count arrays 0-based
            np.savetxt(txt_counts,
                       np.column_stack(
                           (chrom_col, pos, ref, alt, genotype,
                            ref_array[pos - 1], alt_array[pos - 1],
                            other_array[pos - 1])),
                       fmt="%1s",
                       delimiter=" ")

    if txt_counts is not None:
        # close the open txt file handler
        txt_counts.close()

    # warn only if some reads actually contained an unimplemented CIGAR
    if unimplemented_CIGAR[0] > 0:
        sys.stderr.write(
            "WARNING: Encountered " + str(unimplemented_CIGAR[0]) +
            " instances of any of the following CIGAR codes: " +
            str(unimplemented_CIGAR[1]) +
            ". The regions of reads with these CIGAR codes were skipped because these CIGAR codes are currently unimplemented.\n"
        )

    # set track statistics and close HDF5 files

    sys.stderr.write("setting statistics for each chromosome\n")
    for h5f in output_h5:
        chromstat.set_stats(h5f, chrom_list)
        h5f.close()

    snp_tab_h5.close()
    snp_index_h5.close()
    if hap_h5:
        hap_h5.close()

    sys.stderr.write("done\n")