def write_remap_bam_pe(data_dir="test_data",
                       bam_filename="test_data/test.remap.bam"):
    """Write a sorted BAM file containing hard-coded paired-end test reads.

    The SAM records below are written to ``<data_dir>/tmp.sam`` (after the
    header emitted by ``write_sam_header``), converted to BAM with
    ``samtools view -b``, sorted with ``util.sort_bam`` and finally renamed
    to ``bam_filename``.  The intermediate SAM file is left in ``data_dir``.

    Args:
        data_dir: directory for the intermediate files; created if missing.
        bam_filename: path the sorted BAM file is renamed to.

    Raises:
        subprocess.CalledProcessError: if ``samtools view`` exits non-zero.
    """
    sam_lines = [
        # Read pair expected to map 2 times and maps to correct location 2 times
        "SRR1658224.34085432.16052611-16052734.1.2	163	chr22	16052611	12	101M	=	16052734	224	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "SRR1658224.34085432.16052611-16052734.1.2	83	chr22	16052734	12	101M	=	16052611	-224	TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT	DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC	AS:i:0	XS:i:-12	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-11	YT:Z:CP",
        "SRR1658224.34085432.16052611-16052734.2.2	163	chr22	16052611	12	101M	=	16052734	224	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "SRR1658224.34085432.16052611-16052734.2.2	83	chr22	16052734	12	101M	=	16052611	-224	TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT	DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC	AS:i:0	XS:i:-12	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-11	YT:Z:CP",

        # Read pair expected to map 2 times, but only maps 1 time
        "SRR1658224.34975561.16071944-16072163.2.2	99	chr22	16071944	12	101M	=	16072163	320	ATTTATTTATTTATTTATTATTGGGACAGAGTCTCACTCTGTCCCCCAGACTGGAGTCCAGTGACATGATCTCAGCTCACTGCAACCTCTGCCTCGTGGGT	CCCFFFFFHHHHHJJJJJJJJJJJJIJJJJIEHIJJJJJJJIIJJJJJIJJJJJJJJJJIJHIJIJJJJIJJJJJHHHHHHFFFFFECEEEEDDDDDDBBD	AS:i:-5	XS:i:-22	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:89C11	YS:i:0	YT:Z:CP",
        "SRR1658224.34975561.16071944-16072163.2.2	147	chr22	16072163	12	101M	=	16071944	-320	GTCTCAAACTTCTGACCTCAGGTGATCCACCCACCTCGACCTCCCAAAGTGCTGGGATTACAGGCACTAGGTCCCTAAATTAGAGCCATATTCTTTAATGT	DDBCDEDCDCCDCC?DDDDDDDBACBDA<FFB:6HIIJIIJIIJJJJJJJJJJJJIJJIHJJJJJIJJJJJJJJJJJJJJJJJJJJJJHHHGGFFFFFCCC	AS:i:0	XS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-5	YT:Z:CP",

        # Read pair expected to map 2 times, but only 1/2 of 2nd pair maps back to same location
        "SRR1658224.7462188.16235410-16235625.1.2	163	chr22	16235410	17	101M	=	16235625	316	AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG	CC@FFFFFHHHHHJJJJJJJJJJJJJJJJIJBGIJJJJJJJJJJJJJIJIFIJJJJJJJJJHHHHGFFFFFFEEEEDEEDDDDDEED@CFFFEDDD?ABB?	AS:i:0	XS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-5	YT:Z:CP",
        "SRR1658224.7462188.16235410-16235625.1.2	83	chr22	16235625	17	101M	=	16235410	-316	TTCAAAAGATGGTATATGCATTAATATTTTCATACAACTTCCAGCTTTTGTTTTTCTTCATTTAATTTTATTTATTTATTTATTTTTGAGATGGAGTCTCG	CBDDDDECEEDEFFFDFFFHHHHHHHJJIIJJIHIHFHGHJJJJJJJGJJJJJIJJJIIJJJJJJJJJJJJJJJJJJJJJJJJJJJJJHHHHHFFFDFCCC	AS:i:-5	XS:i:-39	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:15G85	YS:i:0	YT:Z:CP",
        "SRR1658224.7462188.16235410-16235625.2.2	163	chr22	16235410	17	101M	*	0	0	AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG	CC@FFFFFHHHHHJJJJJJJJJJJJJJJJIJBGIJJJJJJJJJJJJJIJIFIJJJJJJJJJHHHHGFFFFFFEEEEDEEDDDDDEED@CFFFEDDD?ABB?	AS:i:0	XS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-5	YT:Z:CP",

        # Read pair expected to map 2 times, but 1 pair maps to wrong location
        "SRR1658224.31153145.16235410-16235625.1.2	163	chr22	16235410	17	101M	=	16235625	316	AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJJIJFHIJJJJJJJJJJJIJIJJFHIJJJJJJJJHHHHHFFFFFFEDEEEEEDDDDDEED@DEEEEDDDDDDB2	AS:i:0	XS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-2	YT:Z:CP",
        "SRR1658224.31153145.16235410-16235625.1.2	83	chr22	16235625	17	101M	=	16235410	-316	TTCAAAAGATGGTATGTGCATTAATATTTTCATACAACTTCCAGTTTTTGTTTTTCTTCATTTAATTTTATTTATTTATTTATTTTTGAGATGGAGTCTCG	DDDDDDDDEEEEEEFFFFFFHHHHGHHJJIJJJIIJIJIHJHF@(JJJJJJJJJJJJIIIIJJJJJJJIJJJJJJJJJJJJJJJJJJJHHHHHFFFDFCCC	AS:i:-2	XS:i:-36	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:44C56	YS:i:0	YT:Z:CP",
        "SRR1658224.31153145.16235410-16235625.2.2	163	chr22	18235410	17	101M	=	16235625	316	AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJJIJFHIJJJJJJJJJJJIJIJJFHIJJJJJJJJHHHHHFFFFFFEDEEEEEDDDDDEED@DEEEEDDDDDDB2	AS:i:0	XS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-2	YT:Z:CP",
        "SRR1658224.31153145.16235410-16235625.2.2	83	chr22	18235625	17	101M	=	16235410	-316	TTCAAAAGATGGTATGTGCATTAATATTTTCATACAACTTCCAGTTTTTGTTTTTCTTCATTTAATTTTATTTATTTATTTATTTTTGAGATGGAGTCTCG	DDDDDDDDEEEEEEFFFFFFHHHHGHHJJIJJJIIJIJIHJHF@(JJJJJJJJJJJJIIIIJJJJJJJIJJJJJJJJJJJJJJJJJJJHHHHHFFFDFCCC	AS:i:-2	XS:i:-36	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:44C56	YS:i:0	YT:Z:CP",

        # Read pair expected to map 2 times, but does not map at all
        # "SRR1658224.25014179"

        # Read pairs expected to map 1 times, with read-pairs interleaved
        "readpair1.100-200.1.2	163	chr22	100	12	101M	=	200	201	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "readpair2.150-250.1.2	163	chr22	150	12	101M	=	250	201	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "readpair1.100-200.1.2	83	chr22	200	12	101M	=	100	-201	TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT	DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC	AS:i:0	XS:i:-12	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-11	YT:Z:CP",
        "readpair2.150-250.1.2	163	chr22	250	12	101M	=	150	-201	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "readpair1.100-200.2.2	163	chr22	100	12	101M	=	200	201	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "readpair2.150-250.2.2	163	chr22	150	12	101M	=	250	201	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "readpair1.100-200.2.2	83	chr22	200	12	101M	=	100	-201	TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT	DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC	AS:i:0	XS:i:-12	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-11	YT:Z:CP",
        "readpair2.150-250.2.2	163	chr22	250	12	101M	=	150	-201	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP"
    ]

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # write temporary file in SAM format, before converting to BAM;
    # the with-block closes the handle even if a write fails
    sam_filename = data_dir + "/tmp.sam"
    with open(sam_filename, "wt") as f:
        write_sam_header(f)
        for line in sam_lines:
            f.write(line + "\n")

    # write to temp bam file.  samtools is invoked with an argument list
    # (no shell), so paths containing spaces or shell metacharacters are
    # passed through verbatim; its stdout is sent straight to the BAM file.
    tmp_bam_filename = data_dir + "/tmp.bam"
    with open(tmp_bam_filename, "wb") as bam_out:
        subprocess.check_call(["samtools", "view", "-b", sam_filename],
                              stdout=bam_out)
    # sort the temp bam file (the sorted output is the tmp.sort.bam file
    # renamed below)
    util.sort_bam(tmp_bam_filename, data_dir + "/tmp")
    # remove temp bam
    os.remove(tmp_bam_filename)
    # rename sorted bam to output bam filename
    os.rename(data_dir + "/tmp.sort.bam", bam_filename)
def write_remap_bam_pe(sam_lines, data_dir="test_data", bam_filename="test_data/test.remap.bam"):
    """Write the given SAM records to a sorted BAM file.

    The records are written to ``<data_dir>/tmp.sam`` (after the header
    emitted by ``write_sam_header``), converted to BAM with
    ``samtools view -b``, sorted with ``util.sort_bam`` and finally renamed
    to ``bam_filename``.  The intermediate SAM file is left in ``data_dir``.

    Args:
        sam_lines: iterable of SAM record strings without trailing newline.
        data_dir: directory for the intermediate files; created if missing.
        bam_filename: path the sorted BAM file is renamed to.

    Raises:
        subprocess.CalledProcessError: if ``samtools view`` exits non-zero.
    """
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # write temporary file in SAM format, before converting to BAM;
    # the with-block closes the handle even if a write fails
    sam_filename = data_dir + "/tmp.sam"
    with open(sam_filename, "wt") as f:
        write_sam_header(f)
        for line in sam_lines:
            f.write(line + "\n")

    # write to temp bam file.  samtools is invoked with an argument list
    # (no shell), so paths containing spaces or shell metacharacters are
    # passed through verbatim; its stdout is sent straight to the BAM file.
    tmp_bam_filename = data_dir + "/tmp.bam"
    with open(tmp_bam_filename, "wb") as bam_out:
        subprocess.check_call(["samtools", "view", "-b", sam_filename],
                              stdout=bam_out)
    # sort the temp bam file (the sorted output is the tmp.sort.bam file
    # renamed below)
    util.sort_bam(tmp_bam_filename, data_dir + "/tmp")
    # remove temp bam
    os.remove(tmp_bam_filename)
    # rename sorted bam to output bam filename
    os.rename(data_dir + "/tmp.sort.bam", bam_filename)
    def __init__(self,
                 bam_filename,
                 is_sorted,
                 is_paired,
                 output_dir=None,
                 snp_dir=None,
                 snp_tab_filename=None,
                 snp_index_filename=None,
                 haplotype_filename=None,
                 samples=None):
        """Open the input BAM and every output file derived from its name.

        All output names share ``self.prefix``: the output directory joined
        with the input file name minus its final extension.

        Args:
            bam_filename: path of the input BAM file.
            is_sorted: if false, the input is first sorted with
                ``util.sort_bam`` and ``<prefix>.sort.bam`` is read instead.
            is_paired: whether reads are paired-end; selects which FASTQ
                outputs are opened.
            output_dir: directory for outputs; defaults to the directory of
                ``bam_filename``.  Created if it does not exist.
            snp_dir: stored as ``self.snp_dir``.
            snp_tab_filename: if set, this and the next two paths are opened
                read-only as HDF5 files.
            snp_index_filename: see ``snp_tab_filename``.
            haplotype_filename: see ``snp_tab_filename``.
            samples: accepted but not used by this method.
        """
        # this block did not previously depend on os, so import it locally
        import os

        # flag indicating whether reads are paired-end
        self.is_paired = is_paired

        # prefix for output files
        self.prefix = None

        # name of input BAM filename
        self.bam_filename = bam_filename
        # name of sorted input bam_filename
        # (new file is created if input file is not
        #  already sorted)
        self.bam_sort_filename = None
        # pysam file handle for input BAM
        self.input_bam = None

        # name of output keep and to.remap BAM files
        self.keep_filename = None
        self.remap_filename = None

        # pysam file handles for output BAM filenames
        self.keep_bam = None
        self.remap_bam = None

        # name of output fastq files
        self.fastq_single_filename = None
        self.fastq1_filename = None
        self.fastq2_filename = None
        self.fastq1 = None
        self.fastq2 = None
        self.fastq_single = None

        # name of directory to read SNPs from
        self.snp_dir = snp_dir

        # paths to HDF5 files to read SNP info from
        self.snp_tab_filename = snp_tab_filename
        self.snp_index_filename = snp_index_filename
        self.haplotype_filename = haplotype_filename

        if self.snp_tab_filename:
            # prefer the PyTables 3 name (open_file, as used elsewhere in
            # this file) and fall back to the legacy openFile spelling
            open_h5 = getattr(tables, "open_file", None) or tables.openFile
            self.snp_tab_h5 = open_h5(snp_tab_filename, "r")
            self.snp_index_h5 = open_h5(snp_index_filename, "r")
            self.hap_h5 = open_h5(haplotype_filename, "r")
        else:
            self.snp_tab_h5 = None
            self.snp_index_h5 = None
            self.hap_h5 = None

        # separate input directory and bam filename.  os.path.split gives an
        # empty directory for a bare filename such as "reads.bam", which
        # os.path.join below keeps relative; plain "/"-concatenation turned
        # it into "/reads" and sent the outputs to the filesystem root.
        bam_dir, filename = os.path.split(self.bam_filename)

        if output_dir is None:
            # if no output dir specified, use same directory as input
            # bam file
            output_dir = bam_dir

        # drop only the final extension ("a.b.bam" -> "a.b"); a name
        # without any "." is used unchanged
        self.prefix = os.path.join(output_dir, filename.rsplit(".", 1)[0])

        # create output dir if it does not exist (an empty output_dir means
        # the current directory, which needs no creation)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # TODO: could allow names of output files to be specified
        # on command line rather than appending name to prefix
        sys.stderr.write("prefix: %s\n" % self.prefix)

        if not is_sorted:
            util.sort_bam(self.bam_filename, self.prefix)
            self.bam_sort_filename = self.prefix + ".sort.bam"
        else:
            self.bam_sort_filename = self.bam_filename

        self.keep_filename = self.prefix + ".keep.bam"
        self.remap_filename = self.prefix + ".to.remap.bam"

        sys.stderr.write("reading reads from:\n  %s\n" %
                         self.bam_sort_filename)

        sys.stderr.write("writing output files to:\n")

        # NOTE(review): the gzip handles are opened in binary mode ("wb");
        # writers must pass bytes when running under Python 3 -- confirm.
        if self.is_paired:
            self.fastq1_filename = self.prefix + ".remap.fq1.gz"
            self.fastq2_filename = self.prefix + ".remap.fq2.gz"
            self.fastq1 = gzip.open(self.fastq1_filename, "wb")
            self.fastq2 = gzip.open(self.fastq2_filename, "wb")
            self.fastq_single_filename = self.prefix + ".remap.single.fq.gz"
            self.fastq_single = gzip.open(self.fastq_single_filename, "wb")
            sys.stderr.write("  %s\n  %s\n  %s\n" %
                             (self.fastq1_filename, self.fastq2_filename,
                              self.fastq_single_filename))

        else:
            self.fastq_single_filename = self.prefix + ".remap.fq.gz"
            self.fastq_single = gzip.open(self.fastq_single_filename, "wb")
            sys.stderr.write("  %s\n" % (self.fastq_single_filename))

        # both outputs use the input file as template
        self.input_bam = pysam.Samfile(self.bam_sort_filename, "rb")
        self.keep_bam = pysam.Samfile(self.keep_filename,
                                      "wb",
                                      template=self.input_bam)
        self.remap_bam = pysam.Samfile(self.remap_filename,
                                       "wb",
                                       template=self.input_bam)
        sys.stderr.write("  %s\n  %s\n" %
                         (self.keep_filename, self.remap_filename))
Beispiel #4
0
    def __init__(self,
                 bam_filename,
                 is_sorted,
                 is_paired,
                 output_dir=None,
                 snp_dir=None):
        """Open the input BAM and the output files derived from its name.

        All output names share ``self.prefix``: the output directory joined
        with the input file name minus its final extension.

        Args:
            bam_filename: path of the input BAM file.
            is_sorted: if false, the input is first sorted with
                ``util.sort_bam`` and ``<prefix>.sort.bam`` is read instead.
            is_paired: whether reads are paired-end; selects which FASTQ
                outputs are opened.
            output_dir: directory for outputs; defaults to the directory of
                ``bam_filename``.  Created if it does not exist.
            snp_dir: stored as ``self.snp_dir``.
        """
        # flag indicating whether reads are paired-end
        self.is_paired = is_paired

        # prefix for output files
        self.prefix = None

        # name of input BAM filename
        self.bam_filename = bam_filename
        # name of sorted input bam_filename
        # (new file is created if input file is not
        #  already sorted)
        self.bam_sort_filename = None
        # pysam file handle for input BAM
        self.input_bam = None

        # name of output file to check initial imbalance
        self.initial_AI_filename = None

        # file handle for the output text file
        self.initial_AI_txt = None

        # name of output fastq files
        self.fastq_single_filename = None
        self.fastq1_filename = None
        self.fastq2_filename = None
        self.fastq1 = None
        self.fastq2 = None
        self.fastq_single = None

        # name of directory to read SNPs from
        self.snp_dir = snp_dir

        # separate input directory and bam filename.  os.path.split gives an
        # empty directory for a bare filename such as "reads.bam", which
        # os.path.join below keeps relative; plain "/"-concatenation turned
        # it into "/reads" and made os.makedirs("") fail.
        bam_dir, filename = os.path.split(self.bam_filename)

        if output_dir is None:
            # if no output dir specified, use same directory as input
            # bam file
            output_dir = bam_dir

        # drop only the final extension ("a.b.bam" -> "a.b"); a name
        # without any "." is used unchanged
        self.prefix = os.path.join(output_dir, filename.rsplit(".", 1)[0])

        # create output dir if it does not exist (an empty output_dir means
        # the current directory, which needs no creation)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if not is_sorted:
            util.sort_bam(self.bam_filename, self.prefix)
            self.bam_sort_filename = self.prefix + ".sort.bam"
        else:
            self.bam_sort_filename = self.bam_filename

        self.initial_AI_filename = self.prefix + ".initial.AI.txt"
        sys.stderr.write("reading reads from:\n  %s\n" %
                         self.bam_sort_filename)
        sys.stderr.write("writing output files to:\n")

        if self.is_paired:
            self.fastq1_filename = self.prefix + ".remap.fq1.gz"
            self.fastq2_filename = self.prefix + ".remap.fq2.gz"
            self.fastq1 = gzip.open(self.fastq1_filename, "wt")
            self.fastq2 = gzip.open(self.fastq2_filename, "wt")
            self.fastq_single_filename = self.prefix + ".remap.single.fq.gz"
            self.fastq_single = gzip.open(self.fastq_single_filename, "wt")
            sys.stderr.write("  %s\n  %s\n  %s\n" %
                             (self.fastq1_filename, self.fastq2_filename,
                              self.fastq_single_filename))
        else:
            self.fastq_single_filename = self.prefix + ".remap.fq.gz"
            self.fastq_single = gzip.open(self.fastq_single_filename, "wt")
            sys.stderr.write("  %s\n" % (self.fastq_single_filename))

        self.input_bam = pysam.AlignmentFile(self.bam_sort_filename, "rb")
        # "w+" truncates any existing file and allows reading back
        self.initial_AI_txt = open(self.initial_AI_filename, "w+")

        sys.stderr.write("  %s\n  " % self.initial_AI_filename)
def write_remap_bam_pe(data_dir="test_data", bam_filename="test_data/test.remap.bam"):
    """Write a sorted BAM file containing hard-coded paired-end test reads.

    The SAM records below are written to ``<data_dir>/tmp.sam`` (after the
    header emitted by ``write_sam_header``), converted to BAM with
    ``samtools view -b``, sorted with ``util.sort_bam`` and finally renamed
    to ``bam_filename``.  The intermediate SAM file is left in ``data_dir``.

    Args:
        data_dir: directory for the intermediate files; created if missing.
        bam_filename: path the sorted BAM file is renamed to.

    Raises:
        subprocess.CalledProcessError: if ``samtools view`` exits non-zero.
    """
    sam_lines = [
        # Read pair expected to map 2 times and maps to correct location 2 times
        "SRR1658224.34085432.16052611-16052734.1.2	163	chr22	16052611	12	101M	=	16052734	224	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "SRR1658224.34085432.16052611-16052734.1.2	83	chr22	16052734	12	101M	=	16052611	-224	TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT	DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC	AS:i:0	XS:i:-12	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-11	YT:Z:CP",
        "SRR1658224.34085432.16052611-16052734.2.2	163	chr22	16052611	12	101M	=	16052734	224	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "SRR1658224.34085432.16052611-16052734.2.2	83	chr22	16052734	12	101M	=	16052611	-224	TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT	DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC	AS:i:0	XS:i:-12	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-11	YT:Z:CP",

        # Read pair expected to map 2 times, but only maps 1 time
        "SRR1658224.34975561.16071944-16072163.2.2	99	chr22	16071944	12	101M	=	16072163	320	ATTTATTTATTTATTTATTATTGGGACAGAGTCTCACTCTGTCCCCCAGACTGGAGTCCAGTGACATGATCTCAGCTCACTGCAACCTCTGCCTCGTGGGT	CCCFFFFFHHHHHJJJJJJJJJJJJIJJJJIEHIJJJJJJJIIJJJJJIJJJJJJJJJJIJHIJIJJJJIJJJJJHHHHHHFFFFFECEEEEDDDDDDBBD	AS:i:-5	XS:i:-22	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:89C11	YS:i:0	YT:Z:CP",
        "SRR1658224.34975561.16071944-16072163.2.2	147	chr22	16072163	12	101M	=	16071944	-320	GTCTCAAACTTCTGACCTCAGGTGATCCACCCACCTCGACCTCCCAAAGTGCTGGGATTACAGGCACTAGGTCCCTAAATTAGAGCCATATTCTTTAATGT	DDBCDEDCDCCDCC?DDDDDDDBACBDA<FFB:6HIIJIIJIIJJJJJJJJJJJJIJJIHJJJJJIJJJJJJJJJJJJJJJJJJJJJJHHHGGFFFFFCCC	AS:i:0	XS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-5	YT:Z:CP",

        # Read pair expected to map 2 times, but only 1/2 of 2nd pair maps back to same location
        "SRR1658224.7462188.16235410-16235625.1.2	163	chr22	16235410	17	101M	=	16235625	316	AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG	CC@FFFFFHHHHHJJJJJJJJJJJJJJJJIJBGIJJJJJJJJJJJJJIJIFIJJJJJJJJJHHHHGFFFFFFEEEEDEEDDDDDEED@CFFFEDDD?ABB?	AS:i:0	XS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-5	YT:Z:CP",
        "SRR1658224.7462188.16235410-16235625.1.2	83	chr22	16235625	17	101M	=	16235410	-316	TTCAAAAGATGGTATATGCATTAATATTTTCATACAACTTCCAGCTTTTGTTTTTCTTCATTTAATTTTATTTATTTATTTATTTTTGAGATGGAGTCTCG	CBDDDDECEEDEFFFDFFFHHHHHHHJJIIJJIHIHFHGHJJJJJJJGJJJJJIJJJIIJJJJJJJJJJJJJJJJJJJJJJJJJJJJJHHHHHFFFDFCCC	AS:i:-5	XS:i:-39	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:15G85	YS:i:0	YT:Z:CP",
        "SRR1658224.7462188.16235410-16235625.2.2	163	chr22	16235410	17	101M	*	0	0	AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG	CC@FFFFFHHHHHJJJJJJJJJJJJJJJJIJBGIJJJJJJJJJJJJJIJIFIJJJJJJJJJHHHHGFFFFFFEEEEDEEDDDDDEED@CFFFEDDD?ABB?	AS:i:0	XS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-5	YT:Z:CP",

        # Read pair expected to map 2 times, but 1 pair maps to wrong location
        "SRR1658224.31153145.16235410-16235625.1.2	163	chr22	16235410	17	101M	=	16235625	316	AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJJIJFHIJJJJJJJJJJJIJIJJFHIJJJJJJJJHHHHHFFFFFFEDEEEEEDDDDDEED@DEEEEDDDDDDB2	AS:i:0	XS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-2	YT:Z:CP",
        "SRR1658224.31153145.16235410-16235625.1.2	83	chr22	16235625	17	101M	=	16235410	-316	TTCAAAAGATGGTATGTGCATTAATATTTTCATACAACTTCCAGTTTTTGTTTTTCTTCATTTAATTTTATTTATTTATTTATTTTTGAGATGGAGTCTCG	DDDDDDDDEEEEEEFFFFFFHHHHGHHJJIJJJIIJIJIHJHF@(JJJJJJJJJJJJIIIIJJJJJJJIJJJJJJJJJJJJJJJJJJJHHHHHFFFDFCCC	AS:i:-2	XS:i:-36	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:44C56	YS:i:0	YT:Z:CP",
        "SRR1658224.31153145.16235410-16235625.2.2	163	chr22	18235410	17	101M	=	16235625	316	AGATAATTGTCTTATTTTTTTAAAAAAAGAGTAACTTTATATTATGGAATTCATAATATTTGAGACTATAATGCATGACATAAATAGTATAAAGGAGAGAG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJJIJFHIJJJJJJJJJJJIJIJJFHIJJJJJJJJHHHHHFFFFFFEDEEEEEDDDDDEED@DEEEEDDDDDDB2	AS:i:0	XS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-2	YT:Z:CP",
        "SRR1658224.31153145.16235410-16235625.2.2	83	chr22	18235625	17	101M	=	16235410	-316	TTCAAAAGATGGTATGTGCATTAATATTTTCATACAACTTCCAGTTTTTGTTTTTCTTCATTTAATTTTATTTATTTATTTATTTTTGAGATGGAGTCTCG	DDDDDDDDEEEEEEFFFFFFHHHHGHHJJIJJJIIJIJIHJHF@(JJJJJJJJJJJJIIIIJJJJJJJIJJJJJJJJJJJJJJJJJJJHHHHHFFFDFCCC	AS:i:-2	XS:i:-36	XN:i:0	XM:i:1	XO:i:0	XG:i:0	NM:i:1	MD:Z:44C56	YS:i:0	YT:Z:CP",

        # Read pair expected to map 2 times, but does not map at all
        # "SRR1658224.25014179"

        # Read pairs expected to map 1 times, with read-pairs interleaved
        "readpair1.100-200.1.2	163	chr22	100	12	101M	=	200	201	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "readpair2.150-250.1.2	163	chr22	150	12	101M	=	250	201	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "readpair1.100-200.1.2	83	chr22	200	12	101M	=	100	-201	TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT	DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC	AS:i:0	XS:i:-12	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-11	YT:Z:CP",
        "readpair2.150-250.1.2	163	chr22	250	12	101M	=	150	-201	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "readpair1.100-200.2.2	163	chr22	100	12	101M	=	200	201	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "readpair2.150-250.2.2	163	chr22	150	12	101M	=	250	201	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP",
        "readpair1.100-200.2.2	83	chr22	200	12	101M	=	100	-201	TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT	DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC	AS:i:0	XS:i:-12	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:101	YS:i:-11	YT:Z:CP",
        "readpair2.150-250.2.2	163	chr22	250	12	101M	=	150	-201	TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG	CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC	AS:i:-11	XS:i:-17	XN:i:0	XM:i:2	XO:i:0	XG:i:0	NM:i:2	MD:Z:7G44C48	YS:i:0	YT:Z:CP"
    ]

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # write temporary file in SAM format, before converting to BAM;
    # the with-block closes the handle even if a write fails
    sam_filename = data_dir + "/tmp.sam"
    with open(sam_filename, "w") as f:
        write_sam_header(f)
        for line in sam_lines:
            f.write(line + "\n")

    # write to temp bam file.  samtools is invoked with an argument list
    # (no shell), so paths containing spaces or shell metacharacters are
    # passed through verbatim; its stdout is sent straight to the BAM file.
    tmp_bam_filename = data_dir + "/tmp.bam"
    with open(tmp_bam_filename, "wb") as bam_out:
        subprocess.check_call(["samtools", "view", "-b", sam_filename],
                              stdout=bam_out)
    # sort the temp bam file (the sorted output is the tmp.sort.bam file
    # renamed below)
    util.sort_bam(tmp_bam_filename, data_dir + "/tmp")
    # remove temp bam
    os.remove(tmp_bam_filename)
    # rename sorted bam to output bam filename
    os.rename(data_dir + "/tmp.sort.bam", bam_filename)
    def __init__(self, bam_filename, is_sorted, is_paired,
                 output_dir=None, snp_dir=None,
                 snp_tab_filename=None, snp_index_filename=None,
                 haplotype_filename=None, samples=None):
        """Open the input BAM and every output file derived from its name.

        All output names share ``self.prefix``: the output directory joined
        with the input file name minus its final extension.

        Args:
            bam_filename: path of the input BAM file.
            is_sorted: if false, the input is first sorted with
                ``util.sort_bam`` and ``<prefix>.sort.bam`` is read instead.
            is_paired: whether reads are paired-end; selects which FASTQ
                outputs are opened.
            output_dir: directory for outputs; defaults to the directory of
                ``bam_filename``.  Created if it does not exist.
            snp_dir: stored as ``self.snp_dir``.
            snp_tab_filename: if set, this and the next two paths are opened
                read-only as HDF5 files.
            snp_index_filename: see ``snp_tab_filename``.
            haplotype_filename: see ``snp_tab_filename``.
            samples: accepted but not used by this method.
        """
        # flag indicating whether reads are paired-end
        self.is_paired = is_paired

        # prefix for output files
        self.prefix = None

        # name of input BAM filename
        self.bam_filename = bam_filename
        # name of sorted input bam_filename
        # (new file is created if input file is not
        #  already sorted)
        self.bam_sort_filename = None
        # pysam file handle for input BAM
        self.input_bam = None

        # name of output keep and to.remap BAM files
        self.keep_filename = None
        self.remap_filename = None

        # pysam file handles for output BAM filenames
        self.keep_bam = None
        self.remap_bam = None

        # name of output fastq files
        self.fastq_single_filename = None
        self.fastq1_filename = None
        self.fastq2_filename = None
        self.fastq1 = None
        self.fastq2 = None
        self.fastq_single = None

        # name of directory to read SNPs from
        self.snp_dir = snp_dir

        # paths to HDF5 files to read SNP info from
        self.snp_tab_filename = snp_tab_filename
        self.snp_index_filename = snp_index_filename
        self.haplotype_filename = haplotype_filename

        if self.snp_tab_filename:
            self.snp_tab_h5 = tables.open_file(snp_tab_filename, "r")
            self.snp_index_h5 = tables.open_file(snp_index_filename, "r")
            self.hap_h5 = tables.open_file(haplotype_filename, "r")
        else:
            self.snp_tab_h5 = None
            self.snp_index_h5 = None
            self.hap_h5 = None

        # separate input directory and bam filename.  os.path.split gives an
        # empty directory for a bare filename such as "reads.bam", which
        # os.path.join below keeps relative; plain "/"-concatenation turned
        # it into "/reads" and made os.makedirs("") fail.
        bam_dir, filename = os.path.split(self.bam_filename)

        if output_dir is None:
            # if no output dir specified, use same directory as input
            # bam file
            output_dir = bam_dir

        # drop only the final extension ("a.b.bam" -> "a.b"); a name
        # without any "." is used unchanged
        self.prefix = os.path.join(output_dir, filename.rsplit(".", 1)[0])

        # create output dir if it does not exist (an empty output_dir means
        # the current directory, which needs no creation)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # TODO: could allow names of output files to be specified
        # on command line rather than appending name to prefix
        sys.stderr.write("prefix: %s\n" % self.prefix)

        if not is_sorted:
            util.sort_bam(self.bam_filename, self.prefix)
            self.bam_sort_filename = self.prefix + ".sort.bam"
        else:
            self.bam_sort_filename = self.bam_filename

        self.keep_filename = self.prefix + ".keep.bam"
        self.remap_filename = self.prefix + ".to.remap.bam"

        sys.stderr.write("reading reads from:\n  %s\n" %
                         self.bam_sort_filename)

        sys.stderr.write("writing output files to:\n")

        if self.is_paired:
            self.fastq1_filename = self.prefix + ".remap.fq1.gz"
            self.fastq2_filename = self.prefix + ".remap.fq2.gz"
            self.fastq1 = gzip.open(self.fastq1_filename, "wt")
            self.fastq2 = gzip.open(self.fastq2_filename, "wt")
            self.fastq_single_filename = self.prefix + ".remap.single.fq.gz"
            self.fastq_single = gzip.open(self.fastq_single_filename, "wt")
            sys.stderr.write("  %s\n  %s\n  %s\n" %
                             (self.fastq1_filename,
                              self.fastq2_filename,
                              self.fastq_single_filename))

        else:
            self.fastq_single_filename = self.prefix + ".remap.fq.gz"
            self.fastq_single = gzip.open(self.fastq_single_filename, "wt")
            sys.stderr.write("  %s\n" % (self.fastq_single_filename))

        # NOTE(review): modes "r"/"w" carry no "b" suffix although the file
        # names end in .bam; in pysam "w" may select SAM text output --
        # confirm whether "rb"/"wb" is intended before changing.
        self.input_bam = pysam.Samfile(self.bam_sort_filename, "r")
        self.keep_bam = pysam.Samfile(self.keep_filename, "w",
                                      template=self.input_bam)
        self.remap_bam = pysam.Samfile(self.remap_filename, "w",
                                       template=self.input_bam)
        sys.stderr.write("  %s\n  %s\n" % (self.keep_filename,
                                           self.remap_filename))