Example #1
 def open(self):
     """
     Open the output file for writing, using output_prefix directly as the file name.
     Create directories as needed.
     """
     if self.isOpen:
         self.close()
     try:
         misc.make_sure_path_exists(os.path.dirname(self.output_prefix))
         self.R1f = open(self.output_prefix, "a")
     except Exception:
         sys.stderr.write(
             "ERROR:[IlluminaFastaOutput] Cannot write reads to file with prefix: %s\n" % self.output_prefix
         )
         raise
     self.isOpen = True
     return 0
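
Both this example and the next call misc.make_sure_path_exists(), which is not shown here. A minimal sketch of what such a helper typically looks like, assuming it simply wraps os.makedirs and tolerates directories that already exist:

    import errno
    import os

    def make_sure_path_exists(path):
        # Sketch of the assumed misc.make_sure_path_exists helper.
        # os.path.dirname(output_prefix) is "" when the prefix has no
        # directory component, so treat the empty string as a no-op.
        if path == "":
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise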
Example #2
 def open(self):
     """
     Open the read file for writing, appending .fastq to the output_prefix.
     Create directories as needed.
     """
     if self.isOpen:
         self.close()
     try:
         misc.make_sure_path_exists(os.path.dirname(self.output_prefix))
         if self.uncompressed:
             self.R1f = open(self.output_prefix + ".fastq", "a")
         else:
             self.R1f = gzip.open(self.output_prefix + ".fastq.gz", "ab")
     except Exception:
         sys.stderr.write(
             "ERROR:[IlluminaOneReadOutput] Cannot write reads to file with prefix: %s\n" % self.output_prefix
         )
         raise
     self.isOpen = True
     return 0
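
A note on the compressed branch: gzip.open(..., "ab") starts a new gzip member each time the file is reopened, and standard tools such as zcat decompress concatenated members as a single stream, so repeated open/close cycles still yield one readable .fastq.gz. The same branching as a standalone sketch (the helper name is hypothetical):

    import gzip

    def open_fastq_for_append(output_prefix, uncompressed):
        # Hypothetical helper mirroring the branch above.
        if uncompressed:
            return open(output_prefix + ".fastq", "a")  # plain text, one growing file
        # "ab" appends a new gzip member on each reopen; readers see one stream
        return gzip.open(output_prefix + ".fastq.gz", "ab")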
Example #3
    def start(
        self,
        fastq_file1,
        fastq_file2,
        fastq_file3,
        reference,
        insertionSite,
        overwrite,
        sensitivity,
        output_prefix,
        minins,
        maxins,
        procs,
        mapq,
        dedup_reads=True,
        uncompressed=False,
        verbose=True,
        debug=False,
    ):
        """
            screen reads against a reference fasta file
        """
        try:
            mapped_pairs_count = 0
            mapped_pairs_lowqual = 0
            unmapped_pairs_count = 0
            mapped_singles_count = 0
            mapped_singles_lowqual = 0
            unmapped_singles_count = 0
            secondary_alignment = 0
            missing_IS = 0  # reads missing the insertion site
            duplicate_count = 0  # duplicates (of mapped reads only)
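            # count tracks occurrences of each unique alignment key (used for deduplication)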
            count = Counter()

            # Set up output
            misc.make_sure_path_exists(os.path.dirname(output_prefix))
            run_out = {}
            run_out["mappedsam"] = open(output_prefix + ".sam", "w")

            read_profile = open(output_prefix + ".TnRead_profile.txt", "w")
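            # profile columns: reads processed, unique mapped, mapped including duplicates,
            # distinct forward-strand sites, distinct reverse-strand sites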

            # 0x1 template having multiple segments in sequencing
            # 0x2 each segment properly aligned according to the aligner
            # 0x4 segment unmapped
            # 0x8 next segment in the template unmapped
            # 0x10 SEQ being reverse complemented
            # 0x20 SEQ of the next segment in the template being reversed
            # 0x40 the first segment in the template
            # 0x80 the last segment in the template
            # 0x100 secondary alignment
            # 0x200 not passing quality controls
            # 0x400 PCR or optical duplicate
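            # PE1/PE2 buffer alignments by query name until the mate is seen,
            # since mates are not guaranteed to arrive on consecutive SAM lines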
            PE1 = {}
            PE2 = {}

            # Read in reference and identify all potential insertion sites
            site_counts_F = {}
            site_counts_R = {}
            sites = getISites(reference, insertionSite)
            for contig in sites.keys():
                sys.stdout.write("Found %s insertion sites for contig %s\n" % (len(sites[contig]), contig))
                site_counts_F[contig] = Counter()
                site_counts_R[contig] = Counter()

            i = 0
            lasttime = time.time()

            for line in sp_bowtie2_screen(
                fastq_file1, fastq_file2, fastq_file3, reference, overwrite, sensitivity, procs, minins, maxins
            ):
                if i % 10000 == 0 and i > 0:
                    read_profile.write(
                        "%s\t%s\t%s\t%s\t%s\n"
                        % (
                            i,
                            (mapped_pairs_count + mapped_singles_count),
                            (mapped_pairs_count + mapped_singles_count + duplicate_count),
                            sum(len(site_counts_F[c]) for c in site_counts_F.keys()),
                            sum(len(site_counts_R[c]) for c in site_counts_R.keys()),
                        )
                    )
                if i % 100000 == 0 and i > 0 and verbose:
                    sys.stderr.write(
                        "Processed: %s, PE in ref: %s, SE in ref: %s in %s minutes\n"
                        % (i, mapped_pairs_count, mapped_singles_count, round((time.time() - lasttime) / (60), 2))
                    )
                if line[0] == "@":  # header line
                    # write out to sam
                    run_out["mappedsam"].write(line)
                else:
                    i += 1

                    line2 = line.strip().split()
                    flag = int(line2[1])
                    # Secondary alignment
                    if flag & 0x100:
                        secondary_alignment += 1
                        continue

                    # check for the insertion site before any further processing
                    if (not (flag & 0x1)) or (flag & 0x40):  # SE read, or read1 of a pair
                        if flag & 0x10:
                            IS = reverseComplement(line2[9])[0 : len(insertionSite)] == insertionSite
                        else:
                            IS = line2[9][0 : len(insertionSite)] == insertionSite
                        if not IS:
                            missing_IS += 1
                            continue

                    # Handle SE:
                    # SE reads have 0x1 unset; 0x4 (the third bit) is set when the read is unmapped
                    if not (flag & 0x1):  # SE READ
                        if not (flag & 0x4):  # MAPPED
                            if int(line2[4]) >= mapq:  # check mapq
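                                # getUniqueKey (helper defined elsewhere) returns a "_*_"-delimited
                                # key; per the splits below: [1]=strand (F/R), [2]=contig, [3]=site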
                                key = getUniqueKey(line2)
                                if dedup_reads:
                                    count[key] += 1
                                    if count[key] == 1:
                                        if key.split("_*_")[1] == "F":
                                            site_counts_F[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                        else:
                                            site_counts_R[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                        mapped_singles_count += 1
                                        run_out["mappedsam"].write("\t".join(line2) + "\n")
                                    else:
                                        duplicate_count += 1
                                else:
                                    if key.split("_*_")[1] == "F":
                                        site_counts_F[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                    else:
                                        site_counts_R[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                    mapped_singles_count += 1
                                    run_out["mappedsam"].write("\t".join(line2) + "\n")
                            else:  # MAPPED BUT LOW QUAL
                                mapped_singles_lowqual += 1
                        else:  # UNMAPPED
                            unmapped_singles_count += 1
                        continue
                    # Handle PE:
                    # logic:  0x1 = multiple segments in sequencing,   0x4 = segment unmapped,  0x8 = next segment unmapped
                    if flag & 0x1:  # PE READ
                        if (not (flag & 0x4) and not (flag & 0x8)) and (flag & 0x2):  # both mates mapped and properly paired
                            if int(line2[4]) >= mapq:  # check mapq
                                if flag & 0x40:  # is this PE1 (first segment in template)
                                    # PE1 read, check that PE2 is in dict and write out
                                    ID = line2[0]
                                    if ID in PE2:
                                        if flag & 0x10:  # read on the reverse strand; mate forward, so 0x20 is unset
                                            line2[1] = str(
                                                flag - 0x1 - 0x2 - 0x40
                                            )  # strip pairing flags so read1 is written as single-end
                                        else:  # read on the forward strand; a proper pair implies the mate is reverse (0x20 set)
                                            line2[1] = str(
                                                flag - 0x1 - 0x2 - 0x20 - 0x40
                                            )  # strip pairing flags so read1 is written as single-end
                                        key = getUniqueKey(line2, PE2[ID])
                                        if dedup_reads:
                                            count[key] += 1
                                            if count[key] == 1:
                                                if key.split("_*_")[1] == "F":
                                                    site_counts_F[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                                else:
                                                    site_counts_R[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                                mapped_pairs_count += 1
                                                run_out["mappedsam"].write("\t".join(line2) + "\n")
                                            else:
                                                duplicate_count += 1
                                        else:
                                            if key.split("_*_")[1] == "F":
                                                site_counts_F[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                            else:
                                                site_counts_R[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                            mapped_pairs_count += 1
                                            run_out["mappedsam"].write("\t".join(line2) + "\n")

                                        del PE2[ID]
                                    else:
                                        PE1[ID] = line2
                                elif flag & 0x80:  # is this PE2 (last segment in template)
                                    # PE2 read, check that PE1 is in dict and write out
                                    ID = line2[0]
                                    if ID in PE1:
                                        if int(PE1[ID][1]) & 0x10:  # read1 on the reverse strand; mate forward, so 0x20 is unset
                                            PE1[ID][1] = str(
                                                int(PE1[ID][1]) - 0x1 - 0x2 - 0x40
                                            )  # strip pairing flags so read1 is written as single-end
                                        else:  # read1 on the forward strand; a proper pair implies the mate is reverse (0x20 set)
                                            PE1[ID][1] = str(
                                                int(PE1[ID][1]) - 0x1 - 0x2 - 0x20 - 0x40
                                            )  # strip pairing flags so read1 is written as single-end
                                        key = getUniqueKey(PE1[ID], line2)
                                        if dedup_reads:
                                            count[key] += 1
                                            if count[key] == 1:
                                                if key.split("_*_")[1] == "F":
                                                    site_counts_F[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                                else:
                                                    site_counts_R[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                                mapped_pairs_count += 1
                                                run_out["mappedsam"].write("\t".join(PE1[ID]) + "\n")
                                            else:
                                                duplicate_count += 1
                                        else:
                                            if key.split("_*_")[1] == "F":
                                                site_counts_F[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                            else:
                                                site_counts_R[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                            mapped_pairs_count += 1
                                            run_out["mappedsam"].write("\t".join(PE1[ID]) + "\n")

                                        del PE1[ID]
                                    else:
                                        PE2[ID] = line2
                            else:
                                mapped_pairs_lowqual += 1
                        else:  # an 'unmapped' pair (both mates unmapped, one mate unmapped, or both mapped but discordant)
                            unmapped_pairs_count += 1

            sys.stderr.write(
                "Processed: %s, PE in ref: %s, SE in ref: %s in %s minutes\n"
                % (i, mapped_pairs_count, mapped_singles_count, round((time.time() - lasttime) / (60), 2))
            )

            read_profile.write(
                "%s\t%s\t%s\t%s\t%s\n"
                % (
                    i,
                    (mapped_pairs_count + mapped_singles_count),
                    (mapped_pairs_count + mapped_singles_count + duplicate_count),
                    sum(len(site_counts_F[c]) for c in site_counts_F.keys()),
                    sum(len(site_counts_R[c]) for c in site_counts_R.keys()),
                )
            )
            read_profile.close()

            site_output = open(output_prefix + ".sites", "w")
            wig_output = open(output_prefix + ".wig", "w")
            wig_output.write(
                "# output:%s fastq1:%s fastq2:%s fastq3:%s\n" % (output_prefix, fastq_file1, fastq_file2, fastq_file3)
            )
            for contig in sites.keys():
                wig_output.write("variableStep chrom=%s\n" % contig)
                for site in sites[contig]:
                    site_output.write(
                        "%s\t%s\t%s\t%s\n" % (contig, site, site_counts_F[contig][site], site_counts_R[contig][site])
                    )
                    wig_output.write(
                        "%s %s\n" % (site, str(int(site_counts_F[contig][site]) + int(site_counts_R[contig][site])))
                    )
            site_output.close()
            wig_output.close()

            if debug:
                data = open("reverse.txt", "w")
                for contig in sites.keys():
                    countF = 0
                    sitesF = 0
                    countR = 0
                    sitesR = 0
                    for site in site_counts_F[contig].keys():
                        countF += site_counts_F[contig][site]
                        sitesF += 1
                    for site in site_counts_R[contig].keys():
                        countR += site_counts_R[contig][site]
                        data.write("%s\t%s\n" % (site, site_counts_R[contig][site]))
                        sitesR += 1
                    print(countF)
                    print(sitesF)
                    print(countR)
                    print(sitesR)
                data.close()

            sys.stdout.write("total records: %s\n" % i)
            sys.stdout.write("secondary alignments: %s\n" % secondary_alignment)
            sys.stdout.write("pairs: %s\n" % (mapped_pairs_count + mapped_pairs_lowqual + unmapped_pairs_count))
            sys.stdout.write("\tmapped pairs: %s\n" % mapped_pairs_count)
            sys.stdout.write("\tlowqual pairs: %s\n" % mapped_pairs_lowqual)
            sys.stdout.write("\tunmapped pairs: %s\n" % unmapped_pairs_count)
            sys.stdout.write("singles: %s\n" % (mapped_singles_count + mapped_singles_lowqual + unmapped_singles_count))
            sys.stdout.write("\tmapped singles: %s\n" % mapped_singles_count)
            sys.stdout.write("\tlowqual singles: %s\n" % mapped_singles_lowqual)
            sys.stdout.write("\tunmapped singles: %s\n" % unmapped_singles_count)
            sys.stdout.write("missing insertion site: %s\n" % missing_IS)
            sys.stdout.write("PCR duplicate count: %s\n" % duplicate_count)

            self.clean()
            return 0

        except (KeyboardInterrupt, SystemExit):
            self.clean()
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean()
            if debug:
                sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
            else:
                sys.stderr.write("A fatal error was encountered; try turning on debug.\n")
            return 1
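
getISites, getUniqueKey, reverseComplement, and sp_bowtie2_screen are helpers defined elsewhere in the module. For orientation, a hedged sketch of what getISites plausibly does, assuming it scans the forward strand of each contig in the reference FASTA for every occurrence of the insertion-site motif and returns {contig: [positions]} with 1-based coordinates (matching the variableStep wig records written above):

    def getISites(reference, insertionSite):
        # Sketch only; the real implementation is not shown in this example.
        seqs = {}
        contig = None
        with open(reference) as fh:
            for line in fh:
                line = line.strip()
                if line.startswith(">"):
                    contig = line[1:].split()[0]
                    seqs[contig] = []
                elif contig is not None:
                    seqs[contig].append(line.upper())
        sites = {}
        for contig, chunks in seqs.items():
            seq = "".join(chunks)
            positions = []
            start = seq.find(insertionSite)
            while start != -1:
                positions.append(start + 1)  # 1-based, as the wig output expects
                start = seq.find(insertionSite, start + 1)
            sites[contig] = positions
        return sites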