def open(self):
    """
    Open the read file for writing, appending .fasta to the output_prefix.
    Create directories as needed.
    """
    if self.isOpen:
        self.close()
    try:
        misc.make_sure_path_exists(os.path.dirname(self.output_prefix))
        self.R1f = open(self.output_prefix + ".fasta", "a")
    except:
        sys.stderr.write(
            "ERROR:[IlluminaFastaOutput] Cannot write reads to file with prefix: %s\n" % self.output_prefix
        )
        raise
    self.isOpen = True
    return 0
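
# A minimal sketch (assumption: misc.make_sure_path_exists is defined elsewhere
# and not shown in this section) of the usual idiom such a helper implements --
# create the directory tree for an output prefix, tolerating the case where it
# already exists. The name "_sketch" marks it as illustrative, not the module's
# actual implementation.
import errno
import os


def make_sure_path_exists_sketch(path):
    """Create path (and parents) if missing; ignore an already-existing directory."""
    if path == "":  # os.path.dirname() returns "" for bare filenames
        return
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
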
def open(self):
    """
    Open the read file for writing, appending .fastq to the output_prefix.
    Create directories as needed.
    """
    if self.isOpen:
        self.close()
    try:
        misc.make_sure_path_exists(os.path.dirname(self.output_prefix))
        if self.uncompressed is True:
            self.R1f = open(self.output_prefix + ".fastq", "a")
        else:
            self.R1f = gzip.open(self.output_prefix + ".fastq.gz", "ab")
    except:
        sys.stderr.write(
            "ERROR:[IlluminaOneReadOutput] Cannot write reads to file with prefix: %s\n" % self.output_prefix
        )
        raise
    self.isOpen = True
    return 0
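
# Usage sketch: opening with gzip.open(..., "ab") appends a new gzip member to
# the file, and standard gzip readers treat concatenated members as one stream,
# so appended FASTQ records decompress in order. Illustrative only; the "demo"
# path below is a placeholder, not part of this module.
import gzip

with gzip.open("demo.fastq.gz", "ab") as out:
    out.write(b"@read1\nACGT\n+\nIIII\n")
with gzip.open("demo.fastq.gz", "rb") as back:
    print(back.read().decode())
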
def start(
    self,
    fastq_file1,
    fastq_file2,
    fastq_file3,
    reference,
    insertionSite,
    overwrite,
    sensitivity,
    output_prefix,
    minins,
    maxins,
    procs,
    mapq,
    dedup_reads=True,
    uncompressed=False,
    verbose=True,
    debug=False,
):
    """
    Screen reads against a reference fasta file.
    """
    try:
        mapped_pairs_count = 0
        mapped_pairs_lowqual = 0
        unmapped_pairs_count = 0
        mapped_singles_count = 0
        mapped_singles_lowqual = 0
        unmapped_singles_count = 0
        secondary_alignment = 0
        missing_IS = 0  # reads missing the insertion site
        duplicate_count = 0  # duplicates (of mapped reads only)
        count = Counter()

        # Set up output
        misc.make_sure_path_exists(os.path.dirname(output_prefix))
        run_out = {}
        run_out["mappedsam"] = open(output_prefix + ".sam", "w")
        read_profile = open(output_prefix + ".TnRead_profile.txt", "w")

        # SAM flag bits:
        # 0x1   template having multiple segments in sequencing
        # 0x2   each segment properly aligned according to the aligner
        # 0x4   segment unmapped
        # 0x8   next segment in the template unmapped
        # 0x10  SEQ being reverse complemented
        # 0x20  SEQ of the next segment in the template being reversed
        # 0x40  the first segment in the template
        # 0x80  the last segment in the template
        # 0x100 secondary alignment
        # 0x200 not passing quality controls
        # 0x400 PCR or optical duplicate

        PE1 = {}
        PE2 = {}

        # Read in reference and identify all potential insertion sites
        site_counts_F = {}
        site_counts_R = {}
        sites = getISites(reference, insertionSite)
        for contig in sites.keys():
            sys.stdout.write("Found %s insertion sites for contig %s\n" % (len(sites[contig]), contig))
            site_counts_F[contig] = Counter()
            site_counts_R[contig] = Counter()

        i = 0
        lasttime = time.time()
        for line in sp_bowtie2_screen(
            fastq_file1, fastq_file2, fastq_file3, reference, overwrite, sensitivity, procs, minins, maxins
        ):
            if i % 10000 == 0 and i > 0:
                read_profile.write(
                    "%s\t%s\t%s\t%s\t%s\n"
                    % (
                        i,
                        (mapped_pairs_count + mapped_singles_count),
                        (mapped_pairs_count + mapped_singles_count + duplicate_count),
                        sum(len(site_counts_F[c]) for c in site_counts_F.keys()),
                        sum(len(site_counts_R[c]) for c in site_counts_R.keys()),
                    )
                )
            if i % 100000 == 0 and i > 0 and verbose:
                sys.stderr.write(
                    "Processed: %s, PE in ref: %s, SE in ref: %s in %s minutes\n"
                    % (i, mapped_pairs_count, mapped_singles_count, round((time.time() - lasttime) / (60), 2))
                )

            if line[0] == "@":  # header line
                # write out to sam
                run_out["mappedsam"].write(line)
            else:
                i += 1
                line2 = line.strip().split()
                flag = int(line2[1])
                # Secondary alignment
                if flag & 0x100:
                    secondary_alignment += 1
                    continue

                # Check for the insertion site before any further processing
                if (not (flag & 0x1)) or (flag & 0x40):
                    if flag & 0x10:
                        IS = reverseComplement(line2[9])[0:2] == insertionSite
                    else:
                        IS = line2[9][0:2] == insertionSite
                    if not IS:
                        missing_IS += 1
                        continue

                # Handle SE:
                # SE reads have 0x1 unset; a mapped SE read additionally has 0x4 (segment unmapped) unset
                if not (flag & 0x1):  # SE READ
                    if not (flag & 0x4):  # MAPPED
                        if int(line2[4]) >= mapq:  # check mapq
                            key = getUniqueKey(line2)
                            if dedup_reads:
                                count[key] += 1
                                if count[key] == 1:
                                    if key.split("_*_")[1] == "F":
                                        site_counts_F[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                    else:
                                        site_counts_R[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                    mapped_singles_count += 1
                                    run_out["mappedsam"].write("\t".join(line2) + "\n")
                                else:
                                    duplicate_count += 1
                            else:
                                if key.split("_*_")[1] == "F":
                                    site_counts_F[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                else:
                                    site_counts_R[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                mapped_singles_count += 1
                                run_out["mappedsam"].write("\t".join(line2) + "\n")
                        else:  # MAPPED BUT LOW QUAL
                            mapped_singles_lowqual += 1
                    else:  # UNMAPPED
                        unmapped_singles_count += 1
                    continue

                # Handle PE:
                # logic: 0x1 = multiple segments in sequencing, 0x4 = segment unmapped, 0x8 = next segment unmapped
                if flag & 0x1:  # PE READ
                    if (not (flag & 0x4) and not (flag & 0x8)) and (flag & 0x2):
                        # both reads of the pair mapped and concordant
                        if int(line2[4]) >= mapq:  # check mapq
                            if flag & 0x40:  # is this PE1 (first segment in template)
                                # PE1 read, check that PE2 is in dict and write out
                                ID = line2[0]
                                if ID in PE2:
                                    if flag & 0x10:  # reverse strand
                                        line2[1] = str(
                                            flag - 0x1 - 0x2 - 0x40
                                        )  # modify read1 flag (remove read2 assoc flags)
                                    else:  # forward strand
                                        line2[1] = str(
                                            flag - 0x1 - 0x2 - 0x20 - 0x40
                                        )  # modify read1 flag (remove read2 assoc flags)
                                    key = getUniqueKey(line2, PE2[ID])
                                    if dedup_reads:
                                        count[key] += 1
                                        if count[key] == 1:
                                            if key.split("_*_")[1] == "F":
                                                site_counts_F[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                            else:
                                                site_counts_R[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                            mapped_pairs_count += 1
                                            run_out["mappedsam"].write("\t".join(line2) + "\n")
                                        else:
                                            duplicate_count += 1
                                    else:
                                        if key.split("_*_")[1] == "F":
                                            site_counts_F[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                        else:
                                            site_counts_R[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                        mapped_pairs_count += 1
                                        run_out["mappedsam"].write("\t".join(line2) + "\n")
                                    del PE2[ID]
                                else:
                                    PE1[ID] = line2
                            elif flag & 0x80:  # is this PE2 (last segment in template)
                                # PE2 read, check that PE1 is in dict and write out
                                ID = line2[0]
                                if ID in PE1:
                                    if int(PE1[ID][1]) & 0x10:  # reverse strand
                                        PE1[ID][1] = str(
                                            int(PE1[ID][1]) - 0x1 - 0x2 - 0x40
                                        )  # modify read1 flag (remove read2 assoc flags)
                                    else:  # forward strand
                                        PE1[ID][1] = str(
                                            int(PE1[ID][1]) - 0x1 - 0x2 - 0x20 - 0x40
                                        )  # modify read1 flag (remove read2 assoc flags)
                                    key = getUniqueKey(PE1[ID], line2)
                                    if dedup_reads:
                                        count[key] += 1
                                        if count[key] == 1:
                                            if key.split("_*_")[1] == "F":
                                                site_counts_F[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                            else:
                                                site_counts_R[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                            mapped_pairs_count += 1
                                            run_out["mappedsam"].write("\t".join(PE1[ID]) + "\n")
                                        else:
                                            duplicate_count += 1
                                    else:
                                        if key.split("_*_")[1] == "F":
                                            site_counts_F[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                        else:
                                            site_counts_R[key.split("_*_")[2]][key.split("_*_")[3]] += 1
                                        mapped_pairs_count += 1
                                        run_out["mappedsam"].write("\t".join(PE1[ID]) + "\n")
                                    del PE1[ID]
                                else:
                                    PE2[ID] = line2
                        else:
                            mapped_pairs_lowqual += 1
                    else:
                        # an 'unmapped' pair (both unmapped, one of the pair unmapped, or both mapped but discordant)
                        unmapped_pairs_count += 1

        sys.stderr.write(
            "Processed: %s, PE in ref: %s, SE in ref: %s in %s minutes\n"
            % (i, mapped_pairs_count, mapped_singles_count, round((time.time() - lasttime) / (60), 2))
        )
        read_profile.write(
            "%s\t%s\t%s\t%s\t%s\n"
            % (
                i,
                (mapped_pairs_count + mapped_singles_count),
                (mapped_pairs_count + mapped_singles_count + duplicate_count),
                sum(len(site_counts_F[c]) for c in site_counts_F.keys()),
                sum(len(site_counts_R[c]) for c in site_counts_R.keys()),
            )
        )
        read_profile.close()

        site_output = open(output_prefix + ".sites", "w")
        wig_output = open(output_prefix + ".wig", "w")
        wig_output.write(
            "# output:%s fastq1:%s fastq2:%s fastq3:%s\n" % (output_prefix, fastq_file1, fastq_file2, fastq_file3)
        )
        for contig in sites.keys():
            wig_output.write("variableStep chrom=%s\n" % contig)
            for site in sites[contig]:
                site_output.write(
                    "%s\t%s\t%s\t%s\n" % (contig, site, site_counts_F[contig][site], site_counts_R[contig][site])
                )
                wig_output.write(
                    "%s %s\n" % (site, str(int(site_counts_F[contig][site]) + int(site_counts_R[contig][site])))
                )
        site_output.close()
        wig_output.close()

        if debug:
            data = open("reverse.txt", "w")
            for contig in sites.keys():
                countF = 0
                sitesF = 0
                countR = 0
                sitesR = 0
                for site in site_counts_F[contig].keys():
                    countF += site_counts_F[contig][site]
                    sitesF += 1
                for site in site_counts_R[contig].keys():
                    countR += site_counts_R[contig][site]
                    data.write("%s\t%s\n" % (site, site_counts_R[contig][site]))
                    sitesR += 1
                print(countF)
                print(sitesF)
                print(countR)
                print(sitesR)
            data.close()

        sys.stdout.write("total records: %s\n" % i)
        sys.stdout.write("secondary alignments: %s\n" % secondary_alignment)
        sys.stdout.write("pairs: %s\n" % (mapped_pairs_count + mapped_pairs_lowqual + unmapped_pairs_count))
        sys.stdout.write("\tmapped pairs: %s\n" % mapped_pairs_count)
        sys.stdout.write("\tlowqual pairs: %s\n" % mapped_pairs_lowqual)
        sys.stdout.write("\tunmapped pairs: %s\n" % unmapped_pairs_count)
        sys.stdout.write("singles: %s\n" % (mapped_singles_count + mapped_singles_lowqual + unmapped_singles_count))
        sys.stdout.write("\tmapped singles: %s\n" % mapped_singles_count)
        sys.stdout.write("\tlowqual singles: %s\n" % mapped_singles_lowqual)
        sys.stdout.write("\tunmapped singles: %s\n" % unmapped_singles_count)
        sys.stdout.write("missing insertion site: %s\n" % missing_IS)
        sys.stdout.write("PCR duplicate count: %s\n" % duplicate_count)
        self.clean()
        return 0
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except:
        self.clean()
        if not debug:
            sys.stderr.write("A fatal error was encountered. Try turning on debug.\n")
        if debug:
            sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
        return 1
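
# Hypothetical sketch: getUniqueKey() is defined elsewhere and not shown in
# this section. The parsing above (key.split("_*_")[1] as strand "F"/"R",
# [2] as contig, [3] as site) implies a "_*_"-delimited key of roughly the
# form built below; the first field and the exact position/length arithmetic
# are assumptions for illustration only, not the module's actual code.
def getUniqueKey_sketch(read1, read2=None):
    """Build a dedup key from split SAM fields: length-like field, strand, contig, position."""
    flag = int(read1[1])
    strand = "R" if flag & 0x10 else "F"
    contig = read1[2]  # SAM RNAME
    pos = read1[3]  # SAM POS, leftmost 1-based mapping position
    # single read: use read length; pair: use absolute template length (SAM TLEN)
    length = len(read1[9]) if read2 is None else abs(int(read1[8]))
    return "_*_".join([str(length), strand, contig, pos])
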