def main():
    """Run ``hashdmp`` with each bmftools build and check its first two records.

    For every executable the command is run on ``hashdmp_test.fq`` and the
    tags of the first two records of ``hashdmp_test.out`` are asserted.

    :raises subprocess.CalledProcessError: if a bmftools command fails.
    :raises AssertionError: if a record does not carry the expected tags.
    """
    for ex in ["bmftools_db", "bmftools", "bmftools_p"]:
        cstr = "../../%s hashdmp -o hashdmp_test.out hashdmp_test.fq" % ex
        subprocess.check_call(shlex.split(cstr))
        # The context manager closes the handle before the next build
        # overwrites the output file.
        with pysam.FastqFile("hashdmp_test.out") as fqh:
            # The next() builtin works on both Python 2.6+ and Python 3, so
            # no try/except fallback around ``fqh.next()`` is needed.
            r1 = next(fqh)
            tags = get_tags(r1)
            assert tags["FM"] == 7
            assert round(tags["NF"], 2) == 0.14
            assert tags["RV"] == 2
            assert tags["DR"]
            assert len(r1.name) == 16

            r1 = next(fqh)
            tags = get_tags(r1)
            assert tags["FM"] == 1
            assert tags["FP"] == 0
            assert tags["DR"] == 0
    return
def test_mean_qscore(read_fastq_table, read_fastq_file):
    """Check the mean qscore against that produced by seqkit.

    :param read_fastq_table: tab-separated table with ``read_id`` and
        ``mean_qscore`` columns.
    :param read_fastq_file: FASTQ file whose reads are scored.
    """
    table = pd.read_csv(
        read_fastq_table,
        sep="\t",
        usecols=["read_id", "mean_qscore"],
        index_col="read_id",
    )
    # ``read_csv(squeeze=True)`` was removed in pandas 2.0; squeezing the
    # single remaining column yields the same read_id -> mean_qscore Series.
    expected = table.squeeze("columns")
    with pysam.FastqFile(read_fastq_file) as reads:
        for rec in reads:
            qscore = mean_qscore(rec.get_quality_array())
            assert np.around(qscore, 2) == expected[rec.name]
def main():
    """Run ``collapse inline`` with each bmftools build and check every read.

    Each executable is run on the marksplit test FASTQ pair, after which
    ``check_bc`` is applied to every record of the resulting R1 FASTQ.

    :return: 0 once all builds have been checked.
    """
    for binary in ("bmftools_db", "bmftools", "bmftools_p"):
        command = (
            "../../%s collapse inline -wn0 -sTGACT -t%i -o marksplit_test_tmp -l 10 "
            "-v 11 marksplit_test.R1.fq marksplit_test.R2.fq"
            % (binary, mm_threshold))
        argv = shlex.split(command)
        subprocess.check_call(argv)
        records = pysam.FastqFile("marksplit_test_tmp.tmp.0.R1.fastq")
        for record in records:
            check_bc(record)
    return 0
def add_edges_fastq(self):
    """Add an edge for every (k+1)-mer of every read, and for its complement.

    Reads come from the FASTQ file named by ``self.sequence``; the
    complementary sequence is obtained from ``self.get_complementary_sequence``.
    """
    with pysam.FastqFile(self.sequence) as reads:
        for record in reads:
            bases = record.sequence
            n_windows = len(bases) - self.k
            for start in range(n_windows):
                stop = start + self.k + 1
                forward = bases[start:stop]
                reverse = self.get_complementary_sequence(forward)
                self.add_edge(forward)
                self.add_edge(reverse)
def test_output(self):
    """Check that the output fastq holds the expected sequences and scores."""
    expected = {
        'input1': ('AGTGCTCA', (1, 1, 3, 2, 1, 1, 5, 1)),
        'input2': ('ACTC', (3, 1, 3, 4)),
    }
    with pysam.FastqFile(self.args.output) as output_handle:
        # Map each read name to its (sequence, quality values) pair.
        got = {
            read.name: (read.sequence, tuple(read.get_quality_array()))
            for read in output_handle
        }
    self.assertEqual(expected, got)
def add_vertices_fastq(self):
    """Add a vertex for every k-mer of every read, and for its complement.

    Reads come from the FASTQ file named by ``self.sequence``.  K-mers that
    are already in ``self.vertices`` are skipped.
    """
    with pysam.FastqFile(self.sequence) as reads:
        for record in reads:
            bases = record.sequence
            n_windows = len(bases) - self.k + 1
            for start in range(n_windows):
                forward = bases[start:start + self.k]
                reverse = self.get_complementary_sequence(forward)
                if forward in self.vertices:
                    continue
                self.add_vertex(forward)
                self.add_vertex(reverse)
def main(argv=sys.argv):
    """Write the name and sequence length of every read in a FASTQ file.

    The input file is taken from ``-i/--input-fastq-file``; a single
    positional argument, when present, replaces it.  One tab-separated
    ``name, length`` line per read goes to ``options.stdout``.

    :param argv: command line arguments, handed to ``E.start``.
    :raises ValueError: if no input FASTQ file was given.
    :raises NotImplementedError: unless the methods are exactly ``["length"]``.
    """
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"],
    )
    parser.add_option(
        "-i", "--input-fastq-file",
        dest="input_fastq_file",
        type="string",
        help="input fastq file. "
        "[%default]",
    )
    parser.add_option(
        "-m", "--method",
        dest="methods",
        action="append",
        type="choice",
        choices=("length", ),
        help="methods to apply [%default]",
    )
    parser.set_defaults(methods=[], input_fastq_file=None)

    options, positional = E.start(parser, argv)

    if len(positional) == 1:
        options.input_fastq_file = positional[0]
    if options.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    tally = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if options.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(options.input_fastq_file) as fastq:
        for record in fastq:
            tally.input += 1
            fields = (record.name, len(record.sequence))
            row = "\t".join(str(field) for field in fields)
            options.stdout.write(row + "\n")
            tally.output += 1

    E.info(tally)
    E.stop()
def main(argv=sys.argv):
    """Write the name and sequence length of every read in a FASTQ file.

    The input file is taken from ``-i/--input-fastq-file``; a single
    unrecognised argument, when present, replaces it.  One tab-separated
    ``name, length`` line per read goes to ``args.stdout``.

    :param argv: command line arguments, handed to ``E.start``.
    :raises ValueError: if no input FASTQ file was given.
    :raises NotImplementedError: unless the methods are exactly ``["length"]``.
    """
    parser = E.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-i", "--input-fastq-file",
        dest="input_fastq_file",
        type=str,
        help="input fastq file. ",
    )
    parser.add_argument(
        "-m", "--method",
        dest="methods",
        action="append",
        type=str,
        choices=("length", ),
        help="methods to apply ",
    )
    parser.set_defaults(methods=[], input_fastq_file=None)

    args, extras = E.start(parser, argv, unknowns=True)

    if len(extras) == 1:
        args.input_fastq_file = extras[0]
    if args.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    tally = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if args.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(args.input_fastq_file) as fastq:
        for record in fastq:
            tally.input += 1
            fields = (record.name, len(record.sequence))
            row = "\t".join(str(field) for field in fields)
            args.stdout.write(row + "\n")
            tally.output += 1

    E.info(tally)
    E.stop()
def main(argv=sys.argv):
    """Write every FASTQ record as a line-wrapped FASTA entry.

    Each header has the form ``>test/<n>/1_<len+1> RQ=0.<q>``, where ``n`` is
    the 1-based record number, ``len`` the sequence length and ``q`` the
    floored mean of the record's quality values.  The sequence is wrapped at
    ``args.line_width`` characters.  The ``--method`` option is parsed but
    not consulted in this function.

    :param argv: command line arguments, handed to ``E.start``.
    """
    parser = E.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-i", "--input-fastq",
        dest="input_fastq_file",
        type=str,
        help="input fastq file",
    )
    parser.add_argument(
        "-m", "--method",
        dest="method",
        type=str,
        choices=["ont2pacbio"],
        help="methods to apply ",
    )
    parser.set_defaults(input_fastq_file=None, line_width=80, method=None)

    args, extras = E.start(
        parser, argv, add_output_options=True, unknowns=True)

    if len(extras) == 1:
        args.input_fastq_file = extras[0]
    # "-" selects the stdin stream that E.start attached to args.
    if args.input_fastq_file == "-":
        args.input_fastq_file = args.stdin

    outf = args.stdout
    line_width = args.line_width

    records = pysam.FastqFile(args.input_fastq_file)
    for well_no, record in enumerate(records, 1):
        quals = record.get_quality_array()
        seq = record.sequence
        qv = int(math.floor(sum(quals) / len(quals)))
        header = ">{}/{}/{}_{} RQ=0.{}\n".format(
            "test", well_no, 1, len(seq) + 1, qv)
        outf.write(header)
        for offset in range(0, len(seq), line_width):
            chunk = seq[offset:offset + line_width]
            outf.write(chunk + "\n")

    E.stop()
def base_quality_single_threaded(fastq, pkl):
    """Tabulate base qualities by read position and pickle the table.

    :param fastq: name of fastq file
    :param pkl: name of pickle file to write to
    :return: 500 x 100 ``uint64`` matrix in which entry ``[i, q]`` counts
        the bases seen at read position ``i`` with quality value ``q``
    """
    # bp, phred
    bq_mat = np.zeros((500, 100), dtype=np.uint64)
    idx = list(range(500))
    # NOTE: a read longer than 500 bases yields index arrays of different
    # lengths below and is therefore not supported.
    with pysam.FastqFile(filename=fastq, persist=False) as reads:
        for r in reads:
            bqa = r.get_quality_array()
            bq_mat[idx[:len(bqa)], bqa] += 1
    # Close the pickle explicitly so the data is flushed before returning.
    with open(pkl, 'wb') as pkl_handle:
        pickle.dump(bq_mat, pkl_handle)
    return bq_mat
def test_rle(self):
    """Check the fastqrle file that ``medaka fastrle`` writes for the input fasta."""
    block_size = 3
    command = [
        'medaka', 'fastrle', self.input_fasta,
        '--block_size', str(block_size),
    ]
    # medaka writes the fastqrle records to stdout; capture them in a file.
    with open(self.output_fastqrle, 'w') as sink:
        subprocess.call(command, stdout=sink)

    expected_results = (
        [('A', 1), ('C', 3), ('C', 3), ('C', 1), ('G', 1), ('T', 3), ('A', 1)],
        [('C', 3), ('C', 3), ('C', 2)])

    with pysam.FastqFile(self.output_fastqrle) as records:
        for index, entry in enumerate(records):
            # Pair every base with its quality value.
            got = list(zip(entry.sequence, entry.get_quality_array()))
            expected = expected_results[index]
            message = "Expected and got differ: ({} != {})".format(expected, got)
            self.assertEqual(expected, got, message)
def run(self):
    """Copy the reads named in ``self.keep_set`` to the output FASTQ.

    Reads are taken from ``self.input_fq_filename``.  A trailing ``/1`` or
    ``/2`` is stripped from the read name before the lookup, but the record
    is written with its original name.  ``self.success`` is set to True once
    the whole input has been processed.
    """
    ## do the extraction
    mate_suffix = re.compile("/[12]$")  # compiled once, not per read
    # Both handles are closed even if reading or writing fails part-way.
    with open(self.output_fq_filename, 'w') as ofh:
        with pysam.FastqFile(self.input_fq_filename) as fq_reader:
            for fq_entry in fq_reader:
                read_name = mate_suffix.sub("", fq_entry.name)
                if read_name in self.keep_set:
                    ofh.write("\n".join([
                        "@" + fq_entry.name,
                        fq_entry.sequence,
                        "+",
                        fq_entry.quality,
                    ]) + "\n")
    self.success = True
def process_one_fastq(fastq, threads=2, max_reads_in_queue=int(30e6)):
    """Feed the quality arrays of a FASTQ file to worker processes and sum their results.

    :param fastq: name of the FASTQ file to read
    :param threads: number of ``process_worker`` processes to start
    :param max_reads_in_queue: size of the input queue. The default is about
        6GB, considering 200 bytes per qual string
    :return: the sum of one result taken from the output queue per worker
    """
    t0 = time.time()
    in_queue = Queue(max_reads_in_queue)
    out_queue = Queue()

    # Start worker processes
    logger.debug('Starting {} threads'.format(threads))
    workers = []
    for worker_id in range(threads):
        worker = Process(
            target=process_worker, args=(worker_id, in_queue, out_queue))
        workers.append(worker)
    for worker in workers:
        worker.start()

    # Burn through file
    logger.debug('Starting to read FASTQ file')
    for read in pysam.FastqFile(filename=fastq, persist=False):
        # TODO: Block when queue gets too big
        in_queue.put(read.get_quality_array())

    # Tell child processes to stop: one stop code per worker
    logger.debug('Telling child processes to stop')
    for _ in range(threads):
        in_queue.put(__process_stop_code__)

    # Get results and add them
    logger.debug('Summing up result matrices')
    bq_mat = out_queue.get()
    remaining = threads - 1
    while remaining > 0:
        bq_mat += out_queue.get()
        remaining -= 1

    # Wait for workers to finish
    logger.debug('Waiting for workers to shutdown')
    for worker in workers:
        worker.join()

    t1 = time.time()
    logger.debug('Finished processing FASTQ in {} s'.format(t1 - t0))
    return bq_mat
def main():
    """Run ``bmftools_db rsq`` on the test BAM and check its outputs.

    The output BAM must contain no records and ``tmp.fq`` exactly two, the
    first of which must equal ``correct_string``.

    :return: 0 if the first FASTQ record matches, 1 otherwise.
    :raises subprocess.CalledProcessError: if a command fails.
    :raises AssertionError: if a record count is wrong.
    """
    subprocess.check_call(
        "../../bmftools_db rsq -ftmp.fq rsq_test.bam rsq_test.out.bam 2> rsq_test.log",
        shell=True)
    count = subprocess.check_output("samtools view -c rsq_test.out.bam",
                                    shell=True).strip()
    # check_output returns bytes on Python 3 and str on Python 2; normalise
    # once instead of re-running samtools after a failed comparison.
    if not isinstance(count, str):
        count = count.decode()
    assert count == "0"

    with pysam.FastqFile("tmp.fq") as fqh:
        recs = list(fqh)
    assert len(recs) == 2

    found = str(recs[0])
    if found == correct_string:
        return 0
    sys.stderr.write("%s found not expected %s. TEST FAILED\n"
                     % (repr(found), repr(correct_string)))
    return 1
def main(argv=sys.argv):
    """Write every FASTQ record as a line-wrapped FASTA entry.

    Each header has the form ``>test/<n>/1_<len+1> RQ=0.<q>``, where ``n`` is
    the 1-based record number, ``len`` the sequence length and ``q`` the
    floored mean of the record's quality values.  The sequence is wrapped at
    ``options.line_width`` characters.  The ``--method`` option is parsed but
    not consulted in this function.

    :param argv: command line arguments, handed to ``E.start``.
    """
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"],
    )
    parser.add_option(
        "-i", "--input-fastq",
        dest="input_fastq_file",
        type="string",
        help="input fastq file",
    )
    parser.add_option(
        "-m", "--method",
        dest="method",
        type="choice",
        choices=["ont2pacbio"],
        help="methods to apply [%default]",
    )
    parser.set_defaults(input_fastq_file=None, line_width=80, method=None)

    options, positional = E.start(parser, argv, add_output_options=True)

    if len(positional) == 1:
        options.input_fastq_file = positional[0]
    # "-" selects the stdin stream that E.start attached to options.
    if options.input_fastq_file == "-":
        options.input_fastq_file = options.stdin

    outf = options.stdout
    line_width = options.line_width

    records = pysam.FastqFile(options.input_fastq_file)
    for well_no, record in enumerate(records, 1):
        quals = record.get_quality_array()
        seq = record.sequence
        qv = int(math.floor(sum(quals) / len(quals)))
        header = ">{}/{}/{}_{} RQ=0.{}\n".format(
            "test", well_no, 1, len(seq) + 1, qv)
        outf.write(header)
        for offset in range(0, len(seq), line_width):
            chunk = seq[offset:offset + line_width]
            outf.write(chunk + "\n")

    E.stop()
def split_at_cutsites(cut_site, sequence):
    """Locate the non-empty fragments left after cutting ``sequence``.

    :param cut_site: compiled regular expression matching the cut site.
    :param sequence: string to digest.
    :return: ``(spans, n_sites)`` where ``spans`` is a list of
        ``(start, end)`` offsets into ``sequence`` of the non-empty stretches
        between matches, and ``n_sites`` is the number of matches.
    """
    spans = []
    n_sites = 0
    start = 0
    for match in cut_site.finditer(sequence):
        n_sites += 1
        if match.start() > start:
            spans.append((start, match.start()))
        # Resume after the matched bases so later spans stay aligned.
        start = match.end()
    if start < len(sequence):
        spans.append((start, len(sequence)))
    return spans, n_sites


def main():
    """Digest every read at the cut site and write the fragments as gzipped FASTQ.

    Reads ``args.input_fn``, splits each sequence at matches of the regular
    expression ``args.cutsite`` and writes every non-empty fragment, with the
    matching stretch of the quality string, to ``args.output`` under the name
    ``<read>:PE1:<fragment index>``.  Summary counts go to ``args.logfile``.
    """
    cut_site = re.compile(args.cutsite)
    records_processed = 0
    cutsite_counter = 0
    total_fragments = 0
    fragment_counts = dict()  # fragments per read -> number of such reads

    with gzip.open(args.output, 'wb') as w:
        for record in pysam.FastqFile(args.input_fn):
            records_processed += 1
            # Use match offsets rather than cumulative fragment lengths so
            # that bases consumed by the cut site do not shift the slices.
            spans, n_sites = split_at_cutsites(cut_site, record.sequence)
            fragment_counts[len(spans)] = fragment_counts.get(len(spans), 0) + 1
            if n_sites:
                cutsite_counter += 1
            for ii, (start, end) in enumerate(spans):
                total_fragments += 1
                w.write(f'@{record.name}:PE1:{ii}\n'.encode())
                w.write(f'{record.sequence[start:end]}\n'.encode())
                w.write('+\n'.encode())
                w.write(f'{record.quality[start:end]}\n'.encode())

    with open(args.logfile, 'w') as w:
        # One statistic per line; the count is the number of records read,
        # not the last zero-based index.
        w.write(f'Records processed: {records_processed}\n')
        w.write(f'Records with cutsites: {cutsite_counter}\n')
        w.write(f'Fragments output: {total_fragments}\n')
        for k, v in fragment_counts.items():
            w.write(f'Fragments {k}: {v}\n')
def run(self):
    """Copy the reads that are NOT named in ``self.keep_set`` to the output FASTQ.

    Reads are taken from ``self.input_fq_filename``.  A trailing ``/1`` or
    ``/2`` is stripped from the read name before the lookup.  ``self.success``
    is set to True once the whole input has been processed.
    """
    ## do the extraction
    mate_suffix = re.compile("/[12]$")  # compiled once, not per read
    # Both handles are closed even if reading or writing fails part-way.
    with open(self.output_fq_filename, 'w') as ofh:
        with pysam.FastqFile(self.input_fq_filename) as fq_reader:
            for fq_entry in fq_reader:
                read_name = mate_suffix.sub("", fq_entry.name)
                if read_name not in self.keep_set:
                    ofh.write(str(fq_entry) + "\n")  # retains original formatting.
    self.success = True
def main():
    """Convert a FASTQ file into FASTA records holding a slice of each read.

    Each output header is ``name_sequence_quality`` of the original read and
    the FASTA sequence is the ``opt.length`` bases starting at the 1-based
    position ``opt.start``.  Output goes to ``opt.output`` or, if that is
    unset, to the name returned by ``unique_filename_in()``.

    :return: 1 after a usage error (message and help are printed to
        stderr/stdout); None otherwise.
    """
    parser = None
    try:
        parser = optparse.OptionParser(usage=usage, description=description)
        for opt in opts:
            if len(opt) == 4:
                parser.add_option(opt[0], opt[1], help=opt[2], **opt[3])
            elif len(opt) == 3:
                parser.add_option(opt[0], help=opt[1], **opt[2])
        (opt, args) = parser.parse_args()
        if not (opt.input and os.path.exists(opt.input)):
            raise Usage("Please provide a fastq file")
        if opt.debug:
            # NOTE(review): ``n`` and ``x`` are not defined in this function;
            # confirm they exist at module level or this raises NameError.
            print(""" fastqToFasta.py i=%s n=%i x=%i """ % (opt.input, n, x))
        faFile = opt.output or unique_filename_in()
        rlen = int(opt.length)
        rskip = int(opt.start) - 1  # 1-based start -> 0-based offset
        # Both handles are closed even if the conversion fails part-way.
        with pysam.FastqFile(opt.input) as fq:
            with open(faFile, "w") as fa:
                for s in fq:
                    seq = s.sequence[rskip:(rskip + rlen)]
                    header = "_".join([s.name, s.sequence, s.quality])
                    fa.write(">" + header + "\n" + seq + "\n")
    except Usage as err:
        # ``except ... as`` and an explicit write work on Python 2.6+ and 3.
        sys.stderr.write("\n%s\n\n" % err.msg)
        if parser is not None:
            parser.print_help()
        return 1
import argparse

import pysam

# Print the FASTQ records whose count of N bases is below
# --proportion_of_Ns_allowed times the sequence length.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("fastq")
    parser.add_argument("--proportion_of_Ns_allowed", type=float, default=0.5)
    args = parser.parse_args()

    # The context manager closes the file even if printing fails.
    with pysam.FastqFile(args.fastq) as fh:
        for record in fh:
            max_ns = args.proportion_of_Ns_allowed * len(record.sequence)
            if record.sequence.count("N") < max_ns:
                # Single-argument print() behaves the same on Python 2 and 3.
                print("@%s" % record.name)
                print(record.sequence)
                print("+")
                print(record.quality)
#!/usr/bin/env python3
import argparse

import pysam

# Print the FASTQ records whose count of N bases is below --max_mask_prop
# times the sequence length.
if __name__ == "__main__":
    # Get arguments
    cli = argparse.ArgumentParser()
    cli.add_argument('fastq')
    cli.add_argument('--max_mask_prop', type=float, default=0.5)
    args = cli.parse_args()

    with pysam.FastqFile(args.fastq) as fh:
        for record in fh:
            n_masked = record.sequence.count('N')
            if not n_masked < args.max_mask_prop * len(record.sequence):
                continue
            # Emit the four FASTQ lines of the record.
            print('@%s' % record.name,
                  record.sequence,
                  '+',
                  record.quality,
                  sep='\n')
def main():
    """Map every read of a FASTQ file against the first sequence of a FASTA file.

    Builds a ``MinimizerIndexer`` over the upper-cased target sequence, calls
    ``simpleMap`` for each query read and logs the average of the maximum
    alignment scores (0 is recorded for reads without an alignment).

    :raises ValueError: if ``--log`` does not name a logging level.
    """
    # Read parameters
    config = Config()

    # Parse the inputs args/options
    parser = argparse.ArgumentParser(
        usage="target_fasta query_fastq [options]")
    # ArgumentParser(version=...) is rejected by Python 3; the "version"
    # action provides the same -v/--version switch.
    parser.add_argument("-v", "--version", action="version",
                        version="%(prog)s 0.1")
    parser.add_argument("target_fasta",
                        type=str,
                        help="The target genome fasta file.")
    parser.add_argument("query_fastq", type=str, help="The query sequences.")
    # Numeric options carry an explicit type so that values given on the
    # command line are not passed on as strings.
    parser.add_argument("--w",
                        dest="w",
                        type=int,
                        help="Length of minimizer window. Default=%s" %
                        config.w,
                        default=config.w)
    parser.add_argument("--k",
                        dest="k",
                        type=int,
                        help="Length of k-mer. Default=%s" % config.k,
                        default=config.k)
    # The default is the occurrence threshold, not the window length.
    parser.add_argument("--t",
                        dest="t",
                        type=int,
                        help="Discard minmers that occur more frequently "
                        "in the target than t. Default=%s" % config.t,
                        default=config.t)
    parser.add_argument(
        "--l",
        dest="l",
        type=int,
        help="Cluster two minmers into the same cluster if within l bases of"
        " each other in both target and query. Default=%s" % config.l,
        default=config.l)
    parser.add_argument(
        "--column",
        dest="column",
        type=int,
        help=
        "Add this many bases to the prefix and suffix of a seed cluster in the"
        " target and query sequence. Default=%s" % config.column,
        default=config.column)
    parser.add_argument("--gapScore",
                        dest="gapScore",
                        type=float,
                        help="Smith-Waterman gap-score. Default=%s" %
                        config.gapScore,
                        default=config.gapScore)
    # NOTE(review): the match-score default is taken from config.gapScore;
    # this looks like a copy-paste slip — confirm Config's match-score field.
    parser.add_argument("--matchScore",
                        dest="matchScore",
                        type=float,
                        help="Smith-Waterman match-score. Default=%s" %
                        config.gapScore,
                        default=config.gapScore)
    parser.add_argument("--mismatchScore",
                        dest="mismatchScore",
                        type=float,
                        help="Smith-Waterman mismatch-score. Default=%s" %
                        config.mismatchScore,
                        default=config.mismatchScore)
    parser.add_argument("--log",
                        dest="logLevel",
                        help="Logging level. Default=%s" % config.logLevel,
                        default=config.logLevel)

    options = parser.parse_args()

    # Parse the log level
    numeric_level = getattr(logging, options.logLevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.logLevel)

    # Setup a logger
    logger.setLevel(numeric_level)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(numeric_level)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    logger.debug("Established logger")

    startTime = time.time()

    # Parse the target sequence and read the first sequence
    with pysam.FastaFile(options.target_fasta) as targetFasta:
        targetString = targetFasta.fetch(targetFasta.references[0])
    logger.info("Parsed target string. Length: %s" % len(targetString))

    # Build minimizer index
    minimizerIndex = MinimizerIndexer(targetString.upper(),
                                      w=options.w,
                                      k=options.k,
                                      t=options.t)
    minmerInstances = sum(map(len, minimizerIndex.minimizerMap.values()))
    logger.info(
        "Built minimizer index in %s seconds. #minmers: %s, #minmer instances: %s"
        % ((time.time() - startTime), len(minimizerIndex.minimizerMap),
           minmerInstances))

    # Open the query files
    alignmentScores = []  # Array storing the alignment scores found
    with pysam.FastqFile(options.query_fastq) as queryFastq:
        # For each query string build alignment
        for queryIndex, query in enumerate(queryFastq):
            print(queryIndex)
            alignment = simpleMap(targetString, minimizerIndex,
                                  query.sequence.upper(), config)
            alignmentScore = (0 if alignment is None else
                              alignment.getMaxAlignmentScore())
            alignmentScores.append(alignmentScore)
            logger.debug(
                "Mapped query sequence #%i, length: %s alignment_found?: %s "
                "max_alignment_score: %s" %
                (queryIndex, len(query.sequence), alignment is not None,
                 alignmentScore))
            # Comment this out to test on a subset
            # if queryIndex > 100:
            #     break

    # Print some stats; an empty query file has no average.
    if alignmentScores:
        averageScore = float(sum(alignmentScores)) / len(alignmentScores)
    else:
        averageScore = float("nan")
    logger.critical(
        "Finished alignments in %s total seconds, average alignment score: %s"
        % (time.time() - startTime, averageScore))
def main():
    """Map every read of a FASTQ file against the first sequence of a FASTA file.

    Builds a ``MinimizerIndexer`` over the upper-cased target sequence and
    calls ``simpleMap`` for each query read: in-process when ``--g`` is
    given, otherwise in one worker process per read whose result is taken
    from a shared queue.  The average of the collected maximum alignment
    scores is logged at the end.  The target sequence is stored in the
    module-level ``targetString``.

    :raises ValueError: if ``--log`` does not name a logging level.
    """
    # Read parameters
    config = Config()

    # Parse the inputs args/options
    parser = argparse.ArgumentParser(
        usage="target_fasta query_fastq [options]")  # , version="%prog 0.1")
    parser.add_argument("target_fasta", type=str,
                        help="The target genome fasta file.")
    parser.add_argument("query_fastq", type=str, help="The query sequences.")
    parser.add_argument("--g", dest="g",
                        help="Use Numba cuda.jit kernel to parallelize MinimizerIndexer on GPU",
                        action='store_true')
    parser.add_argument("--w", dest="w", type=int,
                        help="Length of minimizer window. Default=%s" % config.w,
                        default=config.w)
    parser.add_argument("--k", dest="k", type=int,
                        help="Length of k-mer. Default=%s" % config.k,
                        default=config.k)
    parser.add_argument("--t", dest="t", type=int,
                        help="Discard minmers that occur more frequently "
                             "in the target than t. Default=%s" % config.t,
                        default=config.t)
    parser.add_argument("--l", dest="l", type=int,
                        help="Cluster two minmers into the same cluster if within l bases of"
                             " each other in both target and query. Default=%s" % config.l,
                        default=config.l)
    parser.add_argument("--c", dest="c", type=int,
                        help="Add this many bases to the prefix and suffix of a seed cluster in the"
                             " target and query sequence. Default=%s" % config.c,
                        default=config.c)
    parser.add_argument("--gapScore", type=float, dest="gapScore",
                        help="Smith-Waterman gap-score. Default=%s" % config.gapScore,
                        default=config.gapScore)
    # NOTE(review): the match-score default is taken from config.gapScore;
    # this looks like a copy-paste slip — confirm Config's match-score field.
    parser.add_argument("--matchScore", type=float, dest="matchScore",
                        help="Smith-Waterman match-score. Default=%s" % config.gapScore,
                        default=config.gapScore)
    parser.add_argument("--mismatchScore", type=float, dest="mismatchScore",
                        help="Smith-Waterman mismatch-score. Default=%s" % config.mismatchScore,
                        default=config.mismatchScore)
    parser.add_argument("--log", dest="logLevel",
                        help="Logging level. Default=%s" % config.logLevel,
                        default=config.logLevel)

    options = parser.parse_args()

    # Parse the log level
    numeric_level = getattr(logging, options.logLevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.logLevel)

    # Setup a logger
    logger.setLevel(numeric_level)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(numeric_level)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    logger.debug("Established logger")

    startTime = time.time()

    global targetString
    # Parse the target sequence and read the first sequence
    with pysam.FastaFile(options.target_fasta) as targetFasta:
        targetString = targetFasta.fetch(targetFasta.references[0])
    logger.info("Parsed target string. Length: %s in %s seconds"
                % (len(targetString), time.time() - startTime))

    # Build minimizer index
    minimizerIndex = MinimizerIndexer(targetString.upper(), w=options.w,
                                      k=options.k, t=options.t)
    # Only seeing this many minmers for the target DNA sequence:
    print(len(minimizerIndex.minimizerMap.keys()), "minimizerMap keys",
          list(minimizerIndex.minimizerMap.keys())[:20], "\n",
          len(minimizerIndex.minmerOccurrences.keys()), "minmerOccurrences keys",
          list(minimizerIndex.minmerOccurrences.keys())[:20])
    minmerInstances = sum(map(len, minimizerIndex.minimizerMap.values()))
    logger.info("Built minimizer index in %s seconds. #minmers: %s, #minmer instances: %s"
                % ((time.time() - startTime), len(minimizerIndex.minimizerMap),
                   minmerInstances))

    # Open the query files
    alignmentScores = []  # Array storing the alignment scores found
    workers = []  # worker processes started for the non-GPU path
    with pysam.FastqFile(options.query_fastq) as queryFastq:
        if options.g:
            # For each query string build alignment
            for queryIndex, query in enumerate(queryFastq):
                print(queryIndex)
                alignment = simpleMap(minimizerIndex, query.sequence.upper(),
                                      config, None, options.g)
                alignmentScore = 0 if alignment is None else alignment.getMaxAlignmentScore()
                alignmentScores.append(alignmentScore)
                logger.info("Mapped query sequence #%i, length: %s alignment_found?: %s "
                            "max_alignment_score: %s"
                            % (queryIndex, len(query.sequence),
                               alignment is not None, alignmentScore))
        else:
            results = list()
            q = Queue()
            for queryIndex, query in enumerate(queryFastq):
                print("Reading query", queryIndex)
                results.append((queryIndex, query.sequence))
                p = Process(target=simpleMap,
                            args=(minimizerIndex, query.sequence.upper(),
                                  config, q, options.g))
                # The attribute is lower-case ``daemon``; ``Daemon`` was
                # silently ignored.
                p.daemon = True
                p.start()
                workers.append(p)

            # NOTE(review): results are taken from the queue in completion
            # order, so they may not belong to the query index/length they
            # are logged with — confirm simpleMap's queue protocol.
            for queryIndex, querySeq in results:
                alignment = q.get()
                try:
                    alignmentScore = alignment.getMaxAlignmentScore()
                except AttributeError:
                    # A worker reported no alignment (e.g. None).
                    print("None type, continue")
                    continue
                alignmentScores.append(alignmentScore)
                logger.info("Mapped query sequence #%i, length: %s alignment_found?: %s "
                            "max_alignment_score: %s"
                            % (queryIndex, len(querySeq),
                               alignment is not None, alignmentScore))

            # Join only after the queue has been drained: a process that
            # still has queued data cannot terminate.
            for p in workers:
                p.join()

    # An empty query file (or no alignments) has no average.
    if alignmentScores:
        averageScore = float(sum(alignmentScores)) / len(alignmentScores)
    else:
        averageScore = float("nan")
    logger.info("Finished alignments in %s total seconds, average alignment score: %s"
                % (time.time() - startTime, averageScore))
# Fragment every read of a FASTQ or BAM file into windows of a fixed size.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Split reads from FASTQ or BAM into fragments of a given size and output fragments as FASTQ records")
    parser.add_argument(
        "input", help="FASTQ or BAM with sequences to fragment")
    parser.add_argument(
        "window", type=int, help="length to fragment each sequence to")
    parser.add_argument(
        "--slide", type=int, default=0,
        help="length to slide the given window size across the input sequences")
    parser.add_argument(
        "--full_length_only", action="store_true",
        help="omit sequences that are shorter than the requested window size")
    parser.add_argument(
        "--read_counts", default=None, help="File to write read counts to")
    parser.add_argument(
        "--clone_name", default="dummy",
        help="Name of clone for read_counts file")
    args = parser.parse_args()

    # The file extension decides which pysam reader is used.
    is_bam = args.input.endswith(".bam")
    if is_bam:
        input_file = pysam.AlignmentFile(
            args.input, check_header=False, check_sq=False)
    else:
        input_file = pysam.FastqFile(args.input)

    for rcount, record in enumerate(input_file):
        if is_bam:
            # Suffix the query name with the mate number.
            mate = "1" if record.is_read1 else "2"
            record_name = "%s_%s" % (record.qname, mate)
            sequence = record.seq
            quality = record.qual
        else:
            record_name = record.name.replace("/", "_")
            sequence = record.sequence
            quality = record.quality

        # Sequence and quality are windowed with the same parameters.
        sequences = fragment_sequence(sequence, args.window, args.slide)
        qualities = fragment_sequence(quality, args.window, args.slide)
def run(args):
    """Write the R1 reads whose cell barcode and UMI are valid.

    The R2 (BC1) and R3 (BC2 + UMI) file names are derived from ``args.i`` by
    replacing ``R1``.  The three files are read in lockstep; each triple is
    checked with ``parseBarcodeAndUmiV3`` and, when both barcode halves and
    the UMI are valid, the R1 read is written via ``fastqWrite`` under the
    name ``<read>:<bc1><bc2>:<umi>``.  Counts are printed to stdout.

    :param args: namespace providing ``i`` (R1 FASTQ), ``d`` (output
        directory or None), ``o`` (output basename), ``b`` (valid barcode
        file) and ``u`` (UMI length).
    """
    R1file = args.i  # R1 (sequence read) file
    R2file = args.i.replace('R1', 'R2')  # R2 (BC1) file
    R3file = args.i.replace('R1', 'R3')  # R3 (BC2 + UMI) file
    outdir = args.d  # output directory
    if outdir is None:
        # if not provided, put the output file in the same directory as the input
        outdir = os.path.dirname(R1file)
    outbase = args.o  # output file basename
    outFile = os.path.join(outdir, '%s_R1_valid.fastq' % outbase)
    bcFile = args.b  # valid barcode file
    umi_len = args.u  # umi length

    # load the valid barcode dictionary:
    bcSet = {}
    with open(bcFile, 'r') as fIn:
        for line in fIn:
            # skip the header line:
            if line.startswith('well'):
                continue
            if line.endswith('\n'):
                line = line[:-1]
            fields = line.split('\t')
            # forward barcode (trimmed to BC1_LEN bases):
            bcFwd = fields[0][:BC1_LEN]
            bcSet.setdefault(bcFwd, 0)

    ## storage for the output file pointer and statistics counters:
    samp = {
        'name': outbase,
        'total': 0,  # total reads
        'SBC': 0,  # sample barcode corrected
        'valid': 0,  # total valid reads
        'BC1v': 0,  # valid BC1
        'BC2v': 0,  # valid BC2
        'UMIv': 0,  # valid UMI
        'BC1c': 0,  # corrected BC1
        'BC2c': 0,  # corrected BC2
    }

    countMod = 100000  # progress is reported every countMod reads
    rCount = 0

    # The output file and the three inputs are closed even if a read fails.
    with open(outFile, 'w') as oFile:
        samp['file'] = oFile
        with pysam.FastqFile(R1file) as fq1:
            with pysam.FastqFile(R2file) as fq2:
                with pysam.FastqFile(R3file) as fq3:
                    # loop over all reads:
                    while True:
                        try:
                            # next() works on both Python 2.6+ and Python 3
                            r1 = next(fq1)  # mRNA sequence read
                            r2 = next(fq2)  # BC1
                            r3 = next(fq3)  # BC2 + UMI
                        except StopIteration:
                            break  # last item
                        except Exception:
                            print('pysam.FastqFile iterator error.')
                            break

                        rCount += 1  # read counter
                        if not rCount % countMod:
                            print('read %d' % rCount)

                        # parse out the two halves of the cell barcode, and the UMI:
                        bc1 = r2.sequence  # first half of the cell barcode
                        bc2 = r3.sequence[:BC2_LEN]  # second half of the cell barcode
                        umi = r3.sequence[BC2_LEN:(BC2_LEN + umi_len)]  # UMI sequence

                        # check the barcodes and UMI and update counts:
                        (bc1, bc2, umi, bc1Valid, bc1Corr, bc2Valid, bc2Corr,
                         umiValid) = parseBarcodeAndUmiV3(bc1, bc2, umi, bcSet)
                        samp['total'] += 1  # total reads
                        samp['BC1v'] += bc1Valid  # valid BC1
                        samp['BC2v'] += bc2Valid  # valid BC2
                        samp['BC1c'] += bc1Corr  # corrected BC1
                        samp['BC2c'] += bc2Corr  # corrected BC2
                        samp['UMIv'] += umiValid  # valid UMI

                        # write out the sequence read if bc1, bc2 and umi are all valid:
                        if bc1 is not None and bc2 is not None and umiValid:
                            samp['valid'] += 1  # total valid reads for this sample
                            ## create the new read name:
                            rName = '%s:%s%s:%s' % (r1.name, bc1, bc2, umi)
                            fastqWrite(samp['file'], r1, rName)

    # print counts:
    print('Total reads: %d' % rCount)

    # print sample-by-sample stats:
    print('sample\ttotal\tvalid\tBC1valid\tBC1corr\tBC2valid\tBC2corr\tUMIvalid')
    x = samp
    sOut = '%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (
        x['name'], x['total'], x['valid'], x['BC1v'], x['BC1c'], x['BC2v'],
        x['BC2c'], x['UMIv'])
    print(sOut)
    return
def get_first_n(fn, n=3):
    """Count the leading ``n`` bases of every read in a FASTQ file.

    :param fn: path of the FASTQ file.
    :param n: number of leading bases used as the key (default 3); reads
        shorter than ``n`` are keyed by their whole sequence.
    :return: ``collections.Counter`` mapping each prefix to its read count.
    """
    # FastqFile takes no file mode: its second parameter is ``persist``, so
    # the former 'rb' argument only acted as a truthy flag (the default).
    with pysam.FastqFile(fn) as fq:
        return collections.Counter(rd.sequence[:n] for rd in fq)