def readFq(fname, hdict):
    """Collect the reads of a FASTQ file into ``hdict``, keyed by read name.

    Args:
        fname: Path of the FASTQ file to read.
        hdict: Mapping from read name to a list.  ``(seq, qual)`` is appended
            under each read's key without a prior existence check, so this is
            presumably a ``defaultdict(list)`` -- confirm against the caller.
    """
    with open(fname, 'r') as FQ:
        for header, seq, qual in FastqGeneralIterator(FQ):
            if ' ' in header:
                # Header was created using CASAVA 1.8+: the read name is the
                # text before the first space.  Splitting only once keeps
                # headers with several spaces from raising ValueError.
                key = header.split(' ', 1)[0]
            else:
                # Header was created using older versions of CASAVA: drop
                # the "/1" or "/2" mate marker.
                key = re.sub('/[1-2]', '', header)
            hdict[key].append((seq, qual))
def distribute_reads(readfiles, read_hit_dict, single=True):
    """Write every read that has a hit to each of its targets.

    Args:
        readfiles: List with one (single-end) or two (paired-end) FASTQ paths.
            Lists of any other length are opened but produce no output.
        read_hit_dict: Mapping from read ID (first whitespace-delimited word
            of the FASTQ title) to an iterable of targets.
        single: Unused; kept for interface compatibility.

    For paired input, a pair is written for the targets of read 1 if read 1
    has a hit, otherwise for the targets of read 2 if read 2 has a hit.

    Raises:
        StopIteration: If the second file holds fewer records than the first.
    """
    with open(readfiles[0]) as handle1:
        iterator1 = FastqGeneralIterator(handle1)
        if len(readfiles) == 1:
            for ID1_long, Seq1, Qual1 in iterator1:
                ID1 = ID1_long.split()[0]
                if ID1 in read_hit_dict:
                    for target in read_hit_dict[ID1]:
                        write_single_seqs(target, ID1, Seq1)
        elif len(readfiles) == 2:
            with open(readfiles[1]) as handle2:
                iterator2 = FastqGeneralIterator(handle2)
                for ID1_long, Seq1, Qual1 in iterator1:
                    # next() works on both Python 2.6+ and Python 3,
                    # unlike the generator's .next() method.
                    ID2_long, Seq2, Qual2 = next(iterator2)
                    ID1 = ID1_long.split()[0]
                    ID2 = ID2_long.split()[0]
                    if ID1 in read_hit_dict:
                        for target in read_hit_dict[ID1]:
                            write_paired_seqs(target, ID1, Seq1, ID2, Seq2)
                    elif ID2 in read_hit_dict:
                        for target in read_hit_dict[ID2]:
                            write_paired_seqs(target, ID1, Seq1, ID2, Seq2)
def main(args):
    """Prepend each index read to the sequence read that carries the same ID.

    For every record of the sequence file, index records are skipped until one
    with the same ID (first whitespace-delimited word of the title) is found.
    The merged record (index bases + read bases, index qualities + read
    qualities) is written to ``<output>.tmp``, which is renamed to
    ``<output>`` when done.

    Merging stops when either input is exhausted.  An IOError while reading
    is treated like end of input, as it was before.  A sequence read whose
    index record is never found is not written.

    Args:
        args: Ignored for option parsing (see the note below); rebound to the
            leftover positional arguments.

    Returns:
        0 on completion.
    """
    usage = "usage: %prog [options] -i <input index file> -s <input seq file> -o <output merge file>"+__doc__
    parser = OptionParser(usage)
    parser.add_option("-i", "--index", dest="index", default=None, help="Input index fastq file.")
    parser.add_option("-s", "--seq", dest="seq", default=None, help="Input seq fastq file.")
    parser.add_option("-o", "--output", dest="output", default=None, help="Output barcode file.")

    # NOTE(review): parse_args() is called without ``args``, so sys.argv is
    # what actually gets parsed -- confirm whether that is intended.
    (opts, args) = parser.parse_args()
    if not (opts.index and os.path.isfile(opts.index) and opts.seq and os.path.isfile(opts.seq) and opts.output):
        parser.error("Missing input and/or output")

    tmp_path = opts.output+'.tmp'
    with open(tmp_path, 'w') as outh, open(opts.seq) as seqh, open(opts.index) as idxh:
        itr1 = FastqGeneralIterator(seqh)
        itr2 = FastqGeneralIterator(idxh)
        while 1:
            try:
                (h1, s1, q1) = next(itr1)
                (h2, s2, q2) = next(itr2)
                h1 = h1.split()[0]
                h2 = h2.split()[0]
                # Skip index records until the one belonging to this read.
                while h1 != h2:
                    (h2, s2, q2) = next(itr2)
                    h2 = h2.split()[0]
            except (StopIteration, IOError):
                # Leaving here (rather than after the write) guarantees a
                # read is never paired with a stale, non-matching index.
                break
            outh.write("@%s\n%s%s\n+\n%s%s\n" % (h1, s2, s1, q2, q1))
    os.rename(tmp_path, opts.output)
    return 0
def prepend_barcode(seqfile, bcfile, rc, text=''):
    """Rewrite ``seqfile`` in place with each read's barcode prepended.

    For every record of ``seqfile``, records of ``bcfile`` are skipped until
    one with the same ID (first whitespace-delimited word of the title) is
    found.  The merged record is written to ``seqfile + '.tmp'``, which then
    replaces ``seqfile``.

    Merging stops when either input is exhausted.  An IOError while reading
    is treated like end of input, as it was before.  A read whose barcode
    record is never found is not written.

    Args:
        seqfile: Path of the FASTQ file to rewrite.
        bcfile: Path of the FASTQ file holding the barcode reads.
        rc: If true, the barcode is reverse-complemented and its quality
            string reversed before being prepended.
        text: Optional suffix; when non-empty, ``'.' + text`` is appended to
            the read ID in the output.
    """
    tmp_path = seqfile+'.tmp'
    with open(tmp_path, 'w') as tmph, open(seqfile) as seqh, open(bcfile) as bch:
        itr1 = FastqGeneralIterator(seqh)
        itr2 = FastqGeneralIterator(bch)
        while 1:
            try:
                (h1, s1, q1) = next(itr1)
                (h2, s2, q2) = next(itr2)
                h1 = h1.split()[0]
                h2 = h2.split()[0]
                # Skip barcode records until the one belonging to this read.
                while h1 != h2:
                    (h2, s2, q2) = next(itr2)
                    h2 = h2.split()[0]
            except (StopIteration, IOError):
                # Leaving here (rather than after the write) guarantees a
                # read is never paired with a stale, non-matching barcode.
                break
            if rc:
                rcs = Seq(s2, generic_dna)
                s2 = rcs.reverse_complement()
                q2 = q2[::-1]
            if text:
                h1 = h1+'.'+text
            tmph.write("@%s\n%s%s\n+\n%s%s\n" % (h1, s2, s1, q2, q1))
    os.rename(tmp_path, seqfile)
def read_fastq(fname):
    """Yield ``(title, seq, qual)`` records from a FASTQ file that may be absent.

    When ``fname`` is falsy no file is opened; the generator instead yields
    the placeholder ``("", None, None)`` endlessly, so it can be consumed in
    lockstep with another reader.
    """
    if not fname:
        placeholder = ("", None, None)
        while True:
            yield placeholder
    with open(fname) as fastq_handle:
        for record in FastqGeneralIterator(fastq_handle):
            yield record
def count_reads(in_fastq):
    """Return the number of records in the FASTQ file at ``in_fastq``."""
    # The context manager closes the file even when the parser raises on a
    # malformed record.
    with open(in_fastq) as in_file:
        return sum(1 for _ in FastqGeneralIterator(in_file))
def _get_fastq_num_records(path_to):
    """Count the records and the distinct read IDs of a FASTQ file.

    A read ID is the part of the title before the first space.

    Returns:
        Tuple ``(total_reads, num_uniq_reads)``.
    """
    n_records = 0
    seen_ids = set()
    with open(path_to) as fastq_handle:
        for header, _seq, _qual in FastqGeneralIterator(fastq_handle):
            n_records += 1
            seen_ids.add(header.split(" ")[0])
    return n_records, len(seen_ids)
def fastqtrimmer(fastq_in, fastq_out, trim=21):
    """Write a copy of a FASTQ file keeping only the first ``trim`` bases.

    Args:
        fastq_in: Path of the FASTQ file to read.
        fastq_out: Path of the FASTQ file to write.
        trim: Number of leading bases (and quality characters) kept per read.
    """
    # The original opened the builtin ``input`` instead of ``fastq_in`` and
    # wrote a literal "n" where each newline belongs.
    with open(fastq_in) as in_handle, open(fastq_out, "w") as handle:
        for title, seq, qual in FastqGeneralIterator(in_handle):
            handle.write("@%s\n%s\n+\n%s\n" % (title, seq[:trim], qual[:trim]))
def filter_sample(file, output):
    """Copy to ``output`` the reads whose sample label is not in ``keep_list``.

    The sample label is the text after ``barcodelabel=`` in the title, minus
    its final character (presumably a trailing ``;`` -- confirm against the
    data).

    Updates the module-level counters: ``total_count`` for every read seen
    and ``keep_count`` for every read written.  ``keep_list`` is read from
    module scope.

    NOTE(review): reads are written when their sample is *not* in
    ``keep_list``; confirm this matches the intended meaning of the name.

    Raises:
        IndexError: If a title does not contain ``barcodelabel=``.
    """
    global keep_count, total_count
    # Both files are managed by the ``with`` so the input is closed as well.
    with open(output, 'w') as out, open(file) as in_handle:
        for title, seq, qual in FastqGeneralIterator(in_handle):
            total_count += 1
            sample = title.split('barcodelabel=')[1][:-1]
            if sample not in keep_list:
                keep_count += 1
                out.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
def run_mBP(f1_in, f1_out, min_bp_qual_in_read, min_av_read_qual, min_bp_qual_or_N):
    """Keep only the reads whose lowest base quality reaches a threshold.

    Args:
        f1_in: Open handle of the input FASTQ.
        f1_out: Open handle that passing records are written to.
        min_bp_qual_in_read: Minimum quality every base of a read must reach.
        min_av_read_qual: Unused here; kept for a uniform filter signature.
        min_bp_qual_or_N: Unused here; kept for a uniform filter signature.
    """
    for idLine, seqLine, qualLine in FastqGeneralIterator(f1_in):
        # Qualities are decoded with an ASCII offset of 33 (the Sanger /
        # Illumina 1.8+ convention).  frombuffer replaces the binary mode
        # of numpy.fromstring, deprecated since NumPy 1.14.
        npQualLine = numpy.frombuffer(
            qualLine.encode("ascii"), dtype=numpy.uint8) - 33
        lowest = numpy.min(npQualLine)
        if lowest >= min_bp_qual_in_read:
            f1_out.write("@%s\n%s\n%s\n%s\n" % (idLine, seqLine, "+", qualLine))
def deal_fastq_file(afastq):
    """Load a gzip-compressed FASTQ file into a ``{read_id: sequence}`` dict.

    The read ID is the first whitespace-delimited word of the title; when an
    ID occurs more than once, the last sequence wins.
    """
    with gzip.open(afastq, "r") as gz_handle:
        seq_by_id = {
            header.split()[0]: bases
            for header, bases, _quals in FastqGeneralIterator(gz_handle)
        }
    return seq_by_id
def fastqreindex(input, output):
    """Rewrite a FASTQ file with sequential read names.

    Each title is replaced by ``R_<n>;<second ';'-separated field>;`` where
    ``n`` counts records starting at 1.

    Args:
        input: Path of the FASTQ file to read.
        output: Path of the FASTQ file to write.

    Raises:
        IndexError: If a title contains no ``;``.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator

    # Mode 'r' replaces 'rU', which Python 3.11+ rejects.
    with open(output, 'w') as out, open(input, 'r') as fastq:
        for count, (title, sequence, qual) in enumerate(FastqGeneralIterator(fastq), 1):
            cols = title.split(';')
            header = 'R_' + str(count) + ';' + cols[1] + ';'
            out.write("@%s\n%s\n+\n%s\n" % (header, sequence, qual))
def getAvgLength(input):
    """Summarize the read lengths of a FASTQ file.

    Args:
        input: Path of the FASTQ file.

    Returns:
        Tuple ``(average, minimum, maximum, p5)``, where ``p5`` is the 5th
        percentile of the read lengths truncated to an int.

    Raises:
        ZeroDivisionError: If the file contains no records.
    """
    # The context manager makes sure the input handle is closed.
    with open(input) as handle:
        lengths = [len(seq) for _title, seq, _qual in FastqGeneralIterator(handle)]
    Average = sum(lengths) / float(len(lengths))
    Min = min(lengths)
    Max = max(lengths)
    # The 5th percentile is the length that 95% of the reads reach or exceed.
    fifth_percentile = np.percentile(np.array(lengths), 5)
    return (Average, Min, Max, int(fifth_percentile))
def splitread(args):
    """
    %prog splitread fastqfile

    Split fastqfile into two read fastqfiles, cut in the middle.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    p = OptionParser(splitread.__doc__)
    p.add_option("-n", dest="n", default=76, type="int",
                 help="Split at N-th base position")
    p.add_option("--rc", default=False, action="store_true",
                 help="Reverse complement second read")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pairsfastq = args[0]
    prefix = op.basename(pairsfastq).split(".")[0]
    fq1 = prefix + ".1.fastq"
    fq2 = prefix + ".2.fastq"
    fw1 = must_open(fq1, "w")
    fw2 = must_open(fq2, "w")
    fp = must_open(pairsfastq)

    cut = opts.n
    # Reads shorter than 8/5 of the cut position are skipped.
    shortest = cut * 8 / 5

    for name, seq, qual in FastqGeneralIterator(fp):
        if len(seq) < shortest:
            logging.error("Skipping read {0}, length={1}".format(
                name, len(seq)))
            continue

        tagged = "@" + name
        left = FastqLite(tagged, seq[:cut], qual[:cut])
        right = FastqLite(tagged, seq[cut:], qual[cut:])
        if opts.rc:
            right.rc()

        print(left, file=fw1)
        print(right, file=fw2)

    logging.debug("Reads split into `{0},{1}`".format(fq1, fq2))
    fw1.close()
    fw2.close()
def split_fastqs(r1, r2, r1_o, r2_o, barcodes):
    '''Split paired-end FASTQ reads by their inline barcode.

    The first 8 bases of each mate are looked up with ``checkHamming``.  When
    both mates resolve to the same barcode, each mate is written to its
    output handle with ``&<barcode>`` appended to the title and the first 11
    bases (and qualities) removed.  Counts of total pairs, disagreeing
    barcodes and unresolved barcodes per mate go to stderr as HTML headings.

    ``r1``/``r2`` are open input handles; ``r1_o``/``r2_o`` are open output
    handles.
    '''
    n_mismatch = 0
    n_missing_r1 = 0
    n_missing_r2 = 0
    n_pairs = 0
    record_fmt = "@%s&%s\n%s\n+\n%s"

    # Walk both files in lockstep.
    mates = it.izip(FastqGeneralIterator(r1), FastqGeneralIterator(r2))
    for (title1, seq1, qual1), (title2, seq2, qual2) in mates:
        hit1 = checkHamming(barcodes, seq1[:8])
        hit2 = checkHamming(barcodes, seq2[:8])

        if hit1[0] and hit2[0]:
            if hit1[1] == hit2[1]:
                # Both ends agree: emit the trimmed pair.
                print >> r1_o, record_fmt % (
                    title1, hit1[1], seq1[11:], qual1[11:])
                print >> r2_o, record_fmt % (
                    title2, hit2[1], seq2[11:], qual2[11:])
            else:
                n_mismatch += 1
        elif hit1[0]:
            # Read 1 resolved but read 2 did not.
            n_missing_r2 += 1
        elif hit2[0]:
            # Read 2 resolved but read 1 did not.
            n_missing_r1 += 1
        else:
            n_missing_r1 += 1
            n_missing_r2 += 1
        n_pairs += 1

    report = [
        "<H3>Total number of reads:%s</H3>" % n_pairs,
        "<H3>Total number of barcode mismatches:%s</H3>" % n_mismatch,
        "<H3>Total number of missed R1 barcodes:%s</H3>" % n_missing_r1,
        "<H3>Total number of missed R2 barcodes:%s</H3>" % n_missing_r2,
    ]
    sys.stderr.write('\n'.join(report) + '\n')
def fastq_filter(in_file, pos_file, neg_file, wanted):
    """Split a FASTQ file into wanted and unwanted reads.

    A read is "wanted" when ``clean_name`` of the first whitespace-delimited
    word of its title is in ``wanted``.

    Args:
        in_file: Path of the FASTQ file to read.
        pos_file: Path that receives wanted reads, or None to skip writing
            them.
        neg_file: Path that receives the other reads, or None to skip
            writing them.
        wanted: Container of cleaned read names to keep.

    Returns:
        Tuple ``(pos_count, neg_count)``.  When both output paths are None
        the input is not parsed and ``(0, 0)`` is returned.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator

    pos_count = neg_count = 0
    positive_handle = negative_handle = None
    handle = open(in_file, "r")
    try:
        if pos_file is not None and neg_file is not None:
            print("Generating two FASTQ files")
        elif pos_file is not None:
            print("Generating matching FASTQ file")
        elif neg_file is not None:
            print("Generating non-matching FASTQ file")
        else:
            # Nothing to write: the input is not scanned.
            return pos_count, neg_count

        if pos_file is not None:
            positive_handle = open(pos_file, "w")
        if neg_file is not None:
            negative_handle = open(neg_file, "w")
        if positive_handle is not None and negative_handle is not None:
            print(in_file)

        # One loop serves all three output configurations.
        for title, seq, qual in FastqGeneralIterator(handle):
            if clean_name(title.split(None, 1)[0]) in wanted:
                if positive_handle is not None:
                    positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
                pos_count += 1
            else:
                if negative_handle is not None:
                    negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
                neg_count += 1
    finally:
        # Close everything that was opened, even if parsing failed.
        if positive_handle is not None:
            positive_handle.close()
        if negative_handle is not None:
            negative_handle.close()
        handle.close()
    return pos_count, neg_count
def splitDemux2(input, outputdir):
    """Append each read to ``<outputdir>/<sample>.fastq``.

    The sample is the text between ``barcodelabel=`` and the next ``;`` in
    the title.  When the module-level ``args.length`` is set, reads shorter
    than that length are dropped and the rest are truncated to it.

    Args:
        input: Path of the FASTQ file to split.
        outputdir: Directory that receives the per-sample files.

    Raises:
        IndexError: If a title does not contain ``barcodelabel=``.
    """
    with open(input) as in_handle:
        for title, seq, qual in FastqGeneralIterator(in_handle):
            sample = title.split('barcodelabel=')[1].split(';')[0]
            if args.length:
                limit = int(args.length)
                if len(seq) < limit:
                    continue
                seq = seq[:limit]
                qual = qual[:limit]
            # Text append mode: the record is a str, which a file opened
            # with 'ab' rejects on Python 3.
            with open(os.path.join(outputdir, sample+'.fastq'), 'a') as output:
                output.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
def _mask_low_quality_bases(seq, qual, min_qual):
    """Return ``seq`` with each base whose quality is below ``min_qual`` set to 'N'."""
    # Qualities are decoded with an ASCII offset of 33.
    quals = numpy.frombuffer(qual.encode("ascii"), dtype=numpy.uint8) - 33
    # frombuffer gives a read-only view; copy it so bases can be overwritten.
    bases = numpy.frombuffer(seq.encode("ascii"), dtype="S1").copy()
    bases[quals < min_qual] = b"N"
    return bases.tobytes().decode('utf-8')


def run_mBPN_pair(f1_in, f1_out, f2_in, f2_out, min_bp_qual_in_read,
                  min_av_read_qual, min_bp_qual_or_N):
    """Mask low-quality bases with 'N' in both files of a read pair.

    Every record is written out; no read is dropped.

    Args:
        f1_in, f2_in: Open handles of the two input FASTQ files.
        f1_out, f2_out: Open handles the masked records are written to.
        min_bp_qual_in_read: Unused here; kept for a uniform filter signature.
        min_av_read_qual: Unused here; kept for a uniform filter signature.
        min_bp_qual_or_N: Bases with a quality below this value become 'N'.

    Raises:
        StopIteration: If the second file holds fewer records than the first.
    """
    iter1 = FastqGeneralIterator(f1_in)
    iter2 = FastqGeneralIterator(f2_in)
    for (idLine, seqLine, qualLine) in iter1:
        (idLine2, seqLine2, qualLine2) = next(iter2)
        # frombuffer/tobytes replace the deprecated numpy.fromstring
        # (binary mode) and ndarray.tostring.
        masked1 = _mask_low_quality_bases(seqLine, qualLine, min_bp_qual_or_N)
        masked2 = _mask_low_quality_bases(seqLine2, qualLine2, min_bp_qual_or_N)
        f1_out.write("@%s\n%s\n%s\n%s\n" % (idLine, masked1, "+", qualLine))
        f2_out.write("@%s\n%s\n%s\n%s\n" % (idLine2, masked2, "+", qualLine2))
def quick_fastq_iterator(handle, alphabet=single_letter_alphabet):
    """Yield a SeqRecord per FASTQ entry, leaving the qualities undecoded.

    The raw quality string is stored under ``annotations['quality']``, which
    avoids the cost of converting it to numbers.  ``id`` and ``name`` are the
    first word of the title; ``description`` is the full title.
    """
    for header, bases, raw_quality in FastqGeneralIterator(handle):
        read_id = header.split()[0]
        record = SeqRecord(
            Seq(bases, alphabet),
            id=read_id,
            name=read_id,
            description=header,
            annotations={'quality': raw_quality},
        )
        yield record
def main(end1_fastq):
    """Write the genomic part of each read to ``<base>-genomic.fastq``.

    ``end1_fastq`` is a gzip-compressed FASTQ file.  ``extract_genomic`` is
    applied to every read; reads for which it returns a falsy value are
    skipped.  The output name is built from the cleaned basename of the input.
    """
    out_genomic = "%s-genomic.fastq" % _clean_name(os.path.basename(end1_fastq))
    with gzip.open(end1_fastq) as in_handle, open(out_genomic, "w") as genomic_handle:
        for name, seq, qual in FastqGeneralIterator(in_handle):
            extracted = extract_genomic(seq, qual)
            if not extracted:
                continue
            record = "@%s\n%s\n+\n%s\n" % (
                name, extracted.genomic_seq, extracted.genomic_qual)
            genomic_handle.write(record)
def trim_file_handle(in_handle, config):
    """Yield ``(name, trimmed_seq, trimmed_qual)`` for reads with an internal hit.

    Linker regions come from ``config["linkers"]`` and the ``anchor_sizes``
    setting; ``internal_seq`` is applied with the ``anchor_mismatches`` and
    ``min_size`` settings.  Reads for which it returns a falsy value are
    skipped.
    """
    algorithm = config["algorithm"]
    link1, link2 = get_linker_regions(config["linkers"],
                                      algorithm["anchor_sizes"])
    for name, seq, qual in FastqGeneralIterator(in_handle):
        kept_seq = internal_seq(seq, link1, link2,
                                algorithm["anchor_mismatches"],
                                algorithm["min_size"])
        if not kept_seq:
            continue
        yield name, kept_seq, trim_qual(qual, seq, kept_seq)
def main(in_file, out_file, trim=0):
    """Write a copy of a FASTQ file with ``trim`` bases removed from each 3' end.

    Args:
        in_file: Path of the FASTQ file to read.
        out_file: Path of the FASTQ file to write.
        trim: Number of trailing bases (and quality characters) to drop;
            converted with ``int``.  Reads no longer than ``trim`` become
            empty.
    """
    trim = int(trim)
    with open(in_file) as in_handle, open(out_file, "w") as out_handle:
        for title, seq, qual in FastqGeneralIterator(in_handle):
            # Clamp at 0: a negative slice end would count from the end of
            # the read and keep the wrong bases when trim > len(seq).
            trim_seq = seq[:max(len(seq) - trim, 0)]
            trim_qual = qual[:max(len(qual) - trim, 0)]
            out_handle.write("@%s\n%s\n+\n%s\n" % (title, trim_seq, trim_qual))
def run_mBP_mRQ_pair(f1_in, f1_out, f2_in, f2_out, min_bp_qual_in_read,
                     min_av_read_qual, min_bp_qual_or_N):
    """Keep read pairs where both mates pass the mean and minimum quality cuts.

    A pair is written only when, for both mates, the mean quality is at least
    ``min_av_read_qual`` and the lowest base quality is at least
    ``min_bp_qual_in_read``.

    Args:
        f1_in, f2_in: Open handles of the two input FASTQ files.
        f1_out, f2_out: Open handles that passing records are written to.
        min_bp_qual_in_read: Minimum quality every base must reach.
        min_av_read_qual: Minimum mean quality of a read.
        min_bp_qual_or_N: Unused here; kept for a uniform filter signature.

    Raises:
        StopIteration: If the second file holds fewer records than the first.
    """
    iter1 = FastqGeneralIterator(f1_in)
    iter2 = FastqGeneralIterator(f2_in)
    for (idLine, seqLine, qualLine) in iter1:
        (idLine2, seqLine2, qualLine2) = next(iter2)
        # Qualities are decoded with an ASCII offset of 33.  frombuffer
        # replaces the binary mode of numpy.fromstring, deprecated since
        # NumPy 1.14.
        npQualLine = numpy.frombuffer(
            qualLine.encode("ascii"), dtype=numpy.uint8) - 33
        npQualLine2 = numpy.frombuffer(
            qualLine2.encode("ascii"), dtype=numpy.uint8) - 33
        mean = numpy.mean(npQualLine)
        mean2 = numpy.mean(npQualLine2)
        if mean >= min_av_read_qual and mean2 >= min_av_read_qual:
            lowest = numpy.min(npQualLine)
            lowest2 = numpy.min(npQualLine2)
            if lowest >= min_bp_qual_in_read and lowest2 >= min_bp_qual_in_read:
                f1_out.write("@%s\n%s\n%s\n%s\n" % (idLine, seqLine, "+", qualLine))
                f2_out.write("@%s\n%s\n%s\n%s\n" % (idLine2, seqLine2, "+", qualLine2))
def main():
    '''
    - Read demux_info file, and both read files in a synchronized way
    - Write read files to a file depending on the sample assignment in demux_info

    Options come from ``parse_args()``: ``mapping`` is a headerless
    tab-separated table indexed by its first column and looked up by read ID,
    and ``fastqs`` lists one or two FASTQ paths.  One ``<sample>_R<i>.fastq``
    file is written per sample and read orientation.  Reads without a sample
    assignment are skipped.

    Exits with status 42 when forward and reverse read IDs differ.  Raises
    KeyError when a read ID is absent from the mapping table.
    '''
    import sys  # local import: only needed for the exit below

    args = parse_args()

    demux_info = pd.read_csv(args.mapping, header=None, index_col=0,
                             sep="\t", dtype=str).dropna(axis=1, how='all')
    index_orient = ['fwd', 'rev'][:demux_info.shape[1] - 3]
    demux_info.columns = ['rid'] + index_orient + ['sample_name', 'mismatches']
    read_orient = ['fwd', 'rev'][:len(args.fastqs)]

    print('Preparing handles.')
    handles = {}
    inputs = []
    try:
        for sample in demux_info['sample_name'].unique():
            if pd.isnull(sample):
                continue
            for i, orient in enumerate(read_orient, 1):
                handles[sample + orient] = open(
                    '{}_R{}.fastq'.format(sample, i), 'w')

        # Keep the input handles so they can be closed afterwards.
        for fastq in args.fastqs:
            inputs.append(open(fastq, 'r'))
        parsers = [FastqGeneralIterator(handle) for handle in inputs]

        print('Starting demultiplexing')
        for seq_nb, sequences in enumerate(zip(*parsers)):
            ids = [seq[0].split()[0] for seq in sequences]
            if len(ids) > 1:
                if ids[0] != ids[1]:
                    print(
                        "Sequence #{}: {} (fwd) and {} (rev) do not match. The forward and reverse read files seem to be out of order"
                        .format(seq_nb, *ids))
                    sys.exit(42)

            sample_assignment = demux_info.loc[ids[0], "sample_name"]
            if pd.isnull(sample_assignment):
                continue

            for orient, seq in zip(read_orient, sequences):
                handles[sample_assignment + orient].write(
                    '@{}\n{}\n+\n{}\n'.format(*seq))
    finally:
        # Runs on success, on the mismatch exit and on errors alike.
        for handle in list(handles.values()) + inputs:
            handle.close()

    print("Demultiplexing finished.")
def get_n_reads(fastx, ftype):
    """Return the number of records in a FASTQ or FASTA file.

    Args:
        fastx: Path of the file to count.
        ftype: Either ``"fastq"`` or ``"fasta"``.

    Raises:
        ValueError: If ``ftype`` is neither ``"fastq"`` nor ``"fasta"``.
    """
    if ftype == "fastq":
        parser = FastqGeneralIterator
    elif ftype == "fasta":
        parser = SimpleFastaParser
    else:
        # The original fell through to an UnboundLocalError here.
        raise ValueError(
            "Unsupported file type: %r (expected 'fastq' or 'fasta')" % (ftype,))
    # Records are counted without building a list, and the handle is closed.
    with open(fastx) as handle:
        return sum(1 for _ in parser(handle))
def recordsToDict(outputprefix, inFastq1, inFastq2, idxBase, barcodeCutOff,
                  constant_right, constant_left, barcode_dict):
    """Feed every read pair of two gzipped FASTQ files to ``readClustering``.

    ``readClustering`` is called with ``barcode_dict`` and the derived
    constants bound, then each ``(read1, read2)`` pair; its return value is
    added to the discarded-sequence count.  A summary goes to stderr.

    Args:
        outputprefix: Unused here.
        inFastq1, inFastq2: Paths of the gzip-compressed FASTQ files.
        idxBase: Added to each constant-region length to get the usable
            sequence offsets.
        barcodeCutOff: Passed through to ``readClustering``.
        constant_right, constant_left: Constant-region sequences; must be
            non-empty.
        barcode_dict: Passed to ``readClustering`` and returned.

    Returns:
        Tuple ``(barcode_dict, read_num, barcode_count)`` where ``read_num``
        is the number of read pairs parsed (0 for empty input).

    Raises:
        ZeroDivisionError: If either constant region is empty.
    """
    discarded_sequence_count = 0
    constant_left_length = len(constant_left)
    constant_right_length = len(constant_right)
    hamming_left_threshold = float(1)/constant_left_length
    hamming_right_threshold = float(1)/constant_right_length
    usable_left_seq = idxBase + constant_left_length
    usable_right_seq = idxBase + constant_right_length

    func = partial(readClustering, barcode_dict, idxBase, barcodeCutOff,
                   constant_left, constant_right,
                   constant_left_length, constant_right_length,
                   hamming_left_threshold, hamming_right_threshold,
                   usable_left_seq, usable_right_seq)

    # Pre-set so empty input reports 0 instead of raising NameError.
    read_num = 0
    with gzip.open(inFastq1, 'rb') as fq1, gzip.open(inFastq2, 'rb') as fq2:
        pairs = izip(FastqGeneralIterator(fq1), FastqGeneralIterator(fq2))
        # Counting from 1 makes read_num the number of pairs, not the last
        # zero-based index.
        for read_num, (read1, read2) in enumerate(pairs, 1):
            discarded_sequence_count += func(read1, read2)

    barcode_count = len(barcode_dict)
    stderr.write('[%s] Extracted: %i barcode group\n' %(programname,barcode_count) +\
                 '[%s] discarded: %i sequences\n' %(programname, discarded_sequence_count) +\
                 '[%s] Parsed: %i seqeucnes\n' %(programname, read_num))
    return barcode_dict, read_num, barcode_count
def count_kmers_and_reads(in_fastq, kmer_size):
    """Count k-mers and identical reads in a FASTQ file.

    Reads containing an ``N`` are ignored.  Every other read is consumed by
    a khmer k-table of size ``kmer_size`` and tallied by its exact sequence.

    Returns:
        Tuple ``(ktable, read_counts)`` where ``read_counts`` is a plain dict
        mapping each sequence to its number of occurrences.
    """
    ktable = khmer.new_ktable(kmer_size)
    occurrences = collections.defaultdict(int)
    with open(in_fastq) as fastq_handle:
        for record in FastqGeneralIterator(fastq_handle):
            bases = record[1]
            if "N" in bases:
                continue
            ktable.consume(bases)
            occurrences[bases] += 1
    return ktable, dict(occurrences)
def filter_sample(file, output):
    """Copy to ``output`` the reads whose sample label is not in ``keep_list``.

    The sample label is the text between the first ``=`` and the following
    ``;`` in the title.  Records are written as FASTQ or FASTA according to
    the module-level ``args.format``; any other value writes nothing.

    Updates the module-level counters: ``total_count`` for every read seen
    and ``keep_count`` for every read whose sample is not in ``keep_list``.

    NOTE(review): reads are written when their sample is *not* in
    ``keep_list``; confirm this matches the intended meaning of the name.

    Raises:
        IndexError: If a title contains no ``=``.
    """
    global keep_count, total_count
    # Both files are managed by the ``with`` so the input is closed as well.
    with open(output, 'w') as out, open(file) as in_handle:
        for title, seq, qual in FastqGeneralIterator(in_handle):
            total_count += 1
            sample = title.split('=', 1)[1].split(';')[0]
            if sample not in keep_list:
                keep_count += 1
                if args.format == 'fastq':
                    out.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
                elif args.format == 'fasta':
                    out.write(">%s\n%s\n" % (title, seq))
def benchmark_biopython_faster(fh):
    """Time Biopython's ``FastqGeneralIterator`` over an open FASTQ handle.

    Every ``REFRESH_RATE`` records a throughput figure (megabases of sequence
    per second, labelled MB/s) is printed in place.  A final line reports the
    number of entries and the elapsed time.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator

    total_seq = 0
    n_entries = 0  # stays 0 for empty input instead of leaving ``i`` unbound
    t0 = time.time()
    for i, (title, seq, qual) in enumerate(FastqGeneralIterator(fh)):
        n_entries = i + 1
        total_seq += len(seq)
        if i % REFRESH_RATE == 0:
            elapsed = time.time() - t0
            # On coarse clocks the first refresh can land at elapsed == 0.
            if elapsed > 0:
                print('\r%.2fMB/s' % (total_seq/(1E6)/elapsed), end='', flush=True)
    print()
    print('%i entries in %.3f seconds.' % (n_entries, time.time()-t0))
def findQuality(filename):
    """Return the lowest and highest quality character codes in a FASTQ file.

    Args:
        filename: Path of the FASTQ file.

    Returns:
        Tuple ``(min_value, max_value)`` of ASCII codes.  The result is
        ``(9999, -9999)`` when no quality character was seen.
    """
    max_value = -9999
    min_value = 9999
    with open(filename) as handle:
        for (title, sequence, quality) in FastqGeneralIterator(handle):
            if not quality:
                # A zero-length read has nothing to compare; min()/max() of
                # an empty string would raise ValueError.
                continue
            # Characters order by code point, so the extreme character maps
            # to the extreme code.
            min_value = min(min_value, ord(min(quality)))
            max_value = max(max_value, ord(max(quality)))
    return (min_value, max_value)
def readFastQ(fastq_path):
    """Load a FASTQ file into a dict of ``read_name -> (seq, qual)``.

    The read name is the header with any ``/1`` or ``/2`` removed, cut at the
    first space.  When a name repeats, the last record wins.
    """
    reads_by_name = {}
    with open(fastq_path, 'r') as FASTQ:
        for header, seq, qual in FastqGeneralIterator(FASTQ):
            read_name = re.sub('/[1-2]', '', header).split(' ')[0]
            reads_by_name[read_name] = (seq, qual)
    return reads_by_name
def run_script():
    """Append ``/1`` to every FASTQ title and write a gzipped copy.

    Reads the module-level ``fastq_file`` (gzip, text mode) and writes the
    module-level ``fixed_fastq_file``.  Any exception is printed rather than
    raised, as before.
    """
    try:
        # The context managers close both files.  The original called an
        # undefined ``close(out)``, so the output was never closed.
        with gzip.open(fixed_fastq_file, "wt", compresslevel=4, newline="\n") as out:
            with gzip.open(fastq_file, "rt") as handle:
                for title, sequence, quality in FastqGeneralIterator(handle):
                    # The iterator strips the leading "@", so it is put
                    # back here.  Each record also needs a trailing newline.
                    out.write("@%s/1\n%s\n+\n%s\n" % (title, sequence, quality))
    except Exception as e:
        print(e)
def interleave(prefix): #Setup variables file_f = prefix + "_1.fastq" file_r = prefix + "_2.fastq" file_out = prefix + "_interleaved.fastq" handle = open(file_out, "w") count = 0 f_iter = FastqGeneralIterator(open(file_f, "rU")) r_iter = FastqGeneralIterator(open(file_r, "rU")) for (f_id, f_seq, f_q), (r_id, r_seq, r_q) in itertools.izip(f_iter, r_iter): assert f_id.split(' ')[0] == r_id.split(' ')[0] count += 2 #Write out both reads with "/1" and "/2" suffix on ID handle.write( "@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n" % (f_id.split(' ')[0], f_seq, f_q, r_id.split(' ')[0], r_seq, r_q)) handle.close() print "%i records written to %s" % (count, file_out)
def parse_2fastq_parallel(file1, file2):
    """Parse two fastq files in parallel.

    Generator yielding ``(name, seq1, seq2, qual1, qual2)`` tuples, where
    ``name`` is the first whitespace-delimited word of the title in
    ``file1``.  Doesn't check that the readnames match.

    Raises:
        DeepseqError: If one file ends before the other.  The message names
            the first unpaired read.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator    # Bio is the biopython package
    with open(file1) as INFILE1:
        with open(file2) as INFILE2:
            generator1 = FastqGeneralIterator(INFILE1)
            generator2 = FastqGeneralIterator(INFILE2)
            while True:
                # next() with a default works on Python 2.6+ and 3.
                record1 = next(generator1, None)
                record2 = next(generator2, None)
                if record1 is None and record2 is None:
                    # Plain return: raising StopIteration inside a generator
                    # becomes a RuntimeError under PEP 479.
                    return
                if record1 is None or record2 is None:
                    leftover = record2 if record1 is None else record1
                    raise DeepseqError("One file finished but the other one didn't! Read name %s"%(
                        leftover[0].split()[0]))
                name1, seq1, qual1 = record1
                name2, seq2, qual2 = record2
                yield (name1.split()[0], seq1, seq2, qual1, qual2)
def stitch_seqs(outfile, file1, file2, blen):
    """Join each read pair into one read separated by a run of ``N`` bases.

    Each output record is read 1, then ``blen`` ``N`` bases (quality ``!``),
    then the reverse complement of read 2 with its qualities reversed.  The
    title is the first whitespace-delimited word of read 1's title.

    Stitching stops when either input is exhausted.  An IOError while
    reading is treated like end of input, as it was before.  Empty input
    produces an empty output file.

    Args:
        outfile: Path of the FASTQ file to write.
        file1: Path of the FASTQ file holding the first reads.
        file2: Path of the FASTQ file holding the second reads.
        blen: Number of ``N`` bases inserted between the two reads.
    """
    bseq = 'N' * blen
    bqual = '!' * blen
    with open(file1) as in1, open(file2) as in2, open(outfile, 'w') as outh:
        itr1 = FastqGeneralIterator(in1)
        itr2 = FastqGeneralIterator(in2)
        while 1:
            try:
                # next() works on both Python 2.6+ and Python 3.
                rec1 = next(itr1)
                rec2 = next(itr2)
            except (StopIteration, IOError):
                break
            seq2 = Seq(rec2[1], generic_dna)
            outh.write("@%s\n%s%s%s\n+\n%s%s%s\n" % (
                rec1[0].split()[0],
                rec1[1], bseq, str(seq2.reverse_complement()),
                rec1[2], bqual, rec2[2][::-1]))
sys.stderr.write("Interlacing %s and %s\n" % (fastq1, fastq2)) if fastq1.endswith(".gz"): sys.stderr.write("Decompressing %s\n" % fastq1) handle1 = gzip.open(fastq1) else: handle1 = open(fastq1) if fastq2.endswith(".gz"): sys.stderr.write("Decompressing %s\n" % fastq2) handle2 = gzip.open(fastq2) else: handle2 = open(fastq2) sys.stderr.write("Interlacing paired FASTQ files to stdout...\n") out_handle = sys.stdout iter1 = FastqGeneralIterator(handle1) iter2 = FastqGeneralIterator(handle2) for title1, seq1, qual1 in iter1: try: title2, seq2, qual2 = iter2.next() except StopIteration: sys_exit("More records in %s than %s, e.g. %s" % (fastq1, fastq2, title1)) id1, descr1 = title1.split(None, 1) id2, descr2 = title2.split(None, 1) if id1 == id2: # Add the /1 and /2, preserve any description after the ID if descr1: descr1 = " " + descr1 if descr2: descr2 = " " + descr2