def _write_fastq_pair(outR1, outR2, readName, res):
    # Write the two halves of a split contig as FASTQ records whose names are
    # the first whitespace-delimited token of readName plus ' 1' / ' 2'.
    name = readName.split()[0]
    outR1.write('@%s\n%s\n+\n%s\n' % (name + ' 1', res['seq1'], res['seq1Qual']))
    outR2.write('@%s\n%s\n+\n%s\n' % (name + ' 2', res['seq2'], res['seq2Qual']))


def process_assembled(myData):
    """Split assembled contigs into R1/R2 FASTQ files around the linker.

    Each record of myData['assembledFQ'] is passed to check_seq(). Records
    whose result has passChecks set are then passed to read_len_test():
    those that pass are written to the '.processed.R1/R2.fq.gz' files and
    those that fail to the '.lenFail.R1/R2.fq.gz' files.  Per the original
    author's notes, check_seq searches for the expected linker (imposing a
    minimum alignment score and reverse-complementing r1) and the length
    test requires reads longer than 22 bp; neither helper is visible here,
    so confirm against their definitions.

    Keys written to myData: newR1FileName, newR2FileName,
    lenFail_R1FileName, lenFail_R2FileName, numAssembled, numOK, numFail,
    lenFail and totReads.

    Keys read from myData: outDir, sampleName, assembledFQ, plus
    numDiscarded and numNotAssem, which must already be set (a KeyError is
    raised otherwise) because totReads is the sum of all three counts.
    """
    prefix = myData['outDir'] + myData['sampleName']
    myData['newR1FileName'] = prefix + '.processed.R1.fq.gz'
    myData['newR2FileName'] = prefix + '.processed.R2.fq.gz'
    myData['lenFail_R1FileName'] = prefix + '.lenFail.R1.fq.gz'
    myData['lenFail_R2FileName'] = prefix + '.lenFail.R2.fq.gz'

    myData['numAssembled'] = 0
    myData['numOK'] = 0
    myData['numFail'] = 0
    myData['lenFail'] = 0

    # Track every handle as it is opened so that all of them are closed on
    # any exit path.  The lenFail writers in particular were previously
    # never closed, which can leave those gzip files incomplete.
    handles = []
    try:
        outR1 = genutils.open_gzip_write(myData['newR1FileName'])
        handles.append(outR1)
        outR2 = genutils.open_gzip_write(myData['newR2FileName'])
        handles.append(outR2)
        lenFailR1 = genutils.open_gzip_write(myData['lenFail_R1FileName'])
        handles.append(lenFailR1)
        lenFailR2 = genutils.open_gzip_write(myData['lenFail_R2FileName'])
        handles.append(lenFailR2)
        fqFile = genutils.open_gzip_read(myData['assembledFQ'])
        handles.append(fqFile)

        while True:
            R1 = fastqstats.get_next_seq_record(fqFile)
            if R1 is None:
                break
            myData['numAssembled'] += 1

            res = check_seq(R1, myData)
            # Reset per record so a value from an earlier read is never reused.
            passLength = None
            if res['passChecks'] is True:
                passLength = read_len_test(res)
                if passLength is True:
                    myData['numOK'] += 1
                    _write_fastq_pair(outR1, outR2, R1['readName'], res)
            else:
                # NOTE(review): numFail is taken to count reads rejected by
                # check_seq; length failures are tallied separately in
                # lenFail below.  Confirm against the intended report.
                myData['numFail'] += 1

            if myData['numAssembled'] % 25000 == 0:
                print('\tProcesssed %i assembled seqs...' % (myData['numAssembled']))

            if res['passChecks'] is True and passLength is False:
                myData['lenFail'] += 1
                _write_fastq_pair(lenFailR1, lenFailR2, R1['readName'], res)

        myData['totReads'] = (myData['numAssembled']
                              + myData['numDiscarded']
                              + myData['numNotAssem'])
    finally:
        for handle in handles:
            handle.close()
def count_num_discarded(myData):
    """Count the records in the discarded-reads FASTQ file.

    Opens myData['discardedFQ'] with genutils.open_gzip_read, reads records
    until fastqstats.get_next_seq_record returns None, and stores the number
    of records read in myData['numDiscarded'].
    """
    myData['numDiscarded'] = 0
    fqFile = genutils.open_gzip_read(myData['discardedFQ'])
    try:
        while fastqstats.get_next_seq_record(fqFile) is not None:
            myData['numDiscarded'] += 1
    finally:
        # Release the handle even if the record parser raises part-way.
        fqFile.close()
def count_num_not_assembled(myData):
    """Count the records in the not-assembled forward-read FASTQ file.

    Opens myData['notAssemF'] with genutils.open_gzip_read, reads records
    until fastqstats.get_next_seq_record returns None, and stores the number
    of records read in myData['numNotAssem'].
    """
    myData['numNotAssem'] = 0
    fqFile = genutils.open_gzip_read(myData['notAssemF'])
    try:
        while fastqstats.get_next_seq_record(fqFile) is not None:
            myData['numNotAssem'] += 1
    finally:
        # Release the handle even if the record parser raises part-way.
        fqFile.close()