Beispiel #1
0
def process_assembled(myData):
    """Split assembled reads into processed R1/R2 FASTQ files.

    Each record read from myData['assembledFQ'] is passed to check_seq().
    Records whose result has res['passChecks'] is True are then passed to
    read_len_test():

    * result is True  -> the pair is written to the '.processed' files
    * result is False -> the pair is written to the '.lenFail' files

    Records that do not pass check_seq() are only counted.

    NOTE(review): per the original comment, the input comes from a PEAR
    assembly, check_seq() looks for the expected linker (reverse
    complementing r1 and imposing a minimum alignment score) and
    read_len_test() requires reads longer than 22 bp.  Those helpers are
    not visible here -- confirm against their definitions.

    Keys read from myData:
        outDir, sampleName, assembledFQ, numDiscarded, numNotAssem
        (the last two must already be set, otherwise computing totReads
        raises KeyError).

    Keys written to myData:
        newR1FileName, newR2FileName, lenFail_R1FileName,
        lenFail_R2FileName, numAssembled, numOK, numFail, lenFail,
        totReads.
    """
    prefix = myData['outDir'] + myData['sampleName']
    myData['newR1FileName'] = prefix + '.processed.R1.fq.gz'
    myData['newR2FileName'] = prefix + '.processed.R2.fq.gz'
    myData['lenFail_R1FileName'] = prefix + '.lenFail.R1.fq.gz'
    myData['lenFail_R2FileName'] = prefix + '.lenFail.R2.fq.gz'

    myData['numAssembled'] = 0
    myData['numOK'] = 0
    myData['numFail'] = 0
    myData['lenFail'] = 0

    fqFormat = '@%s\n%s\n+\n%s\n'

    # Every handle that was successfully opened is remembered here so that
    # the finally clause can close all of them, including the lenFail
    # outputs, even when an error interrupts processing.
    openHandles = []
    try:
        outR1 = genutils.open_gzip_write(myData['newR1FileName'])
        openHandles.append(outR1)
        outR2 = genutils.open_gzip_write(myData['newR2FileName'])
        openHandles.append(outR2)
        lenFailR1 = genutils.open_gzip_write(myData['lenFail_R1FileName'])
        openHandles.append(lenFailR1)
        lenFailR2 = genutils.open_gzip_write(myData['lenFail_R2FileName'])
        openHandles.append(lenFailR2)
        fqFile = genutils.open_gzip_read(myData['assembledFQ'])
        openHandles.append(fqFile)

        while True:
            R1 = fastqstats.get_next_seq_record(fqFile)
            if R1 is None:
                break
            myData['numAssembled'] += 1
            res = check_seq(R1, myData)

            if res['passChecks'] is True:
                passLength = read_len_test(res)
                # Strict identity tests are kept on purpose: any value
                # other than exactly True/False is neither counted nor
                # written, as before.
                if passLength is True:
                    myData['numOK'] += 1
                    targets = (outR1, outR2)
                elif passLength is False:
                    myData['lenFail'] += 1
                    targets = (lenFailR1, lenFailR2)
                else:
                    targets = None

                if targets is not None:
                    # Keep only the first whitespace-delimited token of
                    # the read name and tag the two mates with 1 and 2.
                    name = R1['readName'].split()[0]
                    targets[0].write(fqFormat % (name + ' 1', res['seq1'], res['seq1Qual']))
                    targets[1].write(fqFormat % (name + ' 2', res['seq2'], res['seq2Qual']))
            else:
                myData['numFail'] += 1

            if myData['numAssembled'] % 25000 == 0:
                # Single parenthesised argument: same output whether print
                # is a statement (Python 2) or a function (Python 3).
                print('\tProcesssed %i assembled seqs...' % (myData['numAssembled']))

        myData['totReads'] = myData['numAssembled'] + myData['numDiscarded'] + myData['numNotAssem']
    finally:
        for handle in openHandles:
            handle.close()
Beispiel #2
0
def count_num_discarded(myData):
    """Count the records in the discarded-reads FASTQ file.

    Opens myData['discardedFQ'] with genutils.open_gzip_read, reads records
    with fastqstats.get_next_seq_record until it returns None, and stores
    the number of records in myData['numDiscarded'].  Returns nothing.
    """
    myData['numDiscarded'] = 0
    fqFile = genutils.open_gzip_read(myData['discardedFQ'])
    try:
        while fastqstats.get_next_seq_record(fqFile) is not None:
            myData['numDiscarded'] += 1
    finally:
        # Close the input even if the record parser raises.
        fqFile.close()
Beispiel #3
0
def count_num_not_assembled(myData):
    """Count the records in the not-assembled FASTQ file.

    Opens myData['notAssemF'] with genutils.open_gzip_read, reads records
    with fastqstats.get_next_seq_record until it returns None, and stores
    the number of records in myData['numNotAssem'].  Returns nothing.
    """
    myData['numNotAssem'] = 0
    fqFile = genutils.open_gzip_read(myData['notAssemF'])
    try:
        while fastqstats.get_next_seq_record(fqFile) is not None:
            myData['numNotAssem'] += 1
    finally:
        # Close the input even if the record parser raises.
        fqFile.close()