Example #1
0
def calc_phredperbase_boxplot(infiles = None, filepattern = '', data_inpath = '', 
                              saveprefix = '', png_filename=''):
    ''' Tally Phred scores per base position and derive box plot stats from them.

    Every record of every file produced by SeqRecCycler is scanned, and for
    each base position a Counter of {phred score: occurrences} is built up.
    The tallies are passed to getStatsFT, the tallies and stats are written to
    disk, and the stats are plotted with plotFTstats.

    Parameters:
    infiles, filepattern, data_inpath -- forwarded unchanged to SeqRecCycler to
        select the input sequence files.
    saveprefix   -- middle component of the two output file names.
    png_filename -- forwarded to plotFTstats.

    Output files (relative names, so they land in the current directory):
        <dir>_<saveprefix>_phredCount      counters, written by pklsave
        <dir>_<saveprefix>_phredStats.npy  stats, written by np.save
    where <dir> is the last component of data_inpath. Trailing slashes are
    ignored, so 'a/b/' and 'a/b' both give 'b'.

    Returns (stats, counter_list). counter_list is a list holding one Counter
    per base position; it has at least 101 entries and grows if a longer read
    is met.

    Counter list may become standard way to store mass Phred/seq bases data.  '''
    from collections import Counter

    RecCycler = SeqRecCycler(infiles = infiles, filepattern = filepattern, data_inpath = data_inpath)

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print('\nCalculating Box plot stats of phred scores per base position.\n')

    # Define vars and outputs
    numfiles = RecCycler.numfiles

    toc = time.time()
    cum_t = 0

    # One Counter per base position. 101 slots are always present so the
    # result keeps its historical minimum length.
    counter_list = [Counter() for _ in range(101)]

    for seqrecgen in RecCycler.seqfilegen:

        filename = RecCycler.curfilename
        filenum = RecCycler.curfilenum

        for rec in seqrecgen:
            quals = rec.letter_annotations['phred_quality']

            # Reads longer than the current list used to raise IndexError;
            # add the missing positions instead.
            shortfall = len(quals) - len(counter_list)
            if shortfall > 0:
                counter_list.extend(Counter() for _ in range(shortfall))

            for basenum, phred in enumerate(quals):
                counter_list[basenum][phred] += 1

        # Time spent on this file = total elapsed minus time already reported.
        loop_t = time.time() - toc - cum_t
        cum_t += loop_t
        print('Finished {0} \nfile {1} of {2} after {3}'.format(
            filename, filenum, numfiles,
            time.strftime('%H:%M:%S', time.gmtime(loop_t))))

    # Summary stats per base position (computed by getStatsFT)
    stats = getStatsFT(counter_list)

    total_t = time.time() - toc
    print('Processed all files in {0}'.format(
        time.strftime('%H:%M:%S', time.gmtime(total_t))))

    # Name the outputs after the input directory. Strip trailing slashes
    # first, otherwise 'path/to/dir/' would give an empty name.
    pklfilename = data_inpath.rstrip('/').split('/')[-1]

    pklsave(counter_list, '_'.join([pklfilename, saveprefix, 'phredCount']))
    np.save('_'.join([pklfilename, saveprefix, 'phredStats.npy']), stats)

    plotFTstats(stats, png_filename)

    return stats, counter_list
Example #2
0
          
    total_t = time.time() - toc
    print 'Finished in {0}'.format(
            time.strftime('%H:%M:%S', time.gmtime(total_t))) 
        
    return TagCounter


if __name__ == '__main__':

    # Script entry point: run tags_counter over the '-pass' read files of one
    # lane and store the result with pklsave.
    #===========================================================================
    ''' RUNS SCRIPT FOR ALLL READS IN LANE 6 '''
    #===========================================================================

    LANE = '6'

    # Input directory for this lane.
    # (An alternative location used previously: '/home/musselle/data/lane' + LANE)
    data_inpath = ''.join(['/space/musselle/datasets/gazellesAndZebras/lane',
                           LANE,
                           '/L6_phredprop_filtered/'])

    # Change into the data directory so the relative glob pattern below, and
    # the relative name handed to pklsave, refer to it.
    os.chdir(data_inpath)

    # Matching files, in sorted name order.
    raw_files = sorted(glob.glob('*[0-9]-pass.fastq.bgzf'))

    TagsCounter = tags_counter(infiles=raw_files, sl=(6, 12))
    pklsave(TagsCounter, 'L{0}_TagsCount-pass'.format(LANE))