def makeboxplot(filteredclusts, dblibrary, figname, pool=False):
    '''Save a boxplot of per-individual cluster counts, one box per group.

    For every individual the number of clusters in filteredclusts that
    contain it is counted.  Rows of the library table are then used to sort
    those counts into groups keyed by (flowcell, lane, index) or, when pool
    is true, by (flowcell, lane, index, pool).  Each group becomes one box.

    filteredclusts -- dict of cluster -> {individual: reads}.  Only the
        presence of an individual in a cluster matters; the reads value is
        not used.  Individuals are matched against
        (flowcell, lane, index, sampleid) tuples built from the table rows.
    dblibrary -- passed unchanged to gdata_tools.get_table_as_dict.
    figname -- destination handed to the figure's savefig().
    pool -- group by pool as well as by flowcell/lane/index when true.

    Returns None; the only output is the saved figure.
    '''
    # Number of retained clusters each individual appears in.
    indiv_cluster_count = defaultdict(int)
    for inddict in filteredclusts.values():
        for ind in inddict:
            indiv_cluster_count[ind] += 1

    # One dict per library row (a Google spreadsheet, per the original
    # author's note -- the helper itself is not visible from this block).
    t = gdata_tools.get_table_as_dict(dblibrary)

    indiv_by_group = defaultdict(list)
    for d in t:
        # Rows without a 'pool' column are ignored in both grouping modes.
        if 'pool' not in d:
            continue
        indkey = (d.get('flowcell'), d.get('lane'),
                  d.get('index'), d.get('sampleid'))
        if indkey not in indiv_cluster_count:
            continue
        # Reuse the values already looked up for indkey so a row lacking
        # 'flowcell' or 'lane' cannot raise KeyError after it has matched.
        group = indkey[:3]
        if pool:
            group += (d['pool'],)
        indiv_by_group[group].append(indiv_cluster_count[indkey])

    # Labels and boxes are built from the same key list so they stay aligned.
    labels = list(indiv_by_group)
    boxes = [indiv_by_group[group] for group in labels]

    # NOTE(review): figure number 1 is reused on every call, so repeated
    # calls in one process draw into the same figure -- confirm intended.
    boxplt = pylab.figure(1)
    pylab.boxplot(boxes)
    # Boxes are drawn at x = 1..len(boxes); put one tick label under each.
    pylab.xticks(list(range(1, len(labels) + 1)), labels, fontsize='small')
    # TODO (from original author): legend with best location (0) if pools
    boxplt.savefig(figname)
def makeboxplot(filteredclusts, dblibrary, figname, pool=False):
    '''Save a boxplot of per-individual cluster counts, one box per group.

    For every individual the number of clusters in filteredclusts that
    contain it is counted.  Rows of the library table are then used to sort
    those counts into groups keyed by (flowcell, lane, index) or, when pool
    is true, by (flowcell, lane, index, pool).  Each group becomes one box.

    filteredclusts -- dict of cluster -> {individual: reads}.  Only the
        presence of an individual in a cluster matters; the reads value is
        not used.  Individuals are matched against
        (flowcell, lane, index, sampleid) tuples built from the table rows.
    dblibrary -- passed unchanged to gdata_tools.get_table_as_dict.
    figname -- destination handed to the figure's savefig().
    pool -- group by pool as well as by flowcell/lane/index when true.

    Returns None; the only output is the saved figure.
    '''
    # Number of retained clusters each individual appears in.
    indiv_cluster_count = defaultdict(int)
    for inddict in filteredclusts.values():
        for ind in inddict:
            indiv_cluster_count[ind] += 1

    # One dict per library row (a Google spreadsheet, per the original
    # author's note -- the helper itself is not visible from this block).
    t = gdata_tools.get_table_as_dict(dblibrary)

    indiv_by_group = defaultdict(list)
    for d in t:
        # Rows without a 'pool' column are ignored in both grouping modes.
        if 'pool' not in d:
            continue
        indkey = (d.get('flowcell'), d.get('lane'),
                  d.get('index'), d.get('sampleid'))
        if indkey not in indiv_cluster_count:
            continue
        # Reuse the values already looked up for indkey so a row lacking
        # 'flowcell' or 'lane' cannot raise KeyError after it has matched.
        group = indkey[:3]
        if pool:
            group += (d['pool'],)
        indiv_by_group[group].append(indiv_cluster_count[indkey])

    # Labels and boxes are built from the same key list so they stay aligned.
    labels = list(indiv_by_group)
    boxes = [indiv_by_group[group] for group in labels]

    # NOTE(review): figure number 1 is reused on every call, so repeated
    # calls in one process draw into the same figure -- confirm intended.
    boxplt = pylab.figure(1)
    pylab.boxplot(boxes)
    # Boxes are drawn at x = 1..len(boxes); put one tick label under each.
    pylab.xticks(list(range(1, len(labels) + 1)), labels, fontsize='small')
    # TODO (from original author): legend with best location (0) if pools
    boxplt.savefig(figname)
counts_by_pool[pool][ind] += ct return counts_by_pool def get_uniqued_info(uniqued): if 'index' in uniqued: ufields = os.path.splitext(os.path.basename(uniqued))[0].rsplit('_',3) ufields[3] = ufields[3][5:] else: ufields = os.path.splitext(os.path.basename(uniqued))[0].rsplit('_',2) ufields.append(None) ufields[1] = ufields[1][4:] return ufields if __name__ == "__main__": db = gdata_tools.get_table_as_dict('DB_library_data') uniqued = sys.argv[1] ufields = get_uniqued_info(uniqued) counts_by_pool = get_counts_by_pool(uniqued,db) for k,v in counts_by_pool.items(): print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%0.1f\t%d' % (ufields[0],ufields[1],ufields[2],ufields[3],k,sum(v.values()),len(v),numpy.mean(v.values()),numpy.median(v.values()))