def wiki_sizes_chart(path, prefixes, upperlimit=None):
    """Save a horizontal bar chart of per-prefix document counts to ``path``.

    One row is drawn per prefix, ordered by ascending ``dumpSize(prefix)``.
    Each row shows ``dumpSize(prefix)`` as a blue bar, overlaid with one bar
    per threshold whose length is ``docs_under_thresh(prefix, threshold)``.
    Thresholds are drawn from largest to smallest; assuming the counts
    shrink with the threshold, the shorter bars end up on top.

    Args:
        path: Destination handed to ``savefig`` (written at 300 dpi).
        prefixes: Iterable of prefixes; they are used verbatim as y tick
            labels (the axis is labelled 'Language Code').
        upperlimit: Optional upper bound for the x axis. Falsy values
            (``None``, 0) keep matplotlib's automatic limits.

    Raises:
        ValueError: If ``prefixes`` is empty.
    """
    ranked = sorted(((pr, dumpSize(pr)) for pr in prefixes),
                    key=operator.itemgetter(1))
    if not ranked:
        # zip(*[]) cannot be unpacked into two names; fail with a message
        # that points at the real cause instead.
        raise ValueError('prefixes must contain at least one entry')
    prefixes, sizes = zip(*ranked)

    block_size = 5  # vertical space reserved for each prefix row
    bar_height = 4  # bar thickness; leaves a gap of 1 between rows
    ind = p.arange(0, block_size * len(prefixes), block_size)  # row centres
    thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
    colors = html_colors  # module-level palette, cycled if too short
    tick_font = FontProperties(size=4)

    try:
        # Total size per prefix, drawn first so the threshold bars cover it.
        p.barh(ind, sizes, bar_height, color='b', linewidth=0,
               align='center')

        subbars = []
        for i, thresh in enumerate(thresholds):
            counts = [docs_under_thresh(pr, thresh) for pr in prefixes]
            subbars.append(
                p.barh(ind, counts, bar_height,
                       color=colors[i % len(colors)],
                       linewidth=0, align='center'))

        p.ylim(-bar_height, len(prefixes) * block_size)
        if upperlimit:
            p.xlim(0, upperlimit)

        p.xlabel('Documents')
        p.ylabel('Language Code')
        p.title('Number of Documents Under Threshold')
        p.yticks(ind, prefixes, fontproperties=tick_font)

        xmin, xmax = p.xlim()
        # rounded_interval is defined elsewhere; presumably it picks a
        # "round" tick spacing for this range -- confirm against its
        # definition.
        xtick_interval = rounded_interval(xmin, xmax, 20, 2)
        p.xticks(p.arange(xmin, xmax, xtick_interval),
                 fontproperties=tick_font)
        p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
        p.gca().yaxis.tick_left()

        # One legend entry per threshold, using the first bar of each group
        # as the colour sample. Labels are built as a real list so this
        # also works where map() is lazy.
        p.legend([bars[0] for bars in subbars],
                 [str(thresh) for thresh in thresholds],
                 prop=tick_font, loc='lower right')

        p.savefig(path, dpi=300)
    finally:
        # Clear *then* close. Calling clf() after close() would make pyplot
        # create a new figure (or clear some other open figure).
        p.clf()
        p.close()
def wiki_proportions_chart(path, prefixes):
    """Save a horizontal bar chart of per-prefix proportions to ``path``.

    One row is drawn per prefix, ordered by ascending ``dumpSize(prefix)``.
    For every threshold a bar of length
    ``docs_under_thresh(prefix, threshold) / dumpSize(prefix)`` is drawn in
    a grey shade; thresholds go from largest (lightest) to smallest
    (darkest), each drawn over the previous ones. The current pyplot figure
    is cleared before drawing.

    Args:
        path: Destination handed to ``savefig`` (written at 300 dpi).
        prefixes: Iterable of prefixes; they are used verbatim as y tick
            labels (the axis is labelled 'Language Code').

    Raises:
        ValueError: If ``prefixes`` is empty.
    """
    ranked = sorted(((pr, dumpSize(pr)) for pr in prefixes),
                    key=operator.itemgetter(1))
    if not ranked:
        # zip(*[]) cannot be unpacked into two names; fail with a message
        # that points at the real cause instead.
        raise ValueError('prefixes must contain at least one entry')
    prefixes, sizes = zip(*ranked)

    block_size = 5  # vertical space reserved for each prefix row
    bar_height = 4  # bar thickness; leaves a gap of 1 between rows
    ind = p.arange(0, block_size * len(prefixes), block_size)  # row centres
    thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
    # Matplotlib grey-level strings, evenly spaced and ordered light to
    # dark ('0.9' ... '0.1'), one per threshold.
    shades = [str(float(step) / (len(thresholds) + 1))
              for step in range(len(thresholds), 0, -1)]
    tick_font = FontProperties(size=4)

    try:
        p.clf()

        for thresh, shade in zip(thresholds, shades):
            # Reuse the sizes gathered for sorting rather than calling
            # dumpSize again for every threshold. A size of zero is plotted
            # as proportion 0.0 instead of dividing by zero.
            proportions = [
                float(docs_under_thresh(pr, thresh)) / size if size else 0.0
                for pr, size in zip(prefixes, sizes)
            ]
            p.barh(ind, proportions, bar_height, color=shade,
                   linewidth=0, align='center')

        p.ylim(-bar_height, len(prefixes) * block_size)
        p.xlim(0, 1)

        p.xlabel('Proportion')
        p.ylabel('Language Code')
        p.title('Proportion of Documents Under Threshold')
        p.yticks(ind, prefixes, fontproperties=tick_font)

        xmin, xmax = p.xlim()
        xtick_interval = 0.1
        p.xticks(p.arange(xmin, xmax, xtick_interval),
                 fontproperties=tick_font)
        p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
        p.gca().yaxis.tick_left()

        p.savefig(path, dpi=300)
    finally:
        # Clear *then* close. Calling clf() after close() would make pyplot
        # create a new figure (or clear some other open figure).
        p.clf()
        p.close()
def wiki_sizes_chart(path, prefixes, upperlimit=None):
    """Save a horizontal bar chart of per-prefix document counts to ``path``.

    One row is drawn per prefix, ordered by ascending ``dumpSize(prefix)``.
    Each row shows ``dumpSize(prefix)`` as a blue bar, overlaid with one bar
    per threshold whose length is ``docs_under_thresh(prefix, threshold)``.
    Thresholds are drawn from largest to smallest; assuming the counts
    shrink with the threshold, the shorter bars end up on top.

    Args:
        path: Destination handed to ``savefig`` (written at 300 dpi).
        prefixes: Iterable of prefixes; they are used verbatim as y tick
            labels (the axis is labelled 'Language Code').
        upperlimit: Optional upper bound for the x axis. Falsy values
            (``None``, 0) keep matplotlib's automatic limits.

    Raises:
        ValueError: If ``prefixes`` is empty.
    """
    ranked = sorted(((pr, dumpSize(pr)) for pr in prefixes),
                    key=operator.itemgetter(1))
    if not ranked:
        # zip(*[]) cannot be unpacked into two names; fail with a message
        # that points at the real cause instead.
        raise ValueError('prefixes must contain at least one entry')
    prefixes, sizes = zip(*ranked)

    block_size = 5  # vertical space reserved for each prefix row
    bar_height = 4  # bar thickness; leaves a gap of 1 between rows
    ind = p.arange(0, block_size * len(prefixes), block_size)  # row centres
    thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
    colors = html_colors  # module-level palette, cycled if too short
    tick_font = FontProperties(size=4)

    try:
        # Total size per prefix, drawn first so the threshold bars cover it.
        p.barh(ind, sizes, bar_height, color='b', linewidth=0,
               align='center')

        subbars = []
        for i, thresh in enumerate(thresholds):
            counts = [docs_under_thresh(pr, thresh) for pr in prefixes]
            subbars.append(
                p.barh(ind, counts, bar_height,
                       color=colors[i % len(colors)],
                       linewidth=0, align='center'))

        p.ylim(-bar_height, len(prefixes) * block_size)
        if upperlimit:
            p.xlim(0, upperlimit)

        p.xlabel('Documents')
        p.ylabel('Language Code')
        p.title('Number of Documents Under Threshold')
        p.yticks(ind, prefixes, fontproperties=tick_font)

        xmin, xmax = p.xlim()
        # rounded_interval is defined elsewhere; presumably it picks a
        # "round" tick spacing for this range -- confirm against its
        # definition.
        xtick_interval = rounded_interval(xmin, xmax, 20, 2)
        p.xticks(p.arange(xmin, xmax, xtick_interval),
                 fontproperties=tick_font)
        p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
        p.gca().yaxis.tick_left()

        # One legend entry per threshold, using the first bar of each group
        # as the colour sample. Labels are built as a real list so this
        # also works where map() is lazy.
        p.legend([bars[0] for bars in subbars],
                 [str(thresh) for thresh in thresholds],
                 prop=tick_font, loc='lower right')

        p.savefig(path, dpi=300)
    finally:
        # Clear *then* close. Calling clf() after close() would make pyplot
        # create a new figure (or clear some other open figure).
        p.clf()
        p.close()
def wiki_proportions_chart(path, prefixes):
    """Save a horizontal bar chart of per-prefix proportions to ``path``.

    One row is drawn per prefix, ordered by ascending ``dumpSize(prefix)``.
    For every threshold a bar of length
    ``docs_under_thresh(prefix, threshold) / dumpSize(prefix)`` is drawn in
    a grey shade; thresholds go from largest (lightest) to smallest
    (darkest), each drawn over the previous ones. The current pyplot figure
    is cleared before drawing.

    Args:
        path: Destination handed to ``savefig`` (written at 300 dpi).
        prefixes: Iterable of prefixes; they are used verbatim as y tick
            labels (the axis is labelled 'Language Code').

    Raises:
        ValueError: If ``prefixes`` is empty.
    """
    ranked = sorted(((pr, dumpSize(pr)) for pr in prefixes),
                    key=operator.itemgetter(1))
    if not ranked:
        # zip(*[]) cannot be unpacked into two names; fail with a message
        # that points at the real cause instead.
        raise ValueError('prefixes must contain at least one entry')
    prefixes, sizes = zip(*ranked)

    block_size = 5  # vertical space reserved for each prefix row
    bar_height = 4  # bar thickness; leaves a gap of 1 between rows
    ind = p.arange(0, block_size * len(prefixes), block_size)  # row centres
    thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
    # Matplotlib grey-level strings, evenly spaced and ordered light to
    # dark ('0.9' ... '0.1'), one per threshold.
    shades = [str(float(step) / (len(thresholds) + 1))
              for step in range(len(thresholds), 0, -1)]
    tick_font = FontProperties(size=4)

    try:
        p.clf()

        for thresh, shade in zip(thresholds, shades):
            # Reuse the sizes gathered for sorting rather than calling
            # dumpSize again for every threshold. A size of zero is plotted
            # as proportion 0.0 instead of dividing by zero.
            proportions = [
                float(docs_under_thresh(pr, thresh)) / size if size else 0.0
                for pr, size in zip(prefixes, sizes)
            ]
            p.barh(ind, proportions, bar_height, color=shade,
                   linewidth=0, align='center')

        p.ylim(-bar_height, len(prefixes) * block_size)
        p.xlim(0, 1)

        p.xlabel('Proportion')
        p.ylabel('Language Code')
        p.title('Proportion of Documents Under Threshold')
        p.yticks(ind, prefixes, fontproperties=tick_font)

        xmin, xmax = p.xlim()
        xtick_interval = 0.1
        p.xticks(p.arange(xmin, xmax, xtick_interval),
                 fontproperties=tick_font)
        p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
        p.gca().yaxis.tick_left()

        p.savefig(path, dpi=300)
    finally:
        # Clear *then* close. Calling clf() after close() would make pyplot
        # create a new figure (or clear some other open figure).
        p.clf()
        p.close()