Esempio n. 1
0
def wiki_sizes_chart(path, prefixes, upperlimit=None):
  """Save a horizontal bar chart of per-wiki document counts to `path`.

  One row is drawn per prefix, ordered by ascending `dumpSize(prefix)`.
  Each row gets a blue bar of that length, overlaid with one bar per
  threshold whose length is `docs_under_thresh(prefix, threshold)`.
  Thresholds are drawn from largest to smallest, so later bars are painted
  over earlier ones at the same y position.

  NOTE(review): `dumpSize` and `docs_under_thresh` are defined outside this
  block; presumably both return document counts -- confirm.

  Args:
    path: destination handed to `p.savefig`.
    prefixes: iterable of wiki prefixes; they are also used as the y tick
      labels (the axis is labelled 'Language Code').
    upperlimit: optional upper bound for the x axis. Falsy values (None, 0)
      leave the automatically chosen limit in place.

  Raises:
    ValueError: if `prefixes` is empty.
  """
  by_size = sorted(((pr, dumpSize(pr)) for pr in prefixes),
                   key=operator.itemgetter(1))
  if not by_size:
    # zip(*[]) cannot be unpacked into two names; fail with a clear message
    # instead of the opaque tuple-unpacking error.
    raise ValueError('prefixes must contain at least one wiki prefix')
  prefixes, sizes = zip(*by_size)

  blockSize = 5  # vertical distance between consecutive rows
  ind = p.arange(0, blockSize * len(prefixes), blockSize)  # y location for groups
  height = 4  # bar height

  colors = html_colors
  thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]

  try:
    # Background bar: the full size of every wiki.
    p.barh(ind, sizes, height, color='b', linewidth=0, align='center')

    subbars = []
    for i, thresh in enumerate(thresholds):
      subbars.append(
        p.barh(ind,
               [docs_under_thresh(pr, thresh) for pr in prefixes],
               height,
               color=colors[i % len(colors)],  # wrap around a short palette
               linewidth=0,
               align='center'))

    p.ylim(-height, len(prefixes) * blockSize)
    if upperlimit:
      p.xlim(0, upperlimit)
    yfontprop = FontProperties(size=4)
    xfontprop = FontProperties(size=4)
    p.xlabel('Documents')
    p.ylabel('Language Code')
    p.title('Number of Documents Under Threshold')
    p.yticks(ind, prefixes, fontproperties=yfontprop)
    xmin, xmax = p.xlim()
    # rounded_interval is defined elsewhere; presumably it picks a "round"
    # tick spacing for the visible x range -- confirm.
    xtick_interval = rounded_interval(xmin, xmax, 20, 2)
    p.xticks(p.arange(xmin, xmax, xtick_interval), fontproperties=xfontprop)
    p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
    p.gca().yaxis.tick_left()
    # One legend entry per threshold, represented by the first bar of each
    # series. A real list is passed because map() is lazy on Python 3.
    p.legend([bars[0] for bars in subbars],
             [str(thresh) for thresh in thresholds],
             prop=xfontprop,
             loc='lower right')

    p.savefig(path, dpi=300)
  finally:
    # Always release the figure, even when plotting or saving fails, so a
    # later call never draws on top of stale bars. No clf() afterwards:
    # clearing after close() makes pyplot create a new empty figure.
    p.close()
Esempio n. 2
0
def wiki_proportions_chart(path, prefixes):
    """Save a horizontal bar chart of per-wiki document proportions to `path`.

    One row is drawn per prefix, ordered by ascending `dumpSize(prefix)`.
    For every threshold a bar of length
    `docs_under_thresh(prefix, threshold) / dumpSize(prefix)` is drawn on
    that row. Thresholds run from largest to smallest and are shaded from
    light grey ('0.9') to dark grey ('0.1'); later bars are painted over
    earlier ones at the same y position.

    NOTE(review): `dumpSize` and `docs_under_thresh` are defined outside
    this block; presumably both return document counts -- confirm.

    Args:
        path: destination handed to `p.savefig`.
        prefixes: iterable of wiki prefixes; they are also used as the y
            tick labels (the axis is labelled 'Language Code').

    Raises:
        ValueError: if `prefixes` is empty.
    """
    by_size = sorted(((pr, dumpSize(pr)) for pr in prefixes),
                     key=operator.itemgetter(1))
    if not by_size:
        # zip(*[]) cannot be unpacked into two names; fail with a clear
        # message instead of the opaque tuple-unpacking error.
        raise ValueError('prefixes must contain at least one wiki prefix')
    prefixes, sizes = zip(*by_size)

    blockSize = 5  # vertical distance between consecutive rows
    ind = p.arange(0, blockSize * len(prefixes),
                   blockSize)  # y location for groups
    height = 4  # bar height

    thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
    # Matplotlib reads a numeric string as a grey level; build one shade per
    # threshold, lightest first.
    shade_count = len(thresholds)
    colors = [
        str(float(step) / (shade_count + 1))
        for step in range(shade_count, 0, -1)
    ]

    p.clf()
    try:
        for i, thresh in enumerate(thresholds):
            # Reuse the sizes gathered for sorting instead of calling
            # dumpSize again for every threshold; an empty wiki is plotted
            # as 0.0 rather than raising ZeroDivisionError.
            proportions = [
                float(docs_under_thresh(pr, thresh)) / size if size else 0.0
                for pr, size in zip(prefixes, sizes)
            ]
            p.barh(ind,
                   proportions,
                   height,
                   color=colors[i % len(colors)],
                   linewidth=0,
                   align='center')

        p.ylim(-height, len(prefixes) * blockSize)
        p.xlim(0, 1)
        yfontprop = FontProperties(size=4)
        xfontprop = FontProperties(size=4)
        p.xlabel('Proportion')
        p.ylabel('Language Code')
        p.title('Proportion of Documents Under Threshold')
        p.yticks(ind, prefixes, fontproperties=yfontprop)
        xmin, xmax = p.xlim()
        xtick_interval = 0.1
        p.xticks(p.arange(xmin, xmax, xtick_interval),
                 fontproperties=xfontprop)
        p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
        p.gca().yaxis.tick_left()
        p.savefig(path, dpi=300)
    finally:
        # Always release the figure, even when plotting or saving fails.
        # No clf() afterwards: clearing after close() makes pyplot create a
        # new empty figure.
        p.close()
Esempio n. 3
0
def wiki_sizes_chart(path, prefixes, upperlimit=None):
    """Save a horizontal bar chart of per-wiki document counts to `path`.

    One row is drawn per prefix, ordered by ascending `dumpSize(prefix)`.
    Each row gets a blue bar of that length, overlaid with one bar per
    threshold whose length is `docs_under_thresh(prefix, threshold)`.
    Thresholds are drawn from largest to smallest, so later bars are
    painted over earlier ones at the same y position.

    NOTE(review): `dumpSize` and `docs_under_thresh` are defined outside
    this block; presumably both return document counts -- confirm.

    Args:
        path: destination handed to `p.savefig`.
        prefixes: iterable of wiki prefixes; they are also used as the y
            tick labels (the axis is labelled 'Language Code').
        upperlimit: optional upper bound for the x axis. Falsy values
            (None, 0) leave the automatically chosen limit in place.

    Raises:
        ValueError: if `prefixes` is empty.
    """
    by_size = sorted(((pr, dumpSize(pr)) for pr in prefixes),
                     key=operator.itemgetter(1))
    if not by_size:
        # zip(*[]) cannot be unpacked into two names; fail with a clear
        # message instead of the opaque tuple-unpacking error.
        raise ValueError('prefixes must contain at least one wiki prefix')
    prefixes, sizes = zip(*by_size)

    blockSize = 5  # vertical distance between consecutive rows
    ind = p.arange(0, blockSize * len(prefixes),
                   blockSize)  # y location for groups
    height = 4  # bar height

    colors = html_colors
    thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]

    try:
        # Background bar: the full size of every wiki.
        p.barh(ind, sizes, height, color='b', linewidth=0, align='center')

        subbars = []
        for i, thresh in enumerate(thresholds):
            under_thresh = [docs_under_thresh(pr, thresh) for pr in prefixes]
            subbars.append(
                p.barh(ind,
                       under_thresh,
                       height,
                       color=colors[i % len(colors)],  # wrap a short palette
                       linewidth=0,
                       align='center'))

        p.ylim(-height, len(prefixes) * blockSize)
        if upperlimit:
            p.xlim(0, upperlimit)
        yfontprop = FontProperties(size=4)
        xfontprop = FontProperties(size=4)
        p.xlabel('Documents')
        p.ylabel('Language Code')
        p.title('Number of Documents Under Threshold')
        p.yticks(ind, prefixes, fontproperties=yfontprop)
        xmin, xmax = p.xlim()
        # rounded_interval is defined elsewhere; presumably it picks a
        # "round" tick spacing for the visible x range -- confirm.
        xtick_interval = rounded_interval(xmin, xmax, 20, 2)
        p.xticks(p.arange(xmin, xmax, xtick_interval),
                 fontproperties=xfontprop)
        p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
        p.gca().yaxis.tick_left()
        # One legend entry per threshold, represented by the first bar of
        # each series. A real list is passed because map() is lazy on
        # Python 3.
        p.legend([bars[0] for bars in subbars],
                 [str(thresh) for thresh in thresholds],
                 prop=xfontprop,
                 loc='lower right')

        p.savefig(path, dpi=300)
    finally:
        # Always release the figure, even when plotting or saving fails, so
        # a later call never draws on top of stale bars. No clf()
        # afterwards: clearing after close() makes pyplot create a new
        # empty figure.
        p.close()
Esempio n. 4
0
def wiki_proportions_chart(path, prefixes):
  """Save a horizontal bar chart of per-wiki document proportions to `path`.

  One row is drawn per prefix, ordered by ascending `dumpSize(prefix)`.
  For every threshold a bar of length
  `docs_under_thresh(prefix, threshold) / dumpSize(prefix)` is drawn on that
  row. Thresholds run from largest to smallest and are shaded from light
  grey ('0.9') to dark grey ('0.1'); later bars are painted over earlier
  ones at the same y position.

  NOTE(review): `dumpSize` and `docs_under_thresh` are defined outside this
  block; presumably both return document counts -- confirm.

  Args:
    path: destination handed to `p.savefig`.
    prefixes: iterable of wiki prefixes; they are also used as the y tick
      labels (the axis is labelled 'Language Code').

  Raises:
    ValueError: if `prefixes` is empty.
  """
  by_size = sorted(((pr, dumpSize(pr)) for pr in prefixes),
                   key=operator.itemgetter(1))
  if not by_size:
    # zip(*[]) cannot be unpacked into two names; fail with a clear message
    # instead of the opaque tuple-unpacking error.
    raise ValueError('prefixes must contain at least one wiki prefix')
  prefixes, sizes = zip(*by_size)

  blockSize = 5  # vertical distance between consecutive rows
  ind = p.arange(0, blockSize * len(prefixes), blockSize)  # y location for groups
  height = 4  # bar height

  thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
  # Matplotlib reads a numeric string as a grey level; build one shade per
  # threshold, lightest first.
  shade_count = len(thresholds)
  colors = [str(float(step) / (shade_count + 1))
            for step in range(shade_count, 0, -1)]

  p.clf()
  try:
    for i, thresh in enumerate(thresholds):
      # Reuse the sizes gathered for sorting instead of calling dumpSize
      # again for every threshold; an empty wiki is plotted as 0.0 rather
      # than raising ZeroDivisionError.
      proportions = [
        float(docs_under_thresh(pr, thresh)) / size if size else 0.0
        for pr, size in zip(prefixes, sizes)
      ]
      p.barh(ind,
             proportions,
             height,
             color=colors[i % len(colors)],
             linewidth=0,
             align='center')

    p.ylim(-height, len(prefixes) * blockSize)
    p.xlim(0, 1)
    yfontprop = FontProperties(size=4)
    xfontprop = FontProperties(size=4)
    p.xlabel('Proportion')
    p.ylabel('Language Code')
    p.title('Proportion of Documents Under Threshold')
    p.yticks(ind, prefixes, fontproperties=yfontprop)
    xmin, xmax = p.xlim()
    xtick_interval = 0.1
    p.xticks(p.arange(xmin, xmax, xtick_interval), fontproperties=xfontprop)
    p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
    p.gca().yaxis.tick_left()
    p.savefig(path, dpi=300)
  finally:
    # Always release the figure, even when plotting or saving fails. No
    # clf() afterwards: clearing after close() makes pyplot create a new
    # empty figure.
    p.close()