def run(self):
    """Store this page's metrics in the ``quality`` table.

    Creates the table when it is missing, deletes the rows whose name
    equals ``self.pagename`` and inserts the values returned by
    ``self.eval()``.
    """
    schema = """CREATE TABLE IF NOT EXISTS quality (name TEXT, templates INT, edits INT, len INT, cats INT, linked INT, referenced INT, images INT, iwiki INT, sections INT, users INT)"""
    storage = Storage()
    storage.execute(schema)
    # Delete first so that the insert below does not duplicate the page.
    storage.delete("quality", {"name": self.pagename})
    storage.insert("quality", self.eval())
class ReplicsCounter():
    """Count the replies ("replics") in page sections and cache the result.

    Counts are written to the ``replics`` column of the ``articles`` table
    held by ``Storage``.
    """

    def __init__(self):
        """Open the storage and make sure the ``articles`` table exists."""
        self.cache = Storage()
        self.cache.create("articles", \
            {"oldid":"INT UNIQUE", "name":"TEXT", "ts":"DATE", "replics": "INT"})

    @staticmethod
    def _countReplics(text):
        """Return the number of reply lines in one section's text.

        A line counts when, after stripping, it is longer than two
        characters, does not start with ``{{`` and does not end with ``}}``.
        The counter starts at -1 to cancel out the section header line.
        """
        replics = -1  # one for header
        for line in text.splitlines():
            sline = line.strip()
            if len(sline) <= 2:
                continue
            if sline.startswith("{{") or sline.endswith("}}"):
                continue
            replics += 1
        return replics

    def countPage(self, page):
        """Count replies in every section of ``page`` and store them.

        Assumes each entry of ``page.getSections()`` holds the section's
        start offset in the page text at index 0 and its title at index 3
        (TODO confirm against the wiki library). The title is used as the
        ``name`` key of the ``articles`` row that gets updated.

        Pages without sections are skipped.
        """
        sections = {}
        sect = None
        start = -1
        for s in page.getSections():
            if sect is not None:
                # The previous section ends where this one starts.
                sections[sect] = (start, s[0])
            sect = s[3]
            start = s[0]
        if sect is None:
            # No sections at all: nothing to count, nothing to update.
            return
        text = page.get()
        sections[sect] = (start, len(text))  # last one

        for name, bounds in sections.items():
            replics = self._countReplics(text[bounds[0]:bounds[1]])
            wikipedia.output(u"%s %s %s" % (name, bounds, replics))
            # NOTE: the statement is built by string formatting; escaping of
            # the section title relies entirely on Storage.quote().
            self.cache.execute(u'UPDATE articles SET replics = %s WHERE name = "%s";' % (replics, self.cache.quote(name)))

    def countCat(self, catname):
        """Run :meth:`countPage` on every article of category ``catname``."""
        cat = catlib.Category(wikipedia.getSite(), catname)
        for page in cat.articles():
            print(page)
            self.countPage(page)

    def replicsPage(self, pagename):
        """Return the cached reply count for ``pagename``, or ``"-"`` if none."""
        r = self.cache.findone('articles', {"name":pagename}, what = ["replics"])
        if r is None:
            return "-"
        return r[0]
class Corellations():
    """Compute and print statistics over the ``quality`` table.

    Covers the article count, the per-metric averages and the pairwise
    correlations between metrics.

    see http://stackoverflow.com/questions/3949226/calculating-pearson-correlation-and-significance-in-python
    """

    # Short labels for the metric columns; index 0 stands for the name column.
    _LABELS = ["", "templ", "edi", "len", "cat", "links", "refs", "img",
               "iwiki", "sect", "users"]

    # Weighted-score query; the last placeholder is the sort direction.
    _SCORE_QUERY = (
        "SELECT name, ((templates * %s) + (edits * %s) + (len * %s) + "
        "(cats * %s) + (linked * %s) + (referenced * %s) + (images * %s) + "
        "(iwiki * %s) + (sections * %s) + (users * %s)) AS value "
        "FROM quality ORDER BY value %s LIMIT 10;"
    )

    def average(self, x):
        """Return the arithmetic mean of ``x`` (must be non-empty)."""
        assert len(x) > 0
        return float(sum(x)) / len(x)

    def sq_avg(self, x):
        """Return the root mean square of ``x``."""
        total = sum(i * i for i in x)
        # float() keeps integer input from being truncated by Python 2's
        # integer division before the square root is taken.
        return math.sqrt(float(total) / len(x))

    def sq_dev(self, x):
        """Return the population standard deviation of ``x``."""
        avg = self.average(x)
        total = sum((i - avg) ** 2 for i in x)
        return math.sqrt(float(total) / len(x))

    def pearson_def(self, x, y):
        """Return the Pearson correlation coefficient of ``x`` and ``y``.

        Both sequences must be non-empty and of equal length. When either
        sequence is constant the coefficient is undefined; 0.0 is returned
        in that case instead of dividing by zero.
        """
        assert len(x) == len(y)
        n = len(x)
        assert n > 0
        avg_x = self.average(x)
        avg_y = self.average(y)
        diffprod = 0
        xdiff2 = 0
        ydiff2 = 0
        for xi, yi in zip(x, y):
            xdiff = xi - avg_x
            ydiff = yi - avg_y
            diffprod += xdiff * ydiff
            xdiff2 += xdiff * xdiff
            ydiff2 += ydiff * ydiff
        denom = math.sqrt(xdiff2 * ydiff2)
        if denom == 0:
            return 0.0
        return diffprod / denom

    def __init__(self):
        """Load the metric columns of the ``quality`` table into ``self.data``.

        ``self.data[i]`` holds column ``i`` of the query below for every
        article, for ``i`` in 1..10; slots 0 and 11 stay empty.
        """
        self.data = [[] for _ in range(12)]
        self.db = Storage()
        query = u"""SELECT name, templates, edits, len, cats, linked, referenced, images, iwiki, sections, users FROM quality ORDER BY name;"""
        cursor = self.db.execute(query)
        for row in cursor.fetchall():
            for i in range(1, 11):
                self.data[i].append(row[i])

    def print_stats(self):
        """Return a text report of the loaded data.

        The report contains the article count, per-metric mean / root mean
        square / deviation / max / min, the full correlation table and the
        twelve strongest and twelve weakest pairwise correlations.
        """
        val = self._LABELS
        parts = [u"Articles count %s \r\n" % len(self.data[1])]
        parts.append(" math avg root mean deviation max min \r\n")
        for i in range(1, 11):
            column = self.data[i]
            parts.append("%8s: %-12.10g %-12.10g %-12.10g %8g %6g \r\n" % (
                val[i], self.average(column), self.sq_avg(column),
                self.sq_dev(column), max(column), min(column)))
        parts.append("\r\n")
        parts.append("Corellations table \r\n")
        parts.append("".join("%10s" % (v) for v in val) + "\r\n")

        # Correlation of every metric pair; each unordered pair is kept once
        # (i > j) for the ranking below.
        pairs = {}
        for i in range(1, 11):
            row = ""
            for j in range(1, 11):
                d = self.pearson_def(self.data[i], self.data[j])
                row += "%-10.4g " % d
                if i > j:
                    pairs["%s-%s" % (val[i], val[j])] = d
            parts.append("%8s %s\r\n" % (val[i], row))
        parts.append("\r\n")
        parts.append(" Maximum values | Minimum values \r\n")

        # Strongest correlations (by absolute value) first.
        ranked = sorted(pairs.items(), key=lambda item: -abs(item[1]))
        for k in range(0, 12):
            strongest = ranked[k]
            weakest = ranked[-k - 1]
            parts.append("%12s %6.12s | %12s %6.12s \r\n" % (
                strongest[0], strongest[1], weakest[0], weakest[1]))
        return u"".join(parts)

    def _print_ranked(self, weights, order):
        """Print the ten articles at one end of the weighted score ranking."""
        query = self._SCORE_QUERY % (tuple(weights) + (order,))
        cursor = self.db.execute(query)
        for row in cursor.fetchall():
            print("%s %s" % row)

    def print_sel(self):
        """Print the ten lowest- and the ten highest-scoring articles.

        The score is a weighted sum of the metric columns; the weights are
        listed in the order the columns appear in the query.
        """
        k = (1000, 1000, 1, 1000, 1000, 1000, 1000, 1000, 1000, 1000)
        self._print_ranked(k, "ASC")
        print("------------")
        self._print_ranked(k, "DESC")