Esempio n. 1
0
 def run(self):
     """Store the evaluated metrics of this page in the ``quality`` table.

     The table is created if it does not exist yet. The storage is then
     asked to delete the entries whose ``name`` equals ``self.pagename``
     before the result of ``self.eval()`` is inserted.
     """
     schema = """CREATE TABLE IF NOT EXISTS quality
                 (name TEXT, templates INT, edits INT, len INT,
                 cats INT, linked INT, referenced INT, images INT,
                 iwiki INT, sections INT, users INT)"""
     table = "quality"
     storage = Storage()
     storage.execute(schema)
     # Delete-then-insert, so the old entry for this page is dropped first.
     storage.delete(table, {"name": self.pagename})
     storage.insert(table, self.eval())
Esempio n. 2
0
class ReplicsCounter():
    """Count the replies ("replics") in each section of a page.

    The counts are written to the ``articles`` table of a ``Storage``
    instance, keyed by section name.
    """

    def __init__(self):
        """Open the storage and make sure the ``articles`` table exists."""
        self.cache = Storage()
        self.cache.create(
            "articles",
            {"oldid": "INT UNIQUE", "name": "TEXT", "ts": "DATE", "replics": "INT"})

    @staticmethod
    def _countReplics(text):
        """Return the number of reply lines in the text of one section.

        A line counts when, after stripping, it is longer than two
        characters and neither starts with ``{{`` nor ends with ``}}``.
        The count starts at -1 so that the section header line is not
        counted as a reply.
        """
        replics = -1  # one for header
        for line in text.splitlines():
            sline = line.strip()
            if len(sline) <= 2:
                continue
            # The original tested sline[:-2] (all but the last two
            # characters) against "}}", which practically never matched,
            # so lines ending in "}}" were still counted.
            if sline.startswith("{{") or sline.endswith("}}"):
                continue
            replics += 1
        return replics

    def countPage(self, page):
        """Count replics in every section of an AFI page and store them.

        ``page`` must provide ``getSections()`` and ``get()``.
        NOTE(review): each section entry is assumed to carry its start
        offset at index 0 and its name at index 3, as used below -
        confirm against the ``getSections`` implementation.

        A page without any section is skipped: there is no section name
        to update and no meaningful text range to count.
        """
        sections = {}
        sect = None
        n = -1
        for s in page.getSections():
            if sect is not None:
                # The previous section ends where this one starts.
                sections[sect] = (n, s[0])
            sect = s[3]
            n = s[0]

        text = page.get()  # fetched once and reused for every section
        if sect is not None:
            sections[sect] = (n, len(text))  # last one runs to the end

        for s in sections:
            start, end = sections[s]
            replics = self._countReplics(text[start:end])
            wikipedia.output(u"%s %s %s" % (s, sections[s], replics))
            # NOTE(review): the statement is built by string formatting and
            # relies on self.cache.quote() for escaping - confirm that
            # quote() is sufficient for arbitrary section names.
            self.cache.execute(u'UPDATE articles SET replics = %s WHERE name = "%s";' % (replics, self.cache.quote(s)))

    def countCat(self, catname):
        """Run ``countPage`` on every article of the category ``catname``."""
        cat = catlib.Category(wikipedia.getSite(), catname)
        for page in cat.articles():
            print(page)
            self.countPage(page)

    def replicsPage(self, pagename):
        """Return the stored replics count for ``pagename``.

        Returns the string ``"-"`` when the storage has no entry for it.
        """
        r = self.cache.findone('articles', {"name": pagename}, what=["replics"])
        if r is None:
            return "-"
        return r[0]
Esempio n. 3
0
class Corellations():
    """Compute and print statistics over the ``quality`` table.

    Reported values:
    - number of articles
    - averages per metric
    - correlations between metrics
    see http://stackoverflow.com/questions/3949226/calculating-pearson-correlation-and-significance-in-python
    """

    def __init__(self):
        """Load the metric columns of the ``quality`` table into memory.

        ``self.data`` is a list of 12 lists; ``self.data[i]`` for i in
        1..10 receives column ``i`` of the SELECT below, one value per
        row. Slots 0 and 11 stay empty.
        """
        self.data = [[] for _ in range(12)]
        self.db = Storage()
        query = u"""SELECT name, templates, edits, len,
                    cats, linked, referenced, images,
                    iwiki, sections, users FROM quality ORDER BY name;"""
        cursor = self.db.execute(query)
        for row in cursor.fetchall():
            for i in range(1, 11):
                self.data[i].append(row[i])

    def average(self, x):
        """Return the arithmetic mean of the non-empty sequence ``x``."""
        assert len(x) > 0
        return float(sum(x)) / len(x)

    def sq_avg(self, x):
        """Return the root mean square of ``x``."""
        total = sum(i * i for i in x)
        # float() keeps the division exact for integer inputs on Python 2,
        # where int / int truncates.
        return math.sqrt(float(total) / len(x))

    def sq_dev(self, x):
        """Return the population standard deviation of ``x``."""
        avg = self.average(x)
        total = sum((i - avg) ** 2 for i in x)
        return math.sqrt(total / len(x))

    def pearson_def(self, x, y):
        """Return the Pearson correlation coefficient of ``x`` and ``y``.

        Both sequences must be non-empty and of equal length. When either
        one is constant the coefficient is undefined (zero denominator);
        0.0 is returned in that case so that one constant metric does not
        abort the whole report.
        """
        assert len(x) == len(y)
        n = len(x)
        assert n > 0
        avg_x = self.average(x)
        avg_y = self.average(y)
        diffprod = 0
        xdiff2 = 0
        ydiff2 = 0
        for xi, yi in zip(x, y):
            xdiff = xi - avg_x
            ydiff = yi - avg_y
            diffprod += xdiff * ydiff
            xdiff2 += xdiff * xdiff
            ydiff2 += ydiff * ydiff
        denominator = math.sqrt(xdiff2 * ydiff2)
        if denominator == 0:
            return 0.0
        return diffprod / denominator

    def print_stats(self):
        """Build and return the statistics report as one string.

        The report has the article count, a table of mean / root mean
        square / deviation / max / min per metric, the full correlation
        matrix, and the 12 metric pairs with the largest and the smallest
        absolute correlation. Lines are terminated with ``\\r\\n``.
        """
        # Index 0 is a blank label so that val[i] lines up with self.data[i].
        val = ["", "templ", "edi", "len", "cat", "links", "refs", "img", "iwiki", "sect", "users"]

        stats = u"Articles count %s \r\n" % len(self.data[1])
        stats += "          math avg     root mean      deviation        max    min \r\n"
        for i in range(1, 11):
            column = self.data[i]
            stats += "%8s: %-12.10g %-12.10g  %-12.10g %8g %6g \r\n" % (
                val[i], self.average(column), self.sq_avg(column),
                self.sq_dev(column), max(column), min(column))

        stats += "\r\n"
        stats += "Corellations table \r\n"
        stats += "".join("%10s" % v for v in val) + "\r\n"

        pairs = {}
        for i in range(1, 11):
            row = ""
            for j in range(1, 11):
                d = self.pearson_def(self.data[i], self.data[j])
                row += "%-10.4g " % d
                # Keep each unordered pair once (lower triangle only).
                if i > j:
                    pairs["%s-%s" % (val[i], val[j])] = d
            stats += "%8s %s\r\n" % (val[i], row)

        stats += "\r\n"
        stats += " Maximum values           | Minimum values \r\n"
        # Strongest correlations first, regardless of sign.
        up = sorted(pairs.items(), key=lambda x: -abs(x[1]))
        for l in range(12):
            stats += "%12s %6.12s | %12s %6.12s \r\n" % (up[l][0], up[l][1], up[-l-1][0], up[-l-1][1])
        return stats

    def print_sel(self):
        """Print the ten lowest- and the ten highest-scoring articles.

        The score is a weighted sum of the ten metrics, using the weights
        in ``k`` in column order. The two lists are separated by a line
        of dashes.
        """
        k = (1000, 1000, 1, 1000, 1000, 1000, 1000, 1000, 1000, 1000)

        template = """SELECT name, ((templates * %s) + (edits * %s) + (len * %s) +
               (cats * %s) + (linked * %s) + (referenced * %s) + (images * %s) +
               (iwiki * %s) + (sections * %s) + (users * %s)) AS value FROM quality ORDER BY value %s LIMIT 10;"""

        for order in ("ASC", "DESC"):
            if order == "DESC":
                print("------------")
            cursor = self.db.execute(template % (k + (order,)))
            for name, value in cursor.fetchall():
                print("%s %s" % (name, value))