def createTables(self):
    cur = self.connection.cursor()
    logging.logMore("Creating database tables ...")
    # Current state of items and properties (one row per entity):
    cur.execute("CREATE TABLE IF NOT EXISTS items(id INT UNSIGNED PRIMARY KEY, \
        rev INT UNSIGNED NOT NULL, \
        claims BLOB, links BLOB, label BLOB, aliases BLOB, description BLOB)")
    cur.execute("CREATE TABLE IF NOT EXISTS properties(id INT UNSIGNED PRIMARY KEY, \
        rev INT UNSIGNED NOT NULL, \
        claims BLOB, datatype VARCHAR(20), label BLOB, aliases BLOB, description BLOB)")
    # Per-revision statistics for items:
    cur.execute("CREATE TABLE IF NOT EXISTS itemrevstats(id INT UNSIGNED NOT NULL, \
        rev INT UNSIGNED NOT NULL PRIMARY KEY, \
        day SMALLINT UNSIGNED NOT NULL, \
        langinfo BLOB, propinfo BLOB, \
        stat_num SMALLINT UNSIGNED NOT NULL, stat_ref_num SMALLINT UNSIGNED NOT NULL, stat_q_num SMALLINT UNSIGNED NOT NULL, \
        label_num SMALLINT UNSIGNED NOT NULL, desc_num SMALLINT UNSIGNED NOT NULL, link_num SMALLINT UNSIGNED NOT NULL, \
        alias_num SMALLINT UNSIGNED NOT NULL)")
    cur.execute("CREATE UNIQUE INDEX idx_idday ON itemrevstats (id,day)")
    cur.execute("CREATE INDEX idx_day ON itemrevstats (day)")
    cur.execute("CREATE INDEX idx_stat_num ON itemrevstats (stat_num)")
    # Per-revision statistics for properties:
    cur.execute("CREATE TABLE IF NOT EXISTS proprevstats(id INT UNSIGNED NOT NULL, \
        rev INT UNSIGNED NOT NULL PRIMARY KEY, \
        day SMALLINT UNSIGNED NOT NULL, \
        langinfo BLOB, label_num SMALLINT UNSIGNED NOT NULL, desc_num SMALLINT UNSIGNED NOT NULL, \
        alias_num SMALLINT UNSIGNED NOT NULL)")
    cur.execute("CREATE UNIQUE INDEX idx_idday ON proprevstats (id,day)")
    logging.log(" done.")
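# A hedged usage sketch (not part of the original code): writing one row into the
# itemrevstats table defined above with a parameterized statement. The connection
# object and the function name are assumptions; only the column order follows the
# CREATE TABLE statement.
def insertItemRevStats(connection, values):
    # values must be a 12-tuple matching the column order:
    # (id, rev, day, langinfo, propinfo, stat_num, stat_ref_num, stat_q_num,
    #  label_num, desc_num, link_num, alias_num)
    cur = connection.cursor()
    cur.execute("INSERT INTO itemrevstats VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                values)
    connection.commit()
    cur.close()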
def __init__(self, helper):
    self.helper = helper
    self.botEdits = {}
    self.humanEdits = {}
    self.anonEdits = {}
    self.botTotal = 0
    self.humanTotal = 0
    self.anonTotal = 0
    self.curMin = 100000000
    self.curMax = -100000000
    self.editsByUser = {}
    logging.logMore('Loading list of bots ')
    self.bots = []
    try:
        import json  # local import; parse the API response as JSON instead of eval()
        botsjson = urllib.urlopen('http://www.wikidata.org/w/api.php?action=query&list=allusers&augroup=bot&aulimit=500&format=json').read()
        botsdata = json.loads(botsjson)
        for bot in botsdata['query']['allusers']:
            self.bots.append(bot['name'])
            logging.logMore('.')
        logging.log(' found ' + str(len(self.bots)) + ' bot accounts.')
    except IOError:
        logging.log(' *** Error: Could not retrieve bot accounts. Bots will not be distinguished.')
def fetchNewerDailies(self):
    self.getDailyDates()
    if not self.maxrevid:
        self.fetchLatestDump()
    self.__cdData()
    self.stopdaily = '20121026'
    self.newerdailies = []
    for daily in reversed(self.dailies):
        logging.logMore('Checking daily ' + daily + ' ... ')
        if not os.path.exists('daily' + daily):
            os.makedirs('daily' + daily)
        os.chdir('daily' + daily)
        if daily > self.maxdumpdate:
            logging.log('too recent to consider')
            os.chdir('..')
            continue
        if not os.path.exists('maxrevid.txt'):
            maxrevSource = 'http://dumps.wikimedia.org/other/incr/wikidatawiki/' + daily + '/maxrevid.txt'
            urllib.urlretrieve(maxrevSource, 'maxrevid.txt')
        else:
            maxrevSource = 'Local Max Rev File'
        try:
            dailymaxrevid = int(open('maxrevid.txt').read())
        except ValueError:
            # This happens if a daily dump failed?
            logging.log(maxrevSource + ' throws ValueError')
        if daily < self.getLatestDumpDate():
            logging.log('already in latest ' + self.dumpName)
            self.stopdaily = daily
            os.chdir('..')
            break
        if not os.path.exists('pages-meta-hist-incr.xml.bz2'):
            if self.offline:
                logging.log('not downloading daily in offline mode')
            else:
                logging.logMore('downloading ... ')
                if urllib.urlopen('http://dumps.wikimedia.org/other/incr/wikidatawiki/' + daily + '/status.txt').read() == 'done':
                    urllib.urlretrieve('http://dumps.wikimedia.org/other/incr/wikidatawiki/' + daily + '/wikidatawiki-' + daily + '-pages-meta-hist-incr.xml.bz2', 'pages-meta-hist-incr.xml.bz2')  #xxx
                    logging.log('done')
                    self.newerdailies.append(daily)
                else:
                    logging.log('daily not done yet; download aborted')
        else:
            logging.log('daily already downloaded')
            self.newerdailies.append(daily)
        os.chdir('..')
    self.__cdBase()
def getDays(self):
    logging.logMore("Getting days: ")
    cur = self.db.query("SELECT DISTINCT(day) FROM itemrevstats ORDER BY day ASC", ())
    self.days = []
    print "Days: " + str(cur.rowcount)
    row = cur.fetchone()
    while row:
        self.days.append(int(row[0]))
        logging.logMore(str(row[0]) + ' ')
        row = cur.fetchone()
    cur.close()
    logging.log("... done")
def fetchNewerDailies(self):
    self.getDailyDates()
    if not self.maxrevid:
        self.fetchLatestDump()
    self.__cdData()
    self.stopdaily = "20121026"
    self.newerdailies = []
    for daily in reversed(self.dailies):
        logging.logMore("Checking daily " + daily + " ... ")
        if not os.path.exists("daily" + daily):
            os.makedirs("daily" + daily)
        os.chdir("daily" + daily)
        if not os.path.exists("maxrevid.txt"):
            urllib.urlretrieve("http://dumps.wikimedia.org/other/incr/wikidatawiki/" + daily + "/maxrevid.txt", "maxrevid.txt")
        dailymaxrevid = int(open("maxrevid.txt").read())
        if dailymaxrevid < self.maxrevid:
            logging.log("already in latest dump")
            self.stopdaily = daily
            os.chdir("..")
            break
        if not os.path.exists("pages-meta-hist-incr.xml.bz2"):
            logging.logMore("downloading ... ")
            if urllib.urlopen("http://dumps.wikimedia.org/other/incr/wikidatawiki/" + daily + "/status.txt").read() == "done":
                urllib.urlretrieve("http://dumps.wikimedia.org/other/incr/wikidatawiki/" + daily + "/wikidatawiki-" + daily + "-pages-meta-hist-incr.xml.bz2", "pages-meta-hist-incr.xml.bz2")  # xxx
                logging.log("done")
                self.newerdailies.append(daily)
            else:
                logging.log("daily not done yet; download aborted")
        else:
            logging.log("daily already downloaded")
            self.newerdailies.append(daily)
        os.chdir("..")
    self.__cdBase()
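# A minimal sketch (an assumption, not shown in the original tool) of how a
# downloaded pages-meta-hist-incr.xml.bz2 file could be read without first
# decompressing it to disk, using the standard bz2 module.
import bz2

def iterateDailyDumpLines(path='pages-meta-hist-incr.xml.bz2'):
    # BZ2File decompresses on the fly and supports line iteration, so the XML
    # can be scanned lazily while keeping memory use low.
    dumpFile = bz2.BZ2File(path)
    try:
        for line in dumpFile:
            yield line
    finally:
        dumpFile.close()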
def getLatestDumpDate(self):
    if not self.latestdump:
        self.latestdump = '00000000'
        if self.offline:
            logging.logMore('Checking for the date of the last local ' + self.dumpName + ' ')
            dataDirs = os.listdir("data")
            for dirName in dataDirs:
                if not dirName.startswith(self.dumpDirName):
                    continue
                date = dirName[len(self.dumpDirName):]
                if not re.match(r'\d\d\d\d\d\d\d\d', date):
                    continue
                logging.logMore('.')
                if date > self.latestdump and date <= self.maxdumpdate:
                    self.latestdump = date
        else:
            logging.logMore('Checking for the date of the last online ' + self.dumpName + ' ')
            for line in urllib.urlopen('http://dumps.wikimedia.org/wikidatawiki/'):
                if not line.startswith('<tr><td class="n">'):
                    continue
                date = line[27:35]
                if not re.match(r'\d\d\d\d\d\d\d\d', date):
                    continue
                logging.logMore('.')
                #logging.log("Checking dump of " + date)
                # check if dump is finished
                finished = False
                for md5 in urllib.urlopen('http://dumps.wikimedia.org/wikidatawiki/' + date + '/wikidatawiki-' + date + '-md5sums.txt'):
                    if md5.endswith(self.dumpPostFix + "\n"):
                        finished = True
                        break
                if finished and date > self.latestdump and date <= self.maxdumpdate:
                    self.latestdump = date
        if self.latestdump == '00000000':
            logging.log('-- Warning: no latest ' + self.dumpName + ' found.')
        else:
            logging.log(' latest ' + self.dumpName + ' is ' + self.latestdump)
    return self.latestdump
def getDailyDates(self):
    if not self.dailies:
        logging.logMore("Fetching information about available daily exports ")
        self.dailies = []
        for line in urllib.urlopen("http://dumps.wikimedia.org/other/incr/wikidatawiki/"):
            if not line.startswith('<tr><td class="n">'):
                continue
            date = line[27:35]
            if not re.match(r"\d\d\d\d\d\d\d\d", date):
                continue
            logging.logMore(".")
            self.dailies.append(date)
        logging.log(" found " + str(len(self.dailies)) + " daily exports.")
    return self.dailies
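# The slice line[27:35] used above depends on the exact HTML layout of the
# dumps.wikimedia.org index pages. As an alternative sketch (an assumption, not
# the original approach), the 8-digit directory name can be pulled out of the
# link itself with a regular expression, which is less sensitive to layout changes.
import re

dateLinkPattern = re.compile(r'href="(\d{8})/"')

def extractDateFromIndexLine(line):
    # Return the YYYYMMDD directory name linked on this index line, or None.
    match = dateLinkPattern.search(line)
    if match:
        return match.group(1)
    return None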
def makeStatistics(self):
    self.startTime = time.time()
    self.getDays()
    #self.days = [447]
    prevDay = -1
    for day in self.days:
        if prevDay == -1:
            self.dayStats[day] = self.getEmptyDayStats()
        else:
            self.dayStats[day] = self.dayStats[prevDay].copy()
            self.dayStats[day]['changeditems'] = 0
        prevDay = day
        logging.logMore("Fetching data for day " + str(day))
        cur = self.db.query("SELECT * FROM itemrevstats WHERE day=%s", (day,))
        logging.log(" ... done.")
        row = cur.fetchone()
        while row:
            if not self.processeditems[int(row[0])]:
                isNewEntity = True
                self.processeditems[int(row[0])] = True
                self.dayStats[day]['items'] += 1
            else:
                isNewEntity = False
            self.updateStats(row, isNewEntity, day)
            self.dayStats[day]['changeditems'] += 1
            self.totalItems += 1
            if self.totalItems % 100000 == 0:
                print "Processed " + str(self.totalItems) + " items in " \
                    + str(round(time.time() - self.startTime, 2)) \
                    + " sec. Current data:\n" + str(self.dayStats[day])
            row = cur.fetchone()
        print "Processed " + str(self.totalItems) + " items. Final data:\n" + str(self.dayStats)
        # Close the cursor (this is essential) ...
        cur.close()
        # ... and be paranoid about memory leaks (may not have much effect)
        del cur
        self.db.reopenDatabase()
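# getDays and makeStatistics above call self.db.query(sql, params) and
# self.db.reopenDatabase(), which are not shown in this section. A minimal sketch
# of such a helper, assuming MySQLdb; the class name, constructor, and parameters
# are assumptions, not the original API.
import MySQLdb

class DatabaseHelper:
    def __init__(self, **connectArgs):
        # e.g. DatabaseHelper(host='localhost', user='wda', passwd='...', db='wikidata')
        self.connectArgs = connectArgs
        self.connection = MySQLdb.connect(**connectArgs)

    def query(self, sql, params=()):
        # Run a parameterized query and return the cursor; the caller is
        # responsible for closing it, as getDays and makeStatistics do.
        cur = self.connection.cursor()
        cur.execute(sql, params)
        return cur

    def reopenDatabase(self):
        # Tear down and re-establish the connection to release resources
        # between large per-day queries.
        self.connection.close()
        self.connection = MySQLdb.connect(**self.connectArgs)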
def __init__(self, helper):
    self.helper = helper
    self.botEdits = {}
    self.humanEdits = {}
    self.anonEdits = {}
    self.botTotal = 0
    self.humanTotal = 0
    self.anonTotal = 0
    self.curMin = 100000000
    self.curMax = -100000000
    self.editsByUser = {}
    logging.logMore('Loading list of bots ')
    self.bots = []
    import json  # local import; parse the API response as JSON instead of eval()
    botsjson = urllib.urlopen('http://www.wikidata.org/w/api.php?action=query&list=allusers&augroup=bot&aulimit=500&format=json').read()
    botsdata = json.loads(botsjson)
    for bot in botsdata['query']['allusers']:
        self.bots.append(bot['name'])
        logging.logMore('.')
    logging.log(' found ' + str(len(self.bots)) + ' bot accounts.')
def getLatestDumpDate(self):
    if not self.latestdump:
        logging.logMore("Checking for the date of the last dump ")
        self.latestdump = "20121026"
        for line in urllib.urlopen("http://dumps.wikimedia.org/wikidatawiki/"):
            if not line.startswith('<tr><td class="n">'):
                continue
            date = line[27:35]
            if not re.match(r"\d\d\d\d\d\d\d\d", date):
                continue
            logging.logMore(".")
            # logging.log("Checking dump of " + date)
            # check if dump is finished
            finished = False
            for md5 in urllib.urlopen("http://dumps.wikimedia.org/wikidatawiki/" + date + "/wikidatawiki-" + date + "-md5sums.txt"):
                if md5.endswith("-pages-meta-history.xml.bz2" + "\n"):
                    finished = True
            if finished:
                self.latestdump = date
        logging.log(" latest dump has been on " + self.latestdump)
    return self.latestdump
def getDailyDates(self):
    if not self.dailies:
        self.dailies = []
        if self.offline:
            logging.logMore("Finding daily exports available locally ")
            dataDirs = os.listdir("data")
            for dirName in dataDirs:
                if not dirName.startswith('daily'):
                    continue
                date = dirName[5:]
                if not re.match(r'\d\d\d\d\d\d\d\d', date):
                    continue
                logging.logMore('.')
                self.dailies.append(date)
            self.dailies = sorted(self.dailies)
        else:
            logging.logMore("Finding daily exports online ")
            for line in urllib.urlopen('http://dumps.wikimedia.org/other/incr/wikidatawiki/'):
                if not line.startswith('<tr><td class="n">'):
                    continue
                date = line[27:35]
                if not re.match(r'\d\d\d\d\d\d\d\d', date):
                    continue
                logging.logMore('.')
                self.dailies.append(date)
        logging.log(" found " + str(len(self.dailies)) + " daily exports.")
    return self.dailies