Example #1
    def createTables(self):
        cur = self.connection.cursor()
        logging.logMore("Creating database tables ...")
        cur.execute(
            "CREATE TABLE IF NOT EXISTS items(id INT UNSIGNED PRIMARY KEY, \
				rev INT UNSIGNED NOT NULL, \
				claims BLOB, links BLOB, label BLOB, aliases BLOB, description BLOB)")
        cur.execute(
            "CREATE TABLE IF NOT EXISTS properties(id INT UNSIGNED PRIMARY KEY, \
				rev INT UNSIGNED NOT NULL, \
				claims BLOB, datatype VARCHAR(20), label BLOB, aliases BLOB, description BLOB)"
        )
        cur.execute(
            "CREATE TABLE IF NOT EXISTS itemrevstats(id INT UNSIGNED NOT NULL, \
				rev INT UNSIGNED NOT NULL PRIMARY KEY, \
				day SMALLINT UNSIGNED NOT NULL, \
				langinfo BLOB, propinfo BLOB,\
				stat_num SMALLINT UNSIGNED NOT NULL, stat_ref_num SMALLINT UNSIGNED NOT NULL, stat_q_num SMALLINT UNSIGNED NOT NULL,\
				label_num SMALLINT UNSIGNED NOT NULL, desc_num SMALLINT UNSIGNED NOT NULL, link_num SMALLINT UNSIGNED NOT NULL,\
				alias_num SMALLINT UNSIGNED NOT NULL)")
        cur.execute("CREATE UNIQUE INDEX idx_idday ON itemrevstats (id,day)")
        cur.execute("CREATE INDEX idx_day ON itemrevstats (day)")
        cur.execute("CREATE INDEX idx_stat_num ON itemrevstats (stat_num)")
        cur.execute(
            "CREATE TABLE IF NOT EXISTS proprevstats(id INT UNSIGNED NOT NULL, \
				rev INT UNSIGNED NOT NULL PRIMARY KEY, \
				day SMALLINT UNSIGNED NOT NULL, \
				langinfo BLOB, label_num SMALLINT UNSIGNED NOT NULL, desc_num SMALLINT UNSIGNED NOT NULL,\
				alias_num SMALLINT UNSIGNED NOT NULL)")
        cur.execute("CREATE UNIQUE INDEX idx_idday ON proprevstats (id,day)")
        logging.log(" done.")
Example #2
	def createTables(self):
		cur = self.connection.cursor()
		logging.logMore("Creating database tables ...")
		cur.execute("CREATE TABLE IF NOT EXISTS items(id INT UNSIGNED PRIMARY KEY, \
				rev INT UNSIGNED NOT NULL, \
				claims BLOB, links BLOB, label BLOB, aliases BLOB, description BLOB)")
		cur.execute("CREATE TABLE IF NOT EXISTS properties(id INT UNSIGNED PRIMARY KEY, \
				rev INT UNSIGNED NOT NULL, \
				claims BLOB, datatype VARCHAR(20), label BLOB, aliases BLOB, description BLOB)")
		cur.execute("CREATE TABLE IF NOT EXISTS itemrevstats(id INT UNSIGNED NOT NULL, \
				rev INT UNSIGNED NOT NULL PRIMARY KEY, \
				day SMALLINT UNSIGNED NOT NULL, \
				langinfo BLOB, propinfo BLOB,\
				stat_num SMALLINT UNSIGNED NOT NULL, stat_ref_num SMALLINT UNSIGNED NOT NULL, stat_q_num SMALLINT UNSIGNED NOT NULL,\
				label_num SMALLINT UNSIGNED NOT NULL, desc_num SMALLINT UNSIGNED NOT NULL, link_num SMALLINT UNSIGNED NOT NULL,\
				alias_num SMALLINT UNSIGNED NOT NULL)")
		cur.execute("CREATE UNIQUE INDEX idx_idday ON itemrevstats (id,day)")
		cur.execute("CREATE INDEX idx_day ON itemrevstats (day)")
		cur.execute("CREATE INDEX idx_stat_num ON itemrevstats (stat_num)")
		cur.execute("CREATE TABLE IF NOT EXISTS proprevstats(id INT UNSIGNED NOT NULL, \
				rev INT UNSIGNED NOT NULL PRIMARY KEY, \
				day SMALLINT UNSIGNED NOT NULL, \
				langinfo BLOB, label_num SMALLINT UNSIGNED NOT NULL, desc_num SMALLINT UNSIGNED NOT NULL,\
				alias_num SMALLINT UNSIGNED NOT NULL)")
		cur.execute("CREATE UNIQUE INDEX idx_idday ON proprevstats (id,day)")
		logging.log(" done.")
Example #3
    def __init__(self, helper):
        self.helper = helper
        self.botEdits = {}
        self.humanEdits = {}
        self.anonEdits = {}
        self.botTotal = 0
        self.humanTotal = 0
        self.anonTotal = 0
        self.curMin = 100000000
        self.curMax = -100000000

        self.editsByUser = {}

        logging.logMore('Loading list of bots ')
        self.bots = []
        try:
            botsjson = urllib.urlopen(
                'http://www.wikidata.org/w/api.php?action=query&list=allusers&augroup=bot&aulimit=500&format=json'
            ).read()
            botsdata = json.loads(botsjson)  # parse the JSON response safely (needs "import json"); eval() was a bug magnet
            for bot in botsdata['query']['allusers']:
                self.bots.append(bot['name'])
                logging.logMore('.')
            logging.log(' found ' + str(len(self.bots)) + ' bot accounts.')
        except IOError:
            logging.log(
                ' *** Error: Could not retrieve bot accounts. Bots will not be distinguished.'
            )
Example #4
	def fetchNewerDailies(self):
		self.getDailyDates()
		if not self.maxrevid:
			self.fetchLatestDump()

		self.__cdData()
		self.stopdaily = '20121026'
		self.newerdailies = []
		for daily in reversed(self.dailies) :
			logging.logMore('Checking daily ' + daily + ' ... ')
			if not os.path.exists('daily' + daily) :
				os.makedirs('daily' + daily)
			os.chdir('daily' + daily)

			if daily > self.maxdumpdate:
				logging.log('too recent to consider')
				os.chdir('..')
				continue

			if not os.path.exists('maxrevid.txt') :
				maxrevSource = 'http://dumps.wikimedia.org/other/incr/wikidatawiki/' + daily + '/maxrevid.txt'
				urllib.urlretrieve(maxrevSource, 'maxrevid.txt')
			else:
				maxrevSource = 'Local Max Rev File'
			
			try:
				dailymaxrevid = int(open('maxrevid.txt').read())
			except ValueError:
				# maxrevid.txt can be empty or malformed if a daily dump failed;
				# dailymaxrevid then stays unset, but it is not used below
				logging.log(maxrevSource + ' throws ValueError')

			if daily < self.getLatestDumpDate() :
				logging.log('already in latest ' + self.dumpName)
				self.stopdaily = daily
				os.chdir('..')
				break

			if not os.path.exists('pages-meta-hist-incr.xml.bz2') :
				if self.offline:
					logging.log('not downloading daily in offline mode')
				else:
					logging.logMore('downloading ... ')
					if urllib.urlopen('http://dumps.wikimedia.org/other/incr/wikidatawiki/' + daily + '/status.txt').read() == 'done' :
						urllib.urlretrieve('http://dumps.wikimedia.org/other/incr/wikidatawiki/' + daily + '/wikidatawiki-' + daily + '-pages-meta-hist-incr.xml.bz2', 'pages-meta-hist-incr.xml.bz2') #xxx
						logging.log('done')
						self.newerdailies.append(daily)
					else :
						logging.log('daily not done yet; download aborted')
			else:
				logging.log('daily already downloaded')
				self.newerdailies.append(daily)

			os.chdir('..')

		self.__cdBase()
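Every branch of fetchNewerDailies has to undo its os.chdir('daily' + daily) by hand before a continue or break. A context manager is one way to make that pairing automatic; this is a sketch of an alternative, not part of the original code:

    import os
    from contextlib import contextmanager

    @contextmanager
    def working_dir(path):
        # Hypothetical helper: enter a directory and always return to
        # the previous one, even when the body raises or breaks out.
        prev = os.getcwd()
        os.chdir(path)
        try:
            yield
        finally:
            os.chdir(prev)

    # Usage sketch:
    # with working_dir('daily' + daily):
    #     ... check and download the daily export ...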
Example #5
	def getDays(self):
		logging.logMore("Getting days: ")
		cur = self.db.query("SELECT DISTINCT(day) FROM itemrevstats ORDER BY day ASC",())
		self.days = []
		print "Days: " + str(cur.rowcount)
		row = cur.fetchone()
		while row:
			self.days.append(int(row[0]))
			logging.logMore(str(row[0]) + ' ')
			row = cur.fetchone()
		cur.close()
		logging.log("... done")
Example #6
    def getDays(self):
        logging.logMore("Getting days: ")
        cur = self.db.query(
            "SELECT DISTINCT(day) FROM itemrevstats ORDER BY day ASC", ())
        self.days = []
        print "Days: " + str(cur.rowcount)
        row = cur.fetchone()
        while row:
            self.days.append(int(row[0]))
            logging.logMore(str(row[0]) + ' ')
            row = cur.fetchone()
        cur.close()
        logging.log("... done")
Example #7
    def fetchNewerDailies(self):
        self.getDailyDates()
        if not self.maxrevid:
            self.fetchLatestDump()

        self.__cdData()
        self.stopdaily = "20121026"
        self.newerdailies = []
        for daily in reversed(self.dailies):
            logging.logMore("Checking daily " + daily + " ... ")
            if not os.path.exists("daily" + daily):
                os.makedirs("daily" + daily)
            os.chdir("daily" + daily)

            if not os.path.exists("maxrevid.txt"):
                urllib.urlretrieve(
                    "http://dumps.wikimedia.org/other/incr/wikidatawiki/" + daily + "/maxrevid.txt", "maxrevid.txt"
                )
            dailymaxrevid = int(open("maxrevid.txt").read())

            if dailymaxrevid < self.maxrevid:
                logging.log("already in latest dump")
                self.stopdaily = daily
                os.chdir("..")
                break

            if not os.path.exists("pages-meta-hist-incr.xml.bz2"):
                logging.logMore("downloading ... ")
                if (
                    urllib.urlopen("http://dumps.wikimedia.org/other/incr/wikidatawiki/" + daily + "/status.txt").read()
                    == "done"
                ):
                    urllib.urlretrieve(
                        "http://dumps.wikimedia.org/other/incr/wikidatawiki/"
                        + daily
                        + "/wikidatawiki-"
                        + daily
                        + "-pages-meta-hist-incr.xml.bz2",
                        "pages-meta-hist-incr.xml.bz2",
                    )  # xxx
                    logging.log("done")
                    self.newerdailies.append(daily)
                else:
                    logging.log("daily not done yet; download aborted")
            else:
                logging.log("daily already downloaded")
                self.newerdailies.append(daily)

            os.chdir("..")

        self.__cdBase()
Example #8
	def getLatestDumpDate(self):
		if not self.latestdump:
			self.latestdump = '00000000'
			if self.offline:
				logging.logMore('Checking for the date of the last local ' + self.dumpName + ' ')
				dataDirs = os.listdir("data")
				for dirName in dataDirs:
					if not dirName.startswith(self.dumpDirName): continue
					date = dirName[len(self.dumpDirName):]
					if not re.match('\d\d\d\d\d\d\d\d', date) : continue
					logging.logMore('.')
					if date > self.latestdump and date <= self.maxdumpdate:
						self.latestdump = date
			else:
				logging.logMore('Checking for the date of the last online ' + self.dumpName + ' ')
				for line in urllib.urlopen('http://dumps.wikimedia.org/wikidatawiki/') :
					if not line.startswith('<tr><td class="n">') : continue
					date = line[27:35]
					if not re.match('\d\d\d\d\d\d\d\d', date) : continue
					logging.logMore('.')
					#logging.log("Checking dump of " + date)
					# check if dump is finished
					finished = False
					for md5 in urllib.urlopen('http://dumps.wikimedia.org/wikidatawiki/' + date + '/wikidatawiki-' + date + '-md5sums.txt') :
						if md5.endswith(self.dumpPostFix + "\n") :
							finished = True
							break
					if finished and date > self.latestdump and date <= self.maxdumpdate:
						self.latestdump = date

			if self.latestdump == '00000000':
				logging.log('-- Warning: no latest ' + self.dumpName + ' found.')
			else:
				logging.log(' latest ' + self.dumpName + ' is ' + self.latestdump)
		return self.latestdump
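The slice line[27:35] relies on the exact shape of the Apache-style directory index that dumps.wikimedia.org served: 18 characters of '<tr><td class="n">' plus 9 characters of '<a href="' put the 8-digit date at offset 27. A small check with a made-up listing line:

    import re

    # Hypothetical sample line in the format the slice expects:
    line = '<tr><td class="n"><a href="20140106/">20140106/</a></td>'
    date = line[27:35]
    assert date == '20140106'
    assert re.match(r'\d\d\d\d\d\d\d\d', date)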
Example #9
    def getDailyDates(self):
        if not self.dailies:
            logging.logMore("Fetching information about available daily exports ")
            self.dailies = []
            for line in urllib.urlopen("http://dumps.wikimedia.org/other/incr/wikidatawiki/"):
                if not line.startswith('<tr><td class="n">'):
                    continue
                date = line[27:35]
                if not re.match("\d\d\d\d\d\d\d\d", date):
                    continue
                logging.logMore(".")
                self.dailies.append(date)
            logging.log(" found " + str(len(self.dailies)) + " daily exports.")
        return self.dailies
Example #10
    def makeStatistics(self):
        self.startTime = time.time()
        self.getDays()
        #self.days = [447]
        prevDay = -1
        for day in self.days:
            if prevDay == -1:
                self.dayStats[day] = self.getEmptyDayStats()
            else:
                self.dayStats[day] = self.dayStats[prevDay].copy()
                self.dayStats[day]['changeditems'] = 0
            prevDay = day

            logging.logMore("Fetching data for day " + str(day))
            cur = self.db.query("SELECT * FROM itemrevstats WHERE day=%s",
                                (day))
            logging.log(" ... done.")

            row = cur.fetchone()
            while row:
                if not self.processeditems[int(row[0])]:
                    isNewEntity = True
                    self.processeditems[int(row[0])] = True
                    self.dayStats[day]['items'] += 1
                else:
                    isNewEntity = False

                self.updateStats(row, isNewEntity, day)

                self.dayStats[day]['changeditems'] += 1
                self.totalItems += 1
                if self.totalItems % 100000 == 0:
                    print "Processed " + str(
                        self.totalItems) + " items in " + str(
                            round(time.time() - self.startTime,
                                  2)) + " sec. Current data:\n" + str(
                                      self.dayStats[day])

                row = cur.fetchone()

            print "Processed " + str(
                self.totalItems) + " items. Final data:\n" + str(self.dayStats)
            # Close the cursor (this is essential) ...
            cur.close()
            # ... and be paranoid about memory leaks (may not have much effect)
            del cur
            self.db.reopenDatabase()
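The parameter tuple in the query call above is worth a note: in Python, parentheses alone do not create a tuple, so (day) is just the integer day, and only the trailing comma in (day,) produces the one-element tuple that DB-API parameter binding expects. A tiny illustration:

    day = 447
    assert (day) == 447          # no tuple, just grouping parentheses
    assert (day,) == (447,)      # the comma makes the tuple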
Example #11
	def makeStatistics(self):
		self.startTime = time.time()
		self.getDays()
		#self.days = [447]
		prevDay = -1
		for day in self.days:
			if prevDay == -1:
				self.dayStats[day] = self.getEmptyDayStats()
			else:
				self.dayStats[day] = self.dayStats[prevDay].copy()
				self.dayStats[day]['changeditems'] = 0
			prevDay = day

			logging.logMore("Fetching data for day " + str(day))
			cur = self.db.query("SELECT * FROM itemrevstats WHERE day=%s",(day))
			logging.log(" ... done.")

			row = cur.fetchone()
			while row:
				if not self.processeditems[int(row[0])]:
					isNewEntity = True
					self.processeditems[int(row[0])] = True
					self.dayStats[day]['items'] += 1
				else:
					isNewEntity = False

				self.updateStats(row,isNewEntity,day)

				self.dayStats[day]['changeditems'] += 1
				self.totalItems += 1
				if self.totalItems % 100000 == 0:
					print "Processed " + str(self.totalItems) + " items in " + str(round(time.time() - self.startTime,2)) + " sec. Current data:\n" + str(self.dayStats[day])

				row = cur.fetchone()

			print "Processed " + str(self.totalItems) + " items. Final data:\n" + str(self.dayStats)
			# Close the cursor (this is essential) ...
			cur.close()
			# ... and be paranoid about memory leaks (may not have much effect)
			del cur
			self.db.reopenDatabase()
Example #12
	def __init__(self,helper):
		self.helper = helper
		self.botEdits = {}
		self.humanEdits = {}
		self.anonEdits = {}
		self.botTotal = 0
		self.humanTotal = 0
		self.anonTotal = 0
		self.curMin = 100000000
		self.curMax = -100000000

		self.editsByUser = {}

		logging.logMore('Loading list of bots ')
		self.bots = []
		botsjson = urllib.urlopen('http://www.wikidata.org/w/api.php?action=query&list=allusers&augroup=bot&aulimit=500&format=json').read()
		botsdata = json.loads(botsjson)  # parse the JSON response safely (needs "import json"); eval() was a bug magnet
		for bot in botsdata['query']['allusers'] :
			self.bots.append(bot['name'])
			logging.logMore('.')
		logging.log(' found ' + str(len(self.bots)) + ' bot accounts.')
Example #13
    def getLatestDumpDate(self):
        if not self.latestdump:
            logging.logMore("Checking for the date of the last dump ")
            self.latestdump = "20121026"
            for line in urllib.urlopen("http://dumps.wikimedia.org/wikidatawiki/"):
                if not line.startswith('<tr><td class="n">'):
                    continue
                date = line[27:35]
                if not re.match("\d\d\d\d\d\d\d\d", date):
                    continue
                logging.logMore(".")
                # logging.log("Checking dump of " + date)
                # check if dump is finished
                finished = False
                for md5 in urllib.urlopen(
                    "http://dumps.wikimedia.org/wikidatawiki/" + date + "/wikidatawiki-" + date + "-md5sums.txt"
                ):
                    if md5.endswith("-pages-meta-history.xml.bz2\n"):
                        finished = True
                        break  # no need to scan the remaining checksum lines
                if finished:
                    self.latestdump = date
            logging.log(" latest dump has been on " + self.latestdump)
        return self.latestdump
Example #14
    def getLatestDumpDate(self):
        if not self.latestdump:
            self.latestdump = '00000000'
            if self.offline:
                logging.logMore('Checking for the date of the last local ' +
                                self.dumpName + ' ')
                dataDirs = os.listdir("data")
                for dirName in dataDirs:
                    if not dirName.startswith(self.dumpDirName): continue
                    date = dirName[len(self.dumpDirName):]
                    if not re.match('\d\d\d\d\d\d\d\d', date): continue
                    logging.logMore('.')
                    if date > self.latestdump and date <= self.maxdumpdate:
                        self.latestdump = date
            else:
                logging.logMore('Checking for the date of the last online ' +
                                self.dumpName + ' ')
                for line in urllib.urlopen(
                        'http://dumps.wikimedia.org/wikidatawiki/'):
                    if not line.startswith('<tr><td class="n">'): continue
                    date = line[27:35]
                    if not re.match('\d\d\d\d\d\d\d\d', date): continue
                    logging.logMore('.')
                    #logging.log("Checking dump of " + date)
                    # check if dump is finished
                    finished = False
                    for md5 in urllib.urlopen(
                            'http://dumps.wikimedia.org/wikidatawiki/' + date +
                            '/wikidatawiki-' + date + '-md5sums.txt'):
                        if md5.endswith(self.dumpPostFix + "\n"):
                            finished = True
                            break
                    if finished and date > self.latestdump and date <= self.maxdumpdate:
                        self.latestdump = date

            if self.latestdump == '00000000':
                logging.log('-- Warning: no latest ' + self.dumpName +
                            ' found.')
            else:
                logging.log(' latest ' + self.dumpName + ' is ' +
                            self.latestdump)
        return self.latestdump
Example #15
	def getDailyDates(self):
		if not self.dailies:
			self.dailies = []
			if self.offline:
				logging.logMore("Finding daily exports available locally ")
				dataDirs = os.listdir("data")
				for dirName in dataDirs:
					if not dirName.startswith('daily'): continue
					date = dirName[5:]
					if not re.match('\d\d\d\d\d\d\d\d', date) : continue
					logging.logMore('.')
					self.dailies.append(date)
				self.dailies = sorted(self.dailies)
			else:
				logging.logMore("Finding daily exports online ")
				for line in urllib.urlopen('http://dumps.wikimedia.org/other/incr/wikidatawiki/') :
					if not line.startswith('<tr><td class="n">') : continue
					date = line[27:35]
					if not re.match('\d\d\d\d\d\d\d\d', date) : continue
					logging.logMore('.')
					self.dailies.append(date)
			logging.log(" found " + str(len(self.dailies)) + " daily exports.")
		return self.dailies
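Throughout these examples, dates are handled as zero-padded YYYYMMDD strings, which is why plain string comparison and sorted() order them chronologically:

    assert '20121026' < '20130101'
    assert sorted(['20130101', '20121026']) == ['20121026', '20130101']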
Example #16
    def getDailyDates(self):
        if not self.dailies:
            self.dailies = []
            if self.offline:
                logging.logMore("Finding daily exports available locally ")
                dataDirs = os.listdir("data")
                for dirName in dataDirs:
                    if not dirName.startswith('daily'): continue
                    date = dirName[5:]
                    if not re.match('\d\d\d\d\d\d\d\d', date): continue
                    logging.logMore('.')
                    self.dailies.append(date)
                self.dailies = sorted(self.dailies)
            else:
                logging.logMore("Finding daily exports online ")
                for line in urllib.urlopen(
                        'http://dumps.wikimedia.org/other/incr/wikidatawiki/'):
                    if not line.startswith('<tr><td class="n">'): continue
                    date = line[27:35]
                    if not re.match('\d\d\d\d\d\d\d\d', date): continue
                    logging.logMore('.')
                    self.dailies.append(date)
            logging.log(" found " + str(len(self.dailies)) + " daily exports.")
        return self.dailies
Example #17
    def fetchNewerDailies(self):
        self.getDailyDates()
        if not self.maxrevid:
            self.fetchLatestDump()

        self.__cdData()
        self.stopdaily = '20121026'
        self.newerdailies = []
        for daily in reversed(self.dailies):
            logging.logMore('Checking daily ' + daily + ' ... ')
            if not os.path.exists('daily' + daily):
                os.makedirs('daily' + daily)
            os.chdir('daily' + daily)

            if daily > self.maxdumpdate:
                logging.log('too recent to consider')
                os.chdir('..')
                continue

            if not os.path.exists('maxrevid.txt'):
                maxrevSource = 'http://dumps.wikimedia.org/other/incr/wikidatawiki/' + daily + '/maxrevid.txt'
                urllib.urlretrieve(maxrevSource, 'maxrevid.txt')
            else:
                maxrevSource = 'Local Max Rev File'

            try:
                dailymaxrevid = int(open('maxrevid.txt').read())
            except ValueError:
                # maxrevid.txt can be empty or malformed if a daily dump failed;
                # dailymaxrevid then stays unset, but it is not used below
                logging.log(maxrevSource + ' throws ValueError')

            if daily < self.getLatestDumpDate():
                logging.log('already in latest ' + self.dumpName)
                self.stopdaily = daily
                os.chdir('..')
                break

            if not os.path.exists('pages-meta-hist-incr.xml.bz2'):
                if self.offline:
                    logging.log('not downloading daily in offline mode')
                else:
                    logging.logMore('downloading ... ')
                    if urllib.urlopen(
                            'http://dumps.wikimedia.org/other/incr/wikidatawiki/'
                            + daily + '/status.txt').read() == 'done':
                        urllib.urlretrieve(
                            'http://dumps.wikimedia.org/other/incr/wikidatawiki/'
                            + daily + '/wikidatawiki-' + daily +
                            '-pages-meta-hist-incr.xml.bz2',
                            'pages-meta-hist-incr.xml.bz2')  #xxx
                        logging.log('done')
                        self.newerdailies.append(daily)
                    else:
                        logging.log('daily not done yet; download aborted')
            else:
                logging.log('daily already downloaded')
                self.newerdailies.append(daily)

            os.chdir('..')

        self.__cdBase()
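The status check before each download is easy to factor out. A hedged sketch of a standalone helper (not in the original code; the function name is hypothetical) that mirrors the test used above, where a daily export is complete once its status.txt reads exactly 'done':

    import urllib

    def daily_is_done(daily):
        # Returns True when the daily export has finished server-side.
        url = ('http://dumps.wikimedia.org/other/incr/wikidatawiki/'
               + daily + '/status.txt')
        return urllib.urlopen(url).read() == 'done'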