def get_known_tables(self):
    """
    run 'show tables' against the db server for this object's db_name
    and return the output as a list of text lines

    the command is the echo | mysql pipeline from build_sql_command,
    run through the shell with --silent appended
    """
    server = DbServerInfo(self, self.db_name)
    pipeline = server.build_sql_command("'show tables'")
    # first element is the echo command, second is the mysql command;
    # both are sequences of words to be joined with spaces
    stages = [" ".join(pipeline[0]), " ".join(pipeline[1])]
    shell_command = " | ".join(stages) + " --silent"
    raw_output = RunSimpleCommand.run_with_output(shell_command, shell=True)
    text = raw_output.decode('utf-8')
    return text.splitlines()
def run_simple_query(query, wiki):
    '''
    run a mysql query that is expected to produce a single field
    from a single row, and hand back whatever the command printed

    the output is returned exactly as RunSimpleCommand.run_with_output
    gives it to us; nothing is decoded or stripped here
    '''
    server = DbServerInfo(wiki, wiki.db_name)
    pipeline = server.build_sql_command(query)
    # pipeline[0] is the echo command, pipeline[1] the mysql command
    echo_part = " ".join(pipeline[0])
    mysql_part = " ".join(pipeline[1])
    shell_command = echo_part + " | " + mysql_part + " --silent"
    message = "running with no output: " + shell_command
    log.info(message)
    return RunSimpleCommand.run_with_output(shell_command, shell=True)
def dump_max_revid(self):
    '''
    dump maximum rev id from wiki that's older than
    the configured number of seconds (cutoff)

    we have this cutoff so that content really new
    is not dumped; we want to give curators the chance to
    remove problematic entries first.
    a cutoff of some hours is reasonable.

    if the max rev id file does not exist yet, the value is looked up
    in the db and written to that file (or, for a dry run, we only print
    what would be written). if no value came from the db, the file is
    read instead.

    returns the max rev id plus one, as a string (the end rev id is not
    included in the dump), or None if no value could be obtained
    '''
    max_revid = None
    revidfile = MaxRevIDFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
    if exists(revidfile.get_path()):
        self.log.info("Wiki %s, max rev id file %s already exists",
                      self.wiki.db_name, revidfile.get_path())
    else:
        self.log.info("Wiki %s retrieving max revid from db.",
                      self.wiki.db_name)
        query = ("select rev_id from revision where rev_timestamp < \"%s\" "
                 "order by rev_timestamp desc limit 1" % self.cutoff)
        db_info = DbServerInfo(self.wiki, self.wiki.db_name)
        results = db_info.run_sql_and_get_output(query)
        if results:
            lines = results.splitlines()
            # the value is taken from the second line of output; check the
            # length first so that short output is treated as "no value"
            # instead of raising IndexError
            if len(lines) > 1 and lines[1] and lines[1].isdigit():
                max_revid = lines[1]
                if self.dryrun:
                    print("would write file {path} with contents {revid}".format(
                        path=revidfile.get_path(), revid=max_revid))
                else:
                    FileUtils.write_file_in_place(
                        revidfile.get_path(), max_revid.decode('utf-8'),
                        self.wiki.config.fileperms)

    if not max_revid:
        # nothing from the db (or the file was already there): fall back to
        # the file contents. the file object is created before the try so
        # that the except clause can always refer to it
        file_obj = MaxRevIDFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        try:
            # NOTE(review): rstrip() is applied to the path and not to the
            # file contents; int() below tolerates surrounding whitespace
            # so this is harmless, but confirm the intent
            max_revid = FileUtils.read_file(file_obj.get_path().rstrip())
        except Exception as ex:
            self.log.info("Error encountered reading maxrevid from %s ", file_obj.get_path(),
                          exc_info=ex)
            max_revid = None

    # end rev id is not included in dump
    if max_revid is not None:
        max_revid = str(int(max_revid) + 1)

    self.log.info("max_revid is %s", safe(max_revid))
    return max_revid
def _revcount_from_output(results):
    '''
    extract the count from the raw output of the revision count query

    returns the integer found on the second line of the output,
    0 if that line is not numeric (probably NULL or missing table),
    or None if the output is empty or has no usable second line
    '''
    if not results:
        return None
    lines = results.splitlines()
    if len(lines) < 2 or not lines[1]:
        return None
    if not lines[1].isdigit():
        return 0
    return int(lines[1])


def get_revs_per_page_interval(page_id_start, interval, wiki, db_info):
    '''
    given page id start and the number of pages, get
    and return total number of revisions these pages have

    wiki is a Wiki object for the specific wiki
    db_info is a DbServerInfo object for the specific wiki

    if the query returns None, it is retried up to 5 times with a
    5 second pause and a fresh DbServerInfo object each time. every
    attempt is parsed the same way, so a count of 0 or a non-numeric
    value such as NULL gives 0 whether it arrives on the first attempt
    or on a retry.

    if no count can be extracted at all, an error is written to stderr
    and we exit with status 1
    '''
    query = ("select COUNT(rev_id) from revision where "
             "rev_page >= %s and rev_page < %s;" % (
                 page_id_start, page_id_start + interval))
    maxretries = 5

    results = db_info.run_sql_and_get_output(query)
    count = _revcount_from_output(results)

    retries = 0
    while results is None and retries < maxretries:
        retries = retries + 1
        time.sleep(5)
        # maybe the server was depooled. if so we will get another one
        db_info = DbServerInfo(wiki, wiki.db_name)
        results = db_info.run_sql_and_get_output(query)
        count = _revcount_from_output(results)

    if count is None:
        sys.stderr.write("failed to get revision count for page range from db, exiting\n")
        sys.exit(1)
    return count
def get_command(self, wiki, outfile_path, outfile_base, base):
    '''
    build and return the mysql command that runs this object's query
    for the given wiki and sends the output to outfile_path

    the db server is looked up via base when it is provided, otherwise
    via wiki; the wiki's db_name is substituted into the query as {w}.

    output is piped through gzip or bzip2 when outfile_base ends in
    .gz or .bz2. for any other name no compressor is named and the
    pipe-to string passed to build_sql_command is just " > " and the
    output path.
    '''
    source_wiki = wiki if base is None else base
    server = DbServerInfo(source_wiki, source_wiki.db_name)

    compressor = ""
    for suffix, program in ((".gz", "gzip"), (".bz2", "bzip2")):
        if outfile_base.endswith(suffix):
            compressor = program
            break

    redirect = "{0} > {1}".format(compressor, outfile_path)
    sql = self.query.format(w=wiki.db_name)
    return server.build_sql_command(sql, redirect)
def _max_id_from_output(results):
    '''
    extract the id from the raw output of the max id query

    returns the integer found on the second line of the output,
    0 if that line is not numeric (probably NULL or missing table),
    or None if the output is empty or has no usable second line
    '''
    if not results:
        return None
    lines = results.splitlines()
    if len(lines) < 2 or not lines[1]:
        return None
    if not lines[1].isdigit():
        return 0
    return int(lines[1])


def get_max_id(wikiconf, wikidb, id_field, table):
    '''
    retrieve the largest id for this wiki from the db for specific table
    pass in name of id field, name of table

    the table name is prefixed with the db table prefix from the
    DbServerInfo object for the wiki.

    if the query returns None, it is retried up to 5 times with a
    5 second pause each time. every attempt is parsed the same way,
    so a value of 0 or a non-numeric value such as NULL gives 0 whether
    it arrives on the first attempt or on a retry.

    if no value can be extracted at all, an error is written to stderr
    and we exit with status 1
    '''
    wiki = Wiki(wikiconf, wikidb)
    db_info = DbServerInfo(wiki, wikidb)
    query = "select MAX(%s) from %s%s;" % (
        id_field, db_info.db_table_prefix, table)
    maxretries = 5

    results = db_info.run_sql_and_get_output(query)
    max_id = _max_id_from_output(results)

    retries = 0
    while results is None and retries < maxretries:
        retries = retries + 1
        time.sleep(5)
        results = db_info.run_sql_and_get_output(query)
        max_id = _max_id_from_output(results)

    if max_id is None:
        sys.stderr.write("failed to get max page id from db, exiting\n")
        sys.exit(1)
    return max_id
def __init__(self, dbname, config, verbose=False):
    """
    remember the db name, config and verbosity, and set up the
    Wiki and DbServerInfo objects for that db
    """
    self.dbname, self.config, self.verbose = dbname, config, verbose
    self.wiki = Wiki(config, dbname)
    self.db_info = DbServerInfo(self.wiki, dbname)
class QueryRunner():
    """
    db queries about pages and revisions for one wiki: max ids,
    revision counts, cumulative revision lengths, row estimates
    """

    def __init__(self, dbname, config, verbose=False):
        self.dbname, self.config, self.verbose = dbname, config, verbose
        self.wiki = Wiki(config, dbname)
        self.db_info = DbServerInfo(self.wiki, dbname)

    @staticmethod
    def _give_up(query, queryout):
        """
        report the query and whatever output we got for it, then exit
        with status 1
        """
        print("unexpected output from sql query, giving up:")
        print(query, queryout)
        sys.exit(1)

    def _output_or_exit(self, query):
        """
        run the query with retries and return its output;
        if there is no output at all (None), report and exit
        """
        queryout = self.db_info.run_sql_query_with_retries(query)
        if queryout is None:
            self._give_up(query, queryout)
        return queryout

    def get_max_id(self, idtype):
        """
        return the max id for idtype 'page' or 'rev', as looked up
        by xmlstreams.get_max_id; any other idtype gives None
        """
        if idtype == 'page':
            id_field, table = 'page_id', 'page'
        elif idtype == 'rev':
            id_field, table = 'rev_id', 'revision'
        else:
            return None
        return xmlstreams.get_max_id(self.config, self.dbname, id_field, table)

    def get_count(self, page_start, page_end):
        """
        return the number of revisions belonging to pages with
        page_start <= page id < page_end

        exits with status 1 if the query gives no output or the
        count cannot be extracted from it
        """
        template = ("select count(rev_id) from revision where "
                    "rev_page >= {start} and rev_page < {end}")
        query = template.format(start=page_start, end=page_end)
        queryout = self._output_or_exit(query)
        revcount = get_count_from_output(queryout)
        if revcount is None:
            self._give_up(query, queryout)
        return revcount

    def get_length(self, page_start, page_end):
        """
        return the sum of rev_len over all revisions belonging to
        pages with page_start <= page id < page_end

        exits with status 1 if the query gives no output or the
        length cannot be extracted from it
        """
        template = ("select sum(rev_len) from revision where "
                    "rev_page >= {start} and rev_page < {end}")
        query = template.format(start=page_start, end=page_end)
        queryout = self._output_or_exit(query)
        revlength = get_length_from_output(queryout)
        if revlength is None:
            self._give_up(query, queryout)
        return revlength

    def get_estimate(self, page_start, page_end):
        """
        return the estimate that get_estimate_from_output extracts from
        an 'explain' of the revision count query for pages with
        page_start <= page id <= page_end (note: the upper bound is
        included here)

        exits with status 1 if the query gives no output
        """
        template = ("explain select count(rev_id) from revision where "
                    "rev_page >= {start} and rev_page <= {end}")
        query = template.format(start=page_start, end=page_end)
        queryout = self._output_or_exit(query)
        return get_estimate_from_output(queryout)