Esempio n. 1
0
 def get_known_tables(self):
     '''
     run 'show tables' against the db named by self.db_name and
     return the output as a list of lines

     the two command word lists from build_sql_command are joined
     into one shell pipeline, with --silent appended to the second
     command; the output is decoded as utf-8 before it is split
     '''
     server = DbServerInfo(self, self.db_name)
     pieces = server.build_sql_command("'show tables'")
     echo_words, mysql_words = pieces[0], pieces[1]
     pipeline = " | ".join([" ".join(echo_words), " ".join(mysql_words)]) + " --silent"
     raw_output = RunSimpleCommand.run_with_output(pipeline, shell=True)
     decoded = raw_output.decode('utf-8')
     return decoded.splitlines()
Esempio n. 2
0
def run_simple_query(query, wiki):
    '''
    meant for a mysql query which returns only one field from
    one row.

    the query is turned into a shell pipeline for the wiki's db
    server, the pipeline is logged and run, and the output of
    RunSimpleCommand.run_with_output is returned unchanged
    '''
    server = DbServerInfo(wiki, wiki.db_name)
    pieces = server.build_sql_command(query)
    echo_words, mysql_words = pieces[0], pieces[1]
    to_run = " | ".join([" ".join(echo_words), " ".join(mysql_words)]) + " --silent"
    log.info("running with no output: " + to_run)
    output = RunSimpleCommand.run_with_output(to_run, shell=True)
    return output
Esempio n. 3
0
    def dump_max_revid(self):
        '''
        dump maximum rev id from wiki that's older than
        the configured number of seconds (cutoff)

        we have this cutoff so that content really new
        is not dumped; we want to give curators the chance to
        remove problematic entries first.

        a cutoff of some hours is reasonable.

        if the max rev id file for this wiki and date does not exist
        yet, the rev id is retrieved from the db and written to that
        file (on a dry run it is only reported). if no rev id was
        retrieved, the value is read from the file instead.

        returns the rev id plus one as a string (the end rev id is not
        included in the dump), or None if no rev id could be determined
        '''
        max_revid = None
        revidfile = MaxRevIDFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        revid_path = revidfile.get_path()
        if exists(revid_path):
            self.log.info("Wiki %s, max rev id file %s already exists",
                          self.wiki.db_name, revid_path)
        else:
            self.log.info("Wiki %s retrieving max revid from db.",
                          self.wiki.db_name)
            query = ("select rev_id from revision where rev_timestamp < \"%s\" "
                     "order by rev_timestamp desc limit 1" % self.cutoff)
            db_info = DbServerInfo(self.wiki, self.wiki.db_name)
            results = db_info.run_sql_and_get_output(query)
            lines = results.splitlines() if results else []
            # the value is taken from the second output line (the first is
            # presumably the column header); output with fewer than two
            # lines is treated as "nothing retrieved" instead of raising
            # IndexError
            if len(lines) > 1 and lines[1].isdigit():
                max_revid = lines[1]
                if self.dryrun:
                    print("would write file {path} with contents {revid}".format(
                        path=revid_path, revid=max_revid))
                else:
                    FileUtils.write_file_in_place(
                        revid_path, max_revid.decode('utf-8'),
                        self.wiki.config.fileperms)
        if not max_revid:
            # the path computed above is reused here so that the error
            # handler never refers to a name that was not assigned
            try:
                # NOTE(review): rstrip() is applied to the path, not to the
                # file contents; int() below tolerates surrounding
                # whitespace, so this is left as it was - confirm intent
                max_revid = FileUtils.read_file(revid_path.rstrip())
            except Exception as ex:
                self.log.info("Error encountered reading maxrevid from %s ", revid_path,
                              exc_info=ex)
                max_revid = None

        # end rev id is not included in dump
        if max_revid is not None:
            max_revid = str(int(max_revid) + 1)

        self.log.info("max_revid is %s", safe(max_revid))
        return max_revid
Esempio n. 4
0
def get_revs_per_page_interval(page_id_start, interval, wiki, db_info):
    '''
    given page id start and the number of pages, get
    and return total number of revisions these pages have

    wiki is a Wiki object for the specific wiki
    db_info is a DbServerInfo object for the specific wiki

    if the query returns None, it is retried up to 5 times with a
    5 second pause before each retry, using a fresh DbServerInfo
    each time.

    returns the count as an int, or 0 if the value in the output is
    not a number. writes a message to stderr and exits with status 1
    if no usable output was obtained.
    '''
    query = ("select COUNT(rev_id) from revision where "
             "rev_page >= %s and rev_page < %s;" % (
                 page_id_start, page_id_start + interval))
    maxretries = 5
    retries = 0

    results = db_info.run_sql_and_get_output(query)
    while results is None and retries < maxretries:
        retries += 1
        time.sleep(5)
        # maybe the server was depooled. if so we will get another one
        db_info = DbServerInfo(wiki, wiki.db_name)
        results = db_info.run_sql_and_get_output(query)

    # the output is interpreted the same way whether it came from the
    # first attempt or from a retry; the value is expected on the
    # second line, and shorter output counts as a failure rather than
    # raising IndexError
    if results:
        lines = results.splitlines()
        if len(lines) > 1 and lines[1]:
            if not lines[1].isdigit():
                return 0   # probably NULL or missing table
            return int(lines[1])

    sys.stderr.write("failed to get revision count for page range from db, exiting\n")
    sys.exit(1)
Esempio n. 5
0
    def get_command(self, wiki, outfile_path, outfile_base, base):
        '''
        build and return the command for running this object's query
        and sending the output to outfile_path.

        the db server info comes from base, or from wiki when base is
        None. the output is piped through gzip or bzip2 when
        outfile_base ends in .gz or .bz2 respectively; otherwise no
        compressor is named in the pipe string. the wiki's db name is
        substituted into the query as {w}.
        '''
        db_source = wiki if base is None else base
        dbserver = DbServerInfo(db_source, db_source.db_name)

        # first matching suffix decides the compressor
        compress = ""
        for suffix, program in ((".gz", "gzip"), (".bz2", "bzip2")):
            if outfile_base.endswith(suffix):
                compress = program
                break

        pipeto = "%s > %s" % (compress, outfile_path)
        filled_query = self.query.format(w=wiki.db_name)
        return dbserver.build_sql_command(filled_query, pipeto)
Esempio n. 6
0
def get_max_id(wikiconf, wikidb, id_field, table):
    '''
    retrieve the largest id for this wiki from the db for specific table
    pass in name of id field, name of table

    the table name is prefixed with the db table prefix from the
    DbServerInfo for the wiki.

    if the query returns None, it is retried up to 5 times with a
    5 second pause before each retry.

    returns the id as an int, or 0 if the value in the output is not
    a number. writes a message to stderr and exits with status 1 if
    no usable output was obtained.
    '''
    wiki = Wiki(wikiconf, wikidb)

    db_info = DbServerInfo(wiki, wikidb)
    query = "select MAX(%s) from %s%s;" % (
        id_field, db_info.db_table_prefix, table)
    maxretries = 5
    retries = 0

    results = db_info.run_sql_and_get_output(query)
    while results is None and retries < maxretries:
        retries += 1
        time.sleep(5)
        results = db_info.run_sql_and_get_output(query)

    # the output is interpreted the same way whether it came from the
    # first attempt or from a retry; the value is expected on the
    # second line, and shorter output counts as a failure rather than
    # raising IndexError
    if results:
        lines = results.splitlines()
        if len(lines) > 1 and lines[1]:
            if not lines[1].isdigit():
                return 0   # probably NULL or missing table
            return int(lines[1])

    sys.stderr.write("failed to get max page id from db, exiting\n")
    sys.exit(1)
Esempio n. 7
0
 def __init__(self, dbname, config, verbose=False):
     """
     store the db name, configuration and verbosity flag, then
     create the Wiki object and the DbServerInfo for that db
     """
     self.dbname, self.config, self.verbose = dbname, config, verbose
     self.wiki = Wiki(self.config, self.dbname)
     self.db_info = DbServerInfo(self.wiki, self.dbname)
Esempio n. 8
0
class QueryRunner():
    """
    runs various db queries related to page, revision count, etc.
    """
    def __init__(self, dbname, config, verbose=False):
        """
        store the db name, configuration and verbosity flag, then
        create the Wiki object and the DbServerInfo for that db
        """
        self.dbname, self.config, self.verbose = dbname, config, verbose
        self.wiki = Wiki(self.config, self.dbname)
        self.db_info = DbServerInfo(self.wiki, self.dbname)

    @staticmethod
    def _give_up(query, queryout):
        """
        display the query and the output that could not be used,
        then exit with status 1
        """
        print("unexpected output from sql query, giving up:")
        print(query, queryout)
        sys.exit(1)

    def _run_or_give_up(self, query):
        """
        run the query with retries and return its output;
        exit if the output is None
        """
        queryout = self.db_info.run_sql_query_with_retries(query)
        if queryout is None:
            self._give_up(query, queryout)
        return queryout

    def get_max_id(self, idtype):
        """
        return the max page id for idtype 'page', the max rev id
        for idtype 'rev', and None for anything else
        """
        known_types = (('page', 'page_id', 'page'),
                       ('rev', 'rev_id', 'revision'))
        for known, id_field, table in known_types:
            if idtype == known:
                return xmlstreams.get_max_id(self.config, self.dbname, id_field, table)
        return None

    def get_count(self, page_start, page_end):
        """
        return the number of revisions belonging to pages with
        page_start <= page id < page_end

        exits if the query gives no output or no count can be
        extracted from it
        """
        query = ("select count(rev_id) from revision where "
                 "rev_page >= {start} and rev_page < {end}".format(
                     start=page_start, end=page_end))
        queryout = self._run_or_give_up(query)

        revcount = get_count_from_output(queryout)
        if revcount is None:
            self._give_up(query, queryout)
        return revcount

    def get_length(self, page_start, page_end):
        """
        return the sum of rev_len over all revisions belonging to
        pages with page_start <= page id < page_end

        exits if the query gives no output or no length can be
        extracted from it
        """
        query = ("select sum(rev_len) from revision where "
                 "rev_page >= {start} and rev_page < {end}".format(
                     start=page_start, end=page_end))
        queryout = self._run_or_give_up(query)

        revlength = get_length_from_output(queryout)
        if revlength is None:
            self._give_up(query, queryout)
        return revlength

    def get_estimate(self, page_start, page_end):
        """
        return the estimate of the number of revisions (via explain)
        for pages with page_start <= page id <= page_end; note that
        page_end is included here

        exits if the query gives no output
        """
        query = ("explain select count(rev_id) from revision where "
                 "rev_page >= {start} and rev_page <= {end}".format(
                     start=page_start, end=page_end))
        queryout = self._run_or_give_up(query)
        return get_estimate_from_output(queryout)