    def dump_html(self):
        '''
        dump HTML-formatted revision content from RESTBase
        for the given wiki and date
        '''
        dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date)
        outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date)
        htmlfile = HTMLFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        outputfile = htmlfile.get_filename(self.args['ns'])
        # /usr/bin/nodejs ./bin/dump_wiki --domain en.wikipedia.org --ns 0 \
        # --apiURL http://en.wikipedia.org/w/api.php \
        # --dataBase /srv/www/htmldumps/dumps/20160826/en.wikipedia.org.articles.ns0.sqlite3
        domain = self.get_domain_from_wikidbname()
        # FIXME: the nodejs wrapper which will do the compress etc stuff for one wiki is
        # not yet written
        command = [self.wiki.config.nodejs]
        command.append(self.wiki.config.scriptpath)
        command.extend(["--domain", domain, "--ns", self.args['ns'],
                        "--apiURL", "http://%s/w/api.php" % domain,
                        "--dataBase", os.path.join(outputdir, outputfile),
                        "--wiki=%s" % self.wiki.db_name,
                        "--output=gzip:%s" % os.path.join(outputdir, outputfile)])

        if self.dryrun:
            print("would run command for html dump:", command)
        else:
            success = RunSimpleCommand.run_with_no_output(
                command, shell=False,
                timeout=self.get_lock_timeout_interval(),
                timeout_callback=self.periodic_callback)
            if not success:
                self.log.warning("error producing html files for wiki %s", self.wiki.db_name)
                return False
        return True
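# A minimal sketch, not the real RunSimpleCommand: one plausible shape for a
# run_with_no_output() helper that runs the assembled command list, discards its
# output, fires a periodic callback, and gives up after a timeout. The actual
# dumps utility may behave differently; this is an illustrative assumption.
import subprocess
import time

def run_with_no_output_sketch(command, timeout=None, timeout_callback=None, poll_interval=5):
    '''run command, discarding output; call timeout_callback periodically; kill on timeout'''
    proc = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    started = time.time()
    while proc.poll() is None:
        time.sleep(poll_interval)
        if timeout_callback is not None:
            timeout_callback()
        if timeout is not None and time.time() - started > timeout:
            proc.kill()
            return False
    return proc.returncode == 0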
    def dump_stub(self, start_revid, end_revid):
        '''
        dump stubs (metadata) for revs from start_revid
        up to but not including end_revid
        '''
        if not self.steps['stubs']['run']:
            return True

        dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date)
        outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date)
        stubfile = StubFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        outputfile = stubfile.get_filename()
        script_command = MultiVersion.mw_script_as_array(self.wiki.config,
                                                         "dumpBackup.php")
        command = [self.wiki.config.php]
        command.extend(script_command)
        command.extend(["--wiki=%s" % self.wiki.db_name, "--stub", "--quiet",
                        "--output=gzip:%s" % os.path.join(outputdir, outputfile),
                        "--revrange", "--revstart=%s" % start_revid,
                        "--revend=%s" % end_revid])
        if self.dryrun:
            print "would run command for stubs dump:", command
        else:
            log.info("running with no output: " + " ".join(command))
            success = RunSimpleCommand.run_with_no_output(
                command, shell=False, timeout=self.get_lock_timeout_interval(),
                timeout_callback=self.periodic_callback)
            if not success:
                log.warning("error producing stub files for wiki %s", self.wiki.db_name)
                return False
        return True
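# Illustrative only: with hypothetical paths and revision ids (none of these
# literals come from the real configuration), the command list assembled above
# can be rendered as a single shell line for logging, e.g. with shlex.join()
# (Python 3.8+).
import shlex

example_command = [
    "/usr/bin/php", "/srv/mediawiki/multiversion/MWScript.php", "dumpBackup.php",
    "--wiki=enwiki", "--stub", "--quiet",
    "--output=gzip:/dumps/20160826/enwiki-20160826-stubs.gz",
    "--revrange", "--revstart=123456", "--revend=123999",
]
print(shlex.join(example_command))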
    def dump_revs(self):
        '''
        dump revision content corresponding to previously-dumped
        stubs (revision metadata)
        '''
        if not self.steps['revs']['run']:
            return True
        dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date)
        outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date)
        revsfile = RevsFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        outputfile = revsfile.get_filename()
        script_command = MultiVersion.mw_script_as_array(self.wiki.config,
                                                         "dumpTextPass.php")
        command = [self.wiki.config.php]
        command.extend(script_command)
        stubfile = StubFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        stuboutputfile = stubfile.get_filename()
        command.extend(["--wiki=%s" % self.wiki.db_name,
                        "--stub=gzip:%s" % os.path.join(outputdir, stuboutputfile),
                        "--quiet",
                        "--spawn=%s" % self.wiki.config.php,
                        "--output=bzip2:%s" % os.path.join(outputdir, outputfile)])
        if self.dryrun:
         print "would run command for revs dump:", command
        else:
            log.info("running with no output: " + " ".join(command))
            success = RunSimpleCommand.run_with_no_output(
                command, shell=False, timeout=self.get_lock_timeout_interval(),
                timeout_callback=self.periodic_callback)
            if not success:
                log.warning("error producing revision text files for wiki %s",
                            self.wiki.db_name)
                return False
        return True
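# A small sketch, not part of the original module: dump_revs() assumes the stub
# file written by dump_stub() is already in place, so a caller might sanity-check
# it first. The path in the usage comment is hypothetical.
import os

def stub_looks_usable(stub_path):
    '''return True if the gzip-compressed stub file exists and is non-empty'''
    return os.path.isfile(stub_path) and os.path.getsize(stub_path) > 0

# e.g. stub_looks_usable("/dumps/20160826/enwiki-20160826-stubs.gz")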
    def do_one_wiki(self):
        """
        run dump of specified type for one wiki, for given date
        unless it is among the wikis we skip, has already been run
        for the date, or some other process has the lock and is
        therefore presumably already dumping it
        """
        if not skip_wiki(self.wiki.db_name, self.wiki.config):

            dumpdir = MiscDumpDir(self.args["config"], self.args["date"])
            if not exists(dumpdir.get_dumpdir(self.wiki.db_name)):
                os.makedirs(dumpdir.get_dumpdir(self.wiki.db_name))

            status_info = StatusInfo(self.args["config"], self.wiki.date, self.wiki.db_name)
            status = status_info.get_status()
            if status == "done:all" and not self.flags["forcerun"]:
                log.info("wiki %s skipped, adds/changes dump already" " complete", self.wiki.db_name)
                return STATUS_GOOD

            if not self.flags["dryrun"]:
                lock = MiscDumpLock(self.args["config"], self.wiki.date, self.wiki.db_name)

                # if lock is stale, remove it
                lock.remove_if_stale(self.wiki.config.lock_stale)

                # try to get the lock ourselves
                if not lock.get_lock():
                    log.info("wiki %s skipped, wiki is locked,"
                             " another process should be doing the job",
                             self.wiki.db_name)
                    return STATUS_TODO

                self.dumper.set_lockinfo(lock)
                dumps_dirs = MiscDumpDirs(self.wiki.config, self.wiki.db_name)
                dumps_dirs.cleanup_old_dumps(self.wiki.date)

            log.info("Doing run for wiki: %s", self.wiki.db_name)

            try:
                result = self.dumper.run()
                if not result:
                    return STATUS_FAILED

                if not self.flags["dryrun"]:
                    output_files, expected = self.dumper.get_output_files()
                    if not md5sums(self.wiki, self.wiki.config.fileperms, output_files, expected):
                        return STATUS_FAILED
                    status_info.set_status("done:" + self.dumper.get_steps_done())
                    lock.unlock_if_owner()

                if self.flags["do_index"]:
                    index = Index(self.args)
                    index.do_all_wikis()
            except Exception as ex:
                log.warning("error from dump run" " for wiki %s", self.wiki.db_name, exc_info=ex)
                if not self.flags["dryrun"]:
                    lock.unlock_if_owner()
                return STATUS_FAILED
        log.info("Success!  Wiki %s %s dump complete.", self.wiki.db_name, self.args["dumptype"])
        return STATUS_GOOD
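# An illustrative sketch only, assuming MiscDumpLock is backed by a lock file
# whose age determines staleness; the real class may track ownership and pids
# differently.
import os
import time

def remove_if_stale_sketch(lock_path, max_age_secs):
    '''delete the lock file if its mtime is older than max_age_secs'''
    try:
        if time.time() - os.path.getmtime(lock_path) > max_age_secs:
            os.unlink(lock_path)
    except OSError:
        pass  # lock file absent, or already cleaned up by another process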
    def __init__(self, args):
        """
        pass a dict of the standard args
        (config, date, dumptype, args)
        """
        self.args = args
        self.indexfile = IndexFile(self.args["config"])
        self.dumpdir = MiscDumpDir(self.args["config"])
    def get_output_files(self):
        dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date)
        outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date)
        htmlfile = HTMLFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        filenames = [htmlfile.get_filename(self.args['ns'])]
        return [os.path.join(outputdir, filename) for filename in filenames]
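# The paths returned by get_output_files() above are what do_one_wiki() later
# hands to md5sums(). A minimal sketch, assuming the checksum sidecar holds one
# md5sum(1)-style "<digest>  <filename>" line per output file; the real helper
# may differ.
import hashlib
import os

def md5_line(path):
    '''return an md5sum-style line for one output file'''
    digest = hashlib.md5()
    with open(path, "rb") as fileobj:
        for chunk in iter(lambda: fileobj.read(1 << 20), b""):
            digest.update(chunk)
    return "%s  %s" % (digest.hexdigest(), os.path.basename(path))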
class Index():
    '''
    generate index.html page containing information for the dump
    run of the specified date for all wikis
    '''
    def __init__(self, dryrun, args, log):
        '''
        pass a dict of the standard args
        (config, date, dumptype, args)
        '''
        self.args = args
        self.dryrun = dryrun
        self.log = log
        self.indexfile = IndexFile(self.args['config'])
        self.dumpdir = MiscDumpDir(self.args['config'])

    def do_all_wikis(self):
        '''
        generate index.html file for all wikis for the given date.
        This should only be done if the run is the most recent, otherwise
        your nice fresh index.html file will be replaced with old data.
        '''
        text = ""
        for wikiname in self.args['config'].all_wikis_list:
            result = self.do_one_wiki(wikiname)
            if result:
                self.log.info("result for wiki %s is %s", wikiname, result)
                text = text + "<li>" + result + "</li>\n"
        index_text = (self.args['config'].read_template(self.args['config'].indextmpl)
                      % {"items": text})
        if self.dryrun:
            print("would write {path} with index text".format(path=self.indexfile.get_path()))
        else:
            FileUtils.write_file_in_place(self.indexfile.get_path(),
                                          index_text, self.args['config'].fileperms)

    def get_outputfile_indextxt(self, filenames_tocheck, expected, wikiname, dump_date):
        '''
        generate and return a list of text strings that provide a
        link to the given files, along with filename, size and date.
        if the file does not exist, it will be silently excluded from
        the list.
        the expected list is a list of filenames that are expected to
        be produced by the dump; currently no errors are generated
        on this basis but this may change in the future.
        '''
        dirinfo = MiscDumpDir(self.args['config'], dump_date)
        path = dirinfo.get_dumpdir(wikiname)
        output_fileinfo = {}
        for filename in filenames_tocheck:
            output_fileinfo[filename] = FileUtils.file_info(os.path.join(path, filename))
        files_text = []
        filenames = sorted(output_fileinfo.keys())
        for filename in filenames:
            file_date, file_size = output_fileinfo[filename]
            self.log.info("output file %s for %s %s %s",
                          filename, wikiname, safe(file_date), safe(file_size))
            if filename in expected and file_date is None:
                # may do more with this sort of error in the future
                # for now, just get stats on the other files
                continue
            if file_date:
                files_text.append(
                    "%s: %s (size %s)<br />"
                    % (make_link(
                        os.path.join(
                            wikiname, dump_date,
                            filename),
                        os.path.basename(filename)), file_date, file_size))
        return files_text

    def get_stat_text(self, dump_date, wikiname):
        '''
        generate and return the text string describing
        the status of the dump of the wiki for the given date
        '''
        stat = StatusFile(self.args['config'], dump_date, wikiname)
        stat_contents = FileUtils.read_file(stat.get_path())
        self.log.info("status for %s %s", wikiname, safe(stat_contents))
        if stat_contents:
            stat_text = "(%s)" % (stat_contents)
        else:
            stat_text = None
        return stat_text

    def get_files_text(self, wiki):
        '''
        given wiki object, return the list of links and descriptions
        for the output files for that wiki of the current dump type
        and date
        '''
        dump_class = MiscDumpFactory.get_dumper(self.args['dumptype'])
        dumper = dump_class(wiki, self.log, False, self.args['args'])
        output_files, expected = dumper.get_output_files()
        files_text = self.get_outputfile_indextxt(output_files, expected,
                                                  wiki.db_name, wiki.date)

        md5file = MD5File(wiki.config, wiki.date, wiki.db_name)
        md5file_text = self.get_outputfile_indextxt(
            [md5file.get_filename()], [], wiki.db_name, wiki.date)
        files_text.extend(md5file_text)
        return files_text

    def do_one_wiki(self, wikiname, date=None):
        '''
        collect the text strings for one wiki to be inserted into
        the index.html file
        '''
        if not skip_wiki(wikiname, self.args['config']):
            dumps_dirs = MiscDumpDirs(self.args['config'], wikiname, self.log)
            if not exists(self.dumpdir.get_dumpdir_no_date(wikiname)):
                self.log.info("No dump for wiki %s", wikiname)
                return None
            if date is not None:
                dump_date = date
            else:
                dump_date = dumps_dirs.get_latest_dump_date(True)
            if not dump_date:
                self.log.info("No dump for wiki %s", wikiname)
                return None

            other_runs_text = "other runs: %s<br />" % make_link(wikiname, wikiname)

            try:
                wiki = Wiki(self.args['config'], wikiname)
                wiki.set_date(dump_date)
                files_text = self.get_files_text(wiki)
                stat_text = self.get_stat_text(dump_date, wikiname)

            except Exception as ex:
                self.log.warning("Error encountered, no information available"
                                 " for wiki %s", wikiname, exc_info=ex)
                return ("<strong>%s</strong> Error encountered,"
                        " no information available | %s" % (wikiname, other_runs_text))

            try:
                wikiname_text = "<strong>%s</strong>" % wikiname

                wiki_info = (" ".join([entry for entry in [wikiname_text, stat_text]
                                       if entry is not None]) + "<br />")
                wiki_info = (wiki_info + "&nbsp;&nbsp;" + "\n&nbsp;&nbsp;".join(files_text))
                wiki_info = wiki_info + "\n&nbsp;" + other_runs_text
            except Exception as ex:
                self.log.warning("Error encountered formatting information"
                                 " for wiki %s", wikiname, exc_info=ex)
                return ("Error encountered formatting information"
                        " for wiki %s" % wikiname)

            return wiki_info
        return None
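# Hypothetical stand-ins for two helpers used by get_outputfile_indextxt() and
# get_stat_text() above; the real make_link() and safe() are defined elsewhere
# in the dumps codebase and may differ.
def make_link_sketch(path, link_text):
    '''return an HTML anchor for a path relative to the index page'''
    return '<a href="%s">%s</a>' % (path, link_text)

def safe_sketch(value):
    '''render None as printable text so log formatting never fails'''
    return value if value is not None else "None"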