Esempio n. 1
0
 def dump_revs(self):
     '''
     dump revision content corresponding to previously-dumped
     stubs (revision metadata)

     builds a dumpTextPass.php command that reads the gzipped stub
     file and writes bzip2-compressed output, both located in the
     dump directory for this wiki and date; in dryrun mode the
     command is only printed, otherwise it is run with the lock
     timeout interval as timeout

     returns True if the 'revs' step is not enabled, on dry run,
     or if the command succeeded; False if the command failed
     '''
     if not self.steps['revs']['run']:
         return True

     dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date)
     outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date)
     revsfile = RevsFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
     outputfile = revsfile.get_filename()
     script_command = MultiVersion.mw_script_as_array(self.wiki.config,
                                                      "dumpTextPass.php")
     command = [self.wiki.config.php]
     command.extend(script_command)
     stubfile = StubFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
     stuboutputfile = stubfile.get_filename()
     command.extend(["--wiki=%s" % self.wiki.db_name,
                     "--stub=gzip:%s" % os.path.join(outputdir, stuboutputfile),
                     "--quiet",
                     "--spawn=%s" % self.wiki.config.php,
                     "--output=bzip2:%s" % os.path.join(outputdir, outputfile)])

     if self.dryrun:
         # single-argument form: valid as a python 2 print statement and
         # as a python 3 function call, and emits the same text as
         # 'print "...:", command' did
         print("would run command for revs dump: %s" % (command,))
         return True

     log.info("running with no output: %s", " ".join(command))
     success = RunSimpleCommand.run_with_no_output(
         command, shell=False, timeout=self.get_lock_timeout_interval(),
         timeout_callback=self.periodic_callback)
     if not success:
         log.warning("error producing revision text files"
                     " for wiki %s", self.wiki.db_name)
         return False
     return True
 def get_outputfile_indextxt(self, filenames_tocheck, expected, wikiname, dump_date):
     """
     build the html snippets for the index page that link to the
     output files of one wiki for one dump date.

     filenames_tocheck: names of files to look for in the dump
                        directory of the wiki
     expected: names of files the dump is supposed to produce; a
               missing expected file is currently just skipped
     wikiname: name of the wiki db
     dump_date: date of the dump run

     returns a list of strings, one per file for which a date was
     found, each with a link, the file date and the file size;
     entries are ordered by filename
     """
     dump_path = MiscDumpDir(self.args["config"], dump_date).get_dumpdir(wikiname)

     # collect (date, size) for every file first, then report on them
     info_by_name = {
         name: FileUtils.file_info(os.path.join(dump_path, name))
         for name in filenames_tocheck
     }

     entries = []
     for name in sorted(info_by_name):
         mtime, size = info_by_name[name]
         log.info("output file %s for %s %s %s", name, wikiname, safe(mtime), safe(size))

         # may do more with missing expected files in the future;
         # for now, just get stats on the other files
         missing_expected = name in expected and mtime is None
         if missing_expected or not mtime:
             continue

         # FIXME check that this link is correct
         link = make_link(os.path.join(wikiname, dump_date, name), os.path.basename(name))
         entries.append("%s: %s (size %s)<br />" % (link, mtime, size))
     return entries
Esempio n. 3
0
    def dump_stub(self, start_revid, end_revid):
        '''
        dump stubs (metadata) for revs from start_revid
        up to but not including end_revid

        builds a dumpBackup.php command that writes gzipped stubs
        for the given revision range into the dump directory for
        this wiki and date; in dryrun mode the command is only
        printed, otherwise it is run with the lock timeout interval
        as timeout

        returns True if the 'stubs' step is not enabled, on dry run,
        or if the command succeeded; False if the command failed
        '''
        if not self.steps['stubs']['run']:
            return True

        dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date)
        outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date)
        stubfile = StubFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        outputfile = stubfile.get_filename()
        script_command = MultiVersion.mw_script_as_array(self.wiki.config,
                                                         "dumpBackup.php")
        command = [self.wiki.config.php]
        command.extend(script_command)
        command.extend(["--wiki=%s" % self.wiki.db_name, "--stub", "--quiet",
                        "--output=gzip:%s" % os.path.join(outputdir, outputfile),
                        "--revrange", "--revstart=%s" % start_revid,
                        "--revend=%s" % end_revid])

        if self.dryrun:
            # single-argument form: valid as a python 2 print statement and
            # as a python 3 function call, and emits the same text as
            # 'print "...:", command' did
            print("would run command for stubs dump: %s" % (command,))
            return True

        log.info("running with no output: %s", " ".join(command))
        success = RunSimpleCommand.run_with_no_output(
            command, shell=False, timeout=self.get_lock_timeout_interval(),
            timeout_callback=self.periodic_callback)
        if not success:
            log.warning("error producing stub files for wiki %s", self.wiki.db_name)
            return False
        return True
Esempio n. 4
0
 def read_max_revid_from_file(self, date=None):
     '''
     read and return max rev id for wiki from file

     date: run date whose maxrevid file should be read; if None,
           the date of the current run (self.wiki.date) is used

     returns the contents of the file with trailing whitespace
     removed, or None if the path could not be determined or the
     file could not be read; such errors are logged, not raised
     '''
     if date is None:
         date = self.wiki.date

     # bound before the try block so that the handler can always log
     # it, even when determining the path is the step that failed
     path = None
     try:
         path = MaxRevIDFile(self.wiki.config, date, self.wiki.db_name).get_path()
         contents = FileUtils.read_file(path)
         # strip the contents that were read, not the path
         if contents is not None:
             contents = contents.rstrip()
         return contents
     except Exception as ex:
         log.info("Error encountered reading maxrevid from %s ", path,
                  exc_info=ex)
         return None
Esempio n. 5
0
 def run(self):
     '''
     dump html from RESTBase of revision content, for given wiki and date

     returns True if the html dump step reported success, False if
     it reported failure or raised an exception (which is logged)
     '''
     try:
         log.info("dumping html for wiki %s", self.wiki.db_name)
         succeeded = bool(self.dump_html())
     except Exception as ex:
         log.warning("Error encountered runing dump for %s ", self.wiki.db_name,
                     exc_info=ex)
         succeeded = False
     return succeeded
 def get_stat_text(self, dump_date, wikiname):
     """
     read the status file of the wiki for the given dump date and
     return its contents wrapped in parentheses, or None when the
     status file yields nothing
     """
     status_path = StatusFile(self.args["config"], dump_date, wikiname).get_path()
     status = FileUtils.read_file(status_path)
     log.info("status for %s %s", wikiname, safe(status))
     return "(%s)" % (status) if status else None
 def do_all_wikis(self):
     """
     generate index.html file for all wikis for the given date.
     FIXME maybe this should be for the latest run date? Hrm.

     every wiki in the configured list that yields some text gets
     one <li> entry; the entries are substituted into the index
     template as "items" and the result is written to the index file
     """
     config = self.args["config"]

     list_items = []
     for wikiname in config.all_wikis_list:
         entry = self.do_one_wiki(wikiname)
         if not entry:
             continue
         log.info("result for wiki %s is %s", wikiname, entry)
         list_items.append("<li>" + entry + "</li>\n")

     page = config.read_template(config.indextmpl) % {"items": "".join(list_items)}
     FileUtils.write_file_in_place(self.indexfile.get_path(), page, config.fileperms)
Esempio n. 8
0
    def get_prev_revid(self, max_revid):
        '''
        work out the rev id at which this incremental should start

        if there is a previous run date, the max rev id recorded for
        that run is used; when its maxrevid file cannot be read, the
        value is recorded again via the id reader and taken from it.
        if there is no previous run, max_revid - 10 is used, but
        never less than 1.

        returns the chosen rev id plus one, as a string, or None if
        no rev id could be determined
        '''
        last_run_date = self.get_prev_incrdate(self.wiki.date)
        log.info("prev_date is %s", safe(last_run_date))

        if not last_run_date:
            log.info("Wiki %s no previous runs, using %s - 10 ",
                     self.wiki.db_name, max_revid)
            start_id = str(max(int(max_revid) - 10, 1))
        else:
            cutoff = cutoff_from_date(last_run_date, self.wiki.config)
            reader = MaxRevID(self.wiki, cutoff, self.dryrun)
            start_id = reader.read_max_revid_from_file(last_run_date)
            if start_id is None:
                # no usable file for the previous run, so populate it now
                log.info("Wiki %s retrieving prevRevId from db.",
                         self.wiki.db_name)
                reader.record_max_revid()
                start_id = reader.max_id

        # this incr will cover every revision from the last
        # incremental through the maxid we wrote out already.
        if start_id is not None:
            start_id = str(int(start_id) + 1)
        log.info("prev_revid is %s", safe(start_id))
        return start_id
Esempio n. 9
0
    def run(self):
        '''
        run all steps of the incremental dump for given wiki and date:
        max rev id, previous max rev id, stubs for the revisions in
        between, and revision content for those stubs

        the steps run in that order and the first one that yields a
        false value ends the run; returns True only if every step
        succeeded, False otherwise, including when an exception was
        raised (it is logged)
        '''
        wiki_name = self.wiki.db_name if False else None  # placeholder, replaced below
        try:
            log.info("retrieving max rev id for wiki %s", self.wiki.db_name)
            end_id = self.dump_max_revid()
            if not end_id:
                return False

            log.info("retrieving prev max rev id for wiki %s", self.wiki.db_name)
            start_id = self.get_prev_revid(end_id)
            if not start_id:
                return False

            log.info("producing stub file for wiki %s", self.wiki.db_name)
            stubs_ok = self.dump_stub(start_id, end_id)
            if not stubs_ok:
                return False

            log.info("producing content file for wiki %s", self.wiki.db_name)
            return bool(self.dump_revs())
        except Exception as ex:
            log.warning("Error encountered runing dump for %s ", self.wiki.db_name,
                        exc_info=ex)
            return False
    def do_one_wiki(self):
        """
        run dump of specified type for one wiki, for given date
        unless it is among the wikis we skip, has already been run
        for the date, or some other process has the lock and is
        therefore presumably already dumping it

        returns:
          STATUS_GOOD   if the run completed, if the status of the
                        wiki is already "done:all" and forcerun is not
                        set, or if the wiki is among those skipped
          STATUS_TODO   if the lock for the wiki could not be obtained
          STATUS_FAILED if the dumper reported failure, the md5sums
                        step failed, or an exception was raised

        in dryrun mode no lock is taken, old dumps are not cleaned up,
        and neither md5sums nor the status are written
        """
        # NOTE(review): when skip_wiki() is true the whole body below is
        # bypassed, yet the "Success!" message at the end is still logged
        # and STATUS_GOOD returned -- confirm that this is intended
        if not skip_wiki(self.wiki.db_name, self.wiki.config):

            # make sure the output directory for this wiki exists
            dumpdir = MiscDumpDir(self.args["config"], self.args["date"])
            if not exists(dumpdir.get_dumpdir(self.wiki.db_name)):
                os.makedirs(dumpdir.get_dumpdir(self.wiki.db_name))

            status_info = StatusInfo(self.args["config"], self.wiki.date, self.wiki.db_name)
            status = status_info.get_status()
            if status == "done:all" and not self.flags["forcerun"]:
                log.info("wiki %s skipped, adds/changes dump already" " complete", self.wiki.db_name)
                return STATUS_GOOD

            # 'lock' is only bound when this is not a dry run; every later
            # use of it is guarded by the same dryrun flag
            if not self.flags["dryrun"]:
                lock = MiscDumpLock(self.args["config"], self.wiki.date, self.wiki.db_name)

                # if lock is stale, remove it
                lock.remove_if_stale(self.wiki.config.lock_stale)

                # try to get the lock ourselves
                if not lock.get_lock():
                    log.info(
                        "wiki %s skipped, wiki is locked," " another process should be doing the job", self.wiki.db_name
                    )
                    return STATUS_TODO

                self.dumper.set_lockinfo(lock)
                dumps_dirs = MiscDumpDirs(self.wiki.config, self.wiki.db_name)
                dumps_dirs.cleanup_old_dumps(self.wiki.date)

            log.info("Doing run for wiki: %s", self.wiki.db_name)

            try:
                result = self.dumper.run()
                # NOTE(review): this return and the md5sums failure return
                # below leave the method without calling
                # lock.unlock_if_owner() -- presumably the lock is left to
                # go stale; confirm that this is intended
                if not result:
                    return STATUS_FAILED

                if not self.flags["dryrun"]:
                    output_files, expected = self.dumper.get_output_files()
                    if not md5sums(self.wiki, self.wiki.config.fileperms, output_files, expected):
                        return STATUS_FAILED
                    # record the completed steps, then release the lock
                    status_info.set_status("done:" + self.dumper.get_steps_done())
                    lock.unlock_if_owner()

                if self.flags["do_index"]:
                    index = Index(self.args)
                    index.do_all_wikis()
            except Exception as ex:
                log.warning("error from dump run" " for wiki %s", self.wiki.db_name, exc_info=ex)
                # release the lock on the exception path too
                if not self.flags["dryrun"]:
                    lock.unlock_if_owner()
                return STATUS_FAILED
        log.info("Success!  Wiki %s %s dump complete.", self.wiki.db_name, self.args["dumptype"])
        return STATUS_GOOD
Esempio n. 11
0
 def dump_aliases(self):
     '''
     write the aliases file for the wiki and date of this run

     returns True on success, or right away when the 'aliases'
     step is not enabled
     False or exception on error are fine; here any error is
     logged and then re-raised for the caller to deal with
     '''
     if not self.steps['aliases']['run']:
         return True
     try:
         contents = "for wiki %s: alias meow=more\n" % self.wiki.db_name
         aliasesfile = AliasesFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
         FileUtils.write_file_in_place(aliasesfile.get_path(),
                                       contents, self.wiki.config.fileperms)
         return True
     except Exception as ex:
         # this is the aliases step, so say so in the log
         log.info("Error encountered dumping aliases for %s ", self.wiki.db_name,
                  exc_info=ex)
         raise
    def do_one_wiki(self, wikiname, date=None):
        """
        produce the html text describing one wiki, to be inserted
        into the index.html file

        wikiname: name of the wiki db
        date: dump date to report on; if None, the latest dump date
              of the wiki is used

        returns None if the wiki is skipped or has no dump;
        otherwise a string with the wiki name, its status, links to
        its output files and a link to its other runs, or a short
        error text if that information could not be collected or
        formatted
        """
        config = self.args["config"]
        if skip_wiki(wikiname, config):
            return None

        dumps_dirs = MiscDumpDirs(config, wikiname)
        if not exists(self.dumpdir.get_dumpdir_no_date(wikiname)):
            log.info("No dump for wiki %s", wikiname)
            return None

        dump_date = date if date is not None else dumps_dirs.get_latest_dump_date(True)
        if not dump_date:
            log.info("No dump for wiki %s", wikiname)
            return None

        other_runs_text = "other runs: %s<br />" % make_link(wikiname, wikiname)

        # step 1: collect the file and status information
        try:
            wiki = Wiki(config, wikiname)
            wiki.set_date(dump_date)
            files_text = self.get_files_text(wiki)
            stat_text = self.get_stat_text(dump_date, wikiname)
        except Exception as ex:
            log.warning("Error encountered, no information available for wiki %s", wikiname, exc_info=ex)
            return "<strong>%s</strong> Error encountered, no information available | %s" % (
                wikiname,
                other_runs_text,
            )

        # step 2: turn it into html
        try:
            heading_parts = ["<strong>%s</strong>" % wikiname, stat_text]
            heading = " ".join([part for part in heading_parts if part is not None]) + "<br />"
            file_lines = "&nbsp;&nbsp;" + "\n&nbsp;&nbsp;".join(files_text)
            wiki_info = heading + file_lines + "\n&nbsp;" + other_runs_text
        except Exception as ex:
            log.warning("Error encountered formatting information for wiki %s", wikiname, exc_info=ex)
            return "Error encountered formatting information for wiki %s" % wikiname

        return wiki_info
Esempio n. 13
0
 def get_domain_from_wikidbname(self):
     '''
     given the name of the wiki db, turn this into the
     fqdn of the wiki project (i.e. enwiki -> en.wikipedia.org)

     this is done by piping 'echo $wgCanonicalServer;' to eval.php
     for the wiki, via the shell, and taking what follows the '//'
     in the output

     returns the domain, or None (after logging a warning) if the
     command produced no output or the output contains no '//'
     '''
     script_command = MultiVersion.mw_script_as_array(self.wiki.config,
                                                      "eval.php")
     # echo $wgCanonicalServer | php "$multiversionscript" eval.php $wiki
     command = ["echo", "'echo $wgCanonicalServer;'", "|", self.wiki.config.php]
     command.extend(script_command)
     command.append(self.wiki.db_name)
     # the pipe is interpreted by the shell, so the command must be
     # passed along as one string
     command_text = " ".join(command)
     log.info("running with no output: " + command_text)
     output = RunSimpleCommand.run_with_output(command_text, shell=True)
     if not output:
         log.warning("error retrieving domain for wiki %s", self.wiki.db_name)
         return None

     pieces = output.split('//')
     if len(pieces) < 2:
         # no scheme separator in the output, so there is no domain to
         # extract; report this like any other failure instead of
         # raising IndexError
         log.warning("error retrieving domain for wiki %s", self.wiki.db_name)
         return None
     # rstrip gets rid of any trailing newlines from eval.php
     return pieces[1].rstrip()
Esempio n. 14
0
 def run(self):
     '''
     dump namespaces, then namespace aliases, for given wiki and date

     the aliases step only runs if the namespaces step succeeded;
     returns True if both steps succeeded, False otherwise,
     including when an exception was raised (it is logged)
     '''
     succeeded = False
     try:
         log.info("dumping namespaces for wiki %s", self.wiki.db_name)
         if self.dump_namespaces():
             log.info("dumping aliases for wiki %s", self.wiki.db_name)
             succeeded = bool(self.dump_aliases())
     except Exception as ex:
         log.info("Error encountered runing dump for %s ", self.wiki.db_name,
                  exc_info=ex)
         succeeded = False
     return succeeded
Esempio n. 15
0
    def dump_max_revid(self):
        '''
        dump maximum rev id from wiki that's older than
        the configured number of seconds (cutoff)

        we have this cutoff so that content really new
        is not dumped; we want to give curators the chance to
        remove problematic entries first.

        a cutoff of some hours is reasonable.

        if the maxrevid file for this wiki and date does not exist
        yet, the id is retrieved from the db and written to that
        file; the file is then read back.

        returns the rev id from the file plus one, as a string (the
        end rev id is not included in the dump), or None if the file
        could not be read
        '''
        revidfile = MaxRevIDFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        # one path for the existence check, the write and the read
        path = revidfile.get_path()

        if not exists(path):
            log.info("Wiki %s retrieving max revid from db.",
                     self.wiki.db_name)
            query = ("select rev_id from revision where rev_timestamp < \"%s\" "
                     "order by rev_timestamp desc limit 1" % self.cutoff)
            db_info = DbServerInfo(self.wiki, self.wiki.db_name)
            results = db_info.run_sql_and_get_output(query)
            if results:
                lines = results.splitlines()
                # the id is taken from the second line of the output
                # (presumably the first is the column header); output
                # with fewer lines means no id was found, which must not
                # raise IndexError
                if len(lines) > 1 and lines[1].isdigit():
                    FileUtils.write_file_in_place(path, lines[1],
                                                  self.wiki.config.fileperms)

        try:
            max_revid = FileUtils.read_file(path)
        except Exception as ex:
            # 'path' is always bound here, so logging cannot itself fail
            log.info("Error encountered reading maxrevid from %s ", path,
                     exc_info=ex)
            max_revid = None

        # end rev id is not included in dump
        if max_revid is not None:
            max_revid = str(int(max_revid) + 1)

        log.info("max_revid is %s", safe(max_revid))
        return max_revid