Esempio n. 1
0
 def report_statusline(wiki, status, error=False):
     """
     given a wiki object and its status text (done, in progress, etc),
     produce and return one html list item describing the status of
     the wiki

     args:
         wiki   -- object providing db_name, date, is_private() and
                   is_closed()
         status -- text placed after the wiki name/link
         error  -- if True, the timestamp is wrapped in a span that
                   hides it

     returns: a string of the form "<li>stamp link: status</li>\n"
     """
     stamp = TimeUtils.pretty_time()
     if error:
         # No state information, hide the timestamp.
         # "visible: none" is not a CSS declaration, so browsers ignored it
         # and displayed the time anyway; "visibility: hidden" hides the text
         # while keeping the row laid out like its neighbours.
         stamp = "<span style=\"visibility: hidden\">" + stamp + "</span>"
     if wiki.is_private():
         # private wikis are named but never linked
         link = "%s (private data)" % wiki.db_name
     else:
         if wiki.date:
             link = "<a href=\"%s/%s\">%s</a>" % (wiki.db_name, wiki.date, wiki.db_name)
         else:
             # no run date set, so there is no directory to link to
             link = "%s (new)" % wiki.db_name
         if wiki.is_closed():
             link = link + " (closed)"
     return "<li>%s %s: %s</li>\n" % (stamp, link, status)
Esempio n. 2
0
    def report_previous_dump_link(self, done):
        """
        Return an html snippet pointing at the dump run that precedes this one.

        The run is located by position in the full, ordered list of dumps for
        this wiki rather than by "most recent": we may be rerunning a job
        from an older dump, or two runs may be going at once (one finishing
        up history while another starts over on abstracts and stubs).

        args:
            done -- true if this run is complete; selects the link wording

        returns: the link, or a fixed "No prior dumps" message when this run
        is the first in the list or the lookup fails for any reason
        """
        no_previous = "No prior dumps of this database stored."
        try:
            all_dates = self.wiki.latest_dump(return_all=True)
            position = all_dates.index(self.wiki.date)
            if position == 0:
                # first entry in the list: not an error, just nothing earlier.
                # (and we must not wrap around to the newest dump via index -1)
                return no_previous
            if position < 0:
                raise ValueError
            raw_date = all_dates[position - 1]
        except Exception:
            if self.verbose:
                sys.stderr.write(repr(traceback.format_exception(*sys.exc_info())))
            return no_previous

        pretty_date = TimeUtils.pretty_date(raw_date)
        if done:
            prefix, message = "", "Last dumped on"
        else:
            prefix, message = "This dump is in progress; see also the ", "previous dump from"
        return "%s<a href=\"../%s/\">%s %s</a>" % (prefix, raw_date, message, pretty_date)
Esempio n. 3
0
 def get_date(self, date):
     """
     Resolve a requested run date.

     'last' is replaced by the greatest entry of self.wiki.dump_dirs()
     (sorted order), if there is one.  If that leaves no date, or None
     was passed in, TimeUtils.today() is returned.  Any other value is
     returned as is.
     """
     if date == 'last':
         existing = sorted(self.wiki.dump_dirs())
         date = existing[-1] if existing else None
     return TimeUtils.today() if date is None else date
Esempio n. 4
0
 def report_failure(self):
     """
     Mail a failure notice for this wiki's dump run.

     Nothing is sent unless self.email is true and the configured
     admin_mail is set to something other than 'nomail' (compared
     case-insensitively).  The body comes from the "errormail.txt"
     template, filled in with the db name, run date, current time and
     the url of the run directory under web_root.
     """
     if not self.email:
         return
     admin_mail = self.wiki.config.admin_mail
     if not admin_mail or admin_mail.lower() == 'nomail':
         return

     subject = "Dump failure for " + self.wiki.db_name
     template = self.wiki.config.read_template("errormail.txt")
     fields = {
         "db": self.wiki.db_name,
         "date": self.wiki.date,
         "time": TimeUtils.pretty_time(),
         # the trailing '' gives the url a closing slash
         "url": "/".join((self.wiki.config.web_root, self.wiki.db_name,
                          self.wiki.date, ''))}
     self.mail(subject, template % fields)
Esempio n. 5
0
    def db_info_by_age(self, use_status_time=False):
        """
        Return one tuple per wiki in self.db_list, sorted ascending:

          (dump_failed, date, age, dbname)

          * dump_failed: True if the status file of the latest dump is
            empty or unreadable, if it contains 'dump aborted', or if the
            wiki has no dump at all
          * date: today's date minus the latest dump directory name, both
            taken as integers; or simply today's date for every wiki if
            use_status_time is True, so that only the status file age
            decides the order
          * age: FileUtils.file_age() of the status file; it breaks ties
            between wikis dumped on the same day
          * dbname: the wiki name

        Because False < True, wikis whose latest dump succeeded come first,
        most recent first; then wikis whose latest dump failed, most recent
        first; wikis never dumped keep sys.maxsize for date and age and so
        come last.  The final item is therefore, when applicable, the
        oldest failed dump attempt.

        If reading a status file fails, a message is printed and the wiki
        counts as failed; age (and date, if the directory name was not a
        number) stays at sys.maxsize.

        Ordering is by the date of the dump directory, not by when files in
        it were last touched.  This allows rerunning jobs to completion from
        older runs, for example a history run that failed in the middle,
        without disturbing the index page links.
        """
        today = int(TimeUtils.today())
        entries = []
        for dbname in self.db_list:
            wiki = Wiki(self, dbname)

            # values for "never dumped" and for "status could not be read"
            date = age = sys.maxsize
            status = ''

            last = wiki.latest_dump()
            if last:
                dump_status = StatusHtml.get_statusfile_path(wiki, last)
                try:
                    # with use_status_time the directory date is ignored
                    date = today if use_status_time else today - int(last)
                    age = FileUtils.file_age(dump_status)
                    status = FileUtils.read_file(dump_status)
                except Exception:
                    print("dump dir missing status file %s?" % dump_status)

            dump_failed = status == '' or 'dump aborted' in status
            entries.append((dump_failed, date, age, dbname))
        entries.sort()
        return entries
Esempio n. 6
0
 def report_statusline(wiki, status, error=False):
     """
     given a wiki object and the status (done, in progress, etc), produce
     and return a line of html describing the status of the wiki,
     with a link to the wiki dump directory for the dump run date
     if appropriate

     args:
         wiki   -- object providing db_name, date, is_private() and
                   is_closed()
         status -- text placed after the wiki name/link
         error  -- if True, the timestamp is wrapped in a span that
                   hides it

     returns: a string of the form "<li>stamp link: status</li>\n"
     """
     stamp = TimeUtils.pretty_time()
     if error:
         # No state information, hide the timestamp.
         # "visible: none" is not a CSS declaration, so browsers ignored it
         # and displayed the time anyway; "visibility: hidden" hides the text
         # while keeping the row laid out like its neighbours.
         stamp = "<span style=\"visibility: hidden\">" + stamp + "</span>"
     if wiki.is_private():
         # private wikis are named but never linked
         link = "%s (private data)" % wiki.db_name
     else:
         if wiki.date:
             link = "<a href=\"%s/%s\">%s</a>" % (wiki.db_name, wiki.date, wiki.db_name)
         else:
             # no run date set, so there is no directory to link to
             link = "%s (new)" % wiki.db_name
         if wiki.is_closed():
             link = link + " (closed)"
     return "<li>%s %s: %s</li>\n" % (stamp, link, status)
Esempio n. 7
0
 def do_all_wikis_til_done(self, num_fails, overwrite, date):
     """
     Keep calling do_all_wikis() until self.wikis_todo is empty.

     After each pass that leaves wikis still to do, a failure is
     counted; once the count exceeds num_fails a BackupError is raised,
     otherwise we sleep five minutes and try again.  A false date is
     replaced by TimeUtils.today().
     """
     run_date = date if date else TimeUtils.today()
     failures = 0
     while True:
         self.do_all_wikis(overwrite, run_date)
         if len(self.wikis_todo) == 0:
             return
         failures += 1
         if failures > num_fails:
             raise BackupError("Too many failures, giving up")
         # wait 5 minutes and try another loop
         time.sleep(300)
Esempio n. 8
0
def do_main():
    '''
    main entry point: read and validate the command line arguments,
    set up a script or query runner, then run it either on the one
    wiki named or on all wikis, retrying the latter on failure
    '''

    (configfile, date, dryrun, filenameformat,
     output_dir, overwrite, wikiname, script,
     basename, query, retries, verbose, remainder) = get_args()

    validate_args(date, output_dir, retries, script, query)

    # three retries unless told otherwise
    retries = int("3" if retries is None else retries)

    config = Config(configfile) if configfile else Config()

    if date is None:
        date = TimeUtils.today()

    if script is None:
        # no script given: run a query, read from the configured
        # query file if none was supplied
        if query is None:
            query = FileUtils.read_file(config.queryfile)
        runner = QueryRunner(query, dryrun, verbose)
    else:
        runner = ScriptRunner(script, remainder, dryrun, verbose)

    base = None
    if basename is not None:
        base = Wiki(config, basename)
        base.set_date(date)
        base.config.parse_conffile_per_project(base.db_name)

    if wikiname is None:
        loop_runner = WikiRunnerLoop(config, runner, filenameformat,
                                     output_dir, base)
        loop_runner.do_all_wikis_til_done(retries, overwrite, date)
        return

    wiki = Wiki(config, wikiname)
    wiki.set_date(date)
    single_runner = WikiRunner(runner, wiki, filenameformat,
                               output_dir, base)
    single_runner.do_one_wiki(overwrite)
Esempio n. 9
0
 def set_status(self, status, set_updated=True):
     """
     Record status in self.runinfo; unless set_updated is false, also
     stamp self.runinfo["updated"] with TimeUtils.pretty_time().
     """
     self.runinfo["status"] = status
     if not set_updated:
         return
     self.runinfo["updated"] = TimeUtils.pretty_time()
Esempio n. 10
0
 def debug(self, stuff):
     """
     Pass stuff, prefixed with TimeUtils.pretty_time(), to
     self.log_and_print().
     """
     line = "%s: %s" % (TimeUtils.pretty_time(), stuff)
     self.log_and_print(line)
Esempio n. 11
0
def main():
    """
    Command line entry point: run dump jobs for one wiki.

    Parses the long options in sys.argv and rejects inconsistent
    combinations via usage(), loads the configuration, and checks that
    every external command named in it exists.  The wiki to work on is
    either the first non-option argument or whatever
    find_lock_next_wiki() hands back.  Then either all jobs are run, or
    each job listed in --job is run in turn, through Runner.

    cleanup() is always called on the way out.  The process exits with
      * 0   if some Runner.run() returned a true value, or if wikis are
            available but their prerequisites are not complete
      * 255 if no wiki was available to run
      * 1   otherwise (including missing external commands)
    """
    os.environ['DUMPS'] = str(os.getpid())

    try:
        date = None
        config_file = False
        force_lock = False
        prefetch = True
        prefetchdate = None
        spawn = True
        restart = False
        jobs_requested = None
        skip_jobs = None
        enable_logging = False
        html_notice = ""
        dryrun = False
        partnum_todo = None
        after_checkpoint = False
        checkpoint_file = None
        page_id_range = None
        cutoff = None
        exitcode = 1
        skipdone = False
        do_locking = False
        verbose = False
        cleanup_files = False
        do_prereqs = False

        try:
            (options, remainder) = getopt.gnu_getopt(
                sys.argv[1:], "",
                ['date=', 'job=', 'skipjobs=', 'configfile=', 'addnotice=',
                 'delnotice', 'force', 'dryrun', 'noprefetch', 'prefetchdate=',
                 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 'partnum=',
                 'checkpoint=', 'pageidrange=', 'cutoff=', "skipdone",
                 "exclusive", "prereqs", "cleanup", 'verbose'])
        except Exception:
            usage("Unknown option specified")

        for (opt, val) in options:
            if opt == "--date":
                date = val
            elif opt == "--configfile":
                config_file = val
            elif opt == '--checkpoint':
                checkpoint_file = val
            elif opt == '--partnum':
                partnum_todo = int(val)
            elif opt == "--force":
                force_lock = True
            elif opt == '--aftercheckpoint':
                after_checkpoint = True
                checkpoint_file = val
            elif opt == "--noprefetch":
                prefetch = False
            elif opt == "--prefetchdate":
                prefetchdate = val
            elif opt == "--nospawn":
                spawn = False
            elif opt == "--dryrun":
                dryrun = True
            elif opt == "--job":
                jobs_requested = val
            elif opt == "--skipjobs":
                skip_jobs = val
            elif opt == "--restartfrom":
                restart = True
            elif opt == "--log":
                enable_logging = True
            elif opt == "--addnotice":
                html_notice = val
            elif opt == "--delnotice":
                html_notice = False
            elif opt == "--pageidrange":
                page_id_range = val
            elif opt == "--cutoff":
                cutoff = val
                if not cutoff.isdigit() or not len(cutoff) == 8:
                    usage("--cutoff value must be in yyyymmdd format")
            elif opt == "--skipdone":
                skipdone = True
            elif opt == "--cleanup":
                cleanup_files = True
            elif opt == "--exclusive":
                do_locking = True
            elif opt == "--verbose":
                verbose = True
            elif opt == "--prereqs":
                do_prereqs = True

        # jobs_requested stays the raw --job string (used for messages and
        # "was --job given" checks); jobs_todo is the list of job names.
        # split() yields a one-element list when there is no comma.
        if jobs_requested is not None:
            jobs_todo = jobs_requested.split(',')
        else:
            jobs_todo = []

        if dryrun and (len(remainder) == 0):
            usage("--dryrun requires the name of a wikidb to be specified")
        if restart and not jobs_requested:
            usage("--restartfrom requires --job and the job from which to restart")
        if restart and len(jobs_todo) > 1:
            usage("--restartfrom requires --job and exactly one job from which to restart")
        if partnum_todo is not None and not jobs_requested:
            usage("--partnum option requires specific job(s) for which to rerun that part")
        if partnum_todo is not None and restart:
            usage("--partnum option can be specified only for a specific list of jobs")
        if checkpoint_file is not None and (len(remainder) == 0):
            usage("--checkpoint option requires the name of a wikidb to be specified")
        if checkpoint_file is not None and not jobs_requested:
            usage("--checkpoint option requires --job")
        if page_id_range and not jobs_requested:
            usage("--pageidrange option requires --job")
        if page_id_range and checkpoint_file is not None:
            usage("--pageidrange option cannot be used with --checkpoint option")
        if prefetchdate is not None and not prefetch:
            usage("prefetchdate and noprefetch options may not be specified together")
        if prefetchdate is not None and (not prefetchdate.isdigit() or len(prefetchdate) != 8):
            usage("prefetchdate must be of the form YYYYMMDD")
        if skip_jobs is None:
            skip_jobs = []
        else:
            skip_jobs = skip_jobs.split(",")

        # allow alternate config file
        if config_file:
            config = Config(config_file)
        else:
            config = Config()

        # every one of these must be a config attribute naming an
        # existing command
        externals = ['php', 'mysql', 'mysqldump', 'head', 'tail',
                     'checkforbz2footer', 'grep', 'gzip', 'bzip2',
                     'writeuptopageid', 'recompressxml', 'sevenzip', 'cat']

        failed = False
        unknowns = []
        notfound = []
        for external in externals:
            try:
                ext = getattr(config, external)
            except AttributeError:
                unknowns.append(external)
                failed = True
            else:
                if not exists(ext):
                    notfound.append(ext)
                    failed = True
        if failed:
            if unknowns:
                sys.stderr.write("Unknown config param(s): %s\n" % ", ".join(unknowns))
            if notfound:
                sys.stderr.write("Command(s) not found: %s\n" % ", ".join(notfound))
            sys.stderr.write("Exiting.\n")
            sys.exit(1)

        # no locking for dry runs or single-part reruns, nor for runs of
        # specific jobs unless restarting or locking was asked for
        # (--exclusive, --force)
        if (dryrun or partnum_todo is not None or
                (jobs_requested is not None and
                 not restart and
                 not do_locking and
                 not force_lock)):
            locks_enabled = False
        else:
            locks_enabled = True

        if dryrun:
            # print with parentheses: same output under python 2, and not
            # a syntax error under python 3
            print("***")
            print("Dry run only, no files will be updated.")
            print("***")

        if len(remainder) > 0:
            wiki = Wiki(config, remainder[0])
            if cutoff:
                # fixme if we asked for a specific job then check that job only
                # not the dir
                last_ran = wiki.latest_dump()
                # a wiki with no earlier run has nothing to compare against
                # the cutoff; comparing None with a string would also raise
                # under python 3
                if last_ran and last_ran >= cutoff:
                    wiki = None
            if wiki is not None and locks_enabled:
                locker = Locker(wiki, date)
                if force_lock:
                    # --force: remove existing locks, whoever owns them
                    lockfiles = locker.is_locked()
                    locker.unlock(lockfiles, owner=False)
                locker.lock()

        else:
            # if the run is across all wikis and we are just doing one job,
            # we want the age of the wikis by the latest status update
            # and not the date the run started

            # compare the first job *name*; indexing the raw --job string
            # would yield only its first character and never match
            if jobs_todo and jobs_todo[0] == 'createdirs':
                check_status_time = False
                # there won't actually be a status for this job but we want
                # to ensure that the directory and the status file are present
                # and intact
                check_job_status = True
                check_prereq_status = False
            else:
                check_status_time = bool(jobs_requested is not None)
                check_job_status = bool(skipdone)
                check_prereq_status = bool(jobs_requested is not None and skipdone)
            wiki = find_lock_next_wiki(config, locks_enabled, cutoff, prefetch,
                                       prefetchdate, spawn,
                                       dryrun, html_notice, check_status_time,
                                       check_job_status, check_prereq_status, date,
                                       jobs_todo[0] if jobs_todo else None,
                                       skip_jobs, page_id_range,
                                       partnum_todo, checkpoint_file, skipdone, restart, verbose)

        if wiki:
            # process any per-project configuration options
            config.parse_conffile_per_project(wiki.db_name)

            if date == 'last':
                dumps = sorted(wiki.dump_dirs())
                date = dumps[-1] if dumps else None

            if not date:
                date = TimeUtils.today()
            wiki.set_date(date)

            if after_checkpoint:
                fname = DumpFilename(wiki)
                fname.new_from_filename(checkpoint_file)
                if not fname.is_checkpoint_file:
                    usage("--aftercheckpoint option requires the "
                          "name of a checkpoint file, bad filename provided")
                # resume with the page after the last one in the checkpoint file
                page_id_range = str(int(fname.last_page_id) + 1)
                partnum_todo = fname.partnum_int
                # now we don't need this.
                checkpoint_file = None
                after_checkpoint_jobs = ['articlesdump', 'metacurrentdump',
                                         'metahistorybz2dump']
                # check the job names; a set built from the raw --job string
                # would hold single characters and never be a subset
                if (not jobs_todo or
                        not set(jobs_todo).issubset(set(after_checkpoint_jobs))):
                    usage("--aftercheckpoint option requires --job option with one or more of %s"
                          % ", ".join(after_checkpoint_jobs))

            enabled = {}
            if enable_logging:
                enabled = {"logging": True}

            if restart:
                sys.stderr.write("Running %s, restarting from job %s...\n" %
                                 (wiki.db_name, jobs_todo[0]))
            elif jobs_requested:
                sys.stderr.write("Running %s, jobs %s...\n" % (wiki.db_name, jobs_requested))
            else:
                sys.stderr.write("Running %s...\n" % wiki.db_name)

            # do each job requested one at a time; with no specific jobs
            # requested a single runner is given None and does them all
            for job in (jobs_todo or [None]):
                runner = Runner(wiki, prefetch, prefetchdate, spawn, job, skip_jobs,
                                restart, html_notice, dryrun, enabled,
                                partnum_todo, checkpoint_file, page_id_range, skipdone,
                                cleanup_files, do_prereqs, verbose)

                if runner.run():
                    exitcode = 0

            # if we are doing one piece only of the dump, we don't unlock either
            if locks_enabled:
                locker = Locker(wiki, date)
                lockfiles = locker.is_locked()
                locker.unlock(lockfiles, owner=True)
        elif wiki is not None:
            # a false value other than None
            sys.stderr.write("Wikis available to run but prereqs not complete.\n")
            exitcode = 0
        else:
            sys.stderr.write("No wikis available to run.\n")
            exitcode = 255
    finally:
        cleanup()
    sys.exit(exitcode)
Esempio n. 12
0
 def debug(self, stuff):
     """
     build a debugging message from the current time, the wiki db name
     and the text supplied, and pass it to log_and_print
     """
     timestamp = TimeUtils.pretty_time()
     self.log_and_print("%s: %s %s" % (timestamp, self.db_name, stuff))