Exemple #1
0
def find_lock_next_wiki(config, locks_enabled, cutoff, prefetch, prefetchdate,
                        spawn, dryrun, html_notice, bystatustime=False,
                        check_job_status=False, check_prereq_status=False,
                        date=None, job=None, skipjobs=None, page_id_range=None,
                        partnum_todo=None, checkpoint_file=None, skipdone=False, restart=False,
                        verbose=False):
    '''
    go through the wikis from config.db_list_by_age(bystatustime), in
    reversed order, and return the first one that passes the checks
    below and, when locks_enabled is set, that we manage to lock

    args:
        config, date         -- used to set up the Wiki and Locker objects
        locks_enabled        -- if true, lock the wiki before returning it
        cutoff               -- if set, skip wikis whose latest dump (or,
                                with bystatustime, latest touched dump)
                                is not older than this value
        check_job_status     -- if true, skip wikis for which check_jobs()
                                returns a true value
        check_prereq_status  -- if true, then for each wiki we fail to
                                lock, note whether
                                check_jobs(..., prereqs=True) returns a
                                false value
        dryrun               -- not used here; check_jobs() is always
                                given a literal True after the spawn
                                argument
        verbose              -- with no cutoff, write a progress message
                                to stderr; also handed to check_jobs()
        all remaining arguments are handed to check_jobs() untouched

    returns:
        the Wiki object, if one was found (and locked, if requested)
        False if none was found but at least one wiki that could not
              be locked failed the prereqs check
        None  otherwise
    '''
    nextdbs = config.db_list_by_age(bystatustime)
    nextdbs.reverse()

    if verbose and not cutoff:
        sys.stderr.write("Finding oldest unlocked wiki...\n")

    # if we skip locked wikis which are missing the prereqs for this job,
    # there are still wikis where this job needs to run
    missing_prereqs = False

    for dbname in nextdbs:
        wiki = Wiki(config, dbname)
        if cutoff:
            if bystatustime:
                last_updated = wiki.date_touched_latest_dump()
            else:
                last_updated = wiki.latest_dump()

            # no date at all means there is nothing newer than the cutoff;
            # spelled out so that None is never order-compared with a string
            if last_updated is not None and last_updated >= cutoff:
                continue
        if check_job_status:
            # same argument layout as the prereqs call further down; restart
            # goes by keyword so that it cannot land in the wrong positional
            # slot and shift prefetch/prefetchdate/spawn/... by one
            # NOTE(review): check_jobs() is defined outside this block --
            # confirm that this layout matches its signature
            if check_jobs(wiki, date, job, skipjobs, page_id_range, partnum_todo,
                          checkpoint_file, prefetch, prefetchdate,
                          spawn, True, skipdone, verbose,
                          html_notice, prereqs=False, restart=restart):
                continue
        try:
            if locks_enabled:
                locker = Locker(wiki, date)
                locker.lock()
            return wiki
        except Exception:
            # any failure while setting up or taking the lock is treated
            # as "someone else has this wiki"; move on to the next one
            if check_prereq_status:
                # if we skip locked wikis which are missing the prereqs for this job,
                # there are still wikis where this job needs to run
                if not check_jobs(wiki, date, job, skipjobs, page_id_range, partnum_todo,
                                  checkpoint_file, prefetch, prefetchdate,
                                  spawn, True, skipdone, verbose,
                                  html_notice, prereqs=True, restart=restart):
                    missing_prereqs = True
            sys.stderr.write("Couldn't lock %s, someone else must have got it...\n" % dbname)
            continue

    # False tells the caller "nothing to run right now, but work remains";
    # None tells it there is nothing left
    if missing_prereqs:
        return False
    return None
Exemple #2
0
    def do_mark(self, wikiname):
        '''
        mark the specified job with the specified status.
        '''

        wiki = Wiki(self.wikiconfs[wikiname], wikiname)
        date = wiki.latest_dump()
        if date is None:
            print "dump never run, not marking job for wiki", wikiname
            return
        wiki.set_date(date)

        runner = Runner(wiki, prefetch=True, spawn=True, job=None,
                        skip_jobs=[], restart=False, notice="", dryrun=False,
                        enabled=None, partnum_todo=False, checkpoint_file=None,
                        page_id_range=None, skipdone=[], cleanup=False, verbose=self.verbose)

        known_jobs = [item.name() for item in runner.dump_item_list.dump_items] + ['tables']
        if ':' in self.job_status:
            job, status = self.job_status.split(":", 1)
            if status not in ["done", "failed"]:
                status = None
            if job not in known_jobs:
                job = None
        if job is None or status is None:
            print "bad or no job/status specified", self.job_status
            if self.verbose:
                print "known jobs", known_jobs
            return

        runner.dumpjobdata.do_before_dump()

        for item in runner.dump_item_list.dump_items:
            if item.name() == job:
                item.set_status(status, True)
            if item.status() == "done":
                runner.dumpjobdata.do_after_job(item)
            elif item.status() not in ["done", "waiting", "skipped"]:
                runner.failurehandler.failure_count += 1

        if self.verbose:
            print "updating status files for wiki", wiki.db_name
        if runner.dump_item_list.all_possible_jobs_done():
            # All jobs are either in status "done", "waiting", "failed", "skipped"
            runner.indexhtml.update_index_html("done")
            runner.statushtml.update_status_file("done")
        else:
            runner.indexhtml.update_index_html("partialdone")
            runner.statushtml.update_status_file("partialdone")

        runner.dumpjobdata.do_after_dump(runner.dump_item_list.dump_items)
        return
Exemple #3
0
 def do_all_wikis(self, overwrite, date):
     '''
     make one pass over the wikis still in self.wikis_todo, running
     the configured runner on each for the given date; every wiki
     whose run returns a true value is taken off the todo list, so
     that a caller can call again to retry whatever is left
     '''
     # walk a snapshot, since entries are removed from the real list as we go
     for dbname in list(self.wikis_todo):
         this_wiki = Wiki(self.config, dbname)
         this_wiki.set_date(date)
         one_wiki_runner = WikiRunner(self.runner, this_wiki,
                                      self.filenameformat,
                                      self.output_dir, self.base)
         succeeded = one_wiki_runner.do_one_wiki(overwrite)
         if succeeded:
             self.wikis_todo.remove(dbname)
 def __init__(self, args, wikiname, flags):
     '''
     keep the args and flags, set up the Wiki object for wikiname
     with the date from args["date"], and instantiate the dumper
     class that MiscDumpFactory returns for args["dumptype"]
     '''
     self.args = args
     self.wiki = Wiki(args["config"], wikiname)
     self.wiki.set_date(args["date"])
     self.flags = flags
     dumper_cls = MiscDumpFactory.get_dumper(args["dumptype"])
     # the dumper gets the wiki, the dryrun flag and the dump-specific args
     self.dumper = dumper_cls(self.wiki, flags["dryrun"], args["args"])
    def do_one_wiki(self, wikiname, date=None):
        """
        build the html snippet describing one wiki's dump run, for
        insertion into the index.html file

        the run described is the one for the given date, or the latest
        one if no date is passed.

        returns None if the wiki is among those skipped or has no dump;
        otherwise returns a string: either the wiki's information, or
        an error notice if gathering or formatting that information
        raised an exception
        """
        config = self.args["config"]
        if skip_wiki(wikiname, config):
            return None

        run_dirs = MiscDumpDirs(config, wikiname)
        if not exists(self.dumpdir.get_dumpdir_no_date(wikiname)):
            log.info("No dump for wiki %s", wikiname)
            return None

        dump_date = date if date is not None else run_dirs.get_latest_dump_date(True)
        if not dump_date:
            log.info("No dump for wiki %s", wikiname)
            return None

        other_runs_text = "other runs: %s<br />" % make_link(wikiname, wikiname)

        # step one: collect the pieces; any failure here gets a short
        # error notice that still carries the link to the other runs
        try:
            wiki = Wiki(config, wikiname)
            wiki.set_date(dump_date)
            files_text = self.get_files_text(wiki)
            stat_text = self.get_stat_text(dump_date, wikiname)
        except Exception as ex:
            log.warning("Error encountered, no information available for wiki %s",
                        wikiname, exc_info=ex)
            return "<strong>%s</strong> Error encountered, no information available | %s" % (
                wikiname, other_runs_text)

        # step two: glue the pieces together
        try:
            heading_parts = ["<strong>%s</strong>" % wikiname, stat_text]
            # the status text is left out of the heading when it is None
            heading = " ".join(part for part in heading_parts if part is not None)
            files_block = "\n&nbsp;&nbsp;".join(files_text)
            wiki_info = (heading + "<br />" + "&nbsp;&nbsp;" + files_block +
                         "\n&nbsp;" + other_runs_text)
        except Exception as ex:
            log.warning("Error encountered formatting information for wiki %s",
                        wikiname, exc_info=ex)
            return "Error encountered formatting information for wiki %s" % wikiname

        return wiki_info
Exemple #6
0
    def undo_notice(self, wikiname):
        '''
        remove any notice.txt file that may exist
        for the most current run for the given wiki
        '''
        wiki = Wiki(self.wikiconfs[wikiname], wikiname)
        date = wiki.latest_dump()
        if date is None:
            print "dump never run, no notice file to remove for wiki", wikiname
            return

        if self.dryrun:
            print "would remove notice.txt for wiki", wikiname, "date", date
            return
        elif self.verbose:
            print "removing notice file for wiki", wikiname, "date", date

        wiki.set_date(date)
        NoticeFile(wiki, False, True)
Exemple #7
0
    def do_notice(self, wikiname):
        '''
        create a notice.txt file for the particular wiki for
        the most recent run. the contents will appear on its
        web page for that dump run
        '''
        wiki = Wiki(self.wikiconfs[wikiname], wikiname)
        date = wiki.latest_dump()
        if date is None:
            print "dump never run, not adding notice file for wiki", wikiname
            return

        if self.dryrun:
            print "would add notice.txt for wiki", wikiname, "date", date
            return
        elif self.verbose:
            print "creating notice file for wiki", wikiname, "date", date

        wiki.set_date(date)
        NoticeFile(wiki, self.message, True)
Exemple #8
0
    def do_remove(self, rerun=False):
        '''
        find all failed dump jobs for unlocked wikis
        clean them up after getting lock on each one
        first, then remove lock

        if a specific wiki was specified at instantiation,
        clean up only that wiki

        self.find_failed_dumps() supplies the work, as a mapping
        of wiki name -> date -> value handed to self.cleanup_dump();
        rerun is passed through to cleanup_dump() as is.

        a wiki/date that cannot be locked is reported on stderr
        and skipped. an exception from cleanup_dump() propagates
        to the caller, but only after the lock has been released.
        '''
        failed_dumps = self.find_failed_dumps()
        for wikiname in failed_dumps:
            for date in failed_dumps[wikiname]:
                wiki = Wiki(self.wikiconfs[wikiname], wikiname)
                wiki.set_date(date)
                locker = Locker(wiki, date)
                try:
                    locker.lock()
                except Exception:
                    sys.stderr.write("Couldn't lock %s, can't do cleanup\n" % wikiname)
                    continue
                try:
                    self.cleanup_dump(wiki, failed_dumps[wikiname][date], rerun=rerun)
                finally:
                    # give the lock back even if the cleanup blew up, so
                    # that the wiki does not stay locked behind us
                    locker.unlock(locker.get_lock_file_path())
Exemple #9
0
    def find_failed_dumps_for_wiki(self, wikiname):
        '''
        look up the failed jobs of the latest dump run of the
        specified wiki

        returns a tuple (failed_jobs, date): the names of all run
        info entries with status "failed", and the date of the run.
        if the wiki has never been dumped, or no run info could be
        read for the run, returns ([], None)
        '''
        wiki = Wiki(self.wikiconfs[wikiname], wikiname)
        latest = wiki.latest_dump()
        if latest is None:
            return [], None

        wiki.set_date(latest)
        runinfo_entries = RunInfoFile(wiki, False).get_old_runinfo_from_file()
        if not runinfo_entries:
            return [], None

        failed = [entry["name"] for entry in runinfo_entries
                  if entry["status"] == "failed"]
        return failed, latest
Exemple #10
0
def do_main():
    '''
    entry point: read and check the command line arguments, set up
    a script runner or a query runner, and run it on the one wiki
    named, or else on all wikis with retries
    '''

    (configfile, date, dryrun, filenameformat,
     output_dir, overwrite, wikiname, script,
     basename, query, retries, verbose, remainder) = get_args()

    validate_args(date, output_dir, retries, script, query)

    # three attempts unless told otherwise
    retries = 3 if retries is None else int(retries)

    config = Config(configfile) if configfile else Config()

    if date is None:
        date = TimeUtils.today()

    if script is None:
        # no script given: run a query, taken from the file named
        # in the config if none came from the command line
        if query is None:
            query = FileUtils.read_file(config.queryfile)
        runner = QueryRunner(query, dryrun, verbose)
    else:
        runner = ScriptRunner(script, remainder, dryrun, verbose)

    base = None
    if basename is not None:
        base = Wiki(config, basename)
        base.set_date(date)
        base.config.parse_conffile_per_project(base.db_name)

    if wikiname is None:
        all_wikis = WikiRunnerLoop(config, runner, filenameformat,
                                   output_dir, base)
        all_wikis.do_all_wikis_til_done(retries, overwrite, date)
    else:
        wiki = Wiki(config, wikiname)
        wiki.set_date(date)
        one_wiki = WikiRunner(runner, wiki, filenameformat,
                              output_dir, base)
        one_wiki.do_one_wiki(overwrite)
Exemple #11
0
def main():
    '''
    command line entry point: parse the options, choose the wiki to
    work on (locking it when locking applies), run the requested
    job(s) or else the whole dump for it, and exit

    the wiki is the one named on the command line, if any; otherwise
    find_lock_next_wiki() picks one.

    exit codes:
        0    some Runner.run() call returned a true value, or there
             are wikis left to run but their prereqs are not complete
        1    a wiki was processed but no run returned a true value;
             also used when an external command named in the config
             is unknown or missing
        255  there was no wiki to run

    cleanup() is called in all cases before leaving.
    NOTE(review): usage() is assumed not to return -- confirm
    '''
    os.environ['DUMPS'] = str(os.getpid())

    try:
        date = None
        config_file = False
        force_lock = False
        prefetch = True
        prefetchdate = None
        spawn = True
        restart = False
        jobs_requested = None
        skip_jobs = None
        enable_logging = False
        html_notice = ""
        dryrun = False
        partnum_todo = None
        after_checkpoint = False
        checkpoint_file = None
        page_id_range = None
        cutoff = None
        exitcode = 1
        skipdone = False
        do_locking = False
        verbose = False
        cleanup_files = False
        do_prereqs = False

        try:
            (options, remainder) = getopt.gnu_getopt(
                sys.argv[1:], "",
                ['date=', 'job=', 'skipjobs=', 'configfile=', 'addnotice=',
                 'delnotice', 'force', 'dryrun', 'noprefetch', 'prefetchdate=',
                 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 'partnum=',
                 'checkpoint=', 'pageidrange=', 'cutoff=', "skipdone",
                 "exclusive", "prereqs", "cleanup", 'verbose'])
        except getopt.GetoptError:
            usage("Unknown option specified")

        for (opt, val) in options:
            if opt == "--date":
                date = val
            elif opt == "--configfile":
                config_file = val
            elif opt == '--checkpoint':
                checkpoint_file = val
            elif opt == '--partnum':
                partnum_todo = int(val)
            elif opt == "--force":
                force_lock = True
            elif opt == '--aftercheckpoint':
                after_checkpoint = True
                checkpoint_file = val
            elif opt == "--noprefetch":
                prefetch = False
            elif opt == "--prefetchdate":
                prefetchdate = val
            elif opt == "--nospawn":
                spawn = False
            elif opt == "--dryrun":
                dryrun = True
            elif opt == "--job":
                jobs_requested = val
            elif opt == "--skipjobs":
                skip_jobs = val
            elif opt == "--restartfrom":
                restart = True
            elif opt == "--log":
                enable_logging = True
            elif opt == "--addnotice":
                html_notice = val
            elif opt == "--delnotice":
                html_notice = False
            elif opt == "--pageidrange":
                page_id_range = val
            elif opt == "--cutoff":
                cutoff = val
                if not cutoff.isdigit() or len(cutoff) != 8:
                    usage("--cutoff value must be in yyyymmdd format")
            elif opt == "--skipdone":
                skipdone = True
            elif opt == "--cleanup":
                cleanup_files = True
            elif opt == "--exclusive":
                do_locking = True
            elif opt == "--verbose":
                verbose = True
            elif opt == "--prereqs":
                do_prereqs = True

        # jobs_requested stays the raw option string (used for messages
        # and "was --job given" tests); jobs_todo is the list of job names
        if jobs_requested is not None:
            jobs_todo = jobs_requested.split(',')
        else:
            jobs_todo = []

        if dryrun and not remainder:
            usage("--dryrun requires the name of a wikidb to be specified")
        if restart and not jobs_requested:
            usage("--restartfrom requires --job and the job from which to restart")
        if restart and len(jobs_todo) > 1:
            usage("--restartfrom requires --job and exactly one job from which to restart")
        if partnum_todo is not None and not jobs_requested:
            usage("--partnum option requires specific job(s) for which to rerun that part")
        if partnum_todo is not None and restart:
            usage("--partnum option can be specified only for a specific list of jobs")
        if checkpoint_file is not None and not remainder:
            usage("--checkpoint option requires the name of a wikidb to be specified")
        if checkpoint_file is not None and not jobs_requested:
            usage("--checkpoint option requires --job")
        if page_id_range and not jobs_requested:
            usage("--pageidrange option requires --job")
        if page_id_range and checkpoint_file is not None:
            usage("--pageidrange option cannot be used with --checkpoint option")
        if prefetchdate is not None and not prefetch:
            usage("prefetchdate and noprefetch options may not be specified together")
        if prefetchdate is not None and (not prefetchdate.isdigit() or len(prefetchdate) != 8):
            usage("prefetchdate must be of the form YYYYMMDD")
        if skip_jobs is None:
            skip_jobs = []
        else:
            skip_jobs = skip_jobs.split(",")

        # allow alternate config file
        if config_file:
            config = Config(config_file)
        else:
            config = Config()
        externals = ['php', 'mysql', 'mysqldump', 'head', 'tail',
                     'checkforbz2footer', 'grep', 'gzip', 'bzip2',
                     'writeuptopageid', 'recompressxml', 'sevenzip', 'cat']

        # every external command must be named in the config and the
        # path given for it must exist; report all problems at once
        failed = False
        unknowns = []
        notfound = []
        for external in externals:
            try:
                ext = getattr(config, external)
            except AttributeError:
                unknowns.append(external)
                failed = True
            else:
                if not exists(ext):
                    notfound.append(ext)
                    failed = True
        if failed:
            if unknowns:
                sys.stderr.write("Unknown config param(s): %s\n" % ", ".join(unknowns))
            if notfound:
                sys.stderr.write("Command(s) not found: %s\n" % ", ".join(notfound))
            sys.stderr.write("Exiting.\n")
            sys.exit(1)

        # no locking for dry runs, for reruns of a single part, or for
        # runs of specific jobs unless restarting or locking was asked for
        if (dryrun or partnum_todo is not None or
                (jobs_requested is not None and
                 not restart and
                 not do_locking and
                 not force_lock)):
            locks_enabled = False
        else:
            locks_enabled = True

        if dryrun:
            # one parenthesized string each: same output whether print is
            # a statement or a function
            print("***")
            print("Dry run only, no files will be updated.")
            print("***")

        if remainder:
            wiki = Wiki(config, remainder[0])
            if cutoff:
                # fixme if we asked for a specific job then check that job only
                # not the dir
                last_ran = wiki.latest_dump()
                # None (no dump yet) never counts as "at or after the cutoff"
                if last_ran is not None and last_ran >= cutoff:
                    wiki = None
            if wiki is not None and locks_enabled:
                # NOTE(review): date may still be None or 'last' here, while
                # the unlock at the end uses the resolved date -- confirm that
                # Locker copes with that
                locker = Locker(wiki, date)
                if force_lock:
                    # remove any existing lock, whoever owns it, first
                    lockfiles = locker.is_locked()
                    locker.unlock(lockfiles, owner=False)
                locker.lock()

        else:
            # if the run is across all wikis and we are just doing one job,
            # we want the age of the wikis by the latest status update
            # and not the date the run started

            # this must look at the first job *name*; indexing the raw
            # --job string would only yield its first character
            if jobs_todo and jobs_todo[0] == 'createdirs':
                check_status_time = False
                # there won't actually be a status for this job but we want
                # to ensure that the directory and the status file are present
                # and intact
                check_job_status = True
                check_prereq_status = False
            else:
                check_status_time = bool(jobs_requested is not None)
                check_job_status = bool(skipdone)
                check_prereq_status = bool(jobs_requested is not None and skipdone)
            wiki = find_lock_next_wiki(config, locks_enabled, cutoff, prefetch,
                                       prefetchdate, spawn,
                                       dryrun, html_notice, check_status_time,
                                       check_job_status, check_prereq_status, date,
                                       jobs_todo[0] if jobs_todo else None,
                                       skip_jobs, page_id_range,
                                       partnum_todo, checkpoint_file, skipdone, restart, verbose)

        # wiki is now a Wiki object, or None (nothing to run), or
        # False (wikis remain but their prereqs are not done)
        if wiki:
            # process any per-project configuration options
            config.parse_conffile_per_project(wiki.db_name)

            if date == 'last':
                dumps = sorted(wiki.dump_dirs())
                if dumps:
                    date = dumps[-1]
                else:
                    date = None

            if not date:
                date = TimeUtils.today()
            wiki.set_date(date)

            if after_checkpoint:
                fname = DumpFilename(wiki)
                fname.new_from_filename(checkpoint_file)
                if not fname.is_checkpoint_file:
                    usage("--aftercheckpoint option requires the "
                          "name of a checkpoint file, bad filename provided")
                # carry on from the page after the last one in the checkpoint file
                page_id_range = str(int(fname.last_page_id) + 1)
                partnum_todo = fname.partnum_int
                # now we don't need this.
                checkpoint_file = None
                after_checkpoint_jobs = ['articlesdump', 'metacurrentdump',
                                         'metahistorybz2dump']
                # compare job names, not the characters of the raw --job string
                if (not jobs_todo or
                        not set(jobs_todo).issubset(after_checkpoint_jobs)):
                    usage("--aftercheckpoint option requires --job option with one or more of %s"
                          % ", ".join(after_checkpoint_jobs))

            enabled = {}
            if enable_logging:
                enabled = {"logging": True}

            if restart:
                sys.stderr.write("Running %s, restarting from job %s...\n" %
                                 (wiki.db_name, jobs_todo[0]))
            elif jobs_requested:
                sys.stderr.write("Running %s, jobs %s...\n" % (wiki.db_name, jobs_requested))
            else:
                sys.stderr.write("Running %s...\n" % wiki.db_name)

            # do each job requested one at a time; with no specific jobs
            # requested, a single runner with job None will do them all
            for job in (jobs_todo or [None]):
                runner = Runner(wiki, prefetch, prefetchdate, spawn, job, skip_jobs,
                                restart, html_notice, dryrun, enabled,
                                partnum_todo, checkpoint_file, page_id_range, skipdone,
                                cleanup_files, do_prereqs, verbose)

                result = runner.run()
                if result:
                    exitcode = 0

            # if we are doing one piece only of the dump, we don't unlock either
            if locks_enabled:
                locker = Locker(wiki, date)
                lockfiles = locker.is_locked()
                locker.unlock(lockfiles, owner=True)
        elif wiki is not None:
            sys.stderr.write("Wikis available to run but prereqs not complete.\n")
            exitcode = 0
        else:
            sys.stderr.write("No wikis available to run.\n")
            exitcode = 255
    finally:
        cleanup()
    sys.exit(exitcode)
class MiscDumpOne(object):
    """
    run the dump of the type named in args["dumptype"] for one wiki,
    for the date in args["date"]

    args is a dict; the keys read here are "config", "date",
    "dumptype" and "args", the last of which is passed through to
    the class for the specific dump you want

    flags is a dict; the keys read here are "dryrun", "forcerun"
    and "do_index"
    """

    def __init__(self, args, wikiname, flags):
        """
        set up the Wiki object for wikiname with the requested date,
        and the dumper object for the requested dump type
        """
        self.args = args
        self.wiki = Wiki(self.args["config"], wikiname)
        self.wiki.set_date(self.args["date"])
        self.flags = flags
        dump_class = MiscDumpFactory.get_dumper(self.args["dumptype"])
        self.dumper = dump_class(self.wiki, flags["dryrun"], self.args["args"])

    @staticmethod
    def _release(lock):
        """
        give up the lock if we hold one; lock is None for dry runs
        """
        if lock is not None:
            lock.unlock_if_owner()

    def do_one_wiki(self):
        """
        run dump of specified type for one wiki, for given date
        unless it is among the wikis we skip, has already been run
        for the date, or some other process has the lock and is
        therefore presumably already dumping it

        returns:
            STATUS_GOOD   the run completed, was already complete
                          (and the "forcerun" flag is off), or the
                          wiki is one of those skipped
            STATUS_TODO   the wiki is locked by someone else
            STATUS_FAILED the dumper or the md5sums step reported
                          failure, or an exception was raised during
                          the run

        on dry runs no lock is taken, and the md5sums and the status
        update are skipped. whenever a lock was taken, it is released
        again on every failure path as well as on success.
        """
        if not skip_wiki(self.wiki.db_name, self.wiki.config):

            dumpdir = MiscDumpDir(self.args["config"], self.args["date"])
            wiki_dumpdir = dumpdir.get_dumpdir(self.wiki.db_name)
            if not exists(wiki_dumpdir):
                os.makedirs(wiki_dumpdir)

            status_info = StatusInfo(self.args["config"], self.wiki.date, self.wiki.db_name)
            status = status_info.get_status()
            if status == "done:all" and not self.flags["forcerun"]:
                log.info("wiki %s skipped, adds/changes dump already complete",
                         self.wiki.db_name)
                return STATUS_GOOD

            # stays None on dry runs, which never lock anything
            lock = None
            if not self.flags["dryrun"]:
                lock = MiscDumpLock(self.args["config"], self.wiki.date, self.wiki.db_name)

                # if lock is stale, remove it
                lock.remove_if_stale(self.wiki.config.lock_stale)

                # try to get the lock ourselves
                if not lock.get_lock():
                    log.info("wiki %s skipped, wiki is locked,"
                             " another process should be doing the job",
                             self.wiki.db_name)
                    return STATUS_TODO

                self.dumper.set_lockinfo(lock)
                dumps_dirs = MiscDumpDirs(self.wiki.config, self.wiki.db_name)
                dumps_dirs.cleanup_old_dumps(self.wiki.date)

            log.info("Doing run for wiki: %s", self.wiki.db_name)

            try:
                result = self.dumper.run()
                if not result:
                    # a failed run must not leave the wiki locked, any
                    # more than a run that raised an exception does
                    self._release(lock)
                    return STATUS_FAILED

                if lock is not None:
                    output_files, expected = self.dumper.get_output_files()
                    if not md5sums(self.wiki, self.wiki.config.fileperms, output_files, expected):
                        self._release(lock)
                        return STATUS_FAILED
                    status_info.set_status("done:" + self.dumper.get_steps_done())
                    lock.unlock_if_owner()

                if self.flags["do_index"]:
                    index = Index(self.args)
                    index.do_all_wikis()
            except Exception as ex:
                log.warning("error from dump run for wiki %s", self.wiki.db_name, exc_info=ex)
                self._release(lock)
                return STATUS_FAILED
        # NOTE(review): this is reached for skipped wikis too, so they
        # are also logged as a success -- confirm that this is intended
        log.info("Success!  Wiki %s %s dump complete.", self.wiki.db_name, self.args["dumptype"])
        return STATUS_GOOD