Ejemplo n.º 1
0
    def do_mark(self, wikiname):
        '''
        mark the specified job with the specified status.
        '''

        wiki = Wiki(self.wikiconfs[wikiname], wikiname)
        date = wiki.latest_dump()
        if date is None:
            print "dump never run, not marking job for wiki", wikiname
            return
        wiki.set_date(date)

        runner = Runner(wiki, prefetch=True, spawn=True, job=None,
                        skip_jobs=[], restart=False, notice="", dryrun=False,
                        enabled=None, partnum_todo=False, checkpoint_file=None,
                        page_id_range=None, skipdone=[], cleanup=False, verbose=self.verbose)

        known_jobs = [item.name() for item in runner.dump_item_list.dump_items] + ['tables']
        if ':' in self.job_status:
            job, status = self.job_status.split(":", 1)
            if status not in ["done", "failed"]:
                status = None
            if job not in known_jobs:
                job = None
        if job is None or status is None:
            print "bad or no job/status specified", self.job_status
            if self.verbose:
                print "known jobs", known_jobs
            return

        runner.dumpjobdata.do_before_dump()

        for item in runner.dump_item_list.dump_items:
            if item.name() == job:
                item.set_status(status, True)
            if item.status() == "done":
                runner.dumpjobdata.do_after_job(item)
            elif item.status() not in ["done", "waiting", "skipped"]:
                runner.failurehandler.failure_count += 1

        if self.verbose:
            print "updating status files for wiki", wiki.db_name
        if runner.dump_item_list.all_possible_jobs_done():
            # All jobs are either in status "done", "waiting", "failed", "skipped"
            runner.indexhtml.update_index_html("done")
            runner.statushtml.update_status_file("done")
        else:
            runner.indexhtml.update_index_html("partialdone")
            runner.statushtml.update_status_file("partialdone")

        runner.dumpjobdata.do_after_dump(runner.dump_item_list.dump_items)
        return
Ejemplo n.º 2
0
def find_lock_next_wiki(config, locks_enabled, cutoff, prefetch, prefetchdate,
                        spawn, dryrun, html_notice, bystatustime=False,
                        check_job_status=False, check_prereq_status=False,
                        date=None, job=None, skipjobs=None, page_id_range=None,
                        partnum_todo=None, checkpoint_file=None, skipdone=False, restart=False,
                        verbose=False):
    """
    Walk the wikis known to the config and return the first one that
    is eligible to run, locking it first if locks are enabled.

    Wikis are tried in the reverse of the order returned by
    config.db_list_by_age(bystatustime). A wiki is passed over when:
      - cutoff is set and the wiki's latest dump (or, with bystatustime,
        the time its latest dump was touched) is not older than cutoff
      - check_job_status is set and check_jobs() returns a true value
      - acquiring the lock raises any exception

    The dryrun parameter is accepted but not referenced in this function;
    a literal True is passed to check_jobs() instead.

    NOTE(review): the two check_jobs() calls below pass their positional
    arguments in different orders (the first puts restart right after
    checkpoint_file, the second passes restart by keyword). check_jobs()
    is not visible here -- confirm both against its signature.

    Returns:
        the Wiki object that was selected (and locked, if locks_enabled),
        False if no wiki was selected but, with check_prereq_status set,
        check_jobs(..., prereqs=True) returned a false value for at least
        one wiki that could not be locked,
        None otherwise
    """
    candidates = config.db_list_by_age(bystatustime)
    candidates.reverse()

    if verbose and not cutoff:
        sys.stderr.write("Finding oldest unlocked wiki...\n")

    # set when a wiki we failed to lock is still missing the prereqs for
    # this job, meaning there are still wikis where this job needs to run
    prereqs_pending = False

    for dbname in candidates:
        wiki = Wiki(config, dbname)

        if cutoff:
            last_updated = (wiki.date_touched_latest_dump() if bystatustime
                            else wiki.latest_dump())
            if last_updated >= cutoff:
                continue

        if check_job_status and check_jobs(wiki, date, job, skipjobs, page_id_range,
                                           partnum_todo, checkpoint_file, restart,
                                           prefetch, prefetchdate, spawn, True,
                                           skipdone, verbose, html_notice):
            continue

        try:
            if locks_enabled:
                Locker(wiki, date).lock()
            return wiki
        except Exception:
            if check_prereq_status and not check_jobs(
                    wiki, date, job, skipjobs, page_id_range, partnum_todo,
                    checkpoint_file, prefetch, prefetchdate,
                    spawn, True, skipdone, verbose,
                    html_notice, prereqs=True, restart=restart):
                prereqs_pending = True
            sys.stderr.write("Couldn't lock %s, someone else must have got it...\n" % dbname)

    return False if prereqs_pending else None
Ejemplo n.º 3
0
    def undo_notice(self, wikiname):
        '''
        remove any notice.txt file that may exist
        for the most current run for the given wiki
        '''
        wiki = Wiki(self.wikiconfs[wikiname], wikiname)
        date = wiki.latest_dump()
        if date is None:
            print "dump never run, no notice file to remove for wiki", wikiname
            return

        if self.dryrun:
            print "would remove notice.txt for wiki", wikiname, "date", date
            return
        elif self.verbose:
            print "removing notice file for wiki", wikiname, "date", date

        wiki.set_date(date)
        NoticeFile(wiki, False, True)
Ejemplo n.º 4
0
    def do_notice(self, wikiname):
        '''
        create a notice.txt file for the particular wiki for
        the most recent run. the contents will appear on its
        web page for that dump run
        '''
        wiki = Wiki(self.wikiconfs[wikiname], wikiname)
        date = wiki.latest_dump()
        if date is None:
            print "dump never run, not adding notice file for wiki", wikiname
            return

        if self.dryrun:
            print "would add notice.txt for wiki", wikiname, "date", date
            return
        elif self.verbose:
            print "creating notice file for wiki", wikiname, "date", date

        wiki.set_date(date)
        NoticeFile(wiki, self.message, True)
Ejemplo n.º 5
0
    def find_failed_dumps_for_wiki(self, wikiname):
        '''
        look up which jobs of the latest run for the specified wiki
        are recorded with status "failed".

        args:
            wikiname: name of the wiki; used as key into self.wikiconfs

        returns:
            a tuple (names of failed jobs, date of the latest run);
            the list may be empty. if the wiki has never been dumped
            or no run info could be read, returns ([], None)
        '''
        # fixme how is the above a string, shouldn't it be a function?
        wiki = Wiki(self.wikiconfs[wikiname], wikiname)
        latest = wiki.latest_dump()
        if latest is None:
            return [], None

        wiki.set_date(latest)
        runinfo_entries = RunInfoFile(wiki, False).get_old_runinfo_from_file()
        if not runinfo_entries:
            return [], None

        failed = [entry["name"] for entry in runinfo_entries
                  if entry["status"] == "failed"]
        return failed, latest
Ejemplo n.º 6
0
def main():
    """
    Command-line entry point: parse the options, pick a wiki, run the
    requested dump job(s) (or all of them) on it, and exit.

    If a wiki name is given as the first non-option argument, that wiki
    is used (unless --cutoff excludes it); otherwise the next available
    wiki is chosen via find_lock_next_wiki().

    Exit codes:
        0   at least one Runner.run() call returned a true value, or
            wikis were available but their prereqs were not complete
        1   a wiki was processed but no run returned a true value, or
            required external commands/config params were missing
        255 no wiki was available to run

    cleanup() is always called before exiting, including on exceptions.
    """
    os.environ['DUMPS'] = str(os.getpid())

    try:
        date = None
        config_file = False
        force_lock = False
        prefetch = True
        prefetchdate = None
        spawn = True
        restart = False
        jobs_requested = None
        skip_jobs = None
        enable_logging = False
        html_notice = ""
        dryrun = False
        partnum_todo = None
        after_checkpoint = False
        checkpoint_file = None
        page_id_range = None
        cutoff = None
        exitcode = 1
        skipdone = False
        do_locking = False
        verbose = False
        cleanup_files = False
        do_prereqs = False

        try:
            (options, remainder) = getopt.gnu_getopt(
                sys.argv[1:], "",
                ['date=', 'job=', 'skipjobs=', 'configfile=', 'addnotice=',
                 'delnotice', 'force', 'dryrun', 'noprefetch', 'prefetchdate=',
                 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 'partnum=',
                 'checkpoint=', 'pageidrange=', 'cutoff=', "skipdone",
                 "exclusive", "prereqs", "cleanup", 'verbose'])
        except getopt.GetoptError:
            # usage() is presumably expected not to return; otherwise
            # options/remainder would be unset below
            usage("Unknown option specified")

        for (opt, val) in options:
            if opt == "--date":
                date = val
            elif opt == "--configfile":
                config_file = val
            elif opt == '--checkpoint':
                checkpoint_file = val
            elif opt == '--partnum':
                partnum_todo = int(val)
            elif opt == "--force":
                force_lock = True
            elif opt == '--aftercheckpoint':
                after_checkpoint = True
                checkpoint_file = val
            elif opt == "--noprefetch":
                prefetch = False
            elif opt == "--prefetchdate":
                prefetchdate = val
            elif opt == "--nospawn":
                spawn = False
            elif opt == "--dryrun":
                dryrun = True
            elif opt == "--job":
                jobs_requested = val
            elif opt == "--skipjobs":
                skip_jobs = val
            elif opt == "--restartfrom":
                restart = True
            elif opt == "--log":
                enable_logging = True
            elif opt == "--addnotice":
                html_notice = val
            elif opt == "--delnotice":
                html_notice = False
            elif opt == "--pageidrange":
                page_id_range = val
            elif opt == "--cutoff":
                cutoff = val
                if not cutoff.isdigit() or not len(cutoff) == 8:
                    usage("--cutoff value must be in yyyymmdd format")
            elif opt == "--skipdone":
                skipdone = True
            elif opt == "--cleanup":
                cleanup_files = True
            elif opt == "--exclusive":
                do_locking = True
            elif opt == "--verbose":
                verbose = True
            elif opt == "--prereqs":
                do_prereqs = True

        # jobs_requested stays the raw comma-separated string (used for
        # messages); jobs_todo is the list of individual job names
        if jobs_requested is not None:
            jobs_todo = jobs_requested.split(',')
        else:
            jobs_todo = []

        if dryrun and (len(remainder) == 0):
            usage("--dryrun requires the name of a wikidb to be specified")
        if restart and not jobs_requested:
            usage("--restartfrom requires --job and the job from which to restart")
        if restart and len(jobs_todo) > 1:
            usage("--restartfrom requires --job and exactly one job from which to restart")
        if partnum_todo is not None and not jobs_requested:
            usage("--partnum option requires specific job(s) for which to rerun that part")
        if partnum_todo is not None and restart:
            usage("--partnum option can be specified only for a specific list of jobs")
        if checkpoint_file is not None and (len(remainder) == 0):
            usage("--checkpoint option requires the name of a wikidb to be specified")
        if checkpoint_file is not None and not jobs_requested:
            usage("--checkpoint option requires --job")
        if page_id_range and not jobs_requested:
            usage("--pageidrange option requires --job")
        if page_id_range and checkpoint_file is not None:
            usage("--pageidrange option cannot be used with --checkpoint option")
        if prefetchdate is not None and not prefetch:
            usage("prefetchdate and noprefetch options may not be specified together")
        if prefetchdate is not None and (not prefetchdate.isdigit() or len(prefetchdate) != 8):
            usage("prefetchdate must be of the form YYYYMMDD")
        if skip_jobs is None:
            skip_jobs = []
        else:
            skip_jobs = skip_jobs.split(",")

        # allow alternate config file
        if config_file:
            config = Config(config_file)
        else:
            config = Config()
        externals = ['php', 'mysql', 'mysqldump', 'head', 'tail',
                     'checkforbz2footer', 'grep', 'gzip', 'bzip2',
                     'writeuptopageid', 'recompressxml', 'sevenzip', 'cat']

        # every external must be named in the config and the configured
        # value must pass exists(); collect all problems before exiting
        unknowns = []
        notfound = []
        for external in externals:
            try:
                ext = getattr(config, external)
            except AttributeError:
                unknowns.append(external)
            else:
                if not exists(ext):
                    notfound.append(ext)
        if unknowns or notfound:
            if unknowns:
                sys.stderr.write("Unknown config param(s): %s\n" % ", ".join(unknowns))
            if notfound:
                sys.stderr.write("Command(s) not found: %s\n" % ", ".join(notfound))
            sys.stderr.write("Exiting.\n")
            sys.exit(1)

        # no locking for dry runs, for reruns of a single part, or for
        # specific jobs unless restarting or locking was explicitly
        # requested (--exclusive / --force)
        if (dryrun or partnum_todo is not None or
                (jobs_requested is not None and
                 not restart and
                 not do_locking and
                 not force_lock)):
            locks_enabled = False
        else:
            locks_enabled = True

        if dryrun:
            # single-argument parenthesized form; prints the same thing
            # as the bare print statement
            print("***")
            print("Dry run only, no files will be updated.")
            print("***")

        if len(remainder) > 0:
            wiki = Wiki(config, remainder[0])
            if cutoff:
                # fixme if we asked for a specific job then check that job only
                # not the dir
                last_ran = wiki.latest_dump()
                if last_ran >= cutoff:
                    wiki = None
            if wiki is not None and locks_enabled:
                locker = Locker(wiki, date)
                if force_lock:
                    lockfiles = locker.is_locked()
                    locker.unlock(lockfiles, owner=False)
                locker.lock()

        else:
            # if the run is across all wikis and we are just doing one job,
            # we want the age of the wikis by the latest status update
            # and not the date the run started

            # compare the first job NAME; indexing the raw jobs_requested
            # string would only yield its first character
            if jobs_todo and jobs_todo[0] == 'createdirs':
                check_status_time = False
                # there won't actually be a status for this job but we want
                # to ensure that the directory and the status file are present
                # and intact
                check_job_status = True
                check_prereq_status = False
            else:
                check_status_time = bool(jobs_requested is not None)
                check_job_status = bool(skipdone)
                check_prereq_status = bool(jobs_requested is not None and skipdone)
            wiki = find_lock_next_wiki(config, locks_enabled, cutoff, prefetch,
                                       prefetchdate, spawn,
                                       dryrun, html_notice, check_status_time,
                                       check_job_status, check_prereq_status, date,
                                       jobs_todo[0] if jobs_todo else None,
                                       skip_jobs, page_id_range,
                                       partnum_todo, checkpoint_file, skipdone, restart, verbose)

        # wiki is a Wiki object, or None (nothing available), or False
        # (wikis available but prereqs not complete)
        if wiki:
            # process any per-project configuration options
            config.parse_conffile_per_project(wiki.db_name)

            if date == 'last':
                dumps = sorted(wiki.dump_dirs())
                if dumps:
                    date = dumps[-1]
                else:
                    date = None

            if not date:
                date = TimeUtils.today()
            wiki.set_date(date)

            if after_checkpoint:
                fname = DumpFilename(wiki)
                fname.new_from_filename(checkpoint_file)
                if not fname.is_checkpoint_file:
                    usage("--aftercheckpoint option requires the "
                          "name of a checkpoint file, bad filename provided")
                page_id_range = str(int(fname.last_page_id) + 1)
                partnum_todo = fname.partnum_int
                # now we don't need this.
                checkpoint_file = None
                after_checkpoint_jobs = ['articlesdump', 'metacurrentdump',
                                         'metahistorybz2dump']
                # validate the job NAMES; a set built from the raw
                # jobs_requested string would be a set of characters
                if (jobs_requested is None or
                        not set(jobs_todo).issubset(set(after_checkpoint_jobs))):
                    usage("--aftercheckpoint option requires --job option with one or more of %s"
                          % ", ".join(after_checkpoint_jobs))

            enabled = {}
            if enable_logging:
                enabled = {"logging": True}

            if restart:
                sys.stderr.write("Running %s, restarting from job %s...\n" %
                                 (wiki.db_name, jobs_todo[0]))
            elif jobs_requested:
                sys.stderr.write("Running %s, jobs %s...\n" % (wiki.db_name, jobs_requested))
            else:
                sys.stderr.write("Running %s...\n" % wiki.db_name)

            # do each job requested one at a time; if no specific jobs
            # were requested, a single run with job None does them all
            for job in (jobs_todo or [None]):
                runner = Runner(wiki, prefetch, prefetchdate, spawn, job, skip_jobs,
                                restart, html_notice, dryrun, enabled,
                                partnum_todo, checkpoint_file, page_id_range, skipdone,
                                cleanup_files, do_prereqs, verbose)

                result = runner.run()
                if result:
                    exitcode = 0

            # if we are doing one piece only of the dump, we don't unlock either
            if locks_enabled:
                locker = Locker(wiki, date)
                lockfiles = locker.is_locked()
                locker.unlock(lockfiles, owner=True)
        elif wiki is not None:
            sys.stderr.write("Wikis available to run but prereqs not complete.\n")
            exitcode = 0
        else:
            sys.stderr.write("No wikis available to run.\n")
            exitcode = 255
    finally:
        cleanup()
    sys.exit(exitcode)