Example #1
0
def find_lock_next_wiki(config, locks_enabled, cutoff, prefetch, prefetchdate,
                        spawn, dryrun, html_notice, bystatustime=False,
                        check_job_status=False, check_prereq_status=False,
                        date=None, job=None, skipjobs=None, page_id_range=None,
                        partnum_todo=None, checkpoint_file=None, skipdone=False, restart=False,
                        verbose=False):
    nextdbs = config.db_list_by_age(bystatustime)
    nextdbs.reverse()

    if verbose and not cutoff:
        sys.stderr.write("Finding oldest unlocked wiki...\n")

    # if we skip locked wikis which are missing the prereqs for this job,
    # there are still wikis where this job needs to run
    missing_prereqs = False

    for dbname in nextdbs:
        wiki = Wiki(config, dbname)
        if cutoff:
            if bystatustime:
                last_updated = wiki.date_touched_latest_dump()
            else:
                last_updated = wiki.latest_dump()

            if last_updated >= cutoff:
                continue
        if check_job_status:
            if check_jobs(wiki, date, job, skipjobs, page_id_range,
                          partnum_todo, checkpoint_file, restart,
                          prefetch, prefetchdate, spawn, True,
                          skipdone, verbose, html_notice):
                continue
        try:
            if locks_enabled:
                locker = Locker(wiki, date)
                locker.lock()
            return wiki
        except Exception as ex:
            if check_prereq_status:
                # if we skip locked wikis which are missing the prereqs for this job,
                # there are still wikis where this job needs to run
                if not check_jobs(wiki, date, job, skipjobs, page_id_range, partnum_todo,
                                  checkpoint_file, prefetch, prefetchdate,
                                  spawn, True, skipdone, verbose,
                                  html_notice, prereqs=True, restart=restart):
                    missing_prereqs = True
            sys.stderr.write("Couldn't lock %s, someone else must have got it...\n" % dbname)
            continue
    if missing_prereqs:
        return False
    else:
        return None
Example #2
0
def generate_index(config, other_indexhtml=None, sorted_by_db=False):
    running = False
    states = []

    if sorted_by_db:
        dbs = sorted(config.db_list)
    else:
        dbs = config.db_list_by_age()

    for db_name in dbs:
        try:
            wiki = Wiki(config, db_name)
            locker = Locker(wiki)
            lockfiles = locker.is_stale(all_locks=True)
            if lockfiles:
                locker.cleanup_stale_locks(lockfiles)
            running = running or locker.is_locked(all_locks=True)
            states.append(StatusHtml.status_line(wiki))
        except Exception:
            # if there's a problem with one wiki at least
            # let's show the rest
            if VERBOSE:
                traceback.print_exc(file=sys.stdout)
    if running:
        status = "Dumps are in progress..."
    elif exists("maintenance.txt"):
        status = FileUtils.read_file("maintenance.txt")
    else:
        status = "Dump process is idle."

    if other_indexhtml is None:
        other_index_link = ""
    else:
        if sorted_by_db:
            other_sortedby = "dump date"
        else:
            other_sortedby = "wiki name"

        other_index_link = ('Also view sorted by <a href="%s">%s</a>'
                            % (os.path.basename(other_indexhtml), other_sortedby))

    return config.read_template("download-index.html") % {
        "otherIndexLink": other_index_link,
        "status": status,
        "items": "\n".join(states)}
Example #3
0
    def do_remove(self, rerun=False):
        '''
        find all failed dump jobs for unlocked wikis
        clean them up after getting lock on each one
        first, then remove lock

        if a specific wiki was specified at instantiation,
        clean up only that wiki
        '''
        failed_dumps = self.find_failed_dumps()
        for wikiname in failed_dumps:
            for date in failed_dumps[wikiname]:
                wiki = Wiki(self.wikiconfs[wikiname], wikiname)
                wiki.set_date(date)
                locker = Locker(wiki, date)
                try:
                    locker.lock()
                except Exception as ex:
                    sys.stderr.write("Couldn't lock %s, can't do cleanup\n" % wikiname)
                    continue
                self.cleanup_dump(wiki, failed_dumps[wikiname][date], rerun=rerun)
                locker.unlock(locker.get_lock_file_path())
Example #4
0
def main():
    os.environ['DUMPS'] = str(os.getpid())

    try:
        date = None
        config_file = False
        force_lock = False
        prefetch = True
        prefetchdate = None
        spawn = True
        restart = False
        jobs_requested = None
        skip_jobs = None
        enable_logging = False
        html_notice = ""
        dryrun = False
        partnum_todo = None
        after_checkpoint = False
        checkpoint_file = None
        page_id_range = None
        cutoff = None
        exitcode = 1
        skipdone = False
        do_locking = False
        verbose = False
        cleanup_files = False
        do_prereqs = False

        try:
            (options, remainder) = getopt.gnu_getopt(
                sys.argv[1:], "",
                ['date=', 'job=', 'skipjobs=', 'configfile=', 'addnotice=',
                 'delnotice', 'force', 'dryrun', 'noprefetch', 'prefetchdate=',
                 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 'partnum=',
                 'checkpoint=', 'pageidrange=', 'cutoff=', "skipdone",
                 "exclusive", "prereqs", "cleanup", 'verbose'])
        except Exception as ex:
            usage("Unknown option specified")

        for (opt, val) in options:
            if opt == "--date":
                date = val
            elif opt == "--configfile":
                config_file = val
            elif opt == '--checkpoint':
                checkpoint_file = val
            elif opt == '--partnum':
                partnum_todo = int(val)
            elif opt == "--force":
                force_lock = True
            elif opt == '--aftercheckpoint':
                after_checkpoint = True
                checkpoint_file = val
            elif opt == "--noprefetch":
                prefetch = False
            elif opt == "--prefetchdate":
                prefetchdate = val
            elif opt == "--nospawn":
                spawn = False
            elif opt == "--dryrun":
                dryrun = True
            elif opt == "--job":
                jobs_requested = val
            elif opt == "--skipjobs":
                skip_jobs = val
            elif opt == "--restartfrom":
                restart = True
            elif opt == "--log":
                enable_logging = True
            elif opt == "--addnotice":
                html_notice = val
            elif opt == "--delnotice":
                html_notice = False
            elif opt == "--pageidrange":
                page_id_range = val
            elif opt == "--cutoff":
                cutoff = val
                if not cutoff.isdigit() or not len(cutoff) == 8:
                    usage("--cutoff value must be in yyyymmdd format")
            elif opt == "--skipdone":
                skipdone = True
            elif opt == "--cleanup":
                cleanup_files = True
            elif opt == "--exclusive":
                do_locking = True
            elif opt == "--verbose":
                verbose = True
            elif opt == "--prereqs":
                do_prereqs = True

        if jobs_requested is not None:
            if ',' in jobs_requested:
                jobs_todo = jobs_requested.split(',')
            else:
                jobs_todo = [jobs_requested]
        else:
            jobs_todo = []

        if dryrun and (len(remainder) == 0):
            usage("--dryrun requires the name of a wikidb to be specified")
        if restart and not jobs_requested:
            usage("--restartfrom requires --job and the job from which to restart")
        if restart and len(jobs_todo) > 1:
            usage("--restartfrom requires --job and exactly one job from which to restart")
        if partnum_todo is not None and not jobs_requested:
            usage("--partnum option requires specific job(s) for which to rerun that part")
        if partnum_todo is not None and restart:
            usage("--partnum option can be specified only for a specific list of jobs")
        if checkpoint_file is not None and (len(remainder) == 0):
            usage("--checkpoint option requires the name of a wikidb to be specified")
        if checkpoint_file is not None and not jobs_requested:
            usage("--checkpoint option requires --job")
        if page_id_range and not jobs_requested:
            usage("--pageidrange option requires --job")
        if page_id_range and checkpoint_file is not None:
            usage("--pageidrange option cannot be used with --checkpoint option")
        if prefetchdate is not None and not prefetch:
            usage("prefetchdate and noprefetch options may not be specified together")
        if prefetchdate is not None and (not prefetchdate.isdigit() or len(prefetchdate) != 8):
            usage("prefetchdate must be of the form YYYYMMDD")
        if skip_jobs is None:
            skip_jobs = []
        else:
            skip_jobs = skip_jobs.split(",")

        # allow alternate config file
        if config_file:
            config = Config(config_file)
        else:
            config = Config()
        externals = ['php', 'mysql', 'mysqldump', 'head', 'tail',
                     'checkforbz2footer', 'grep', 'gzip', 'bzip2',
                     'writeuptopageid', 'recompressxml', 'sevenzip', 'cat']

        failed = False
        unknowns = []
        notfound = []
        for external in externals:
            try:
                ext = getattr(config, external)
            except AttributeError as ex:
                unknowns.append(external)
                failed = True
            else:
                if not exists(ext):
                    notfound.append(ext)
                    failed = True
        if failed:
            if unknowns:
                sys.stderr.write("Unknown config param(s): %s\n" % ", ".join(unknowns))
            if notfound:
                sys.stderr.write("Command(s) not found: %s\n" % ", ".join(notfound))
            sys.stderr.write("Exiting.\n")
            sys.exit(1)

        if (dryrun or partnum_todo is not None or
                (jobs_requested is not None and
                 not restart and
                 not do_locking and
                 not force_lock)):
            locks_enabled = False
        else:
            locks_enabled = True

        if dryrun:
            print "***"
            print "Dry run only, no files will be updated."
            print "***"

        if len(remainder) > 0:
            wiki = Wiki(config, remainder[0])
            if cutoff:
                # fixme if we asked for a specific job then check that job only
                # not the dir
                last_ran = wiki.latest_dump()
                if last_ran >= cutoff:
                    wiki = None
            if wiki is not None and locks_enabled:
                locker = Locker(wiki, date)
                if force_lock and locks_enabled:
                    lockfiles = locker.is_locked()
                    locker.unlock(lockfiles, owner=False)
                if locks_enabled:
                    locker.lock()

        else:
            # if the run is across all wikis and we are just doing one job,
            # we want the age of the wikis by the latest status update
            # and not the date the run started

            if jobs_requested is not None and jobs_requested[0] == 'createdirs':
                check_status_time = False
                # there won't actually be a status for this job but we want
                # to ensure that the directory and the status file are present
                # and intact
                check_job_status = True
                check_prereq_status = False
            else:
                check_status_time = bool(jobs_requested is not None)
                check_job_status = bool(skipdone)
                check_prereq_status = bool(jobs_requested is not None and skipdone)
            wiki = find_lock_next_wiki(config, locks_enabled, cutoff, prefetch,
                                       prefetchdate, spawn,
                                       dryrun, html_notice, check_status_time,
                                       check_job_status, check_prereq_status, date,
                                       jobs_todo[0] if len(jobs_todo) else None,
                                       skip_jobs, page_id_range,
                                       partnum_todo, checkpoint_file, skipdone, restart, verbose)

        if wiki is not None and wiki:
            # process any per-project configuration options
            config.parse_conffile_per_project(wiki.db_name)

            if date == 'last':
                dumps = sorted(wiki.dump_dirs())
                if dumps:
                    date = dumps[-1]
                else:
                    date = None

            if date is None or not date:
                date = TimeUtils.today()
            wiki.set_date(date)

            if after_checkpoint:
                fname = DumpFilename(wiki)
                fname.new_from_filename(checkpoint_file)
                if not fname.is_checkpoint_file:
                    usage("--aftercheckpoint option requires the "
                          "name of a checkpoint file, bad filename provided")
                page_id_range = str(int(fname.last_page_id) + 1)
                partnum_todo = fname.partnum_int
                # now we don't need this.
                checkpoint_file = None
                after_checkpoint_jobs = ['articlesdump', 'metacurrentdump',
                                         'metahistorybz2dump']
                if (jobs_requested is None or
                        not set(jobs_requested).issubset(set(after_checkpoint_jobs))):
                    usage("--aftercheckpoint option requires --job option with one or more of %s"
                          % ", ".join(after_checkpoint_jobs))

            enabled = {}
            if enable_logging:
                enabled = {"logging": True}

            if restart:
                sys.stderr.write("Running %s, restarting from job %s...\n" %
                                 (wiki.db_name, jobs_todo[0]))
            elif jobs_requested:
                sys.stderr.write("Running %s, jobs %s...\n" % (wiki.db_name, jobs_requested))
            else:
                sys.stderr.write("Running %s...\n" % wiki.db_name)

            # no specific jobs requested, runner will do them all
            if not len(jobs_todo):
                runner = Runner(wiki, prefetch, prefetchdate, spawn, None, skip_jobs,
                                restart, html_notice, dryrun, enabled,
                                partnum_todo, checkpoint_file, page_id_range, skipdone,
                                cleanup_files, do_prereqs, verbose)

                result = runner.run()
                if result is not None and result:
                    exitcode = 0

            else:
                # do each job requested one at a time
                for job in jobs_todo:
                    runner = Runner(wiki, prefetch, prefetchdate, spawn, job, skip_jobs,
                                    restart, html_notice, dryrun, enabled,
                                    partnum_todo, checkpoint_file, page_id_range, skipdone,
                                    cleanup_files, do_prereqs, verbose)

                    result = runner.run()
                    if result is not None and result:
                        exitcode = 0

            # if we are doing one piece only of the dump, we don't unlock either
            if locks_enabled:
                locker = Locker(wiki, date)
                lockfiles = locker.is_locked()
                locker.unlock(lockfiles, owner=True)
        elif wiki is not None:
            sys.stderr.write("Wikis available to run but prereqs not complete.\n")
            exitcode = 0
        else:
            sys.stderr.write("No wikis available to run.\n")
            exitcode = 255
    finally:
        cleanup()
    sys.exit(exitcode)