def do_mark(self, wikiname): ''' mark the specified job with the specified status. ''' wiki = Wiki(self.wikiconfs[wikiname], wikiname) date = wiki.latest_dump() if date is None: print "dump never run, not marking job for wiki", wikiname return wiki.set_date(date) runner = Runner(wiki, prefetch=True, spawn=True, job=None, skip_jobs=[], restart=False, notice="", dryrun=False, enabled=None, partnum_todo=False, checkpoint_file=None, page_id_range=None, skipdone=[], cleanup=False, verbose=self.verbose) known_jobs = [item.name() for item in runner.dump_item_list.dump_items] + ['tables'] if ':' in self.job_status: job, status = self.job_status.split(":", 1) if status not in ["done", "failed"]: status = None if job not in known_jobs: job = None if job is None or status is None: print "bad or no job/status specified", self.job_status if self.verbose: print "known jobs", known_jobs return runner.dumpjobdata.do_before_dump() for item in runner.dump_item_list.dump_items: if item.name() == job: item.set_status(status, True) if item.status() == "done": runner.dumpjobdata.do_after_job(item) elif item.status() not in ["done", "waiting", "skipped"]: runner.failurehandler.failure_count += 1 if self.verbose: print "updating status files for wiki", wiki.db_name if runner.dump_item_list.all_possible_jobs_done(): # All jobs are either in status "done", "waiting", "failed", "skipped" runner.indexhtml.update_index_html("done") runner.statushtml.update_status_file("done") else: runner.indexhtml.update_index_html("partialdone") runner.statushtml.update_status_file("partialdone") runner.dumpjobdata.do_after_dump(runner.dump_item_list.dump_items) return
def find_lock_next_wiki(config, locks_enabled, cutoff, prefetch, prefetchdate,
                        spawn, dryrun, html_notice, bystatustime=False,
                        check_job_status=False, check_prereq_status=False,
                        date=None, job=None, skipjobs=None, page_id_range=None,
                        partnum_todo=None, checkpoint_file=None, skipdone=False,
                        restart=False, verbose=False):
    '''
    go through the wikis from config.db_list_by_age(), in reversed
    order, and return the first one that gets past the cutoff and
    job status checks and, if locks are enabled, can be locked.

    a wiki is passed over when:
      - cutoff is set and its latest dump (or the time its latest
        dump was touched, if bystatustime is set) is >= cutoff
      - check_job_status is set and check_jobs() returns true for it
      - locks are enabled and locking it raises an exception

    returns:
      the Wiki object that was selected (and locked, if locks_enabled)
      False if nothing was selected and, with check_prereq_status set,
            the prereqs check came back false for at least one wiki
            that could not be locked
      None  if nothing was selected otherwise
    '''
    candidates = config.db_list_by_age(bystatustime)
    candidates.reverse()

    if verbose and not cutoff:
        sys.stderr.write("Finding oldest unlocked wiki...\n")

    # if we skip locked wikis which are missing the prereqs for this job,
    # there are still wikis where this job needs to run
    prereqs_pending = False

    for dbname in candidates:
        wiki = Wiki(config, dbname)

        if cutoff:
            if bystatustime:
                last_updated = wiki.date_touched_latest_dump()
            else:
                last_updated = wiki.latest_dump()
            if last_updated >= cutoff:
                continue

        # NOTE(review): this call passes 'restart' positionally right after
        # checkpoint_file, while the prereqs call further down passes it by
        # keyword -- confirm both against the check_jobs signature
        if check_job_status and check_jobs(
                wiki, date, job, skipjobs, page_id_range, partnum_todo,
                checkpoint_file, restart, prefetch, prefetchdate, spawn,
                True, skipdone, verbose, html_notice):
            continue

        try:
            if locks_enabled:
                Locker(wiki, date).lock()
        except Exception:
            if check_prereq_status and not check_jobs(
                    wiki, date, job, skipjobs, page_id_range, partnum_todo,
                    checkpoint_file, prefetch, prefetchdate, spawn,
                    True, skipdone, verbose, html_notice,
                    prereqs=True, restart=restart):
                prereqs_pending = True
            sys.stderr.write("Couldn't lock %s, someone else must have got it...\n" % dbname)
        else:
            return wiki

    return False if prereqs_pending else None
def undo_notice(self, wikiname): ''' remove any notice.txt file that may exist for the most current run for the given wiki ''' wiki = Wiki(self.wikiconfs[wikiname], wikiname) date = wiki.latest_dump() if date is None: print "dump never run, no notice file to remove for wiki", wikiname return if self.dryrun: print "would remove notice.txt for wiki", wikiname, "date", date return elif self.verbose: print "removing notice file for wiki", wikiname, "date", date wiki.set_date(date) NoticeFile(wiki, False, True)
def do_notice(self, wikiname): ''' create a notice.txt file for the particular wiki for the most recent run. the contents will appear on its web page for that dump run ''' wiki = Wiki(self.wikiconfs[wikiname], wikiname) date = wiki.latest_dump() if date is None: print "dump never run, not adding notice file for wiki", wikiname return if self.dryrun: print "would add notice.txt for wiki", wikiname, "date", date return elif self.verbose: print "creating notice file for wiki", wikiname, "date", date wiki.set_date(date) NoticeFile(wiki, self.message, True)
def find_failed_dumps_for_wiki(self, wikiname):
    '''
    collect the names of the jobs with status "failed" from the
    run info of the latest dump run of the specified wiki.

    returns a tuple (failed job names, date of the run).
    if the wiki has never been dumped, or no run info entries
    could be read for the run, this is ([], None).
    '''
    wiki = Wiki(self.wikiconfs[wikiname], wikiname)
    latest = wiki.latest_dump()
    if latest is None:
        return [], None

    wiki.set_date(latest)
    entries = RunInfoFile(wiki, False).get_old_runinfo_from_file()
    if not entries:
        return [], None

    failed = [entry["name"] for entry in entries
              if entry["status"] == "failed"]
    return failed, latest
def main(): os.environ['DUMPS'] = str(os.getpid()) try: date = None config_file = False force_lock = False prefetch = True prefetchdate = None spawn = True restart = False jobs_requested = None skip_jobs = None enable_logging = False html_notice = "" dryrun = False partnum_todo = None after_checkpoint = False checkpoint_file = None page_id_range = None cutoff = None exitcode = 1 skipdone = False do_locking = False verbose = False cleanup_files = False do_prereqs = False try: (options, remainder) = getopt.gnu_getopt( sys.argv[1:], "", ['date=', 'job=', 'skipjobs=', 'configfile=', 'addnotice=', 'delnotice', 'force', 'dryrun', 'noprefetch', 'prefetchdate=', 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 'partnum=', 'checkpoint=', 'pageidrange=', 'cutoff=', "skipdone", "exclusive", "prereqs", "cleanup", 'verbose']) except Exception as ex: usage("Unknown option specified") for (opt, val) in options: if opt == "--date": date = val elif opt == "--configfile": config_file = val elif opt == '--checkpoint': checkpoint_file = val elif opt == '--partnum': partnum_todo = int(val) elif opt == "--force": force_lock = True elif opt == '--aftercheckpoint': after_checkpoint = True checkpoint_file = val elif opt == "--noprefetch": prefetch = False elif opt == "--prefetchdate": prefetchdate = val elif opt == "--nospawn": spawn = False elif opt == "--dryrun": dryrun = True elif opt == "--job": jobs_requested = val elif opt == "--skipjobs": skip_jobs = val elif opt == "--restartfrom": restart = True elif opt == "--log": enable_logging = True elif opt == "--addnotice": html_notice = val elif opt == "--delnotice": html_notice = False elif opt == "--pageidrange": page_id_range = val elif opt == "--cutoff": cutoff = val if not cutoff.isdigit() or not len(cutoff) == 8: usage("--cutoff value must be in yyyymmdd format") elif opt == "--skipdone": skipdone = True elif opt == "--cleanup": cleanup_files = True elif opt == "--exclusive": do_locking = True elif opt == "--verbose": verbose = 
True elif opt == "--prereqs": do_prereqs = True if jobs_requested is not None: if ',' in jobs_requested: jobs_todo = jobs_requested.split(',') else: jobs_todo = [jobs_requested] else: jobs_todo = [] if dryrun and (len(remainder) == 0): usage("--dryrun requires the name of a wikidb to be specified") if restart and not jobs_requested: usage("--restartfrom requires --job and the job from which to restart") if restart and len(jobs_todo) > 1: usage("--restartfrom requires --job and exactly one job from which to restart") if partnum_todo is not None and not jobs_requested: usage("--partnum option requires specific job(s) for which to rerun that part") if partnum_todo is not None and restart: usage("--partnum option can be specified only for a specific list of jobs") if checkpoint_file is not None and (len(remainder) == 0): usage("--checkpoint option requires the name of a wikidb to be specified") if checkpoint_file is not None and not jobs_requested: usage("--checkpoint option requires --job") if page_id_range and not jobs_requested: usage("--pageidrange option requires --job") if page_id_range and checkpoint_file is not None: usage("--pageidrange option cannot be used with --checkpoint option") if prefetchdate is not None and not prefetch: usage("prefetchdate and noprefetch options may not be specified together") if prefetchdate is not None and (not prefetchdate.isdigit() or len(prefetchdate) != 8): usage("prefetchdate must be of the form YYYYMMDD") if skip_jobs is None: skip_jobs = [] else: skip_jobs = skip_jobs.split(",") # allow alternate config file if config_file: config = Config(config_file) else: config = Config() externals = ['php', 'mysql', 'mysqldump', 'head', 'tail', 'checkforbz2footer', 'grep', 'gzip', 'bzip2', 'writeuptopageid', 'recompressxml', 'sevenzip', 'cat'] failed = False unknowns = [] notfound = [] for external in externals: try: ext = getattr(config, external) except AttributeError as ex: unknowns.append(external) failed = True else: if not 
exists(ext): notfound.append(ext) failed = True if failed: if unknowns: sys.stderr.write("Unknown config param(s): %s\n" % ", ".join(unknowns)) if notfound: sys.stderr.write("Command(s) not found: %s\n" % ", ".join(notfound)) sys.stderr.write("Exiting.\n") sys.exit(1) if (dryrun or partnum_todo is not None or (jobs_requested is not None and not restart and not do_locking and not force_lock)): locks_enabled = False else: locks_enabled = True if dryrun: print "***" print "Dry run only, no files will be updated." print "***" if len(remainder) > 0: wiki = Wiki(config, remainder[0]) if cutoff: # fixme if we asked for a specific job then check that job only # not the dir last_ran = wiki.latest_dump() if last_ran >= cutoff: wiki = None if wiki is not None and locks_enabled: locker = Locker(wiki, date) if force_lock and locks_enabled: lockfiles = locker.is_locked() locker.unlock(lockfiles, owner=False) if locks_enabled: locker.lock() else: # if the run is across all wikis and we are just doing one job, # we want the age of the wikis by the latest status update # and not the date the run started if jobs_requested is not None and jobs_requested[0] == 'createdirs': check_status_time = False # there won't actually be a status for this job but we want # to ensure that the directory and the status file are present # and intact check_job_status = True check_prereq_status = False else: check_status_time = bool(jobs_requested is not None) check_job_status = bool(skipdone) check_prereq_status = bool(jobs_requested is not None and skipdone) wiki = find_lock_next_wiki(config, locks_enabled, cutoff, prefetch, prefetchdate, spawn, dryrun, html_notice, check_status_time, check_job_status, check_prereq_status, date, jobs_todo[0] if len(jobs_todo) else None, skip_jobs, page_id_range, partnum_todo, checkpoint_file, skipdone, restart, verbose) if wiki is not None and wiki: # process any per-project configuration options config.parse_conffile_per_project(wiki.db_name) if date == 'last': dumps 
= sorted(wiki.dump_dirs()) if dumps: date = dumps[-1] else: date = None if date is None or not date: date = TimeUtils.today() wiki.set_date(date) if after_checkpoint: fname = DumpFilename(wiki) fname.new_from_filename(checkpoint_file) if not fname.is_checkpoint_file: usage("--aftercheckpoint option requires the " "name of a checkpoint file, bad filename provided") page_id_range = str(int(fname.last_page_id) + 1) partnum_todo = fname.partnum_int # now we don't need this. checkpoint_file = None after_checkpoint_jobs = ['articlesdump', 'metacurrentdump', 'metahistorybz2dump'] if (jobs_requested is None or not set(jobs_requested).issubset(set(after_checkpoint_jobs))): usage("--aftercheckpoint option requires --job option with one or more of %s" % ", ".join(after_checkpoint_jobs)) enabled = {} if enable_logging: enabled = {"logging": True} if restart: sys.stderr.write("Running %s, restarting from job %s...\n" % (wiki.db_name, jobs_todo[0])) elif jobs_requested: sys.stderr.write("Running %s, jobs %s...\n" % (wiki.db_name, jobs_requested)) else: sys.stderr.write("Running %s...\n" % wiki.db_name) # no specific jobs requested, runner will do them all if not len(jobs_todo): runner = Runner(wiki, prefetch, prefetchdate, spawn, None, skip_jobs, restart, html_notice, dryrun, enabled, partnum_todo, checkpoint_file, page_id_range, skipdone, cleanup_files, do_prereqs, verbose) result = runner.run() if result is not None and result: exitcode = 0 else: # do each job requested one at a time for job in jobs_todo: runner = Runner(wiki, prefetch, prefetchdate, spawn, job, skip_jobs, restart, html_notice, dryrun, enabled, partnum_todo, checkpoint_file, page_id_range, skipdone, cleanup_files, do_prereqs, verbose) result = runner.run() if result is not None and result: exitcode = 0 # if we are doing one piece only of the dump, we don't unlock either if locks_enabled: locker = Locker(wiki, date) lockfiles = locker.is_locked() locker.unlock(lockfiles, owner=True) elif wiki is not None: 
sys.stderr.write("Wikis available to run but prereqs not complete.\n") exitcode = 0 else: sys.stderr.write("No wikis available to run.\n") exitcode = 255 finally: cleanup() sys.exit(exitcode)