def find_lock_next_wiki(config, locks_enabled, cutoff, prefetch, prefetchdate,
                        spawn, dryrun, html_notice, bystatustime=False,
                        check_job_status=False, check_prereq_status=False,
                        date=None, job=None, skipjobs=None, page_id_range=None,
                        partnum_todo=None, checkpoint_file=None, skipdone=False,
                        restart=False, verbose=False):
    """
    Pick the next wiki to work on and, if locking is enabled, lock it.

    Candidates come from config.db_list_by_age(bystatustime), walked in
    reverse order.  A candidate is passed over when:
      - cutoff is set and the wiki's latest dump (or, with bystatustime,
        its latest status touch) is >= cutoff;
      - check_job_status is set and check_jobs() returns a true value
        for it;
      - locking it raises an exception.

    Args:
        config: configuration object providing db_list_by_age()
        locks_enabled: if false, the wiki is returned without locking
        cutoff: date string; wikis updated on or after it are skipped
        dryrun: accepted for interface compatibility; not read here
            (check_jobs is always called with True in that slot)
        check_prereq_status: when a wiki cannot be locked, also ask
            check_jobs(..., prereqs=True) about it
        remaining args: handed through to Wiki, Locker and check_jobs

    Returns:
        the chosen Wiki object; False if nothing was chosen but for at
        least one unlockable wiki the prereqs check returned a false value;
        None otherwise.
    """
    nextdbs = config.db_list_by_age(bystatustime)
    nextdbs.reverse()

    if verbose and not cutoff:
        sys.stderr.write("Finding oldest unlocked wiki...\n")

    # if we skip locked wikis which are missing the prereqs for this job,
    # there are still wikis where this job needs to run
    missing_prereqs = False

    for dbname in nextdbs:
        wiki = Wiki(config, dbname)

        if cutoff:
            if bystatustime:
                last_updated = wiki.date_touched_latest_dump()
            else:
                last_updated = wiki.latest_dump()
            if last_updated >= cutoff:
                continue

        if check_job_status:
            # restart is passed by keyword so that the positional arguments
            # line up exactly as in the prereqs call further down; the old
            # code slipped restart in as the 8th positional argument, which
            # shifted every argument after checkpoint_file by one slot.
            if check_jobs(wiki, date, job, skipjobs, page_id_range,
                          partnum_todo, checkpoint_file, prefetch,
                          prefetchdate, spawn, True, skipdone, verbose,
                          html_notice, restart=restart):
                continue

        try:
            if locks_enabled:
                locker = Locker(wiki, date)
                locker.lock()
            return wiki
        except Exception:
            if check_prereq_status:
                # if we skip locked wikis which are missing the prereqs
                # for this job, there are still wikis where this job
                # needs to run
                if not check_jobs(wiki, date, job, skipjobs, page_id_range,
                                  partnum_todo, checkpoint_file, prefetch,
                                  prefetchdate, spawn, True, skipdone,
                                  verbose, html_notice, prereqs=True,
                                  restart=restart):
                    missing_prereqs = True
            sys.stderr.write("Couldn't lock %s, someone else must have got it...\n"
                             % dbname)
            continue

    # False: work remains but prereqs are missing; None: nothing to do
    return False if missing_prereqs else None
def do_mark(self, wikiname): ''' mark the specified job with the specified status. ''' wiki = Wiki(self.wikiconfs[wikiname], wikiname) date = wiki.latest_dump() if date is None: print "dump never run, not marking job for wiki", wikiname return wiki.set_date(date) runner = Runner(wiki, prefetch=True, spawn=True, job=None, skip_jobs=[], restart=False, notice="", dryrun=False, enabled=None, partnum_todo=False, checkpoint_file=None, page_id_range=None, skipdone=[], cleanup=False, verbose=self.verbose) known_jobs = [item.name() for item in runner.dump_item_list.dump_items] + ['tables'] if ':' in self.job_status: job, status = self.job_status.split(":", 1) if status not in ["done", "failed"]: status = None if job not in known_jobs: job = None if job is None or status is None: print "bad or no job/status specified", self.job_status if self.verbose: print "known jobs", known_jobs return runner.dumpjobdata.do_before_dump() for item in runner.dump_item_list.dump_items: if item.name() == job: item.set_status(status, True) if item.status() == "done": runner.dumpjobdata.do_after_job(item) elif item.status() not in ["done", "waiting", "skipped"]: runner.failurehandler.failure_count += 1 if self.verbose: print "updating status files for wiki", wiki.db_name if runner.dump_item_list.all_possible_jobs_done(): # All jobs are either in status "done", "waiting", "failed", "skipped" runner.indexhtml.update_index_html("done") runner.statushtml.update_status_file("done") else: runner.indexhtml.update_index_html("partialdone") runner.statushtml.update_status_file("partialdone") runner.dumpjobdata.do_after_dump(runner.dump_item_list.dump_items) return
def do_all_wikis(self, overwrite, date):
    '''
    run the configured script/query once for every wiki still on
    self.wikis_todo, for the given date.

    each wiki whose run reports success (a true return value from
    WikiRunner.do_one_wiki) is dropped from self.wikis_todo, so that
    whatever is left afterwards can be retried by the caller
    '''
    # walk a snapshot, since entries are removed from the real list
    pending = self.wikis_todo[:]
    for name in pending:
        wiki = Wiki(self.config, name)
        wiki.set_date(date)
        one_wiki_runner = WikiRunner(self.runner, wiki, self.filenameformat,
                                     self.output_dir, self.base)
        finished = one_wiki_runner.do_one_wiki(overwrite)
        if finished:
            self.wikis_todo.remove(name)
def __init__(self, args, wikiname, flags):
    """
    args:     dict; the keys read here are "config", "date",
              "dumptype" and "args"
    wikiname: name of the wiki this object will handle
    flags:    dict; "dryrun" is read here and the whole dict is kept
    """
    self.args = args
    self.flags = flags

    # wiki object pinned to the requested run date
    wiki = Wiki(args["config"], wikiname)
    wiki.set_date(args["date"])
    self.wiki = wiki

    # look up the dumper class for this dump type and instantiate it
    dumper_class = MiscDumpFactory.get_dumper(args["dumptype"])
    self.dumper = dumper_class(wiki, flags["dryrun"], args["args"])
def do_one_wiki(self, wikiname, date=None):
    """
    build the html snippet describing one wiki's dump run, for
    insertion into the index.html file

    wikiname: name of the wiki
    date:     run date to describe; when None, the latest dump date
              reported by MiscDumpDirs is used

    returns None when the wiki is skipped or has no dump, an error
    string when gathering or formatting the information fails,
    and otherwise the assembled text
    """
    config = self.args["config"]
    if skip_wiki(wikiname, config):
        return None

    dumps_dirs = MiscDumpDirs(config, wikiname)
    if not exists(self.dumpdir.get_dumpdir_no_date(wikiname)):
        log.info("No dump for wiki %s", wikiname)
        return None

    dump_date = date if date is not None else dumps_dirs.get_latest_dump_date(True)
    if not dump_date:
        log.info("No dump for wiki %s", wikiname)
        return None

    other_runs_text = "other runs: %s<br />" % make_link(wikiname, wikiname)

    # step one: collect the raw pieces
    try:
        wiki = Wiki(config, wikiname)
        wiki.set_date(dump_date)
        files_text = self.get_files_text(wiki)
        stat_text = self.get_stat_text(dump_date, wikiname)
    except Exception as ex:
        log.warning("Error encountered, no information available"
                    " for wiki %s", wikiname, exc_info=ex)
        return ("<strong>%s</strong> Error encountered,"
                " no information available | %s" % (wikiname, other_runs_text))

    # step two: glue them together
    try:
        wikiname_text = "<strong>%s</strong>" % wikiname
        heading = [piece for piece in (wikiname_text, stat_text)
                   if piece is not None]
        wiki_info = (" ".join(heading) + "<br />"
                     + " " + "\n ".join(files_text)
                     + "\n " + other_runs_text)
    except Exception as ex:
        log.warning("Error encountered formatting information"
                    " for wiki %s", wikiname, exc_info=ex)
        return ("Error encountered formatting information"
                " for wiki %s" % wikiname)

    return wiki_info
def undo_notice(self, wikiname): ''' remove any notice.txt file that may exist for the most current run for the given wiki ''' wiki = Wiki(self.wikiconfs[wikiname], wikiname) date = wiki.latest_dump() if date is None: print "dump never run, no notice file to remove for wiki", wikiname return if self.dryrun: print "would remove notice.txt for wiki", wikiname, "date", date return elif self.verbose: print "removing notice file for wiki", wikiname, "date", date wiki.set_date(date) NoticeFile(wiki, False, True)
def do_notice(self, wikiname): ''' create a notice.txt file for the particular wiki for the most recent run. the contents will appear on its web page for that dump run ''' wiki = Wiki(self.wikiconfs[wikiname], wikiname) date = wiki.latest_dump() if date is None: print "dump never run, not adding notice file for wiki", wikiname return if self.dryrun: print "would add notice.txt for wiki", wikiname, "date", date return elif self.verbose: print "creating notice file for wiki", wikiname, "date", date wiki.set_date(date) NoticeFile(wiki, self.message, True)
def do_remove(self, rerun=False):
    '''
    find all failed dump jobs for unlocked wikis
    clean them up after getting lock on each one
    first, then remove lock

    if a specific wiki was specified at instantiation,
    clean up only that wiki

    rerun: passed through to cleanup_dump()

    a wiki/date that cannot be locked is reported on stderr and
    skipped.  once a lock has been obtained it is always released
    again, even if the cleanup itself raises.
    '''
    failed_dumps = self.find_failed_dumps()
    for wikiname in failed_dumps:
        for date in failed_dumps[wikiname]:
            wiki = Wiki(self.wikiconfs[wikiname], wikiname)
            wiki.set_date(date)
            locker = Locker(wiki, date)
            try:
                locker.lock()
            except Exception:
                sys.stderr.write("Couldn't lock %s, can't do cleanup\n" % wikiname)
                continue
            try:
                self.cleanup_dump(wiki, failed_dumps[wikiname][date], rerun=rerun)
            finally:
                # don't leave our lock behind if the cleanup blows up
                locker.unlock(locker.get_lock_file_path())
def find_failed_dumps_for_wiki(self, wikiname):
    '''
    look up the failed jobs of the latest run of the given wiki.

    returns a tuple (names of failed jobs, run date).
    if the wiki has never been dumped, or no run info could be
    read for the latest run, the result is ([], None); if run info
    exists but nothing failed, it is ([], run date)
    '''
    wiki = Wiki(self.wikiconfs[wikiname], wikiname)
    latest = wiki.latest_dump()
    if latest is None:
        return [], None

    wiki.set_date(latest)
    entries = RunInfoFile(wiki, False).get_old_runinfo_from_file()
    if not entries:
        return [], None

    failed = [entry["name"] for entry in entries
              if entry["status"] == "failed"]
    return failed, latest
def do_main():
    '''
    entry point: read and validate the command line arguments, set up
    the script or query runner, then run it either on the one wiki
    named on the command line or on all wikis, retrying as configured
    '''
    (configfile, date, dryrun, filenameformat,
     output_dir, overwrite, wikiname, script,
     basename, query, retries, verbose, remainder) = get_args()

    validate_args(date, output_dir, retries, script, query)

    # retries arrives as a string (or not at all); default is 3
    retries = int("3" if retries is None else retries)

    config = Config(configfile) if configfile else Config()

    if date is None:
        date = TimeUtils.today()

    if script is None:
        # no script given: run a query, read from the configured
        # query file unless one was supplied
        if query is None:
            query = FileUtils.read_file(config.queryfile)
        runner = QueryRunner(query, dryrun, verbose)
    else:
        runner = ScriptRunner(script, remainder, dryrun, verbose)

    base = None
    if basename is not None:
        base = Wiki(config, basename)
        base.set_date(date)
        base.config.parse_conffile_per_project(base.db_name)

    if wikiname is None:
        looper = WikiRunnerLoop(config, runner, filenameformat,
                                output_dir, base)
        looper.do_all_wikis_til_done(retries, overwrite, date)
    else:
        wiki = Wiki(config, wikiname)
        wiki.set_date(date)
        single = WikiRunner(runner, wiki, filenameformat, output_dir, base)
        single.do_one_wiki(overwrite)
def main(): os.environ['DUMPS'] = str(os.getpid()) try: date = None config_file = False force_lock = False prefetch = True prefetchdate = None spawn = True restart = False jobs_requested = None skip_jobs = None enable_logging = False html_notice = "" dryrun = False partnum_todo = None after_checkpoint = False checkpoint_file = None page_id_range = None cutoff = None exitcode = 1 skipdone = False do_locking = False verbose = False cleanup_files = False do_prereqs = False try: (options, remainder) = getopt.gnu_getopt( sys.argv[1:], "", ['date=', 'job=', 'skipjobs=', 'configfile=', 'addnotice=', 'delnotice', 'force', 'dryrun', 'noprefetch', 'prefetchdate=', 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 'partnum=', 'checkpoint=', 'pageidrange=', 'cutoff=', "skipdone", "exclusive", "prereqs", "cleanup", 'verbose']) except Exception as ex: usage("Unknown option specified") for (opt, val) in options: if opt == "--date": date = val elif opt == "--configfile": config_file = val elif opt == '--checkpoint': checkpoint_file = val elif opt == '--partnum': partnum_todo = int(val) elif opt == "--force": force_lock = True elif opt == '--aftercheckpoint': after_checkpoint = True checkpoint_file = val elif opt == "--noprefetch": prefetch = False elif opt == "--prefetchdate": prefetchdate = val elif opt == "--nospawn": spawn = False elif opt == "--dryrun": dryrun = True elif opt == "--job": jobs_requested = val elif opt == "--skipjobs": skip_jobs = val elif opt == "--restartfrom": restart = True elif opt == "--log": enable_logging = True elif opt == "--addnotice": html_notice = val elif opt == "--delnotice": html_notice = False elif opt == "--pageidrange": page_id_range = val elif opt == "--cutoff": cutoff = val if not cutoff.isdigit() or not len(cutoff) == 8: usage("--cutoff value must be in yyyymmdd format") elif opt == "--skipdone": skipdone = True elif opt == "--cleanup": cleanup_files = True elif opt == "--exclusive": do_locking = True elif opt == "--verbose": verbose = 
True elif opt == "--prereqs": do_prereqs = True if jobs_requested is not None: if ',' in jobs_requested: jobs_todo = jobs_requested.split(',') else: jobs_todo = [jobs_requested] else: jobs_todo = [] if dryrun and (len(remainder) == 0): usage("--dryrun requires the name of a wikidb to be specified") if restart and not jobs_requested: usage("--restartfrom requires --job and the job from which to restart") if restart and len(jobs_todo) > 1: usage("--restartfrom requires --job and exactly one job from which to restart") if partnum_todo is not None and not jobs_requested: usage("--partnum option requires specific job(s) for which to rerun that part") if partnum_todo is not None and restart: usage("--partnum option can be specified only for a specific list of jobs") if checkpoint_file is not None and (len(remainder) == 0): usage("--checkpoint option requires the name of a wikidb to be specified") if checkpoint_file is not None and not jobs_requested: usage("--checkpoint option requires --job") if page_id_range and not jobs_requested: usage("--pageidrange option requires --job") if page_id_range and checkpoint_file is not None: usage("--pageidrange option cannot be used with --checkpoint option") if prefetchdate is not None and not prefetch: usage("prefetchdate and noprefetch options may not be specified together") if prefetchdate is not None and (not prefetchdate.isdigit() or len(prefetchdate) != 8): usage("prefetchdate must be of the form YYYYMMDD") if skip_jobs is None: skip_jobs = [] else: skip_jobs = skip_jobs.split(",") # allow alternate config file if config_file: config = Config(config_file) else: config = Config() externals = ['php', 'mysql', 'mysqldump', 'head', 'tail', 'checkforbz2footer', 'grep', 'gzip', 'bzip2', 'writeuptopageid', 'recompressxml', 'sevenzip', 'cat'] failed = False unknowns = [] notfound = [] for external in externals: try: ext = getattr(config, external) except AttributeError as ex: unknowns.append(external) failed = True else: if not 
exists(ext): notfound.append(ext) failed = True if failed: if unknowns: sys.stderr.write("Unknown config param(s): %s\n" % ", ".join(unknowns)) if notfound: sys.stderr.write("Command(s) not found: %s\n" % ", ".join(notfound)) sys.stderr.write("Exiting.\n") sys.exit(1) if (dryrun or partnum_todo is not None or (jobs_requested is not None and not restart and not do_locking and not force_lock)): locks_enabled = False else: locks_enabled = True if dryrun: print "***" print "Dry run only, no files will be updated." print "***" if len(remainder) > 0: wiki = Wiki(config, remainder[0]) if cutoff: # fixme if we asked for a specific job then check that job only # not the dir last_ran = wiki.latest_dump() if last_ran >= cutoff: wiki = None if wiki is not None and locks_enabled: locker = Locker(wiki, date) if force_lock and locks_enabled: lockfiles = locker.is_locked() locker.unlock(lockfiles, owner=False) if locks_enabled: locker.lock() else: # if the run is across all wikis and we are just doing one job, # we want the age of the wikis by the latest status update # and not the date the run started if jobs_requested is not None and jobs_requested[0] == 'createdirs': check_status_time = False # there won't actually be a status for this job but we want # to ensure that the directory and the status file are present # and intact check_job_status = True check_prereq_status = False else: check_status_time = bool(jobs_requested is not None) check_job_status = bool(skipdone) check_prereq_status = bool(jobs_requested is not None and skipdone) wiki = find_lock_next_wiki(config, locks_enabled, cutoff, prefetch, prefetchdate, spawn, dryrun, html_notice, check_status_time, check_job_status, check_prereq_status, date, jobs_todo[0] if len(jobs_todo) else None, skip_jobs, page_id_range, partnum_todo, checkpoint_file, skipdone, restart, verbose) if wiki is not None and wiki: # process any per-project configuration options config.parse_conffile_per_project(wiki.db_name) if date == 'last': dumps 
= sorted(wiki.dump_dirs()) if dumps: date = dumps[-1] else: date = None if date is None or not date: date = TimeUtils.today() wiki.set_date(date) if after_checkpoint: fname = DumpFilename(wiki) fname.new_from_filename(checkpoint_file) if not fname.is_checkpoint_file: usage("--aftercheckpoint option requires the " "name of a checkpoint file, bad filename provided") page_id_range = str(int(fname.last_page_id) + 1) partnum_todo = fname.partnum_int # now we don't need this. checkpoint_file = None after_checkpoint_jobs = ['articlesdump', 'metacurrentdump', 'metahistorybz2dump'] if (jobs_requested is None or not set(jobs_requested).issubset(set(after_checkpoint_jobs))): usage("--aftercheckpoint option requires --job option with one or more of %s" % ", ".join(after_checkpoint_jobs)) enabled = {} if enable_logging: enabled = {"logging": True} if restart: sys.stderr.write("Running %s, restarting from job %s...\n" % (wiki.db_name, jobs_todo[0])) elif jobs_requested: sys.stderr.write("Running %s, jobs %s...\n" % (wiki.db_name, jobs_requested)) else: sys.stderr.write("Running %s...\n" % wiki.db_name) # no specific jobs requested, runner will do them all if not len(jobs_todo): runner = Runner(wiki, prefetch, prefetchdate, spawn, None, skip_jobs, restart, html_notice, dryrun, enabled, partnum_todo, checkpoint_file, page_id_range, skipdone, cleanup_files, do_prereqs, verbose) result = runner.run() if result is not None and result: exitcode = 0 else: # do each job requested one at a time for job in jobs_todo: runner = Runner(wiki, prefetch, prefetchdate, spawn, job, skip_jobs, restart, html_notice, dryrun, enabled, partnum_todo, checkpoint_file, page_id_range, skipdone, cleanup_files, do_prereqs, verbose) result = runner.run() if result is not None and result: exitcode = 0 # if we are doing one piece only of the dump, we don't unlock either if locks_enabled: locker = Locker(wiki, date) lockfiles = locker.is_locked() locker.unlock(lockfiles, owner=True) elif wiki is not None: 
sys.stderr.write("Wikis available to run but prereqs not complete.\n") exitcode = 0 else: sys.stderr.write("No wikis available to run.\n") exitcode = 255 finally: cleanup() sys.exit(exitcode)
class MiscDumpOne(object):
    """
    run a dump of the specified type on one wiki for one date

    args are keyword args converted to a dict; the keys used
    here are "config", "date", "dumptype" and "args", the last
    of which is passed through to the class for the specific dump
    you want

    flags is a dict; the keys used here are "dryrun", "forcerun"
    and "do_index"
    """
    def __init__(self, args, wikiname, flags):
        self.args = args
        self.wiki = Wiki(self.args["config"], wikiname)
        self.wiki.set_date(self.args["date"])
        self.flags = flags
        dump_class = MiscDumpFactory.get_dumper(self.args["dumptype"])
        self.dumper = dump_class(self.wiki, flags["dryrun"], self.args["args"])

    def _release(self, lock):
        """
        give back the lock if we took one (no lock is taken on dry runs)
        """
        if lock is not None:
            lock.unlock_if_owner()

    def do_one_wiki(self):
        """
        run dump of specified type for one wiki, for given date
        unless it is among the wikis we skip, has already been run
        for the date, or some other process has the lock and is
        therefore presumably already dumping it

        returns STATUS_GOOD if the dump completed, was already
        complete, or the wiki is one we skip; STATUS_TODO if
        someone else holds the lock; STATUS_FAILED if the run, the
        md5sums or anything else in the run went wrong.

        any lock taken here is released again on every failure path
        as well as on success.
        """
        if not skip_wiki(self.wiki.db_name, self.wiki.config):
            dumpdir = MiscDumpDir(self.args["config"], self.args["date"])
            if not exists(dumpdir.get_dumpdir(self.wiki.db_name)):
                os.makedirs(dumpdir.get_dumpdir(self.wiki.db_name))

            status_info = StatusInfo(self.args["config"], self.wiki.date,
                                     self.wiki.db_name)
            status = status_info.get_status()
            if status == "done:all" and not self.flags["forcerun"]:
                log.info("wiki %s skipped, adds/changes dump already"
                         " complete", self.wiki.db_name)
                return STATUS_GOOD

            # stays None on dry runs, which never lock anything
            lock = None
            if not self.flags["dryrun"]:
                lock = MiscDumpLock(self.args["config"], self.wiki.date,
                                    self.wiki.db_name)

                # if lock is stale, remove it
                lock.remove_if_stale(self.wiki.config.lock_stale)

                # try to get the lock ourselves
                if not lock.get_lock():
                    log.info(
                        "wiki %s skipped, wiki is locked,"
                        " another process should be doing the job",
                        self.wiki.db_name
                    )
                    return STATUS_TODO

                self.dumper.set_lockinfo(lock)
                dumps_dirs = MiscDumpDirs(self.wiki.config, self.wiki.db_name)
                dumps_dirs.cleanup_old_dumps(self.wiki.date)

            log.info("Doing run for wiki: %s", self.wiki.db_name)

            try:
                result = self.dumper.run()
                if not result:
                    # previously the lock was left in place here
                    self._release(lock)
                    return STATUS_FAILED

                if not self.flags["dryrun"]:
                    output_files, expected = self.dumper.get_output_files()
                    if not md5sums(self.wiki, self.wiki.config.fileperms,
                                   output_files, expected):
                        # and here
                        self._release(lock)
                        return STATUS_FAILED
                    status_info.set_status("done:" + self.dumper.get_steps_done())
                    lock.unlock_if_owner()

                if self.flags["do_index"]:
                    index = Index(self.args)
                    index.do_all_wikis()
            except Exception as ex:
                log.warning("error from dump run"
                            " for wiki %s", self.wiki.db_name, exc_info=ex)
                self._release(lock)
                return STATUS_FAILED
        log.info("Success! Wiki %s %s dump complete.",
                 self.wiki.db_name, self.args["dumptype"])
        return STATUS_GOOD