def parse_conffile(self):
    '''
    grab values from the configuration and assign them to the appropriate variables
    '''
    self.wiki_dir = self.conf.get("wiki", "mediawiki")
    self.all_wikis_list = MiscUtils.db_list(self.conf.get("wiki", "allwikislist"))
    self.private_wikis_list = MiscUtils.db_list(self.conf.get("wiki", "privatewikislist"))
    self.closed_wikis_list = MiscUtils.db_list(self.conf.get("wiki", "closedwikislist"))
    self.skip_wikis_list = MiscUtils.db_list(self.conf.get("wiki", "skipwikislist"))

    if not self.conf.has_section('output'):
        self.conf.add_section('output')
    self.dump_dir = self.conf.get("output", "dumpdir")
    self.temp_dir = self.conf.get("output", "temp")
    self.indextmpl = self.conf.get("output", "indextmpl")
    self.template_dir = self.conf.get("output", "templatedir")
    self.webroot = self.conf.get("output", "webroot")
    fileperms = self.conf.get("output", "fileperms")
    self.fileperms = int(fileperms, 0)
    lock_stale = self.conf.get("output", "lockstale")
    self.lock_stale = int(lock_stale, 0)

    if not self.conf.has_section('tools'):
        self.conf.add_section('tools')
    self.php = self.conf.get("tools", "php")
    self.gzip = self.conf.get("tools", "gzip")
    self.bzip2 = self.conf.get("tools", "bzip2")
    self.mysql = self.conf.get("tools", "mysql")
    self.checkforbz2footer = self.conf.get("tools", "checkforbz2footer")
    self.multiversion = self.conf.get("tools", "multiversion")
    self.adminsettings = self.conf.get("tools", "adminsettings")

    if not self.conf.has_section('cleanup'):
        self.conf.add_section('cleanup')
    self.keep = self.conf.getint("cleanup", "keep")

    self.db_user = None
    self.db_password = None
    if not self.conf.has_section('database'):
        self.conf.add_section('database')
    if self.conf.has_option('database', 'user'):
        self.db_user = self.conf.get("database", "user")
    if self.conf.has_option('database', 'password'):
        self.db_password = self.conf.get("database", "password")
    # get from MW adminsettings file if not set in conf file
    if not self.db_user:
        self.db_user, self.db_password = Config.get_db_user_and_password(
            self.conf, self.wiki_dir)
    self.max_allowed_packet = self.conf.get("database", "max_allowed_packet")
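# A minimal sketch of a wikidump.conf that parse_conffile() above would accept.
# The section and option names come straight from the code; all paths and
# values here are made-up placeholders, not suggested defaults.
#
#   [wiki]
#   mediawiki=/srv/mediawiki
#   allwikislist=/srv/dumps/dblists/all.dblist
#   privatewikislist=/srv/dumps/dblists/private.dblist
#   closedwikislist=/srv/dumps/dblists/closed.dblist
#   skipwikislist=/srv/dumps/dblists/skip.dblist
#
#   [output]
#   dumpdir=/srv/dumps/public
#   temp=/srv/dumps/temp
#   indextmpl=index.html.tmpl
#   templatedir=/srv/dumps/templates
#   webroot=http://localhost/dumps
#   fileperms=0644
#   lockstale=3600
#
#   [tools]
#   php=/usr/bin/php
#   gzip=/bin/gzip
#   bzip2=/bin/bzip2
#   mysql=/usr/bin/mysql
#   checkforbz2footer=/usr/local/bin/checkforbz2footer
#   multiversion=/srv/mediawiki/multiversion
#   adminsettings=private/AdminSettings.php
#
#   [cleanup]
#   keep=10
#
#   [database]
#   max_allowed_packet=32M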
def main():
    'main entry point, does all the work'
    wiki = None
    output_file = None
    start = None
    end = None
    configfile = "wikidump.conf"
    dryrun = False

    try:
        (options, remainder) = getopt.gnu_getopt(
            sys.argv[1:], "w:o:s:e:C:fhv",
            ["wiki=", "outfile=", "start=", "end=", "config=", "help", "dryrun"])
    except getopt.GetoptError as err:
        usage("Unknown option specified: " + str(err))

    for (opt, val) in options:
        if opt in ["-w", "--wiki"]:
            wiki = val
        elif opt in ["-o", "--outfile"]:
            output_file = val
        elif opt in ["-s", "--start"]:
            start = val
        elif opt in ["-e", "--end"]:
            end = val
        elif opt in ["-C", "--config"]:
            configfile = val
        elif opt in ["-d", "--dryrun"]:
            dryrun = True
        elif opt in ["-h", "--help"]:
            usage('Help for this script\n')
        else:
            usage("Unknown option specified: <%s>" % opt)

    if len(remainder) > 0:
        usage("Unknown option(s) specified: <%s>" % remainder[0])

    if wiki is None:
        usage("mandatory argument missing: --wiki")
    if output_file is None:
        usage("mandatory argument missing: --outfile")

    if start is not None:
        if not start.isdigit():
            usage("value for --start must be a number")
        else:
            start = int(start)

    if end is not None:
        if not end.isdigit():
            usage("value for --end must be a number")
        else:
            end = int(end) - 1

    if not os.path.exists(configfile):
        usage("no such file found: " + configfile)

    wikiconf = Config(configfile)
    wikiconf.parse_conffile_per_project(wiki)
    dologsbackup(wiki, output_file, wikiconf, start, end, dryrun)
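# Example command lines for the entry point above (a sketch: the script
# filename "dumplogs.py" is an assumption, but the flags mirror the getopt
# spec in main()):
#
#   python dumplogs.py --wiki enwiki --outfile /backups/enwiki-logging.gz \
#          --start 1 --end 50000 --config /etc/dumps/wikidump.conf
#
#   python dumplogs.py --wiki elwiktionary --outfile /tmp/logs.gz --dryrun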
class ActionHandler(object):
    '''
    methods for all actions, whether on one wiki or on all
    '''
    def __init__(self, actions, show, message, job_status, undo,
                 configfile, wikiname, dryrun, verbose):
        '''
        constructor. reads configs for every wiki; this might be wasteful,
        but really, how long can it take, even with 1k wikis?
        '''
        self.verbose = verbose
        if not actions and not undo:
            if self.verbose:
                sys.stderr.write("No actions specified.\n")
            return
        self.actions = actions
        self.undo = undo
        self.dryrun = dryrun
        self.wikiname = wikiname
        self.configfile = configfile
        self.message = message
        self.show = show
        self.job_status = job_status
        self.conf = Config(self.configfile)
        if self.wikiname is None:
            self.wikilist = self.conf.db_list
        else:
            self.wikilist = [self.wikiname]
        self.wikiconfs = {}
        for wiki in self.wikilist:
            self.wikiconfs[wiki] = self.get_wiki_config(wiki)

    def get_wiki_config(self, wikiname):
        '''
        parse and return the configuration for a particular wiki
        '''
        wikiconf = Config(self.configfile)
        wikiconf.parse_conffile_per_project(wikiname)
        return wikiconf

    def do_all(self):
        '''
        do all actions specified at instantiation time
        '''
        self.conf.parse_conffile_globally()
        self.do_global_actions()
        self.undo_global_actions()
        self.do_per_wiki_actions()
        self.undo_per_wiki_actions()

    def do_global_actions(self):
        '''
        do all actions that either do not reference a particular wiki
        (maintenance, exit) or may run on one or all wikis
        '''
        for item in self.actions:
            if item == "kill":
                self.do_kill()
            elif item == "unlock":
                self.do_unlock()
            elif item == "remove":
                self.do_remove()
            elif item == "rerun":
                self.do_rerun()
            elif item == "maintenance":
                self.do_maintenance()
            elif item == "exit":
                self.do_exit()
            elif item == "show":
                self.do_show()

    def do_per_wiki_actions(self):
        '''
        do all actions that must reference only one wiki
        '''
        for item in self.actions:
            if item == "notice":
                for wiki in self.wikiconfs:
                    self.do_notice(wiki)
            elif item == "mark":
                self.do_mark(self.wikiname)

    def undo_global_actions(self):
        '''
        undo all specified actions that do not reference a particular wiki
        '''
        for item in self.undo:
            if item == "maintenance":
                self.undo_maintenance()
            elif item == "exit":
                self.undo_exit()

    def undo_per_wiki_actions(self):
        '''
        undo all specified actions that must reference a particular wiki
        '''
        for wiki in self.wikiconfs:
            for item in self.undo:
                if item == "notice":
                    self.undo_notice(wiki)

    def get_dump_pids(self):
        '''
        get the list of pids which are running dumps, either for one wiki or
        for all; these must have been started by the scheduler, the bash
        wrapper or the worker.py script, i.e. if a user runs dumpBackups.php
        by hand that is not going to be picked up.

        don't rely on lock files, they may have been removed or never created;
        instead, look up processes with the DUMPS environ var set.
        values: 'scheduler' (the dumps scheduler)
                'wrapper'   (the bash dumps wrapper that runs across all wikis)
                pid         (the worker that runs on one wiki and any processes it spawned)
        we want at all costs to avoid a hardcoded list of commands
        '''
        pids = []
        uid = os.geteuid()
        for process_id in os.listdir('/proc'):
            if process_id.isdigit():
                # owned by us
                puid = os.stat(os.path.join('/proc', process_id)).st_uid
                if puid == uid:
                    # has DUMPS environ var
                    try:
                        process_environ = open("/proc/%s/environ" % process_id, "r")
                    except IOError as ex:
                        # permission denied or process gone; either way, not us
                        continue
                    for line in process_environ:
                        if line:
                            fields = line.split("\x00")
                            for field in fields:
                                if field.startswith("DUMPS="):
                                    # if no wiki was specified, get procs for all wikis
                                    if self.wikiname is None or command_has_wiki(
                                            process_id, self.wikiname):
                                        pids.append(process_id)
                                    break
                    process_environ.close()
        return pids

    def do_kill(self):
        '''
        kill all dump related processes for the wiki specified at instantiation
        or for all wikis; good only for processes started by the scheduler, the
        bash wrapper script or the python worker script
        '''
        pids = self.get_dump_pids()
        if self.dryrun:
            print "would kill processes", pids
            return
        elif self.verbose:
            print "killing these processes:", pids
        for pid in pids:
            os.kill(int(pid), signal.SIGTERM)

    def do_unlock(self):
        '''
        unlock either the wiki specified at instantiation or all wikis,
        provided they were locked on the current host
        '''
        lock_info = self.find_dump_lockinfo()
        for wiki in lock_info:
            for lockfile_content in lock_info[wiki]:
                if check_process_running(lockfile_content['pid']):
                    continue
                if self.dryrun:
                    print "would remove lock", lockfile_content['filename'], "for wiki", wiki
                else:
                    if self.verbose:
                        print "removing lock for", wiki
                    os.unlink(lockfile_content['filename'])

    def find_failed_dumps_for_wiki(self, wikiname):
        '''
        return the list of failed jobs for the latest run for the specified
        wiki, or the empty list if there are none
        '''
        failed_jobs = []
        # fixme how is the above a string, shouldn't it be a function?
        wiki = Wiki(self.wikiconfs[wikiname], wikiname)
        date = wiki.latest_dump()
        if date is None:
            return [], None
        wiki.set_date(date)
        run_info_file = RunInfoFile(wiki, False)
        results = run_info_file.get_old_runinfo_from_file()
        if not results:
            return [], None
        for entry in results:
            if entry["status"] == "failed":
                failed_jobs.append(entry["name"])
        return failed_jobs, date

    def find_failed_dumps(self):
        '''
        return dict of failed jobs per wiki during the most recent run,
        skipping over wikis with no failed jobs
        '''
        failed_dumps = {}
        for wiki in self.wikilist:
            results, date = self.find_failed_dumps_for_wiki(wiki)
            if results and date is not None:
                failed_dumps[wiki] = {}
                failed_dumps[wiki][date] = results
        if self.verbose:
            print "failed dumps info:", failed_dumps
        return failed_dumps

    def do_rerun(self):
        '''
        clean up failed dump jobs and rerun them
        '''
        self.do_remove(rerun=True)

    def do_remove(self, rerun=False):
        '''
        find all failed dump jobs for unlocked wikis and clean them up,
        taking the lock on each wiki first and removing the lock afterwards.
        if a specific wiki was specified at instantiation, clean up only that wiki
        '''
        failed_dumps = self.find_failed_dumps()
        for wikiname in failed_dumps:
            for date in failed_dumps[wikiname]:
                wiki = Wiki(self.wikiconfs[wikiname], wikiname)
                wiki.set_date(date)
                locker = Locker(wiki, date)
                try:
                    locker.lock()
                except Exception as ex:
                    sys.stderr.write("Couldn't lock %s, can't do cleanup\n" % wikiname)
                    continue
                self.cleanup_dump(wiki, failed_dumps[wikiname][date], rerun=rerun)
                locker.unlock(locker.get_lock_file_path())

    def cleanup_dump(self, wiki, failed_jobs, rerun=False):
        '''
        for the specified wiki and the given list of failed jobs, find all the
        output files and toss them, then rebuild: the md5sums file, the
        symlinks into the latest dir, and the dump run info file
        '''
        # need to update status files, dumpruninfo, checksums file
        # and latest links.
        runner = Runner(wiki, prefetch=True, spawn=True, job=None, skip_jobs=[],
                        restart=False, notice="", dryrun=False, enabled=None,
                        partnum_todo=False, checkpoint_file=None, page_id_range=None,
                        skipdone=[], cleanup=False, verbose=self.verbose)
        if not failed_jobs:
            if self.verbose:
                print "no failed jobs for wiki", wiki
            return
        if not self.dryrun:
            runner.dumpjobdata.do_before_dump()
            # need to redo the md5sums of the files we don't toss...
            # so they are copied into the temp file. eeewww
        for job in failed_jobs:
            files = get_job_output_files(wiki, job, runner.dump_item_list.dump_items)
            paths = [runner.dump_dir.filename_public_path(fileinfo) for fileinfo in files]
            if self.verbose:
                print "for job", job, "these are the output files:", paths
            for filename in paths:
                if self.dryrun:
                    print "would unlink", filename
                else:
                    try:
                        os.unlink(filename)
                    except Exception as ex:
                        continue
        if not self.dryrun:
            for item in runner.dump_item_list.dump_items:
                if item.status() == "done":
                    runner.dumpjobdata.do_after_job(item)
        if self.dryrun:
            print "would update dumpruninfo file, checksums file, ",
            print "status file, index.html file and symlinks to latest dir"
            return
        runner.dumpjobdata.do_after_dump(runner.dump_item_list.dump_items)
        if self.verbose:
            print "updating status files for wiki", wiki.db_name
        if runner.dump_item_list.all_possible_jobs_done():
            # all jobs are in status "done", "waiting", "failed" or "skipped"
            runner.indexhtml.update_index_html("done")
            runner.statushtml.update_status_file("done")
        else:
            runner.indexhtml.update_index_html("partialdone")
            runner.statushtml.update_status_file("partialdone")
        if rerun:
            for job in failed_jobs:
                runner.dump_item_list.mark_dumps_to_run(job)
            self.rerun_jobs(runner)

    def log_and_print(self, message):
        sys.stderr.write("%s\n" % message)

    def debug(self, stuff):
        self.log_and_print("%s: %s" % (TimeUtils.pretty_time(), stuff))

    def rerun_jobs(self, runner):
        runner.dumpjobdata.do_before_dump()
        for item in runner.dump_item_list.dump_items:
            if item.to_run():
                item.start()
                runner.indexhtml.update_index_html()
                runner.statushtml.update_status_file()
                runner.dumpjobdata.do_before_job(runner.dump_item_list.dump_items)
                try:
                    item.dump(runner)
                except Exception as ex:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    if self.verbose:
                        sys.stderr.write(repr(traceback.format_exception(
                            exc_type, exc_value, exc_traceback)))
                    else:
                        if exc_type.__name__ == 'BackupPrereqError':
                            self.debug(str(ex))
                        else:
                            self.debug("*** exception! " + str(ex))
                    if exc_type.__name__ != 'BackupPrereqError':
                        item.set_status("failed")
                # here, for example, the status is "failed"; but it may also be
                # "in-progress" if an item overrides dump(...) and forgets to
                # set the status. that counts as a failure as well.
                if item.status() not in ["done", "waiting", "skipped"]:
                    runner.failurehandler.report_failure()
                    runner.failurehandler.failure_count += 1
                if item.status() == "done":
                    runner.dumpjobdata.do_after_job(item)
                elif item.status() == "waiting" or item.status() == "skipped":
                    continue
                else:
                    # failure
                    continue
        if runner.dump_item_list.all_possible_jobs_done():
            # all jobs are in status "done", "waiting", "failed" or "skipped"
            runner.indexhtml.update_index_html_file("done")
            runner.statushtml.update_status_file("done")
        else:
            # this may happen if we start a dump now and abort before all items
            # are done; then some are left, for example, in state "waiting".
            # when afterwards running a specific job, all (but one) of the jobs
            # previously in "waiting" are still in status "waiting"
            runner.indexhtml.update_index_html("partialdone")
            runner.statushtml.update_status_file("partialdone")
        runner.dumpjobdata.do_after_dump(runner.dump_item_list.dump_items)

    def do_maintenance(self):
        '''
        create an empty maintenance.txt file; this causes the dump runners,
        after the next job, to run no jobs per wiki and to sleep 5 minutes in
        between each wiki. this is a global action that affects all wikis run
        on the given host
        '''
        if self.dryrun:
            print "would create maintenance file"
            return
        elif self.verbose:
            print "creating maintenance file"
        create_file("maintenance.txt")

    def do_exit(self):
        '''
        create an empty exit.txt file; this causes the dump runners to exit
        after the next job. this is a global action that affects all wikis run
        on the given host
        '''
        if self.dryrun:
            print "would create exit file"
            return
        elif self.verbose:
            print "creating exit file"
        create_file("exit.txt")

    def do_show(self):
        '''
        show the specified information for all wikis
        '''
        if self.show == 'lastrun':
            dbinfo = self.conf.db_latest_status()
            dbdates = [date for (_dbname, _status, date) in dbinfo if date is not None]
            dbdates = sorted(dbdates)
            if not len(dbdates):
                print ""
            else:
                print dbdates[-1]
        elif self.show == "alldone":
            dbinfo = self.conf.db_latest_status()
            # skip cases where there is no status file. maybe we will revisit this later
            statuses = [status for (_dbname, status, _date) in dbinfo if status is not None]
            for status in statuses:
                if status != "complete":
                    print ""
                    break
            else:
                print "True"
        elif (self.show in ["failed", "aborted", "missing", "progress",
                            "partial", "complete", "not yet"]):
            dbinfo = self.conf.db_latest_status()
            # skip cases where there is no status file. maybe we will revisit this later
            dbs_to_show = [dbname for (dbname, status, date) in dbinfo
                           if status == self.show]
            if dbs_to_show:
                print dbs_to_show
        else:
            print "No such known element for 'show'"

    def do_notice(self, wikiname):
        '''
        create a notice.txt file for the particular wiki for the most recent
        run; its contents will appear on the web page for that dump run
        '''
        wiki = Wiki(self.wikiconfs[wikiname], wikiname)
        date = wiki.latest_dump()
        if date is None:
            print "dump never run, not adding notice file for wiki", wikiname
            return
        if self.dryrun:
            print "would add notice.txt for wiki", wikiname, "date", date
            return
        elif self.verbose:
            print "creating notice file for wiki", wikiname, "date", date
        wiki.set_date(date)
        NoticeFile(wiki, self.message, True)

    def do_mark(self, wikiname):
        '''
        mark the specified job with the specified status
        '''
        wiki = Wiki(self.wikiconfs[wikiname], wikiname)
        date = wiki.latest_dump()
        if date is None:
            print "dump never run, not marking job for wiki", wikiname
            return
        wiki.set_date(date)
        runner = Runner(wiki, prefetch=True, spawn=True, job=None, skip_jobs=[],
                        restart=False, notice="", dryrun=False, enabled=None,
                        partnum_todo=False, checkpoint_file=None, page_id_range=None,
                        skipdone=[], cleanup=False, verbose=self.verbose)
        known_jobs = [item.name() for item in runner.dump_item_list.dump_items] + ['tables']
        job = None
        status = None
        if ':' in self.job_status:
            job, status = self.job_status.split(":", 1)
            if status not in ["done", "failed"]:
                status = None
            if job not in known_jobs:
                job = None
        if job is None or status is None:
            print "bad or no job/status specified", self.job_status
            if self.verbose:
                print "known jobs", known_jobs
            return
        runner.dumpjobdata.do_before_dump()
        for item in runner.dump_item_list.dump_items:
            if item.name() == job:
                item.set_status(status, True)
                if item.status() == "done":
                    runner.dumpjobdata.do_after_job(item)
                elif item.status() not in ["done", "waiting", "skipped"]:
                    runner.failurehandler.failure_count += 1
        if self.verbose:
            print "updating status files for wiki", wiki.db_name
        if runner.dump_item_list.all_possible_jobs_done():
            # all jobs are in status "done", "waiting", "failed" or "skipped"
            runner.indexhtml.update_index_html("done")
            runner.statushtml.update_status_file("done")
        else:
            runner.indexhtml.update_index_html("partialdone")
            runner.statushtml.update_status_file("partialdone")
        runner.dumpjobdata.do_after_dump(runner.dump_item_list.dump_items)
        return

    def undo_maintenance(self):
        '''
        remove any maintenance.txt file that may exist, resuming normal operations
        '''
        if self.dryrun:
            print "would remove maintenance file"
            return
        elif self.verbose:
            print "removing maintenance file"
        remove_file("maintenance.txt")

    def undo_exit(self):
        '''
        remove any exit.txt file that may exist, resuming normal operations
        '''
        if self.dryrun:
            print "would remove exit file"
            return
        elif self.verbose:
            print "removing exit file"
        remove_file("exit.txt")

    def undo_notice(self, wikiname):
        '''
        remove any notice.txt file that may exist for the most current run
        for the given wiki
        '''
        wiki = Wiki(self.wikiconfs[wikiname], wikiname)
        date = wiki.latest_dump()
        if date is None:
            print "dump never run, no notice file to remove for wiki", wikiname
            return
        if self.dryrun:
            print "would remove notice.txt for wiki", wikiname, "date", date
            return
        elif self.verbose:
            print "removing notice file for wiki", wikiname, "date", date
        wiki.set_date(date)
        NoticeFile(wiki, False, True)

    def find_dump_lockinfo(self):
        '''
        get and return host, pid and lockfile name for the wiki specified at
        instantiation or for all wikis, for lockfiles created on the current host
        '''
        my_hostname = socket.getfqdn()
        lockfiles = []
        results = {}
        if self.wikiname is not None:
            lockfiles = glob.glob(os.path.join(self.wikiconfs[self.wikiname].private_dir,
                                               self.wikiname, "lock_*"))
        else:
            lockfiles = glob.glob(os.path.join(self.conf.private_dir, "*", "lock_*"))
        for filename in lockfiles:
            host, pid = get_lockfile_content(filename)
            wiki = self.get_wiki_from_lockfilename(filename)
            if host == my_hostname:
                if wiki not in results:
                    results[wiki] = []
                results[wiki].append({'pid': pid, 'host': host, 'filename': filename})
        return results

    def get_wiki_from_lockfilename(self, filename):
        '''
        given the full lockfile name, grab the wiki name out of it and return it
        '''
        if "lock" in filename and filename.startswith(self.conf.private_dir):
            filename = filename[len(self.conf.private_dir):]
            filename = filename.lstrip(os.path.sep)
            # wikiname/lock_...
            wikiname = filename.split(os.path.sep)[0]
            return wikiname
        else:
            return None
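# Minimal usage sketch for ActionHandler (illustrative only; the keyword names
# match the constructor signature above, but the argument values here are
# hypothetical):
#
#   handler = ActionHandler(actions=["unlock", "show"], show="lastrun",
#                           message="", job_status="", undo=[],
#                           configfile="wikidump.conf", wikiname="enwiki",
#                           dryrun=True, verbose=True)
#   handler.do_all()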
def main():
    os.environ['DUMPS'] = str(os.getpid())
    try:
        date = None
        config_file = False
        force_lock = False
        prefetch = True
        prefetchdate = None
        spawn = True
        restart = False
        jobs_requested = None
        skip_jobs = None
        enable_logging = False
        html_notice = ""
        dryrun = False
        partnum_todo = None
        after_checkpoint = False
        checkpoint_file = None
        page_id_range = None
        cutoff = None
        exitcode = 1
        skipdone = False
        do_locking = False
        verbose = False
        cleanup_files = False
        do_prereqs = False

        try:
            (options, remainder) = getopt.gnu_getopt(
                sys.argv[1:], "",
                ['date=', 'job=', 'skipjobs=', 'configfile=', 'addnotice=',
                 'delnotice', 'force', 'dryrun', 'noprefetch', 'prefetchdate=',
                 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 'partnum=',
                 'checkpoint=', 'pageidrange=', 'cutoff=', "skipdone",
                 "exclusive", "prereqs", "cleanup", 'verbose'])
        except Exception as ex:
            usage("Unknown option specified")

        for (opt, val) in options:
            if opt == "--date":
                date = val
            elif opt == "--configfile":
                config_file = val
            elif opt == '--checkpoint':
                checkpoint_file = val
            elif opt == '--partnum':
                partnum_todo = int(val)
            elif opt == "--force":
                force_lock = True
            elif opt == '--aftercheckpoint':
                after_checkpoint = True
                checkpoint_file = val
            elif opt == "--noprefetch":
                prefetch = False
            elif opt == "--prefetchdate":
                prefetchdate = val
            elif opt == "--nospawn":
                spawn = False
            elif opt == "--dryrun":
                dryrun = True
            elif opt == "--job":
                jobs_requested = val
            elif opt == "--skipjobs":
                skip_jobs = val
            elif opt == "--restartfrom":
                restart = True
            elif opt == "--log":
                enable_logging = True
            elif opt == "--addnotice":
                html_notice = val
            elif opt == "--delnotice":
                html_notice = False
            elif opt == "--pageidrange":
                page_id_range = val
            elif opt == "--cutoff":
                cutoff = val
                if not cutoff.isdigit() or not len(cutoff) == 8:
                    usage("--cutoff value must be in yyyymmdd format")
            elif opt == "--skipdone":
                skipdone = True
            elif opt == "--cleanup":
                cleanup_files = True
            elif opt == "--exclusive":
                do_locking = True
            elif opt == "--verbose":
                verbose = True
            elif opt == "--prereqs":
                do_prereqs = True

        if jobs_requested is not None:
            if ',' in jobs_requested:
                jobs_todo = jobs_requested.split(',')
            else:
                jobs_todo = [jobs_requested]
        else:
            jobs_todo = []

        if dryrun and (len(remainder) == 0):
            usage("--dryrun requires the name of a wikidb to be specified")
        if restart and not jobs_requested:
            usage("--restartfrom requires --job and the job from which to restart")
        if restart and len(jobs_todo) > 1:
            usage("--restartfrom requires --job and exactly one job from which to restart")
        if partnum_todo is not None and not jobs_requested:
            usage("--partnum option requires specific job(s) for which to rerun that part")
        if partnum_todo is not None and restart:
            usage("--partnum option can be specified only for a specific list of jobs")
        if checkpoint_file is not None and (len(remainder) == 0):
            usage("--checkpoint option requires the name of a wikidb to be specified")
        if checkpoint_file is not None and not jobs_requested:
            usage("--checkpoint option requires --job")
        if page_id_range and not jobs_requested:
            usage("--pageidrange option requires --job")
        if page_id_range and checkpoint_file is not None:
            usage("--pageidrange option cannot be used with --checkpoint option")
        if prefetchdate is not None and not prefetch:
            usage("prefetchdate and noprefetch options may not be specified together")
        if prefetchdate is not None and (not prefetchdate.isdigit() or len(prefetchdate) != 8):
            usage("prefetchdate must be of the form YYYYMMDD")

        if skip_jobs is None:
            skip_jobs = []
        else:
            skip_jobs = skip_jobs.split(",")

        # allow alternate config file
        if config_file:
            config = Config(config_file)
        else:
            config = Config()

        externals = ['php', 'mysql', 'mysqldump', 'head', 'tail',
                     'checkforbz2footer', 'grep', 'gzip', 'bzip2',
                     'writeuptopageid', 'recompressxml', 'sevenzip', 'cat']

        failed = False
        unknowns = []
        notfound = []
        for external in externals:
            try:
                ext = getattr(config, external)
            except AttributeError as ex:
                unknowns.append(external)
                failed = True
            else:
                if not exists(ext):
                    notfound.append(ext)
                    failed = True
        if failed:
            if unknowns:
                sys.stderr.write("Unknown config param(s): %s\n" % ", ".join(unknowns))
            if notfound:
                sys.stderr.write("Command(s) not found: %s\n" % ", ".join(notfound))
            sys.stderr.write("Exiting.\n")
            sys.exit(1)

        if (dryrun or partnum_todo is not None or
                (jobs_requested is not None and
                 not restart and not do_locking and not force_lock)):
            locks_enabled = False
        else:
            locks_enabled = True

        if dryrun:
            print "***"
            print "Dry run only, no files will be updated."
            print "***"

        if len(remainder) > 0:
            wiki = Wiki(config, remainder[0])
            if cutoff:
                # fixme if we asked for a specific job then check that job only,
                # not the dir
                last_ran = wiki.latest_dump()
                if last_ran >= cutoff:
                    wiki = None
            if wiki is not None and locks_enabled:
                locker = Locker(wiki, date)
                if force_lock and locks_enabled:
                    lockfiles = locker.is_locked()
                    locker.unlock(lockfiles, owner=False)
                if locks_enabled:
                    locker.lock()
        else:
            # if the run is across all wikis and we are just doing one job,
            # we want the age of the wikis by the latest status update
            # and not the date the run started
            if jobs_requested is not None and jobs_todo[0] == 'createdirs':
                check_status_time = False
                # there won't actually be a status for this job but we want
                # to ensure that the directory and the status file are present
                # and intact
                check_job_status = True
                check_prereq_status = False
            else:
                check_status_time = bool(jobs_requested is not None)
                check_job_status = bool(skipdone)
                check_prereq_status = bool(jobs_requested is not None and skipdone)
            wiki = find_lock_next_wiki(config, locks_enabled, cutoff, prefetch,
                                       prefetchdate, spawn, dryrun, html_notice,
                                       check_status_time, check_job_status,
                                       check_prereq_status, date,
                                       jobs_todo[0] if len(jobs_todo) else None,
                                       skip_jobs, page_id_range, partnum_todo,
                                       checkpoint_file, skipdone, restart, verbose)

        if wiki is not None and wiki:
            # process any per-project configuration options
            config.parse_conffile_per_project(wiki.db_name)

            if date == 'last':
                dumps = sorted(wiki.dump_dirs())
                if dumps:
                    date = dumps[-1]
                else:
                    date = None

            if date is None or not date:
                date = TimeUtils.today()
            wiki.set_date(date)

            if after_checkpoint:
                fname = DumpFilename(wiki)
                fname.new_from_filename(checkpoint_file)
                if not fname.is_checkpoint_file:
                    usage("--aftercheckpoint option requires the "
                          "name of a checkpoint file, bad filename provided")
                page_id_range = str(int(fname.last_page_id) + 1)
                partnum_todo = fname.partnum_int
                # now we don't need this.
                checkpoint_file = None
                after_checkpoint_jobs = ['articlesdump', 'metacurrentdump',
                                         'metahistorybz2dump']
                if (jobs_requested is None or
                        not set(jobs_todo).issubset(set(after_checkpoint_jobs))):
                    usage("--aftercheckpoint option requires --job option with one or more of %s"
                          % ", ".join(after_checkpoint_jobs))

            enabled = {}
            if enable_logging:
                enabled = {"logging": True}

            if restart:
                sys.stderr.write("Running %s, restarting from job %s...\n" %
                                 (wiki.db_name, jobs_todo[0]))
            elif jobs_requested:
                sys.stderr.write("Running %s, jobs %s...\n" % (wiki.db_name, jobs_requested))
            else:
                sys.stderr.write("Running %s...\n" % wiki.db_name)

            # no specific jobs requested, the runner will do them all
            if not len(jobs_todo):
                runner = Runner(wiki, prefetch, prefetchdate, spawn, None, skip_jobs,
                                restart, html_notice, dryrun, enabled, partnum_todo,
                                checkpoint_file, page_id_range, skipdone, cleanup_files,
                                do_prereqs, verbose)
                result = runner.run()
                if result is not None and result:
                    exitcode = 0
            else:
                # do each requested job, one at a time
                for job in jobs_todo:
                    runner = Runner(wiki, prefetch, prefetchdate, spawn, job, skip_jobs,
                                    restart, html_notice, dryrun, enabled, partnum_todo,
                                    checkpoint_file, page_id_range, skipdone, cleanup_files,
                                    do_prereqs, verbose)
                    result = runner.run()
                    if result is not None and result:
                        exitcode = 0

            # if we are doing one piece only of the dump, we don't unlock either
            if locks_enabled:
                locker = Locker(wiki, date)
                lockfiles = locker.is_locked()
                locker.unlock(lockfiles, owner=True)
        elif wiki is not None:
            sys.stderr.write("Wikis available to run but prereqs not complete.\n")
            exitcode = 0
        else:
            sys.stderr.write("No wikis available to run.\n")
            exitcode = 255
    finally:
        cleanup()
        sys.exit(exitcode)
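# Example invocations of this entry point (a sketch: the module name
# "worker.py" is an assumption; the flags match the getopt spec above, and
# the trailing argument is the wikidb name taken from 'remainder'):
#
#   python worker.py --configfile /etc/dumps/wikidump.conf --date last \
#          --job articlesdump,metacurrentdump --skipdone --log enwiki
#
#   python worker.py --dryrun --verbose elwiktionary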