def report_previous_dump_link(self, done):
    """Produce a link to the previous dump, if any"""
    # Get the list of dumps for this wiki in order, find this run in the
    # list, and find the one previous to it.
    # Why? We might be rerunning a job from an older dump run. We might have
    # two runs going at once (think en pedia, one finishing up the history,
    # another starting at the beginning to get the new abstracts and stubs).
    try:
        dumps_in_order = self.wiki.latest_dump(return_all=True)
        me_index = dumps_in_order.index(self.wiki.date)
        # don't wrap around to the newest dump in the list!
        if me_index > 0:
            raw_date = dumps_in_order[me_index - 1]
        elif me_index == 0:
            # We are the first item in the list. This is not an error, but
            # there is no previous dump
            return "No prior dumps of this database stored."
        else:
            raise ValueError
    except Exception:
        if self.verbose:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            sys.stderr.write(repr(
                traceback.format_exception(exc_type, exc_value, exc_traceback)))
        return "No prior dumps of this database stored."
    pretty_date = TimeUtils.pretty_date(raw_date)
    if done:
        prefix = ""
        message = "Last dumped on"
    else:
        prefix = "This dump is in progress; see also the "
        message = "previous dump from"
    return "%s<a href=\"../%s/\">%s %s</a>" % (prefix, raw_date, message, pretty_date)
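# Illustrative sketch of the two link strings report_previous_dump_link() can
# produce. The run date and the pretty_date() rendering below are made-up
# placeholders, not output from the real TimeUtils.
def _previous_dump_link_demo(done, raw_date="20240301", pretty_date="2024-03-01"):
    if done:
        prefix, message = "", "Last dumped on"
    else:
        prefix, message = "This dump is in progress; see also the ", "previous dump from"
    return "%s<a href=\"../%s/\">%s %s</a>" % (prefix, raw_date, message, pretty_date)

# _previous_dump_link_demo(True) ->
#   '<a href="../20240301/">Last dumped on 2024-03-01</a>'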
def get_date(self, date):
    if date == 'last':
        dumps = sorted(self.wiki.dump_dirs())
        if dumps:
            date = dumps[-1]
        else:
            date = None
    if date is None:
        date = TimeUtils.today()
    return date
def report_failure(self):
    if self.email:
        if self.wiki.config.admin_mail and self.wiki.config.admin_mail.lower() != 'nomail':
            subject = "Dump failure for " + self.wiki.db_name
            message = self.wiki.config.read_template("errormail.txt") % {
                "db": self.wiki.db_name,
                "date": self.wiki.date,
                "time": TimeUtils.pretty_time(),
                "url": "/".join((self.wiki.config.web_root, self.wiki.db_name,
                                 self.wiki.date, ''))}
            self.mail(subject, message)
def db_info_by_age(self, use_status_time=False):
    """
    Sort wikis in reverse order of last successful dump and return
    tuples of information for each wiki:
      * whether the dump failed,
      * the date of the run as found in the dump dir string, OR as
        determined by the time of the status file if use_status_time
        is True,
      * age of the status file if any,
      * wiki name

    Sort key is (dump_failed, date, age), and False < True:
    First, wikis whose latest dump was successful, most recent dump first.
    Then, wikis whose latest dump failed, most recent dump first.
    Finally, wikis which have never been dumped.

    According to that sort, the last item of this list is, when
    applicable, the oldest failed dump attempt.

    If some error occurs checking a dump status, that dump is put last
    in the list (sort value is (True, maxsize)).

    Note that we now sort this list by the date of the dump directory,
    not the last date that a dump file in that directory may have been
    touched. This allows us to rerun jobs to completion from older
    runs, for example an en pedia history run that failed in the
    middle, without borking the index page links.
    """
    available = []
    today = int(TimeUtils.today())
    for dbname in self.db_list:
        wiki = Wiki(self, dbname)
        age = sys.maxsize
        date = sys.maxsize
        last = wiki.latest_dump()
        status = ''
        if last:
            dump_status = StatusHtml.get_statusfile_path(wiki, last)
            try:
                if use_status_time:
                    # only use the status file time, not the dir date
                    date = today
                else:
                    date = today - int(last)
                # tack on the file mtime so that if we have multiple wikis
                # dumped on the same day, they get ordered properly
                age = FileUtils.file_age(dump_status)
                status = FileUtils.read_file(dump_status)
            except Exception:
                print("dump dir missing status file %s?" % dump_status)
        dump_failed = (status == '') or ('dump aborted' in status)
        available.append((dump_failed, date, age, dbname))
    return sorted(available)
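# A minimal, self-contained sketch of how the (dump_failed, date, age, dbname)
# tuples above sort: False < True in Python, so successful dumps come first, and
# within each group a smaller date value (days since the run) means a more recent
# dump. All wiki names and numbers here are made up for illustration.
import sys

def _sort_order_demo():
    available = [
        (False, 3, 12345, "goodwiki-older"),    # succeeded three days ago
        (True, 1, 999, "badwiki-recent"),       # failed yesterday
        (False, 1, 42, "goodwiki-recent"),      # succeeded yesterday
        (True, sys.maxsize, sys.maxsize, "neverwiki"),  # never dumped / status error
    ]
    assert [name for (_, _, _, name) in sorted(available)] == [
        "goodwiki-recent", "goodwiki-older", "badwiki-recent", "neverwiki"]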
def report_statusline(wiki, status, error=False):
    """
    given a wiki name, the status (done, in progress, etc.), produce and
    return a line of html describing the status of the wiki, with a link
    to the wiki dump directory for the dump run date if appropriate
    """
    if error:
        # No state information; keep the timestamp in the markup but hide it
        stamp = ("<span style=\"visibility: hidden\">" +
                 TimeUtils.pretty_time() + "</span>")
    else:
        stamp = TimeUtils.pretty_time()
    if wiki.is_private():
        link = "%s (private data)" % wiki.db_name
    elif wiki.date:
        link = "<a href=\"%s/%s\">%s</a>" % (wiki.db_name, wiki.date, wiki.db_name)
    else:
        link = "%s (new)" % wiki.db_name
    if wiki.is_closed():
        link = link + " (closed)"
    return "<li>%s %s: %s</li>\n" % (stamp, link, status)
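# Illustrative only: the shape of the <li> line report_statusline() emits for a
# public wiki with a run date. The timestamp and names are placeholder values,
# not real TimeUtils.pretty_time() output.
def _statusline_demo():
    stamp, db_name, date, status = "2024-03-01 12:00:00", "somewiki", "20240301", "in progress"
    link = "<a href=\"%s/%s\">%s</a>" % (db_name, date, db_name)
    # -> '<li>2024-03-01 12:00:00 <a href="somewiki/20240301">somewiki</a>: in progress</li>\n'
    return "<li>%s %s: %s</li>\n" % (stamp, link, status)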
def do_all_wikis_til_done(self, num_fails, overwrite, date):
    """Run through all wikis, retrying up to num_fails times in case of error"""
    if not date:
        date = TimeUtils.today()
    fails = 0
    while True:
        self.do_all_wikis(overwrite, date)
        if not self.wikis_todo:
            break
        fails = fails + 1
        if fails > num_fails:
            raise BackupError("Too many failures, giving up")
        # wait 5 minutes and try another loop
        time.sleep(300)
def do_main():
    '''
    main entry point, do all the work
    '''
    (configfile, date, dryrun, filenameformat,
     output_dir, overwrite, wikiname, script,
     basename, query, retries, verbose, remainder) = get_args()

    validate_args(date, output_dir, retries, script, query)

    if retries is None:
        retries = "3"
    retries = int(retries)

    if configfile:
        config = Config(configfile)
    else:
        config = Config()

    if date is None:
        date = TimeUtils.today()

    if script is not None:
        runner = ScriptRunner(script, remainder, dryrun, verbose)
    else:
        if query is None:
            query = FileUtils.read_file(config.queryfile)
        runner = QueryRunner(query, dryrun, verbose)

    if basename is not None:
        base = Wiki(config, basename)
        base.set_date(date)
        base.config.parse_conffile_per_project(base.db_name)
    else:
        base = None

    if wikiname is not None:
        wiki = Wiki(config, wikiname)
        wiki.set_date(date)
        wikirunner = WikiRunner(runner, wiki, filenameformat, output_dir, base)
        wikirunner.do_one_wiki(overwrite)
    else:
        wikirunner = WikiRunnerLoop(config, runner, filenameformat, output_dir, base)
        wikirunner.do_all_wikis_til_done(retries, overwrite, date)
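# A stub-level sketch (not the real classes) of the runner-selection precedence
# in do_main(): an explicit script wins, otherwise an explicit query, otherwise
# the contents of the query file named in the config.
def _pick_runner_demo(script, query, queryfile_contents):
    if script is not None:
        return ("ScriptRunner", script)
    if query is None:
        query = queryfile_contents  # stands in for FileUtils.read_file(config.queryfile)
    return ("QueryRunner", query)

assert _pick_runner_demo("somescript.php", None, "")[0] == "ScriptRunner"
assert _pick_runner_demo(None, None, "SELECT 1;") == ("QueryRunner", "SELECT 1;")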
def set_status(self, status, set_updated=True):
    self.runinfo["status"] = status
    if set_updated:
        self.runinfo["updated"] = TimeUtils.pretty_time()
def debug(self, stuff):
    self.log_and_print("%s: %s" % (TimeUtils.pretty_time(), stuff))
def main():
    os.environ['DUMPS'] = str(os.getpid())
    try:
        date = None
        config_file = False
        force_lock = False
        prefetch = True
        prefetchdate = None
        spawn = True
        restart = False
        jobs_requested = None
        skip_jobs = None
        enable_logging = False
        html_notice = ""
        dryrun = False
        partnum_todo = None
        after_checkpoint = False
        checkpoint_file = None
        page_id_range = None
        cutoff = None
        exitcode = 1
        skipdone = False
        do_locking = False
        verbose = False
        cleanup_files = False
        do_prereqs = False

        try:
            (options, remainder) = getopt.gnu_getopt(
                sys.argv[1:], "",
                ['date=', 'job=', 'skipjobs=', 'configfile=', 'addnotice=',
                 'delnotice', 'force', 'dryrun', 'noprefetch', 'prefetchdate=',
                 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 'partnum=',
                 'checkpoint=', 'pageidrange=', 'cutoff=', 'skipdone',
                 'exclusive', 'prereqs', 'cleanup', 'verbose'])
        except Exception:
            usage("Unknown option specified")

        for (opt, val) in options:
            if opt == "--date":
                date = val
            elif opt == "--configfile":
                config_file = val
            elif opt == '--checkpoint':
                checkpoint_file = val
            elif opt == '--partnum':
                partnum_todo = int(val)
            elif opt == "--force":
                force_lock = True
            elif opt == '--aftercheckpoint':
                after_checkpoint = True
                checkpoint_file = val
            elif opt == "--noprefetch":
                prefetch = False
            elif opt == "--prefetchdate":
                prefetchdate = val
            elif opt == "--nospawn":
                spawn = False
            elif opt == "--dryrun":
                dryrun = True
            elif opt == "--job":
                jobs_requested = val
            elif opt == "--skipjobs":
                skip_jobs = val
            elif opt == "--restartfrom":
                restart = True
            elif opt == "--log":
                enable_logging = True
            elif opt == "--addnotice":
                html_notice = val
            elif opt == "--delnotice":
                html_notice = False
            elif opt == "--pageidrange":
                page_id_range = val
            elif opt == "--cutoff":
                cutoff = val
                if not cutoff.isdigit() or not len(cutoff) == 8:
                    usage("--cutoff value must be in yyyymmdd format")
            elif opt == "--skipdone":
                skipdone = True
            elif opt == "--cleanup":
                cleanup_files = True
            elif opt == "--exclusive":
                do_locking = True
            elif opt == "--verbose":
                verbose = True
            elif opt == "--prereqs":
                do_prereqs = True

        if jobs_requested is not None:
            if ',' in jobs_requested:
                jobs_todo = jobs_requested.split(',')
            else:
                jobs_todo = [jobs_requested]
        else:
            jobs_todo = []

        if dryrun and len(remainder) == 0:
            usage("--dryrun requires the name of a wikidb to be specified")
        if restart and not jobs_requested:
            usage("--restartfrom requires --job and the job from which to restart")
        if restart and len(jobs_todo) > 1:
            usage("--restartfrom requires --job and exactly one job from which to restart")
        if partnum_todo is not None and not jobs_requested:
            usage("--partnum option requires specific job(s) for which to rerun that part")
        if partnum_todo is not None and restart:
            usage("--partnum option can be specified only for a specific list of jobs")
        if checkpoint_file is not None and len(remainder) == 0:
            usage("--checkpoint option requires the name of a wikidb to be specified")
        if checkpoint_file is not None and not jobs_requested:
            usage("--checkpoint option requires --job")
        if page_id_range and not jobs_requested:
            usage("--pageidrange option requires --job")
        if page_id_range and checkpoint_file is not None:
            usage("--pageidrange option cannot be used with --checkpoint option")
        if prefetchdate is not None and not prefetch:
            usage("prefetchdate and noprefetch options may not be specified together")
        if prefetchdate is not None and (not prefetchdate.isdigit() or len(prefetchdate) != 8):
            usage("prefetchdate must be of the form YYYYMMDD")

        if skip_jobs is None:
            skip_jobs = []
        else:
            skip_jobs = skip_jobs.split(",")

        # allow alternate config file
        if config_file:
            config = Config(config_file)
        else:
            config = Config()

        externals = ['php', 'mysql', 'mysqldump', 'head', 'tail',
                     'checkforbz2footer', 'grep', 'gzip', 'bzip2',
                     'writeuptopageid', 'recompressxml', 'sevenzip', 'cat']

        failed = False
        unknowns = []
        notfound = []
        for external in externals:
            try:
                ext = getattr(config, external)
            except AttributeError:
                unknowns.append(external)
                failed = True
            else:
                if not exists(ext):
                    notfound.append(ext)
                    failed = True
        if failed:
            if unknowns:
                sys.stderr.write("Unknown config param(s): %s\n" % ", ".join(unknowns))
            if notfound:
                sys.stderr.write("Command(s) not found: %s\n" % ", ".join(notfound))
            sys.stderr.write("Exiting.\n")
            sys.exit(1)

        if (dryrun or partnum_todo is not None or
                (jobs_requested is not None and
                 not restart and not do_locking and not force_lock)):
            locks_enabled = False
        else:
            locks_enabled = True

        if dryrun:
            print("***")
            print("Dry run only, no files will be updated.")
            print("***")

        if len(remainder) > 0:
            wiki = Wiki(config, remainder[0])
            if cutoff:
                # fixme if we asked for a specific job then check that job only
                # not the dir
                last_ran = wiki.latest_dump()
                if last_ran >= cutoff:
                    wiki = None
            if wiki is not None and locks_enabled:
                locker = Locker(wiki, date)
                if force_lock and locks_enabled:
                    lockfiles = locker.is_locked()
                    locker.unlock(lockfiles, owner=False)
                if locks_enabled:
                    locker.lock()
        else:
            # if the run is across all wikis and we are just doing one job,
            # we want the age of the wikis by the latest status update
            # and not the date the run started
            if jobs_requested is not None and jobs_requested[0] == 'createdirs':
                check_status_time = False
                # there won't actually be a status for this job but we want
                # to ensure that the directory and the status file are present
                # and intact
                check_job_status = True
                check_prereq_status = False
            else:
                check_status_time = bool(jobs_requested is not None)
                check_job_status = bool(skipdone)
                check_prereq_status = bool(jobs_requested is not None and skipdone)
            wiki = find_lock_next_wiki(config, locks_enabled, cutoff, prefetch,
                                       prefetchdate, spawn, dryrun, html_notice,
                                       check_status_time, check_job_status,
                                       check_prereq_status, date,
                                       jobs_todo[0] if len(jobs_todo) else None,
                                       skip_jobs, page_id_range, partnum_todo,
                                       checkpoint_file, skipdone, restart, verbose)

        if wiki is not None and wiki:
            # process any per-project configuration options
            config.parse_conffile_per_project(wiki.db_name)

            if date == 'last':
                dumps = sorted(wiki.dump_dirs())
                if dumps:
                    date = dumps[-1]
                else:
                    date = None

            if date is None or not date:
                date = TimeUtils.today()
            wiki.set_date(date)

            if after_checkpoint:
                fname = DumpFilename(wiki)
                fname.new_from_filename(checkpoint_file)
                if not fname.is_checkpoint_file:
                    usage("--aftercheckpoint option requires the "
                          "name of a checkpoint file, bad filename provided")
                page_id_range = str(int(fname.last_page_id) + 1)
                partnum_todo = fname.partnum_int
                # now we don't need this.
                checkpoint_file = None
                after_checkpoint_jobs = ['articlesdump', 'metacurrentdump',
                                         'metahistorybz2dump']
                # compare against the job list, not the raw comma-separated string
                if (jobs_requested is None or
                        not set(jobs_todo).issubset(set(after_checkpoint_jobs))):
                    usage("--aftercheckpoint option requires --job option with one "
                          "or more of %s" % ", ".join(after_checkpoint_jobs))

            enabled = {}
            if enable_logging:
                enabled = {"logging": True}

            if restart:
                sys.stderr.write("Running %s, restarting from job %s...\n" %
                                 (wiki.db_name, jobs_todo[0]))
            elif jobs_requested:
                sys.stderr.write("Running %s, jobs %s...\n" % (wiki.db_name, jobs_requested))
            else:
                sys.stderr.write("Running %s...\n" % wiki.db_name)

            # no specific jobs requested, runner will do them all
            if not len(jobs_todo):
                runner = Runner(wiki, prefetch, prefetchdate, spawn, None, skip_jobs,
                                restart, html_notice, dryrun, enabled, partnum_todo,
                                checkpoint_file, page_id_range, skipdone,
                                cleanup_files, do_prereqs, verbose)
                result = runner.run()
                if result is not None and result:
                    exitcode = 0
            else:
                # do each job requested one at a time
                for job in jobs_todo:
                    runner = Runner(wiki, prefetch, prefetchdate, spawn, job, skip_jobs,
                                    restart, html_notice, dryrun, enabled, partnum_todo,
                                    checkpoint_file, page_id_range, skipdone,
                                    cleanup_files, do_prereqs, verbose)
                    result = runner.run()
                    if result is not None and result:
                        exitcode = 0

            # if we are doing one piece only of the dump, we don't unlock either
            if locks_enabled:
                locker = Locker(wiki, date)
                lockfiles = locker.is_locked()
                locker.unlock(lockfiles, owner=True)
        elif wiki is not None:
            sys.stderr.write("Wikis available to run but prereqs not complete.\n")
            exitcode = 0
        else:
            sys.stderr.write("No wikis available to run.\n")
            exitcode = 255
    finally:
        cleanup()
    sys.exit(exitcode)
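# Self-contained restatement of the yyyymmdd check main() applies to --cutoff
# (and, with the same shape, to --prefetchdate): the value must be exactly
# eight digits. Sample values below are for illustration only.
def _looks_like_yyyymmdd(val):
    return val.isdigit() and len(val) == 8

assert _looks_like_yyyymmdd("20240301")
assert not _looks_like_yyyymmdd("2024-03-01")
assert not _looks_like_yyyymmdd("202403")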
def debug(self, stuff):
    """
    display a debugging message with wiki name and time; log it also
    if logging is enabled
    """
    self.log_and_print("%s: %s %s" % (TimeUtils.pretty_time(), self.db_name, stuff))