Esempio n. 1
0
    def remove_symlinks_from_old_runs(self, date_string, dump_name=None, partnum=None,
                                      checkpoint=None, onlyparts=False):
        """
        remove symlinks in the 'latest' directory that point to files
        from runs dated earlier than date_string

        args:
            date_string: date of the current run; only links whose target
                         filename has an earlier date are considered
            dump_name:   if set, only links to files with this dump name
                         are removed
            partnum:     if set (or if onlyparts is True), only links to
                         files with exactly this part number are removed
            checkpoint:  if set, only links to files with this checkpoint
                         value are removed
            onlyparts:   if True, the part number comparison is done even
                         when partnum is not set

        nothing is done unless symlinks are enabled.

        fixme: this needs to do more work if there are file parts or
        checkpoint files linked in here from earlier dates. checkpoint
        ranges change, and configuration of parallel jobs for file parts
        changes too, so maybe old files still exist and the links need to
        be removed because we have newer files for the same phase of the dump.
        """
        if SymLinks.NAME not in self._enabled:
            return

        latest_dir = self.dump_dir.latest_dir()
        compare_partnum = partnum or onlyparts
        for entry in os.listdir(latest_dir):
            link_path = os.path.join(latest_dir, entry)
            if not os.path.islink(link_path):
                continue

            target = os.readlink(link_path)
            target_dfname = DumpFilename(self.dump_dir._wiki)
            target_dfname.new_from_filename(os.path.basename(target))
            if not target_dfname.date < date_string:
                # link belongs to the current (or a later) run, leave it alone
                continue

            # fixme check that these are ok if the value is None
            filtered_out = (
                (dump_name and (target_dfname.dumpname != dump_name)) or
                (compare_partnum and (target_dfname.partnum != partnum)) or
                (checkpoint and (target_dfname.checkpoint != checkpoint)))
            if filtered_out:
                continue

            self.debugfn("Removing old symlink %s -> %s" % (link_path, target))
            os.remove(link_path)
Esempio n. 2
0
 def get_per_file_path(self, htype, filename):
     """
     build and return the full public path of the per-file checksum
     file for the given checksum type and filename; these files exist
     in txt format only
     """
     checksum_dfname = DumpFilename(self.wiki, None)
     # fixme check to see if this is right or what
     checksum_basename = Checksummer.get_checksum_basename_perfile(htype, filename)
     checksum_dfname.new_from_filename(checksum_basename)
     public_path = self.dump_dir.filename_public_path(checksum_dfname)
     return public_path
Esempio n. 3
0
    def remove_symlinks_from_old_runs(self, date_string, dump_name=None, partnum=None,
                                      checkpoint=None, onlyparts=False):
        """
        Remove symlinks from the 'latest' directory for (some) links that point to
        files from other runs than the current one (of 'date_string').
        If dump_name, partnum, checkpoint are False or None, we remove all the old symlinks
        for all values of the arg in the filename.
        example: if partnum is False or None then we remove all old values for all file parts

        This needs to do more work if there are file parts or checkpoint files linked in here from
        earlier dates. checkpoint ranges change, and configuration of parallel jobs for file parts
        changes too, so maybe old files still exist and the links need to be removed because we
        have newer files for the same phase of the dump. So we treat links to files from the
        most recent older run carefully (the partnum and checkpoint filters apply to them),
        and clean up links to files from all runs before that, subject only to the
        dump_name filter. We do this because here at WMF we do partial
        and full runs alternating, and we like to keep the links to files from the full runs around
        until a new full run is in place. Really the number of keeps should be configurable
        (FIXME later I guess).

        args:
            date_string: date of the current run; only links to files with an
                         earlier date are candidates for removal
            dump_name:   if set, only links to files with this dump name are removed
            partnum:     if set (or if onlyparts is True), links to files from the
                         previous run are removed only if they have this part number
            checkpoint:  if set, links to files from the previous run are removed
                         only if they have this checkpoint value
            onlyparts:   if True, do the part number comparison even if partnum
                         is not set

        Nothing is done unless symlinks are enabled.
        """
        if SymLinks.NAME not in self._enabled:
            return

        latest_dir = self.dump_dir.latest_dir()
        files_for_cleanup = []
        for filename in os.listdir(latest_dir):
            link = os.path.join(latest_dir, filename)
            if not os.path.islink(link):
                continue
            realfilepath = os.readlink(link)
            dfname = DumpFilename(self.dump_dir._wiki)
            dfname.new_from_filename(os.path.basename(realfilepath))
            files_for_cleanup.append({'link': link, 'dfname': dfname, 'path': realfilepath})

        # os.listdir() returns entries in arbitrary order and there is one link
        # per file rather than per run, so the previous run cannot be found from
        # a position in the listing; pick the latest date before the current run
        older_dates = [item['dfname'].date for item in files_for_cleanup
                       if item['dfname'].date is not None and
                       item['dfname'].date < date_string]
        prev_run_date = max(older_dates) if older_dates else None

        for item in files_for_cleanup:
            dfname = item['dfname']
            if not dfname.date < date_string:
                continue
            if dump_name and (dfname.dumpname != dump_name):
                continue
            if prev_run_date is None or dfname.date == prev_run_date:
                # for the previous run, or the only existing run, if different
                # from the current one, we are very careful. For all older runs
                # we pretty much want to toss everything

                # fixme check that these are ok if the value is None
                if (partnum or onlyparts) and (dfname.partnum != partnum):
                    continue
                if checkpoint and (dfname.checkpoint != checkpoint):
                    continue
            self.debugfn("Removing old symlink %s -> %s" % (item['link'], item['path']))
            os.remove(item['link'])
    def write_specialfilesinfo_file(self):
        """
        collect status, size and relative web path for each of the
        special files (files that don't contain dump job output) of the
        most current dump of the wiki, possibly still in progress, and
        write that information to an output file

        files that cannot be sized are recorded with status 'missing'
        and nothing else. failure to write the output is reported via
        the error callback if there is one, otherwise to stderr, and
        is not fatal.
        """
        if SpecialFileInfo.NAME not in self._enabled:
            return

        dump_dir = DumpDir(self.wiki, self.wiki.db_name)
        fileinfo = {}
        for special_name in self.get_special_filenames():
            entry = {}
            fileinfo[special_name] = entry
            path = os.path.join(self.wiki.public_dir(), self.wiki.date, special_name)
            # assume the file is there until we fail to get its size
            entry['status'] = 'present'
            try:
                entry['size'] = os.path.getsize(path)
            except Exception:
                entry['status'] = 'missing'
                continue

            dfname = DumpFilename(self.wiki)
            dfname.new_from_filename(os.path.basename(path))
            entry['url'] = dump_dir.web_path_relative(dfname)

        contents = {'files': fileinfo, 'version': SpecialFileInfo.VERSION}

        try:
            self.write_contents(contents)
        except Exception:
            if self.verbose:
                sys.stderr.write(repr(traceback.format_exception(*sys.exc_info())))
            message = "Couldn't write special files info. Continuing anyways"
            if not self.error_callback:
                sys.stderr.write("%s\n" % message)
            else:
                self.error_callback(message)
Esempio n. 5
0
    def command_completion_callback(self, series):
        """
        once a series of commands has run successfully to completion,
        rename its output files from their temporary (in progress)
        names to their permanent names

        output is written to temporary locations first so that the
        files from each command series can be made available as soon
        as that series is done, without waiting for all of the
        parallel processes of a dump step to finish.

        files that cannot be renamed are skipped (with a traceback to
        stderr in verbose mode); files that are renamed are then
        passed to the truncation check.

        args: CommandSeries for which all commands have
              completed
        """
        if not series.exited_successfully():
            return

        for submitted in self.commands_submitted:
            if not submitted['series'] == series._command_series:
                continue
            if not submitted['output_files']:
                return

            for temp_name in submitted['output_files']:
                if not temp_name.endswith(DumpFilename.INPROG):
                    # not a temporary name, nothing to rename
                    continue

                # permanent name is the temporary name minus the in-progress suffix
                final_dfname = DumpFilename(submitted['runner'].wiki)
                final_dfname.new_from_filename(temp_name[:-len(DumpFilename.INPROG)])

                temp_path = os.path.join(submitted['output_dir'], temp_name)
                final_path = os.path.join(submitted['output_dir'], final_dfname.filename)
                try:
                    os.rename(temp_path, final_path)
                except Exception:
                    if self.verbose:
                        sys.stderr.write(repr(
                            traceback.format_exception(*sys.exc_info())))
                    continue

                # sanity check of file contents, move if bad
                self.move_if_truncated(submitted['runner'], final_dfname)
Esempio n. 6
0
    def __init__(self, wiki, prefetch=True, prefetchdate=None, spawn=True,
                 job=None, skip_jobs=None,
                 restart=False, notice="", dryrun=False, enabled=None,
                 partnum_todo=None, checkpoint_file=None, page_id_range=None,
                 skipdone=False, cleanup=False, do_prereqs=False, verbose=False):
        """
        set up everything needed for a dump run on one wiki: the run
        options, the set of enabled features, logging, the list of dump
        items and the status/index html writers

        args:
            wiki:            wiki object for this run; its db_name, config,
                             date and private_dir() are used here
            prefetch, prefetchdate, spawn, page_id_range:
                             stored and passed on to the DumpItemList
            job:             name of the one job requested, or None; the values
                             "latestlinks", "createdirs" and "noop" turn off
                             some features, and failure emails are sent only
                             when no job is requested
            skip_jobs:       list passed on to the DumpItemList, empty list
                             if None
            restart:         stored as is
            notice:          passed on to DumpRunJobData
            dryrun:          if True, most features including logging and the
                             truncated file check are turned off
            enabled:         dict of feature name -> True, or None for a new
                             dict. NOTE: the dict passed in is modified in place
            partnum_todo:    part number to do, or None; if None it is taken
                             from the checkpoint file name when that has one
            checkpoint_file: name of a checkpoint file to redo, or None; it is
                             stored as a DumpFilename
            skipdone, do_prereqs:
                             stored as is
            cleanup:         if False, the "cleanup_old_files" feature is
                             turned off
            verbose:         stored and passed on to the objects created here

        raises:
            BackupError if partnum_todo is given and differs from the part
            number in the checkpoint file name
        """
        self.wiki = wiki
        self.db_name = wiki.db_name
        self.prefetch = prefetch
        self.prefetchdate = prefetchdate
        self.spawn = spawn
        self.filepart_info = FilePartInfo(wiki, self.db_name, self.log_and_print)
        self.restart = restart
        self.html_notice_file = None
        self.log = None
        self.dryrun = dryrun
        self._partnum_todo = partnum_todo
        self.checkpoint_file = checkpoint_file
        self.page_id_range = page_id_range
        self.skipdone = skipdone
        self.verbose = verbose
        self.enabled = enabled
        self.cleanup_old_files = cleanup
        self.do_prereqs = do_prereqs

        if self.checkpoint_file is not None:
            fname = DumpFilename(self.wiki)
            fname.new_from_filename(checkpoint_file)
            # we should get file partnum if any
            if self._partnum_todo is None and fname.partnum_int:
                self._partnum_todo = fname.partnum_int
            elif (self._partnum_todo is not None and fname.partnum_int and
                  self._partnum_todo != fname.partnum_int):
                # the filename must be formatted into the message here; passed
                # as a second argument it would never be interpolated
                raise BackupError("specified partnum to do does not match part number "
                                  "of checkpoint file %s to redo" % self.checkpoint_file)
            self.checkpoint_file = fname

        if self.enabled is None:
            self.enabled = {}
        # every feature listed here is turned on regardless of what was passed
        # in; only other keys (such as "logging") keep the caller's value
        for setting in [StatusHtml.NAME, IndexHtml.NAME, Checksummer.NAME,
                        RunInfoFile.NAME, SymLinks.NAME, RunSettings.NAME,
                        Feeds.NAME, NoticeFile.NAME, "makedir", "clean_old_dumps",
                        "cleanup_old_files", "check_trunc_files"]:
            self.enabled[setting] = True

        # from here on, features are turned off by removing their keys

        if not self.cleanup_old_files:
            self.enabled.pop("cleanup_old_files", None)

        if self.dryrun or self._partnum_todo is not None or self.checkpoint_file is not None:
            for setting in [StatusHtml.NAME, IndexHtml.NAME, Checksummer.NAME,
                            RunInfoFile.NAME, SymLinks.NAME, RunSettings.NAME,
                            Feeds.NAME, NoticeFile.NAME, "makedir", "clean_old_dumps"]:
                self.enabled.pop(setting, None)

        if self.dryrun:
            for setting in ["check_trunc_files", "logging"]:
                self.enabled.pop(setting, None)

        self.job_requested = job

        if self.job_requested == "latestlinks":
            for setting in [StatusHtml.NAME, IndexHtml.NAME, RunInfoFile.NAME]:
                self.enabled.pop(setting, None)

        if self.job_requested == "createdirs":
            for setting in [SymLinks.NAME, Feeds.NAME, RunSettings.NAME]:
                self.enabled.pop(setting, None)

        if self.job_requested == "latestlinks" or self.job_requested == "createdirs":
            for setting in [Checksummer.NAME, NoticeFile.NAME, "makedir",
                            "clean_old_dumps", "check_trunc_files"]:
                self.enabled.pop(setting, None)

        if self.job_requested == "noop":
            for setting in ["clean_old_dumps", "check_trunc_files"]:
                self.enabled.pop(setting, None)

        self.skip_jobs = [] if skip_jobs is None else skip_jobs

        self.db_server_info = DbServerInfo(self.wiki, self.db_name, self.log_and_print)
        self.dump_dir = DumpDir(self.wiki, self.db_name)

        # these must come after the dumpdir setup so we know which directory we are in
        if "logging" in self.enabled and "makedir" in self.enabled:
            file_obj = DumpFilename(self.wiki)
            file_obj.new_from_filename(self.wiki.config.log_file)
            self.log_filename = self.dump_dir.filename_private_path(file_obj)
            self.make_dir(os.path.join(self.wiki.private_dir(), self.wiki.date))
            self.log = Logger(self.log_filename)
            # thread should die horribly when main script dies. no exceptions.
            self.log.daemon = True
            self.log.start()

        self.dumpjobdata = DumpRunJobData(self.wiki, self.dump_dir, notice,
                                          self.log_and_print, self.debug, self.enabled,
                                          self.verbose)

        # some or all of these dump_items will be marked to run
        self.dump_item_list = DumpItemList(self.wiki, self.prefetch, self.prefetchdate,
                                           self.spawn,
                                           self._partnum_todo, self.checkpoint_file,
                                           self.job_requested, self.skip_jobs,
                                           self.filepart_info, self.page_id_range,
                                           self.dumpjobdata, self.dump_dir, self.verbose)
        # only send email failure notices for full runs
        email = not self.job_requested
        self.failurehandler = FailureHandler(self.wiki, email)
        self.statushtml = StatusHtml(self.wiki, self.dump_dir,
                                     self.dump_item_list.dump_items,
                                     self.dumpjobdata, self.enabled,
                                     self.failurehandler,
                                     self.log_and_print, self.verbose)
        self.indexhtml = IndexHtml(self.wiki, self.dump_dir,
                                   self.dump_item_list.dump_items,
                                   self.dumpjobdata, self.enabled,
                                   self.failurehandler,
                                   self.log_and_print, self.verbose)