Python DumpFile.find_first_page_id_in_fileの例

プログラミング言語: Python

名前空間/パッケージ名: dumps.fileutils

クラス/型: DumpFile

メソッド/関数: find_first_page_id_in_file

hotexamples.comのコード掲載数: 3

Python DumpFile.find_first_page_id_in_file - 3件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのdumps.fileutils.DumpFile.find_first_page_id_in_fileの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示/非表示

find_first_page_id_in_file(3)

setup_uncompression_command(2)

check_if_empty(1)

check_if_truncated(1)

checksum(1)

rename(1)

コード例 #1

ファイルを表示

 def has_no_entries(self, xmlfile, runner):
     '''
     Return True when no page id can be found in the given xml file,
     False otherwise.

     Temp files are looked up under the wiki's configured temp dir;
     all other files are looked up via the runner's public dump path
     for the current wiki date.
     '''
     path = (os.path.join(self.wiki.config.temp_dir, xmlfile.filename)
             if xmlfile.is_temp_file
             else runner.dump_dir.filename_public_path(xmlfile, self.wiki.date))
     dump_file = DumpFile(self.wiki, path, xmlfile, self.verbose)
     first_page_id = dump_file.find_first_page_id_in_file()
     return first_page_id is None

コード例 #2

ファイルを表示

    def get_relevant_prefetch_files(self, file_list, start_page_id, end_page_id, date, runner):
        """
        Pick the files from file_list whose page ranges may overlap the
        requested page range, so they can be used as prefetch sources.

        Args:
            file_list: file objects exposing filename, is_file_part,
                partnum_int, first_page_id and last_page_id. Missing
                first_page_id / last_page_id values are filled in on
                these objects as a side effect.
            start_page_id: first page id wanted (anything int() accepts)
            end_page_id: last page id wanted, or None for no upper bound
            date: passed to runner.dump_dir.filename_public_path() when a
                file has to be opened to discover its first page id
            runner: provides dump_dir.filename_public_path() and debug()

        Returns:
            list of the file objects, in file_list order, that may cover
            pages in the requested range. Files that cannot be interpreted
            are reported through runner.debug() and skipped.
        """
        possibles = []
        if not file_list:
            return possibles

        # (a) nasty hack, see below (b)
        # find the highest part number, and make sure every file knows
        # the first page id it contains (reading the file if needed)
        maxparts = 0
        for file_obj in file_list:
            if file_obj.is_file_part and file_obj.partnum_int > maxparts:
                maxparts = file_obj.partnum_int
            if not file_obj.first_page_id:
                dump_file = DumpFile(
                    self.wiki, runner.dump_dir.filename_public_path(file_obj, date),
                    file_obj, self.verbose)
                file_obj.first_page_id = dump_file.find_first_page_id_in_file()

        # get the files that cover our range
        for file_obj in file_list:
            # If some of the file_objs in file_list could not be parsed properly,
            # some of the int() conversions below will fail. It is of little use
            # to us which conversion failed: if any of them fails, we do not know
            # how to make sense of the current file_obj, so we cannot use it for
            # prefetch and we drop it rather than pass a useless file to the text
            # pass. (Reading such a file could take days, see the FIXME below; the
            # price of skipping it is fetching more texts from the database.)
            #
            # Therefore wrapping the whole block in try...except is sufficient:
            # whatever error occurs, we do not abort, we just skip the file.
            try:
                first_page_id_in_file = int(file_obj.first_page_id)

                # fixme what do we do here? this could be very expensive. is that worth it??
                if not file_obj.last_page_id:
                    # (b) nasty hack, see (a)
                    # it's not a checkpoint file or we'd have the pageid in the filename
                    # so... temporary hack which will give expensive results
                    # if file part, and it's the last one, put none
                    # if it's not the last part, get the first pageid in the next
                    #  part and subtract 1
                    # if not file part, put none.
                    if file_obj.is_file_part and file_obj.partnum_int < maxparts:
                        for other in file_list:
                            if other.partnum_int == file_obj.partnum_int + 1:
                                # not true!  this could be a few past where it really is
                                # (because of deleted pages that aren't included at all)
                                file_obj.last_page_id = str(int(other.first_page_id) - 1)
                if file_obj.last_page_id:
                    last_page_id_in_file = int(file_obj.last_page_id)
                else:
                    last_page_id_in_file = None

                # FIXME there is no point in including files that have just a
                # few rev ids in them that we need, and having to read through
                # the whole file... could take hours or days (later it won't matter,
                # right? but until a rewrite, this is important)
                # also be sure that if a critical page is deleted by the time we
                # try to figure out ranges, that we don't get hosed
                #
                # keep the file if it starts at or before our start page and
                # does not end before it, or if it starts inside our range
                if ((first_page_id_in_file <= int(start_page_id) and
                     (last_page_id_in_file is None or
                      last_page_id_in_file >= int(start_page_id))) or
                        (first_page_id_in_file >= int(start_page_id) and
                         (end_page_id is None or
                          first_page_id_in_file <= int(end_page_id)))):
                    possibles.append(file_obj)
            except Exception as ex:
                # deliberate best effort: report why the file was unusable
                # and carry on with the remaining candidates
                runner.debug(
                    "Couldn't process %s for prefetch. Format update? Corrupt file? (%s)"
                    % (file_obj.filename, ex))
        return possibles

コード例 #3

ファイルを表示

    def run(self, runner):
        """
        Work out which output files still need to be produced, prepare
        the stub files that feed them, and run the commands that
        generate them.

        Steps, in order:
          - clean up old files via self.cleanup_old_files()
          - build the todo list: the single checkpoint file if one was
            requested (directly or via self.page_id_range); otherwise
            the regular output files, narrowed down either by comparing
            stub page ranges against existing checkpoint files (when
            checkpoints are enabled) or by skipping output files that
            already exist on disk
          - for each todo file pick the matching stub; files with a
            first page id get a partial stub written to a temp file,
            which is kept only if it contains entries
          - build one command series per stub and run them all

        Returns without running anything if there are no stubs to process.

        Args:
            runner: supplies dump_dir, wiki and run_command()

        Raises:
            BackupError: if runner.run_command() reports an error
        """
        # here we will either clean up or not depending on how we were called FIXME
        self.cleanup_old_files(runner.dump_dir, runner)
        commands = []

        todo = []

        if self.page_id_range is not None:
            # convert to checkpoint filename, handle the same way
            self.checkpoint_file = self.get_chkptfile_from_pageids()

        if self.checkpoint_file:
            # a specific checkpoint file was asked for: that is all we produce
            todo = [self.checkpoint_file]
        else:
            # list all the output files that would be produced w/o
            # checkpoint files on
            outfiles = self.get_reg_files_for_filepart_possible(
                runner.dump_dir, self.get_fileparts_list(), self.list_dumpnames())
            if self._checkpoints_enabled:

                # get the stub list that would be used for the current run
                stubs = self.get_stub_files(runner)
                stubs = sorted(stubs, key=lambda thing: thing.filename)

                # get the page ranges covered by stubs
                # each entry is (first page id, last page id, part number)
                stub_ranges = []
                for stub in stubs:
                    fname = DumpFile(self.wiki,
                                     runner.dump_dir.filename_public_path(stub, stub.date),
                                     stub, self.verbose)
                    stub_ranges.append((fname.find_first_page_id_in_file(),
                                        self.find_last_page_id(stub, runner), stub.partnum))

                # get list of existing checkpoint files
                chkpt_files = self.list_checkpt_files(
                    runner.dump_dir, [self.dumpname], runner.wiki.date, parts=None)
                chkpt_files = sorted(chkpt_files, key=lambda thing: thing.filename)
                # get the page ranges covered by existing checkpoint files
                # same (first, last, partnum) layout as stub_ranges
                checkpoint_ranges = [(chkptfile.first_page_id, chkptfile.last_page_id,
                                      chkptfile.partnum)
                                     for chkptfile in chkpt_files]
                if self.verbose:
                    # NOTE: Python 2 print statements
                    print "checkpoint_ranges is", checkpoint_ranges
                    print "stub_ranges is", stub_ranges

                if not checkpoint_ranges:
                    # no page ranges covered by checkpoints. do all output files
                    # the usual way
                    todo = outfiles
                else:
                    todo = []
                    parts = self.get_fileparts_list()
                    for partnum in parts:
                        # the list below is empty when no checkpoint range
                        # carries this part number
                        if not [int(chkpt_range[2]) for chkpt_range in checkpoint_ranges
                                if int(chkpt_range[2]) == int(partnum)]:
                            # no page ranges covered by checkpoints for a particular
                            # file part (subjob) so do that output file the
                            # regular way
                            todo.extend([outfile for outfile in outfiles
                                         if int(outfile.partnum) == int(partnum)])

                    # add a checkpoint file for every page range that
                    # find_missing_ranges() reports as missing
                    missing = self.find_missing_ranges(stub_ranges, checkpoint_ranges)
                    todo.extend([self.chkpt_file_from_page_range((first, last), partnum)
                                 for (first, last, partnum) in missing])

            else:
                # do the missing files only
                # FIXME public or private depending on the wiki!
                todo = [outfile for outfile in outfiles
                        if not os.path.exists(runner.dump_dir.filename_public_path(outfile))]

        partial_stubs = []
        if self.verbose:
            print "todo is", [to.filename for to in todo]
        for fileobj in todo:

            # first stub returned for this file's part number
            stub_for_file = self.get_stub_files(runner, fileobj.partnum_int)[0]

            if fileobj.first_page_id is None:
                # no page range on the output file: use the stub as is
                partial_stubs.append(stub_for_file)
            else:
                # output file has a page range: describe a temp stub file
                # named with that same range
                stub_output_file = DumpFilename(
                    self.wiki, fileobj.date, fileobj.dumpname,
                    self.item_for_stubs.get_filetype(),
                    self.item_for_stubs.get_file_ext(),
                    fileobj.partnum,
                    DumpFilename.make_checkpoint_string(
                        fileobj.first_page_id, fileobj.last_page_id), temp=True)

                self.write_partial_stub(stub_for_file, stub_output_file, runner)
                # skip partial stubs in which no page id can be found
                if not self.has_no_entries(stub_output_file, runner):
                    partial_stubs.append(stub_output_file)

        if self.verbose:
            print "partial_stubs is", [ps.filename for ps in partial_stubs]
        if partial_stubs:
            stub_files = partial_stubs
        else:
            # nothing to generate
            return

        # one command series per stub file, all handed to the runner together
        for stub_file in stub_files:
            series = self.build_command(runner, stub_file)
            commands.append(series)

        error = runner.run_command(commands, callback_stderr=self.progress_callback,
                                   callback_stderr_arg=runner)
        if error:
            raise BackupError("error producing xml file(s) %s" % self.dumpname)