def copy_parsed(self, basefile, instance):
        """Link (or copy) the parsed output of *instance* into our own store.

        The distilled file, the parsed file and every parsed attachment
        listed by ``instance.store`` for *basefile* are handed to
        ``util.link_or_copy``. Nothing is done when both the distilled
        and the parsed file in our store are already reported as newer
        than their counterparts in ``instance.store``.

        :param basefile: identifier of the document to transfer
        :param instance: object whose ``store`` holds the source files
        """
        source, dest = instance.store, self.store

        # Fresh distilled + parsed files are taken as a sign that the
        # attachments are fine as well, so they are not examined.
        distilled_fresh = util.outfile_is_newer(
            [source.distilled_path(basefile)], dest.distilled_path(basefile))
        if distilled_fresh and util.outfile_is_newer(
                [source.parsed_path(basefile)], dest.parsed_path(basefile)):
            self.log.debug("%s: Attachments are (likely) up-to-date" % basefile)
            return

        util.link_or_copy(source.distilled_path(basefile),
                          dest.distilled_path(basefile))
        util.link_or_copy(source.parsed_path(basefile),
                          dest.parsed_path(basefile))

        # enumerate(..., 1) leaves the number of attachments in ``linked``
        # (or 0 when there were none).
        linked = 0
        for linked, attachment in enumerate(
                source.list_attachments(basefile, "parsed"), 1):
            util.link_or_copy(
                source.parsed_path(basefile, attachment=attachment),
                dest.parsed_path(basefile, attachment=attachment))

        if linked:
            from_dir = os.path.dirname(source.parsed_path(basefile))
            to_dir = os.path.dirname(dest.parsed_path(basefile))
            self.log.debug("%s: Linked %s attachments from %s to %s" %
                           (basefile, linked, from_dir, to_dir))
Beispiel #2
0
    def needed(self, basefile, action):
        """Determine whether *action* has to be performed for *basefile*.

        :param basefile: identifier of the document to check
        :param action: ``"parse"``, ``"relate"``, ``"generate"`` or the
                       name of a custom action
        :returns: for ``"parse"`` and ``"generate"``, ``True`` when the
                  output file is *not* reported as newer than its
                  inputs; for ``"relate"``, a ``Relate`` object with
                  one flag per relate step; ``True`` for any other
                  action
        """
        # if this function is even called, it means that force is not
        # true (or ferenda-build.py has not been called with a single
        # basefile, which is an implied force)
        if action == "parse":
            infile = self.downloaded_path(basefile)
            outfile = self.parsed_path(basefile)
            return not util.outfile_is_newer([infile], outfile)
        elif action == "relate":
            entry = DocumentEntry(self.documententry_path(basefile))

            def newer(filename, dt):
                # True when filename exists and was modified after dt
                # (or when dt is unset).
                if not os.path.exists(filename):
                    return False
                elif not dt:  # has never been indexed
                    return True
                else:
                    return datetime.fromtimestamp(
                        os.stat(filename).st_mtime) > dt

            # NOTE(review): the dependencies flag is computed from
            # distilled_path, not dependencies_path -- confirm that this
            # is intended.
            return Relate(fulltext=newer(self.parsed_path(basefile),
                                         entry.indexed_ft),
                          triples=newer(self.distilled_path(basefile),
                                        entry.indexed_ts),
                          dependencies=newer(self.distilled_path(basefile),
                                             entry.indexed_dep))
        elif action == "generate":
            infile = self.parsed_path(basefile)
            annotations = self.annotation_path(basefile)
            # The dependencies file lists one dependency path per line.
            if os.path.exists(self.dependencies_path(basefile)):
                deptxt = util.readfile(self.dependencies_path(basefile))
                dependencies = deptxt.strip().split("\n")
            else:
                dependencies = []
            dependencies.extend((infile, annotations))
            outfile = self.generated_path(basefile)
            # The action is needed when the generated file is NOT newer
            # than everything it depends on (same polarity as "parse").
            return not util.outfile_is_newer(dependencies, outfile)
        else:
            # custom actions will need to override needed and provide logic there
            return True
Beispiel #3
0
    def copy_parsed(self, basefile, instance):
        """Link (or copy) the parsed output of *instance* into our own store.

        The document entry, the distilled file and the parsed file for
        *basefile* are handed to ``util.link_or_copy``; when the source
        store uses the ``"dir"`` storage policy, every parsed attachment
        is transferred as well. Unless ``self.config.force`` is set,
        nothing is done when both our distilled and our parsed file are
        already reported as newer than their source counterparts.

        :param basefile: identifier of the document to transfer
        :param instance: object whose ``store`` holds the source files
        """
        source, dest = instance.store, self.store

        # Fresh distilled + parsed files are taken as a sign that all
        # external resources are fine as well.
        if not self.config.force:
            if (util.outfile_is_newer([source.distilled_path(basefile)],
                                      dest.distilled_path(basefile)) and
                    util.outfile_is_newer([source.parsed_path(basefile)],
                                          dest.parsed_path(basefile))):
                self.log.debug("%s: Attachments are (likely) up-to-date" %
                               basefile)
                return

        for src_path, dst_path in (
                (source.documententry_path, dest.documententry_path),
                (source.distilled_path, dest.distilled_path),
                (source.parsed_path, dest.parsed_path)):
            util.link_or_copy(src_path(basefile), dst_path(basefile))

        # Attachments are only looked for under the "dir" storage policy.
        if source.storage_policy != "dir":
            return

        linked = 0
        for linked, attachment in enumerate(
                source.list_attachments(basefile, "parsed"), 1):
            util.link_or_copy(
                source.parsed_path(basefile, attachment=attachment),
                dest.parsed_path(basefile, attachment=attachment))

        if linked:
            from_dir = os.path.dirname(source.parsed_path(basefile))
            to_dir = os.path.dirname(dest.parsed_path(basefile))
            self.log.debug(
                "%s: Linked %s attachments from %s to %s" %
                (basefile, linked, from_dir, to_dir))
Beispiel #4
0
 def wrapper(self, doc):
     """Call the wrapped function only when the parsed file is stale.

     Returns ``True`` without calling ``f`` when neither
     ``config.force`` nor ``config.parseforce`` is ``True`` and the
     parsed file is reported as newer than the downloaded file;
     otherwise returns whatever ``f(self, doc)`` returns.
     """
     # note: .downloaded_path, .parsed_path and the 'parseforce'
     # config option are hardcoded here, so this decorator is only
     # meaningful around the .parse() function.
     source = self.store.downloaded_path(doc.basefile)
     result = self.store.parsed_path(doc.basefile)
     forced = self.config.force is True or self.config.parseforce is True
     if forced or not util.outfile_is_newer([source], result):
         self.log.debug("%s: Starting", doc.basefile)
         return f(self, doc)
     self.log.debug("%s: Skipped", doc.basefile)
     return True  # Signals that everything is OK
Beispiel #5
0
 def getconfig(self, configfile, depth):
     """Return the name of a config file usable *depth* directories down.

     For ``depth == 0`` this is *configfile* itself. Otherwise it is a
     sibling file named ``<base>-depth-<depth><ext>``, which is
     (re)written unless it is already reported as newer than
     *configfile*. In the rewritten copy, relative ``href``/``src``
     links of the selected nodes are prefixed with one ``../`` per
     level.
     """
     if depth == 0:
         return configfile

     base, ext = os.path.splitext(configfile)
     filename = "%(base)s-depth-%(depth)d%(ext)s" % {
         "base": base, "depth": depth, "ext": ext}
     if util.outfile_is_newer([configfile], filename):
         return filename

     tree = etree.parse(configfile)
     # (xpath, attribute) pairs identifying the links to adjust
     link_attributes = (("stylesheets/link", "href"),
                        ("javascripts/script", "src"),
                        (".//img", "src"))
     for xpath, attrib in link_attributes:
         for node in tree.findall(xpath):
             target = node.get(attrib)
             # absolute links (http://, https:// or /...) stay untouched
             if re.match("(https?://|/)", target):
                 continue
             node.set(attrib, "../" * depth + target)
     tree.write(filename)
     return filename
Beispiel #6
0
 def getconfig(self, configfile, depth):
     """Return the name of a config file usable *depth* directories down.

     For ``depth == 0`` this is *configfile* itself. Otherwise it is a
     sibling file named ``<base>-depth-<depth><ext>``, which is
     (re)written unless it is already reported as newer than
     *configfile*. In the rewritten copy, relative ``href``/``src``
     links of the selected nodes are prefixed with one ``../`` per
     level.
     """
     if depth == 0:
         return configfile

     base, ext = os.path.splitext(configfile)
     filename = "%(base)s-depth-%(depth)d%(ext)s" % {
         "base": base, "depth": depth, "ext": ext}
     if util.outfile_is_newer([configfile], filename):
         return filename

     tree = etree.parse(configfile)
     # (xpath, attribute) pairs identifying the links to adjust
     link_attributes = (("stylesheets/link", "href"),
                        ("javascripts/script", "src"),
                        (".//img", "src"))
     for xpath, attrib in link_attributes:
         for node in tree.findall(xpath):
             target = node.get(attrib)
             # absolute links (http://, https:// or /...) stay untouched
             if re.match("(https?://|/)", target):
                 continue
             node.set(attrib, "../" * depth + target)
     tree.write(filename)
     return filename
Beispiel #7
0
    def read(self, pdffile, workdir):
        """Initializes a PDFReader object from an existing PDF file. After
        initialization, the PDFReader contains a list of
        :py:class:`~ferenda.pdfreader.Page` objects.

        Unless an XML file for the PDF already exists in *workdir* and is
        reported as newer than the PDF, the PDF is copied to *workdir*
        and converted there by two runs of ``pdftohtml``.

        :param pdffile: The full path to the PDF file
        :param workdir: A directory where intermediate files (particularly
                        background PNG files) are stored
        :returns: the result of ``self._parse_xml`` for the XML file
        :raises AssertionError: if *pdffile* does not exist

        """
        self.filename = pdffile
        assert os.path.exists(pdffile), "PDF %s not found" % pdffile

        basename = os.path.basename(pdffile)
        stem = os.path.splitext(basename)[0]
        xmlfile = os.sep.join((workdir, stem + ".xml"))
        if util.outfile_is_newer([pdffile], xmlfile):
            return self._parse_xml(xmlfile)

        tmppdffile = os.sep.join([workdir, basename])
        util.copy_if_different(pdffile, tmppdffile)

        def convert(template):
            # Run one pdftohtml pass on the working copy of the PDF.
            cmd = template % tmppdffile
            self.log.debug("Converting: %s" % cmd)
            (_returncode, _stdout, _stderr) = util.runcmd(
                cmd, require_success=True)

        # two pass coding: the -c (complex) pass extracts background
        # pictures, the -xml pass produces easy-to-parse text with
        # bounding boxes.
        convert("pdftohtml -nodrm -c %s")

        # the html files from the first pass are not needed
        for name in os.listdir(workdir):
            if not name.endswith(".html"):
                continue
            os.unlink(os.sep.join((workdir, name)))

        convert("pdftohtml -nodrm -xml %s")
        return self._parse_xml(xmlfile)
Beispiel #8
0
    def metrics(self, metricspath=None, plotpath=None,
                startpage=0, pagecount=None, force=False):
        """Calculate and return the metrics for this analyzer.

        The metrics are a dict of named properties. Keys can represent
        margins or other measurements of the document (left/right
        margins, header/footer etc) or font styles used in the document
        (eg. default, title, h1 -- h3). Style values are themselves
        dicts with the keys 'family' and 'size'. The key
        'scanned_source' is always added from ``self.scanned_source``.

        :param metricspath: The path of a JSON file used as cache for the
                            calculated metrics
        :type  metricspath: str
        :param plotpath: The path to write a PNG file with histograms for
                         different values (for debugging).
        :type plotpath: str
        :param startpage: starting page for the analysis
        :type startpage: int
        :param pagecount: number of pages to analyze (default: all
                          available from startpage)
        :type pagecount: int
        :param force: Perform analysis even if cached JSON metrics exists.
        :type force: bool
        :returns: calculated metrics (or the cached ones loaded from
                  metricspath)
        :rtype: dict

        The default implementation will try to find out values for the
        following metrics:

        ================== ===================================================
        key                description
        ================== ===================================================
        leftmargin         position of left margin (for odd pages if
                           twopage = True)
        rightmargin        position of right margin (for odd pages if
                           twopage = True)
        leftmargin_even    position of left margin for even pages

        rightmargin_even   position of right margin for even pages

        topmargin          position of header zone

        bottommargin       position of footer zone

        default            style used for default text

        title              style used for main document title (on front page)

        h1                 style used for level 1 headings

        h2                 style used for level 2 headings

        h3                 style used for level 3 headings
        ================== ===================================================

        Subclasses might add (or remove) from the above.

        """
        # Serve from the JSON cache when allowed and when the cache is
        # reported as newer than the PDF.
        use_cache = (not force and
                     metricspath and
                     util.outfile_is_newer([self.pdf.filename], metricspath))
        if use_cache:
            with open(metricspath) as cachefile:
                return json.load(cachefile)

        if pagecount is None:
            pagecount = len(self.pdf) - startpage

        # First gather all raw counters, then analyze each of them.
        hcounters = self.count_horizontal_margins(startpage, pagecount)
        vcounters = self.count_vertical_margins(startpage, pagecount)
        stylecounters = self.count_styles(startpage, pagecount)

        hmetrics = self.analyze_horizontal_margins(hcounters)
        vmetrics = self.analyze_vertical_margins(vcounters)
        stylemetrics = self.analyze_styles(stylecounters)

        # Merge in order, so that later sources win on duplicate keys.
        margincounters = {}
        for counters in (hcounters, vcounters):
            margincounters.update(counters.items())
        allmetrics = {}
        for partial in (hmetrics, vmetrics, stylemetrics):
            allmetrics.update(partial.items())
        allmetrics['scanned_source'] = self.scanned_source

        if plotpath:
            self.plot(plotpath, margincounters, stylecounters, allmetrics)
        if metricspath:
            util.ensure_dir(metricspath)
            with open(metricspath, "w") as cachefile:
                cachefile.write(json.dumps(allmetrics,
                                           indent=4,
                                           separators=(', ', ': '),
                                           sort_keys=True))
        return allmetrics
Beispiel #9
0
    def paginate(self, paginatepath=None, force=False):
        """Attempt to identify the real page number from pagination numbers on the page

        Walks through all pages of ``self.pdf`` and builds a mapping from
        a physical page identifier (``"<file>#page=<number>"``, built
        from ``page.src`` and ``page.number``) to the page number
        returned by ``self.guess_pagenumber`` or, when no guess is
        available, the expected page number.

        :param paginatepath: The path of a JSON file used as cache for
                             the mapping. If given, the computed mapping
                             is also written to it.
        :param force: Compute the mapping even if the cached JSON file is
                      reported as newer than the PDF.
        :returns: the mapping (the content of the JSON cache when that is
                  used, otherwise an ``OrderedDict``)
        """

        # Serve from the JSON cache when allowed and when the cache is
        # reported as newer than the PDF.
        if (not force and
                paginatepath and
                util.outfile_is_newer([self.pdf.filename], paginatepath)):
            with open(paginatepath) as fp:
                return json.load(fp)

        # (physical, pagenumber) pairs that are written into mapping only
        # after the main loop
        guesses = []
        mapping = OrderedDict()
        # expected page number of the page being examined
        currentpage = 0
        # offset recorded the last time a guess deviated from the
        # expected page number (0 = no pending deviation)
        misguess = 0
        lastpagenumber = 0
        for idx, page in enumerate(self.pdf):
            physical = "%s#page=%s" % (page.src.split(os.sep)[-1], page.number)
            # advance the expected number by the distance in page.number
            # since the previous page
            pageskip = page.number - lastpagenumber
            lastpagenumber = page.number
            # NOTE(review): util.increment presumably also handles
            # non-int (roman numeral) page numbers -- confirm
            currentpage = util.increment(currentpage, pageskip)
            pageguess = self.guess_pagenumber(page, currentpage)
            if pageguess is None:
                if len(page) > 0:
                    self.log.debug("physical page %s (%s): Can't guess pagenumber" % (idx,physical))
                else:  # it's ok for completely blank pages not to have pagenumbers
                    pass
                # fall back on the expected page number
                guesses.append((physical, currentpage))
                # page.number = None
            else:
                if pageguess != currentpage:
                    if isinstance(currentpage, str) or isinstance(pageguess, str):
                        # don't try to handle the case where the
                        # expected pagenumber uses roman numerals and
                        # the guessed pagenumbers uses arabic numerals
                        # (ie int)
                        self.log.warning("physical page %s (%s): Assumed page number %s, guess_pagenumber returned %s" % (idx, physical, currentpage, pageguess))
                    elif (currentpage - pageguess) != misguess:
                        # a not-to-uncommon error is that a page might
                        # lack pagination, but at the same time contain a
                        # numbered heading. This will cause a double
                        # mis-guess when the next page resumes
                        # pagination. Try to adapt to this.  FIXME: this
                        # logic is too complicated with state variables
                        # and all.
                        self.log.warning("physical page %s (%s): Expected page number %s, guess_pagenumber returned %s" % (idx, physical, currentpage, pageguess))
                        misguess = pageguess - currentpage
                        guesses.append((physical, pageguess))
                    else:
                        # this deviation cancels out the one recorded
                        # in misguess: treat the most recent entry in
                        # guesses as the page directly before this one
                        self.log.warning("Never mind, physical page %s (%s): guess_pagenumber now returns %s so all is as it should" % (idx, physical, pageguess))
                        prevphysical = guesses.pop()[0]
                        mapping[prevphysical] = pageguess-1
                        mapping[physical] = pageguess
                        misguess = 0
                else:
                    misguess = 0

                mapping[physical] = pageguess
                currentpage = pageguess  # FIXME: if reasonable. Also: handle roman numerals
        # NB: idx is reused here for the physical page identifier (a
        # string), not the enumerate index used above
        for idx, pageguess in guesses:
            mapping[idx] = pageguess

        if paginatepath:
            util.ensure_dir(paginatepath)
            with open(paginatepath, "w") as fp:
                s = json.dumps(mapping, indent=4, separators=(', ', ': '))
                fp.write(s)
        return mapping
Beispiel #10
0
    def needed(self, basefile, action):
        """Decide whether *action* really has to be performed for *basefile*.

        The action is not needed when its result (the file that the
        action creates, or similar) is newer than all of the action's
        dependencies (its source files).

        :param basefile: identifier of the document to check
        :param action: ``"parse"``, ``"relate"``, ``"generate"``,
                       ``"transformlinks"`` or the name of a custom action
        :returns: ``False`` or a ``Needed`` object carrying a reason; a
                  ``RelateNeeded`` object for ``"relate"``; ``True`` for
                  any other action
        """

        def modified_since(filename, dt, field):
            # ``entry`` is taken from the enclosing scope; it is assigned
            # by the branches below before this helper is called.
            if not os.path.exists(filename):
                return False
            if not dt:  # has never been indexed
                return Needed(reason="%s has not been processed according to %s in documententry %s" % (filename, field, entry._path))
            if datetime.fromtimestamp(os.stat(filename).st_mtime) > dt:
                return Needed(reason="%s is newer than %s in documententry %s" % (filename, field, entry._path))
            return False

        # if this function is even called, it means that force is not
        # true (or ferenda-build.py has not been called with a single
        # basefile, which is an implied force)
        if action == "parse":
            infile = self.downloaded_path(basefile)
            outfile = self.parsed_path(basefile)
            uptodate = util.outfile_is_newer([infile], outfile)
            if uptodate:
                return False
            return Needed(reason=getattr(uptodate, 'reason', None))

        if action == "relate":
            entry = DocumentEntry(self.documententry_path(basefile))
            fulltext = modified_since(self.parsed_path(basefile),
                                      entry.indexed_ft, 'indexed_ft')
            triples = modified_since(self.distilled_path(basefile),
                                     entry.indexed_ts, 'indexed_ts')
            dependencies = modified_since(self.dependencies_path(basefile),
                                          entry.indexed_dep, 'indexed_dep')
            return RelateNeeded(fulltext=fulltext,
                                triples=triples,
                                dependencies=dependencies)

        if action == "generate":
            infile = self.parsed_path(basefile)
            annotations = self.annotation_path(basefile)
            depsfile = self.dependencies_path(basefile)
            dependencies = []
            if os.path.exists(depsfile):
                # one dependency path per line
                dependencies = util.readfile(depsfile).strip().split("\n")
            dependencies += [infile, annotations]
            outfile = self.generated_path(basefile)
            # support generated 404 files (when served through HTTP,
            # served with HTTP status 404, but otherwise works just as
            # regular generated files)
            if not os.path.exists(outfile) and os.path.exists(outfile + ".404"):
                outfile += ".404"
            uptodate = util.outfile_is_newer(dependencies, outfile)
            if uptodate:
                return False
            return Needed(reason=getattr(uptodate, 'reason', None))

        if action == "transformlinks":
            entry = DocumentEntry(self.documententry_path(basefile))
            infile = self.generated_path(basefile)
            # if entry.status['generate']['date'] is older than the
            # file modification date, something has modified the file
            # after generate -- most likely a call to transformlinks()
            if modified_since(infile, entry.updated, 'updated'):
                return False
            return Needed(reason="%s has not been modified after generate at %s" % (infile, entry.status['generate']['date']))

        # custom actions will need to override needed and provide logic there
        return True
Beispiel #11
0
    def needed(self, basefile, action, version=None):
        """Decide whether *action* really has to be performed for *basefile*.

        The action is not needed when its result (the file that the
        action creates, or similar) is newer than all of the action's
        dependencies (its source files).

        :param basefile: identifier of the document to check
        :param action: ``"parse"``, ``"relate"``, ``"generate"``,
                       ``"transformlinks"`` or the name of a custom action
        :param version: optional version, passed on to the path lookups
                        of the ``"parse"`` and ``"generate"`` actions
        :returns: ``False`` or a ``Needed`` object carrying a reason; a
                  ``RelateNeeded`` object for ``"relate"``; ``True`` for
                  any other action
        """

        def modified_since(filename, dt, field):
            # ``entry`` is taken from the enclosing scope; it is assigned
            # by the branches below before this helper is called.
            if not os.path.exists(filename):
                return False
            if not dt:  # has never been indexed
                return Needed(
                    reason=
                    "%s has not been processed according to %s in documententry %s"
                    % (filename, field, entry._path))
            if datetime.fromtimestamp(os.stat(filename).st_mtime) > dt:
                return Needed(
                    reason="%s is newer than %s in documententry %s" %
                    (filename, field, entry._path))
            return False

        # if this function is even called, it means that force is not
        # true (or ferenda-build.py has not been called with a single
        # basefile, which is an implied force)
        if action == "parse":
            infile = self.downloaded_path(basefile, version)
            outfile = self.parsed_path(basefile, version)
            uptodate = util.outfile_is_newer([infile], outfile)
            if uptodate:
                return False
            return Needed(reason=getattr(uptodate, 'reason', None))

        if action == "relate":
            entry = DocumentEntry(self.documententry_path(basefile))
            fulltext = modified_since(self.parsed_path(basefile),
                                      entry.indexed_ft, 'indexed_ft')
            triples = modified_since(self.distilled_path(basefile),
                                     entry.indexed_ts, 'indexed_ts')
            dependencies = modified_since(self.dependencies_path(basefile),
                                          entry.indexed_dep, 'indexed_dep')
            return RelateNeeded(fulltext=fulltext,
                                triples=triples,
                                dependencies=dependencies)

        if action == "generate":
            infile = self.parsed_path(basefile, version)
            annotations = self.annotation_path(basefile, version)
            dependencies = []
            # the dependencies file is only consulted when no explicit
            # version was asked for
            if version is None:
                depsfile = self.dependencies_path(basefile)
                if os.path.exists(depsfile):
                    # one dependency path per line
                    dependencies = util.readfile(
                        depsfile).strip().split("\n")
            dependencies += [infile, annotations]
            outfile = self.generated_path(basefile, version)
            # support generated 404 files (when served through HTTP,
            # served with HTTP status 404, but otherwise works just as
            # regular generated files)
            if not os.path.exists(outfile) and os.path.exists(outfile +
                                                              ".404"):
                outfile += ".404"
            uptodate = util.outfile_is_newer(dependencies, outfile)
            if uptodate:
                return False
            return Needed(reason=getattr(uptodate, 'reason', None))

        if action == "transformlinks":
            entry = DocumentEntry(self.documententry_path(basefile))
            infile = self.generated_path(basefile)
            # if entry.status['generate']['date'] is older than the
            # file modification date, something has modified the file
            # after generate -- most likely a call to transformlinks()
            if modified_since(infile, entry.updated, 'updated'):
                return False
            return Needed(
                reason="%s has not been modified after generate at %s" %
                (infile, entry.status['generate']['date']))

        # custom actions will need to override needed and provide logic there
        return True
Beispiel #12
0
    def metrics(self,
                metricspath=None,
                plotpath=None,
                startpage=0,
                pagecount=None,
                force=False):
        """Calculate and return the metrics for this analyzer.

        The metrics are a dict of named properties. Keys can represent
        margins or other measurements of the document (left/right
        margins, header/footer etc) or font styles used in the document
        (eg. default, title, h1 -- h3). Style values are themselves
        dicts with the keys 'family' and 'size'. The key
        'scanned_source' is always added from ``self.scanned_source``.

        :param metricspath: The path of a JSON file used as cache for the
                            calculated metrics
        :type  metricspath: str
        :param plotpath: The path to write a PNG file with histograms for
                         different values (for debugging).
        :type plotpath: str
        :param startpage: starting page for the analysis
        :type startpage: int
        :param pagecount: number of pages to analyze (default: all
                          available from startpage)
        :type pagecount: int
        :param force: Perform analysis even if cached JSON metrics exists.
        :type force: bool
        :returns: calculated metrics (or the cached ones loaded from
                  metricspath)
        :rtype: dict

        The default implementation will try to find out values for the
        following metrics:

        ================== ===================================================
        key                description
        ================== ===================================================
        leftmargin         position of left margin (for odd pages if
                           twopage = True)
        rightmargin        position of right margin (for odd pages if
                           twopage = True)
        leftmargin_even    position of left margin for even pages

        rightmargin_even   position of right margin for even pages

        topmargin          position of header zone

        bottommargin       position of footer zone

        default            style used for default text

        title              style used for main document title (on front page)

        h1                 style used for level 1 headings

        h2                 style used for level 2 headings

        h3                 style used for level 3 headings
        ================== ===================================================

        Subclasses might add (or remove) from the above.

        """
        # Serve from the JSON cache when allowed and when the cache is
        # reported as newer than the PDF.
        use_cache = (not force and metricspath
                     and util.outfile_is_newer([self.pdf.filename],
                                               metricspath))
        if use_cache:
            with open(metricspath) as cachefile:
                return json.load(cachefile)

        if pagecount is None:
            pagecount = len(self.pdf) - startpage

        # First gather all raw counters, then analyze each of them.
        hcounters = self.count_horizontal_margins(startpage, pagecount)
        vcounters = self.count_vertical_margins(startpage, pagecount)
        stylecounters = self.count_styles(startpage, pagecount)

        hmetrics = self.analyze_horizontal_margins(hcounters)
        vmetrics = self.analyze_vertical_margins(vcounters)
        stylemetrics = self.analyze_styles(stylecounters)

        # Merge in order, so that later sources win on duplicate keys.
        margincounters = {}
        for counters in (hcounters, vcounters):
            margincounters.update(counters.items())
        allmetrics = {}
        for partial in (hmetrics, vmetrics, stylemetrics):
            allmetrics.update(partial.items())
        allmetrics['scanned_source'] = self.scanned_source

        if plotpath:
            self.plot(plotpath, margincounters, stylecounters, allmetrics)
        if metricspath:
            util.ensure_dir(metricspath)
            with open(metricspath, "w") as cachefile:
                cachefile.write(
                    json.dumps(allmetrics,
                               indent=4,
                               separators=(', ', ': '),
                               sort_keys=True))
        return allmetrics
Beispiel #13
0
    def paginate(self, paginatepath=None, force=False):
        """Attempt to identify the real page number from pagination numbers on the page

        Walks through all pages of ``self.pdf`` and builds a mapping from
        a physical page identifier (``"<file>#page=<number>"``, built
        from ``page.src`` and ``page.number``) to the page number
        returned by ``self.guess_pagenumber`` or, when no guess is
        available, the expected page number.

        :param paginatepath: The path of a JSON file used as cache for
                             the mapping. If given, the computed mapping
                             is also written to it.
        :param force: Compute the mapping even if the cached JSON file is
                      reported as newer than the PDF.
        :returns: the mapping (the content of the JSON cache when that is
                  used, otherwise an ``OrderedDict``)
        """

        # Serve from the JSON cache when allowed and when the cache is
        # reported as newer than the PDF.
        if (not force and paginatepath
                and util.outfile_is_newer([self.pdf.filename], paginatepath)):
            with open(paginatepath) as fp:
                return json.load(fp)

        # (physical, pagenumber) pairs that are written into mapping only
        # after the main loop
        guesses = []
        mapping = OrderedDict()
        # expected page number of the page being examined
        currentpage = 0
        # offset recorded the last time a guess deviated from the
        # expected page number (0 = no pending deviation)
        misguess = 0
        lastpagenumber = 0
        for idx, page in enumerate(self.pdf):
            physical = "%s#page=%s" % (page.src.split(os.sep)[-1], page.number)
            # advance the expected number by the distance in page.number
            # since the previous page
            pageskip = page.number - lastpagenumber
            lastpagenumber = page.number
            # NOTE(review): util.increment presumably also handles
            # non-int (roman numeral) page numbers -- confirm
            currentpage = util.increment(currentpage, pageskip)
            pageguess = self.guess_pagenumber(page, currentpage)
            if pageguess is None:
                if len(page) > 0:
                    self.log.debug(
                        "physical page %s (%s): Can't guess pagenumber" %
                        (idx, physical))
                else:  # it's ok for completely blank pages not to have pagenumbers
                    pass
                # fall back on the expected page number
                guesses.append((physical, currentpage))
                # page.number = None
            else:
                if pageguess != currentpage:
                    if isinstance(currentpage, str) or isinstance(
                            pageguess, str):
                        # don't try to handle the case where the
                        # expected pagenumber uses roman numerals and
                        # the guessed pagenumbers uses arabic numerals
                        # (ie int)
                        self.log.warning(
                            "physical page %s (%s): Assumed page number %s, guess_pagenumber returned %s"
                            % (idx, physical, currentpage, pageguess))
                    elif (currentpage - pageguess) != misguess:
                        # a not-to-uncommon error is that a page might
                        # lack pagination, but at the same time contain a
                        # numbered heading. This will cause a double
                        # mis-guess when the next page resumes
                        # pagination. Try to adapt to this.  FIXME: this
                        # logic is too complicated with state variables
                        # and all.
                        self.log.warning(
                            "physical page %s (%s): Expected page number %s, guess_pagenumber returned %s"
                            % (idx, physical, currentpage, pageguess))
                        misguess = pageguess - currentpage
                        guesses.append((physical, pageguess))
                    else:
                        # this deviation cancels out the one recorded
                        # in misguess: treat the most recent entry in
                        # guesses as the page directly before this one
                        self.log.warning(
                            "Never mind, physical page %s (%s): guess_pagenumber now returns %s so all is as it should"
                            % (idx, physical, pageguess))
                        prevphysical = guesses.pop()[0]
                        mapping[prevphysical] = pageguess - 1
                        mapping[physical] = pageguess
                        misguess = 0
                else:
                    misguess = 0

                mapping[physical] = pageguess
                currentpage = pageguess  # FIXME: if reasonable. Also: handle roman numerals
        # NB: idx is reused here for the physical page identifier (a
        # string), not the enumerate index used above
        for idx, pageguess in guesses:
            mapping[idx] = pageguess

        if paginatepath:
            util.ensure_dir(paginatepath)
            with open(paginatepath, "w") as fp:
                s = json.dumps(mapping, indent=4, separators=(', ', ': '))
                fp.write(s)
        return mapping