def copy_parsed(self, basefile, instance):
    """Bring the parsed representation of *basefile* over from the
    store of another object into our own store.

    The distilled file, the parsed file and every parsed attachment
    are transferred with ``util.link_or_copy``. When our distilled and
    parsed files are both reported as newer than their sources, the
    attachments are assumed to be fine as well and nothing is done.

    :param basefile: The basefile of the document to transfer
    :param instance: The object whose ``store`` holds the source files
    """
    source = instance.store
    target = self.store

    # The parsed files are only compared if the distilled ones are
    # already in sync.
    distilled_ok = util.outfile_is_newer([source.distilled_path(basefile)],
                                         target.distilled_path(basefile))
    if distilled_ok and util.outfile_is_newer(
            [source.parsed_path(basefile)],
            target.parsed_path(basefile)):
        self.log.debug("%s: Attachments are (likely) up-to-date" % basefile)
        return

    util.link_or_copy(source.distilled_path(basefile),
                      target.distilled_path(basefile))
    util.link_or_copy(source.parsed_path(basefile),
                      target.parsed_path(basefile))

    # enumerate() from 1 leaves the number of handled attachments in
    # ``linked`` (which stays 0 if there were none).
    linked = 0
    for linked, attachment in enumerate(
            source.list_attachments(basefile, "parsed"), 1):
        util.link_or_copy(
            source.parsed_path(basefile, attachment=attachment),
            target.parsed_path(basefile, attachment=attachment))
    if linked:
        self.log.debug("%s: Linked %s attachments from %s to %s" %
                       (basefile,
                        linked,
                        os.path.dirname(source.parsed_path(basefile)),
                        os.path.dirname(target.parsed_path(basefile))))
def needed(self, basefile, action):
    """Determine whether *action* needs to be performed for *basefile*.

    :param basefile: The basefile of the document in question
    :param action: ``"parse"``, ``"relate"`` or ``"generate"``; any
                   other value is treated as a custom action
    :returns: For ``"parse"`` and ``"generate"``, ``True`` unless
              ``util.outfile_is_newer`` reports that the output file
              is newer than everything it depends on. For
              ``"relate"``, a ``Relate`` object holding one value each
              for ``fulltext``, ``triples`` and ``dependencies``. For
              any other action, ``True``.
    """
    # if this function is even called, it means that force is not
    # true (or ferenda-build.py has not been called with a single
    # basefile, which is an implied force)
    if action == "parse":
        infile = self.downloaded_path(basefile)
        outfile = self.parsed_path(basefile)
        return not util.outfile_is_newer([infile], outfile)
    elif action == "relate":
        entry = DocumentEntry(self.documententry_path(basefile))

        def newer(filename, dt):
            # True if filename exists and was modified after dt (or
            # if dt is unset)
            if not os.path.exists(filename):
                return False
            elif not dt:  # has never been indexed
                return True
            else:
                return datetime.fromtimestamp(
                    os.stat(filename).st_mtime) > dt
        # NOTE(review): ``dependencies`` is checked against the
        # distilled file, not the file named by dependencies_path()
        # -- confirm that this is intended.
        return Relate(fulltext=newer(self.parsed_path(basefile),
                                     entry.indexed_ft),
                      triples=newer(self.distilled_path(basefile),
                                    entry.indexed_ts),
                      dependencies=newer(self.distilled_path(basefile),
                                         entry.indexed_dep))
    elif action == "generate":
        infile = self.parsed_path(basefile)
        annotations = self.annotation_path(basefile)
        if os.path.exists(self.dependencies_path(basefile)):
            # the dependencies file lists one path per line
            deptxt = util.readfile(self.dependencies_path(basefile))
            dependencies = deptxt.strip().split("\n")
        else:
            dependencies = []
        dependencies.extend((infile, annotations))
        outfile = self.generated_path(basefile)
        # The action is needed when the output is NOT newer than its
        # dependencies, mirroring the "parse" branch above.
        return not util.outfile_is_newer(dependencies, outfile)
    else:
        # custom actions will need to override needed and provide logic there
        return True
def copy_parsed(self, basefile, instance):
    """Bring the document entry, the distilled file, the parsed file
    and any parsed attachments of *basefile* over from
    ``instance.store`` into ``self.store`` using ``util.link_or_copy``.

    Unless ``self.config.force`` is set, nothing is done when our
    distilled and parsed files are both reported as newer than their
    sources; the attachments are then assumed to be fine as well.

    :param basefile: The basefile of the document to transfer
    :param instance: The object whose ``store`` holds the source files
    """
    source = instance.store
    target = self.store

    if not self.config.force:
        if (util.outfile_is_newer([source.distilled_path(basefile)],
                                  target.distilled_path(basefile)) and
                util.outfile_is_newer([source.parsed_path(basefile)],
                                      target.parsed_path(basefile))):
            self.log.debug(
                "%s: Attachments are (likely) up-to-date" % basefile)
            return

    # main files, in the same order as always: entry, distilled, parsed
    for locate_source, locate_target in (
            (source.documententry_path, target.documententry_path),
            (source.distilled_path, target.distilled_path),
            (source.parsed_path, target.parsed_path)):
        util.link_or_copy(locate_source(basefile), locate_target(basefile))

    # attachments are only looked for when the source store uses the
    # "dir" storage policy
    if source.storage_policy != "dir":
        return
    attachments = 0
    for attachment in source.list_attachments(basefile, "parsed"):
        attachments += 1
        util.link_or_copy(
            source.parsed_path(basefile, attachment=attachment),
            target.parsed_path(basefile, attachment=attachment))
    if attachments:
        self.log.debug("%s: Linked %s attachments from %s to %s" %
                       (basefile,
                        attachments,
                        os.path.dirname(source.parsed_path(basefile)),
                        os.path.dirname(target.parsed_path(basefile))))
def wrapper(self, doc):
    """Call the wrapped function ``f`` only if parsing is forced or the
    parsed file is not newer than the downloaded file.

    :param doc: An object with a ``basefile`` attribute
    :returns: ``True`` when the call was skipped, otherwise whatever
              ``f(self, doc)`` returns
    """
    # note: We hardcode the use of .downloaded_path, .parsed_path
    # and the 'parseforce' config option, which means that this
    # decorator can only be used sensibly with the .parse()
    # function.
    source = self.store.downloaded_path(doc.basefile)
    target = self.store.parsed_path(doc.basefile)
    forced = (self.config.force is True or
              self.config.parseforce is True)
    # the freshness check is only made when nothing forces a run
    if forced or not util.outfile_is_newer([source], target):
        self.log.debug("%s: Starting", doc.basefile)
        return f(self, doc)
    self.log.debug("%s: Skipped", doc.basefile)
    return True  # Signals that everything is OK
def getconfig(self, configfile, depth):
    """Return the path of a configuration file whose relative links
    work from *depth* directory levels down.

    For a *depth* of 0, *configfile* itself is returned. Otherwise a
    sibling file named ``<base>-depth-<depth><ext>`` is (re)written,
    unless ``util.outfile_is_newer`` reports it as newer than
    *configfile*. In that file, the ``href`` of ``stylesheets/link``
    nodes and the ``src`` of ``javascripts/script`` and ``img`` nodes
    are prefixed with one ``../`` per level.

    :param configfile: Path of the original XML configuration file
    :param depth: Number of directory levels to compensate for
    :type depth: int
    :returns: The path of the configuration file to use
    """
    if depth == 0:
        return configfile
    base, ext = os.path.splitext(configfile)
    filename = "%s-depth-%d%s" % (base, depth, ext)
    if not util.outfile_is_newer([configfile], filename):
        tree = etree.parse(configfile)
        # adjust the relevant link attribute for some nodes
        for xpath, attrib in (("stylesheets/link", "href"),
                              ("javascripts/script", "src"),
                              (".//img", "src")):
            for node in tree.findall(xpath):
                link = node.get(attrib)
                if link is None:
                    # nothing to adjust (eg. an inline script with no
                    # src); passing None to re.match would raise
                    # TypeError
                    continue
                # don't adjust absolute links
                if not re.match("(https?://|/)", link):
                    node.set(attrib, "../" * depth + link)
        tree.write(filename)
    return filename
def read(self, pdffile, workdir):
    """Initializes a PDFReader object from an existing PDF file.

    The PDF is converted with the external ``pdftohtml`` command into
    an XML file in *workdir*, which is then handed to
    ``self._parse_xml``. The conversion is skipped when
    ``util.outfile_is_newer`` reports the XML file as newer than the
    PDF.

    :param pdffile: The full path to the PDF file
    :param workdir: A directory where intermediate files (particularly
                    background PNG files) are stored
    :returns: The result of ``self._parse_xml`` for the XML file
    :raises AssertionError: if *pdffile* does not exist
    """
    self.filename = pdffile
    assert os.path.exists(pdffile), "PDF %s not found" % pdffile

    pdfname = os.path.basename(pdffile)
    xmlfile = workdir + os.sep + os.path.splitext(pdfname)[0] + ".xml"
    if util.outfile_is_newer([pdffile], xmlfile):
        return self._parse_xml(xmlfile)

    workcopy = workdir + os.sep + pdfname
    util.copy_if_different(pdffile, workcopy)

    def convert(template):
        # NOTE(review): the path is interpolated unquoted; presumably
        # this breaks for paths containing spaces -- confirm against
        # how util.runcmd executes the command line.
        cmd = template % workcopy
        self.log.debug("Converting: %s" % cmd)
        util.runcmd(cmd, require_success=True)

    # two pass coding: First use -c (complex) to extract
    # background pictures, then use -xml to get easy-to-parse
    # text with bounding boxes.
    convert("pdftohtml -nodrm -c %s")
    # we won't need the html files
    for leftover in [name for name in os.listdir(workdir)
                     if name.endswith(".html")]:
        os.unlink(workdir + os.sep + leftover)
    convert("pdftohtml -nodrm -xml %s")
    return self._parse_xml(xmlfile)
def metrics(self, metricspath=None, plotpath=None, startpage=0,
            pagecount=None, force=False):
    """Calculate and return the metrics for this analyzer.

    metrics is a set of named properties in the form of a dict. The
    keys of the dict can represent margins or other measurements of
    the document (left/right margins, header/footer etc) or font
    styles used in the document (eg. default, title, h1 -- h3). Style
    values are in turn dicts themselves, with the keys 'family' and
    'size'.

    :param metricspath: The path of a JSON file used as cache for the
                        calculated metrics
    :type  metricspath: str
    :param plotpath: The path to write a PNG file with histograms for
                     different values (for debugging).
    :type  plotpath: str
    :param startpage: starting page for the analysis
    :type  startpage: int
    :param pagecount: number of pages to analyze (default: all
                      available from startpage on)
    :type  pagecount: int
    :param force: Perform analysis even if cached JSON metrics exists.
    :type  force: bool
    :returns: calculated metrics (or, when the cache is used, the
              content of the JSON file)
    :rtype: dict

    The default implementation will try to find out values for the
    following metrics:

    ================== ===================================================
    key                description
    ================== ===================================================
    leftmargin         position of left margin (for odd pages if
                       twopage = True)
    rightmargin        position of right margin (for odd pages if
                       twopage = True)
    leftmargin_even    position of left margin for even pages
    rightmargin_even   position of right margin for even pages
    topmargin          position of header zone
    bottommargin       position of footer zone
    default            style used for default text
    title              style used for main document title (on front page)
    h1                 style used for level 1 headings
    h2                 style used for level 2 headings
    h3                 style used for level 3 headings
    ================== ===================================================

    The key ``scanned_source`` is always added, with the value of
    ``self.scanned_source``. Subclasses might add (or remove) from
    the above.
    """
    use_cache = (not force and metricspath and
                 util.outfile_is_newer([self.pdf.filename], metricspath))
    if use_cache:
        with open(metricspath) as fp:
            return json.load(fp)

    if pagecount is None:
        pagecount = len(self.pdf) - startpage

    # collect the raw counters first, then analyze each of them
    hcounters = self.count_horizontal_margins(startpage, pagecount)
    vcounters = self.count_vertical_margins(startpage, pagecount)
    stylecounters = self.count_styles(startpage, pagecount)
    hmetrics = self.analyze_horizontal_margins(hcounters)
    vmetrics = self.analyze_vertical_margins(vcounters)
    stylemetrics = self.analyze_styles(stylecounters)

    margincounters = dict(hcounters.items())
    margincounters.update(vcounters.items())

    # later parts win if the same key occurs in more than one of them
    allmetrics = {}
    for part in (hmetrics.items(), vmetrics.items(), stylemetrics.items()):
        allmetrics.update(part)
    allmetrics['scanned_source'] = self.scanned_source

    if plotpath:
        self.plot(plotpath, margincounters, stylecounters, allmetrics)
    if metricspath:
        util.ensure_dir(metricspath)
        with open(metricspath, "w") as fp:
            fp.write(json.dumps(allmetrics, indent=4,
                                separators=(', ', ': '), sort_keys=True))
    return allmetrics
def paginate(self, paginatepath=None, force=False):
    """Attempt to identify the real page number from pagination numbers
    on the page.

    :param paginatepath: Optional path of a JSON file used as cache
                         for the result
    :param force: Redo the work even if the cache file is reported as
                  newer than the PDF file
    :returns: A mapping from ``"<file>#page=<number>"`` keys (the last
              path component of ``page.src`` and ``page.number``) to
              page numbers. The page number is the one returned by
              ``guess_pagenumber``, or the expected one for pages
              where no guess could be made. This is an
              ``OrderedDict`` when freshly computed, otherwise
              whatever ``json.load`` yields for the cache file.
    """
    if (not force and paginatepath and
            util.outfile_is_newer([self.pdf.filename], paginatepath)):
        with open(paginatepath) as fp:
            return json.load(fp)
    # (physical, pagenumber) pairs that are not yet settled; they are
    # added to the mapping after the loop unless a later page
    # corrects them
    guesses = []
    mapping = OrderedDict()
    currentpage = 0  # the page number we expect for the current page
    misguess = 0  # pageguess - currentpage at the latest mismatch
    lastpagenumber = 0
    for idx, page in enumerate(self.pdf):
        physical = "%s#page=%s" % (page.src.split(os.sep)[-1], page.number)
        # advance the expected page number by as many steps as
        # page.number has advanced since the previous page
        pageskip = page.number - lastpagenumber
        lastpagenumber = page.number
        currentpage = util.increment(currentpage, pageskip)
        pageguess = self.guess_pagenumber(page, currentpage)
        if pageguess is None:
            if len(page) > 0:
                self.log.debug("physical page %s (%s): Can't guess pagenumber" % (idx,physical))
            else:
                # it's ok for completely blank pages not to have pagenumbers
                pass
            guesses.append((physical, currentpage))
            # page.number = None
        else:
            if pageguess != currentpage:
                if isinstance(currentpage, str) or isinstance(pageguess, str):
                    # don't try to handle the case where the
                    # expected pagenumber uses roman numerals and
                    # the guessed pagenumbers uses arabic numerals
                    # (ie int)
                    self.log.warning("physical page %s (%s): Assumed page number %s, guess_pagenumber returned %s" % (idx, physical, currentpage, pageguess))
                elif (currentpage - pageguess) != misguess:
                    # a not-too-uncommon error is that a page might
                    # lack pagination, but at the same time contain a
                    # numbered heading. This will cause a double
                    # mis-guess when the next page resumes
                    # pagination. Try to adapt to this. FIXME: this
                    # logic is too complicated with state variables
                    # and all.
                    self.log.warning("physical page %s (%s): Expected page number %s, guess_pagenumber returned %s" % (idx, physical, currentpage, pageguess))
                    # note the sign: the difference is stored as
                    # guess minus expected, but compared above as
                    # expected minus guess, so the branch below is
                    # taken when the next mismatch is equally large
                    # in the opposite direction
                    misguess = pageguess - currentpage
                    guesses.append((physical, pageguess))
                else:
                    self.log.warning("Never mind, physical page %s (%s): guess_pagenumber now returns %s so all is as it should" % (idx, physical, pageguess))
                    # drop the unsettled entry for the previous page
                    # and number that page as the one before this
                    prevphysical = guesses.pop()[0]
                    mapping[prevphysical] = pageguess-1
                    mapping[physical] = pageguess
                    misguess = 0
            else:
                misguess = 0
                mapping[physical] = pageguess
            currentpage = pageguess  # FIXME: if reasonable. Also: handle roman numerals
    # whatever is still unsettled is used as is
    for idx, pageguess in guesses:
        mapping[idx] = pageguess
    if paginatepath:
        util.ensure_dir(paginatepath)
        with open(paginatepath, "w") as fp:
            s = json.dumps(mapping, indent=4, separators=(', ', ': '))
            fp.write(s)
    return mapping
def needed(self, basefile, action):
    """Determine if we really need to perform *action* for the given
    *basefile*, or if the result of the action (in the form of the
    file that the action creates, or similar) is newer than all of
    the actions dependencies (in the form of source files for the
    action).

    :param basefile: The basefile of the document in question
    :param action: ``"parse"``, ``"relate"``, ``"generate"`` or
                   ``"transformlinks"``; any other value is treated as
                   a custom action
    :returns: ``False`` if the action is not needed. Otherwise a
              ``Needed`` object carrying a reason, or for
              ``"relate"`` a ``RelateNeeded`` object with one such
              result per part, or ``True`` for custom actions.
    """
    def modified_since(filename, dt, field):
        # Compare the mtime of filename with the timestamp dt taken
        # from the attribute named field of ``entry`` (which is bound
        # by the calling branch before we're called).
        if not os.path.exists(filename):
            return False
        if not dt:  # has never been indexed
            return Needed(reason="%s has not been processed according to %s in documententry %s" % (filename, field, entry._path))
        if datetime.fromtimestamp(os.stat(filename).st_mtime) > dt:
            return Needed(reason="%s is newer than %s in documententry %s" % (filename, field, entry._path))
        return False

    def unless_fresh(fresh):
        # fresh is the result of util.outfile_is_newer; if it's
        # falsy, its 'reason' attribute (if any) is passed along
        if fresh:
            return False
        return Needed(reason=getattr(fresh, 'reason', None))

    # if this function is even called, it means that force is not
    # true (or ferenda-build.py has not been called with a single
    # basefile, which is an implied force)
    if action == "parse":
        infile = self.downloaded_path(basefile)
        outfile = self.parsed_path(basefile)
        return unless_fresh(util.outfile_is_newer([infile], outfile))

    if action == "relate":
        entry = DocumentEntry(self.documententry_path(basefile))
        fulltext = modified_since(self.parsed_path(basefile),
                                  entry.indexed_ft, 'indexed_ft')
        triples = modified_since(self.distilled_path(basefile),
                                 entry.indexed_ts, 'indexed_ts')
        dependencies = modified_since(self.dependencies_path(basefile),
                                      entry.indexed_dep, 'indexed_dep')
        return RelateNeeded(fulltext=fulltext,
                            triples=triples,
                            dependencies=dependencies)

    if action == "generate":
        infile = self.parsed_path(basefile)
        annotations = self.annotation_path(basefile)
        listed = []
        if os.path.exists(self.dependencies_path(basefile)):
            # the dependencies file lists one path per line
            deptxt = util.readfile(self.dependencies_path(basefile))
            listed = deptxt.strip().split("\n")
        dependencies = listed + [infile, annotations]
        outfile = self.generated_path(basefile)
        # support generated 404 files (when served through HTTP,
        # served with HTTP status 404, but otherwise works just as
        # regular generated files)
        if (not os.path.exists(outfile) and
                os.path.exists(outfile + ".404")):
            outfile += ".404"
        return unless_fresh(util.outfile_is_newer(dependencies, outfile))

    if action == "transformlinks":
        entry = DocumentEntry(self.documententry_path(basefile))
        infile = self.generated_path(basefile)
        # if entry.status['generate']['date'] is older than the
        # file modification date, something has modified the file
        # after generate -- most likely a call to transformlinks()
        if modified_since(infile, entry.updated, 'updated'):
            return False
        return Needed(reason="%s has not been modified after generate at %s" % (infile, entry.status['generate']['date']))

    # custom actions will need to override needed and provide logic there
    return True
def needed(self, basefile, action, version=None):
    """Determine if we really need to perform *action* for the given
    *basefile*, or if the result of the action (in the form of the
    file that the action creates, or similar) is newer than all of
    the actions dependencies (in the form of source files for the
    action).

    :param basefile: The basefile of the document in question
    :param action: ``"parse"``, ``"relate"``, ``"generate"`` or
                   ``"transformlinks"``; any other value is treated as
                   a custom action
    :param version: Optional version, passed on to the path methods
                    used for ``"parse"`` and ``"generate"``. The
                    dependencies file is only consulted when this is
                    ``None``.
    :returns: ``False`` if the action is not needed. Otherwise a
              ``Needed`` object carrying a reason, or for
              ``"relate"`` a ``RelateNeeded`` object with one such
              result per part, or ``True`` for custom actions.
    """
    def modified_since(filename, dt, field):
        # Compare the mtime of filename with the timestamp dt taken
        # from the attribute named field of ``entry`` (which is bound
        # by the calling branch before we're called).
        if not os.path.exists(filename):
            return False
        if not dt:  # has never been indexed
            return Needed(
                reason="%s has not been processed according to %s in documententry %s"
                % (filename, field, entry._path))
        if datetime.fromtimestamp(os.stat(filename).st_mtime) > dt:
            return Needed(
                reason="%s is newer than %s in documententry %s" %
                (filename, field, entry._path))
        return False

    def unless_fresh(fresh):
        # fresh is the result of util.outfile_is_newer; if it's
        # falsy, its 'reason' attribute (if any) is passed along
        if fresh:
            return False
        return Needed(reason=getattr(fresh, 'reason', None))

    # if this function is even called, it means that force is not
    # true (or ferenda-build.py has not been called with a single
    # basefile, which is an implied force)
    if action == "parse":
        infile = self.downloaded_path(basefile, version)
        outfile = self.parsed_path(basefile, version)
        return unless_fresh(util.outfile_is_newer([infile], outfile))

    if action == "relate":
        entry = DocumentEntry(self.documententry_path(basefile))
        fulltext = modified_since(self.parsed_path(basefile),
                                  entry.indexed_ft, 'indexed_ft')
        triples = modified_since(self.distilled_path(basefile),
                                 entry.indexed_ts, 'indexed_ts')
        dependencies = modified_since(self.dependencies_path(basefile),
                                      entry.indexed_dep, 'indexed_dep')
        return RelateNeeded(fulltext=fulltext,
                            triples=triples,
                            dependencies=dependencies)

    if action == "generate":
        infile = self.parsed_path(basefile, version)
        annotations = self.annotation_path(basefile, version)
        listed = []
        if version is None and os.path.exists(
                self.dependencies_path(basefile)):
            # the dependencies file lists one path per line
            deptxt = util.readfile(self.dependencies_path(basefile))
            listed = deptxt.strip().split("\n")
        dependencies = listed + [infile, annotations]
        outfile = self.generated_path(basefile, version)
        # support generated 404 files (when served through HTTP,
        # served with HTTP status 404, but otherwise works just as
        # regular generated files)
        if (not os.path.exists(outfile) and
                os.path.exists(outfile + ".404")):
            outfile += ".404"
        return unless_fresh(util.outfile_is_newer(dependencies, outfile))

    if action == "transformlinks":
        entry = DocumentEntry(self.documententry_path(basefile))
        infile = self.generated_path(basefile)
        # if entry.status['generate']['date'] is older than the
        # file modification date, something has modified the file
        # after generate -- most likely a call to transformlinks()
        if modified_since(infile, entry.updated, 'updated'):
            return False
        return Needed(
            reason="%s has not been modified after generate at %s" %
            (infile, entry.status['generate']['date']))

    # custom actions will need to override needed and provide logic there
    return True
def metrics(self,
            metricspath=None,
            plotpath=None,
            startpage=0,
            pagecount=None,
            force=False):
    """Calculate and return the metrics for this analyzer.

    metrics is a set of named properties in the form of a dict. The
    keys of the dict can represent margins or other measurements of
    the document (left/right margins, header/footer etc) or font
    styles used in the document (eg. default, title, h1 -- h3). Style
    values are in turn dicts themselves, with the keys 'family' and
    'size'.

    :param metricspath: The path of a JSON file used as cache for the
                        calculated metrics
    :type  metricspath: str
    :param plotpath: The path to write a PNG file with histograms for
                     different values (for debugging).
    :type  plotpath: str
    :param startpage: starting page for the analysis
    :type  startpage: int
    :param pagecount: number of pages to analyze (default: all
                      available from startpage on)
    :type  pagecount: int
    :param force: Perform analysis even if cached JSON metrics exists.
    :type  force: bool
    :returns: calculated metrics (or, when the cache is used, the
              content of the JSON file)
    :rtype: dict

    The default implementation will try to find out values for the
    following metrics:

    ================== ===================================================
    key                description
    ================== ===================================================
    leftmargin         position of left margin (for odd pages if
                       twopage = True)
    rightmargin        position of right margin (for odd pages if
                       twopage = True)
    leftmargin_even    position of left margin for even pages
    rightmargin_even   position of right margin for even pages
    topmargin          position of header zone
    bottommargin       position of footer zone
    default            style used for default text
    title              style used for main document title (on front page)
    h1                 style used for level 1 headings
    h2                 style used for level 2 headings
    h3                 style used for level 3 headings
    ================== ===================================================

    The key ``scanned_source`` is always added, with the value of
    ``self.scanned_source``. Subclasses might add (or remove) from
    the above.
    """
    # serve the cached result when allowed to and when it's current
    if not force and metricspath:
        if util.outfile_is_newer([self.pdf.filename], metricspath):
            with open(metricspath) as fp:
                return json.load(fp)

    if pagecount is None:
        pagecount = len(self.pdf) - startpage
    pagerange = (startpage, pagecount)

    hcounters = self.count_horizontal_margins(*pagerange)
    vcounters = self.count_vertical_margins(*pagerange)
    stylecounters = self.count_styles(*pagerange)
    analyzed = [self.analyze_horizontal_margins(hcounters),
                self.analyze_vertical_margins(vcounters),
                self.analyze_styles(stylecounters)]

    margincounters = dict(chain(hcounters.items(), vcounters.items()))
    # later parts win if the same key occurs in more than one of them
    allmetrics = dict(chain.from_iterable(
        [part.items() for part in analyzed]))
    allmetrics['scanned_source'] = self.scanned_source

    if plotpath:
        self.plot(plotpath, margincounters, stylecounters, allmetrics)
    if metricspath:
        util.ensure_dir(metricspath)
        serialized = json.dumps(allmetrics,
                                indent=4,
                                separators=(', ', ': '),
                                sort_keys=True)
        with open(metricspath, "w") as fp:
            fp.write(serialized)
    return allmetrics
def paginate(self, paginatepath=None, force=False):
    """Attempt to identify the real page number from pagination numbers
    on the page.

    :param paginatepath: Optional path of a JSON file used as cache
                         for the result
    :param force: Redo the work even if the cache file is reported as
                  newer than the PDF file
    :returns: A mapping from ``"<file>#page=<number>"`` keys (the last
              path component of ``page.src`` and ``page.number``) to
              page numbers. The page number is the one returned by
              ``guess_pagenumber``, or the expected one for pages
              where no guess could be made. This is an
              ``OrderedDict`` when freshly computed, otherwise
              whatever ``json.load`` yields for the cache file.
    """
    if (not force and paginatepath and
            util.outfile_is_newer([self.pdf.filename], paginatepath)):
        with open(paginatepath) as fp:
            return json.load(fp)
    # (physical, pagenumber) pairs that are not yet settled; they are
    # added to the mapping after the loop unless a later page
    # corrects them
    guesses = []
    mapping = OrderedDict()
    currentpage = 0  # the page number we expect for the current page
    misguess = 0  # pageguess - currentpage at the latest mismatch
    lastpagenumber = 0
    for idx, page in enumerate(self.pdf):
        physical = "%s#page=%s" % (page.src.split(os.sep)[-1], page.number)
        # advance the expected page number by as many steps as
        # page.number has advanced since the previous page
        pageskip = page.number - lastpagenumber
        lastpagenumber = page.number
        currentpage = util.increment(currentpage, pageskip)
        pageguess = self.guess_pagenumber(page, currentpage)
        if pageguess is None:
            if len(page) > 0:
                self.log.debug(
                    "physical page %s (%s): Can't guess pagenumber" %
                    (idx, physical))
            else:
                # it's ok for completely blank pages not to have pagenumbers
                pass
            guesses.append((physical, currentpage))
            # page.number = None
        else:
            if pageguess != currentpage:
                if isinstance(currentpage, str) or isinstance(
                        pageguess, str):
                    # don't try to handle the case where the
                    # expected pagenumber uses roman numerals and
                    # the guessed pagenumbers uses arabic numerals
                    # (ie int)
                    self.log.warning(
                        "physical page %s (%s): Assumed page number %s, guess_pagenumber returned %s"
                        % (idx, physical, currentpage, pageguess))
                elif (currentpage - pageguess) != misguess:
                    # a not-too-uncommon error is that a page might
                    # lack pagination, but at the same time contain a
                    # numbered heading. This will cause a double
                    # mis-guess when the next page resumes
                    # pagination. Try to adapt to this. FIXME: this
                    # logic is too complicated with state variables
                    # and all.
                    self.log.warning(
                        "physical page %s (%s): Expected page number %s, guess_pagenumber returned %s"
                        % (idx, physical, currentpage, pageguess))
                    # note the sign: the difference is stored as
                    # guess minus expected, but compared above as
                    # expected minus guess, so the branch below is
                    # taken when the next mismatch is equally large
                    # in the opposite direction
                    misguess = pageguess - currentpage
                    guesses.append((physical, pageguess))
                else:
                    self.log.warning(
                        "Never mind, physical page %s (%s): guess_pagenumber now returns %s so all is as it should"
                        % (idx, physical, pageguess))
                    # drop the unsettled entry for the previous page
                    # and number that page as the one before this
                    prevphysical = guesses.pop()[0]
                    mapping[prevphysical] = pageguess - 1
                    mapping[physical] = pageguess
                    misguess = 0
            else:
                misguess = 0
                mapping[physical] = pageguess
            currentpage = pageguess  # FIXME: if reasonable. Also: handle roman numerals
    # whatever is still unsettled is used as is
    for idx, pageguess in guesses:
        mapping[idx] = pageguess
    if paginatepath:
        util.ensure_dir(paginatepath)
        with open(paginatepath, "w") as fp:
            s = json.dumps(mapping, indent=4, separators=(', ', ': '))
            fp.write(s)
    return mapping