def parse(self, doc):
    """Fill ``doc`` from the best available downloaded file for its basefile.

    The method first sets ``doc.uri`` and records ``self.rdf_type`` and a
    ``prov:wasGeneratedBy`` value through a ``Describer`` wrapped around
    ``doc.meta``, then lets ``self.infer_triples`` add to it.

    The content source is then chosen in this order:

    1. a downloaded ``.pdf`` file,
    2. a downloaded ``.doc``/``.docx``/``.wpd``/``.rtf`` file, converted
       to an intermediate ``.pdf`` by running the configured ``soffice``
       binary (an already existing intermediate PDF is reused),
    3. the downloaded HTML file, whose ``<pre>`` section is extracted to
       an intermediate ``.txt`` file (also reused if it already exists).

    PDF input is handed to ``self.parse_from_pdfreader``, plain text to
    ``self.parse_from_textreader``.

    :param doc: document object to populate; ``doc.basefile`` and
                ``doc.meta`` are read, ``doc.uri`` is set here.
    """
    doc.uri = self.canonical_uri(doc.basefile)
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    self.infer_triples(d, doc.basefile)

    # prefer PDF or Word files over the plaintext-containing HTML files
    # FIXME: PDF or Word files are now stored as attachments
    pdffile = self.generic_path(doc.basefile, 'downloaded', '.pdf')
    wordfiles = (self.generic_path(doc.basefile, 'downloaded', '.doc'),
                 self.generic_path(doc.basefile, 'downloaded', '.docx'),
                 self.generic_path(doc.basefile, 'downloaded', '.wpd'),
                 self.generic_path(doc.basefile, 'downloaded', '.rtf'))
    # No break on purpose: when several candidates exist, the last
    # one in the tuple wins.
    wordfile = None
    for f in wordfiles:
        if os.path.exists(f):
            wordfile = f

    # if we lack a .pdf file, use Open/LibreOffice to convert any
    # .wpd or .doc file to .pdf first
    if wordfile and not os.path.exists(pdffile):
        intermediate_pdf = self.generic_path(
            doc.basefile, "intermediate", ".pdf")
        if not os.path.exists(intermediate_pdf):
            # Quote the input path the same way as the output directory
            # so that a path containing whitespace stays one argument.
            cmdline = "%s --headless -convert-to pdf -outdir '%s' '%s'" % (
                self.config.get('soffice', 'soffice'),
                os.path.dirname(intermediate_pdf),
                wordfile)
            self.log.debug(
                "%s: Converting to PDF: %s" % (doc.basefile, cmdline))
            # require_success=True is passed so that a failed conversion
            # is reported by runcmd; its return value is not needed here.
            util.runcmd(cmdline, require_success=True)
        # Use the converted file whether it was created now or earlier.
        pdffile = intermediate_pdf

    if os.path.exists(pdffile):
        self.log.debug("%s: Using %s" % (doc.basefile, pdffile))
        # Only the directory matters here; '.foo' is a dummy suffix
        # used to obtain a path inside the intermediate directory.
        intermediate_dir = os.path.dirname(
            self.generic_path(doc.basefile, 'intermediate', '.foo'))
        self.setup_logger('pdfreader', self.config.get('log', 'INFO'))
        pdfreader = PDFReader()
        pdfreader.read(pdffile, intermediate_dir)
        self.parse_from_pdfreader(pdfreader, doc)
    else:
        downloaded_path = self.downloaded_path(doc.basefile)
        intermediate_path = self.generic_path(
            doc.basefile, 'intermediate', '.txt')
        self.log.debug("%s: Using %s (%s)" % (doc.basefile,
                                              downloaded_path,
                                              intermediate_path))
        if not os.path.exists(intermediate_path):
            # Read the HTML as iso-8859-1 and close the handle
            # deterministically instead of leaving it to the GC.
            with codecs.open(downloaded_path,
                             encoding="iso-8859-1") as fp:
                html = fp.read()
            # Keep only the text between <pre> and </pre>, stored as
            # UTF-8 so the reader below can use a fixed encoding.
            util.writefile(intermediate_path, util.extract_text(
                html, '<pre>', '</pre>'), encoding="utf-8")
        textreader = TextReader(intermediate_path, encoding="utf-8")
        self.parse_from_textreader(textreader, doc)
def parse_basefile(self, basefile):
    """Build and return a document for ``basefile`` from its downloaded HTML.

    The downloaded file is read as iso-8859-1. The chunk between
    ``'<pre>\\n <pre>'`` and ``'<hr>'`` (tags kept) is passed to
    ``self.make_meta`` together with ``doc.meta``; the text between
    ``<pre>`` and ``</pre>`` is written to an intermediate UTF-8 ``.txt``
    file, which is then read back through a ``TextReader`` and passed to
    ``self.make_body``. Finally ``self.process_body`` is run over
    ``doc.body``.

    :param basefile: identifier of the document to parse
    :returns: the document created by ``self.make_document()``, with
              ``uri`` and ``lang`` set here and ``meta``/``body`` handed
              to the ``make_meta``/``make_body``/``process_body`` helpers
    """
    # create an Document instance with an initialized doc.meta RDFLib graph
    doc = self.make_document()
    intermediate_path = self.generic_path(basefile, 'intermediate', '.txt')
    downloaded_path = self.downloaded_path(basefile)
    doc.uri = self.canonical_uri(basefile)
    doc.lang = "sv"

    # Close the file handle deterministically instead of relying on
    # garbage collection.
    with codecs.open(downloaded_path, encoding="iso-8859-1") as fp:
        html = fp.read()

    header_chunk = util.extract_text(
        html, '<pre>\n <pre>', '<hr>', strip_tags=False)
    self.make_meta(header_chunk, doc.meta, doc.uri, basefile)

    # Unlike parse(), the intermediate text file is rewritten on every
    # call rather than reused when it already exists.
    util.writefile(intermediate_path, util.extract_text(
        html, '<pre>', '</pre>'), encoding="utf-8")
    reader = TextReader(intermediate_path, encoding="utf-8")
    # The first paragraph is read and discarded before the body is built
    # (presumably the header already handled by make_meta -- confirm).
    reader.readparagraph()
    self.make_body(reader, doc.body)

    # Iterate through body tree and find things to link to (See
    # EurlexTreaties.process_body for inspiration)
    self.process_body(doc.body, '', doc.uri)
    return doc