コード例 #1
0
ファイル: propositioner.py プロジェクト: h4ck3rm1k3/ferenda
    def parse(self, doc):
        """Populate ``doc`` from the best available downloaded source file.

        Sets ``doc.uri``, records basic metadata in ``doc.meta`` and then
        parses the document content from one of three sources, in order of
        preference:

        1. a downloaded ``.pdf`` file,
        2. a downloaded ``.doc``/``.docx``/``.wpd``/``.rtf`` file, which is
           converted to PDF with the configured ``soffice`` command,
        3. the downloaded HTML file, from which a text chunk is extracted
           with ``util.extract_text`` using ``<pre>``/``</pre>`` as
           delimiters and cached as an intermediate ``.txt`` file.

        :param doc: the document to fill in; ``doc.basefile`` is read,
                    ``doc.uri`` is assigned and ``doc.meta`` is handed to
                    the ``Describer``.
        """
        doc.uri = self.canonical_uri(doc.basefile)
        d = Describer(doc.meta, doc.uri)
        d.rdftype(self.rdf_type)
        d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        self.infer_triples(d, doc.basefile)

        # prefer PDF or Word files over the plaintext-containing HTML files
        # FIXME: PDF or Word files are now stored as attachments

        pdffile = self.generic_path(doc.basefile, 'downloaded', '.pdf')

        wordfiles = [self.generic_path(doc.basefile, 'downloaded', suffix)
                     for suffix in ('.doc', '.docx', '.wpd', '.rtf')]
        # If several candidates exist, the last one in the list wins
        # (this mirrors the long-standing selection order).
        existing_wordfiles = [f for f in wordfiles if os.path.exists(f)]
        wordfile = existing_wordfiles[-1] if existing_wordfiles else None

        # if we lack a .pdf file, use Open/LibreOffice to convert any
        # .wpd or .doc file to .pdf first
        if wordfile and not os.path.exists(pdffile):
            intermediate_pdf = self.generic_path(
                doc.basefile, "intermediate", ".pdf")
            if not os.path.exists(intermediate_pdf):
                # Quote both paths so that file names containing spaces
                # survive being passed through the command line.
                cmdline = "%s --headless -convert-to pdf -outdir '%s' '%s'" % (
                    self.config.get('soffice', 'soffice'),
                    os.path.dirname(intermediate_pdf),
                    wordfile)
                self.log.debug(
                    "%s: Converting to PDF: %s" % (doc.basefile, cmdline))
                # require_success=True: presumably makes a failed conversion
                # raise instead of continuing -- verify against util.runcmd.
                util.runcmd(cmdline, require_success=True)
            pdffile = intermediate_pdf

        if os.path.exists(pdffile):
            self.log.debug("%s: Using %s" % (doc.basefile, pdffile))
            # generic_path yields a file path; only its directory is needed
            # here, so the '.foo' suffix is a throwaway placeholder.
            intermediate_dir = os.path.dirname(
                self.generic_path(doc.basefile, 'intermediate', '.foo'))
            self.setup_logger('pdfreader', self.config.get('log', 'INFO'))
            pdfreader = PDFReader()
            pdfreader.read(pdffile, intermediate_dir)
            self.parse_from_pdfreader(pdfreader, doc)
        else:
            downloaded_path = self.downloaded_path(doc.basefile)
            intermediate_path = self.generic_path(
                doc.basefile, 'intermediate', '.txt')
            self.log.debug("%s: Using %s (%s)" % (doc.basefile,
                           downloaded_path, intermediate_path))
            if not os.path.exists(intermediate_path):
                # Close the downloaded file deterministically instead of
                # leaving the handle to the garbage collector.
                with codecs.open(
                        downloaded_path, encoding="iso-8859-1") as fp:
                    html = fp.read()
                util.writefile(intermediate_path, util.extract_text(
                    html, '<pre>', '</pre>'), encoding="utf-8")
            textreader = TextReader(intermediate_path, encoding="utf-8")
            self.parse_from_textreader(textreader, doc)
コード例 #2
0
ファイル: direktiv.py プロジェクト: h4ck3rm1k3/ferenda
    def parse_basefile(self, basefile):
        """Parse the downloaded HTML file for ``basefile`` into a document.

        Reads the downloaded file as ISO-8859-1, builds metadata from a
        header chunk, writes an extracted text chunk to an intermediate
        UTF-8 ``.txt`` file and builds the document body from that file.

        :param basefile: identifier of the document to parse; used to
                         derive the downloaded and intermediate paths and
                         the canonical URI.
        :returns: the object created by ``self.make_document()``, with
                  ``uri`` and ``lang`` set and ``meta``/``body`` passed
                  through ``make_meta``/``make_body``/``process_body``.
        """
        # create an Document instance with an initialized doc.meta RDFLib graph
        doc = self.make_document()
        intermediate_path = self.generic_path(basefile, 'intermediate', '.txt')
        downloaded_path = self.downloaded_path(basefile)
        doc.uri = self.canonical_uri(basefile)
        doc.lang = "sv"
        # Close the downloaded file deterministically instead of leaving the
        # handle to the garbage collector.
        with codecs.open(downloaded_path, encoding="iso-8859-1") as fp:
            html = fp.read()
        header_chunk = util.extract_text(
            html, '<pre>\n   <pre>', '<hr>', strip_tags=False)
        self.make_meta(header_chunk, doc.meta, doc.uri, basefile)
        # The intermediate text file is rewritten on every call.
        util.writefile(intermediate_path, util.extract_text(
            html, '<pre>', '</pre>'), encoding="utf-8")
        reader = TextReader(intermediate_path, encoding="utf-8")
        # Result discarded: presumably skips the leading paragraph (the
        # header already handled by make_meta) -- TODO confirm.
        reader.readparagraph()
        self.make_body(reader, doc.body)

        # Iterate through body tree and find things to link to (See
        # EurlexTreaties.process_body for inspiration)
        self.process_body(doc.body, '', doc.uri)
        return doc