コード例 #1
0
ファイル: legalref.py プロジェクト: h4ck3rm1k3/ferenda
    def tag(text, tagtable, sliceleft, sliceright):
        """Tag ``text`` through an external script, caching the result on disk.

        The result is stored in a pickle file named after the tagger path
        and the MD5 digest of the text. The external script is only run
        when that file does not exist yet.

        :param text: The text to tag; it is hashed and written to a file
                     opened in binary mode
        :param tagtable: Path of the pickled tagger (not a real tagtable
                         struct)
        :param sliceleft: Not used by this implementation
        :param sliceright: Not used by this implementation
        :returns: The object stored in the result pickle
        """
        tagger_path = tagtable  # remember, not a real tagtable struct
        # 1. The digest of the text names both the text dump and the result
        digest = hashlib.md5(text).hexdigest()
        result_path = "%s-%s.pickle" % (tagger_path, digest)

        if not os.path.exists(result_path):
            # 2. Dump text as string
            text_dump = "%s/%s.txt" % (os.path.dirname(tagger_path), digest)
            with open(text_dump, "wb") as dump:
                dump.write(text)
            # 3. call script (that loads the pickled tagtable + string
            # file, saves tagged text as pickle)
            cmdline = "%s %s %s %s %s" % (python_exe,
                                          tagstring_script,
                                          tagger_path,
                                          text_dump,
                                          digest)
            util.runcmd(cmdline, require_success=True)

        # 4. load tagged text pickle
        # NOTE: pickle.load must only be used on files this process chain
        # created itself.
        with open(result_path, "rb") as result:
            return pickle.load(result)
コード例 #2
0
ファイル: sfs.py プロジェクト: staffanm/ferenda
 def makeimage(basename, label):
     """Return the path of the label image for ``basename``, creating it if needed.

     An existing image is reused. Otherwise ImageMagick's ``convert`` is
     run to render ``label`` into a new PNG file.

     :param basename: Name of the image file, without directory or suffix
     :param label: Text to render into the image
     :returns: The path of the image
     """
     filename = "res/img/sfs/%s.png" % basename
     if os.path.exists(filename):
         return filename

     util.ensure_dir(filename)
     self.log.info("Creating img %s with label %s" % (filename, label))
     cmd = ('convert -background transparent -fill Grey -font %s -pointsize 10 -size 44x14 -gravity East label:"%s " %s'
            % (font, label, filename))
     util.runcmd(cmd)
     return filename
コード例 #3
0
ファイル: myndfskr.py プロジェクト: h4ck3rm1k3/ferenda
    def textreader_from_basefile(self, basefile, encoding):
        """Return a TextReader over the text extracted from a downloaded PDF.

        The downloaded file is copied to an intermediate ``.pdf`` path,
        ``pdftotext`` is run on the copy, and the copy is then removed.

        :param basefile: The basefile whose downloaded document is read
        :param encoding: Encoding handed on to the TextReader
        :returns: A TextReader for the intermediate ``.txt`` path
        """
        store = self.store
        downloaded = store.downloaded_path(basefile)
        pdf_copy = store.path(basefile, "intermediate", ".pdf")
        text_path = store.path(basefile, "intermediate", ".txt")

        util.copy_if_different(downloaded, pdf_copy)
        # No output name is given; pdftotext is expected to write next to
        # the copy, with a .txt suffix.
        util.runcmd("pdftotext %s" % pdf_copy, require_success=True)
        util.robust_remove(pdf_copy)

        return TextReader(text_path,
                          encoding=encoding,
                          linesep=TextReader.UNIX)
コード例 #4
0
 def makeimage(basename, label):
     """Make sure a label image exists for ``basename`` and return its path.

     :param basename: Name of the image file, without directory or suffix
     :param label: Text that ``convert`` renders into the image
     :returns: The path of the (possibly newly created) PNG file
     """
     filename = "res/img/sfs/%s.png" % basename
     already_rendered = os.path.exists(filename)
     if already_rendered:
         return filename

     util.ensure_dir(filename)
     self.log.info("Creating img %s with label %s" % (filename, label))
     template = 'convert -background transparent -fill gray50 -font %s -pointsize 10 -size 44x14 -gravity East label:"%s " %s'
     util.runcmd(template % (font, label, filename))
     return filename
コード例 #5
0
ファイル: legalref.py プロジェクト: h4ck3rm1k3/ferenda
        def buildTagger(self, production=None, processor=None):
            """Return the path of a pickled tagger, building it if it is missing.

            The pickle is built by an external script run through
            ``python_exe``; its existence is asserted afterwards.

            :param production: Name of the production; part of the pickle
                               file name and passed to the script
            :param processor: Not used by this implementation
            :returns: The file name of the pickled tagger (not a tagtable
                      struct)
            """
            tagger_path = "%s/%s-%s.pickle" % (external_simpleparse_state,
                                               self.declaration_md5,
                                               production)
            if os.path.exists(tagger_path):
                return tagger_path  # filename instead of tagtable struct

            #    3. call the script with python 27 and production
            script_args = (python_exe,
                           buildtagger_script,
                           external_simpleparse_state,
                           self.declaration_md5,
                           production)
            util.runcmd("%s %s %s/%s %s" % script_args, require_success=True)
            #    4. the script builds tagtable and dumps it to a pickle file
            assert os.path.exists(tagger_path)
            return tagger_path  # filename instead of tagtable struct
コード例 #6
0
ファイル: wordreader.py プロジェクト: h4ck3rm1k3/ferenda
    def word_to_docbook(self, indoc, outdoc):
        """Convert a old Word document (.doc) to a pseudo-docbook file through antiword.

        The antiword output is redirected to a temporary file, parsed as
        XML, has its text nodes re-wrapped to 72 columns and is then
        written to ``outdoc``. The temporary file is removed even when the
        conversion or the parsing fails.

        :param indoc: Path to the Word document to convert
        :param outdoc: Path of the docbook file to write
        :raises errors.ExternalCommandError: If antiword exits with a
                                             non-zero return code
        """
        # Local import: mkstemp creates the file itself, which avoids the
        # race between picking a name and creating the file that mktemp has.
        import tempfile

        fd, tmpfile = tempfile.mkstemp()
        os.close(fd)  # only the name is needed; the shell redirect fills it
        try:
            indoc = os.path.normpath(indoc)
            wrapper = textwrap.TextWrapper(break_long_words=False,
                                           width=72)

            util.ensure_dir(outdoc)
            if " " in indoc:
                indoc = '"%s"' % indoc
            cmd = "antiword -x db %s > %s" % (indoc, tmpfile)
            self.log.debug("Executing %s" % cmd)
            (ret, stdout, stderr) = util.runcmd(cmd)

            if ret != 0:
                self.log.error("Docbook conversion failed: %s" % stderr)
                raise errors.ExternalCommandError(
                    "Docbook conversion failed: %s" % stderr.strip())

            tree = ET.parse(tmpfile)
            # getiterator() was removed in Python 3.9; only fall back to it
            # on old versions that lack iter().
            if hasattr(tree, 'iter'):
                elements = tree.iter()
            else:
                elements = tree.getiterator()
            for element in elements:
                if element.text and element.text.strip() != "":
                    # wrap each non-empty line as its own paragraph
                    paragraphs = [wrapper.fill(p)
                                  for p in element.text.split("\n") if p]
                    element.text = "\n\n".join(paragraphs).strip()

            tree.write(outdoc, encoding="utf-8")
        finally:
            if os.path.exists(tmpfile):
                os.unlink(tmpfile)
コード例 #7
0
ファイル: pdfreader.py プロジェクト: h4ck3rm1k3/ferenda
 def crop(self, top=0, left=0, bottom=None, right=None):
     """Removes any :py:class:`ferenda.pdfreader.Textbox` objects that do not fit within the bounding box specified by the parameters.

     The remaining boxes are moved so that their coordinates are relative
     to the top left corner of the bounding box, and ``self.width`` and
     ``self.height`` are set to the size of the bounding box. If the
     background image exists, it is cropped to the same area with
     ImageMagick's ``convert``.

     :param top: Top edge of the bounding box
     :param left: Left edge of the bounding box
     :param bottom: Bottom edge of the bounding box; when omitted, the
                    current ``self.height`` is used
     :param right: Right edge of the bounding box; when omitted, the
                   current ``self.width`` is used
     """
     # boundingbox() gets the arguments exactly as passed, so its own
     # handling of omitted edges is unchanged.
     kept = list(self.boundingbox(top, left, bottom, right))

     # The arithmetic below needs numbers; an omitted edge means the
     # current page edge.
     if bottom is None:
         bottom = self.height
     if right is None:
         right = self.width

     # Moving the origin to (left, top) shifts every horizontal coordinate
     # by ``left`` and every vertical coordinate by ``top``.
     for box in kept:
         box.top = box.top - top
         box.bottom = box.bottom - top
         box.left = box.left - left
         box.right = box.right - left
     self[:] = kept
     self.width = right - left
     self.height = bottom - top
     # Then crop the background images... somehow
     if os.path.exists(self.background):
         cmdline = "convert %s -crop %dx%d+%d+%d +repage %s" % (
             self.background,
             self.width, self.height, left, top,
             self.background + ".new")
         util.runcmd(cmdline, require_success=True)
         util.replace_if_different(
             "%s.new" % self.background, self.background)
コード例 #8
0
ファイル: propositioner.py プロジェクト: h4ck3rm1k3/ferenda
    def parse(self, doc):
        """Fill ``doc`` from the best available downloaded file.

        A downloaded PDF is preferred. If there is none but a word
        processor file exists, it is first converted to an intermediate
        PDF with Open/LibreOffice. If no PDF is available after that, the
        text between ``<pre>`` and ``</pre>`` in the downloaded HTML file
        is extracted and parsed instead.

        :param doc: The document object to fill; its ``basefile`` and
                    ``meta`` are read and its ``uri`` is set
        """
        basefile = doc.basefile
        doc.uri = self.canonical_uri(basefile)
        desc = Describer(doc.meta, doc.uri)
        desc.rdftype(self.rdf_type)
        desc.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        self.infer_triples(desc, basefile)

        # prefer PDF or Word files over the plaintext-containing HTML files
        # FIXME: PDF or Word files are now stored as attachments
        pdffile = self.generic_path(basefile, 'downloaded', '.pdf')

        candidates = [self.generic_path(basefile, 'downloaded', suffix)
                      for suffix in ('.doc', '.docx', '.wpd', '.rtf')]
        # the last existing candidate in the list is the one used
        existing = [path for path in candidates if os.path.exists(path)]
        wordfile = existing[-1] if existing else None

        # if we lack a .pdf file, use Open/LibreOffice to convert any
        # .wpd or .doc file to .pdf first
        if wordfile and not os.path.exists(pdffile):
            intermediate_pdf = self.generic_path(
                basefile, "intermediate", ".pdf")
            if not os.path.exists(intermediate_pdf):
                cmdline = "%s --headless -convert-to pdf -outdir '%s' %s" % (
                    self.config.get('soffice', 'soffice'),
                    os.path.dirname(intermediate_pdf),
                    wordfile)
                self.log.debug(
                    "%s: Converting to PDF: %s" % (basefile, cmdline))
                util.runcmd(cmdline, require_success=True)
            pdffile = intermediate_pdf

        if not os.path.exists(pdffile):
            # no PDF at all: fall back to the text inside the HTML file
            downloaded_path = self.downloaded_path(basefile)
            intermediate_path = self.generic_path(
                basefile, 'intermediate', '.txt')
            self.log.debug("%s: Using %s (%s)" % (basefile,
                                                  downloaded_path,
                                                  intermediate_path))
            if not os.path.exists(intermediate_path):
                html = codecs.open(
                    downloaded_path, encoding="iso-8859-1").read()
                text = util.extract_text(html, '<pre>', '</pre>')
                util.writefile(intermediate_path, text, encoding="utf-8")
            textreader = TextReader(intermediate_path, encoding="utf-8")
            self.parse_from_textreader(textreader, doc)
            return

        self.log.debug("%s: Using %s" % (basefile, pdffile))
        # the '.foo' suffix is a placeholder; only the directory is used
        intermediate_dir = os.path.dirname(
            self.generic_path(basefile, 'intermediate', '.foo'))
        self.setup_logger('pdfreader', self.config.get('log', 'INFO'))
        pdfreader = PDFReader()
        pdfreader.read(pdffile, intermediate_dir)
        self.parse_from_pdfreader(pdfreader, doc)
コード例 #9
0
ファイル: triplestore.py プロジェクト: h4ck3rm1k3/ferenda
 def _run_curl(self, options):
     if options['method'] == 'GET':
         cmd = 'curl -o "%(filename)s" --header "Accept:%(accept)s" "%(url)s"' % options
     elif options['method'] == 'POST':
         cmd = 'curl -X POST --data-binary "@%(filename)s" --header "Content-Type:%(contenttype)s" "%(url)s"' % options
     (ret, stdout, stderr) = util.runcmd(cmd)
     if ret != 0:
         raise errors.TriplestoreError(stderr)
     return stdout
コード例 #10
0
ファイル: triplestore.py プロジェクト: mavteam/ferenda
 def _run_curl(self, options):
     if "<" in options["url"]:
         options["url"] = options["url"].replace("<", "%3C").replace(">", "%3E")
     if options['method'] == 'GET':
         cmd = 'curl -o "%(filename)s" --header "Accept:%(accept)s" "%(url)s"' % options
     elif options['method'] == 'POST':
         cmd = 'curl -X POST --data-binary "@%(filename)s" --header "Content-Type:%(contenttype)s" "%(url)s"' % options
     (ret, stdout, stderr) = util.runcmd(cmd)
     if ret != 0:
         raise errors.TriplestoreError(stderr)
     return stdout
コード例 #11
0
ファイル: testDocStore.py プロジェクト: staffanm/ferenda
 def test_open_intermediate_path(self):
     """open_intermediate writes to, and later finds, the ``.xhtml`` file."""
     store = self.store
     store.intermediate_suffixes = [".html", ".xhtml"]
     with store.open_intermediate("123/a", mode="w", suffix=".xhtml") as out:
         out.write(self.dummytext)

     expected_path = self.p("intermediate/123/a.xhtml" + self.expected_suffix)
     self.assertTrue(os.path.exists(expected_path))

     # element 1 of the runcmd result is the output of ``file``
     result = util.runcmd("file -b --mime-type %s" % expected_path)
     detected = result[1].strip()
     self.assertIn(detected, self.expected_mimetype)

     # note, open_intermediate should open the file with the
     # the .xhtml suffix automatically
     with store.open_intermediate("123/a") as stored:
         self.assertEqual(self.dummytext, stored.read())
コード例 #12
0
ファイル: testUtil.py プロジェクト: zigit/ferenda
    def test_runcmd(self):
        """runcmd reports return code, stdout and stderr, and can raise on failure."""
        filename = os.sep.join([self.dname, "räksmörgås.txt"])
        util.writefile(filename, "räksmörgås")

        # a command that prints a file: ``type`` on Windows, ``cat`` elsewhere
        cmd = "type" if sys.platform == "win32" else "cat"
        retcode, stdout, stderr = util.runcmd("%s %s" % (cmd, filename))
        self.assertEqual(0, retcode)
        self.assertEqual("räksmörgås", stdout)
        self.assertEqual("", stderr)

        # a missing binary gives a non-zero return code and output on stderr
        missing = "non-existing-binary foo"
        retcode, stdout, stderr = util.runcmd(missing)
        self.assertNotEqual(0, retcode)
        self.assertNotEqual("", stderr)

        # with require_success, the same failure raises instead
        with self.assertRaises(errors.ExternalCommandError):
            util.runcmd(missing, require_success=True)
コード例 #13
0
 def test_open_intermediate_path(self):
     """A file written with an explicit suffix is found again without it."""
     self.store.intermediate_suffixes = [".html", ".xhtml"]
     writer = self.store.open_intermediate("123/a", mode="w", suffix=".xhtml")
     with writer as fp:
         fp.write(self.dummytext)

     relative = "intermediate/123/a.xhtml" + self.expected_suffix
     filename = self.p(relative)
     self.assertTrue(os.path.exists(filename))

     # element 1 of the runcmd result is the output of ``file``
     file_output = util.runcmd("file -b --mime-type %s" % filename)[1]
     self.assertIn(file_output.strip(), self.expected_mimetype)

     # note, open_intermediate should open the file with the
     # the .xhtml suffix automatically
     with self.store.open_intermediate("123/a") as fp:
         contents = fp.read()
         self.assertEqual(self.dummytext, contents)
コード例 #14
0
ファイル: testDocStore.py プロジェクト: staffanm/ferenda
    def test_open_binary(self):
        """Binary data written through store.open is stored and read back unchanged."""
        basefile, maindir, suffix = "basefile", "maindir", ".suffix"
        wanted_filename = self.store.path(basefile, maindir, suffix)
        # the smallest possible PNG image
        png = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82'
        with self.store.open(basefile, maindir, suffix, "wb") as out:
            out.write(png)

        # element 1 of the runcmd result is the output of ``file``
        result = util.runcmd("file -b --mime-type %s" % wanted_filename)
        self.assertEqual("image/png", result[1].strip())

        # make sure that the open method also can be used
        with self.store.open(basefile, maindir, suffix, "rb") as stored:
            self.assertEqual(png, stored.read())
コード例 #15
0
    def test_open_binary(self):
        """store.open handles binary modes for both writing and reading."""
        store = self.store
        wanted_filename = store.path("basefile", "maindir", ".suffix")
        # the smallest possible PNG image
        bindata = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82'
        with store.open("basefile", "maindir", ".suffix", "wb") as fp:
            fp.write(bindata)

        # ``file`` should recognise what was written as a PNG image
        cmdline = "file -b --mime-type %s" % wanted_filename
        mimetype = util.runcmd(cmdline)[1].strip()
        self.assertEqual("image/png", mimetype)

        # make sure that the open method also can be used
        with store.open("basefile", "maindir", ".suffix", "rb") as fp:
            roundtripped = fp.read()
        self.assertEqual(bindata, roundtripped)
コード例 #16
0
ファイル: pdfreader.py プロジェクト: h4ck3rm1k3/ferenda
    def read(self, pdffile, workdir):
        """Initializes a PDFReader object from an existing PDF file.

        The PDF is converted with ``pdftohtml`` unless the XML file in
        ``workdir`` is already newer than the PDF. The XML file is then
        handed to ``self._parse_xml``.

        :param pdffile: The full path to the PDF file
        :param workdir: A directory where intermediate files (particularly
                        background PNG files) are stored
        :returns: The result of ``self._parse_xml`` for the XML file

        """
        self.filename = pdffile
        assert os.path.exists(pdffile), "PDF %s not found" % pdffile

        basename = os.path.basename(pdffile)
        stem = os.path.splitext(basename)[0]
        xmlfile = os.sep.join((workdir, stem + ".xml"))

        if util.outfile_is_newer([pdffile], xmlfile):
            return self._parse_xml(xmlfile)

        tmppdffile = os.sep.join([workdir, basename])
        util.copy_if_different(pdffile, tmppdffile)

        # two pass coding: First use -c (complex) to extract
        # background pictures, then use -xml to get easy-to-parse
        # text with bounding boxes.
        first_pass = "pdftohtml -nodrm -c %s" % tmppdffile
        self.log.debug("Converting: %s" % first_pass)
        util.runcmd(first_pass, require_success=True)

        # we won't need the html files
        html_names = [name for name in os.listdir(workdir)
                      if name.endswith(".html")]
        for name in html_names:
            os.unlink(workdir + os.sep + name)

        second_pass = "pdftohtml -nodrm -xml %s" % tmppdffile
        self.log.debug("Converting: %s" % second_pass)
        util.runcmd(second_pass, require_success=True)
        return self._parse_xml(xmlfile)
コード例 #17
0
 def download(self):
     """Update (or create) the local Mercurial clone and walk its log.

     An existing clone is updated with ``hg pull`` and ``hg update``;
     otherwise ``self.start_url`` is cloned into a ``clone`` directory
     below ``<datadir>/<alias>``.

     NOTE(review): everything after the clone handling looks like an
     unfinished sketch -- see the notes in the body. Only the path
     construction has been fixed here.
     """
     # str.join takes a single iterable, so the parts must be in a list
     hg_clone_path = os.sep.join([self.config.datadir, self.alias, 'clone'])
     if os.path.exists(hg_clone_path):
         self.log.debug("Pulling latest changes")
         util.runcmd("hg pull", cwd=hg_clone_path)
         self.log.debug("Updating local clone")
         util.runcmd("hg update", cwd=hg_clone_path)
     else:
         hg_clone_parent = os.sep.join([self.config.datadir, self.alias])
         util.runcmd("hg clone %s clone" % self.start_url,
                     cwd=hg_clone_parent)
     new_last_rev = None
     cmd = "LANGUAGE=C hg log -v"
     # NOTE(review): the output of this command is discarded, and it is
     # not run with cwd set to the clone -- confirm intent.
     util.runcmd(cmd)
     # NOTE(review): this iterates over the characters of the command
     # string, not over revisions; rev.id and rev.files below cannot work
     # until the log output is parsed into revision objects.
     for rev in "LANGUAGE=C hg log -v":
         if not new_last_rev:
             new_last_rev = rev.id
         if rev > self.config.last_rev:
             for f in rev.files:  # rev.files only contain proper pep files
                 # NOTE(review): this only builds a string; no command is
                 # run, and the argument order does not match the
                 # placeholders.
                 "hg cat -r %s > downloaded/%s-r%s.txt" % (
                     f, self.store.downloaded_path(f), rev.id)
         else:
             self.config.last_rev = new_last_rev
             break
コード例 #18
0
ファイル: pep.py プロジェクト: staffanm/ferenda
 def download(self):
     """Update (or create) the local Mercurial clone and walk its log.

     An existing clone is updated with ``hg pull`` and ``hg update``;
     otherwise ``self.start_url`` is cloned into a ``clone`` directory
     below ``<datadir>/<alias>``.

     NOTE(review): everything after the clone handling looks like an
     unfinished sketch -- see the notes in the body. Only the path
     construction has been fixed here.
     """
     # str.join takes a single iterable, so the parts must be in a list
     hg_clone_path = os.sep.join([self.config.datadir, self.alias, 'clone'])
     if os.path.exists(hg_clone_path):
         self.log.debug("Pulling latest changes")
         util.runcmd("hg pull", cwd=hg_clone_path)
         self.log.debug("Updating local clone")
         util.runcmd("hg update", cwd=hg_clone_path)
     else:
         hg_clone_parent = os.sep.join([self.config.datadir, self.alias])
         util.runcmd("hg clone %s clone" % self.start_url,
                     cwd=hg_clone_parent)
     new_last_rev = None
     cmd = "LANGUAGE=C hg log -v"
     # NOTE(review): the output of this command is discarded, and it is
     # not run with cwd set to the clone -- confirm intent.
     util.runcmd(cmd)
     # NOTE(review): this iterates over the characters of the command
     # string, not over revisions; rev.id and rev.files below cannot work
     # until the log output is parsed into revision objects.
     for rev in "LANGUAGE=C hg log -v":
         if not new_last_rev:
             new_last_rev = rev.id
         if rev > self.config.last_rev:
             for f in rev.files:  # rev.files only contain proper pep files
                 # NOTE(review): this only builds a string; no command is
                 # run, and the argument order does not match the
                 # placeholders.
                 "hg cat -r %s > downloaded/%s-r%s.txt" % (
                     f, self.store.downloaded_path(f), rev.id)
         else:
             self.config.last_rev = new_last_rev
             break
コード例 #19
0
ファイル: wordreader.py プロジェクト: zigit/ferenda
    def word_to_docbook(self, indoc, outfp):
        """Convert a old Word document (.doc) to a pseudo-docbook file through antiword.

        The antiword output is redirected to a temporary file, parsed as
        XML, has its text nodes re-wrapped to 72 columns and is then
        written to ``outfp``. The temporary file is removed even when the
        conversion or the parsing fails.

        :param indoc: Path to the Word document to convert
        :param outfp: Target handed to the parsed tree's ``write`` method
        :raises errors.ExternalCommandError: If antiword exits with a
                                             non-zero return code
        """
        # Local import: mkstemp creates the file itself, which avoids the
        # race between picking a name and creating the file that mktemp has.
        import tempfile

        fd, tmpfile = tempfile.mkstemp()
        os.close(fd)  # only the name is needed; the shell redirect fills it
        try:
            indoc = os.path.normpath(indoc)
            wrapper = textwrap.TextWrapper(break_long_words=False, width=72)
            if " " in indoc:
                indoc = '"%s"' % indoc
            cmd = "antiword -x db %s > %s" % (indoc, tmpfile)
            # make sure HOME is set even on win32 -- antiword seems to require it?
            if 'HOME' not in os.environ and 'USERPROFILE' in os.environ:
                os.environ['HOME'] = os.environ['USERPROFILE']

            self.log.debug("Executing %s" % cmd)
            (ret, stdout, stderr) = util.runcmd(cmd)

            if ret != 0:
                self.log.error("Docbook conversion failed: %s" % stderr)
                raise errors.ExternalCommandError(
                    "Docbook conversion failed: %s" % stderr.strip())

            # wrap long lines in the docbook output. Maybe should be configurable?
            tree = ET.parse(tmpfile)
            if hasattr(tree, 'iter'):
                iterator = tree.iter()
            else:
                # Python 2.6 way -- results in a PendingDeprecationWarning
                # on newer pythons.
                iterator = tree.getiterator()
            for element in iterator:
                if element.text and element.text.strip() != "":
                    # wrap each non-empty line as its own paragraph
                    paragraphs = [wrapper.fill(p)
                                  for p in element.text.split("\n") if p]
                    element.text = "\n\n".join(paragraphs).strip()
            tree.write(outfp, encoding="utf-8")
        finally:
            if os.path.exists(tmpfile):
                os.unlink(tmpfile)
コード例 #20
0
    def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
        """Given the parameters, return a function that will, given a
        basefile, produce the proper path to that basefile. If the
        parameters indicate a version of the resource that does not
        exist as a static file on disk (like ".../basefile/data.rdf"),
        returns None

        When ``params`` contains "dir", "page" and "format", a PNG image
        of that page is created on demand with ``pdftoppm`` and
        ``convert`` (or an image showing the error, if that fails), and
        a function giving the path of that image is returned.

        :param environ: Mapping that is read for "User-Agent" and "Referer"
        :param basefile: The basefile of the requested document
        :param params: Mapping of request parameters; the keys "extended",
                       "repo", "dir", "page", "format", "attachment",
                       "version", "diff" and "from" are examined
        :param contenttype: Looked up in ``self._mimemap``
        :param suffix: Looked up in ``self._suffixmap``
        :returns: A path function, or None
        :raises ValueError: If ``params["repo"]`` does not match the alias
                            of any subrepo
        :raises Forbidden: If the user agent matches the configured
                           ``imagerobots`` pattern

        """
        if "extended" in params:
            # by definition, this means that we don't have a static file on disk
            return None
        # try to lookup pathfunc from contenttype (or possibly suffix, or maybe params)
        if "repo" in params:
            # this must be a CompositeRepository that has the get_instance method
            for cls in self.repo.subrepos:
                if cls.alias == params['repo']:
                    repo = self.repo.get_instance(cls)
                    break
            else:
                raise ValueError("No '%s' repo is a subrepo of %s" %
                                 (params['repo'], self.repo.alias))
        else:
            repo = self.repo

        if "dir" in params:
            # a "dir" value other than these three raises KeyError
            method = {
                'downloaded': repo.store.downloaded_path,
                'intermediate': repo.store.intermediate_path,
                'parsed': repo.store.parsed_path
            }[params["dir"]]
            if "page" in params and "format" in params:
                # check if this is a robot we need to ban (we try to
                # ban them through robots.txt but not all are well
                # behaved)
                if getattr(self.repo.config, 'imagerobots', None):
                    # NOTE(review): environ.get("User-Agent") is None when
                    # the key is missing, and re.search then raises
                    # TypeError -- confirm the key is always present.
                    if re.search(self.repo.config.imagerobots,
                                 environ.get("User-Agent")):
                        raise Forbidden()
                # NOTE(review): baseparam is never used in this function
                baseparam = "-size 400x300 -pointsize 12 -gravity center"
                baseattach = None
                try:
                    if "attachment" in params:
                        sourcefile = method(basefile,
                                            attachment=params["attachment"])
                    else:
                        sourcefile = method(basefile)

                    # we might run this on a host to where we haven't
                    # transferred the downloaded files -- try to
                    # re-aquire them now that someone wants to watch
                    # them.
                    if not os.path.exists(sourcefile):
                        repo.download(basefile)

                    # a failed assert ends up in the except clause below,
                    # like any other error
                    assert params["page"].isdigit(
                    ), "%s is not a digit" % params["page"]
                    assert params["format"] in ("png", "jpg"), (
                        "%s is not a valid image format" % params["format"])
                    baseattach = "page_%s.%s" % (params["page"],
                                                 params["format"])
                    if "attachment" in params:
                        baseattach = "%s_%s" % (params["attachment"],
                                                baseattach)
                    outfile = repo.store.intermediate_path(
                        basefile, attachment=baseattach)
                    if not os.path.exists(outfile):
                        # NOTE(review): the .png replacements below do
                        # nothing when format is "jpg" -- confirm that jpg
                        # requests work as intended.
                        # params['page'] is 0-based, pdftoppm is 1-based
                        cmdline = "pdftoppm -f %s -singlefile -png %s %s" % (
                            int(params["page"]) + 1, sourcefile,
                            outfile.replace(".png", ".tmp"))
                        util.runcmd(cmdline, require_success=True)
                        # trim the rendered page into the final file, then
                        # remove the untrimmed one
                        cmdline = "convert %s -trim %s" % (outfile.replace(
                            ".png", ".tmp.png"), outfile)
                        util.runcmd(cmdline, require_success=True)
                        os.unlink(outfile.replace(".png", ".tmp.png"))
                        # record who asked for the newly created image
                        logfile = self.repo.config._parent.datadir + os.sep + "ua.log"
                        with open(logfile, "a") as fp:
                            fp.write("%s\t%s\t%s\n" %
                                     (outfile, environ.get("User-Agent"),
                                      environ.get("Referer")))
                except Exception as e:
                    # any failure is turned into an image that shows the
                    # traceback and the error message
                    if not baseattach:
                        baseattach = "page_error.png"
                    outfile = repo.store.intermediate_path(
                        basefile, attachment=baseattach)
                    errormsg = "%s\n%s: %s" % ("".join(
                        traceback.format_tb(
                            sys.exc_info()[2])), e.__class__.__name__, str(e))
                    errormsg = errormsg.replace("\n",
                                                "\\n").replace("'", "\\'")
                    # NOTE(review): double quotes in errormsg are not
                    # escaped although the label is double-quoted here.
                    cmdline = 'convert  label:"%s" %s' % (errormsg, outfile)
                    util.runcmd(cmdline, require_success=True)
                method = partial(repo.store.intermediate_path,
                                 attachment=baseattach)
                return method  # we really don't want to partial()
                # this method again below
        elif "version" in params:
            method = partial(repo.store.generated_path,
                             version=params["version"])
        elif "diff" in params and params.get("from") != "None":
            return None
        elif contenttype in self._mimemap:
            method = getattr(repo.store, self._mimemap[contenttype])
        elif suffix in self._suffixmap:
            method = getattr(repo.store, self._suffixmap[suffix])
        elif "attachment" in params and mimetypes.guess_extension(contenttype):
            method = repo.store.generated_path
        else:
            # method = repo.store.generated_path
            return None

        # bind the attachment so that callers only have to pass the basefile
        if "attachment" in params:
            method = partial(method, attachment=params["attachment"])

        return method
コード例 #21
0
ファイル: eurlex.py プロジェクト: mavteam/ferenda
    def query_webservice(self, query, page):
        """Send an expert query to the EUR-Lex SOAP web service.

        The request is sent with the ``curl`` binary when
        ``self.config.curl`` is set, otherwise with ``self.session.post``
        through ``util.robust_fetch``.

        :param query: The expert query; it is XML-escaped before being
                      placed in the envelope
        :param page: The number of the result page to request
        :returns: The response object (a ``FakeResponse`` when curl is
                  used)
        :raises errors.DownloadError: If the response status is 500 or 301
        """
        # this is the only soap template we'll need, so we include it
        # verbatim to avoid having a dependency on a soap module like
        # zeep.
        endpoint = 'https://eur-lex.europa.eu/EURLexWebService'
        envelope = """<soap-env:Envelope xmlns:soap-env="http://www.w3.org/2003/05/soap-envelope">
  <soap-env:Header>
    <wsse:Security xmlns:wsse="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd">
      <wsse:UsernameToken>
        <wsse:Username>%s</wsse:Username>
        <wsse:Password Type="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-username-token-profile-1.0#PasswordText">%s</wsse:Password>
      </wsse:UsernameToken>
    </wsse:Security>
  </soap-env:Header>
  <soap-env:Body>
    <sear:searchRequest xmlns:sear="http://eur-lex.europa.eu/search">
      <sear:expertQuery>%s</sear:expertQuery>
      <sear:page>%s</sear:page>
      <sear:pageSize>%s</sear:pageSize>
      <sear:searchLanguage>%s</sear:searchLanguage>
    </sear:searchRequest>
  </soap-env:Body>
</soap-env:Envelope>
""" % (self.config.username, self.config.password, escape(query, quote=False), page, self.pagesize, self.lang)
        headers = {'Content-Type': 'application/soap+xml; charset=utf-8; action="https://eur-lex.europa.eu/EURLexWebService/doQuery"',
                   'SOAPAction': 'https://eur-lex.europa.eu/EURLexWebService/doQuery'}
        if self.config.curl:
            # dump the envelope to a tempfile
            headerstr = ""
            for k, v in headers.items():
                assert "'" not in v  # if it is, we need to work on escaping it
                headerstr += " --header '%s: %s'" % (k, v)
            # curl must run inside this block, before the envelope file
            # is closed
            with tempfile.NamedTemporaryFile() as fp:
                fp.write(envelope.encode("utf-8"))
                fp.flush()
                envelopename = fp.name
                # curl dumps the response headers (-D) to this second file
                headerfiledesc, headerfilename = tempfile.mkstemp()
                # the placeholders are filled from the local variables above
                cmd = 'curl -L -X POST -D %(headerfilename)s --data-binary "@%(envelopename)s" %(headerstr)s %(endpoint)s' % locals()
                (ret, stdout, stderr) = util.runcmd(cmd)
            # NOTE(review): ret is not checked, and the header file is not
            # removed if reading it fails.
            headerfp = os.fdopen(headerfiledesc)
            header = headerfp.read()
            headerfp.close()
            util.robust_remove(headerfilename)
            # the first line is the status line, the rest are header fields
            status, headers = header.split('\n', 1)
            prot, code, msg = status.split(" ", 2)
            headers = dict(email.message_from_string(headers).items())
            res = FakeResponse(int(code), stdout, headers)
        else:
            res = util.robust_fetch(self.session.post, endpoint, self.log,
                                    raise_for_status=False,
                                    data=envelope, headers=headers,
                                    timeout=10)
            
        if res.status_code == 500:
            # the response body carries the error code and message
            tree = etree.parse(BytesIO(res.content))
            statuscode = tree.find(".//{http://www.w3.org/2003/05/soap-envelope}Subcode")[0].text
            statusmsg = tree.find(".//{http://www.w3.org/2003/05/soap-envelope}Text").text
            raise errors.DownloadError("%s: %s" % (statuscode, statusmsg))
        elif res.status_code == 301:
            # the call to robust_fetch or curl should have followed
            # the redirect, but at this point we'll just have to
            # report the error
            raise errors.DownloadError("%s: was redirected to %s" % (endpoint, res.headers['Location']))
        return res
コード例 #22
0
    def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
        """Given the parameters, return a function that will, given a
        basefile, produce the proper path to that basefile. If the
        parameters indicate a version of the resource that does not
        exist as a static file on disk (like ".../basefile/data.rdf"),
        returns None

        When ``params`` contains "dir", "page" and "format", a PNG image
        of that page is created on demand with ``pdftoppm`` and
        ``convert`` (or an image showing the error message, if that
        fails), and a function giving the path of that image is returned.

        :param environ: Not used by this implementation
        :param basefile: The basefile of the requested document
        :param params: Mapping of request parameters; the keys "repo",
                       "dir", "page", "format" and "attachment" are
                       examined
        :param contenttype: Looked up in ``self._mimemap``
        :param suffix: Looked up in ``self._suffixmap``
        :returns: A path function, or None
        :raises ValueError: If ``params["repo"]`` does not match the alias
                            of any subrepo

        """
        # try to lookup pathfunc from contenttype (or possibly suffix, or maybe params)
        if "repo" in params:
            # this must be a CompositeRepository that has the get_instance method
            for cls in self.repo.subrepos:
                if cls.alias == params['repo']:
                    repo = self.repo.get_instance(cls)
                    break
            else:
                raise ValueError("No '%s' repo is a subrepo of %s" %
                                 (params['repo'], self.repo.alias))
        else:
            repo = self.repo

        if "dir" in params:
            # a "dir" value other than these two raises KeyError
            method = {'downloaded': repo.store.downloaded_path,
                      'parsed': repo.store.parsed_path}[params["dir"]]
            if "page" in params and "format" in params:
                baseattach = None
                try:
                    if "attachment" in params:
                        sourcefile = method(basefile, attachment=params["attachment"])
                    else:
                        sourcefile = method(basefile)

                    # we might run this on a host to where we haven't
                    # transferred the downloaded files -- try to
                    # re-aquire them now that someone wants to watch
                    # them.
                    if not os.path.exists(sourcefile):
                        repo.download(basefile)

                    # a failed assert ends up in the except clause below,
                    # like any other error
                    assert params["page"].isdigit(), "%s is not a digit" % params["page"]
                    assert params["format"] in ("png", "jpg"), ("%s is not a valid image format" %
                                                                params["format"])
                    baseattach = "page_%s.%s" % (params["page"], params["format"])
                    if "attachment" in params:
                        baseattach = "%s_%s" % (params["attachment"], baseattach)
                    outfile = repo.store.intermediate_path(basefile, attachment=baseattach)
                    if not os.path.exists(outfile):
                        # params['page'] is 0-based, pdftoppm is 1-based
                        cmdline = "pdftoppm -f %s -singlefile -png %s %s" % (int(params["page"])+1, sourcefile, outfile.replace(".png",".tmp"))
                        util.runcmd(cmdline, require_success=True)
                        # trim the rendered page into the final file, then
                        # remove the untrimmed one
                        cmdline = "convert %s -trim %s" % (outfile.replace(".png", ".tmp.png"), outfile)
                        util.runcmd(cmdline, require_success=True)
                        os.unlink(outfile.replace(".png", ".tmp.png"))
                except Exception as e:
                    # any failure is turned into an image that shows the
                    # error message
                    if not baseattach:
                        baseattach = "page_error.png"
                    outfile = repo.store.intermediate_path(basefile, attachment=baseattach)
                    errormsg = str(e).replace("\n", "\\n").replace("'", "\\'")
                    cmdline = 'convert  label:"%s" %s' % (errormsg, outfile)
                    util.runcmd(cmdline, require_success=True)
                method = partial(repo.store.intermediate_path, attachment=baseattach)
                return method  # we really don't want to partial()
                               # this method again below
        elif contenttype in self._mimemap and not basefile.endswith("/data"):
            method = getattr(repo.store, self._mimemap[contenttype])
        elif suffix in self._suffixmap and not basefile.endswith("/data"):
            method = getattr(repo.store, self._suffixmap[suffix])
        elif "attachment" in params and mimetypes.guess_extension(contenttype):
            method = repo.store.generated_path
        else:
            # method = repo.store.generated_path
            return None

        # bind the attachment so that callers only have to pass the basefile
        if "attachment" in params:
            method = partial(method, attachment=params["attachment"])

        return method
コード例 #23
0
ファイル: requesthandler.py プロジェクト: staffanm/ferenda
    def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
        """Given the parameters, return a function that will, given a
        basefile, produce the proper path to that basefile. If the
        parameters indicate a version of the resource that does not
        exist as a static file on disk (like ".../basefile/data.rdf"),
        returns None

        :param environ: Not used by this method.
        :param basefile: The basefile of the requested resource.
        :param params: Request parameters. The keys examined here are
                       ``repo`` (alias of a subrepo to use instead of
                       ``self.repo``), ``dir`` (one of ``downloaded``,
                       ``intermediate`` or ``parsed``), ``page`` and
                       ``format`` (together they request a rendered
                       image of a single page) and ``attachment``.
        :param contenttype: Content type, looked up in ``self._mimemap``
                            to find the name of a store method.
        :param suffix: Suffix, looked up in ``self._suffixmap`` to find
                       the name of a store method.
        :returns: A callable producing a path, or None
        :raises ValueError: If ``params['repo']`` is not the alias of
                            any subrepo of ``self.repo``.
        :raises KeyError: If ``params['dir']`` is not one of the
                          supported directory names.

        """
        # try to lookup pathfunc from contenttype (or possibly suffix, or maybe params)
        if "repo" in params:
            # this must be a CompositeRepository that has the get_instance method
            for cls in self.repo.subrepos:
                if cls.alias == params['repo']:
                    repo = self.repo.get_instance(cls)
                    break
            else:
                raise ValueError("No '%s' repo is a subrepo of %s" %
                                 (params['repo'], self.repo.alias))
        else:
            repo = self.repo

        if "dir" in params:
            method = {'downloaded': repo.store.downloaded_path,
                      'intermediate': repo.store.intermediate_path,
                      'parsed': repo.store.parsed_path}[params["dir"]]
            if "page" in params and "format" in params:
                # A single page rendered as an image was requested. Any
                # failure below is turned into an image containing the
                # error text, so a path function is returned regardless.
                baseattach = None
                try:
                    if "attachment" in params:
                        sourcefile = method(basefile, attachment=params["attachment"])
                    else:
                        sourcefile = method(basefile)

                    # we might run this on a host to where we haven't
                    # transferred the downloaded files -- try to
                    # re-acquire them now that someone wants to watch
                    # them.
                    if not os.path.exists(sourcefile):
                        repo.download(basefile)

                    # Explicit checks rather than assert statements:
                    # asserts are removed when python runs with -O, and
                    # these values end up in file names and command
                    # lines.
                    if not params["page"].isdigit():
                        raise ValueError("%s is not a digit" % params["page"])
                    if params["format"] not in ("png", "jpg"):
                        raise ValueError("%s is not a valid image format" %
                                         params["format"])
                    baseattach = "page_%s.%s" % (params["page"], params["format"])
                    if "attachment" in params:
                        baseattach = "%s_%s" % (params["attachment"], baseattach)
                    outfile = repo.store.intermediate_path(basefile, attachment=baseattach)
                    if not os.path.exists(outfile):
                        # Only split off the final extension, so that an
                        # attachment name that itself contains ".png"
                        # doesn't garble the temporary file name.
                        root, ext = os.path.splitext(outfile)
                        tmproot = root + ".tmp"
                        # pdftoppm appends the extension to the root
                        # name it's given
                        tmpfile = tmproot + ext
                        imgflag = "-jpeg" if params["format"] == "jpg" else "-png"
                        # NOTE(review): the paths are interpolated
                        # unquoted into the command line -- confirm
                        # that util.runcmd/the store can't be handed
                        # paths with spaces or shell metacharacters.
                        # params['page'] is 0-based, pdftoppm is 1-based
                        cmdline = "pdftoppm -f %s -singlefile %s %s %s" % (
                            int(params["page"]) + 1, imgflag, sourcefile, tmproot)
                        util.runcmd(cmdline, require_success=True)
                        cmdline = "convert %s -trim %s" % (tmpfile, outfile)
                        util.runcmd(cmdline, require_success=True)
                        os.unlink(tmpfile)
                except Exception as e:
                    if not baseattach:
                        baseattach = "page_error.png"
                    outfile = repo.store.intermediate_path(basefile, attachment=baseattach)
                    errormsg = "%s\n%s: %s" % ("".join(traceback.format_tb(sys.exc_info()[2])), e.__class__.__name__, str(e))
                    # The message is placed within double quotes on the
                    # command line. It always contains double quotes
                    # (from the traceback) and may echo request
                    # parameters, so escape everything that's special
                    # within double quotes. The backslash must be
                    # handled first.
                    for special in ('\\', '"', '$', '`'):
                        errormsg = errormsg.replace(special, '\\' + special)
                    errormsg = errormsg.replace("\n", "\\n").replace("'", "\\'")
                    cmdline = 'convert  label:"%s" %s' % (errormsg, outfile)
                    util.runcmd(cmdline, require_success=True)
                method = partial(repo.store.intermediate_path, attachment=baseattach)
                return method  # we really don't want to partial()
                               # this method again below
        elif contenttype in self._mimemap and not basefile.endswith("/data"):
            method = getattr(repo.store, self._mimemap[contenttype])
        elif suffix in self._suffixmap and not basefile.endswith("/data"):
            method = getattr(repo.store, self._suffixmap[suffix])
        elif "attachment" in params and mimetypes.guess_extension(contenttype):
            method = repo.store.generated_path
        else:
            # method = repo.store.generated_path
            return None

        if "attachment" in params:
            method = partial(method, attachment=params["attachment"])

        return method