Example #1
 def download(self):
     # do something with static/sitenews.txt --> split into
     # <datadir>/sitenews/<timestamp>.txt
     ofp = temppath = path = basefile = None
     with codecs.open(self.resourceloader.filename(self.config.newsfile),
                      encoding="utf-8") as fp:
         for line in fp:
             m = self.re_news_subjectline(line)
             if m:
                 if ofp:
                     ofp.close()
                     if util.replace_if_different(temppath, path):
                         self.log.info("%s: creating news item" % basefile)
                 d = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
                 basefile = str(int(d.timestamp()))
                 path = self.store.downloaded_path(basefile)
                 fileno, temppath = tempfile.mkstemp(text=True)
                 util.ensure_dir(path)
                 # ofp = codecs.open(path, "w", encoding="utf-8")
                 ofp = os.fdopen(fileno, "w")
             ofp.write(line)
         ofp.close()
         if util.replace_if_different(temppath, path):
             self.log.info("%s: download OK (creating news item)" %
                           basefile)
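A pattern shared by nearly all of these examples: util.ensure_dir is called with the path of a file that is about to be written, not with a directory. A minimal sketch of the assumed contract (ferenda's actual implementation may differ in detail):

import os

def ensure_dir(filepath):
    # Assumed behaviour of ferenda.util.ensure_dir: create the
    # directory that will contain filepath, if it is missing.
    dirname = os.path.dirname(filepath)
    if dirname and not os.path.isdir(dirname):
        os.makedirs(dirname, exist_ok=True)

Example #1 also shows the companion pattern: write to a temporary file, then let util.replace_if_different move it over the target, apparently only when the content differs (judging by how its return value gates the log message), so unchanged files keep their timestamps.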
Example #2
    def test_doctype(self):
        base = self.datadir+os.sep
        util.ensure_dir(base+"teststyle-doctype.xslt")
        with open(base+"teststyle-doctype.xslt","w") as fp:
            fp.write("""<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="html"
	        doctype-system="about:legacy-compat"
	        omit-xml-declaration="yes"
	        encoding='utf-8'
	        indent="yes"/>
    <xsl:template match="/">
      <html>
        <head>
          <title><xsl:value-of select="/doc/title"/></title>
        </head>
        <body>
          <h1>hello world</h1>
        </body>
      </html>
    </xsl:template>
</xsl:stylesheet>
""")
        with open(base+"infile.xml","w") as fp:
            fp.write("""<doc><title>Document title</title></doc>""")
        t = Transformer("XSLT", base+"teststyle-doctype.xslt", "xsl", None, "")
        t.transform_file(base+"infile.xml", base+"outfile.xml")
        self.assertTrue(util.readfile(base+"outfile.xml").startswith('<!DOCTYPE html SYSTEM "about:legacy-compat">'))
Example #3
    def put_files_in_place(self):
        self.repo = None
        self.repos = [DocumentRepository(datadir=self.datadir,
                                         storetype=self.storetype,
                                         storelocation=self.storelocation,
                                         storerepository=self.storerepository,
                                         indextype=self.indextype,
                                         indexlocation=self.indexlocation)]
        # create three basic documents (at the parsed and distilled stages)
        #
        # each document should have a dcterms:title, a dcterms:issued and a
        # dcterms:publisher, which has a URI
        #
        # basefile  dcterms:title  dcterms:issued  dcterms:publisher
        # 123/a     "Example"      2014-01-04      <http://example.org/publisher/A>
        # 123/b     "Example 2"    2013-09-23      <http://example.org/publisher/B>
        # 123/c     "Of needles"   2014-05-06      <http://example.org/publisher/B>
        for i in ('a','b','c'):
            self.ttl_to_rdf_xml("test/files/base/distilled/123/%s.ttl" % i,
                                self.repos[0].store.distilled_path("123/%s" % i),
                                self.repos[0].store)
            util.ensure_dir(self.repos[0].store.parsed_path("123/%s" % i))
            shutil.copy2("test/files/base/parsed/123/%s.xhtml" % i,
                                self.repos[0].store.parsed_path("123/%s" % i))
            self.repos[0].relate("123/%s" % i)
            # prepare a base.ttl (or var-common.js) that maps
            # <http://example.org/publisher/B> to "Publishing house B"

        self.repos[0].rdf_type = self.repos[0].ns['bibo'].Standard
Example #4
    def create_external_resources(self, doc):
        resources = []
        cssfile = self.store.parsed_path(doc.basefile, attachment="index.css")
        resources.append(cssfile)
        util.ensure_dir(cssfile)
        with open(cssfile, "w") as fp:
            # Create CSS header with fontspecs
            for pdf in doc.body:
                assert isinstance(pdf, PDFReader), "doc.body is %s, not PDFReader -- still need to access fontspecs etc" % type(pdf)
                for spec in list(pdf.fontspec.values()):
                    fp.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                             (spec['id'], spec['size'], spec['family'], spec['color']))

            # 2. Copy all created png files to their correct locations
            totcnt = 0
            for pdf in doc.body:
                pdfbase = os.path.splitext(os.path.basename(pdf.filename))[0]
                cnt = 0
                for page in pdf:
                    totcnt += 1
                    cnt += 1
                    if page.background:
                        src = self.store.intermediate_path(
                            doc.basefile, attachment=os.path.basename(page.background))
                        dest = self.store.parsed_path(
                            doc.basefile, attachment=os.path.basename(page.background))
                        if util.copy_if_different(src, dest):
                            self.log.debug("Copied %s to %s" % (src, dest))
                        resources.append(dest)
                        fp.write("#page%03d { background: url('%s');}\n" %
                                 (cnt, os.path.basename(dest)))
        return resources
Example #5
 def native_to_file(self, nativedata, outfile):
     res = etree.tostring(nativedata,
                          pretty_print=self.format,
                          encoding="utf-8")
     util.ensure_dir(outfile)
     with open(outfile, "wb") as fp:
         fp.write(res)
Example #6
 def test_parse(self):
     util.ensure_dir(self.repo.store.downloaded_path("sample"))
     shutil.copy2("test/files/pdfreader/sample.pdf",
                  self.repo.store.downloaded_path("sample"))
     try:
         self.repo.required_predicates = []
         self.repo.parse("sample")
     except errors.ExternalCommandError:
         # print("pdftohtml error: retrying")
         # for systems that don't have pdftohtml, we copy the expected
         # intermediate files, so that we can test the rest of the logic
         targetdir = os.path.dirname(self.repo.store.intermediate_path("sample"))
         # print("working around by copying test/files/pdfreader/intermediate tree to %s" % targetdir)
         if os.path.exists(targetdir):
             shutil.rmtree(targetdir)
         shutil.copytree("test/files/pdfreader/intermediate",
                         targetdir)
         # make really sure the xml file has a newer timestamp than the PDF
         from time import sleep
         sleep(0.01)
         os.utime(targetdir+"/index.xml", None)
         try:
             self.repo.parse("sample")
         except errors.ExternalCommandError as e:
             print("ExternalCommandError on rerun.\n    targetdir: %s\n    %s exists: %s\n    message: %s" %
                   (targetdir, targetdir+"/index.xml", os.path.exists(targetdir+"/index.xml"), e))
         # print("Workaround succeeded: %s" % os.path.exists(targetdir+"/index.xml"))
     
     p = self.repo.store.datadir
     self.assertTrue(os.path.exists(p+'/intermediate/sample/index001.png'))
     self.assertFalse(os.path.exists(p+'/intermediate/sample/index.pdf'))
     self.assertTrue(os.path.exists(p+'/intermediate/sample/index.xml'))
     self.assertTrue(os.path.exists(p+'/parsed/sample/index001.png'))
     self.assertTrue(os.path.exists(p+'/parsed/sample/index.css'))
     self.assertTrue(os.path.exists(p+'/parsed/sample/index.xhtml'))
Example #7
    def graph_to_image(self, graph, imageformat, filename):
        import pydot
        import rdflib
        dot = pydot.Dot()
        # dot.progs = {"dot": "c:/Program Files/Graphviz2.26.3/bin/dot.exe"}

        # code from rdflib.util.graph_to_dot, but adjusted to handle unicode
        nodes = {}
        for s, o in graph.subject_objects():
            for i in s, o:
                if i not in list(nodes.keys()):
                    if isinstance(i, rdflib.BNode):
                        nodes[i] = repr(i)[7:]
                    elif isinstance(i, rdflib.Literal):
                        nodes[i] = repr(i)[16:-1]
                    elif isinstance(i, rdflib.URIRef):
                        nodes[i] = repr(i)[22:-2]

        for s, p, o in graph.triples((None, None, None)):
            dot.add_edge(pydot.Edge(nodes[s], nodes[o], label=repr(p)[22:-2]))

        self.log.debug("Writing %s format to %s" % (imageformat, filename))
        util.ensure_dir(filename)
        dot.write(path=filename, prog="dot", format=imageformat)
        self.log.debug("Wrote %s" % filename)
Example #8
    def word_to_docbook(self, indoc, outdoc):
        """Convert a old Word document (.doc) to a pseudo-docbook file through antiword."""
        tmpfile = mktemp()
        indoc = os.path.normpath(indoc)
        wrapper = textwrap.TextWrapper(break_long_words=False,
                                       width=72)

        util.ensure_dir(outdoc)
        if " " in indoc:
            indoc = '"%s"' % indoc
        cmd = "antiword -x db %s > %s" % (indoc, tmpfile)
        self.log.debug("Executing %s" % cmd)
        (ret, stdout, stderr) = util.runcmd(cmd)

        if ret != 0:
            self.log.error("Docbook conversion failed: %s" % stderr)
            raise errors.ExternalCommandError(
                "Docbook conversion failed: %s" % stderr.strip())

        tree = ET.parse(tmpfile)
        for element in tree.iter():  # getiterator() was removed in Python 3.9
            if element.text and element.text.strip() != "":
                replacement = ""
                for p in element.text.split("\n"):
                    if p:
                        replacement += wrapper.fill(p) + "\n\n"

                element.text = replacement.strip()

        tree.write(outdoc, encoding="utf-8")
        os.unlink(tmpfile)
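Two fragile spots above: mktemp() is race-prone, and quoting indoc by hand only handles spaces. A hedged sketch of the same antiword invocation using mkstemp and subprocess without a shell, assuming antiword -x db writes the DocBook XML to stdout (as the redirect above implies):

import os
import subprocess
from tempfile import mkstemp

indoc = "old-document.doc"  # example input path (made up)
fd, tmpfile = mkstemp(suffix=".xml")
with os.fdopen(fd, "wb") as out:
    # No shell involved, so spaces or quotes in indoc are harmless
    subprocess.run(["antiword", "-x", "db", indoc], stdout=out, check=True)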
Example #9
    def archive(self, basefile, version):
        """Moves the current version of a document to an archive. All
        files related to the document are moved (downloaded, parsed,
        generated files and any existing attachment files).

        :param basefile: The basefile of the document to archive
        :type basefile: str
        :param version: The version id to archive under
        :type version: str
        """

        for meth in (self.downloaded_path, self.documententry_path,
                     self.parsed_path, self.serialized_path,
                     self.distilled_path, self.annotation_path,
                     self.generated_path):
            # FIXME: what about intermediate? Ignore them as they
            # should be able to be regenerated at any time?
            src = meth(basefile)
            dest = meth(basefile, version)
            if self.storage_policy == "dir" and meth in (self.downloaded_path,
                                                         self.parsed_path,
                                                         self.generated_path):
                src = os.path.dirname(src)
                dest = os.path.dirname(dest)
            if not os.path.exists(src):
                continue
            if os.path.exists(dest):
                raise errors.ArchivingError(
                    "Archive destination %s for basefile %s version %s already exists!"
                    % (dest, basefile, version))
            # self.log.debug("Archiving %s to %s" % (src,dest))
            # print("Archiving %s to %s" % (src,dest))
            util.ensure_dir(dest)
            shutil.move(src, dest)
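A hedged usage sketch (store is an assumed DocumentStore instance; the basefile and version values are made up): archiving moves every stage of a document aside in one call.

store.archive("123/a", version="1")
# The current downloaded/parsed/generated etc. files for 123/a have now
# been moved to the archive location for version "1"; a fresh download
# of that basefile starts from a clean slate.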
Example #10
    def make_api_files(self):
        # this should create the following files under resourcedir
        # api/context.json (aliased to /json-ld/context.json if legacyapi)
        # api/terms.json (aliased to /var/terms.json if legacyapi)
        # api/common.json (aliased to /var/common.json if legacyapi)
        # MAYBE api/ui/  - copied from ferenda/res/ui
        files = []
        context = os.sep.join([self.resourcedir, "api", "context.json"])
        if self.config.legacyapi:
            self.log.info("Creating API files for legacyapi")
            contextpath = "/json-ld/context.json"
            termspath = "/var/terms"
            commonpath = "/var/common"
        else:
            # FIXME: create correct URL path
            contextpath = "/rsrc/api/context.json"
            termspath = "/rsrc/api/terms.json"
            commonpath = "/rsrc/api/common.json"
        util.ensure_dir(context)
        with open(context, "w") as fp:
            contextdict = self._get_json_context()
            s = json.dumps({"@context": contextdict},
                           separators=(', ', ': '),
                           indent=4,
                           sort_keys=True)
            fp.write(s)
        files.append(self._filepath_to_urlpath(context, 2))

        common = os.sep.join([self.resourcedir, "api", "common.json"])
        terms = os.sep.join([self.resourcedir, "api", "terms.json"])

        for (filename, func,
             urlpath) in ((common, self._get_common_graph, commonpath),
                          (terms, self._get_term_graph, termspath)):
            g = func(self.config.url + urlpath[1:])
            d = json.loads(
                g.serialize(format="json-ld", context=contextdict,
                            indent=4).decode("utf-8"))
            # d might not contain a @context (if contextdict == {}, ie
            # no repos are given)
            if '@context' in d:
                d['@context'] = contextpath
            if self.config.legacyapi:
                d = self._convert_legacy_jsonld(d,
                                                self.config.url + urlpath[1:])
            with open(filename, "w") as fp:
                s = json.dumps(d,
                               indent=4,
                               separators=(', ', ': '),
                               sort_keys=True)
                fp.write(s)

            files.append(self._filepath_to_urlpath(filename, 2))

        if self.config.legacyapi:
            # copy ui explorer app to <url>/rsrc/ui/ -- this does not get
            # included in files
            targetdir = os.sep.join([self.resourcedir, "ui"])
            self.resourceloader.extractdir("ui", targetdir)
        return files
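Summarizing the comment at the top of the method, the resulting layout in the non-legacy case looks like this:

# <resourcedir>/api/context.json  -> served at /rsrc/api/context.json
# <resourcedir>/api/terms.json    -> served at /rsrc/api/terms.json
# <resourcedir>/api/common.json   -> served at /rsrc/api/common.json
# (with legacyapi=True the same content is published under
# /json-ld/context.json, /var/terms and /var/common instead)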
Example #11
    def save(self, path=None):
        """Saves the state of the documententry to a JSON file at *path*. If
        *path* is not provided, uses the path that the object was initialized
        with.

        """
        if not path:
            path = self._path  # better be there
            
        # The below concise way of creating a dict will yield a
        # future.types.newdict.newdict, whose .keys() method yields a
        # dictionary-keyiterator object, not a standard sortable
        # list. This fails with json.dump(sort_keys=True).
        #
        #  d = dict((k, v) for (k, v) in self.__dict__.items() if k[0] != "_")
        #
        # So we create a standard py2 dict by using literals:
        d = {}
        for (k, v) in self.__dict__.items():
            if k[0] != "_":
                d[k] = v
        if isinstance(self.summary, Literal) and self.summary.datatype == RDF.XMLLiteral:
            d["summary_type"] = "html"

        util.ensure_dir(path)
        with open(path, "w") as fp:
            s = json.dumps(d, default=util.json_default_date, indent=2,
                           separators=(', ', ': '), sort_keys=True)
            fp.write(s)
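On Python 3, where the future library's newdict is not in play, the guarded loop above collapses to a plain comprehension; a hedged simplification of what the comment describes:

# Equivalent on Python 3: a real dict whose keys sort fine under
# json.dumps(sort_keys=True)
d = {k: v for k, v in self.__dict__.items() if not k.startswith("_")}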
Example #12
    def save(self, path=None):
        """Saves the state of the documententry to a JSON file at *path*. If
        *path* is not provided, uses the path that the object was initialized
        with.

        """
        if not path:
            path = self._path  # better be there

        # The below concise way of creating a dict will yield a
        # future.types.newdict.newdict, whose .keys() method yields a
        # dictionary-keyiterator object, not a standard sortable
        # list. This fails with json.dump(sort_keys=True).
        #
        #  d = dict((k, v) for (k, v) in self.__dict__.items() if k[0] != "_")
        #
        # So we create a standard py2 dict by using literals:
        d = {}
        for (k, v) in self.__dict__.items():
            if k[0] != "_":
                d[k] = v
        if isinstance(self.summary,
                      Literal) and self.summary.datatype == RDF.XMLLiteral:
            d["summary_type"] = "html"

        util.ensure_dir(path)
        with open(path, "w") as fp:
            s = json.dumps(d,
                           default=util.json_default_date,
                           indent=2,
                           separators=(', ', ': '),
                           sort_keys=True)
            fp.write(s)
Example #13
 def test_parse(self):
     
     util.ensure_dir(self.repo.store.downloaded_path("sample"))
     shutil.copy2("test/files/pdfreader/sample.pdf",
                  self.repo.store.downloaded_path("sample"))
     try:
         self.repo.parse("sample")
     except errors.ExternalCommandError:
         # print("pdftohtml error: retrying")
         # for systems that don't have pdftohtml, we copy the expected
         # intermediate files, so that we can test the rest of the logic
         targetdir = os.path.dirname(self.repo.store.intermediate_path("sample"))
         # print("working around by copying to %s" % targetdir)
         if os.path.exists(targetdir):
             shutil.rmtree(targetdir)
         shutil.copytree("test/files/pdfreader/intermediate",
                         targetdir)
         self.repo.parse("sample")
         # print("Workaround succeeded")
     p = self.repo.store.datadir
     self.assertTrue(os.path.exists(p+'/intermediate/sample/index001.png'))
     self.assertTrue(os.path.exists(p+'/intermediate/sample/index.pdf'))
     self.assertTrue(os.path.exists(p+'/intermediate/sample/index.xml'))
     self.assertTrue(os.path.exists(p+'/parsed/sample/index001.png'))
     self.assertTrue(os.path.exists(p+'/parsed/sample/index.css'))
     self.assertTrue(os.path.exists(p+'/parsed/sample/index.xhtml'))
Example #14
 def setUp(self):
     super(TOC, self).setUp()
     resources = self.datadir + os.sep + "rsrc" + os.sep + "resources.xml"
     util.ensure_dir(resources)
     shutil.copy2(
         "%s/files/base/rsrc/resources.xml" % os.path.dirname(__file__),
         resources)
Example #15
    def download_ftp(self, dirname, recurse, user, password, connection=None):
        self.log.debug('Listing contents of %s' % dirname)
        lines = []
        if not connection:
            connection = FTP('ftp.dom.se')
            connection.login(user, password)

        connection.cwd(dirname)
        connection.retrlines('LIST', lines.append)

        for line in lines:
            parts = line.split()
            filename = parts[-1].strip()
            if line.startswith('d') and recurse:
                self.download(filename, recurse)
            elif line.startswith('-'):
                basefile = os.path.splitext(filename)[0]
                if dirname:
                    basefile = dirname + "/" + basefile
                localpath = self.store.downloaded_path(basefile)
                if os.path.exists(localpath) and not self.config.force:
                    pass  # we already got this
                else:
                    util.ensure_dir(localpath)
                    self.log.debug('Fetching %s to %s' % (filename,
                                                          localpath))
                    connection.retrbinary('RETR %s' % filename,
                                          # FIXME: retrbinary calls .close()?
                                          open(localpath, 'wb').write)
                    self.process_zipfile(localpath)
        connection.cwd('/')
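On the FIXME above: ftplib's retrbinary only ever sees the bound write method, never the file object itself, so it cannot close it; the open(...).write form leaves closing to garbage collection. A drop-in variant of that call which closes the local file deterministically:

with open(localpath, 'wb') as localfp:
    connection.retrbinary('RETR %s' % filename, localfp.write)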
Example #17
 def metrics(self, metricspath=None, plotpath=None, startpage=0,
             pagecount=None, force=False):
     docsegments = self.documents
     if len(docsegments) == 1:
         return super(PropAnalyzer, self).metrics(metricspath,
                                                  plotpath,
                                                  startpage,
                                                  pagecount, force)
     else:
         r = []
         exclude = []
         mainidx = None
         for idx, (startpage, pagecount, tag) in enumerate(docsegments):
             r.append(super(PropAnalyzer,
                              self).metrics(startpage=startpage,
                                            pagecount=pagecount))
             if tag != 'main':
                 exclude.extend(list(range(startpage, startpage+pagecount)))
             elif mainidx is None:
                 mainidx = idx
     r[mainidx]['excludedpages'] = exclude
     # since we don't pass metricspath to super().metrics, that
     # func does not create a metrics.json cache file. So we
     # generate that now (using the same data as we return)
     util.ensure_dir(metricspath)
     with open(metricspath, "w") as fp:
         s = json.dumps(r[mainidx], indent=4, separators=(', ', ': '), sort_keys=True)
         fp.write(s)
     return r[mainidx]
Example #18
    def archive(self, basefile, version):
        """Moves the current version of a document to an archive. All
        files related to the document are moved (downloaded, parsed,
        generated files and any existing attachment files).

        :param basefile: The basefile of the document to archive
        :type basefile: str
        :param version: The version id to archive under
        :type version: str
        """

        for meth in (self.downloaded_path, self.documententry_path,
                     self.parsed_path, self.serialized_path,
                     self.distilled_path,
                     self.annotation_path, self.generated_path):
            # FIXME: what about intermediate? Ignore them as they
            # should be able to be regenerated at any time?
            src = meth(basefile)
            dest = meth(basefile, version)
            if self.storage_policy == "dir" and meth in (self.downloaded_path,
                                                         self.parsed_path,
                                                         self.generated_path):
                src = os.path.dirname(src)
                dest = os.path.dirname(dest)
            if not os.path.exists(src):
                continue
            if os.path.exists(dest):
                raise errors.ArchivingError(
                    "Archive destination %s for basefile %s version %s already exists!" % (dest, basefile, version))
            # self.log.debug("Archiving %s to %s" % (src,dest))
            # print("Archiving %s to %s" % (src,dest))
            util.ensure_dir(dest)
            shutil.move(src, dest)
Example #19
    def open(self, basefile, maindir, suffix, mode="r", version=None, attachment=None):
        """
        Context manager that opens files for reading or
        writing. The parameters are the same as for :meth:`~ferenda.DocumentStore.path`, and the
        note is applicable here as well -- use
        :meth:`~ferenda.DocumentStore.open_downloaded`, :meth:`~ferenda.DocumentStore.open_parsed` et al if
        possible.

        Example:
        
        >>> store = DocumentStore(datadir="/tmp/base")
        >>> with store.open('123/a', 'parsed', '.xhtml', mode="w") as fp:
        ...     res = fp.write("hello world")
        >>> os.path.exists("/tmp/base/parsed/123/a.xhtml")
        True

        """
        filename = self.path(basefile, maindir, suffix, version, attachment)
        fp = NamedTemporaryFile(mode, delete=False)
        fp.realname = filename
        try:
            yield fp
        finally:
            tempname = fp.name
            fp.close()
            if not os.path.exists(filename) or not filecmp.cmp(tempname, filename):
                util.ensure_dir(filename)
                shutil.move(tempname, filename)
            else:
                os.unlink(tempname)
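The design here is an atomic-ish write: output goes to a NamedTemporaryFile and is only moved over the real file when the content actually differs, so unchanged documents keep their timestamps. The wrapper methods mentioned in the docstring cover the common cases; a hedged sketch assuming open_parsed fills in maindir and suffix:

# Equivalent to the doctest above, with 'parsed' and '.xhtml' assumed
# to be supplied by the wrapper:
with store.open_parsed('123/a', mode="w") as fp:
    fp.write("hello world")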
Example #20
 def close(self, *args, **kwargs):
     if "w" in self.mode:
         tempname = util.name_from_fp(self.fp)
         ret = self.fp.close()
         if not os.path.exists(self.filename) or not filecmp.cmp(
                 tempname, self.filename):
             util.ensure_dir(self.filename)
             shutil.move(tempname, self.filename)
             # since _open uses NamedTemporaryFile, which creates
             # files only readable by the creating user, we need to
             # set more liberal permissions. FIXME: This should
             # respect os.umask()
             os.chmod(
                 self.filename, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                 | stat.S_IWGRP | stat.S_IROTH)
         else:
             os.unlink(tempname)
         return ret
     else:
         # This is needed sometimes since
         # Bzip2File/LZMAFile/GzipFile don't close the open file
         # objects that they wrap
         if hasattr(self.fp,
                    '_fp'):  # for Bzip2File/LZMAFile with IOBufferedReader
             self.fp._fp.close()
         if hasattr(self.fp,
                    'fileobj'):  # for GzipFile in the same situation
             self.fp.fileobj.close()
         return self.fp.close()
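A hedged sketch for the FIXME above: derive the permission bits from the process umask instead of hard-coding them.

import os

def chmod_respecting_umask(path):
    umask = os.umask(0)   # os.umask both sets and returns the mask...
    os.umask(umask)       # ...so restore the old value immediately
    os.chmod(path, 0o666 & ~umask)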
Example #21
 def native_to_file(self, nativedata, outfile, doctype=None):
     extra = {}
     if doctype:
         extra['doctype'] = doctype
     res = etree.tostring(nativedata, pretty_print=self.format, encoding="utf-8", **extra)
     util.ensure_dir(outfile)
     with open(outfile, "wb") as fp:
         fp.write(res)
Example #22
 def makeimage(basename, label):
     filename = "res/img/sfs/%s.png" % basename
     if not os.path.exists(filename):
         util.ensure_dir(filename)
         self.log.info("Creating img %s with label %s" %
                       (filename, label))
         cmd = 'convert -background transparent -fill Grey -font %s -pointsize 10 -size 44x14 -gravity East label:"%s " %s' % (font, label, filename)
         util.runcmd(cmd)
     return filename
Example #23
    def _process_file(self, filename, buf, destdir, origin=""):
        """
        Helper function to concatenate or copy CSS/JS (optionally
        processing them with e.g. Scss) or other files to the correct place
        under the web root directory.

        :param filename: The name (relative to the ferenda package) of the file
        :param buf: A buffer into which the contents of the file are written
                    (if combineresources == True)
        :param destdir: The directory into which the file will be copied
                        (unless combineresources == True)
        :param origin: The source of the configuration that specifies this file
        :returns: The URL path of the resulting file, relative to the web root
                  (or None if combineresources == True)
        :rtype: str
        """
        if filename.startswith("http://") or filename.startswith("https://"):
            if self.config.combineresources:
                raise errors.ConfigurationError(
                    "makeresources: Can't use combineresources=True in combination with external js/css URLs (%s)"
                    % filename)
            self.log.debug("Using external url %s" % filename)
            return filename
        try:
            fp = self.resourceloader.openfp(filename, binary=True)
        except errors.ResourceNotFound:
            self.log.warning("file %(filename)s (specified in %(origin)s)"
                             " doesn't exist" % locals())
            return None

        (base, ext) = os.path.splitext(filename)

        if self.config.combineresources:
            self.log.debug("combining %s into buffer" % filename)
            d = fp.read()
            buf.write(d)
            fp.close()
            return None
        else:
            # FIXME: don't copy (at least not log) if the outfile
            # already exists.
            # self.log.debug("writing %s out to %s" % (filename, destdir))
            outfile = destdir + os.sep + os.path.basename(filename)
            if (os.path.islink(outfile) and os.path.relpath(
                    os.path.join(os.path.dirname(outfile),
                                 os.readlink(outfile)))
                    == util.name_from_fp(fp)):
                self.log.warning(
                    "%s is a symlink to source file %s, won't overwrite" %
                    (outfile, util.name_from_fp(fp)))
            else:
                util.ensure_dir(outfile)
                with open(outfile, "wb") as fp2:
                    fp2.write(fp.read())
                fp.close()
            return self._filepath_to_urlpath(outfile, 2)
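A hedged usage sketch (the builder instance and resource names are made up): combine two CSS resources into one buffer, or copy each into the web root, depending on combineresources.

import io

buf = io.BytesIO()
for f in ("css/normalize.css", "css/main.css"):  # resource names made up
    urlpath = builder._process_file(f, buf, "data/rsrc/css", origin="cssfiles")
    # urlpath is None when combineresources=True; otherwise it is the
    # URL path of the copied file relative to the web root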
Example #24
 def makeimage(basename, label):
     filename = "res/img/sfs/%s.png" % basename
     if not os.path.exists(filename):
         util.ensure_dir(filename)
         self.log.info("Creating img %s with label %s" %
                       (filename, label))
         cmd = 'convert -background transparent -fill gray50 -font %s -pointsize 10 -size 44x14 -gravity East label:"%s " %s' % (
             font, label, filename)
         util.runcmd(cmd)
     return filename
Example #25
    def download(self, basefile=None):
        # Get all "term sets" (used dcterms:subject Objects, wiki pages
        # describing legal concepts, Swedish Wikipedia pages...)
        terms = defaultdict(dict)

        # 1) Query the triplestore for all dcterms:subject triples (is this
        # semantically sensible for a "download" action -- the content
        # isn't really external?) -- term set "subjects" (these come
        # from both court cases and legal definitions in law text)
        sq = """
        PREFIX dcterms:<http://purl.org/dc/terms/>
        PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>

        SELECT ?uri ?subject ?label
        WHERE { {?uri dcterms:subject ?subject . }
                OPTIONAL {?subject rdfs:label ?label . } }
        """
        store = TripleStore.connect(self.config.storetype,
                                    self.config.storelocation,
                                    self.config.storerepository)
        results = store.select(sq, "python")
        for row in results:
            if 'label' in row:
                label = row['label']
            else:
                label = self.basefile_from_uri(row['subject'])
                if label is None:
                    self.log.warning("could not determine keyword from %s" % row['subject'])
                    continue
            
            sanitized = self.sanitize_term(label)
            if sanitized:
                if sanitized not in terms:
                    terms[sanitized]['subjects'] = []
                terms[sanitized]['subjects'].append(row['uri'])

        self.log.debug("Retrieved %s subject terms from triplestore" % len(terms))

        for termset_func in self.termset_funcs:
            termset_func(terms)

        for term in terms:
            term = self.sanitize_term(term)
            if not term:
                continue
            oldterms = ""
            termpath = self.store.downloaded_path(term)
            if os.path.exists(termpath):
                oldterms = yaml.safe_load(util.readfile(termpath))
            if terms[term] != oldterms:
                util.ensure_dir(termpath)
                util.writefile(termpath, yaml.dump(terms[term], default_flow_style=False))
                self.log.info("%s: in %s termsets" % (term, len(terms[term])))
            else:
                self.log.debug("%s: skipped" % term)
Example #26
 def test_feed_param(self):
     tocdata = b"<!-- specific feed goes here -->"
     tocpath = self.repo.store.resourcepath("feed/a.atom")
     util.ensure_dir(tocpath)
     with open(tocpath, "wb") as fp:
         fp.write(tocdata)
     self.env["PATH_INFO"] = "/dataset/base/feed.atom?title=a"
     status, headers, content = self.call_wsgi(self.env)
     want = ["200 OK", {'Content-Type': 'application/atom+xml'}, tocdata]
     self.assertResponse(want[0], want[1], want[2], status, headers,
                         content)
Example #27
 def test_load(self):
     path = self.repo.store.documententry_path("123/a")
     util.ensure_dir(path)
     with open(path, "w") as fp:
         fp.write(self.basic_json)
     d = DocumentEntry(path=path)
     self.assertEqual(d.orig_checked, datetime(2013,3,27,20,46,37))
     self.assertIsNone(d.orig_updated)
     self.assertEqual(d.orig_url,'http://source.example.org/doc/123/a')
     self.assertEqual(d.id,'http://example.org/123/a')
     self.assertEqual('<DocumentEntry id=http://example.org/123/a>', repr(d))
Example #28
 def native_to_file(self, nativedata, outfile, doctype=None):
     extra = {}
     if doctype:
         extra['doctype'] = doctype
     res = etree.tostring(nativedata,
                          pretty_print=self.format,
                          encoding="utf-8",
                          **extra)
     util.ensure_dir(outfile)
     with open(outfile, "wb") as fp:
         fp.write(res)
Example #29
 def test_load_status(self):
     path = self.repo.store.documententry_path("123/a")
     util.ensure_dir(path)
     with open(path, "w") as fp:
         fp.write(self.status_json)
     d = DocumentEntry(path=path)
     self.assertEqual(datetime(2018,8,14,18,15,00), d.status['download']['date'])
     self.assertEqual(datetime(2018,8,14,18,16,00), d.status['parse']['date'])
     self.assertEqual(datetime(2018,8,14,18,17,00), d.status['relate']['date'])
     self.assertEqual(datetime(2018,8,14,18,18,00), d.status['generate']['date'])
     self.assertEqual("2018-08-14T18:18:00", d.status['generate']['not_a_date'])
Example #30
    def extractdir(self, resourcedir, target, suffixes=None):
        """Extract all file resources contained in the specified
        resource directory to the target directory.
        
        Searches all loadpaths and optionally the Resources API for
        any file contained within. This means the target dir may end
        up with e.g. one file from a high-priority path and other files
        from the system dirs/resources. This in turn makes it easy to
        override a single file in a larger set of resource files.

        Even if the resourcedir contains resources in subdirectories
        (e.g. "source/sub/dir/resource.xml"), extraction is always to the
        top-level target directory (e.g. "target/resource.xml").

        """
        if not suffixes:
            suffixes = []
        extracted = set()
        for path in self.loadpath:
            if resourcedir and resourcedir != ".":
                path = path+os.sep+resourcedir
            if not os.path.exists(path):
                continue
            # for f in os.listdir(path):
            for f in util.list_dirs(path, suffixes):
                f = f[len(path)+1:]
                basef = os.path.basename(f)
                src = os.sep.join([path, f])
                dest = os.sep.join([target, basef])
                if dest not in extracted and os.path.isfile(src):
                    util.ensure_dir(dest)
                    shutil.copy2(src, dest)
                    extracted.add(dest)

        if self.use_pkg_resources:
            self._check_module_path()
            path = self.resourceprefix
            if resourcedir:
                path = path + os.sep + resourcedir
            for f in pkg_resources.resource_listdir(self.modulename, path):
                src = path + os.sep + f
                dest = target
                dest += os.sep + f
                if (dest not in extracted and not
                    pkg_resources.resource_isdir(self.modulename,
                                                 self.resourceprefix + os.sep + f)):
                    util.ensure_dir(dest)
                    with open(dest, "wb") as fp:
                        readfp = pkg_resources.resource_stream(self.modulename,
                                                               src)
                        fp.write(readfp.read())
                        readfp.close()
                    extracted.add(dest)
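A hedged usage sketch (the loader instance, directory name and suffix list are made up): extract every .xslt resource found under "xsl" on any loadpath, or in the package resources, into one flat directory, with higher-priority loadpaths winning.

loader.extractdir("xsl", "/tmp/extracted", suffixes=[".xslt"])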
Example #32
    def extractdir(self, resourcedir, target, suffixes=None):
        """Extract all file resources contained in the specified
        resource directory to the target directory.
        
        Searches all loadpaths and optionally the Resources API for
        any file contained within. This means the target dir may end
        up with e.g. one file from a high-priority path and other files
        from the system dirs/resources. This in turn makes it easy to
        override a single file in a larger set of resource files.

        Even if the resourcedir contains resources in subdirectories
        (e.g. "source/sub/dir/resource.xml"), extraction is always to the
        top-level target directory (e.g. "target/resource.xml").

        """
        if not suffixes:
            suffixes = []
        extracted = set()
        for path in self.loadpath:
            if resourcedir and resourcedir != ".":
                path = path + os.sep + resourcedir
            if not os.path.exists(path):
                continue
            # for f in os.listdir(path):
            for f in util.list_dirs(path, suffixes):
                f = f[len(path) + 1:]
                basef = os.path.basename(f)
                src = os.sep.join([path, f])
                dest = os.sep.join([target, basef])
                if dest not in extracted and os.path.isfile(src):
                    util.ensure_dir(dest)
                    shutil.copy2(src, dest)
                    extracted.add(dest)

        if self.use_pkg_resources:
            self._check_module_path()
            path = self.resourceprefix
            if resourcedir:
                path = path + os.sep + resourcedir
            for f in pkg_resources.resource_listdir(self.modulename, path):
                src = path + os.sep + f
                dest = target
                dest += os.sep + f
                if (dest not in extracted and not pkg_resources.resource_isdir(
                        self.modulename, self.resourceprefix + os.sep + f)):
                    util.ensure_dir(dest)
                    with open(dest, "wb") as fp:
                        readfp = pkg_resources.resource_stream(
                            self.modulename, src)
                        fp.write(readfp.read())
                        readfp.close()
                    extracted.add(dest)
Example #33
    def wrapper(self, doc):
        ret = f(self, doc)
        updated = self.render_xhtml(doc, self.store.parsed_path(doc.basefile))
        if updated:
            self.log.debug("%s: Created %s" % (doc.basefile, self.store.parsed_path(doc.basefile)))

        # css file + background images + png renderings of text
        self.create_external_resources(doc)

        # Validate that all triples specified in doc.meta and any
        # .meta property on any body object is present in the
        # XHTML+RDFa file.
        distilled_graph = Graph()

        with codecs.open(self.store.parsed_path(doc.basefile),
                         encoding="utf-8") as fp:  # unicode
            distilled_graph.parse(data=fp.read(), format="rdfa",
                                  publicID=doc.uri)
        # The act of parsing from RDFa binds a lot of namespaces
        # in the graph in an unnecessary manner. In particular it
        # binds both 'dc' and 'dcterms' to
        # 'http://purl.org/dc/terms/', which makes serialization
        # less than predictable. Blow these prefixes away.
        distilled_graph.bind("dc", URIRef("http://purl.org/dc/elements/1.1/"))
        distilled_graph.bind(
            "dcterms",
            URIRef("http://example.org/this-prefix-should-not-be-used"))

        util.ensure_dir(self.store.distilled_path(doc.basefile))
        with open(self.store.distilled_path(doc.basefile),
                  "wb") as distilled_file:
            # print("============distilled===============")
            # print(distilled_graph.serialize(format="turtle").decode('utf-8'))
            distilled_graph.serialize(distilled_file, format="pretty-xml")
        self.log.debug(
            '%s: %s triples extracted to %s', doc.basefile,
            len(distilled_graph), self.store.distilled_path(doc.basefile))

        for g in iterate_graphs(doc.body):
            doc.meta += g

        for triple in distilled_graph:
            # len_before = len(doc.meta)
            doc.meta.remove(triple)
            # len_after = len(doc.meta)

        if doc.meta:
            self.log.warning("%s: %d triple(s) from the original metadata was "
                             "not found in the serialized XHTML file:\n%s",
                             doc.basefile, len(doc.meta),
                             doc.meta.serialize(format="nt").decode('utf-8').strip())
        return ret
Example #34
    def _process_file(self, filename, buf, destdir, origin=""):
        """
        Helper function to concatenate or copy CSS/JS (optionally
        processing them with e.g. Scss) or other files to the correct place
        under the web root directory.

        :param filename: The name (relative to the ferenda package) of the file
        :param buf: A buffer into which the contents of the file are written
                    (if combineresources == True)
        :param destdir: The directory into which the file will be copied
                        (unless combineresources == True)
        :param origin: The source of the configuration that specifies this file
        :returns: The URL path of the resulting file, relative to the web root
                  (or None if combineresources == True)
        :rtype: str
        """
        if filename.startswith("http://") or filename.startswith("https://"):
            if self.config.combineresources:
                raise errors.ConfigurationError(
                    "makeresources: Can't use combineresources=True in combination with external js/css URLs (%s)" % filename)
            self.log.debug("Using external url %s" % filename)
            return filename
        try: 
            fp = self.resourceloader.openfp(filename, binary=True)
        except errors.ResourceNotFound:
            self.log.warning("file %(filename)s (specified in %(origin)s)"
                             " doesn't exist" % locals())
            return None

        (base, ext) = os.path.splitext(filename)

        if self.config.combineresources:
            self.log.debug("combining %s into buffer" % filename)
            d = fp.read()
            buf.write(d)
            fp.close()
            return None
        else:
            # FIXME: don't copy (at least not log) if the outfile
            # already exists.
            # self.log.debug("writing %s out to %s" % (filename, destdir))
            outfile = destdir + os.sep + os.path.basename(filename)
            if (os.path.islink(outfile) and
                os.path.relpath(os.path.join(os.path.dirname(outfile),
                                             os.readlink(outfile))) == util.name_from_fp(fp)):
                self.log.warning("%s is a symlink to source file %s, won't overwrite" % (outfile, util.name_from_fp(fp)))
            else:
                util.ensure_dir(outfile)
                with open(outfile, "wb") as fp2:
                    fp2.write(fp.read())
                fp.close()
            return self._filepath_to_urlpath(outfile, 2)
Example #35
    def plot(self, filename, margincounters, stylecounters, metrics):
        try:
            import matplotlib
            matplotlib.use('Agg')
            import matplotlib.pyplot as plt
        except ImportError:
            raise ImportError("You need matplotlib installed")
        # plt.style.use('ggplot')  # looks good but makes histograms unreadable
        matplotlib.rcParams.update({'font.size': 8})
        # width, height in inches
        plt.figure(figsize=((len(margincounters)) * 2, 7))

        # if 6 counters:
        # +0,0--+ +0,1--+ +0,2--+ +0,3--+
        # | LM  | | LEM | | RM  | | REM |
        # +-----+ +-----+ +-----+ +-----+
        # +1,0--+ +1,1--+ +1,2 colspan=2+
        # | TM  | | BM  | |    Styles   |
        # +-----+ +-----+ +-------------+
        #
        # if 4 counters:
        # +0,0--+ +0,1--+ +0,2--+
        # | LM  | | RM  | | TM  |
        # +-----+ +-----+ +-----+
        # +1,0--+ +1,1 colspan=2+
        # | BM  | |    Styles   |
        # +-----+ +-------------+

        # disregard the pageheight/pagewidth counters
        pagewidth = max(margincounters['pagewidth'])
        del margincounters['pagewidth']
        pageheight = max(margincounters['pageheight'])
        del margincounters['pageheight']
        if len(margincounters) == 4:
            coords = ((0, 0), (0, 1), (0, 2), (1, 0), (1, 1))
            grid = (2, 3)
        elif len(margincounters) == 6:
            coords = ((0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2))
            grid = (2, 4)
        else:
            # FIXME: make this dynamic
            raise ValueError("Can't layout other # of counters than 4 or 6")
        marginplots = [plt.subplot2grid(grid, pos) for pos in coords[:-1]]
        self.plot_margins(marginplots, margincounters, metrics, pagewidth,
                          pageheight)

        styleplot = plt.subplot2grid(grid, coords[-1], colspan=2)
        self.plot_styles(styleplot, stylecounters, metrics, plt)

        util.ensure_dir(filename)
        plt.savefig(filename, dpi=150)
        self.log.debug("wrote %s" % filename)
Example #36
    def make_api_files(self):
        # this should create the following files under resourcedir
        # api/context.json (aliased to /json-ld/context.json if legacyapi)
        # api/terms.json (aliased to /var/terms.json if legacyapi)
        # api/common.json (aliased to /var/common.json if legacyapi)
        # MAYBE api/ui/  - copied from ferenda/res/ui
        files = []
        context = os.sep.join([self.resourcedir, "api", "context.json"])
        if self.config.legacyapi:
            self.log.info("Creating API files for legacyapi")
            contextpath = "/json-ld/context.json"
            termspath = "/var/terms"
            commonpath = "/var/common"
        else:
            # FIXME: create correct URL path
            contextpath = "/rsrc/api/context.json"
            termspath = "/rsrc/api/terms.json"
            commonpath = "/rsrc/api/common.json"
        util.ensure_dir(context)
        with open(context, "w") as fp:
            contextdict = self._get_json_context()
            s = json.dumps({"@context": contextdict}, separators=(', ', ': '),
                           indent=4, sort_keys=True)
            fp.write(s)
        files.append(self._filepath_to_urlpath(context, 2))

        common = os.sep.join([self.resourcedir, "api", "common.json"])
        terms = os.sep.join([self.resourcedir, "api", "terms.json"])

        for (filename, func, urlpath) in ((common, self._get_common_graph, commonpath),
                                          (terms,  self._get_term_graph,   termspath)):
            g = func(self.config.url + urlpath[1:])
            d = json.loads(g.serialize(format="json-ld", context=contextdict,
                                       indent=4).decode("utf-8"))
            # d might not contain a @context (if contextdict == {}, ie
            # no repos are given)
            if '@context' in d:
                d['@context'] = contextpath
            if self.config.legacyapi:
                d = self._convert_legacy_jsonld(d, self.config.url + urlpath[1:])
            with open(filename, "w") as fp:
                s = json.dumps(d, indent=4, separators=(', ', ': '), sort_keys=True)
                fp.write(s)
                
            files.append(self._filepath_to_urlpath(filename, 2))

        if self.config.legacyapi:
            # copy ui explorer app to <url>/rsrc/ui/ -- this does not get
            # included in files
            targetdir = os.sep.join([self.resourcedir, "ui"])
            self.resourceloader.extractdir("ui", targetdir)
        return files
Example #37
    def plot(self, filename, margincounters, stylecounters, metrics):
        try:
            import matplotlib
            matplotlib.use('Agg')
            import matplotlib.pyplot as plt
        except ImportError:
            raise ImportError("You need matplotlib installed")
        # plt.style.use('ggplot')  # looks good but makes histograms unreadable
        matplotlib.rcParams.update({'font.size': 8})
        # width, height in inches
        plt.figure(figsize=((len(margincounters)) * 2, 7)) 

        # if 6 counters:
        # +0,0--+ +0,1--+ +0,2--+ +0,3--+
        # | LM  | | LEM | | RM  | | REM |
        # +-----+ +-----+ +-----+ +-----+
        # +1,0--+ +1,1--+ +1,2 colspan=2+
        # | TM  | | BM  | |    Styles   |
        # +-----+ +-----+ +-------------+
        #
        # if 4 counters:
        # +0,0--+ +0,1--+ +0,2--+
        # | LM  | | RM  | | TM  |
        # +-----+ +-----+ +-----+
        # +1,0--+ +1,1 colspan=2+
        # | BM  | |    Styles   |
        # +-----+ +-------------+

        # disregard the pageheight/pagewidth counters
        pagewidth = max(margincounters['pagewidth'])
        del margincounters['pagewidth']
        pageheight = max(margincounters['pageheight'])
        del margincounters['pageheight']
        if len(margincounters) == 4:
            coords = ((0, 0), (0, 1), (0, 2), (1, 0), (1, 1))
            grid = (2, 3)
        elif len(margincounters) == 6:
            coords = ((0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2))
            grid = (2, 4)
        else:
            # FIXME: make this dynamic
            raise ValueError("Can't layout other # of counters than 4 or 6")
        marginplots = [plt.subplot2grid(grid, pos) for pos in coords[:-1]]
        self.plot_margins(marginplots, margincounters, metrics,
                          pagewidth, pageheight)

        styleplot = plt.subplot2grid(grid, coords[-1], colspan=2)
        self.plot_styles(styleplot, stylecounters, metrics, plt)

        util.ensure_dir(filename)
        plt.savefig(filename, dpi=150)
        self.log.debug("wrote %s" % filename)
Example #38
 def test_dataset_param(self):
     util.ensure_dir(self.repo.store.generated_path("123/a"))
     tocdata = b"<!-- specific toc page goes here -->"
     tocpath = self.repo.store.resourcepath("toc/title/a.html")
     with open(tocpath, "wb") as fp:
         fp.write(tocdata)
     self.env["PATH_INFO"] = "/dataset/base?title=a"
     status, headers, content = self.call_wsgi(self.env)
     want = ["200 OK",
             {'Content-Type': 'text/html; charset=utf-8'},
             tocdata]
     self.assertResponse(want[0], want[1], want[2],
                         status, headers, content)
Example #39
 def test_attachment_param(self):
     self.repo.store.storage_policy = "dir"
     util.ensure_dir(self.repo.store.generated_path("123/a"))
     cssdata = b"/* css data goes here */"
     csspath = self.repo.store.generated_path("123/a",
                                              attachment="index.css")
     with open(csspath, "wb") as fp:
         fp.write(cssdata)
     self.env["PATH_INFO"] = "/res/base/123/a?attachment=index.css"
     status, headers, content = self.call_wsgi(self.env)
     want = ["200 OK", {'Content-Type': 'text/css'}, cssdata]
     self.assertResponse(want[0], want[1], want[2], status, headers,
                         content)
Example #40
 def close(self, *args, **kwargs):
     if "w" in self.mode:
         tempname = util.name_from_fp(self.fp)
         ret = self.fp.close()
         if not os.path.exists(self.filename) or not filecmp.cmp(
                 tempname, self.filename):
             util.ensure_dir(self.filename)
             shutil.move(tempname, self.filename)
         else:
             os.unlink(tempname)
         return ret
     else:
         return self.fp.close()
Example #41
 def test_feed_param(self):
     tocdata = b"<!-- specific feed goes here -->"
     tocpath = self.repo.store.resourcepath("feed/a.atom")
     util.ensure_dir(tocpath)
     with open(tocpath, "wb") as fp:
         fp.write(tocdata)
     self.env["PATH_INFO"] = "/dataset/base/feed.atom?title=a"
     status, headers, content = self.call_wsgi(self.env)
     want = ["200 OK",
             {'Content-Type': 'application/atom+xml'},
             tocdata]
     self.assertResponse(want[0], want[1], want[2],
                         status, headers, content)
Example #42
 def test_dataset_param(self):
     util.ensure_dir(self.repo.store.generated_path("123/a"))
     tocdata = b"<!-- specific toc page goes here -->"
     tocpath = self.repo.store.resourcepath("toc/title/a.html")
     with open(tocpath, "wb") as fp:
         fp.write(tocdata)
     self.builder.path = "/dataset/base"
     self.builder.query_string = "title=a"
     status, headers, content = self.call_wsgi()
     want = ["200 OK",
             {'Content-Type': 'text/html; charset=utf-8'},
             tocdata]
     self.assertResponse(want[0], want[1], want[2],
                         status, headers, content)
Example #43
 def test_attachment_param(self):
     self.repo.store.storage_policy = "dir"
     util.ensure_dir(self.repo.store.generated_path("123/a"))
     cssdata = b"/* css data goes here */"
     csspath = self.repo.store.generated_path("123/a", attachment="index.css")
     with open(csspath, "wb") as fp:
         fp.write(cssdata)
     self.env["PATH_INFO"] = "/res/base/123/a?attachment=index.css"
     status, headers, content = self.call_wsgi(self.env)
     want = ["200 OK",
             {'Content-Type': 'text/css'},
             cssdata]
     self.assertResponse(want[0], want[1], want[2],
                         status, headers, content)
Example #44
 def GenerateMap(self, basefile):
     start = time()
     infile = os.path.relpath(self._xmlFileName(basefile))
     head = codecs.open(infile, encoding='utf-8').read(1024)
     m = self.re_xmlbase(head)
     if m:
         uri = "http://rinfo.lagrummet.se/publ/rattsfall/%s" % m.group(1)
         mapfile = self.store.path('generated', 'uri.map', '.new')
         util.ensure_dir(mapfile)
         f = codecs.open(mapfile, 'a', encoding='iso-8859-1')
         f.write("%s\t%s\n" % (m.group(1), basefile))
         f.close()
         self.log.info("%s ok" % basefile)
         return
     else:
         self.log.warning("could not find xml:base in %s" % infile)
Example #45
 def test_load_status(self):
     path = self.repo.store.documententry_path("123/a")
     util.ensure_dir(path)
     with open(path, "w") as fp:
         fp.write(self.status_json)
     d = DocumentEntry(path=path)
     self.assertEqual(datetime(2018, 8, 14, 18, 15, 00),
                      d.status['download']['date'])
     self.assertEqual(datetime(2018, 8, 14, 18, 16, 00),
                      d.status['parse']['date'])
     self.assertEqual(datetime(2018, 8, 14, 18, 17, 00),
                      d.status['relate']['date'])
     self.assertEqual(datetime(2018, 8, 14, 18, 18, 00),
                      d.status['generate']['date'])
     self.assertEqual("2018-08-14T18:18:00",
                      d.status['generate']['not_a_date'])
Example #46
 def write_doc(basefile, page_el):
     writefile = False
     p = self.store.downloaded_path(basefile)
     newcontent = etree.tostring(page_el, encoding="utf-8")
     if not os.path.exists(p):
         writefile = True
     else:
         oldcontent = util.readfile(p, "rb")
         if newcontent != oldcontent:
             writefile = True
     if writefile:
         util.ensure_dir(p)
         with open(p, "wb") as fp:
             fp.write(newcontent)
             self.log.info("%s: extracting from XML dump" % basefile)
     if basefile in basefiles:
         basefiles.remove(basefile)
Example #47
    def test_modify(self):
        path = self.repo.store.documententry_path("123/a")
        util.ensure_dir(path)
        with open(path, "w") as fp:
            fp.write(self.basic_json)

        d = DocumentEntry(path=path)
        d.orig_updated = datetime(2013, 3, 27, 20, 59, 42, 325067)
        d.id = "http://example.org/123/a"
        # do this in setUp?
        with open(self.datadir+"/xhtml","w") as f:
            f.write("<div>xhtml fragment</div>")

        d.set_content(self.datadir+"/xhtml", "http://example.org/test",
                      mimetype="xhtml", inline=True)
        d.save()
        self.assertEqual(self.d2u(util.readfile(path)), self.modified_json)
Example #48
def writegraph(graph, dest, operation="transformed"):
    util.ensure_dir(dest)
    if os.path.exists(dest):
        olddata = util.readfile(dest).split("\n\n", 1)[1]
    else:
        olddata = ""

    newdata = graph.serialize(format="turtle").decode("utf-8")
    if newdata != olddata:
        with open(dest, "w") as fp:
            header = "# Automatically %s from sources at %s\n\n" % (
                operation, datetime.now().isoformat())
            fp.write(header)
            fp.write(newdata)
            print("Wrote %s triples to %s" % (len(graph), dest))
    else:
        print("%s is unchanged" % dest)
Example #49
    def download(self, basefile=None, url=None):
        if basefile:
            if not url:
                entry = DocumentEntry(self.store.documententry_path(basefile))
                url = entry.orig_url
            if url:
                return self.download_single(basefile, url)
            else:
                raise DownloadError(
                    "%s doesn't support downloading single basefiles w/o page URL"
                    % self.__class__.__name__)
        params = {
            'filterType': 'Taxonomy',
            'filterByType': 'FilterablePageBase',
            'preFilteredCategories': '1324',
            'rootPageReference': '0',
            'filteredContentCategories': self.document_type
        }
        if 'lastdownload' in self.config and not self.config.refresh:
            params['fromDate'] = self.config.lastdownload.strftime("%Y-%m-%d")
        # temporary test -- useful when troubleshooting behaviour related to malformed entries in the search result list
        # params['fromDate'] = "2009-05-13"
        # params['toDate']   = "2009-05-20"

        self.log.debug("Loading documents starting from %s" %
                       params.get('fromDate', "the beginning"))
        try:
            for basefile, url in self.download_get_basefiles(params):
                try:
                    # sleep(0.5)  # regeringen.se has a tendency to throw 400 errors, maybe because we're too quick?
                    self.download_single(basefile, url)
                except requests.exceptions.HTTPError as e:
                    if self.download_accept_404 and e.response.status_code == 404:
                        self.log.error("%s: %s %s" % (basefile, url, e))
                        ret = False
                    else:
                        raise e
        finally:
            urlmap_path = self.store.path("urls",
                                          "downloaded",
                                          ".map",
                                          storage_policy="file")
            util.ensure_dir(urlmap_path)
            with codecs.open(urlmap_path, "w", encoding="utf-8") as fp:
                for url, identifier in self.urlmap.items():
                    fp.write("%s\t%s\n" % (url, identifier))
Example #51
    def word_to_ooxml(self, indoc, outdoc):
        """Extracts the raw OOXML file from a modern Word document (.docx)."""
        name = "word/document.xml"
        # use a context manager so the zip handle isn't leaked
        with zipfile.ZipFile(indoc, "r") as zipf:
            assert name in zipf.namelist(), "No %s in zipfile %s" % (name, indoc)
            data = zipf.read(name)
            zi = zipf.getinfo(name)
        util.ensure_dir(outdoc)
        with open(outdoc, "wb") as fp:
            fp.write(data)

        # FIXME: We need to reimplement this old function (which ran
        # tidy on the outfile) with an internal lxml based thingy
        # util.indent_xml_file(outdoc)
        # give the extracted file the same mtime as its zip entry
        dt = datetime(*zi.date_time)
        ts = mktime(dt.timetuple())
        os.utime(outdoc, (ts, ts))
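
Copying the zip entry's timestamp onto the extracted file means later staleness checks compare the document's own date rather than the extraction time. A usage sketch (the repo object and both paths are illustrative placeholders):

repo.word_to_ooxml("downloaded/123/a.docx", "intermediate/123/a.xml")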
Example #52
 def add_downloaded_files(filelist, spec, url):
     downloaddir = os.sep.join(
         [self.datadir, self.repoclass.alias, "downloaded"])
     for f in list(util.list_dirs(downloaddir)):
         if f.endswith(".etag"):
             continue  # FIXME: this is ugly
         if f not in filelist:
             # print("Fetching %s resulted in downloaded file %s" % (url, f))
             filelist.append(f)
             expect = "downloaded" + f.replace(downloaddir, "")
             if os.sep != "/":
                 expect = expect.replace(os.sep, "/")
             spec[url]['expect'] = expect
             reldest = os.path.relpath(".." + os.sep + "downloaded",
                                       os.path.dirname(f))
             dest = os.path.normpath(
                 os.path.join(os.path.dirname(specfile), reldest))
             util.ensure_dir(dest)
             shutil.copy2(f, dest)
Example #53
 def download(self):
     # do something with static/sitenews.txt --> split into
     # <datadir>/sitenews/<timestamp>.txt
     ofp = None
     with codecs.open(self.resourceloader.filename(self.config.newsfile),
                      encoding="utf-8") as fp:
         for line in fp:
             m = self.re_news_subjectline(line)
             if m:
                 if ofp:
                     ofp.close()
                 d = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
                 basefile = str(int(d.timestamp()))
                 path = self.store.downloaded_path(basefile)
                 self.log.info("%s: creating news item" % basefile)
                 util.ensure_dir(path)
                 ofp = codecs.open(path, "w", encoding="utf-8")
             ofp.write(line)
         ofp.close()
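
This version assumes the news file opens with a subject line, so ofp is bound before the first write. It also relies on re_news_subjectline capturing a timestamp that strptime can parse with "%Y-%m-%d %H:%M:%S"; a plausible definition, inferred from that format string rather than taken from the source:

import re

# Inferred sketch: a line carrying "(YYYY-MM-DD HH:MM:SS)" starts a new
# news item; group(1) is what datetime.strptime parses above.
re_news_subjectline = re.compile(
    r"\((\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\)\s*$").search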
Example #54
    def put_files_in_place(self):
        self.repos = []
        for repoclass in DocRepo1, DocRepo2, DocRepo3:
            repo = repoclass(datadir=self.datadir,
                             storetype=self.storetype,
                             storelocation=self.storelocation,
                             storerepository=self.storerepository,
                             indextype=self.indextype,
                             indexlocation=self.indexlocation)
            self.repos.append(repo)

        # NOTE: calling repo.relate(basefile, self.repos) will reorder
        # self.repos in MRU order. This is for efficiency, but might
        # cause a change in the list we iterate over. So by wrapping
        # in list(), we create a temporary list that won't be
        # reordered.
        for repo in list(self.repos):
            for basefile in "a", "b", "c", "d":
                util.ensure_dir(repo.store.parsed_path(basefile))
                # Put files in place: parsed
                parsed_path = "test/files/testrepos/%s/parsed/%s.xhtml" % (repo.alias, basefile)
                shutil.copy2(parsed_path, repo.store.parsed_path(basefile))

                # FIXME: This distilling code is copied from
                # decorators.render -- should perhaps move to a
                # DocumentRepository method like render_xhtml
                distilled_graph = Graph()
                with codecs.open(repo.store.parsed_path(basefile),
                                 encoding="utf-8") as fp:  # unicode
                    distilled_graph.parse(data=fp.read(), format="rdfa",
                                          publicID=repo.canonical_uri(basefile))
                distilled_graph.bind("dc", URIRef("http://purl.org/dc/elements/1.1/"))
                distilled_graph.bind("dcterms", URIRef("http://example.org/this-prefix-should-not-be-used"))
                util.ensure_dir(repo.store.distilled_path(basefile))
                with open(repo.store.distilled_path(basefile),
                          "wb") as distilled_file:
                    distilled_graph.serialize(distilled_file, format="pretty-xml")
                    # print("#======= %s/%s ========" % (repo.alias, basefile))
                    # print(distilled_graph.serialize(format="turtle").decode())
                # finally index all the data into the triplestore/fulltextindex
                repo.relate(basefile, self.repos)