Example #1
from rdflib import Graph
from ferenda import util
from ferenda.documentstore import _open

def ttl_to_rdf_xml(self, inpath, outpath, store=None):
    if not store:
        store = self.repo.store
    # Parse the Turtle source into an in-memory graph ...
    g = Graph()
    g.parse(data=util.readfile(inpath, encoding="utf-8"), format="turtle")
    # ... and write it back out as pretty-printed RDF/XML.
    with _open(outpath, "wb") as fp:
        fp.write(g.serialize(format="pretty-xml"))
    return g
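
For context, a minimal usage sketch of the helper above, under a couple of assumptions: `helper` stands in for an instance of whatever class defines ttl_to_rdf_xml (something exposing a .repo attribute), and the file names are placeholders. One version note: on rdflib 6 and later, serialize() returns a str by default, so the snippet above would need either a text-mode file or serialize(..., encoding="utf-8") to keep writing bytes.

# Hypothetical usage; `helper` and both paths are stand-ins, not part of
# the original code.
graph = helper.ttl_to_rdf_xml("data/doc.ttl", "data/doc.rdf")
print(len(graph))  # number of triples that were round-tripped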
Example #2
import os

import requests
from lxml import etree

from ferenda import util
from ferenda.errors import ConfigurationError  # assumed import path for ConfigurationError

def download(self, basefile=None):

    def write_doc(basefile, page_el):
        # Serialize the <page> element and write it to its downloaded
        # path, but only if the file is missing or its content changed.
        writefile = False
        p = self.store.downloaded_path(basefile)
        newcontent = etree.tostring(page_el, encoding="utf-8")
        if not os.path.exists(p):
            writefile = True
        else:
            oldcontent = util.readfile(p, "rb")
            if newcontent != oldcontent:
                writefile = True
        if writefile:
            util.ensure_dir(p)
            with open(p, "wb") as fp:
                fp.write(newcontent)
                self.log.info("%s: extracting from XML dump" % basefile)
        if basefile in basefiles:
            del basefiles[basefiles.index(basefile)]

    if basefile:
        return self.download_single(basefile)
    if self.config.mediawikidump:
        # Fetch the complete MediaWiki XML dump and store it locally
        # before parsing it.
        xmldumppath = self.store.path('dump', 'downloaded', '.xml')
        resp = requests.get(self.config.mediawikidump)
        self.log.info("Loaded XML dump from %s" % self.config.mediawikidump)
        from ferenda.documentstore import _open
        with _open(xmldumppath, mode="wb") as fp:
            fp.write(resp.content)
        xml = etree.parse(xmldumppath)
    else:
        raise ConfigurationError("config.mediawikidump not set")

    # The dump declares the wiki's namespaces in its siteinfo section;
    # collect their names so that namespaced page titles can be recognized.
    MW_NS = "{%s}" % xml.getroot().nsmap[None]
    wikinamespaces = []
    for ns_el in xml.findall("//" + MW_NS + "namespace"):
        wikinamespaces.append(ns_el.text)

    # Get list of existing basefiles - if any of those
    # does not appear in the XML dump, remove them afterwards
    basefiles = list(self.store.list_basefiles_for("parse"))
    total = written = 0
    deferred = {}
    for page_el in xml.findall(MW_NS + "page"):
        basefile = page_el.find(MW_NS + "title").text
        if basefile == "Huvudsida":  # FIXME: generalize/make configurable
            continue
        if ":" in basefile and basefile.split(":")[0] in wikinamespaces:
            (namespace, localtitle) = basefile.split(":", 1)
            if namespace not in self.config.mediawikinamespaces:
                continue
            # defer writing of this one, so that it overwrites any
            # similarly named page from the main namespace. This is
            # so that Category pages about $TOPIC take precedence
            # over ordinary pages about $TOPIC
            deferred[localtitle] = page_el
        else:
            write_doc(basefile, page_el)
    for basefile, page_el in deferred.items():
        write_doc(basefile, page_el)

    if 'dump' in basefiles:  # never remove
        del basefiles[basefiles.index('dump')]
    for b in basefiles:
        self.log.info("%s: removing stale document" % b)
        util.robust_remove(self.store.downloaded_path(b))
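
This download only does anything when config.mediawikidump points at a MediaWiki XML export, and pages outside the main namespace are kept only if their namespace is listed in config.mediawikinamespaces. A minimal driver sketch under stated assumptions: WikiRepo is a stand-in name for the repository class that defines download(), and it is assumed (as with ferenda's DocumentRepository) that keyword arguments passed to the constructor end up as config values.

# Hypothetical driver; WikiRepo, the dump URL and the namespace list are
# placeholders, not part of the original code.
repo = WikiRepo(mediawikidump="https://example.org/dumps/wiki.xml",
                mediawikinamespaces=["Category"])
repo.download()              # mirror every page of the dump into the store
repo.download("Some page")   # or hand a single title to download_single()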