Example #1
    def parse(self, doc):
        head, body = util.readfile(self.store.downloaded_path(
            doc.basefile)).split("\n\n", 1)
        datestr, timestr, title = head.split(" ", 2)
        published = datetime.strptime("%s %s" % (datestr, timestr),
                                      "%Y-%m-%d %H:%M:%S")

        doc.meta.add((URIRef(doc.uri), RDF.type, self.rdf_type))
        doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(published)))
        doc.meta.add(
            (URIRef(doc.uri), DCTERMS.title, Literal(title, lang=doc.lang)))
        soup = bs4.BeautifulSoup(
            "<div class='sitenews-item'>" + body + "</div>", "lxml")
        doc.body = elements_from_soup(soup.body)
        # move timestamp into dcterms:issued, title into dcterms:title
        # parse body with elements_from_soup
        # set first real para as dcterms:abstract (XMLLiteral)
        doc.body[0][0] = Div([doc.body[0][0]],
                             datatype="rdf:XMLLiteral",
                             property="dcterms:abstract")

        # but we need to add it to doc.meta RIGHT AWAY because of reasons...
        doc.meta.add((URIRef(doc.uri), DCTERMS.abstract,
                      Literal(body.split("\n\n")[0], datatype=RDF.XMLLiteral)))
        # need to set published and possibly updated
        self.parse_entry_update(doc)
        entry = DocumentEntry(self.store.documententry_path(doc.basefile))
        entry.published = published
        entry.save()
        return True
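
For context, here is a minimal sketch of the downloaded file layout that this parse() expects, inferred from the splitting logic above (the sample content itself is made up): a head line of the form "YYYY-MM-DD HH:MM:SS Title", a blank line, and an HTML body whose first paragraph becomes the dcterms:abstract.

# Hypothetical input for the parse() method above; only the structure
# (head line, blank line, HTML body) is taken from the code.
sample = ("2013-06-27 12:00:00 New release published\n"
          "\n"
          "<p>First paragraph, which ends up as the dcterms:abstract.</p>\n"
          "\n"
          "<p>Remaining body text.</p>\n")
head, body = sample.split("\n\n", 1)
datestr, timestr, title = head.split(" ", 2)
# datestr == "2013-06-27", timestr == "12:00:00", title == "New release published"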
Example #2
    def test_incomplete_entries(self):
        self.repo.faceted_data = Mock(return_value=self.faceted_data)

        # make our entries incomplete in various ways
        entry = DocumentEntry(self.repo.store.documententry_path("1"))
        entry.published = None
        entry.save()

        # try very hard to remove title from everywhere
        entry = DocumentEntry(self.repo.store.documententry_path("2"))
        del entry.title
        entry.save()
        g = rdflib.Graph().parse(self.repo.store.distilled_path("2"))
        g.remove((rdflib.URIRef("http://localhost:8000/res/base/2"),
                  self.repo.ns['dcterms'].title, rdflib.Literal("Doc #2")))
        with open(self.repo.store.distilled_path("2"), "wb") as fp:
            g.serialize(fp, format="pretty-xml")

        os.unlink(self.repo.store.distilled_path("3"))

        # Entries without a published date or without a distilled file
        # should not be published, but a missing title is OK: of the 25
        # entries created in setUp, entry "1" (no published date) and
        # entry "3" (no distilled file) are excluded, leaving 23.
        with silence():  # avoid warnings about stale entry files, since
                         # the downloaded and intermediate files (which
                         # would exist in a real scenario) are missing
            self.assertEqual(len(list(self.repo.news_entries())), 23)

            # also make sure that corresponding faceted_entries do not
            # show these non-published entries
            self.assertEqual(len(self.repo.news_facet_entries()), 23)
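
The silence() helper used above is not shown on this page; a minimal sketch of such a context manager (an assumption, the real test helper may well differ) would temporarily raise the logging threshold so the expected warnings about stale entry files do not clutter the test output:

import contextlib
import logging

@contextlib.contextmanager
def silence():
    # Hypothetical stand-in for the helper used in the test above:
    # suppress log records at WARNING level and below inside the block.
    logging.disable(logging.WARNING)
    try:
        yield
    finally:
        logging.disable(logging.NOTSET)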
Example #3
    def setUp(self):
        super(News, self).setUp()
        self.faceted_data = []
        # create a bunch of DocumentEntry objects and save them
        basetime = datetime(2013, 1, 1, 12, 0)
        for basefile in range(25):
            v = {'id':self.repo.canonical_uri(basefile),
                 'title':"Doc #%s" % basefile}
            self.faceted_data.append({'uri': v['id'],
                                      'dcterms_title': v['title'],
                                      'rdf_type': 'http://xmlns.com/foaf/0.1/Document'})
            de = DocumentEntry()
            de.orig_created = basetime + timedelta(hours=basefile)
            de.orig_updated = basetime + timedelta(hours=basefile, minutes=10)
            de.orig_checked = basetime + timedelta(hours=basefile, minutes=20)
            de.published    = basetime + timedelta(hours=basefile, minutes=30)
            de.updated      = basetime + timedelta(hours=basefile, minutes=40)
            de.orig_url     = "http://source.example.org/doc/%s" % basefile
            de.title        = v['title']
            de.save(self.repo.store.documententry_path(str(basefile)))

            g = rdflib.Graph()
            desc = Describer(g, self.repo.canonical_uri(basefile))
            dcterms = self.repo.ns['dcterms']
            desc.rdftype(self.repo.ns['foaf'].Document)
            desc.value(dcterms.title, "Invalid title")
            util.ensure_dir(self.repo.store.distilled_path(str(basefile)))
            with open(self.repo.store.distilled_path(str(basefile)), "wb") as fp:
                g.serialize(fp, format="pretty-xml")

            util.ensure_dir(self.repo.store.parsed_path(str(basefile)))
            with open(self.repo.store.parsed_path(str(basefile)), "w") as fp:
                fp.write("""<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN" "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:dcterms="http://purl.org/dc/terms/" xml:lang="en">
  <head about="%(id)s">
    <title>%(title)s</title>
  </head>
  <body about="%(id)s">
    <h1>%(title)s</h1>
  </body>
</html>""" % v)

            util.ensure_dir(self.repo.store.generated_path(str(basefile)))
            with open(self.repo.store.generated_path(str(basefile)), "w") as fp:
                fp.write("""<!DOCTYPE html>
<html>
  <head>
    <title>%(title)s</title>
  </head>
  <body>
    <h1>%(title)s</h1>
  </body>
</html>""" % v)
Example #4
    def importarchive(self, archivedir):
        """Imports downloaded data from an archive from legacy lagen.nu data.

        In particular, creates proper archive storage for older
        versions of each text.

        """
        current = archived = 0
        for f in util.list_dirs(archivedir, ".html"):
            if not f.startswith("downloaded/sfs"):  # sfst or sfsr
                continue
            for regex in self.templ:
                m = re.match(regex, f)
                if not m:
                    continue
                if "vcheck" in m.groupdict():  # silently ignore
                    break
                basefile = "%s:%s" % (m.group("byear"), m.group("bnum"))

                # need to look at the file to find out its version
                # text = t.extractfile(f).read(4000).decode("latin-1")
                text = open(f, "rb").read(4000).decode("latin-1")
                reader = TextReader(string=text)
                updated_to = self._find_uppdaterad_tom(basefile, reader=reader)

                if "vyear" in m.groupdict():  # this file is marked as
                    # an archival version
                    archived += 1
                    version = updated_to

                    if m.group("vyear") == "first":
                        pass
                    else:
                        exp = "%s:%s" % (m.group("vyear"), m.group("vnum"))
                        if version != exp:
                            self.log.warning("%s: Expected %s, found %s" %
                                             (f, exp, version))
                else:
                    version = None
                    current += 1
                    de = DocumentEntry()
                    de.basefile = basefile
                    de.id = self.canonical_uri(basefile, updated_to)
                    # fudge timestamps best as we can
                    de.orig_created = datetime.fromtimestamp(
                        os.path.getctime(f))
                    de.orig_updated = datetime.fromtimestamp(
                        os.path.getmtime(f))
                    de.orig_checked = datetime.now()
                    de.orig_url = self.document_url_template % locals()
                    de.published = datetime.now()
                    de.url = self.generated_url(basefile)
                    de.title = "SFS %s" % basefile
                    # de.set_content()
                    # de.set_link()
                    de.save(self.store.documententry_path(basefile))
                # this yields more reasonable basefiles, but they are not
                # backwards compatible -- skip them for now
                # basefile = basefile.replace("_", "").replace(".", "")
                if "type" in m.groupdict() and m.group("type") == "sfsr":
                    dest = self.store.register_path(basefile)
                    current -= 1  # to offset the previous increment
                else:
                    dest = self.store.downloaded_path(basefile, version)
                self.log.debug("%s: extracting %s to %s" % (basefile, f, dest))
                util.ensure_dir(dest)
                shutil.copy2(f, dest)
                break
            else:
                self.log.warning("Couldn't process %s" % f)
        self.log.info(
            "Extracted %s current versions and %s archived versions" %
            (current, archived))
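
A minimal usage sketch (the repository class name SFS and the paths here are assumptions for illustration; the method is defined on a ferenda document repository like the ones above):

# Hypothetical invocation of importarchive(); class name and paths are
# placeholders, not taken from this page.
repo = SFS(datadir="data")
repo.importarchive("/path/to/legacy/lagen.nu/downloaded-archive")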