def parse(self, doc):
    head, body = util.readfile(
        self.store.downloaded_path(doc.basefile)).split("\n\n", 1)
    datestr, timestr, title = head.split(" ", 2)
    published = datetime.strptime("%s %s" % (datestr, timestr),
                                  "%Y-%m-%d %H:%M:%S")
    # move timestamp into dcterms:issued, title into dcterms:title
    doc.meta.add((URIRef(doc.uri), RDF.type, self.rdf_type))
    doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(published)))
    doc.meta.add((URIRef(doc.uri), DCTERMS.title,
                  Literal(title, lang=doc.lang)))
    # parse body with elements_from_soup
    soup = bs4.BeautifulSoup(
        "<div class='sitenews-item'>" + body + "</div>", "lxml")
    doc.body = elements_from_soup(soup.body)
    # set first real para as dcterms:abstract (XMLLiteral)
    doc.body[0][0] = Div([doc.body[0][0]],
                         datatype="rdf:XMLLiteral",
                         property="dcterms:abstract")
    # but we need to add it to doc.meta RIGHT AWAY because of reasons...
    doc.meta.add((URIRef(doc.uri), DCTERMS.abstract,
                  Literal(body.split("\n\n")[0],
                          datatype=RDF.XMLLiteral)))
    self.parse_entry_update(doc)  # need to set published and possibly updated
    entry = DocumentEntry(self.store.documententry_path(doc.basefile))
    entry.published = published
    entry.save()
    return True
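# For illustration, a minimal sketch of the source-file layout that parse()
# above assumes (inferred from the split/strptime calls; the title and body
# text are made up): a header line "YYYY-MM-DD HH:MM:SS <title>", a blank
# line, then HTML paragraphs separated by blank lines. The first paragraph
# doubles as the dcterms:abstract.
#
#     2013-06-27 12:00:00 New feature: faceted browsing
#
#     <p>This first paragraph becomes the dcterms:abstract.</p>
#
#     <p>Any following paragraphs are only part of the body.</p>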
def test_incomplete_entries(self):
    self.repo.faceted_data = Mock(return_value=self.faceted_data)
    # make our entries incomplete in various ways
    entry = DocumentEntry(self.repo.store.documententry_path("1"))
    entry.published = None
    entry.save()

    # try very hard to remove title from everywhere
    entry = DocumentEntry(self.repo.store.documententry_path("2"))
    del entry.title
    entry.save()
    g = rdflib.Graph().parse(self.repo.store.distilled_path("2"))
    g.remove((rdflib.URIRef("http://localhost:8000/res/base/2"),
              self.repo.ns['dcterms'].title,
              rdflib.Literal("Doc #2")))
    with open(self.repo.store.distilled_path("2"), "wb") as fp:
        g.serialize(fp, format="pretty-xml")

    os.unlink(self.repo.store.distilled_path("3"))

    # entries w/o published date and w/o distilled file should not
    # be published, but w/o title is OK
    with silence():
        # avoid warnings about stale entry files, since the downloaded
        # and intermediate files are missing here (they would exist in
        # a real scenario)
        self.assertEqual(len(list(self.repo.news_entries())), 23)

    # also make sure that corresponding faceted_entries do not
    # show these non-published entries
    self.assertEqual(len(self.repo.news_facet_entries()), 23)
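# Why 23: setUp below creates 25 entries. Entry "1" (no published date) and
# entry "3" (no distilled file) are excluded from the feed, while entry "2"
# (no title) still counts as published -- hence 25 - 2 = 23 in both asserts.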
def setUp(self):
    super(News, self).setUp()
    self.faceted_data = []
    # create a bunch of DocumentEntry objects and save them
    basetime = datetime(2013, 1, 1, 12, 0)
    for basefile in range(25):
        v = {'id': self.repo.canonical_uri(basefile),
             'title': "Doc #%s" % basefile}
        self.faceted_data.append(
            {'uri': v['id'],
             'dcterms_title': v['title'],
             'rdf_type': 'http://xmlns.com/foaf/0.1/Document'})
        de = DocumentEntry()
        de.orig_created = basetime + timedelta(hours=basefile)
        de.orig_updated = basetime + timedelta(hours=basefile, minutes=10)
        de.orig_checked = basetime + timedelta(hours=basefile, minutes=20)
        de.published = basetime + timedelta(hours=basefile, minutes=30)
        de.updated = basetime + timedelta(hours=basefile, minutes=40)
        de.orig_url = "http://source.example.org/doc/%s" % basefile
        de.title = v['title']
        de.save(self.repo.store.documententry_path(str(basefile)))

        g = rdflib.Graph()
        desc = Describer(g, self.repo.canonical_uri(basefile))
        dcterms = self.repo.ns['dcterms']
        desc.rdftype(self.repo.ns['foaf'].Document)
        desc.value(dcterms.title, "Invalid title")
        util.ensure_dir(self.repo.store.distilled_path(str(basefile)))
        with open(self.repo.store.distilled_path(str(basefile)),
                  "wb") as fp:
            g.serialize(fp, format="pretty-xml")

        util.ensure_dir(self.repo.store.parsed_path(str(basefile)))
        with open(self.repo.store.parsed_path(str(basefile)), "w") as fp:
            fp.write("""<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN" "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
      xmlns:dcterms="http://purl.org/dc/terms/"
      xml:lang="en">
  <head about="%(id)s">
    <title>%(title)s</title>
  </head>
  <body about="%(id)s">
    <h1>%(title)s</h1>
  </body>
</html>""" % v)

        util.ensure_dir(self.repo.store.generated_path(str(basefile)))
        with open(self.repo.store.generated_path(str(basefile)),
                  "w") as fp:
            fp.write("""<!DOCTYPE html>
<html>
  <head>
    <title>%(title)s</title>
  </head>
  <body>
    <h1>%(title)s</h1>
  </body>
</html>""" % v)
def importarchive(self, archivedir):
    """Import downloaded data from an archive of legacy lagen.nu data.

    In particular, creates proper archive storage for older
    versions of each text.
    """
    current = archived = 0
    for f in util.list_dirs(archivedir, ".html"):
        if not f.startswith("downloaded/sfs"):  # sfst or sfsr
            continue
        for regex in self.templ:
            m = re.match(regex, f)
            if not m:
                continue
            if "vcheck" in m.groupdict():  # silently ignore
                break
            basefile = "%s:%s" % (m.group("byear"), m.group("bnum"))

            # need to look at the file to find out its version
            # (read in binary mode so that .decode() works on py3)
            # text = t.extractfile(f).read(4000).decode("latin-1")
            text = open(f, "rb").read(4000).decode("latin-1")
            reader = TextReader(string=text)
            updated_to = self._find_uppdaterad_tom(basefile,
                                                   reader=reader)

            if "vyear" in m.groupdict():  # this file is marked as
                                          # an archival version
                archived += 1
                version = updated_to
                if m.group("vyear") == "first":
                    pass
                else:
                    exp = "%s:%s" % (m.group("vyear"), m.group("vnum"))
                    if version != exp:
                        self.log.warning("%s: Expected %s, found %s" %
                                         (f, exp, version))
            else:
                version = None
                current += 1
                de = DocumentEntry()
                de.basefile = basefile
                de.id = self.canonical_uri(basefile, updated_to)
                # fudge timestamps best as we can
                de.orig_created = datetime.fromtimestamp(
                    os.path.getctime(f))
                de.orig_updated = datetime.fromtimestamp(
                    os.path.getmtime(f))
                de.orig_checked = datetime.now()
                de.orig_url = self.document_url_template % locals()
                de.published = datetime.now()
                de.url = self.generated_url(basefile)
                de.title = "SFS %s" % basefile
                # de.set_content()
                # de.set_link()
                de.save(self.store.documententry_path(basefile))

            # this yields more reasonable basefiles, but they are not
            # backwards compatible -- skip them for now
            # basefile = basefile.replace("_", "").replace(".", "")
            if "type" in m.groupdict() and m.group("type") == "sfsr":
                dest = self.store.register_path(basefile)
                current -= 1  # to offset the previous increment
            else:
                dest = self.store.downloaded_path(basefile, version)

            self.log.debug("%s: extracting %s to %s" %
                           (basefile, f, dest))
            util.ensure_dir(dest)
            shutil.copy2(f, dest)
            break
        else:
            self.log.warning("Couldn't process %s" % f)
    self.log.info("Extracted %s current versions and %s archived "
                  "versions" % (current, archived))
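# A hypothetical invocation sketch (the repo class name and directory paths
# below are assumptions, not taken from the code above): point
# importarchive() at the root of an unpacked legacy archive, and current and
# archived versions are sorted into the repo's own document storage.
#
#     repo = SFS(datadir="data")
#     repo.importarchive("legacy/downloaded")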