def setUp(self):
    """Populate the test repository with 25 synthetic documents.

    For each basefile 0..24 this saves a document entry and writes a
    distilled RDF file, a parsed XHTML file and a generated HTML file
    through ``self.repo.store``. A matching row is appended to
    ``self.faceted_data`` for every document.
    """
    super(News, self).setUp()
    self.faceted_data = []

    # Timestamp attributes of the document entry, in the order they are
    # assigned; each one lies ten minutes after the previous one.
    stamp_attrs = ("orig_created", "orig_updated", "orig_checked",
                   "published", "updated")
    parsed_template = """<?xml version='1.0' encoding='utf-8'?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN" "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xmlns:dcterms="http://purl.org/dc/terms/" xml:lang="en"> <head about="%(id)s"> <title>%(title)s</title> </head> <body about="%(id)s"> <h1>%(title)s</h1> </body> </html>"""
    generated_template = """<!DOCTYPE html> <html> <head> <title>%(title)s</title> </head> <body> <h1>%(title)s</h1> </body> </html>"""

    store = self.repo.store
    first_stamp = datetime(2013, 1, 1, 12, 0)
    for basefile in range(25):
        key = str(basefile)
        fields = {'id': self.repo.canonical_uri(basefile),
                  'title': "Doc #%s" % basefile}
        self.faceted_data.append({
            'uri': fields['id'],
            'dcterms_title': fields['title'],
            'rdf_type': 'http://xmlns.com/foaf/0.1/Document',
        })

        # Document entry: one hour between documents, ten minutes
        # between the individual timestamps of a document.
        entry = DocumentEntry()
        hour_stamp = first_stamp + timedelta(hours=basefile)
        for step, attr in enumerate(stamp_attrs):
            setattr(entry, attr, hour_stamp + timedelta(minutes=10 * step))
        entry.orig_url = "http://source.example.org/doc/%s" % basefile
        entry.title = fields['title']
        entry.save(store.documententry_path(key))

        # Distilled RDF. Note that the title stored here ("Invalid
        # title") differs from the title of the document entry.
        graph = rdflib.Graph()
        describer = Describer(graph, self.repo.canonical_uri(basefile))
        describer.rdftype(self.repo.ns['foaf'].Document)
        describer.value(self.repo.ns['dcterms'].title, "Invalid title")
        distilled = store.distilled_path(key)
        util.ensure_dir(distilled)
        with open(distilled, "wb") as fp:
            graph.serialize(fp, format="pretty-xml")

        # Parsed XHTML+RDFa version of the document.
        parsed = store.parsed_path(key)
        util.ensure_dir(parsed)
        with open(parsed, "w") as fp:
            fp.write(parsed_template % fields)

        # Generated (browser-ready) HTML version of the document.
        generated = store.generated_path(key)
        util.ensure_dir(generated)
        with open(generated, "w") as fp:
            fp.write(generated_template % fields)
def importarchive(self, archivedir):
    """Imports downloaded data from an archive from legacy lagen.nu data.

    In particular, creates proper archive storage for older versions of
    each text.

    Every ``.html`` file below ``archivedir`` whose path starts with
    ``downloaded/sfs`` is matched against the patterns in ``self.templ``.
    The first matching pattern decides the basefile and whether the file
    is a current version, an archived version or a register file, and
    the file is then copied to the corresponding path in ``self.store``.
    Files that match no pattern are reported with a warning.

    :param archivedir: Directory containing the legacy archive.
    """
    current = archived = 0
    for f in util.list_dirs(archivedir, ".html"):
        if not f.startswith("downloaded/sfs"):  # sfst or sfsr
            continue
        for regex in self.templ:
            m = re.match(regex, f)
            if not m:
                continue
            if "vcheck" in m.groupdict():  # silently ignore
                break
            basefile = "%s:%s" % (m.group("byear"), m.group("bnum"))

            # We need to look at the file to find out its version. Read
            # raw bytes and decode explicitly, so this works the same on
            # Python 2 and 3, and close the file handle right away.
            with open(f, "rb") as fp:
                text = fp.read(4000).decode("latin-1")
            reader = TextReader(string=text)
            updated_to = self._find_uppdaterad_tom(basefile, reader=reader)

            if "vyear" in m.groupdict():
                # this file is marked as an archival version
                archived += 1
                version = updated_to
                if m.group("vyear") != "first":
                    exp = "%s:%s" % (m.group("vyear"), m.group("vnum"))
                    if version != exp:
                        self.log.warning("%s: Expected %s, found %s" %
                                         (f, exp, version))
            else:
                version = None
                current += 1
                # Only the current version gets a document entry: the
                # entry path is derived from the basefile alone, without
                # a version.
                de = DocumentEntry()
                de.basefile = basefile
                de.id = self.canonical_uri(basefile, updated_to)
                # fudge timestamps best as we can
                de.orig_created = datetime.fromtimestamp(
                    os.path.getctime(f))
                de.orig_updated = datetime.fromtimestamp(
                    os.path.getmtime(f))
                # "now" is when we last looked at the source. Storing it
                # as orig_checked keeps the file's mtime in orig_updated
                # instead of overwriting it.
                de.orig_checked = datetime.now()
                # The template is filled in from locals(), so the local
                # variable names in this method (e.g. basefile) are part
                # of its contract -- do not rename them.
                de.orig_url = self.document_url_template % locals()
                de.published = datetime.now()
                de.url = self.generated_url(basefile)
                de.title = "SFS %s" % basefile
                # de.set_content()
                # de.set_link()
                de.save(self.store.documententry_path(basefile))

            # this yields more reasonable basefiles, but they are not
            # backwards compatible -- skip them for now
            # basefile = basefile.replace("_", "").replace(".", "")
            if "type" in m.groupdict() and m.group("type") == "sfsr":
                dest = self.store.register_path(basefile)
                current -= 1  # to offset the previous increment
            else:
                dest = self.store.downloaded_path(basefile, version)
            self.log.debug("%s: extracting %s to %s" % (basefile, f, dest))
            util.ensure_dir(dest)
            shutil.copy2(f, dest)
            break
        else:
            # no pattern in self.templ matched this file
            self.log.warning("Couldn't process %s" % f)
    self.log.info(
        "Extracted %s current versions and %s archived versions" %
        (current, archived))
def test_save(self):
    """Saving an entry whose title is a file-like object raises TypeError."""
    entry = DocumentEntry()
    entry.title = StringIO("A file-like object, not a string")
    target = self.repo.store.documententry_path("123/x")
    self.assertRaises(TypeError, entry.save, path=target)