def render_template(self, jinja_template, page_title, **context):
    """Render *jinja_template* (a Jinja template fragment) inside a minimal
    XHTML shell, run the result through the generic XSLT transform, and
    return it as a text/html Response.

    :param jinja_template: the template fragment to render
    :param page_title: value for the page's <title> element
    :param context: variables made available to the Jinja template
    """
    from ferenda import DocumentRepository
    repo = DocumentRepository(config=self.repo.config)
    # wrap the fragment in a complete XHTML document before rendering
    jinja_template = """
    <html xmlns="http://www.w3.org/1999/xhtml">
      <head><title>%(page_title)s</title></head>
      <body>
        <div>
          %(jinja_template)s
        </div>
      </body>
    </html>
    """ % {"page_title": page_title,
           "jinja_template": jinja_template}
    template = Template(jinja_template, autoescape=True)
    rendered = template.render(context).encode("utf-8")
    try:
        xhtml = etree.parse(BytesIO(rendered))
    except XMLSyntaxError as e:
        raise ValueError("invalid xhtml from template: %s\n%s" %
                         (e, rendered.decode("utf-8")))
    conffile = os.sep.join([repo.config.datadir, 'rsrc', 'resources.xml'])
    transformer = Transformer('XSLT', "xsl/generic.xsl", "xsl",
                              resourceloader=repo.resourceloader,
                              config=conffile)
    urltransform = None
    if 'develurl' in repo.config and repo.config.develurl:
        urltransform = repo.get_url_transform_func(
            develurl=repo.config.develurl)
    depth = 2  # len(doc.uri.split("/")) - 3
    tree = transformer.transform(xhtml, depth, uritransform=urltransform)
    data = etree.tostring(tree, encoding="utf-8")
    return Response(data, mimetype="text/html")
def test_more(self):
    """toc_select_for_pages should bucket rows into the pages computed
    by toc_pagesets, one Link per matching row."""
    from ferenda import DocumentRepository
    from rdflib.namespace import DCTERMS
    repo = DocumentRepository()
    rows = [
        {'uri': 'http://ex.org/1',
         'dcterms_title': 'Abc',
         'dcterms_issued': '2009-04-02'},
        {'uri': 'http://ex.org/2',
         'dcterms_title': 'Abcd',
         'dcterms_issued': '2010-06-30'},
        {'uri': 'http://ex.org/3',
         'dcterms_title': 'Dfg',
         'dcterms_issued': '2010-08-01'},
    ]
    facets = [Facet(DCTERMS.title), Facet(DCTERMS.issued)]
    pagesets = repo.toc_pagesets(rows, facets)
    expected = {
        # title facet buckets by first letter
        ('dcterms_title', 'a'): [[Link('Abc', uri='http://ex.org/1')],
                                 [Link('Abcd', uri='http://ex.org/2')]],
        ('dcterms_title', 'd'): [[Link('Dfg', uri='http://ex.org/3')]],
        # issued facet buckets by year
        ('dcterms_issued', '2009'): [[Link('Abc', uri='http://ex.org/1')]],
        ('dcterms_issued', '2010'): [[Link('Abcd', uri='http://ex.org/2')],
                                     [Link('Dfg', uri='http://ex.org/3')]],
    }
    got = repo.toc_select_for_pages(rows, pagesets, facets)
    self.assertEqual(expected, got)
def test_facet_query(self):
    """facet_query should select exactly the rows belonging to the
    named context."""
    import rdflib
    from ferenda import DocumentRepository
    results1 = json.load(open("test/files/datasets/results1.json"))
    results2 = json.load(open("test/files/datasets/results2.json"))
    # load two datasets into two distinct named contexts
    for ttl, ctx in (("test/files/datasets/books.ttl",
                      "http://example.org/ctx/base"),
                     ("test/files/datasets/articles.ttl",
                      "http://example.org/ctx/other")):
        self.loader.add_serialized(util.readfile(ttl),
                                   format="turtle",
                                   context=ctx)
    # Since the query is partially constructed by DocumentRepository, we
    # need to run that code.
    repo = DocumentRepository()
    repo.config.storetype = self.storetype
    repo.rdf_type = rdflib.URIRef("http://purl.org/ontology/bibo/Book")
    # test 1 and test 2: one context at a time
    cases = (("http://example.org/ctx/base", results1),
             ("http://example.org/ctx/other", results2))
    for ctx, want in cases:
        sq = repo.facet_query(ctx)
        got = self.store.select(sq, format="python")
        self.assertEqual(len(got), len(want))
        for row in want:
            self.assertIn(row, got)
    if self.storetype == "SLEEPYCAT":
        self.store.graph.close()
def test_create(self):
    """Beyond the generic checks, verify the on-disk whoosh index and
    its schema, then re-connect to the existing index."""
    # First do the basic tests
    super(WhooshBasicIndex, self).test_create()
    # then do more low-level tests
    # 1 assert that some files have been created at the specified location
    self.assertNotEqual(os.listdir(self.location), [])
    # 2 assert that it's really a whoosh index
    self.assertTrue(whoosh.index.exists_in(self.location))
    # 3. assert that the actual schema with whoosh types is, in
    # fact, correct
    fields = {
        'basefile': whoosh.fields.ID(stored=True),
        'dcterms_identifier': whoosh.fields.ID(field_boost=16, stored=True),
        'dcterms_issued': whoosh.fields.DATETIME(stored=True),
        'dcterms_publisher': whoosh.fields.IDLIST(stored=True),
        'dcterms_title': whoosh.fields.TEXT(field_boost=4, stored=True),
        # corresponds to URI not Label
        'rdf_type': whoosh.fields.ID(stored=True, field_boost=1.1),
        'repo': whoosh.fields.ID(stored=True),
        'text': whoosh.fields.TEXT(stored=True),
        'uri': whoosh.fields.ID(unique=True, stored=True),
    }
    want = whoosh.fields.Schema(**fields)
    got = self.index.index.schema
    self.assertEqual(sorted(want.names()), sorted(got.names()))
    for fieldname in got.names():
        self.assertEqual((fieldname, want[fieldname]),
                         (fieldname, got[fieldname]))
    # finally, try to create again (opening an existing index
    # instead of creating)
    # need mock docrepo
    self.index = FulltextIndex.connect("WHOOSH", self.location,
                                       [DocumentRepository()])
def __init__(self, repos, inifile=None, **kwargs):
    """Set up the WSGI app: a shared ResourceLoader over all repos' load
    paths plus a layered configuration (defaults < inifile < kwargs)."""
    self.repos = repos
    self.log = logging.getLogger("wsgi")
    # FIXME: Cut-n-paste of the method in Resources.__init__
    loadpath = ["."]  # cwd always has priority -- makes sense?
    for repo in repos:
        for candidate in ResourceLoader.make_loadpath(repo):
            if candidate not in loadpath:
                loadpath.append(candidate)
    self.resourceloader = ResourceLoader(*loadpath)
    # FIXME: need to specify documentroot?
    defaults = DocumentRepository.get_default_options()
    if inifile:
        assert os.path.exists(inifile), \
            "INI file %s doesn't exist (relative to %s)" % (inifile,
                                                            os.getcwd())
    # NB: If both inifile and kwargs are specified, the latter
    # will take precedence. I think this is the expected
    # behaviour.
    self.config = LayeredConfig(Defaults(defaults),
                                INIFile(inifile),
                                Defaults(kwargs),
                                cascade=True)
def __init__(self, repos, resourcedir, **kwargs):
    # FIXME: document what kwargs could be (particularly 'combineresources')
    self.repos = repos
    self.resourcedir = resourcedir
    # configuration layers: global manager defaults, then docrepo
    # defaults, then caller-supplied kwargs
    from ferenda.manager import DEFAULT_CONFIG
    defaults = dict(DEFAULT_CONFIG)
    defaults.update(DocumentRepository.get_default_options())
    defaults.update(kwargs)
    self.config = LayeredConfig(Defaults(defaults))
    # the below call to setup_logger alters the logging level of
    # the root logger, which can't be good practice. Also, we
    # should probably not log to the root logger, but rather to
    # ferenda.resources.
    #
    # from ferenda.manager import setup_logger
    # self.log = setup_logger()
    self.log = logging.getLogger("ferenda.resources")
    # FIXME: How should we set up a global loadpath from the
    # individual repos?
    loadpath = ["."]  # cwd always has priority -- makes sense?
    for repo in repos:
        for candidate in ResourceLoader.make_loadpath(repo):
            if candidate not in loadpath:
                loadpath.append(candidate)
    self.resourceloader = ResourceLoader(*loadpath)
def test_staticsite(self):
    # test4: Make sure staticsite works (ie no search form in resources.xml):
    got = Resources([DocumentRepository()],
                    self.tempdir + os.sep + 'rsrc',
                    staticsite=True).make()
    tree = ET.parse(self.tempdir + os.sep + got['xml'][0])
    # find() yields None when no <search> element exists; a childless
    # Element is also falsy, so assertFalse covers both cases
    self.assertFalse(tree.find("search"))
def __init__(self, repos, inifile=None, **kwargs):
    """Initialize the WSGI app with a combined ResourceLoader and a
    layered config built from defaults, an optional INI file and
    keyword overrides."""
    self.repos = repos
    self.log = logging.getLogger("wsgi")
    # FIXME: Cut-n-paste of the method in Resources.__init__
    loadpath = ["."]  # cwd always has priority -- makes sense?
    for repo in repos:
        for entry in ResourceLoader.make_loadpath(repo):
            if entry not in loadpath:
                loadpath.append(entry)
    self.resourceloader = ResourceLoader(*loadpath)
    # FIXME: need to specify documentroot?
    defaults = DocumentRepository.get_default_options()
    if inifile:
        assert os.path.exists(inifile), \
            "INI file %s doesn't exist (relative to %s)" % (inifile,
                                                            os.getcwd())
    # NB: If both inifile and kwargs are specified, the latter
    # will take precedence. I think this is the expected
    # behaviour.
    self.config = LayeredConfig(Defaults(defaults),
                                INIFile(inifile),
                                Defaults(kwargs),
                                cascade=True)
def put_files_in_place(self):
    """Create a single test DocumentRepository and populate it with
    three example documents (parsed + distilled), then relate them
    into the triple store / fulltext index."""
    self.repo = None
    self.repos = [DocumentRepository(datadir=self.datadir,
                                     storetype=self.storetype,
                                     storelocation=self.storelocation,
                                     storerepository=self.storerepository,
                                     indextype=self.indextype,
                                     indexlocation=self.indexlocation)]
    # create three basic documents (at parsed and distilled)
    #
    # each document should have a dcterms:title, a dcterms:issued and a
    # dcterms:publisher, which has a URI
    #
    # basefile  dcterms:title  dcterms:issued  dcterms:publisher
    # 123/a     "Example"      2014-01-04      <http://example.org/publisher/A>
    # 123/b     "Example 2"    2013-09-23      <http://example.org/publisher/B>
    # 123/c     "Of needles"   2014-05-06      <http://example.org/publisher/B>
    for i in ('a', 'b', 'c'):
        # distilled data: convert the canned turtle file to RDF/XML in place
        self.ttl_to_rdf_xml("test/files/base/distilled/123/%s.ttl" % i,
                            self.repos[0].store.distilled_path("123/%s" % i),
                            self.repos[0].store)
        # parsed data: copy the canned XHTML file into the store
        util.ensure_dir(self.repos[0].store.parsed_path("123/%s" % i))
        shutil.copy2("test/files/base/parsed/123/%s.xhtml" % i,
                     self.repos[0].store.parsed_path("123/%s" % i))
        self.repos[0].relate("123/%s" % i)
    # prepare a base.ttl (or var-common.js) that maps
    # <http://example.org/publisher/B> to "Publishing house B"
    self.repos[0].rdf_type = self.repos[0].ns['bibo'].Standard
def tearDown(self): self.setupclass = False # make sure super.tearDown deletes all files super(AdvancedAPI, self).tearDown() FulltextIndex.connect(self.indextype, self.indexlocation, [DocumentRepository()]).destroy() TripleStore.connect(self.storetype, self.storelocation, self.storerepository).clear()
class BasicIndex(object):
    """Shared index-creation/insertion tests, mixed into per-backend
    test classes."""
    repos = [DocumentRepository()]

    def test_create(self):
        """The index created by setUp should exist and expose our schema."""
        # setUp calls FulltextIndex.connect, creating the index
        self.assertTrue(self.index.exists())
        # assert that the schema, using our types, looks OK
        expected = {'basefile': Label(),
                    'dcterms_identifier': Label(boost=16),
                    'dcterms_issued': Datetime(),
                    'dcterms_publisher': Resource(),
                    'dcterms_title': Text(boost=4),
                    'rdf_type': URI(),
                    'repo': Label(),
                    'text': Text(),
                    'uri': Identifier()}
        self.assertEqual(expected, self.index.schema())

    def test_insert(self):
        """Updating an existing document must not create a duplicate."""
        for doc in basic_dataset[:2]:
            self.index.update(**doc)
        self.index.commit()
        self.assertEqual(self.index.doccount(), 2)
        self.index.update(**basic_dataset[2])
        # basic_dataset[3] is an updated version of basic_dataset[1]
        self.index.update(**basic_dataset[3])
        self.index.commit()
        self.assertEqual(self.index.doccount(), 3)
def test_status(self):
    """manager.status should print a summary of downloaded/parsed/
    generated documents; test both the empty and populated cases."""
    # NOTE(review): the exact line breaks / leading whitespace of the
    # expected strings were reconstructed from a collapsed source --
    # verify against manager.status's actual output format.
    want = """
Status for document repository 'base' (ferenda.documentrepository.DocumentRepository)
 download: None.
 parse: None.
 generated: None.
""".strip()
    repo = DocumentRepository(datadir=self.tempdir)
    with patch("builtins.print") as printmock:
        manager.status(repo)
    # join every print() call's first positional argument
    got = "\n".join([x[1][0] for x in printmock.mock_calls])
    self.assertEqual(want, got)
    # test both status and get_status in one swoop.
    # 12 downloaded, 8 parsed, 4 generated documents
    for basefile in range(1, 13):
        util.writefile(repo.store.downloaded_path(str(basefile)),
                       "downloaded %s" % basefile)
    for basefile in range(1, 9):
        util.writefile(repo.store.parsed_path(str(basefile)),
                       "parsed %s" % basefile)
    for basefile in range(1, 5):
        util.writefile(repo.store.generated_path(str(basefile)),
                       "generated %s" % basefile)
    want = """
Status for document repository 'base' (ferenda.documentrepository.DocumentRepository)
 download: 12, 11, 10... (9 more)
 parse: 8, 7, 6... (5 more)
 Todo: 12, 11, 10... (1 more)
 generated: 4, 3, 2... (1 more)
 Todo: 8, 7, 6... (1 more)
""".strip()
    with patch("builtins.print") as printmock:
        manager.status(repo)
    got = "\n".join([x[1][0] for x in printmock.mock_calls])
    self.assertEqual(want, got)
def test_makedocument(self):
    """The makedocument decorator should hand the wrapped function a
    Document bound to the given basefile."""
    @makedocument
    def testfunc(repo, doc):
        return doc

    result = testfunc(DocumentRepository(), "base/file")
    self.assertIsInstance(result, Document)
    self.assertEqual(result.basefile, "base/file")
def test_more(self):
    """toc_select_for_pages should distribute rows over the pagesets
    from toc_pagesets."""
    from ferenda import DocumentRepository
    from rdflib.namespace import DCTERMS
    repo = DocumentRepository()
    rows = [{'uri': 'http://ex.org/1',
             'dcterms_title': 'Abc',
             'dcterms_issued': '2009-04-02'},
            {'uri': 'http://ex.org/2',
             'dcterms_title': 'Abcd',
             'dcterms_issued': '2010-06-30'},
            {'uri': 'http://ex.org/3',
             'dcterms_title': 'Dfg',
             'dcterms_issued': '2010-08-01'}]
    facets = [Facet(DCTERMS.title), Facet(DCTERMS.issued)]
    pagesets = repo.toc_pagesets(rows, facets)
    expected = {
        ('dcterms_title', 'a'): [[Link('Abc', uri='http://ex.org/1')],
                                 [Link('Abcd', uri='http://ex.org/2')]],
        ('dcterms_title', 'd'): [[Link('Dfg', uri='http://ex.org/3')]],
        ('dcterms_issued', '2009'): [[Link('Abc', uri='http://ex.org/1')]],
        ('dcterms_issued', '2010'): [[Link('Abcd', uri='http://ex.org/2')],
                                     [Link('Dfg', uri='http://ex.org/3')]],
    }
    got = repo.toc_select_for_pages(rows, pagesets, facets)
    self.assertEqual(expected, got)
class BasicQuery(object):
    """Shared query tests, mixed into per-backend test classes."""
    repos = [DocumentRepository()]

    def load(self, data):
        """Index every document in *data*, then commit."""
        # print("loading...")
        for doc in data:
            self.index.update(**doc)
        self.index.commit()

    def test_basic(self):
        """Simple term queries should hit the right documents, with
        boosted-field matches ranked first."""
        # an un-initialized fulltext index may throw an exception instead of reporting 0 documents
        # self.assertEqual(self.index.doccount(),0)
        self.load(basic_dataset)
        self.assertEqual(self.index.doccount(), 4)
        hits, pager = self.index.query("main")
        self.assertEqual(len(hits), 1)
        self.assertEqual(hits[0]['dcterms_identifier'], 'Doc #1')
        self.assertEqual(hits[0]['uri'], 'http://example.org/doc/1')
        hits, pager = self.index.query("document")
        self.assertEqual(len(hits), 2)
        # Doc #2 contains the term 'document' in title (which is a
        # boosted field), not just in text.
        self.assertEqual(hits[0]['dcterms_identifier'], 'Doc #2')
        hits, pager = self.index.query("section")
        # can't get these results when using MockESBasicQuery with
        # CREATE_CANNED=True for some reason...
        if type(self) == ESBasicQuery:
            self.assertEqual(len(hits), 1)
            self.assertEqual(len(hits[0]['innerhits']), 2)
            self.assertEqual(hits[0]['innerhits'][0]['dcterms_identifier'],
                             'Doc #1 (section 1)')

    def test_fragmented(self):
        """A small fragment_size should make the result snippet contain
        a fragment connector between the two needle matches."""
        self.load([{'uri': 'http://example.org/doc/3',
                    'repo': 'base',
                    'basefile': '3',
                    'dcterms_title': 'Other example',
                    'dcterms_identifier': 'Doc #3',
                    'text': "Haystack needle haystack haystack haystack "
                            "haystack haystack haystack haystack haystack "
                            "haystack haystack haystack haystack needle "
                            "haystack haystack."}])
        self.index.fragment_size = 60  # the default is 150, which doesn't trigger this
        hits, pager = self.index.query("needle")
        # this should return 1 hit (only 1 document)
        self.assertEqual(1, len(hits))
        # that has a fragment connector (' ... ') in the middle
        self.assertIn(' ... ', "".join(str(x) for x in hits[0]['text']))
def test_select_toc(self):
    """toc_query should select the rows of one named context, or of all
    contexts when called without an argument."""
    import rdflib
    from ferenda import DocumentRepository
    results1 = json.load(open("test/files/datasets/results1.json"))
    results2 = json.load(open("test/files/datasets/results2.json"))
    # load two datasets into two distinct named contexts
    for ttl, ctx in (("test/files/datasets/books.ttl",
                      "http://example.org/ctx/base"),
                     ("test/files/datasets/articles.ttl",
                      "http://example.org/ctx/other")):
        self.loader.add_serialized(util.readfile(ttl),
                                   format="turtle",
                                   context=ctx)
    # Since the query is partially constructed by DocumentRepository, we
    # need to run that code.
    repo = DocumentRepository()
    repo.config.storetype = self.storetype
    repo.rdf_type = rdflib.URIRef("http://purl.org/ontology/bibo/Book")
    # tests 1-3: each context separately, then both combined (no arg)
    cases = (("http://example.org/ctx/base", results1),
             ("http://example.org/ctx/other", results2),
             (None, results1 + results2))
    for ctx, want in cases:
        if ctx is None:
            sq = repo.toc_query()
        else:
            sq = repo.toc_query(ctx)
        got = self.store.select(sq, format="python")
        self.assertEqual(len(got), len(want))
        for row in want:
            self.assertIn(row, got)
    if self.storetype == "SLEEPYCAT":
        self.store.graph.close()
def setUp(self, mock_requests):
    """Wire canned HTTP responses into the mocked requests module, then
    connect to a (mocked) ElasticSearch index."""
    # GET (existence check) answers 404, PUT (index creation) answers 200
    mock_requests.get.side_effect = canned((404, "exists-not.json"),
                                           create=CREATE_CANNED,
                                           method="get")
    mock_requests.put.side_effect = canned((200, "create.json"),
                                           create=CREATE_CANNED,
                                           method="put")
    self.location = "http://localhost:9200/ferenda/"
    self.index = FulltextIndex.connect("ELASTICSEARCH", self.location,
                                       [DocumentRepository()])
def test_default_docrepo(self):
    # Test3: No combining, make sure that a non-customized
    # DocumentRepository works
    s = os.sep  # fix: was assigned twice; the duplicate assignment is removed
    repo = DocumentRepository()
    # but remove any external urls -- that's tested separately in Test5
    repo.config.cssfiles = [x for x in repo.config.cssfiles
                            if not x.startswith("http://")]
    got = Resources([repo], self.tempdir + os.sep + 'rsrc',
                    cssfiles=[],
                    jsfiles=[],
                    imgfiles=[]).make(api=False)
    want = {'css': [s.join(['rsrc', 'css', 'ferenda.css'])],
            'img': [s.join(['rsrc', 'img', 'atom.png'])],
            'js': [s.join(['rsrc', 'js', 'ferenda.js'])],
            'xml': [s.join(['rsrc', 'resources.xml'])]}
    self.assertEqual(want, got)
def test_default_docrepo(self):
    # Test3: No combining, make sure that a non-customized
    # DocumentRepository works. It should not specify any
    # resources (global resources are now specified in
    # ferenda.manager.DEFAULT_CONFIG and not in the base docrepo
    # class) except for the resulting xml file
    s = os.sep  # fix: was assigned twice; the duplicate assignment is removed
    repo = DocumentRepository()
    got = Resources([repo], self.tempdir + os.sep + 'rsrc',
                    cssfiles=[],
                    jsfiles=[],
                    imgfiles=[]).make(api=False)
    want = {'css': [],
            'img': [],
            'js': [],
            'xml': [s.join(['rsrc', 'resources.xml'])]}
    self.assertEqual(want, got)
def __init__(self, repos, resourcedir, **kwargs):
    # FIXME: document what kwargs could be (particularly 'combineresources')
    self.repos = repos
    self.resourcedir = resourcedir
    # configuration: docrepo defaults overridden by caller kwargs
    defaults = DocumentRepository.get_default_options()
    defaults.update(kwargs)
    self.config = LayeredConfig(Defaults(defaults))
    # the below call to setup_logger alters the logging level of
    # the root logger, which can't be good practice. Also, we
    # should probably not log to the root logger, but rather to
    # ferenda.resources.
    #
    # from ferenda.manager import setup_logger
    # self.log = setup_logger()
    self.log = logging.getLogger("ferenda.resources")
    # FIXME: How should we set up a global loadpath from the
    # individual repos?
    loadpath = ["."]  # cwd always has priority -- makes sense?
    for repo in repos:
        for entry in ResourceLoader.make_loadpath(repo):
            if entry not in loadpath:
                loadpath.append(entry)
    self.resourceloader = ResourceLoader(*loadpath)
def tearDown(self):
    """Tear down the fixture and destroy the fulltext index."""
    # disable class-level setup so that super().tearDown() deletes all files
    self.setupclass = False
    super(BasicAPI, self).tearDown()
    index = FulltextIndex.connect(self.indextype, self.indexlocation,
                                  [DocumentRepository()])
    index.destroy()
def setUp(self):
    """Prepare a DocumentRepository backed by a throwaway directory."""
    self.datadir = tempfile.mkdtemp()
    self.repo = DocumentRepository(datadir=self.datadir)
    # show full diffs on assertion failures
    self.maxDiff = None
def setUp(self):
    """Create a scratch directory and a default DocumentRepository."""
    self.repo = DocumentRepository()
    self.tempdir = tempfile.mkdtemp()