def check_abnormal_remove_data_from_exist_and_cache(self, op):
    """
    Check that invoking ``op`` on an abnormal chunk does not touch the
    eXist database and leaves no cached display data behind.

    :param op: Name of the ``Chunk`` method to invoke.
    """
    db = ExistDB()
    # Abnormal chunks are never synced to eXist.
    c = Chunk(data="<div/>", is_normal=False)
    c.clean()
    cache.delete(c.c_hash)
    method = getattr(c, op)
    # Start from empty collections so anything present later must have
    # been created by the code under test.
    db.removeCollection(self.chunk_collection_path, True)
    db.removeCollection(self.display_collection_path, True)
    c.save()
    keys = [c.display_key(kind) for kind in self.prepare_kinds]
    # Saving an abnormal chunk must not have cached anything...
    for kind in self.prepare_kinds:
        self.assertIsNone(cache.get(c.display_key(kind)))
    # ...nor stored anything in eXist.
    self.assertEqual(len(list_collection(db, self.chunk_collection_path)),
                     0)
    self.assertEqual(len(list_collection(db, self.display_collection_path)),
                     0)
    # The operation must not even attempt a document removal.
    with mock.patch('lexicography.models.ExistDB.removeDocument') as \
            remove_mock:
        method()
    self.assertEqual(remove_mock.call_count, 0)
    self.assertEqual(len(list_collection(db, self.chunk_collection_path)),
                     0)
    self.assertEqual(len(list_collection(db, self.display_collection_path)),
                     0)
    for key in keys:
        self.assertIsNone(cache.get(key))
def fetch_xml(pk):
    """
    This function will check in the cache first and if the xml is not
    present there it will load it from eXist and put it back in the
    cache. This is not actually a task but it is so tightly related to
    the ``process_xml`` task that it is included among the other tasks.

    :param pk: The key of the chunk whose display XML is wanted.
    :returns: The XML, or ``None`` when no metadata exists for the
              chunk or eXist returns an empty document.
    """
    key = make_display_key("xml", pk)
    xml = cache.get(key)
    if xml:
        # Cache hit: no database work needed.
        return xml
    # We make this atomic and use select_for_update so that anything
    # else that might want to mess with our chunks is blocked from
    # doing so until we are done.
    with transaction.atomic():
        try:
            meta = ChunkMetadata.objects \
                .select_for_update().get(chunk_id=pk)
        except ChunkMetadata.DoesNotExist:
            # No metadata means no prepared document to fetch.
            meta = None
        xml = None
        if meta:
            path = get_path_for_chunk_hash("display", pk)
            db = ExistDB()
            xml = db.getDocument(path).decode("utf-8")
            if xml:
                # Repopulate the cache for subsequent readers.
                cache.set(key, xml,
                          timeout=settings.LEXICOGRAPHY_XML_TIMEOUT)
    return xml
def _delete_cached_data(self):
    """
    Remove this chunk's data from the eXist database and from the
    cache.
    """
    if self.is_normal:
        # Only normal chunks are stored in eXist, so only they have
        # documents to remove.
        db = ExistDB()
        db.removeDocument(self.exist_path("chunks"), True)
        db.removeDocument(self.exist_path("display"), True)

    # NOTE(review): the cache keys are cleared even for abnormal
    # chunks — presumably cached display data can exist without eXist
    # documents; confirm against the callers.
    cache.delete_many(self.display_key(kind) for kind in self.key_kinds)
def setUp(self):
    """Create the test user and start from a clean cache and eXist."""
    foo = user_model.objects.create(
        username="******", password="******")
    self.foo = foo
    foo.groups.add(Group.objects.get(name='scribe'))
    # Wipe anything left over from previous tests.
    cache.clear()
    exist_db = ExistDB()
    for collection in (self.chunk_collection_path,
                       self.display_collection_path):
        exist_db.removeCollection(collection, True)
    return super(ChunkTransactionTestCase, self).setUp()
def check_deletes_documents(self, op, collection, *args):
    """
    Check that the manager operation named by ``op`` syncs reachable
    chunks to eXist and removes documents for chunks whose database
    rows are gone.

    :param op: Name of the manager method to invoke.
    :param collection: The eXist collection to inspect.
    :param args: Additional arguments to pass to the operation.
    """
    c = Chunk(data="<div/>", is_normal=True)
    c.save()
    entry = self.make_reachable(c)
    # If it does not have metadata yet, that's fine.
    try:
        c.chunkmetadata.delete()
    except ChunkMetadata.DoesNotExist:
        pass
    # We have to delete the collection because merely saving the
    # chunk causes it to be synced, but this is not what we are
    # testing here. We want to make sure that calling
    # op will perform the sync.
    db = ExistDB()
    db.removeCollection(collection, True)
    self.assertEqual(len(list_collection(db, collection)), 0)
    op = getattr(self.manager, op)
    op(*args)
    self.assertEqual(len(list_collection(db, collection)), 1)
    # Make sure our chunk was not collected.
    self.assertEqual(self.manager.count(), 1)
    # Now we delete the chunk in SQL because we do not want the
    # ``delete`` method to be called, as it would take care of
    # removing the document itself. (And yes, we do interpolate
    # the table name. This is safe as ``Entry._meta.db_table`` is
    # a value under our control.)
    with connection.cursor() as cursor:
        cr = entry.latest
        cursor.execute(
            "DELETE FROM {} WHERE id = %s".format(entry._meta.db_table),
            [entry.pk])
        # We have to do this ourselves because Django's cascading
        # delete is implemented at the ORM level, not the database
        # level.
        cursor.execute(
            "DELETE FROM {} WHERE id = %s".format(cr._meta.db_table),
            [cr.pk])
    # Check that no collection or syncing has occurred.
    self.assertEqual(self.manager.count(), 1)
    self.assertEqual(len(list_collection(db, collection)), 1)
    op(*args)
    # Make sure our chunk was collected.
    self.assertEqual(self.manager.count(), 0)
    self.assertEqual(len(list_collection(db, collection)), 0)
def prepare_xml(pk): """ This function prepares a chunk for display and caches the result of the prepared XML. :param pk: The primary key of the chunk to prepare. :type pk: :class:`int` """ # By using atomicity and using select_for_update we are # effectively preventing other prepare_xml tasks from working on # the same chunk at the same time. with transaction.atomic(): chunk = Chunk.objects.get(pk=pk) key = chunk.display_key("xml") logger.debug("%s processing...", key) meta, _ = ChunkMetadata.objects \ .select_for_update() \ .get_or_create(chunk=chunk) data = chunk.data xml, sf_records = prepare_article_data(data) cache.set(key, xml, timeout=settings.LEXICOGRAPHY_XML_TIMEOUT) logger.debug("%s is set", key) sha1 = hashlib.sha1() sha1.update(xml.encode('utf-8')) xml_hash = sha1.hexdigest() db = ExistDB() path = get_path_for_chunk_hash("display", pk) absent = not db.hasDocument(path) if meta.xml_hash != xml_hash or absent: # This is something that should not happen ever. It has # happened once in development but it is unclear what could # have been the cause. if meta.xml_hash == xml_hash and absent: logger.error( "%s was missing from eXist but had a value " "already set and equal to the new hash; this " "should not happen!", path) meta.semantic_fields.set(sf_records) # Technically, if it was created then xml_hash is already # set, but putting this in an conditional block does not # provide for better performance. meta.xml_hash = xml_hash meta.save() if not db.load(xml.encode("utf-8"), path): raise Exception("could not sync with eXist database")
def prepare_xml(pk):
    """
    This function prepares a chunk for display and caches the result of
    the prepared XML.

    :param pk: The primary key of the chunk to prepare.
    :type pk: :class:`int`
    """
    # Atomicity combined with select_for_update keeps other
    # prepare_xml tasks from working on the same chunk concurrently.
    with transaction.atomic():
        chunk = Chunk.objects.get(pk=pk)
        cache_key = chunk.display_key("xml")
        logger.debug("%s processing...", cache_key)
        meta, _created = ChunkMetadata.objects \
            .select_for_update() \
            .get_or_create(chunk=chunk)
        prepared, sf_records = prepare_article_data(chunk.data)
        # Cache the prepared XML right away for readers.
        cache.set(cache_key, prepared,
                  timeout=settings.LEXICOGRAPHY_XML_TIMEOUT)
        logger.debug("%s is set", cache_key)
        new_hash = hashlib.sha1(prepared.encode('utf-8')).hexdigest()
        db = ExistDB()
        path = get_path_for_chunk_hash("display", pk)
        missing = not db.hasDocument(path)
        # Write to eXist only when the document is missing or the
        # prepared data changed.
        if missing or meta.xml_hash != new_hash:
            if missing and meta.xml_hash == new_hash:
                # This should never happen. It occurred once in
                # development but the cause is unclear.
                logger.error("%s was missing from eXist but had a value "
                             "already set and equal to the new hash; this "
                             "should not happen!", path)
            meta.semantic_fields.set(sf_records)
            # If the record was just created then xml_hash is already
            # set, but guarding the assignment would gain nothing.
            meta.xml_hash = new_hash
            meta.save()
            if not db.load(prepared.encode("utf-8"), path):
                raise Exception("could not sync with eXist database")
def check_skip_abnormal_chunks(self, op, collection, *args):
    """Check that invoking ``op`` does not put abnormal chunks in eXist."""
    chunk = Chunk(data="", is_normal=False)
    chunk.save()
    # Merely saving the chunk already triggers a sync, which is not
    # what we are testing. We want to make sure that calling
    # sync_with_exist performs (or, here, skips) the sync, so start
    # from an empty collection.
    db = ExistDB()
    db.removeCollection(collection, True)
    self.assertEqual(len(list_collection(db, collection)), 0)
    method = getattr(chunk, op)
    method(*args)
    self.assertEqual(len(list_collection(db, collection)), 0)
def test_when_chunk_becomes_hidden_cached_data_is_cleared(self):
    """
    When a ``Chunk`` becomes hidden, then its cached data is deleted.
    """
    chunk = Chunk(data="<div/>", is_normal=True)
    chunk.save()
    cached_keys = [chunk.display_key(kind) for kind in self.prepare_kinds]
    entry = Entry()
    entry.update(self.foo, "q", chunk, "foo", ChangeRecord.CREATE,
                 ChangeRecord.MANUAL)
    db = ExistDB()
    # While visible, the chunk has cached data and eXist documents.
    self.assertIsNotNone(cache.get(chunk.display_key("xml")))
    self.assertEqual(
        len(list_collection(db, self.chunk_collection_path)), 1)
    self.assertEqual(
        len(list_collection(db, self.display_collection_path)), 1)
    # Hide the chunk...
    entry.latest.hidden = True
    entry.latest.save()
    # ...and all cached and stored data must be gone.
    for key in cached_keys:
        self.assertIsNone(cache.get(key))
    self.assertEqual(
        len(list_collection(db, self.chunk_collection_path)), 0)
    self.assertEqual(
        len(list_collection(db, self.display_collection_path)), 0)
def test_no_exist_document(self):
    """
    When the exist document is missing, raise an error. We want an
    error because it indicates something really broken about our
    internal state. We should never have metadata without a
    corresponding XML file.
    """
    record = ChangeRecord.objects.get(pk=1)
    chunk = record.c_hash
    self.assertIsNotNone(cache.get(chunk.display_key("xml")))
    # Drop the cache so fetch_xml must go to eXist, and drop the
    # display collection so that the eXist lookup fails.
    cache.clear()
    db = ExistDB()
    db.removeCollection(get_collection_path("display"), True)
    with self.assertRaises(ExistDBException):
        tasks.fetch_xml(chunk.c_hash)
def sync_with_exist(self):
    """Sync every syncable chunk to eXist and drop stale documents."""
    self.collect()
    db = ExistDB()
    synced = set()
    for chunk in self.all_syncable_chunks():
        chunk.sync_with_exist(db)
        synced.add(chunk.c_hash)
    # Whatever remains in the collection but was not just synced is
    # stale and must go.
    self._remove_absent(db, synced, get_collection_path("chunks"))
def test_sync_handles_overwrites(self):
    """
    ``sync_with_exist`` will not overwrite documents already in eXist.
    """
    db = ExistDB()
    db.removeCollection(self.chunk_collection_path, True)
    chunk = Chunk(data="<div/>", is_normal=True)
    chunk.save()
    chunk.sync_with_exist()
    self.assertEqual(
        len(list_collection(db, self.chunk_collection_path)), 1)
    # A second sync of the (immutable) chunk must not reload it.
    with mock.patch('lexicography.models.ExistDB.load') as load_mock:
        chunk.sync_with_exist()
    self.assertEqual(load_mock.call_count, 0,
                     "load should not have been called!")
    self.assertEqual(
        len(list_collection(db, self.chunk_collection_path)), 1)
def check_remove_data_from_exist_and_cache(self, op):
    """
    Check that invoking ``op`` will remove the data from the eXist
    database and the cache.

    :param op: Either a callable to invoke directly, or the name of
               the ``Chunk`` method to invoke.
    """
    db = ExistDB()
    c = Chunk(data="<div/>", is_normal=True)
    c.clean()
    method = op if callable(op) else getattr(c, op)
    cache.delete(c.c_hash)
    # Start from empty collections and an empty cache.
    db.removeCollection(self.chunk_collection_path, True)
    db.removeCollection(self.display_collection_path, True)
    self.assertEqual(len(list_collection(db, self.chunk_collection_path)),
                     0)
    self.assertEqual(len(list_collection(db, self.display_collection_path)),
                     0)
    for kind in self.prepare_kinds:
        self.assertIsNone(cache.get(c.display_key(kind)))
    c.save()
    c._create_cached_data()
    keys = [c.display_key(kind) for kind in self.prepare_kinds]
    # Only the "xml" data is created on save.
    self.assertIsNotNone(cache.get(c.display_key("xml")))
    self.assertEqual(len(list_collection(db, self.chunk_collection_path)),
                     1)
    self.assertEqual(len(list_collection(db, self.display_collection_path)),
                     1)
    # The operation must clear both eXist collections and every
    # cached display key.
    method()
    self.assertEqual(len(list_collection(db, self.chunk_collection_path)),
                     0)
    self.assertEqual(len(list_collection(db, self.display_collection_path)),
                     0)
    for key in keys:
        self.assertIsNone(cache.get(key))
def check_abnormal_remove_data_from_exist_and_cache(self, op):
    """
    Check that invoking ``op`` on an abnormal chunk leaves eXist and
    the cache untouched.
    """
    db = ExistDB()
    chunk = Chunk(data="<div/>", is_normal=False)
    chunk.clean()
    cache.delete(chunk.c_hash)
    method = getattr(chunk, op)
    # Start from empty collections.
    db.removeCollection(self.chunk_collection_path, True)
    db.removeCollection(self.display_collection_path, True)
    chunk.save()
    keys = [chunk.display_key(kind) for kind in self.prepare_kinds]
    # Saving an abnormal chunk caches nothing and stores nothing.
    for key in keys:
        self.assertIsNone(cache.get(key))
    self.assertEqual(
        len(list_collection(db, self.chunk_collection_path)), 0)
    self.assertEqual(
        len(list_collection(db, self.display_collection_path)), 0)
    with mock.patch('lexicography.models.ExistDB.removeDocument') \
            as remove_mock:
        method()
    # The operation must not even attempt a document removal.
    self.assertEqual(remove_mock.call_count, 0)
    self.assertEqual(
        len(list_collection(db, self.chunk_collection_path)), 0)
    self.assertEqual(
        len(list_collection(db, self.display_collection_path)), 0)
    for key in keys:
        self.assertIsNone(cache.get(key))
def check_skip_abnormal_chunks(self, op, collection, *args):
    """Check that the manager operation ``op`` skips abnormal chunks."""
    chunk = Chunk(data="", is_normal=False)
    chunk.save()
    self.make_reachable(chunk)
    db = ExistDB()
    self.assertEqual(len(list_collection(db, collection)), 0)
    operation = getattr(self.manager, op)
    operation(*args)
    # Nothing must have been put into eXist...
    self.assertEqual(len(list_collection(db, collection)), 0)
    # ...and our chunk must not have been collected.
    self.assertEqual(self.manager.count(), 1)
def _delete_cached_data(self):
    """
    Remove this chunk's data from the eXist database and from the
    cache.
    """
    if self.is_normal:
        # Only normal chunks are stored in eXist, so only they have
        # documents to remove.
        db = ExistDB()
        db.removeDocument(self.exist_path("chunks"), True)
        db.removeDocument(self.exist_path("display"), True)

    # NOTE(review): the cache keys are cleared even for abnormal
    # chunks — presumably cached display data can exist without eXist
    # documents; confirm against the callers.
    cache.delete_many(
        self.display_key(kind) for kind in self.key_kinds)
def check_syncs_normal_chunks(self, op, collection, *args):
    """Check that the manager operation ``op`` syncs normal chunks."""
    chunk = Chunk(data="<div/>", is_normal=True)
    chunk.save()
    self.make_reachable(chunk)
    # If it does not have metadata yet, that's fine.
    try:
        chunk.chunkmetadata.delete()
    except ChunkMetadata.DoesNotExist:
        pass
    # Merely saving the chunk already triggers a sync, which is not
    # what we are testing here. We want to make sure that calling
    # sync_with_exist performs the sync, so start from an empty
    # collection.
    db = ExistDB()
    db.removeCollection(collection, True)
    self.assertEqual(len(list_collection(db, collection)), 0)
    operation = getattr(self.manager, op)
    operation(*args)
    self.assertEqual(len(list_collection(db, collection)), 1)
    # Make sure our chunk was not collected.
    self.assertEqual(self.manager.count(), 1)
def sync_with_exist(self, db=None):
    """
    Store this chunk in eXist, unless it is abnormal or already
    stored.

    :param db: An optional database connection to reuse; when not
               given, a new one is opened.
    :raises Exception: If eXist refuses the load.
    """
    # We do not put "abnormal" chunks in exist.
    if not self.is_normal:
        return
    db = db or ExistDB()
    path = self.exist_path("chunks")
    # Chunks are immutable: if this one is already in eXist, do not
    # reload identical data — it is unclear whether eXist would
    # stupidly reindex it, so we proactively avoid the situation.
    if db.hasDocument(path):
        return
    if not db.load(self.data.encode("utf-8"), path):
        raise Exception("could not sync with eXist database")
def setUp(self):
    # Create the test user, put it in the "scribe" group, and start
    # each test from an empty cache and empty eXist collections.
    self.foo = foo = user_model.objects.create(username="******",
                                               password="******")
    scribe = Group.objects.get(name='scribe')
    foo.groups.add(scribe)
    cache.clear()
    db = ExistDB()
    db.removeCollection(self.chunk_collection_path, True)
    db.removeCollection(self.display_collection_path, True)
    return super(ChunkTransactionTestCase, self).setUp()
def prepare(self, kind, include_unpublished):
    """
    Prepare display data for the syncable chunks and remove stale
    documents from the display collection.

    :param kind: Only ``"xml"`` is supported.
    :param include_unpublished: Whether chunks without a published
        change record are also prepared.
    :raises ValueError: If ``kind`` is not ``"xml"``.
    """
    if kind != "xml":
        raise ValueError("the manager only supports preparing XML data; "
                         "future versions may support other kinds")
    self.collect()
    db = ExistDB()
    to_prepare = self.all_syncable_chunks()
    if not include_unpublished:
        # Restrict preparation to chunks with a published record.
        to_prepare = to_prepare.filter(changerecord__published=True)
    prepared = set()
    for chunk in to_prepare:
        chunk.prepare("xml", True)
        prepared.add(chunk.c_hash)
    self._remove_absent(db, prepared, get_collection_path("display"))
def test_not_cached(self):
    """
    When the data is not cached, get it from eXist.
    """
    cr = ChangeRecord.objects.get(pk=1)
    chunk = cr.c_hash
    key = chunk.display_key("xml")
    xml_doc = cache.get(key)
    # BUG FIX: the original asserted ``self.assertIsNotNone(xml)``,
    # which tested the imported ``xml`` module (always non-None)
    # instead of the cached document. Assert on the document itself,
    # and before stripping the declaration so a missing document
    # fails with a clear message rather than inside strip_xml_decl.
    self.assertIsNotNone(xml_doc)
    _, xml_doc = xml.strip_xml_decl(xml_doc)
    db = ExistDB()
    cache.delete(key)
    with mock.patch('lexicography.models.ExistDB.getDocument',
                    wraps=db.getDocument) as get_mock:
        self.assertEqual(tasks.fetch_xml(chunk.c_hash), xml_doc)
    # fetch_xml must have repopulated the cache from eXist with
    # exactly one eXist access.
    self.assertEqual(cache.get(key), xml_doc)
    self.assertEqual(get_mock.call_count, 1)
def __call__(self, command, _options):
    """
    Load initial data into a new database. This is necessary for
    BTW to run.
    """
    assert_running()
    from django.utils import translation
    translation.activate('en-us')
    db = ExistDB()
    # Recreate the chunk collection from scratch, then push every
    # chunk into it.
    chunk_collection_path = get_collection_path("chunks")
    if db.hasCollection(chunk_collection_path):
        db.removeCollection(chunk_collection_path)
    Chunk.objects.sync_with_exist()
    # Likewise for the display data; only published data is prepared.
    display_path = get_collection_path("display")
    if db.hasCollection(display_path):
        db.removeCollection(display_path)
    Chunk.objects.prepare("xml", include_unpublished=False)
def hashes_with_semantic_field(self, sf):
    """
    Returns a set of chunk *hashes* that contain the semantic field
    requested.
    """
    db = ExistDB()
    query = xquery.format("""\
for $m in collection({db})//btw:sf[@ref = {path}]
return util:document-name($m)""",
                          db=get_collection_path("display"),
                          path=sf)
    hashes = set()
    for query_chunk in query_iterator(db, query):
        hashes.update(query_chunk.values)
    return hashes
def __call__(self, command, _options):
    # Create the eXist user groups and the BTW server account, and
    # set the account's primary group.
    assert_running()
    db = get_admin_db()
    for (group, desc) in command.new_user_groups.items():
        db.server.addGroup(
            group, {'http://exist-db.org/security/description': desc})
    # Account is created enabled (True) with umask 0o022 and
    # membership in all the newly created groups.
    db.server.addAccount(
        command.server_user, settings.EXISTDB_SERVER_PASSWORD, "",
        list(command.new_user_groups.keys()), True, 0o022, {
            'http://exist-db.org/security/description': 'BTW user'
        })
    db.server.setUserPrimaryGroup(command.server_user, command.btw_group)
    # NOTE(review): this rebinds ``db`` to a plain ExistDB connection
    # whose value is never used — presumably a smoke test that the
    # new account can connect; confirm and consider an explicit check.
    db = ExistDB()
def __call__(self, command, _options):
    """
    Load initial data into a new database. This is necessary for
    BTW to run.
    """
    assert_running()
    from django.utils import translation
    translation.activate('en-us')
    db = ExistDB()
    # Rebuild the chunk collection from scratch.
    chunks_path = get_collection_path("chunks")
    if db.hasCollection(chunks_path):
        db.removeCollection(chunks_path)
    Chunk.objects.sync_with_exist()
    # Rebuild the display collection from scratch.
    display_collection = get_collection_path("display")
    if db.hasCollection(display_collection):
        db.removeCollection(display_collection)
    Chunk.objects.prepare("xml", True)
def check_remove_data_from_exist_and_cache(self, op):
    """
    Check that invoking ``op`` will remove the data from the eXist
    database and the cache.

    :param op: Either a callable to invoke directly, or the name of
               the ``Chunk`` method to invoke.
    """
    db = ExistDB()
    c = Chunk(data="<div/>", is_normal=True)
    c.clean()
    # Use the builtin ``callable`` instead of ``isinstance(op,
    # Callable)``: it is equivalent, idiomatic, avoids the dependency
    # on ``Callable``, and matches the sibling helper in this suite.
    method = op if callable(op) else getattr(c, op)
    cache.delete(c.c_hash)
    # Start from empty collections and an empty cache.
    db.removeCollection(self.chunk_collection_path, True)
    db.removeCollection(self.display_collection_path, True)
    self.assertEqual(len(list_collection(db, self.chunk_collection_path)),
                     0)
    self.assertEqual(
        len(list_collection(db, self.display_collection_path)), 0)
    for kind in self.prepare_kinds:
        self.assertIsNone(cache.get(c.display_key(kind)))
    c.save()
    c._create_cached_data()
    keys = [c.display_key(kind) for kind in self.prepare_kinds]
    # Only the "xml" data is created on save.
    self.assertIsNotNone(cache.get(c.display_key("xml")))
    self.assertEqual(len(list_collection(db, self.chunk_collection_path)),
                     1)
    self.assertEqual(
        len(list_collection(db, self.display_collection_path)), 1)
    # The operation must clear both eXist collections and every
    # cached display key.
    method()
    self.assertEqual(len(list_collection(db, self.chunk_collection_path)),
                     0)
    self.assertEqual(
        len(list_collection(db, self.display_collection_path)), 0)
    for key in keys:
        self.assertIsNone(cache.get(key))
def list_display_collection(self):
    # NOTE(review): despite its name, this helper lists the *chunk*
    # collection, not the display collection — confirm whether
    # ``self.display_collection_path`` was intended here, or rename
    # the helper if the current behavior is deliberate.
    db = ExistDB()
    return list_collection(db, self.chunk_collection_path)
def test_complex_document(self):
    """
    Preparing a complex article stores the document in eXist and
    caches prepared XML whose per-sense and overview semantic-field
    lists are correct.
    """
    # Yeah, we launch it here. The other tests don't need this
    # data so...
    launch_fetch_task()
    entry = create_valid_article()
    cr = entry.latest
    chunk = cr.c_hash
    tasks.prepare_xml.delay(chunk.c_hash).get()
    # Check that the correct results are in the cache.
    result = cache.get(chunk.display_key("xml"))
    db = ExistDB()
    self.assertTrue(db.hasDocument(chunk.exist_path("display")))
    tree = lxml.etree.fromstring(result)
    senses = tree.xpath(
        "/btw:entry/btw:sense-discrimination/btw:sense",
        namespaces=xml.default_namespace_mapping)
    self.assertEqual(len(senses), 4)
    # Expected semantic-field lists, one sub-list per sense, in
    # document order.
    expected_values = [
        [
            "01.02.11n",
            "Person (01.04.04n)",
            "01.04.08n",
            "01.05.05.09.01n",
            "01.06.07.03n",
            "Beautification (02.02.18n)",
            "Lack of beauty (02.02.19n)",
            "Written laws (03.05.01n)",
        ],
        [
            "Belief (02.01.13n)",
            "Belief, trust, confidence (02.01.13.02n)",
            "Act of convincing, conviction (02.01.13.02.02n)",
            "Absence of doubt, confidence (02.01.13.08.11n)",
            "Making certain, assurance (02.01.13.08.11.01.01n)",
            "Expectation (02.01.14n)",
            "02.01.17n",
            "Good taste (02.02.12n)",
            "Bad taste (02.02.13n)",
            "Fashionableness (02.02.14n)",
            "02.02.22n",
            "Education (03.07n)",
        ],
        [
            "01.05.05.12.01n"
        ],
        [
            "02.01.17n",
            "Good taste (02.02.12n)",
            "Bad taste (02.02.13n)",
            "03.07.00.23n",
            "Learning (03.07.03n)"
        ],
    ]
    # Each sense must carry exactly one btw:semantic-fields element
    # containing the expected list.
    for ix, (sense, expected) in enumerate(zip(senses, expected_values)):
        sense_label = "sense " + str(ix + 1)
        sfss = sense.xpath("./btw:semantic-fields",
                           namespaces=xml.default_namespace_mapping)
        self.assertEqual(len(sfss), 1,
                         "there should be only one btw:semantic-fields "
                         "in " + sense_label)
        sfs = [sf.text for sf in sfss[0]]
        self.assertEqual(sfs, expected,
                         "the list of semantic fields should be correct "
                         "in " + sense_label)
    # The overview combines the fields of all senses.
    sfss = tree.xpath("/btw:entry/btw:overview/btw:semantic-fields",
                      namespaces=xml.default_namespace_mapping)
    self.assertEqual(len(sfss), 1,
                     "there should be only one btw:semantic-fields "
                     "element")
    sfs = [sf.text for sf in sfss[0]]
    self.assertEqual(sfs, [
        "01.02.11n",
        "Person (01.04.04n)",
        "01.04.08n",
        "By eating habits (01.05.05n)",
        "01.06.07n",  # By family relationships
        "Belief (02.01.13n)",
        "Expectation (02.01.14n)",
        "02.01.17n",
        "Good taste (02.02.12n)",
        "Bad taste (02.02.13n)",
        "Fashionableness (02.02.14n)",
        "Beautification (02.02.18n)",
        "Lack of beauty (02.02.19n)",
        "02.02.22n",
        "Written laws (03.05.01n)",
        "Education (03.07n)",
        "03.07.00n",
        "Learning (03.07.03n)"
    ], "the list of semantic fields should be correct")
    # The semantic-fields element must be the last child of the
    # overview.
    self.assertIsNone(sfss[0].getnext())