def full_dedup(limit=1000):
    """
    Sweep every IndexedFile document and run it through ``from_path``.

    Documents are fetched ``limit`` at a time, sorted by ``_id``, and paged
    with ``search_after`` using the sort value of the last raw hit of the
    previous page. Every hit that has both a name and a path is handed to
    ``IndexedFile.from_path(system, path)``.

    NOTE(review): judging by this function's name, ``from_path`` is what
    collapses duplicate documents -- confirm against its implementation.

    A failure on a single document is printed and skipped so that one bad
    document does not abort the whole sweep.

    :param int limit: Number of documents requested per page.
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile

    # Extra search-body options; ``search_after`` is added once the first
    # page has been consumed.
    page_opts = {'size': limit}

    while True:
        res = IndexedFile.search().sort('_id').extra(**page_opts).execute()
        if not res.hits:
            break

        for hit in res.hits:
            # Without both name and path there is nothing to look up.
            if hit.name is None or hit.path is None:
                continue
            print(hit.meta.id)
            try:
                IndexedFile.from_path(hit.system, hit.path)
            except Exception as e:
                # Deliberately best-effort: report and move on.
                print(e)

        # The sort value of the last raw hit is the cursor for the next page.
        search_after = res.hits.hits[-1]['sort']
        logger.debug(search_after)
        page_opts['search_after'] = search_after
def test_from_path_multiple_hits(self, mock_refresh, mock_get, mock_search, mock_delete):
    """
    When there are multiple files sharing a system and path, ensure we
    delete all but one and return the remaining document.

    The mock arguments are presumably injected by ``patch`` decorators that
    sit above this method; ``mock_refresh`` and ``mock_delete`` are not
    inspected here.
    """
    search_res = IndexedFile(**{
        'name': 'res1',
        'system': 'test.system',
        'path': '/path/to/res1'
    })

    # A search result that reports three hits for the same system/path.
    mock_res = MagicMock()
    mock_res.hits.total.value = 3
    mock_search().filter().execute.return_value = mock_res
    mock_get.return_value = search_res

    doc_from_path = IndexedFile.from_path('test.system', '/path/to/res1')

    # Exactly one delete is expected on the filtered search.
    self.assertEqual(mock_search().filter().delete.call_count, 1)
    self.assertEqual(doc_from_path, search_res)
def test_from_path_multiple_hits(self, mock_search, mock_delete):
    """
    When there are multiple files sharing a system and path, ensure we
    delete all but one and return the remaining document.

    The mock arguments are presumably injected by ``patch`` decorators that
    sit above this method.
    """
    search_res = IndexedFile(**{
        'name': 'res1',
        'system': 'test.system',
        'path': '/path/to/res1'
    })

    # Need to mock either slicing the result or retrieving a single element.
    def mock_getitem(i):
        if isinstance(i, slice):
            return [search_res, search_res]
        return search_res

    # Mock a search result with 3 hits and the ability to get/slice.
    mock_res = MagicMock()
    mock_res.hits.total = 3
    mock_res.__getitem__.side_effect = mock_getitem
    mock_search().filter().filter().execute.return_value = mock_res

    doc_from_path = IndexedFile.from_path('test.system', '/path/to/res1')

    # Order matters: the second assertion itself calls ``filter()`` with no
    # arguments, which would become the most recent call on
    # ``mock_search().filter`` and break the first assertion if it ran first.
    mock_search().filter.assert_called_with(
        'term', **{'system._exact': 'test.system'})
    mock_search().filter().filter.assert_called_with(
        'term', **{'path._exact': '/path/to/res1'})

    # Two of the three hits are expected to be deleted.
    self.assertEqual(mock_delete.call_count, 2)
    self.assertEqual(doc_from_path, search_res)
def repair_paths(limit=1000):
    """
    Rewrite the ``path`` and ``basePath`` of every IndexedFile document.

    Documents are fetched ``limit`` at a time, sorted by ``_uid``, and paged
    with ``search_after`` using the sort value of the last raw hit of the
    previous page. For each hit the corrected path comes from
    ``repair_path(name, path)``; the document is then updated with that path
    and its parent directory, and passed through ``IndexedFile.from_path``
    to remove any duplicates.

    NOTE(review): sorting on ``_uid`` ties this to Elasticsearch versions
    that still expose that field -- confirm against the deployed cluster.

    :param int limit: Number of documents requested per page.
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile

    # Extra search-body options; ``search_after`` is added once the first
    # page has been consumed.
    page_opts = {'size': limit}

    while True:
        res = IndexedFile.search().sort('_uid').extra(**page_opts).execute()
        if not res.hits:
            break

        for hit in res.hits:
            # Single pre-formatted argument so the output is the same
            # "name path" line under both Python 2 and Python 3.
            print('%s %s' % (hit.name, hit.path))
            new_path = repair_path(hit.name, hit.path)
            # One partial update for both fields instead of two round-trips.
            hit.update(path=new_path, basePath=os.path.dirname(new_path))
            # use from_path to remove any duplicates.
            IndexedFile.from_path(hit.system, hit.path)

        # The sort value of the last raw hit is the cursor for the next page.
        search_after = res.hits.hits[-1]['sort']
        logger.debug(search_after)
        page_opts['search_after'] = search_after
def test_from_path_1_hit(self, mock_search):
    """
    With a single hit for the system/path pair, ``from_path`` should
    filter on system, then on path, and return that one document.

    ``mock_search`` is presumably injected by a ``patch`` decorator above
    this method.
    """
    expected_doc = IndexedFile(
        name='res1',
        system='test.system',
        path='/path/to/res1',
    )

    # A search result reporting one hit, whose indexing yields the document.
    single_hit = MagicMock()
    single_hit.hits.total = 1
    single_hit.__getitem__.return_value = expected_doc
    mock_search().filter().filter().execute.return_value = single_hit

    result = IndexedFile.from_path('test.system', '/path/to/res1')

    system_term = {'system._exact': 'test.system'}
    path_term = {'path._exact': '/path/to/res1'}
    # Keep this order: the second check calls ``filter()`` with no arguments
    # on its way to the nested mock, which would otherwise become the last
    # recorded call on ``mock_search().filter``.
    mock_search().filter.assert_called_with('term', **system_term)
    mock_search().filter().filter.assert_called_with('term', **path_term)
    self.assertEqual(result, expected_doc)
def test_from_path_raises_when_no_hits(self, mock_refresh, mock_search):
    """
    ``from_path`` should raise DocumentNotFound when the executed search
    reports a total of zero hits.

    The mock arguments are presumably injected by ``patch`` decorators
    above this method; ``mock_refresh`` is not inspected here.
    """
    empty_result = mock_search().filter().execute.return_value
    empty_result.hits.total.value = 0

    with self.assertRaises(DocumentNotFound):
        IndexedFile.from_path('test.system', '/')
def test_from_path_with_404(self, mock_refresh, mock_search):
    """
    A TransportError(404) raised while executing the search should
    propagate out of ``from_path`` unchanged.

    The mock arguments are presumably injected by ``patch`` decorators
    above this method; ``mock_refresh`` is not inspected here.
    """
    failing_execute = mock_search().filter().execute
    failing_execute.side_effect = TransportError(404)

    with self.assertRaises(TransportError):
        IndexedFile.from_path('test.system', '/')