Beispiel #1
0
def full_dedup(limit=1000):
    """Walk the whole file index and re-resolve every document by path.

    Pages through ``IndexedFile`` sorted by ``_id`` (using ``search_after``
    for the next page) and calls ``IndexedFile.from_path(system, path)`` on
    every hit that has both a name and a path. A failure on one document is
    printed and does not stop the walk.

    NOTE(review): the de-duplication itself is expected to happen inside
    ``IndexedFile.from_path`` -- confirm against the model.

    Params
    ------
    limit: int
        Number of documents fetched per page.
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile

    res = IndexedFile.search().sort('_id').extra(size=limit).execute()

    while res.hits:
        for hit in res.hits:
            # Documents without a name or path cannot be resolved by path.
            if hit.name is None or hit.path is None:
                continue

            print(hit.meta.id)
            try:
                IndexedFile.from_path(hit.system, hit.path)
            except Exception as e:
                # Best effort: report the failure and keep going.
                print(e)

        # The sort value of the last raw hit is the cursor for the next page.
        search_after = res.hits.hits[-1]['sort']
        logger.debug(search_after)
        res = IndexedFile.search().sort('_id').extra(
            size=limit, search_after=search_after).execute()
Beispiel #2
0
    def test_delete_recursive(self, mock_children, mock_delete):
        """Deleting a folder issues one delete for it and one per child."""

        def build_file(name, path, fmt):
            # Attributes are written with object.__setattr__ so they land on
            # the instance itself (presumably BaseESFile's own attribute
            # hooks are patched in this test -- confirm against decorators).
            doc = IndexedFile(name=name,
                              system='test.system',
                              path=path,
                              format=fmt)
            es_file = BaseESFile('test_user',
                                 system='test.system',
                                 wrapped_doc=doc)
            for attr, value in (('_wrapped', doc), ('format', fmt),
                                ('path', path)):
                object.__setattr__(es_file, attr, value)
            return es_file

        folder = build_file('folder1', '/path/to/folder', 'folder')
        child = build_file('child1', '/path/to/child1', 'file')

        mock_children.return_value = iter([child])

        folder.delete()
        # Assert 2 delete calls: 1 for parent, 1 for child
        self.assertEqual(mock_delete.call_count, 2)
Beispiel #3
0
    def test_children_returns_when_hits(self, mock_search, mock_get):
        """``IndexedFile.children`` returns the fetched docs and the sort key."""
        hit_doc = IndexedFile(name='res1',
                              system='test.system',
                              path='/path/to/res1')
        hit_doc.meta.id = 'MOCK ID'

        # One handle on the mocked filter/filter/sort/extra chain.
        execute = mock_search().filter().filter().sort().extra().execute
        execute.return_value.hits.__len__.return_value = 1
        execute().__iter__.return_value = [hit_doc]
        execute.return_value.hits.hits = [{'sort': 'MOCK SORTKEY'}]

        mock_get.return_value = hit_doc

        result = IndexedFile.children('test_user',
                                      system='test.system',
                                      path='/')
        mock_get.assert_called_with('MOCK ID')
        self.assertEqual(result, ([hit_doc], 'MOCK SORTKEY'))
Beispiel #4
0
    def test_from_path_multiple_hits(self, mock_refresh, mock_get, mock_search,
                                     mock_delete):
        """
        When there are multiple files sharing a system and path, ensure we delete
        all but one and return the remaining document.
        """
        search_res = IndexedFile(**{
            'name': 'res1',
            'system': 'test.system',
            'path': '/path/to/res1'
        })

        # Mock a search response reporting 3 hits for the same system/path.
        mock_res = MagicMock()
        mock_res.hits.total.value = 3
        mock_search().filter().execute.return_value = mock_res
        mock_get.return_value = search_res

        doc_from_path = IndexedFile.from_path('test.system', '/path/to/res1')

        # The duplicates are removed with a single delete on the search.
        self.assertEqual(mock_search().filter().delete.call_count, 1)

        self.assertEqual(doc_from_path, search_res)
Beispiel #5
0
    def test_from_path_multiple_hits(self, mock_search, mock_delete):
        """
        With several documents sharing a system and path, all but one are
        deleted and the surviving document is returned.
        """
        surviving_doc = IndexedFile(name='res1',
                                    system='test.system',
                                    path='/path/to/res1')

        # The response must support both slicing and single-item access.
        def fake_getitem(key):
            if isinstance(key, slice):
                return [surviving_doc, surviving_doc]
            return surviving_doc

        # A search response with 3 hits that can be indexed and sliced.
        response = MagicMock()
        response.hits.total = 3
        response.__getitem__.side_effect = fake_getitem
        mock_search().filter().filter().execute.return_value = response

        result = IndexedFile.from_path('test.system', '/path/to/res1')

        mock_search().filter.assert_called_with(
            'term', **{'system._exact': 'test.system'})
        mock_search().filter().filter.assert_called_with(
            'term', **{'path._exact': '/path/to/res1'})

        self.assertEqual(mock_delete.call_count, 2)
        self.assertEqual(result, surviving_doc)
Beispiel #6
0
    def test_children_function(self, mock_index):
        """``BaseESFile.children`` keeps paging until an empty page arrives."""
        first_child = IndexedFile(name='child1',
                                  system='test.system',
                                  path='/path/to/child1')
        second_child = IndexedFile(name='child2',
                                   system='test.system',
                                   path='/path/to/child2')

        # Two one-item pages, then an empty page.
        mock_index.return_value.children.side_effect = [
            ([first_child], 'KEY1'),
            ([second_child], 'KEY2'),
            ([], None),
        ]

        parent_doc = IndexedFile(name='file1',
                                 system='test.system',
                                 path='/path/to/file')
        base = BaseESFile('test_user',
                          system='test.system',
                          wrapped_doc=parent_doc)

        # Need to set attrs manually because the custom setter/getter in
        # BaseESResource are mocked.
        manual_attrs = (
            ('username', 'test_user'),
            ('_reindex', False),
            ('system', 'test.system'),
            ('path', '/path/to/file'),
        )
        for attr, value in manual_attrs:
            object.__setattr__(base, attr, value)

        # Drain the generator so every page is requested.
        child_generator = base.children(limit=1)
        for _ in child_generator:
            pass

        listing_args = ('test_user', 'test.system', '/path/to/file')
        mock_index().children.assert_has_calls([
            call(*listing_args, limit=1),
            call(*listing_args, limit=1, search_after='KEY1'),
            call(*listing_args, limit=1, search_after='KEY2'),
        ])

        # Check that iteration ends after all children have been listed.
        with self.assertRaises(StopIteration):
            next(child_generator)
Beispiel #7
0
    def listing(self,
                system,
                file_path,
                user_context=None,
                offset=None,
                limit=None):
        """Perform the search and output in a serializable format.

        Matches ``self.query_string`` against file names among documents
        whose nested ``permissions`` contain ``user_context``, restricted to
        ``system``, excluding paths under ``'/' + user_context`` and under
        ``'<user_context>/.Trash'``.

        Params
        ------
        system: str
            System ID used as a term filter.
        file_path: str
            Accepted for interface compatibility; not used by the query.
        user_context: str
            Username the files must be shared with. Despite the ``None``
            default a string is required, since it is concatenated into the
            path prefixes.
        offset: int
            Index of the first result to return (``None`` means 0).
        limit: int
            Maximum number of results to return (``None`` leaves the
            library's default page size in effect).

        Returns
        -------
        dict
            Directory-style listing whose ``children`` are the matching
            documents as dicts.
        """

        ngram_query = Q("query_string",
                        query=self.query_string,
                        fields=["name"],
                        minimum_should_match='80%',
                        default_operator='or')

        match_query = Q("query_string",
                        query=self.query_string,
                        fields=["name._exact", "name._pattern"],
                        default_operator='and')

        search = IndexedFile.search()
        search = search.filter("nested",
                               path="permissions",
                               query=Q("term",
                                       permissions__username=user_context))
        search = search.query(ngram_query | match_query)

        search = search.query(
            Q('bool',
              must_not=[Q({'prefix': {
                  'path._exact': '/' + user_context
              }})]))
        search = search.filter("term", system=system)
        search = search.query(
            Q('bool',
              must_not=[
                  Q({
                      'prefix': {
                          'path._exact': '{}/.Trash'.format(user_context)
                      }
                  })
              ]))
        res = search.execute()

        children = []
        if res.hits.total.value:
            # Slicing a search is [start:stop], not [offset:count]; the stop
            # index therefore has to be offset + limit, otherwise every page
            # after the first comes back short or empty.
            start = offset or 0
            stop = start + limit if limit is not None else None
            children = [o.to_dict() for o in search[start:stop]]

        result = {
            'trail': [{
                'name': '$SEARCHSHARED',
                'path': '/$SEARCH'
            }],
            'name': '$SEARCHSHARED',
            'path': '/$SEARCHSHARED',
            'system': system,
            'type': 'dir',
            'children': children,
            'permissions': 'READ'
        }
        return result
Beispiel #8
0
    def test_init(self, mock_base):
        """The manager hands ``IndexedFile`` and a fresh search to its base."""
        fake_request = MagicMock()
        fake_request.query_string = 'test_query'
        fake_request.username = '******'

        # Only the constructor's side effect on the mocked base matters.
        CommunityDataSearchManager(fake_request)

        expected_search = IndexedFile.search()
        mock_base.assert_called_with(IndexedFile, expected_search)
Beispiel #9
0
    def test_search(self, mock_search, mock_base):
        """``PrivateDataSearchManager.listing`` wraps hits in a ``$SEARCH`` dir."""
        fake_request = MagicMock()
        fake_request.query_string = 'test_query'
        fake_request.username = '******'

        # A single-hit response that can be iterated for its documents.
        response = MagicMock()
        response.hits.total.value = 1
        response.__iter__.return_value = [IndexedFile(name='file01')]
        mock_search().query().extra().execute.return_value = response

        manager = PrivateDataSearchManager(fake_request)

        search_root = {'name': '$SEARCH', 'path': '/$SEARCH'}
        expected = dict(
            trail=[search_root],
            name='$SEARCH',
            path='/',
            system='test.system',
            type='dir',
            children=[{'name': 'file01'}],
            permissions='READ',
        )
        self.assertEqual(manager.listing('test.system', '/'), expected)
Beispiel #10
0
 def test_children_returns_when_no_hits(self, mock_search):
     """With an empty response ``children`` returns no docs and no sort key."""
     execute = mock_search().filter().filter().sort().extra().execute
     execute.return_value.hits.__len__.return_value = 0

     result = IndexedFile.children('test_user',
                                   system='test.system',
                                   path='/')

     self.assertEqual(result, ([], None))
Beispiel #11
0
    def __init__(self, request=None, **kwargs):
        """Store the escaped query string and initialise the base manager.

        The query string is read from ``request.GET`` when a request is
        given, otherwise from the ``query_string`` keyword argument. Every
        ``/`` in it is backslash-escaped.
        """
        params = request.GET if request else kwargs
        self.query_string = params.get('query_string').replace("/", "\\/")

        super(PublishedDataSearchManager, self).__init__(
            IndexedFile, IndexedFile.search())
def listing(client,
            system,
            path,
            username,
            offset=0,
            limit=100,
            *args,
            **kwargs):
    """
    List files shared with a user, or a Tapis directory when a path is given.

    With a non-empty ``path`` the call is delegated to ``agave_listing``.
    Otherwise the file index is queried for documents on
    ``designsafe.storage.default`` whose permissions include ``username`` or
    the world user, excluding everything under ``'/' + username``.

    Params
    ------
    client: agavepy.agave.Agave
        Tapis client to use for the listing.
    system: str
        Tapis system ID (only used when ``path`` is given).
    path: str
        Path in which to perform the listing.
    username: str
        User whose shared files are listed.
    offset: int
        Offset for pagination.
    limit: int
        Number of results to return.

    Returns
    -------
    dict
        ``listing``: list of dicts containing file metadata;
        ``reachedEnd``: True when fewer than ``limit`` results came back.
        (Index branch only; the delegated branch returns whatever
        ``agave_listing`` returns.)
    """

    if path:
        return agave_listing(client, system, path, offset, limit)

    # Readable by the user directly or through the world permission.
    shared_with = [
        Q('term', **{'permissions.username': username}),
        Q('term', **{'permissions.username': '******'}),
    ]
    permissions_filter = Q('nested',
                           path='permissions',
                           query=Q('bool', should=shared_with))

    # Leave out the user's own home directory.
    shared_query = Q(
        'bool',
        must_not=Q('prefix', **{'path._exact': '/' + username}),
        filter=[
            permissions_filter,
            Q('term', **{'system._exact': 'designsafe.storage.default'}),
        ])

    page_start = int(offset)
    page_size = int(limit)
    search = IndexedFile.search().filter(shared_query).sort('name._exact')
    search = search.extra(from_=page_start, size=page_size)

    hits = [hit.to_dict() for hit in search.execute()]

    return {'listing': hits, 'reachedEnd': len(hits) < page_size}
Beispiel #13
0
def repair_paths(limit=1000):
    """Rewrite ``path`` and ``basePath`` on every indexed file.

    Pages through ``IndexedFile`` sorted by ``_uid`` (using ``search_after``
    for the next page). For each hit the path returned by
    ``repair_path(name, path)`` is stored together with its parent directory
    as ``basePath``.

    NOTE(review): sorting on ``_uid`` depends on the Elasticsearch version in
    use -- confirm it is still supported by the target cluster.

    Params
    ------
    limit: int
        Number of documents fetched per page.
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile
    file_search = IndexedFile.search().sort('_uid').extra(size=limit)
    res = file_search.execute()

    while res.hits:
        for hit in res.hits:
            print(hit.name, hit.path)

            # A document without a name or path cannot be repaired; skip it
            # instead of aborting the whole run.
            if hit.name is None or hit.path is None:
                continue

            new_path = repair_path(hit.name, hit.path)
            # One partial update keeps both fields in step and saves a
            # round trip per document.
            hit.update(path=new_path, basePath=os.path.dirname(new_path))

        # The sort value of the last raw hit is the cursor for the next page.
        search_after = res.hits.hits[-1]['sort']
        logger.debug(search_after)
        file_search = IndexedFile.search().sort('_uid').extra(
            size=limit, search_after=search_after)
        res = file_search.execute()
Beispiel #14
0
 def get(self, request):
     """Return the summed ``length`` of files under the user's home prefix.

     Runs a zero-hit search with a ``sum`` aggregation over ``length`` for
     paths starting with ``'/' + username`` and responds with
     ``{"total_storage_bytes": <value>}`` as JSON.
     """
     username = request.user.username
     home_prefix = Q("prefix", **{"path._exact": '/' + username})

     # size=0: only the aggregation is needed, not the documents.
     usage_search = IndexedFile.search().query(
         'bool', must=[home_prefix]).extra(size=0)
     usage_search.aggs.metric('total_storage_bytes', 'sum', field="length")

     aggregations = usage_search.execute().to_dict()["aggregations"]
     total_bytes = aggregations["total_storage_bytes"]["value"]
     return JsonResponse({"total_storage_bytes": total_bytes})
Beispiel #15
0
 def test_attrs(self):
     """A bare ``IndexedFile`` exposes all expected file attributes."""
     indexed_file = IndexedFile()
     expected_attrs = ('name', 'path', 'lastModified', 'length', 'format',
                       'mimeType', 'type', 'system')
     for attr in expected_attrs:
         self.assertTrue(hasattr(indexed_file, attr))
Beispiel #16
0
 def test_update(self, mock_update):
     """``BaseESResource.update`` forwards its fields to the mocked update."""
     folder_doc = IndexedFile(name='folder1',
                              system='test.system',
                              path='/path/to/folder',
                              format='folder')
     resource = BaseESResource(wrapped_doc=folder_doc)

     resource.update(name='folder2')

     mock_update.assert_called_with(name='folder2')
Beispiel #17
0
    def test_init(self, mock_wrap):
        """Constructing a ``BaseESResource`` wraps the given document."""
        folder_doc = IndexedFile(name='folder1',
                                 system='test.system',
                                 path='/path/to/folder',
                                 format='folder')

        BaseESResource(wrapped_doc=folder_doc)

        mock_wrap.assert_called_with(folder_doc)
Beispiel #18
0
    def test_from_path_1_hit(self, mock_search):
        """With exactly one match ``from_path`` returns that document."""
        only_doc = IndexedFile(name='res1',
                               system='test.system',
                               path='/path/to/res1')

        # A single-hit response; any index access yields the document.
        response = MagicMock()
        response.hits.total = 1
        response.__getitem__.return_value = only_doc
        mock_search().filter().filter().execute.return_value = response

        result = IndexedFile.from_path('test.system', '/path/to/res1')

        mock_search().filter.assert_called_with(
            'term', **{'system._exact': 'test.system'})
        mock_search().filter().filter.assert_called_with(
            'term', **{'path._exact': '/path/to/res1'})

        self.assertEqual(result, only_doc)
Beispiel #19
0
    def __init__(self, request=None, **kwargs):
        """Store the escaped query string and username, then init the base.

        With a request, the query string comes from ``request.GET`` and the
        username from ``request.user``; otherwise both are read from the
        keyword arguments. Every ``/`` in the query string is
        backslash-escaped.
        """
        params = request.GET if request else kwargs
        self.query_string = params.get('query_string').replace("/", "\\/")
        self.username = (request.user.username
                         if request else kwargs.get('username'))

        super(PrivateDataSearchManager, self).__init__(
            IndexedFile, IndexedFile.search())
Beispiel #20
0
def repair_paths(limit=1000):
    """Rewrite ``path`` and ``basePath`` on every indexed file in bulk.

    Pages through ``IndexedFile`` sorted by ``_id`` (using ``search_after``
    for the next page). For each hit that has a name and a path, a partial
    update with the result of ``repair_path`` and its parent directory is
    queued; each page's updates are sent with one bulk request.

    Params
    ------
    limit: int
        Number of documents fetched per page.
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    files_alias = settings.ES_INDICES['files']['alias']
    es_hosts = settings.ES_CONNECTIONS[settings.DESIGNSAFE_ENVIRONMENT]['hosts']
    es_client = Elasticsearch(hosts=es_hosts)

    page = IndexedFile.search().sort('_id').extra(size=limit).execute()

    while page.hits:
        actions = []
        for doc in page.hits:
            # Only documents with both a name and a path can be repaired.
            if doc.name is not None and doc.path is not None:
                fixed_path = repair_path(doc.name, doc.path)
                actions.append({
                    '_op_type': 'update',
                    '_index': files_alias,
                    '_type': 'file',
                    '_id': doc.meta.id,
                    'doc': {
                        'path': fixed_path,
                        'basePath': os.path.dirname(fixed_path)
                    }
                })

        bulk(es_client, actions)

        # The sort value of the last raw hit is the cursor for the next page.
        cursor = page.hits.hits[-1]['sort']
        logger.debug(cursor)
        page = IndexedFile.search().sort('_id').extra(
            size=limit, search_after=cursor).execute()
Beispiel #21
0
    def test_to_dict(self, mock_to_dict):
        """``BaseESResource.to_dict`` calls the mocked ``to_dict`` without args."""
        folder_doc = IndexedFile(name='folder1',
                                 system='test.system',
                                 path='/path/to/folder',
                                 format='folder')
        resource = BaseESResource(wrapped_doc=folder_doc)

        resource.to_dict()

        mock_to_dict.assert_called_with()
Beispiel #22
0
 def listing_recursive(self, system='designsafe.storage.default', path='/'):
     """Search for documents matching ``path`` on the ``path._path`` field.

     The query requires an exact ``system`` match and a term match of
     ``path`` against ``path._path``, filtered by ``self._pems_filter()``
     and sorted by name ascending. Returns ``(response, search)``.
     """
     required_terms = [
         Q('term', **{'system._exact': system}),
         Q('term', **{'path._path': path}),
     ]
     listing_query = Q('bool',
                       must=required_terms,
                       filter=self._pems_filter())

     search = IndexedFile.search().query(listing_query)
     search = search.sort({'name._exact': 'asc'})
     return search.execute(), search
Beispiel #23
0
    def test_class_init_with_wrap(self):
        """``BaseESFile`` passes the doc to its base and sets its own attrs."""
        file_doc = IndexedFile(name='file1',
                               system='test.system',
                               path='/path/to/file')

        BaseESFile('test_user', wrapped_doc=file_doc)

        self.mock_base_init.assert_called_with(file_doc)

        expected_setattr_calls = [
            call('username', 'test_user'),
            call('_reindex', False),
        ]
        self.mock_base_setattr.assert_has_calls(expected_setattr_calls)
Beispiel #24
0
def search(client,
           system,
           path,
           offset=0,
           limit=100,
           query_string='',
           **kwargs):
    """
    Perform a search for files using a query string.

    The query string is matched against the file name (either loosely on
    ``name`` or with all terms on ``name._exact`` / ``name._pattern``) and
    restricted to documents on ``system`` whose path starts with ``path``.

    Params
    ------
    client: NoneType
        Unused; present for signature compatibility.
    system: str
        Tapis system ID to filter on.
    path: str or NoneType
        Directory to search under. A missing leading or trailing slash is
        added; ``None`` or an empty string means the root (``/``).
    offset: int
        Search offset for pagination.
    limit: int
        Number of search results to return
    query_string: str
        Query string to pass to Elasticsearch

    Returns
    -------
    dict
        ``listing``: list of dicts containing file metadata from
        Elasticsearch; ``reachedEnd``: True when fewer than ``limit``
        results came back.

    """
    ngram_query = Q("query_string",
                    query=query_string,
                    fields=["name"],
                    minimum_should_match='80%',
                    default_operator='or')
    # Each field must be its own list entry; a single comma-joined string
    # names one field that does not exist.
    match_query = Q("query_string",
                    query=query_string,
                    fields=["name._exact", "name._pattern"],
                    default_operator='and')

    # Normalise to "/dir/" so the prefix filter only matches inside the
    # directory; a missing path falls back to the root.
    path = path or ''
    if not path.startswith('/'):
        path = '/' + path
    if not path.endswith('/'):
        path = path + '/'
    search = IndexedFile.search()
    search = search.query(ngram_query | match_query)
    search = search.filter('prefix', **{'path._exact': path})
    search = search.filter('term', **{'system._exact': system})
    search = search.extra(from_=int(offset), size=int(limit))
    res = search.execute()
    hits = [hit.to_dict() for hit in res]

    return {'listing': hits, 'reachedEnd': len(hits) < int(limit)}
Beispiel #25
0
 def setUp(self):
     """Reset the test user's password and index two identical documents."""
     # configure regular user
     regular_user = get_user_model().objects.get(pk=2)
     regular_user.set_password('user/password')
     regular_user.save()

     # Two one-byte documents sharing the same path.
     for _ in range(2):
         indexed = IndexedFile(length=1, path="ds_user/test")
         indexed.save(refresh=True)
Beispiel #26
0
    def __init__(self, request=None, **kwargs):
        """Store a wildcard-wrapped query string and initialise the base.

        The query string is read from ``request.GET`` when a request is
        given, otherwise from the ``query_string`` keyword argument. It is
        split on single spaces and every token other than ``AND``, ``OR`` or
        ``NOT`` (case-insensitive) is wrapped in ``*``.
        """
        params = request.GET if request else kwargs
        self.query_string = params.get('query_string')

        boolean_operators = ["AND", "OR", "NOT"]
        self.query_string = " ".join([
            token if token.upper() in boolean_operators else "*" + token + "*"
            for token in self.query_string.split(" ")
        ])

        super(PublishedDataSearchManager, self).__init__(
            IndexedFile, IndexedFile.search())
Beispiel #27
0
    def test_wrap(self, mock_update):
        """The wrapped doc is kept, and extra kwargs are applied via update."""
        folder_doc = IndexedFile(name='folder1',
                                 system='test.system',
                                 path='/path/to/folder',
                                 format='folder')

        resource = BaseESResource(wrapped_doc=folder_doc)
        self.assertEqual(resource._wrapped, folder_doc)

        # Constructing with extra fields should forward them to update().
        BaseESResource(wrapped_doc=folder_doc, name='folder2')
        mock_update.assert_called_with(name='folder2')
Beispiel #28
0
 def get(self, system='designsafe.storage.default', path='/', name=''):
     """Search for a file by exact system, path and name.

     All three values must match their ``._exact`` fields; the query is
     filtered by ``self._pems_filter()`` and sorted by name ascending.
     Returns ``(response, search)``.
     """
     exact_fields = (
         ('system._exact', system),
         ('path._exact', path),
         ('name._exact', name),
     )
     required_terms = [
         Q('term', **{field: value}) for field, value in exact_fields
     ]
     file_query = Q('bool',
                    must=required_terms,
                    filter=self._pems_filter())

     search = IndexedFile.search().query(file_query)
     search = search.sort({'name._exact': 'asc'})
     return search.execute(), search
Beispiel #29
0
    def test_delete_no_dir(self, mock_delete):
        """Deleting a plain file calls the mocked delete without arguments."""
        file_doc = IndexedFile(name='file1',
                               system='test.system',
                               path='/path/to/file',
                               format='file')
        es_file = BaseESFile('test_user',
                             system='test.system',
                             wrapped_doc=file_doc)

        # Put the wrapped doc on the instance directly, bypassing any
        # attribute hooks on the class.
        object.__setattr__(es_file, '_wrapped', file_doc)

        es_file.delete()

        mock_delete.assert_called_with()
Beispiel #30
0
    def test_getter_and_setter(self):
        """Known attrs are written through to the doc; unknown ones are not."""
        folder_doc = IndexedFile(name='folder1',
                                 system='test.system',
                                 path='/path/to/folder',
                                 format='folder')
        resource = BaseESResource(wrapped_doc=folder_doc)

        # An attribute the wrapped doc has is visible on both objects.
        resource.name = 'folder2'
        self.assertEqual(resource.name, 'folder2')
        self.assertEqual(resource._wrapped.name, 'folder2')

        # An attribute the wrapped doc lacks stays on the resource only.
        extra_value = 'this attr is not in the wrapped doc'
        resource.newAttr = extra_value
        self.assertEqual(resource.newAttr, extra_value)
        self.assertFalse(hasattr(resource._wrapped, 'newAttr'))