class DataSetSearch(DataCatalogModel):
    """
    Looks up data set metadata documents in the ElasticSearch index.
    """

    SEARCH_ERROR_MESSAGE = 'Searching in the index failed'
    INVALID_QUERY_ERROR_MESSAGE = SEARCH_ERROR_MESSAGE + ': invalid query.'
    NO_CONNECTION_ERROR_MESSAGE = SEARCH_ERROR_MESSAGE + ': failed to connect to ElasticSearch.'

    def __init__(self):
        super(DataSetSearch, self).__init__()
        # Translates the public query format into an ElasticSearch request body.
        self._translator = ElasticSearchQueryTranslator()

    def search(self, query, org_uuid_list, dataset_filtering, is_admin):
        """Translate the query, run it against the index, and return the hits.

        :param query: raw (JSON string) query from the caller
        :param org_uuid_list: organization uuids the caller may see
        :param dataset_filtering: visibility filter (see DataSetFiltering)
        :param is_admin: whether the caller has admin privileges
        :raises InvalidQueryError: when ElasticSearch rejects the query.
        :raises IndexConnectionError: when ElasticSearch is unreachable.
        """
        translated = self._translator.translate(
            query, org_uuid_list, dataset_filtering, is_admin)
        try:
            raw_results = self._elastic_search.search(
                index=self._config.elastic.elastic_index,
                doc_type=self._config.elastic.elastic_metadata_type,
                body=translated)
        except RequestError:
            self._log.exception(self.INVALID_QUERY_ERROR_MESSAGE)
            raise InvalidQueryError(self.INVALID_QUERY_ERROR_MESSAGE)
        except ConnectionError:
            self._log.exception(self.NO_CONNECTION_ERROR_MESSAGE)
            raise IndexConnectionError(self.NO_CONNECTION_ERROR_MESSAGE)
        return self._extract_metadata(raw_results)

    @staticmethod
    def _extract_metadata(es_query_result):
        """Reshape a raw ES response dict into a plain result dictionary."""
        hits = es_query_result['hits']
        aggregations = es_query_result['aggregations']
        entries = []
        for hit in hits['hits']:
            record = hit['_source']
            # Surface the ES document id alongside the source fields.
            record['id'] = hit['_id']
            entries.append(record)
        return {
            'hits': entries,
            'total': hits['total'],
            'categories': [bucket['key']
                           for bucket in aggregations['categories']['buckets']],
            'formats': [bucket['key']
                        for bucket in aggregations['formats']['buckets']]
        }

    @staticmethod
    def get_params_from_request_args(args):
        """Derive the data set filtering mode from request query parameters.

        Returns a dict with a single 'dataset_filtering' key. When both
        flags are set to 'true', onlyPrivate wins (checked last).
        """
        def _flag_set(name):
            # Flask-style MultiDict access; a missing flag defaults to "".
            return args.get(name, default="", type=str).lower() == 'true'

        dataset_filtering = DataSetFiltering.PRIVATE_AND_PUBLIC
        if _flag_set('onlyPublic'):
            dataset_filtering = DataSetFiltering.ONLY_PUBLIC
        if _flag_set('onlyPrivate'):
            dataset_filtering = DataSetFiltering.ONLY_PRIVATE

        return {'dataset_filtering': dataset_filtering}
# ---- Example #2 (score: 0) ----
class DataSetSearch(DataCatalogModel):

    """
    Queries the ElasticSearch index holding data set metadata and
    reshapes the responses for the catalog API.
    """

    SEARCH_ERROR_MESSAGE = 'Searching in the index failed'
    INVALID_QUERY_ERROR_MESSAGE = SEARCH_ERROR_MESSAGE + ': invalid query.'
    NO_CONNECTION_ERROR_MESSAGE = SEARCH_ERROR_MESSAGE + ': failed to connect to ElasticSearch.'

    def __init__(self):
        super(DataSetSearch, self).__init__()
        # Converts incoming queries into ElasticSearch request bodies.
        self._translator = ElasticSearchQueryTranslator()

    def search(self, query, org_uuid_list, dataset_filtering, is_admin):
        """Search the metadata index and return the extracted results.

        :raises InvalidQueryError: on a query ElasticSearch refuses to parse.
        :raises IndexConnectionError: when ElasticSearch cannot be reached.
        """
        body = self._translator.translate(query, org_uuid_list, dataset_filtering, is_admin)
        elastic_cfg = self._config.elastic
        try:
            response = self._elastic_search.search(
                index=elastic_cfg.elastic_index,
                doc_type=elastic_cfg.elastic_metadata_type,
                body=body
            )
            return self._extract_metadata(response)
        except RequestError:
            self._log.exception(self.INVALID_QUERY_ERROR_MESSAGE)
            raise InvalidQueryError(self.INVALID_QUERY_ERROR_MESSAGE)
        except ConnectionError:
            self._log.exception(self.NO_CONNECTION_ERROR_MESSAGE)
            raise IndexConnectionError(self.NO_CONNECTION_ERROR_MESSAGE)

    @staticmethod
    def _extract_metadata(es_query_result):
        """Flatten a raw ES response into the service's result dict."""
        hit_section = es_query_result['hits']
        aggs = es_query_result['aggregations']
        documents = []
        for hit in hit_section['hits']:
            doc = hit['_source']
            doc['id'] = hit['_id']  # expose the ES document id to callers
            documents.append(doc)
        result = {'hits': documents, 'total': hit_section['total']}
        result['categories'] = [b['key'] for b in aggs['categories']['buckets']]
        result['formats'] = [b['key'] for b in aggs['formats']['buckets']]
        return result

    @staticmethod
    def get_params_from_request_args(args):
        """Map request args (onlyPublic / onlyPrivate) to a filtering mode."""
        filtering = DataSetFiltering.PRIVATE_AND_PUBLIC
        if args.get('onlyPublic', default="", type=str).lower() == 'true':
            filtering = DataSetFiltering.ONLY_PUBLIC
        # Checked last, so onlyPrivate wins when both flags are present.
        if args.get('onlyPrivate', default="", type=str).lower() == 'true':
            filtering = DataSetFiltering.ONLY_PRIVATE

        return {'dataset_filtering': filtering}
# ---- Example #3 (score: 0) ----
 def __init__(self):
     """Initialize the base model, then create the query translator.

     NOTE(review): detached snippet — the enclosing class header is not
     visible here; presumably the same DataSetSearch as above.
     """
     super(DataSetSearch, self).__init__()
     self._translator = ElasticSearchQueryTranslator()
# ---- Example #4 (score: 0) ----
 def setUp(self):
     """Create a fresh translator and a sample org uuid list for each test.

     NOTE(review): detached snippet — the enclosing TestCase class header
     is not visible here.
     """
     self.translator = ElasticSearchQueryTranslator()
     self.org_uuid = ['orgid007']
# ---- Example #5 (score: 0) ----
class ElasticSearchQueryTranslationTests(TestCase):
    """Unit tests for ElasticSearchQueryTranslator's query translation."""

    def setUp(self):
        self.translator = ElasticSearchQueryTranslator()
        self.org_uuid = ['orgid007']

    def test_queryTranslation_sizeInQuery_sizeAddedToOutput(self):
        SIZE = 123
        query = json.dumps({'size': SIZE})

        result = self.translator.translate(query, self.org_uuid, None, False)

        self.assertEqual(SIZE, json.loads(result)['size'])

    def test_queryTranslation_fromInQuery_fromAddedToOutput(self):
        FROM = 345
        query = json.dumps({'from': FROM})

        result = self.translator.translate(query, self.org_uuid, True, False)

        self.assertEqual(FROM, json.loads(result)['from'])

    def test_combiningQueryAndFilter_queryWithFilter_filteredQueryCreated(
            self):
        FAKE_BASE_QUERY = {'yup': 'totally fake'}
        FAKE_FILTER = {'uhuh': 'this filter is also fake'}
        FAKE_POST_FILTER = {'hello': 'fake filter'}
        # Expected shape: base query wrapped in a filtered query, plus
        # post_filter and the category/format aggregations.
        expected_query = {
            'query': {'filtered': {'filter': FAKE_FILTER,
                                   'query': FAKE_BASE_QUERY}},
            'post_filter': FAKE_POST_FILTER,
            'aggregations': {
                'categories': {'terms': {'size': 100, 'field': 'category'}},
                'formats': {'terms': {'field': 'format'}}
            }
        }

        actual_query = self.translator._combine_query_and_filters(
            FAKE_BASE_QUERY, FAKE_FILTER, FAKE_POST_FILTER)

        self.assertDictEqual(expected_query, actual_query)

    def test_queryTranslation_queryIsNotJson_invalidQueryError(self):
        malformed = '{"this is not a proper JSON"}'
        with self.assertRaises(InvalidQueryError):
            self.translator.translate(malformed, self.org_uuid, None, False)

    def test_decodingInputQuery_noneQuery_emptyDictReturned(self):
        self.assertDictEqual({}, self.translator._get_query_dict(None))

    def test_queryTranslation_fullFeaturedQuery_queryTranslated(self):
        input_query = {
            'query': 'blabla',
            'filters': [{'format': ['csv']}],
            'size': 3,
            'from': 14
        }

        translated = self.translator.translate(
            json.dumps(input_query), self.org_uuid, True, False)
        output_query = json.loads(translated)

        self.assertIn('filtered', output_query['query'])
        self.assertIn('size', output_query)
        self.assertIn('from', output_query)