Code Example #1
class IntegrationTestSuperSearchFields(ElasticsearchTestCase):
    """Test SuperSearchFields with an elasticsearch database containing fake
    data. """

    def setUp(self):
        super(IntegrationTestSuperSearchFields, self).setUp()

        # Create the supersearch fields.
        self.index_super_search_fields()

        self.api = SuperSearchFields(config=self.config)

    def tearDown(self):
        # Clear the test indices.
        self.index_client.delete(
            self.config.elasticsearch.elasticsearch_default_index
        )

        super(IntegrationTestSuperSearchFields, self).tearDown()

    def test_get_fields(self):
        results = self.api.get_fields()
        eq_(results, SUPERSEARCH_FIELDS)

    def test_get_mapping(self):
        mapping = self.api.get_mapping()['mappings']
        doctype = self.config.elasticsearch.elasticsearch_doctype

        ok_(doctype in mapping)
        properties = mapping[doctype]['properties']

        ok_('processed_crash' in properties)
        ok_('raw_crash' in properties)

        # Check in_database_name is used.
        ok_('os_name' in properties['processed_crash']['properties'])
        ok_('platform' not in properties['processed_crash']['properties'])

        # Those fields have no `storage_mapping`.
        ok_('fake_field' not in properties['raw_crash']['properties'])

        # Test overwriting a field.
        mapping = self.api.get_mapping(overwrite_mapping={
            'name': 'fake_field',
            'storage_mapping': {
                'type': 'long'
            }
        })['mappings']
        properties = mapping[doctype]['properties']

        ok_('fake_field' in properties['raw_crash']['properties'])
        eq_(
            properties['raw_crash']['properties']['fake_field']['type'],
            'long'
        )
Code Example #2
    def __init__(self, *args, **kwargs):
        self.config = kwargs.get('config')
        self.es_context = self.config.elasticsearch.elasticsearch_class(
            self.config.elasticsearch)

        self.all_fields = SuperSearchFields(config=self.config).get_fields()

        # Create a map to associate a field's name in the database to its
        # exposed name (in the results and facets).
        self.database_name_to_field_name_map = dict(
            (x['in_database_name'], x['name'])
            for x in self.all_fields.values())

        kwargs.update(fields=self.all_fields)
        super(SuperSearch, self).__init__(*args, **kwargs)
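The map built in the constructor above translates a field's in_database_name into its exposed name. A minimal standalone sketch of how such a map is applied when renaming the keys of a result (the field definitions below are invented for illustration; the real ones come from SuperSearchFields.get_fields()):

# Hypothetical field definitions mimicking the shape of get_fields() output.
all_fields = {
    'platform': {'name': 'platform', 'in_database_name': 'os_name'},
    'signature': {'name': 'signature', 'in_database_name': 'signature'},
}

database_name_to_field_name_map = dict(
    (x['in_database_name'], x['name'])
    for x in all_fields.values()
)

# A raw Elasticsearch hit uses database names; expose the public names instead.
hit = {'os_name': 'Linux', 'signature': 'OOM | small'}
renamed = dict(
    (database_name_to_field_name_map.get(key, key), value)
    for key, value in hit.items()
)
assert renamed == {'platform': 'Linux', 'signature': 'OOM | small'}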
Code Example #3
    def create_index(self, index_name, mappings=None):
        """Create an index that will receive crash reports.

        :arg index_name: the name of the index to create
        :arg mappings: dict of doctype->ES mapping

        :returns: True if the index was created, False if it already
            existed

        """
        if mappings is None:
            mappings = SuperSearchFields(context=self).get_mapping()

        es_settings = self.get_socorro_index_settings(mappings)

        try:
            client = self.indices_client()
            client.create(index=index_name, body=es_settings)
            return True

        except elasticsearch.exceptions.RequestError as e:
            # If this index already exists, swallow the error.
            # NOTE! This is NOT what the error looks like in ES 2.x
            if 'IndexAlreadyExistsException' not in str(e):
                raise
            return False
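Because create_index() swallows the "index already exists" error and returns False instead of raising, callers can branch on the return value. A hedged usage sketch (es_connection and the index name are assumptions, not code from the project):

# Hypothetical call site; es_connection is assumed to be an instance of the
# class that defines create_index() above.
created = es_connection.create_index('socorro_test_reports')
if created:
    print('index created')
else:
    print('index already existed; nothing to do')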
Code Example #4
    def setUp(self):
        super(IntegrationTestSuperSearchFields, self).setUp()

        # Create the supersearch fields.
        self.index_super_search_fields()

        self.api = SuperSearchFields(config=self.config)
Code Example #5
    def create_socorro_index(self, es_index, mappings=None):
        """Create an index that will receive crash reports. """
        if mappings is None:
            mappings = SuperSearchFields(config=self.config).get_mapping()

        es_settings = self.get_socorro_index_settings(mappings)
        self.create_index(es_index, es_settings)
Code Example #6
    def create_socorro_index(self, es_index, mappings=None):
        """Create an index that will receive crash reports. """
        if mappings is None:
            # Import at runtime to avoid dependency circle.
            from socorro.external.es.super_search_fields import (
                SuperSearchFields)
            mappings = SuperSearchFields(config=self.config).get_mapping()

        es_settings = self.get_socorro_index_settings(mappings)
        self.create_index(es_index, es_settings)
Code Example #7
    def create_socorro_index(self, es_index, mappings=None):
        """Create an index that will receive crash reports. """
        if mappings is None:
            mappings = SuperSearchFields(config=self.config).get_mapping()

        es_settings = self.get_socorro_index_settings(mappings)
        if self.config.elasticsearch.dry_run:
            print(json.dumps(es_settings, indent=2))
        else:
            self.create_index(es_index, es_settings)
Code Example #8
    def _get_all_fields(self):
        if (hasattr(self, '_all_fields')
                and hasattr(self, '_all_fields_timestamp')):
            # we might have it cached
            age = time.time() - self._all_fields_timestamp
            if age < 60 * 60:
                # fresh enough
                return self._all_fields

        self._all_fields = SuperSearchFields(config=self.config).get()
        self._all_fields_timestamp = time.time()
        return self._all_fields
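The snippet above caches the field definitions in memory for an hour before fetching them again. The same time-based caching idea, written as a small standalone helper purely for illustration (not code from the project):

import time

CACHE_TTL = 60 * 60  # one hour, the same freshness window as above


def cached_for_an_hour(fetch):
    """Wrap a zero-argument fetch function with a one-hour in-memory cache."""
    state = {'value': None, 'timestamp': 0.0}

    def wrapper():
        age = time.time() - state['timestamp']
        if state['value'] is None or age >= CACHE_TTL:
            state['value'] = fetch()
            state['timestamp'] = time.time()
        return state['value']

    return wrapper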
Code Example #9
    def create_socorro_index(self,
                             index_name,
                             mappings=None,
                             log_result=False):
        """Create an index that will receive crash reports.

        Note: This function can get called in two contexts: when the processor
        is saving crash reports and also in the local dev environment scripts.
        The former wants to ignore index-existing errors quietly but the latter
        wants to log the result. Hence the fickle nature of this function.

        """
        if mappings is None:
            mappings = SuperSearchFields(context=self).get_mapping()

        es_settings = self.get_socorro_index_settings(mappings)
        self.create_index(index_name, es_settings, log_result)
Code Example #10
File: supersearch.py  Project: abudulemusa/socorro
    def __init__(self, *args, **kwargs):
        self.config = kwargs.get('config')
        self.es_context = self.config.elasticsearch.elasticsearch_class(
            self.config.elasticsearch
        )

        self.all_fields = SuperSearchFields(config=self.config).get_fields()

        # Create a map to associate a field's name in the database to its
        # exposed name (in the results and facets).
        self.database_name_to_field_name_map = dict(
            (x['in_database_name'], x['name'])
            for x in self.all_fields.values()
        )

        kwargs.update(fields=self.all_fields)
        super(SuperSearch, self).__init__(
            *args, **kwargs
        )
Code Example #11
    def create_socorro_index(self, es_index):
        """Create an index that will receive crash reports. """
        es_settings = SuperSearchFields(config=self.config).get_mapping()
        self.create_index(es_index, es_settings)
Code Example #12
class IntegrationTestSuperSearchFields(ElasticsearchTestCase):
    """Test SuperSearchFields with an elasticsearch database containing fake
    data. """

    def setUp(self):
        super(IntegrationTestSuperSearchFields, self).setUp()

        self.api = SuperSearchFields(config=self.config)

    def test_get_fields(self):
        results = self.api.get_fields()
        eq_(results, SUPERSEARCH_FIELDS)

    def test_create_field(self):
        # Test with all parameters set.
        params = {
            'name': 'plotfarm',
            'data_validation_type': 'str',
            'default_value': None,
            'description': 'a plotfarm like Lunix or Wondiws',
            'form_field_choices': ['lun', 'won', 'cam'],
            'has_full_version': True,
            'in_database_name': 'os_name',
            'is_exposed': True,
            'is_returned': True,
            'is_mandatory': False,
            'query_type': 'str',
            'namespace': 'processed_crash',
            'permissions_needed': ['view_plotfarm'],
            'storage_mapping': {"type": "multi_field"},
        }
        res = self.api.create_field(**params)
        ok_(res)
        field = self.connection.get(
            index=self.config.webapi.elasticsearch_default_index,
            doc_type='supersearch_fields',
            id='plotfarm',
        )
        field = field['_source']
        eq_(sorted(field.keys()), sorted(params.keys()))
        for key in field.keys():
            eq_(field[key], params[key])

        # Test default values.
        res = self.api.create_field(
            name='brand_new_field',
            in_database_name='brand_new_field',
            namespace='processed_crash',
        )
        ok_(res)
        ok_(
            self.connection.get(
                index=self.config.webapi.elasticsearch_default_index,
                doc_type='supersearch_fields',
                id='brand_new_field',
            )
        )

        # Test errors.
        # `name` is missing.
        assert_raises(
            MissingArgumentError,
            self.api.create_field,
            in_database_name='something',
        )

        # `in_database_name` is missing.
        assert_raises(
            MissingArgumentError,
            self.api.create_field,
            name='something',
        )

        # Field already exists.
        assert_raises(
            BadArgumentError,
            self.api.create_field,
            name='product',
            in_database_name='product',
            namespace='processed_crash',
        )

        # Test logging.
        res = self.api.create_field(
            name='what_a_field',
            in_database_name='what_a_field',
            namespace='processed_crash',
            storage_mapping='{"type": "long"}',
        )
        ok_(res)
        self.api.config.logger.info.assert_called_with(
            'elasticsearch mapping changed for field "%s", '
            'added new mapping "%s"',
            'what_a_field',
            {u'type': u'long'},
        )

    def test_update_field(self):
        # Let's create a field first.
        assert self.api.create_field(
            name='super_field',
            in_database_name='super_field',
            namespace='superspace',
            description='inaccurate description',
            permissions_needed=['view_nothing'],
            storage_mapping={'type': 'boolean', 'null_value': False}
        )

        # Now let's update that field a little.
        res = self.api.update_field(
            name='super_field',
            description='very accurate description',
            storage_mapping={'type': 'long', 'analyzer': 'keyword'},
        )
        ok_(res)

        # Test logging.
        self.api.config.logger.info.assert_called_with(
            'elasticsearch mapping changed for field "%s", '
            'was "%s", now "%s"',
            'super_field',
            {'type': 'boolean', 'null_value': False},
            {'type': 'long', 'analyzer': 'keyword'},
        )

        field = self.connection.get(
            index=self.config.elasticsearch.elasticsearch_default_index,
            doc_type='supersearch_fields',
            id='super_field',
        )
        field = field['_source']

        # Verify the changes were taken into account.
        eq_(field['description'], 'very accurate description')
        eq_(field['storage_mapping'], {'type': 'long', 'analyzer': 'keyword'})

        # Verify other values did not change.
        eq_(field['permissions_needed'], ['view_nothing'])
        eq_(field['in_database_name'], 'super_field')
        eq_(field['namespace'], 'superspace')

        # Test errors.
        assert_raises(
            MissingArgumentError,
            self.api.update_field,
        )  # `name` is missing

        assert_raises(
            ResourceNotFound,
            self.api.update_field,
            name='unkownfield',
        )

    def test_delete_field(self):
        self.api.delete_field(name='product')

        ok_(
            self.connection.get(
                index=self.config.elasticsearch.elasticsearch_default_index,
                doc_type='supersearch_fields',
                id='signature',
            )
        )
        assert_raises(
            elasticsearch.exceptions.NotFoundError,
            self.connection.get,
            index=self.config.elasticsearch.elasticsearch_default_index,
            doc_type='supersearch_fields',
            id='product',
        )

    @minimum_es_version('1.0')
    def test_get_missing_fields(self):
        config = self.get_mware_config(
            es_index='socorro_integration_test_%W'
        )

        fake_mappings = [
            {
                'mappings': {
                    config.elasticsearch.elasticsearch_doctype: {
                        'properties': {
                            # Add a bunch of unknown fields.
                            'field_z': {
                                'type': 'string'
                            },
                            'namespace1': {
                                'type': 'object',
                                'properties': {
                                    'field_a': {
                                        'type': 'string'
                                    },
                                    'field_b': {
                                        'type': 'long'
                                    }
                                }
                            },
                            'namespace2': {
                                'type': 'object',
                                'properties': {
                                    'subspace1': {
                                        'type': 'object',
                                        'properties': {
                                            'field_b': {
                                                'type': 'long'
                                            }
                                        }
                                    }
                                }
                            },
                            # Add a few known fields that should not appear.
                            'processed_crash': {
                                'type': 'object',
                                'properties': {
                                    'signature': {
                                        'type': 'string'
                                    },
                                    'product': {
                                        'type': 'string'
                                    },
                                }
                            }
                        }
                    }
                }
            },
            {
                'mappings': {
                    config.elasticsearch.elasticsearch_doctype: {
                        'properties': {
                            'namespace1': {
                                'type': 'object',
                                'properties': {
                                    'subspace1': {
                                        'type': 'object',
                                        'properties': {
                                            'field_d': {
                                                'type': 'long'
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            },
        ]

        now = datetimeutil.utc_now()
        indices = []

        try:
            # Using "2" here means that an index will be missing, hence testing
            # that it swallows the subsequent error.
            for i in range(2):
                date = now - datetime.timedelta(weeks=i)
                index = date.strftime(config.elasticsearch.elasticsearch_index)
                mapping = fake_mappings[i % len(fake_mappings)]

                self.index_creator.create_index(index, mapping)
                indices.append(index)

            api = SuperSearchFields(config=config)
            missing_fields = api.get_missing_fields()
            expected = [
                'field_z',
                'namespace1.field_a',
                'namespace1.field_b',
                'namespace1.subspace1.field_d',
                'namespace2.subspace1.field_b',
            ]

            eq_(missing_fields['hits'], expected)
            eq_(missing_fields['total'], 5)

        finally:
            for index in indices:
                self.index_client.delete(index=index)

    def test_get_mapping(self):
        mapping = self.api.get_mapping()['mappings']
        doctype = self.config.elasticsearch.elasticsearch_doctype

        ok_(doctype in mapping)
        properties = mapping[doctype]['properties']

        ok_('processed_crash' in properties)
        ok_('raw_crash' in properties)

        processed_crash = properties['processed_crash']['properties']

        # Check in_database_name is used.
        ok_('os_name' in processed_crash)
        ok_('platform' not in processed_crash)

        # Those fields have no `storage_mapping`.
        ok_('fake_field' not in properties['raw_crash']['properties'])

        # Those fields have a `storage_mapping`.
        eq_(processed_crash['release_channel'], {'type': 'string'})

        # Test nested objects.
        ok_('json_dump' in processed_crash)
        ok_('properties' in processed_crash['json_dump'])
        ok_('write_combine_size' in processed_crash['json_dump']['properties'])
        eq_(
            processed_crash['json_dump']['properties']['write_combine_size'],
            {'type': 'long'}
        )

        # Test overwriting a field.
        mapping = self.api.get_mapping(overwrite_mapping={
            'name': 'fake_field',
            'storage_mapping': {
                'type': 'long'
            }
        })['mappings']
        properties = mapping[doctype]['properties']

        ok_('fake_field' in properties['raw_crash']['properties'])
        eq_(
            properties['raw_crash']['properties']['fake_field']['type'],
            'long'
        )

    def test_test_mapping(self):
        """Much test. So meta. Wow test_test_. """
        # First test a valid mapping.
        mapping = self.api.get_mapping()
        ok_(self.api.test_mapping(mapping) is None)

        # Insert an invalid storage mapping.
        mapping = self.api.get_mapping({
            'name': 'fake_field',
            'storage_mapping': {
                'type': 'unkwown'
            }
        })
        assert_raises(
            BadArgumentError,
            self.api.test_mapping,
            mapping,
        )

        # Test with a correct mapping but with data that cannot be indexed.
        self.index_crash({
            'date_processed': datetimeutil.utc_now(),
            'product': 'WaterWolf',
        })
        self.refresh_index()
        mapping = self.api.get_mapping({
            'name': 'product',
            'storage_mapping': {
                'type': 'long'
            }
        })
        assert_raises(
            BadArgumentError,
            self.api.test_mapping,
            mapping,
        )
Code Example #13
    def setUp(self):
        super(IntegrationTestSuperSearchFields, self).setUp()

        self.api = SuperSearchFields(config=self.config)
        self.api.get_fields = lambda: copy.deepcopy(SUPERSEARCH_FIELDS)
Code Example #14
    def test_get_missing_fields(self):
        config = self.get_base_config(es_index='socorro_integration_test_%W')

        fake_mappings = [
            {
                'mappings': {
                    config.elasticsearch.elasticsearch_doctype: {
                        'properties': {
                            # Add a bunch of unknown fields.
                            'field_z': {
                                'type': 'string'
                            },
                            'namespace1': {
                                'type': 'object',
                                'properties': {
                                    'field_a': {
                                        'type': 'string'
                                    },
                                    'field_b': {
                                        'type': 'long'
                                    }
                                }
                            },
                            'namespace2': {
                                'type': 'object',
                                'properties': {
                                    'subspace1': {
                                        'type': 'object',
                                        'properties': {
                                            'field_b': {
                                                'type': 'long'
                                            }
                                        }
                                    }
                                }
                            },
                            # Add a few known fields that should not appear.
                            'processed_crash': {
                                'type': 'object',
                                'properties': {
                                    'signature': {
                                        'type': 'string'
                                    },
                                    'product': {
                                        'type': 'string'
                                    },
                                }
                            }
                        }
                    }
                }
            },
            {
                'mappings': {
                    config.elasticsearch.elasticsearch_doctype: {
                        'properties': {
                            'namespace1': {
                                'type': 'object',
                                'properties': {
                                    'subspace1': {
                                        'type': 'object',
                                        'properties': {
                                            'field_d': {
                                                'type': 'long'
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            },
        ]

        now = datetimeutil.utc_now()
        indices = []

        try:
            # Using "2" here means that an index will be missing, hence testing
            # that it swallows the subsequent error.
            for i in range(2):
                date = now - datetime.timedelta(weeks=i)
                index = date.strftime(config.elasticsearch.elasticsearch_index)
                mapping = fake_mappings[i % len(fake_mappings)]

                self.index_creator.create_index(index, mapping)
                indices.append(index)

            api = SuperSearchFields(config=config)
            missing_fields = api.get_missing_fields()
            expected = [
                'field_z',
                'namespace1.field_a',
                'namespace1.field_b',
                'namespace1.subspace1.field_d',
                'namespace2.subspace1.field_b',
            ]

            assert missing_fields['hits'] == expected
            assert missing_fields['total'] == 5

        finally:
            for index in indices:
                self.index_client.delete(index=index)
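The expected list in the test above is essentially the nested mapping flattened into dotted field names, minus the fields that are already known. A hedged sketch of that flattening step (illustration only, not Socorro's actual implementation):

def flatten_properties(properties, prefix=''):
    """Turn a nested Elasticsearch 'properties' dict into dotted field names."""
    names = []
    for key, value in sorted(properties.items()):
        dotted = prefix + key
        if 'properties' in value:
            names.extend(flatten_properties(value['properties'], dotted + '.'))
        else:
            names.append(dotted)
    return names

properties = {
    'field_z': {'type': 'string'},
    'namespace1': {
        'type': 'object',
        'properties': {'field_a': {'type': 'string'}},
    },
}
assert flatten_properties(properties) == ['field_z', 'namespace1.field_a']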
Code Example #15
class IntegrationTestSuperSearchFields(ElasticsearchTestCase):
    """Test SuperSearchFields with an elasticsearch database containing fake
    data. """

    def setUp(self):
        super(IntegrationTestSuperSearchFields, self).setUp()

        self.api = SuperSearchFields(config=self.config)

    def test_get_fields(self):
        results = self.api.get_fields()
        eq_(results, SUPERSEARCH_FIELDS)

    def test_create_field(self):
        # Test with all parameters set.
        params = {
            "name": "plotfarm",
            "data_validation_type": "str",
            "default_value": None,
            "description": "a plotfarm like Lunix or Wondiws",
            "form_field_choices": ["lun", "won", "cam"],
            "has_full_version": True,
            "in_database_name": "os_name",
            "is_exposed": True,
            "is_returned": True,
            "is_mandatory": False,
            "query_type": "str",
            "namespace": "processed_crash",
            "permissions_needed": ["view_plotfarm"],
            "storage_mapping": {"type": "multi_field"},
        }
        res = self.api.create_field(**params)
        ok_(res)
        field = self.connection.get(
            index=self.config.webapi.elasticsearch_default_index, doc_type="supersearch_fields", id="plotfarm"
        )
        field = field["_source"]
        eq_(sorted(field.keys()), sorted(params.keys()))
        for key in field.keys():
            eq_(field[key], params[key])

        # Test default values.
        res = self.api.create_field(
            name="brand_new_field", in_database_name="brand_new_field", namespace="processed_crash"
        )
        ok_(res)
        ok_(
            self.connection.get(
                index=self.config.webapi.elasticsearch_default_index,
                doc_type="supersearch_fields",
                id="brand_new_field",
            )
        )

        # Test errors.
        # `name` is missing.
        assert_raises(MissingArgumentError, self.api.create_field, in_database_name="something")

        # `in_database_name` is missing.
        assert_raises(MissingArgumentError, self.api.create_field, name="something")

        # Field already exists.
        assert_raises(
            InsertionError,
            self.api.create_field,
            name="product",
            in_database_name="product",
            namespace="processed_crash",
        )

        # Test logging.
        res = self.api.create_field(
            name="what_a_field",
            in_database_name="what_a_field",
            namespace="processed_crash",
            storage_mapping='{"type": "long"}',
        )
        ok_(res)
        self.api.config.logger.info.assert_called_with(
            'elasticsearch mapping changed for field "%s", ' 'added new mapping "%s"',
            "what_a_field",
            {u"type": u"long"},
        )

    def test_update_field(self):
        # Let's create a field first.
        assert self.api.create_field(
            name="super_field",
            in_database_name="super_field",
            namespace="superspace",
            description="inaccurate description",
            permissions_needed=["view_nothing"],
            storage_mapping={"type": "boolean", "null_value": False},
        )

        # Now let's update that field a little.
        res = self.api.update_field(
            name="super_field",
            description="very accurate description",
            storage_mapping={"type": "long", "analyzer": "keyword"},
        )
        ok_(res)

        # Test logging.
        self.api.config.logger.info.assert_called_with(
            'elasticsearch mapping changed for field "%s", ' 'was "%s", now "%s"',
            "super_field",
            {"type": "boolean", "null_value": False},
            {"type": "long", "analyzer": "keyword"},
        )

        field = self.connection.get(
            index=self.config.elasticsearch.elasticsearch_default_index, doc_type="supersearch_fields", id="super_field"
        )
        field = field["_source"]

        # Verify the changes were taken into account.
        eq_(field["description"], "very accurate description")
        eq_(field["storage_mapping"], {"type": "long", "analyzer": "keyword"})

        # Verify other values did not change.
        eq_(field["permissions_needed"], ["view_nothing"])
        eq_(field["in_database_name"], "super_field")
        eq_(field["namespace"], "superspace")

        # Test errors.
        assert_raises(MissingArgumentError, self.api.update_field)  # `name` is missing

        assert_raises(ResourceNotFound, self.api.update_field, name="unkownfield")

    def test_delete_field(self):
        self.api.delete_field(name="product")

        ok_(
            self.connection.get(
                index=self.config.elasticsearch.elasticsearch_default_index,
                doc_type="supersearch_fields",
                id="signature",
            )
        )
        assert_raises(
            elasticsearch.exceptions.NotFoundError,
            self.connection.get,
            index=self.config.elasticsearch.elasticsearch_default_index,
            doc_type="supersearch_fields",
            id="product",
        )

    @minimum_es_version("1.0")
    def test_get_missing_fields(self):
        config = self.get_mware_config(es_index="socorro_integration_test_%W")

        fake_mappings = [
            {
                "mappings": {
                    config.elasticsearch.elasticsearch_doctype: {
                        "properties": {
                            # Add a bunch of unknown fields.
                            "field_z": {"type": "string"},
                            "namespace1": {
                                "type": "object",
                                "properties": {"field_a": {"type": "string"}, "field_b": {"type": "long"}},
                            },
                            "namespace2": {
                                "type": "object",
                                "properties": {
                                    "subspace1": {"type": "object", "properties": {"field_b": {"type": "long"}}}
                                },
                            },
                            # Add a few known fields that should not appear.
                            "processed_crash": {
                                "type": "object",
                                "properties": {"signature": {"type": "string"}, "product": {"type": "string"}},
                            },
                        }
                    }
                }
            },
            {
                "mappings": {
                    config.elasticsearch.elasticsearch_doctype: {
                        "properties": {
                            "namespace1": {
                                "type": "object",
                                "properties": {
                                    "subspace1": {"type": "object", "properties": {"field_d": {"type": "long"}}}
                                },
                            }
                        }
                    }
                }
            },
        ]

        now = datetimeutil.utc_now()
        indices = []

        try:
            # Using "2" here means that an index will be missing, hence testing
            # that it swallows the subsequent error.
            for i in range(2):
                date = now - datetime.timedelta(weeks=i)
                index = date.strftime(config.elasticsearch.elasticsearch_index)
                mapping = fake_mappings[i % len(fake_mappings)]

                self.index_creator.create_index(index, mapping)
                indices.append(index)

            api = SuperSearchFields(config=config)
            missing_fields = api.get_missing_fields()
            expected = [
                "field_z",
                "namespace1.field_a",
                "namespace1.field_b",
                "namespace1.subspace1.field_d",
                "namespace2.subspace1.field_b",
            ]

            eq_(missing_fields["hits"], expected)
            eq_(missing_fields["total"], 5)

        finally:
            for index in indices:
                self.index_client.delete(index=index)

    def test_get_mapping(self):
        mapping = self.api.get_mapping()["mappings"]
        doctype = self.config.elasticsearch.elasticsearch_doctype

        ok_(doctype in mapping)
        properties = mapping[doctype]["properties"]

        ok_("processed_crash" in properties)
        ok_("raw_crash" in properties)

        processed_crash = properties["processed_crash"]["properties"]

        # Check in_database_name is used.
        ok_("os_name" in processed_crash)
        ok_("platform" not in processed_crash)

        # Those fields have no `storage_mapping`.
        ok_("fake_field" not in properties["raw_crash"]["properties"])

        # Those fields have a `storage_mapping`.
        eq_(processed_crash["release_channel"], {"type": "string"})

        # Test nested objects.
        ok_("json_dump" in processed_crash)
        ok_("properties" in processed_crash["json_dump"])
        ok_("write_combine_size" in processed_crash["json_dump"]["properties"])
        eq_(processed_crash["json_dump"]["properties"]["write_combine_size"], {"type": "long"})

        # Test overwriting a field.
        mapping = self.api.get_mapping(overwrite_mapping={"name": "fake_field", "storage_mapping": {"type": "long"}})[
            "mappings"
        ]
        properties = mapping[doctype]["properties"]

        ok_("fake_field" in properties["raw_crash"]["properties"])
        eq_(properties["raw_crash"]["properties"]["fake_field"]["type"], "long")

    def test_test_mapping(self):
        """Much test. So meta. Wow test_test_. """
        # First test a valid mapping.
        mapping = self.api.get_mapping()
        ok_(self.api.test_mapping(mapping) is None)

        # Insert an invalid storage mapping.
        mapping = self.api.get_mapping({"name": "fake_field", "storage_mapping": {"type": "unkwown"}})
        assert_raises(elasticsearch.exceptions.RequestError, self.api.test_mapping, mapping)

        # Test with a correct mapping but with data that cannot be indexed.
        self.index_crash({"date_processed": datetimeutil.utc_now(), "product": "WaterWolf"})
        self.refresh_index()
        mapping = self.api.get_mapping({"name": "product", "storage_mapping": {"type": "long"}})
        # self.api.test_mapping(mapping)
        assert_raises(elasticsearch.exceptions.RequestError, self.api.test_mapping, mapping)
Code Example #16
class IntegrationTestSuperSearchFields(ElasticsearchTestCase):
    """Test SuperSearchFields with an elasticsearch database containing fake
    data. """
    def setUp(self):
        super(IntegrationTestSuperSearchFields, self).setUp()

        self.api = SuperSearchFields(config=self.config)
        self.api.get_fields = lambda: copy.deepcopy(SUPERSEARCH_FIELDS)

    def test_get_fields(self):
        results = self.api.get_fields()
        assert results == SUPERSEARCH_FIELDS

    @minimum_es_version('1.0')
    def test_get_missing_fields(self):
        config = self.get_base_config(es_index='socorro_integration_test_%W')

        fake_mappings = [
            {
                'mappings': {
                    config.elasticsearch.elasticsearch_doctype: {
                        'properties': {
                            # Add a bunch of unknown fields.
                            'field_z': {
                                'type': 'string'
                            },
                            'namespace1': {
                                'type': 'object',
                                'properties': {
                                    'field_a': {
                                        'type': 'string'
                                    },
                                    'field_b': {
                                        'type': 'long'
                                    }
                                }
                            },
                            'namespace2': {
                                'type': 'object',
                                'properties': {
                                    'subspace1': {
                                        'type': 'object',
                                        'properties': {
                                            'field_b': {
                                                'type': 'long'
                                            }
                                        }
                                    }
                                }
                            },
                            # Add a few known fields that should not appear.
                            'processed_crash': {
                                'type': 'object',
                                'properties': {
                                    'signature': {
                                        'type': 'string'
                                    },
                                    'product': {
                                        'type': 'string'
                                    },
                                }
                            }
                        }
                    }
                }
            },
            {
                'mappings': {
                    config.elasticsearch.elasticsearch_doctype: {
                        'properties': {
                            'namespace1': {
                                'type': 'object',
                                'properties': {
                                    'subspace1': {
                                        'type': 'object',
                                        'properties': {
                                            'field_d': {
                                                'type': 'long'
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            },
        ]

        now = datetimeutil.utc_now()
        indices = []

        try:
            # Using "2" here means that an index will be missing, hence testing
            # that it swallows the subsequent error.
            for i in range(2):
                date = now - datetime.timedelta(weeks=i)
                index = date.strftime(config.elasticsearch.elasticsearch_index)
                mapping = fake_mappings[i % len(fake_mappings)]

                self.index_creator.create_index(index, mapping)
                indices.append(index)

            api = SuperSearchFields(config=config)
            missing_fields = api.get_missing_fields()
            expected = [
                'field_z',
                'namespace1.field_a',
                'namespace1.field_b',
                'namespace1.subspace1.field_d',
                'namespace2.subspace1.field_b',
            ]

            assert missing_fields['hits'] == expected
            assert missing_fields['total'] == 5

        finally:
            for index in indices:
                self.index_client.delete(index=index)

    def test_get_mapping(self):
        mapping = self.api.get_mapping()
        doctype = self.config.elasticsearch.elasticsearch_doctype

        assert doctype in mapping
        properties = mapping[doctype]['properties']

        assert 'processed_crash' in properties
        assert 'raw_crash' in properties

        processed_crash = properties['processed_crash']['properties']

        # Check in_database_name is used.
        assert 'os_name' in processed_crash
        assert 'platform' not in processed_crash

        # Those fields have no `storage_mapping`.
        assert 'fake_field' not in properties['raw_crash']['properties']

        # Those fields have a `storage_mapping`.
        assert processed_crash['release_channel'] == {'type': 'string'}

        # Test nested objects.
        assert 'json_dump' in processed_crash
        assert 'properties' in processed_crash['json_dump']
        assert 'write_combine_size' in processed_crash['json_dump'][
            'properties']
        assert processed_crash['json_dump']['properties'][
            'write_combine_size'] == {
                'type': 'long'
            }

        # Test overwriting a field.
        mapping = self.api.get_mapping(overwrite_mapping={
            'name': 'fake_field',
            'storage_mapping': {
                'type': 'long'
            }
        })
        properties = mapping[doctype]['properties']

        assert 'fake_field' in properties['raw_crash']['properties']
        assert properties['raw_crash']['properties']['fake_field'][
            'type'] == 'long'

    def test_test_mapping(self):
        """Much test. So meta. Wow test_test_. """
        # First test a valid mapping.
        mapping = self.api.get_mapping()
        assert self.api.test_mapping(mapping) is None

        # Insert an invalid storage mapping.
        mapping = self.api.get_mapping({
            'name': 'fake_field',
            'storage_mapping': {
                'type': 'unkwown'
            }
        })
        with pytest.raises(BadArgumentError):
            self.api.test_mapping(mapping)

        # Test with a correct mapping but with data that cannot be indexed.
        self.index_crash({
            'date_processed': datetimeutil.utc_now(),
            'product': 'WaterWolf',
        })
        self.refresh_index()
        mapping = self.api.get_mapping({
            'name': 'product',
            'storage_mapping': {
                'type': 'long'
            }
        })
        with pytest.raises(BadArgumentError):
            self.api.test_mapping(mapping)
Code Example #17
    def delete_field(self, **kwargs):
        return SuperSearchFields(config=self.config).delete_field(**kwargs)
Code Example #18
    def get(self, **kwargs):
        kwargs['_fields'] = SuperSearchFields(config=self.config).get_fields()
        return super(SuperSearchWithFields, self).get(**kwargs)
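This wrapper injects the field definitions before delegating, so callers only pass the search parameters themselves. A hedged usage sketch (the configuration object and the exact parameters are assumptions):

# Hypothetical call site; config is assumed to be an already-built
# configuration object like the ones used in the snippets above.
api = SuperSearchWithFields(config=config)
results = api.get(product='WaterWolf', _facets=['signature'])
print(results['total'])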
Code Example #19
    def setUp(self):
        super(IntegrationTestSuperSearchFields, self).setUp()

        self.api = SuperSearchFields(config=self.config)
        self.api.get_fields = lambda: copy.deepcopy(FIELDS)
Code Example #20
    def get_fields(self, **kwargs):
        return SuperSearchFields(config=self.config).get_fields(**kwargs)
Code Example #21
    def setUp(self):
        super(IntegrationTestSuperSearchFields, self).setUp()

        self.api = SuperSearchFields(config=self.config)
Code Example #22
    def test_index_crash_mapping_keys(self):
        """Test indexing a crash that has keys not in the mapping

        Indexing a crash that has keys that aren't in the mapping for the index
        should cause those keys to be removed from the crash.

        """
        # The test harness creates an index for this week and last week. So let's create
        # one for 4 weeks ago.
        now = utc_now()
        four_weeks_ago = now - timedelta(days=28)

        field = "user_comments"

        # We're going to use a mapping that's what SuperSearchFields gives us, but
        # remove the user_comments field.
        mappings = SuperSearchFields(context=self.es_context).get_mapping()
        doctype = self.es_context.get_doctype()
        del mappings[doctype]["properties"]["processed_crash"]["properties"][
            field]

        # Create the index for 4 weeks ago
        self.es_context.create_index(
            index_name=self.es_context.get_index_for_date(four_weeks_ago),
            mappings=mappings,
        )

        es_storage = ESCrashStorage(config=self.config)

        # Create a crash for this week and save it
        now_uuid = "00000000-0000-0000-0000-000000120408"
        raw_crash = {
            "BuildID": "20200506000000",
        }
        processed_crash = {
            field: "this week",
            "date_processed": date_to_string(now),
            "uuid": now_uuid,
        }

        es_storage.save_processed_crash(
            raw_crash=raw_crash,
            processed_crash=processed_crash,
        )

        # Create a crash for four weeks ago with the bum mapping and save it
        old_uuid = "11111111-1111-1111-1111-111111120408"
        raw_crash = {
            "BuildID": "20200506000000",
        }
        processed_crash = {
            field: "this week",
            "date_processed": date_to_string(now - timedelta(days=28)),
            "uuid": old_uuid,
        }

        es_storage.save_processed_crash(
            raw_crash=raw_crash,
            processed_crash=processed_crash,
        )

        self.es_context.refresh()

        # Retrieve the document from this week and verify it has the user_comments
        # field
        doc = self.conn.get(
            index=self.es_context.get_index_for_date(now),
            id=now_uuid,
        )
        assert field in doc["_source"]["processed_crash"]

        # Retrieve the document from four weeks ago and verify it doesn't have the
        # user_comments field
        doc = self.conn.get(
            index=self.es_context.get_index_for_date(four_weeks_ago),
            id=old_uuid,
        )
        assert field not in doc["_source"]["processed_crash"]
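The behaviour exercised by this test is that keys absent from the index mapping are dropped from the crash before indexing. A generic sketch of such a pruning step (illustration only, not the ESCrashStorage code):

def prune_to_mapping(document, mapping_properties):
    """Keep only the keys of document that exist in the mapping."""
    return {
        key: value
        for key, value in document.items()
        if key in mapping_properties
    }

mapping_properties = {"date_processed": {}, "uuid": {}}
crash = {"date_processed": "2020-05-06", "uuid": "1111", "user_comments": "hi"}
pruned = prune_to_mapping(crash, mapping_properties)
assert "user_comments" not in pruned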
Code Example #23
class SuperSearch(SearchBase):
    def __init__(self, *args, **kwargs):
        self.config = kwargs.get('config')
        self.es_context = self.config.elasticsearch.elasticsearch_class(
            self.config.elasticsearch)

        self.all_fields = SuperSearchFields(config=self.config).get_fields()

        # Create a map to associate a field's name in the database to its
        # exposed name (in the results and facets).
        self.database_name_to_field_name_map = dict(
            (x['in_database_name'], x['name'])
            for x in self.all_fields.values())

        kwargs.update(fields=self.all_fields)
        super(SuperSearch, self).__init__(*args, **kwargs)

    def get_connection(self):
        with self.es_context() as conn:
            return conn

    def generate_list_of_indices(self, from_date, to_date, es_index=None):
        """Return the list of indices to query to access all the crash reports
        that were processed between from_date and to_date.

        The naming pattern for indices in elasticsearch is configurable; it is
        possible to have an index per day, per week, per month...

        Parameters:
        * from_date datetime object
        * to_date datetime object
        """
        if es_index is None:
            es_index = self.config.elasticsearch_index

        indices = []
        current_date = from_date
        while current_date <= to_date:
            index = current_date.strftime(es_index)

            # Make sure no index appears twice in the list
            # (for weekly or monthly indices for example)
            if index not in indices:
                indices.append(index)
            current_date += datetime.timedelta(days=1)

        return indices

    def get_indices(self, dates):
        """Return the list of indices to use for given dates. """
        start_date = None
        end_date = None
        for date in dates:
            if '>' in date.operator:
                start_date = date.value
            if '<' in date.operator:
                end_date = date.value

        return self.generate_list_of_indices(start_date, end_date)

    def format_field_names(self, hit):
        """Return a hit with each field's database name replaced by its
        exposed name. """
        new_hit = {}
        for field in hit:
            new_field = field

            if '.' in new_field:
                # Remove the prefix ("processed_crash." or "raw_crash.").
                new_field = new_field.split('.')[-1]

            new_field = self.database_name_to_field_name_map.get(
                new_field, new_field)

            new_hit[new_field] = hit[field]

        return new_hit

    def format_fields(self, hit):
        """Return a well formatted document.

        Elasticsearch returns values as lists when using the `fields` option.
        This function removes the list when it contains zero or one element.
        It also calls `format_field_names` to correct all the field names.
        """
        hit = self.format_field_names(hit)

        for field in hit:
            if isinstance(hit[field], (list, tuple)):
                if len(hit[field]) == 0:
                    hit[field] = None
                elif len(hit[field]) == 1:
                    hit[field] = hit[field][0]

        return hit

    def format_aggregations(self, aggregations):
        """Return aggregations in a form that looks like facets.

        We used to expose the Elasticsearch facets directly. This is thus
        needed for backwards compatibility.
        """
        aggs = aggregations.to_dict()
        for agg in aggs:
            for i, row in enumerate(aggs[agg]['buckets']):
                aggs[agg]['buckets'][i] = {
                    'term': row['key'],
                    'count': row['doc_count'],
                }
            aggs[agg] = aggs[agg]['buckets']

        return aggs

    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = None

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:

                if param.name.startswith('_'):
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (field_data['namespace'],
                                  field_data['in_database_name'])

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                args = {}
                filter_type = 'term'
                filter_value = None
                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]
                        if not isinstance(val, basestring) or (isinstance(
                                val, basestring) and ' ' not in val):
                            filter_value = val

                        # If the term contains white spaces, we want to perform
                        # a phrase query. Thus we do nothing here and let this
                        # value be handled later.
                    else:
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator == '>':
                    # greater than
                    filter_type = 'range'
                    filter_value = {'gt': param.value}
                elif param.operator == '<':
                    # lower than
                    filter_type = 'range'
                    filter_value = {'lt': param.value}
                elif param.operator == '>=':
                    # greater than or equal to
                    filter_type = 'range'
                    filter_value = {'gte': param.value}
                elif param.operator == '<=':
                    # lower than or equal to
                    filter_type = 'range'
                    filter_value = {'lte': param.value}
                elif param.operator == '__null__':
                    # is null
                    filter_type = 'missing'
                    args['field'] = name

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    if param.operator_not:
                        new_filter = ~F(filter_type, **args)
                    else:
                        new_filter = F(filter_type, **args)

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif param.data_type == 'enum':
                        sub_filters |= new_filter
                    else:
                        sub_filters &= new_filter

                    continue

                # These use a wildcard and thus need to be in a query
                # instead of a filter.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                if param.operator in operator_wildcards:
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    query_type = 'wildcard'
                    args[name] = (operator_wildcards[param.operator] %
                                  param.value)
                elif not param.operator:
                    # This is a phrase that was passed down.
                    query_type = 'simple_query_string'
                    args['query'] = param.value[0]
                    args['fields'] = [name]
                    args['default_operator'] = 'and'

                if args:
                    query = Q(query_type, **args)
                    if param.operator_not:
                        query = ~query
                    search = search.query(query)
                else:
                    # If we reach this point, that means the operator is
                    # not supported, and we should raise an error about that.
                    raise NotImplementedError('Operator %s is not supported' %
                                              param.operator)

            if filters is None:
                filters = sub_filters
            elif sub_filters is not None:
                filters &= sub_filters

        search = search.filter(filters)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't facet on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot facet on it' % value)

                field_name = '%s.%s' % (field_['namespace'],
                                        field_['in_database_name'])

                if field_['has_full_version']:
                    # If the param has a full version, that means what matters
                    # is the full string, and not its individual terms.
                    field_name += '.full'

                search.aggs.bucket(value,
                                   'terms',
                                   field=field_name,
                                   size=self.config.facets_max_number)

        # Query and compute results.
        hits = []
        fields = [
            '%s.%s' % (x['namespace'], x['in_database_name'])
            for x in self.all_fields.values() if x['is_returned']
        ]
        search = search.fields(*fields)

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break

        return {
            'hits': hits,
            'total': total,
            'facets': aggregations,
        }
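
A minimal, self-contained sketch (not part of the project; the helper name and sample values are hypothetical) of how the operator map above turns the `~`, `$` and `^` operators into elasticsearch wildcard patterns:

operator_wildcards = {
    '~': '*%s*',  # contains
    '$': '%s*',   # starts with
    '^': '*%s',   # ends with
}

def build_wildcard_arg(field_name, operator, value, has_full_version=False):
    """Return the keyword argument for a `wildcard` query."""
    if has_full_version:
        # Match against the untokenized ".full" copy of the field.
        field_name = '%s.full' % field_name
    return {field_name: operator_wildcards[operator] % value}

# build_wildcard_arg('processed_crash.signature', '~', 'OOM', True)
# -> {'processed_crash.signature.full': '*OOM*'}
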
Code example #24
0
    def setUp(self):
        super(IntegrationTestSuperSearchFields, self).setUp()

        self.api = SuperSearchFields(config=self.config)
Code example #25
0
class IntegrationTestSuperSearchFields(ElasticsearchTestCase):
    """Test SuperSearchFields with an elasticsearch database containing fake
    data. """
    def setUp(self):
        super(IntegrationTestSuperSearchFields, self).setUp()

        self.api = SuperSearchFields(config=self.config)

    def test_get_fields(self):
        results = self.api.get_fields()
        eq_(results, SUPERSEARCH_FIELDS)

    def test_create_field(self):
        # Test with all parameters set.
        params = {
            'name': 'plotfarm',
            'data_validation_type': 'str',
            'default_value': None,
            'description': 'a plotfarm like Lunix or Wondiws',
            'form_field_choices': ['lun', 'won', 'cam'],
            'has_full_version': False,
            'in_database_name': 'os_name',
            'is_exposed': True,
            'is_returned': True,
            'is_mandatory': False,
            'query_type': 'str',
            'namespace': 'processed_crash',
            'permissions_needed': ['view_plotfarm'],
            'storage_mapping': {
                "type": "keyword"
            },
        }
        res = self.api.create_field(**params)
        ok_(res)
        field = self.connection.get(
            index=self.config.elasticsearch.elasticsearch_default_index,
            doc_type='supersearch_fields',
            id='plotfarm',
        )
        field = field['_source']
        eq_(sorted(field.keys()), sorted(params.keys()))
        for key in field.keys():
            eq_(field[key], params[key])

        # Test default values.
        res = self.api.create_field(
            name='brand_new_field',
            in_database_name='brand_new_field',
            namespace='processed_crash',
        )
        ok_(res)
        ok_(
            self.connection.get(
                index=self.config.elasticsearch.elasticsearch_default_index,
                doc_type='supersearch_fields',
                id='brand_new_field',
            ))

        # Test errors.
        # `name` is missing.
        assert_raises(
            MissingArgumentError,
            self.api.create_field,
            in_database_name='something',
        )

        # `in_database_name` is missing.
        assert_raises(
            MissingArgumentError,
            self.api.create_field,
            name='something',
        )

        # Field already exists.
        assert_raises(
            BadArgumentError,
            self.api.create_field,
            name='product',
            in_database_name='product',
            namespace='processed_crash',
        )

        # Test logging.
        res = self.api.create_field(
            name='what_a_field',
            in_database_name='what_a_field',
            namespace='processed_crash',
            storage_mapping='{"type": "long"}',
        )
        ok_(res)
        self.api.config.logger.info.assert_called_with(
            'elasticsearch mapping changed for field "%s", '
            'added new mapping "%s"',
            'what_a_field',
            {u'type': u'long'},
        )

    def test_update_field(self):
        # Let's create a field first.
        assert self.api.create_field(name='super_field',
                                     in_database_name='super_field',
                                     namespace='superspace',
                                     description='inaccurate description',
                                     permissions_needed=['view_nothing'],
                                     storage_mapping={
                                         'type': 'boolean',
                                         'null_value': False
                                     })

        # Now let's update that field a little.
        res = self.api.update_field(
            name='super_field',
            description='very accurate description',
            storage_mapping={'type': 'long'},
        )
        ok_(res)

        # Test logging.
        self.api.config.logger.info.assert_called_with(
            'Elasticsearch mapping changed for field "%s", '
            'was "%s", now "%s"',
            'super_field',
            {
                'type': 'boolean',
                'null_value': False
            },
            {'type': 'long'},
        )

        field = self.connection.get(
            index=self.config.elasticsearch.elasticsearch_default_index,
            doc_type='supersearch_fields',
            id='super_field',
        )
        field = field['_source']

        # Verify the changes were taken into account.
        eq_(field['description'], 'very accurate description')
        eq_(field['storage_mapping'], {'type': 'long'})

        # Verify other values did not change.
        eq_(field['permissions_needed'], ['view_nothing'])
        eq_(field['in_database_name'], 'super_field')
        eq_(field['namespace'], 'superspace')

        # Test errors.
        assert_raises(
            MissingArgumentError,
            self.api.update_field,
        )  # `name` is missing

        assert_raises(
            ResourceNotFound,
            self.api.update_field,
            name='unkownfield',
        )

    def test_delete_field(self):
        self.api.delete_field(name='product')

        ok_(
            self.connection.get(
                index=self.config.elasticsearch.elasticsearch_default_index,
                doc_type='supersearch_fields',
                id='signature',
            ))
        assert_raises(
            elasticsearch.exceptions.NotFoundError,
            self.connection.get,
            index=self.config.elasticsearch.elasticsearch_default_index,
            doc_type='supersearch_fields',
            id='product',
        )

    def test_get_missing_fields(self):
        config = self.get_base_config(es_index='socorro_integration_test_%W')

        fake_mappings = [
            {
                'mappings': {
                    config.elasticsearch.elasticsearch_doctype: {
                        'properties': {
                            # Add a bunch of unknown fields.
                            'field_z': {
                                'type': 'string'
                            },
                            'namespace1': {
                                'type': 'object',
                                'properties': {
                                    'field_a': {
                                        'type': 'string'
                                    },
                                    'field_b': {
                                        'type': 'long'
                                    }
                                }
                            },
                            'namespace2': {
                                'type': 'object',
                                'properties': {
                                    'subspace1': {
                                        'type': 'object',
                                        'properties': {
                                            'field_b': {
                                                'type': 'long'
                                            }
                                        }
                                    }
                                }
                            },
                            # Add a few known fields that should not appear.
                            'processed_crash': {
                                'type': 'object',
                                'properties': {
                                    'signature': {
                                        'type': 'string'
                                    },
                                    'product': {
                                        'type': 'string'
                                    },
                                }
                            }
                        }
                    }
                }
            },
            {
                'mappings': {
                    config.elasticsearch.elasticsearch_doctype: {
                        'properties': {
                            'namespace1': {
                                'type': 'object',
                                'properties': {
                                    'subspace1': {
                                        'type': 'object',
                                        'properties': {
                                            'field_d': {
                                                'type': 'long'
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            },
        ]

        now = datetimeutil.utc_now()
        indices = []

        try:
            # Using "2" here means that an index will be missing, hence testing
            # that it swallows the subsequent error.
            for i in range(2):
                date = now - datetime.timedelta(weeks=i)
                index = date.strftime(config.elasticsearch.elasticsearch_index)
                mapping = fake_mappings[i % len(fake_mappings)]

                self.index_creator.create_index(index, mapping)
                indices.append(index)

            api = SuperSearchFields(config=config)
            missing_fields = api.get_missing_fields()
            expected = [
                'field_z',
                'namespace1.field_a',
                'namespace1.field_b',
                'namespace1.subspace1.field_d',
                'namespace2.subspace1.field_b',
            ]

            eq_(missing_fields['hits'], expected)
            eq_(missing_fields['total'], 5)

        finally:
            for index in indices:
                self.index_client.delete(index=index)

    def test_get_mapping(self):
        mapping = self.api.get_mapping()
        doctype = self.config.elasticsearch.elasticsearch_doctype

        ok_(doctype in mapping)
        properties = mapping[doctype]['properties']

        ok_('processed_crash' in properties)
        ok_('raw_crash' in properties)

        processed_crash = properties['processed_crash']['properties']

        # Check in_database_name is used.
        ok_('os_name' in processed_crash)
        ok_('platform' not in processed_crash)

        # Those fields have no `storage_mapping`.
        ok_('fake_field' not in properties['raw_crash']['properties'])

        # Those fields have a `storage_mapping`.
        eq_(processed_crash['signature'], {
            'type': 'text',
            'fields': {
                'full': {
                    'type': 'keyword',
                }
            }
        })

        # Test nested objects.
        ok_('json_dump' in processed_crash)
        ok_('properties' in processed_crash['json_dump'])
        ok_('write_combine_size' in processed_crash['json_dump']['properties'])
        eq_(processed_crash['json_dump']['properties']['write_combine_size'],
            {'type': 'long'})

        # Test overwriting a field.
        mapping = self.api.get_mapping(overwrite_mapping={
            'name': 'fake_field',
            'storage_mapping': {
                'type': 'long'
            }
        })
        properties = mapping[doctype]['properties']

        ok_('fake_field' in properties['raw_crash']['properties'])
        eq_(properties['raw_crash']['properties']['fake_field']['type'],
            'long')

    def test_test_mapping(self):
        """Much test. So meta. Wow test_test_. """
        # First test a valid mapping.
        mapping = self.api.get_mapping()
        ok_(self.api.test_mapping(mapping) is None)

        # Insert an invalid storage mapping.
        mapping = self.api.get_mapping({
            'name': 'fake_field',
            'storage_mapping': {
                'type': 'unkwown'
            }
        })
        assert_raises(
            BadArgumentError,
            self.api.test_mapping,
            mapping,
        )

        # Test with a correct mapping but with data that cannot be indexed.
        self.index_crash({
            'date_processed': datetimeutil.utc_now(),
            'product': 'WaterWolf',
        })
        self.refresh_index()
        mapping = self.api.get_mapping({
            'name': 'product',
            'storage_mapping': {
                'type': 'long'
            }
        })
        assert_raises(
            BadArgumentError,
            self.api.test_mapping,
            mapping,
        )
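
A hedged usage sketch of the SuperSearchFields write API exercised by the tests above. It assumes the imports and `config` object from the surrounding examples; the field name and mapping are placeholders, not real Socorro fields:

api = SuperSearchFields(config=config)

# Register a new field, adjust its description, then remove it again.
api.create_field(
    name='example_field',
    in_database_name='example_field',
    namespace='processed_crash',
    storage_mapping={'type': 'long'},
)
api.update_field(name='example_field', description='An illustrative field')
api.delete_field(name='example_field')
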
Code example #26
0
File: crashstorage.py Project: stephendonner/socorro
class TelemetryBotoS3CrashStorage(BotoS3CrashStorage):
    """Sends a subset of the processed crash to an S3 bucket

    The subset of the processed crash is based on the JSON Schema which is
    derived from "socorro/external/es/super_search_fields.py".

    """

    required_config = Namespace()
    required_config.resource_class = change_default(
        BotoCrashStorage,
        'resource_class',
        'socorro.external.boto.connection_context.RegionalS3ConnectionContext'
    )
    required_config.elasticsearch = Namespace()
    required_config.elasticsearch.add_option(
        'elasticsearch_class',
        default='socorro.external.es.connection_context.ConnectionContext',
        from_string_converter=class_converter,
        reference_value_from='resource.elasticsearch',
    )

    def __init__(self, config, *args, **kwargs):
        super(TelemetryBotoS3CrashStorage, self).__init__(
            config, *args, **kwargs
        )
        self._all_fields = SuperSearchFields(config=self.config).get()

    def save_raw_and_processed(
        self,
        raw_crash,
        dumps,
        processed_crash,
        crash_id
    ):
        crash_report = {}

        # TODO: Opportunity for optimization:
        # We could inspect CRASH_REPORT_JSON_SCHEMA, get a list of all
        # (recursive) keys that are in there, and use that to keep the
        # two following loops from filling up `crash_report` with keys
        # that will never be needed.

        # Rename fields in raw_crash.
        raw_fields_map = dict(
            (x['in_database_name'], x['name'])
            for x in self._all_fields.values()
            if x['namespace'] == 'raw_crash'
        )
        for key, val in raw_crash.items():
            crash_report[raw_fields_map.get(key, key)] = val

        # Rename fields in processed_crash.
        processed_fields_map = dict(
            (x['in_database_name'], x['name'])
            for x in self._all_fields.values()
            if x['namespace'] == 'processed_crash'
        )
        for key, val in processed_crash.items():
            crash_report[processed_fields_map.get(key, key)] = val

        # Validate crash_report.
        crash_report = json_schema_reducer.make_reduced_dict(
            CRASH_REPORT_JSON_SCHEMA, crash_report
        )
        self.save_processed(crash_report)

    @staticmethod
    def _do_save_processed(boto_connection, processed_crash):
        """Overriding this to change "name of thing" to crash_report"""
        crash_id = processed_crash['uuid']
        processed_crash_as_string = boto_connection._convert_mapping_to_string(
            processed_crash
        )
        boto_connection.submit(
            crash_id,
            "crash_report",
            processed_crash_as_string
        )

    @staticmethod
    def _do_get_unredacted_processed(boto_connection, crash_id, json_object_hook):
        """Overriding this to change "name of thing" to crash_report"""
        try:
            processed_crash_as_string = boto_connection.fetch(crash_id, 'crash_report')
            return json.loads(
                processed_crash_as_string,
                object_hook=json_object_hook,
            )
        except boto_connection.ResponseError as x:
            raise CrashIDNotFound(
                '%s not found: %s' % (crash_id, x)
            )
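
A standalone sketch of the renaming step in `save_raw_and_processed` above, using made-up field definitions instead of the real SuperSearchFields data:

all_fields = {
    'platform': {
        'name': 'platform',
        'in_database_name': 'os_name',
        'namespace': 'processed_crash',
    },
}

# Map database names back to exposed names, as in the method above.
processed_fields_map = dict(
    (x['in_database_name'], x['name'])
    for x in all_fields.values()
    if x['namespace'] == 'processed_crash'
)

processed_crash = {'os_name': 'Linux', 'uptime': 42}
crash_report = {}
for key, val in processed_crash.items():
    # Keys without a mapping keep their database name.
    crash_report[processed_fields_map.get(key, key)] = val

# crash_report == {'platform': 'Linux', 'uptime': 42}
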
Code example #27
0
File: crashstorage.py Project: stephendonner/socorro
    def __init__(self, config, *args, **kwargs):
        super(TelemetryBotoS3CrashStorage, self).__init__(
            config, *args, **kwargs
        )
        self._all_fields = SuperSearchFields(config=self.config).get()
Code example #28
0
class TelemetryBotoS3CrashStorage(BotoS3CrashStorage):
    """Sends a subset of the processed crash to an S3 bucket

    The subset of the processed crash is based on the JSON Schema which is
    derived from "socorro/external/es/super_search_fields.py".

    """

    required_config = Namespace()
    required_config.resource_class = change_default(
        BotoCrashStorage, 'resource_class',
        'socorro.external.boto.connection_context.RegionalS3ConnectionContext')
    required_config.elasticsearch = Namespace()
    required_config.elasticsearch.add_option(
        'elasticsearch_class',
        default='socorro.external.es.connection_context.ConnectionContext',
        from_string_converter=class_converter,
        reference_value_from='resource.elasticsearch',
    )

    def __init__(self, config, *args, **kwargs):
        super(TelemetryBotoS3CrashStorage,
              self).__init__(config, *args, **kwargs)
        self._all_fields = SuperSearchFields(config=self.config).get()

    def save_raw_and_processed(self, raw_crash, dumps, processed_crash,
                               crash_id):
        crash_report = {}

        # TODO: Opportunity for optimization:
        # We could inspect CRASH_REPORT_JSON_SCHEMA, get a list of all
        # (recursive) keys that are in there, and use that to keep the
        # two following loops from filling up `crash_report` with keys
        # that will never be needed.

        # Rename fields in raw_crash.
        raw_fields_map = dict((x['in_database_name'], x['name'])
                              for x in self._all_fields.values()
                              if x['namespace'] == 'raw_crash')
        for key, val in raw_crash.items():
            crash_report[raw_fields_map.get(key, key)] = val

        # Rename fields in processed_crash.
        processed_fields_map = dict((x['in_database_name'], x['name'])
                                    for x in self._all_fields.values()
                                    if x['namespace'] == 'processed_crash')
        for key, val in processed_crash.items():
            crash_report[processed_fields_map.get(key, key)] = val

        # Validate crash_report.
        crash_report = json_schema_reducer.make_reduced_dict(
            CRASH_REPORT_JSON_SCHEMA, crash_report)
        self.save_processed(crash_report)

    @staticmethod
    def _do_save_processed(boto_connection, processed_crash):
        """Overriding this to change "name of thing" to crash_report"""
        crash_id = processed_crash['uuid']
        processed_crash_as_string = boto_connection._convert_mapping_to_string(
            processed_crash)
        boto_connection.submit(crash_id, "crash_report",
                               processed_crash_as_string)

    @staticmethod
    def _do_get_unredacted_processed(boto_connection, crash_id,
                                     json_object_hook):
        """Overriding this to change "name of thing" to crash_report"""
        try:
            processed_crash_as_string = boto_connection.fetch(
                crash_id, 'crash_report')
            return json.loads(
                processed_crash_as_string,
                object_hook=json_object_hook,
            )
        except boto_connection.ResponseError as x:
            raise CrashIDNotFound('%s not found: %s' % (crash_id, x))
Code example #29
0
    def test_get_missing_fields(self):
        config = self.get_mware_config(es_index="socorro_integration_test_%W")

        fake_mappings = [
            {
                "mappings": {
                    config.elasticsearch.elasticsearch_doctype: {
                        "properties": {
                            # Add a bunch of unknown fields.
                            "field_z": {"type": "string"},
                            "namespace1": {
                                "type": "object",
                                "properties": {"field_a": {"type": "string"}, "field_b": {"type": "long"}},
                            },
                            "namespace2": {
                                "type": "object",
                                "properties": {
                                    "subspace1": {"type": "object", "properties": {"field_b": {"type": "long"}}}
                                },
                            },
                            # Add a few known fields that should not appear.
                            "processed_crash": {
                                "type": "object",
                                "properties": {"signature": {"type": "string"}, "product": {"type": "string"}},
                            },
                        }
                    }
                }
            },
            {
                "mappings": {
                    config.elasticsearch.elasticsearch_doctype: {
                        "properties": {
                            "namespace1": {
                                "type": "object",
                                "properties": {
                                    "subspace1": {"type": "object", "properties": {"field_d": {"type": "long"}}}
                                },
                            }
                        }
                    }
                }
            },
        ]

        now = datetimeutil.utc_now()
        indices = []

        try:
            # Using "2" here means that an index will be missing, hence testing
            # that it swallows the subsequent error.
            for i in range(2):
                date = now - datetime.timedelta(weeks=i)
                index = date.strftime(config.elasticsearch.elasticsearch_index)
                mapping = fake_mappings[i % len(fake_mappings)]

                self.index_creator.create_index(index, mapping)
                indices.append(index)

            api = SuperSearchFields(config=config)
            missing_fields = api.get_missing_fields()
            expected = [
                "field_z",
                "namespace1.field_a",
                "namespace1.field_b",
                "namespace1.subspace1.field_d",
                "namespace2.subspace1.field_b",
            ]

            eq_(missing_fields["hits"], expected)
            eq_(missing_fields["total"], 5)

        finally:
            for index in indices:
                self.index_client.delete(index=index)
Code example #30
0
class IntegrationTestSuperSearchFields(ElasticsearchTestCase):
    """Test SuperSearchFields with an elasticsearch database containing fake
    data. """

    def setUp(self):
        super(IntegrationTestSuperSearchFields, self).setUp()

        self.api = SuperSearchFields(config=self.config)
        self.api.get_fields = lambda: copy.deepcopy(FIELDS)

    def test_get_fields(self):
        results = self.api.get_fields()
        assert results == FIELDS

    def test_get_missing_fields(self):
        config = self.get_base_config(
            es_index='socorro_integration_test_%W'
        )

        fake_mappings = [
            {
                'mappings': {
                    config.elasticsearch.elasticsearch_doctype: {
                        'properties': {
                            # Add a bunch of unknown fields.
                            'field_z': {
                                'type': 'string'
                            },
                            'namespace1': {
                                'type': 'object',
                                'properties': {
                                    'field_a': {
                                        'type': 'string'
                                    },
                                    'field_b': {
                                        'type': 'long'
                                    }
                                }
                            },
                            'namespace2': {
                                'type': 'object',
                                'properties': {
                                    'subspace1': {
                                        'type': 'object',
                                        'properties': {
                                            'field_b': {
                                                'type': 'long'
                                            }
                                        }
                                    }
                                }
                            },
                            # Add a few known fields that should not appear.
                            'processed_crash': {
                                'type': 'object',
                                'properties': {
                                    'signature': {
                                        'type': 'string'
                                    },
                                    'product': {
                                        'type': 'string'
                                    },
                                }
                            }
                        }
                    }
                }
            },
            {
                'mappings': {
                    config.elasticsearch.elasticsearch_doctype: {
                        'properties': {
                            'namespace1': {
                                'type': 'object',
                                'properties': {
                                    'subspace1': {
                                        'type': 'object',
                                        'properties': {
                                            'field_d': {
                                                'type': 'long'
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            },
        ]

        now = datetimeutil.utc_now()
        indices = []

        try:
            # Using "2" here means that an index will be missing, hence testing
            # that it swallows the subsequent error.
            for i in range(2):
                date = now - datetime.timedelta(weeks=i)
                index = date.strftime(config.elasticsearch.elasticsearch_index)
                mapping = fake_mappings[i % len(fake_mappings)]

                self.index_creator.create_index(index, mapping)
                indices.append(index)

            api = SuperSearchFields(config=config)
            missing_fields = api.get_missing_fields()
            expected = [
                'field_z',
                'namespace1.field_a',
                'namespace1.field_b',
                'namespace1.subspace1.field_d',
                'namespace2.subspace1.field_b',
            ]

            assert missing_fields['hits'] == expected
            assert missing_fields['total'] == 5

        finally:
            for index in indices:
                self.index_client.delete(index=index)

    def test_get_mapping(self):
        mapping = self.api.get_mapping()
        doctype = self.config.elasticsearch.elasticsearch_doctype

        assert doctype in mapping
        properties = mapping[doctype]['properties']

        assert 'processed_crash' in properties
        assert 'raw_crash' in properties

        processed_crash = properties['processed_crash']['properties']

        # Check in_database_name is used.
        assert 'os_name' in processed_crash
        assert 'platform' not in processed_crash

        # Those fields have no `storage_mapping`.
        assert 'fake_field' not in properties['raw_crash']['properties']

        # Those fields have a `storage_mapping`.
        assert processed_crash['release_channel'] == {'analyzer': 'keyword', 'type': 'string'}

        # Test nested objects.
        assert 'json_dump' in processed_crash
        assert 'properties' in processed_crash['json_dump']
        assert 'write_combine_size' in processed_crash['json_dump']['properties']
        assert processed_crash['json_dump']['properties']['write_combine_size'] == {'type': 'long'}

        # Test overwriting a field.
        mapping = self.api.get_mapping(overwrite_mapping={
            'name': 'fake_field',
            'namespace': 'raw_crash',
            'in_database_name': 'fake_field',
            'storage_mapping': {
                'type': 'long'
            }
        })
        properties = mapping[doctype]['properties']

        assert 'fake_field' in properties['raw_crash']['properties']
        assert properties['raw_crash']['properties']['fake_field']['type'] == 'long'

    def test_test_mapping(self):
        """Much test. So meta. Wow test_test_. """
        # First test a valid mapping.
        mapping = self.api.get_mapping()
        assert self.api.test_mapping(mapping) is None

        # Insert an invalid storage mapping.
        mapping = self.api.get_mapping({
            'name': 'fake_field',
            'namespace': 'raw_crash',
            'in_database_name': 'fake_field',
            'storage_mapping': {
                'type': 'unkwown'
            }
        })
        with pytest.raises(BadArgumentError):
            self.api.test_mapping(mapping)

        # Test with a correct mapping but with data that cannot be indexed.
        self.index_crash({
            'date_processed': datetimeutil.utc_now(),
            'product': 'WaterWolf',
        })
        self.refresh_index()
        mapping = self.api.get_mapping({
            'name': 'product',
            'storage_mapping': {
                'type': 'long'
            }
        })
        with pytest.raises(BadArgumentError):
            self.api.test_mapping(mapping)
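
A hedged sketch of the preview-then-validate flow these tests exercise: build a mapping that includes a proposed change and run `test_mapping` on it before applying anything. It assumes `api` is a SuperSearchFields instance and that `BadArgumentError` is imported as in the examples above; the field definition is a placeholder:

proposed = {
    'name': 'example_field',
    'namespace': 'raw_crash',
    'in_database_name': 'example_field',
    'storage_mapping': {'type': 'long'},
}

mapping = api.get_mapping(overwrite_mapping=proposed)
try:
    api.test_mapping(mapping)  # returns None when the mapping is usable
except BadArgumentError:
    print('The proposed storage_mapping cannot be indexed')
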
Code example #31
0
    def __init__(self, config, *args, **kwargs):
        super(TelemetryBotoS3CrashStorage, self).__init__(
            config, *args, **kwargs
        )
        self._all_fields = SuperSearchFields(config=self.config).get()
Code example #32
0
    def test_get_missing_fields(self):
        config = self.get_base_config(
            es_index='socorro_integration_test_%W'
        )

        fake_mappings = [
            {
                'mappings': {
                    config.elasticsearch.elasticsearch_doctype: {
                        'properties': {
                            # Add a bunch of unknown fields.
                            'field_z': {
                                'type': 'string'
                            },
                            'namespace1': {
                                'type': 'object',
                                'properties': {
                                    'field_a': {
                                        'type': 'string'
                                    },
                                    'field_b': {
                                        'type': 'long'
                                    }
                                }
                            },
                            'namespace2': {
                                'type': 'object',
                                'properties': {
                                    'subspace1': {
                                        'type': 'object',
                                        'properties': {
                                            'field_b': {
                                                'type': 'long'
                                            }
                                        }
                                    }
                                }
                            },
                            # Add a few known fields that should not appear.
                            'processed_crash': {
                                'type': 'object',
                                'properties': {
                                    'signature': {
                                        'type': 'string'
                                    },
                                    'product': {
                                        'type': 'string'
                                    },
                                }
                            }
                        }
                    }
                }
            },
            {
                'mappings': {
                    config.elasticsearch.elasticsearch_doctype: {
                        'properties': {
                            'namespace1': {
                                'type': 'object',
                                'properties': {
                                    'subspace1': {
                                        'type': 'object',
                                        'properties': {
                                            'field_d': {
                                                'type': 'long'
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            },
        ]

        now = datetimeutil.utc_now()
        indices = []

        try:
            # Using "2" here means that an index will be missing, hence testing
            # that it swallows the subsequent error.
            for i in range(2):
                date = now - datetime.timedelta(weeks=i)
                index = date.strftime(config.elasticsearch.elasticsearch_index)
                mapping = fake_mappings[i % len(fake_mappings)]

                self.index_creator.create_index(index, mapping)
                indices.append(index)

            api = SuperSearchFields(config=config)
            missing_fields = api.get_missing_fields()
            expected = [
                'field_z',
                'namespace1.field_a',
                'namespace1.field_b',
                'namespace1.subspace1.field_d',
                'namespace2.subspace1.field_b',
            ]

            assert missing_fields['hits'] == expected
            assert missing_fields['total'] == 5

        finally:
            for index in indices:
                self.index_client.delete(index=index)
Code example #33
0
File: supersearch.py Project: abudulemusa/socorro
class SuperSearch(SearchBase):

    def __init__(self, *args, **kwargs):
        self.config = kwargs.get('config')
        self.es_context = self.config.elasticsearch.elasticsearch_class(
            self.config.elasticsearch
        )

        self.all_fields = SuperSearchFields(config=self.config).get_fields()

        # Create a map to associate a field's name in the database to its
        # exposed name (in the results and facets).
        self.database_name_to_field_name_map = dict(
            (x['in_database_name'], x['name'])
            for x in self.all_fields.values()
        )

        kwargs.update(fields=self.all_fields)
        super(SuperSearch, self).__init__(
            *args, **kwargs
        )

    def get_connection(self):
        with self.es_context() as conn:
            return conn

    def generate_list_of_indices(self, from_date, to_date, es_index=None):
        """Return the list of indices to query to access all the crash reports
        that were processed between from_date and to_date.

        The naming pattern for indices in elasticsearch is configurable; it
        is possible to have an index per day, per week, per month, and so on.

        Parameters:
        * from_date datetime object
        * to_date datetime object
        """
        if es_index is None:
            es_index = self.config.elasticsearch_index

        indices = []
        current_date = from_date
        while current_date <= to_date:
            index = current_date.strftime(es_index)

            # Make sure no index appears twice in the list
            # (for weekly or monthly indices, for example).
            if index not in indices:
                indices.append(index)
            current_date += datetime.timedelta(days=1)

        return indices

    def get_indices(self, dates):
        """Return the list of indices to use for given dates. """
        start_date = None
        end_date = None
        for date in dates:
            if '>' in date.operator:
                start_date = date.value
            if '<' in date.operator:
                end_date = date.value

        return self.generate_list_of_indices(start_date, end_date)

    def format_field_names(self, hit):
        """Return a hit with each field's database name replaced by its
        exposed name. """
        new_hit = {}
        for field in hit:
            new_field = field

            if '.' in new_field:
                # Remove the prefix ("processed_crash." or "raw_crash.").
                new_field = new_field.split('.')[-1]

            new_field = self.database_name_to_field_name_map.get(
                new_field, new_field
            )

            new_hit[new_field] = hit[field]

        return new_hit

    def format_fields(self, hit):
        """Return a well formatted document.

        Elasticsearch returns values as lists when using the `fields` option.
        This function removes the list when it contains zero or one element.
        It also calls `format_field_names` to correct all the field names.
        """
        hit = self.format_field_names(hit)

        for field in hit:
            if isinstance(hit[field], (list, tuple)):
                if len(hit[field]) == 0:
                    hit[field] = None
                elif len(hit[field]) == 1:
                    hit[field] = hit[field][0]

        return hit

    def format_aggregations(self, aggregations):
        """Return aggregations in a form that looks like facets.

        We used to expose the Elasticsearch facets directly. This is thus
        needed for backwards compatibility.
        """
        aggs = aggregations.to_dict()
        for agg in aggs:
            for i, row in enumerate(aggs[agg]['buckets']):
                aggs[agg]['buckets'][i] = {
                    'term': row['key'],
                    'count': row['doc_count'],
                }
            aggs[agg] = aggs[agg]['buckets']

        return aggs

    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = None

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:

                if param.name.startswith('_'):
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                args = {}
                filter_type = 'term'
                filter_value = None
                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]
                        if not isinstance(val, basestring) or (
                            isinstance(val, basestring) and ' ' not in val
                        ):
                            filter_value = val

                        # If the term contains white spaces, we want to perform
                        # a phrase query. Thus we do nothing here and let this
                        # value be handled later.
                    else:
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator == '>':
                    # greater than
                    filter_type = 'range'
                    filter_value = {
                        'gt': param.value
                    }
                elif param.operator == '<':
                    # lower than
                    filter_type = 'range'
                    filter_value = {
                        'lt': param.value
                    }
                elif param.operator == '>=':
                    # greater than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'gte': param.value
                    }
                elif param.operator == '<=':
                    # lower than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'lte': param.value
                    }
                elif param.operator == '__null__':
                    # is null
                    filter_type = 'missing'
                    args['field'] = name

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    if param.operator_not:
                        new_filter = ~F(filter_type, **args)
                    else:
                        new_filter = F(filter_type, **args)

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif param.data_type == 'enum':
                        sub_filters |= new_filter
                    else:
                        sub_filters &= new_filter

                    continue

                # These use a wildcard and thus need to be in a query
                # instead of a filter.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                if param.operator in operator_wildcards:
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    query_type = 'wildcard'
                    args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                elif not param.operator:
                    # This is a phrase that was passed down.
                    query_type = 'simple_query_string'
                    args['query'] = param.value[0]
                    args['fields'] = [name]
                    args['default_operator'] = 'and'

                if args:
                    query = Q(query_type, **args)
                    if param.operator_not:
                        query = ~query
                    search = search.query(query)
                else:
                    # If we reach this point, that means the operator is
                    # not supported, and we should raise an error about that.
                    raise NotImplementedError(
                        'Operator %s is not supported' % param.operator
                    )

            if filters is None:
                filters = sub_filters
            elif sub_filters is not None:
                filters &= sub_filters

        search = search.filter(filters)

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't restrict on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot return it' % value
                    )

                if not field_['is_returned']:
                    # Returning this field is not allowed.
                    raise BadArgumentError(
                        value,
                        msg='Field "%s" is not allowed to be returned' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't sort on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot sort on it' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't facet on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot facet on it' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                if field_['has_full_version']:
                    # If the param has a full version, that means what matters
                    # is the full string, and not its individual terms.
                    field_name += '.full'

                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=self.config.facets_max_number
                )

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break

        return {
            'hits': hits,
            'total': total,
            'facets': aggregations,
        }
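
A simplified, hypothetical illustration of the retry loop at the end of `get()`: indices that elasticsearch reports as missing are dropped one at a time until the query succeeds or no index is left. Here `run_query` stands in for `search.execute()` and is assumed to raise `KeyError` with the missing index name:

def query_with_index_pruning(run_query, indices):
    while indices:
        try:
            return run_query(indices)
        except KeyError as exc:
            missing_index = exc.args[0]
            if missing_index not in indices:
                # An error about an index we never asked for; re-raise it.
                raise
            indices.remove(missing_index)
    # No valid index is left, so there is nothing to query.
    return []
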
Code example #34
0
    def update_field(self, **kwargs):
        return SuperSearchFields(config=self.config).update_field(**kwargs)
Code example #35
0
    def run(self, end_datetime):
        # Truncate to the hour
        end_datetime = end_datetime.replace(minute=0, second=0, microsecond=0)

        # Do a super search and get the signature, buildid, and date processed for
        # every crash in the range
        all_fields = SuperSearchFields(config=self.config).get()
        api = SuperSearch(config=self.config)
        start_datetime = end_datetime - datetime.timedelta(
            minutes=self.config.period)
        self.config.logger.info('Looking at %s to %s', start_datetime,
                                end_datetime)

        params = {
            'date': [
                '>={}'.format(start_datetime.isoformat()),
                '<{}'.format(end_datetime.isoformat()),
            ],
            '_columns': ['signature', 'build_id', 'date'],
            '_facets_size': 0,
            '_fields': all_fields,

            # Set up first page
            '_results_offset': 0,
            '_results_number': MAX_PAGE,
        }

        results = {}
        crashids_count = 0

        while True:
            resp = api.get(**params)
            hits = resp['hits']
            for hit in hits:
                crashids_count += 1

                if not hit['build_id']:
                    # Not all crashes have a build id, so skip the ones that don't.
                    continue

                if hit['signature'] in results:
                    data = results[hit['signature']]
                    data['build_id'] = min(data['build_id'], hit['build_id'])
                    data['date'] = min(data['date'], hit['date'])
                else:
                    data = {
                        'signature': hit['signature'],
                        'build_id': hit['build_id'],
                        'date': hit['date']
                    }
                results[hit['signature']] = data

            # If there are no more crash ids to get, stop paging
            total = resp['total']
            if not hits or crashids_count >= total:
                break

            # Get the next page, but only as many results as we need
            params['_results_offset'] += MAX_PAGE
            params['_results_number'] = min(
                # MAX_PAGE is the maximum we can request
                MAX_PAGE,
                # The number of results Super Search hasn't returned to us yet
                total - crashids_count,
            )

        signature_data = results.values()

        # Save signature data to the db
        for item in signature_data:
            if self.config.dry_run:
                self.config.logger.info(
                    'Inserting/updating signature (%s, %s, %s)',
                    item['signature'], item['date'], item['build_id'])
            else:
                self.update_crashstats_signature(
                    signature=item['signature'],
                    report_date=item['date'],
                    report_build=item['build_id'],
                )

        self.config.logger.info('Inserted/updated %d signatures.',
                                len(signature_data))
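
A standalone sketch (with made-up hit data) of the aggregation step in `run()` above: for each signature, keep the smallest build id and the earliest date seen:

hits = [
    {'signature': 'OOM | small', 'build_id': '20180105', 'date': '2018-01-05'},
    {'signature': 'OOM | small', 'build_id': '20180101', 'date': '2018-01-07'},
]

results = {}
for hit in hits:
    if hit['signature'] in results:
        data = results[hit['signature']]
        data['build_id'] = min(data['build_id'], hit['build_id'])
        data['date'] = min(data['date'], hit['date'])
    else:
        data = dict(hit)
    results[hit['signature']] = data

# results['OOM | small']
# -> {'signature': 'OOM | small', 'build_id': '20180101', 'date': '2018-01-05'}
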
Code example #36
0
    def get_missing_fields(self):
        return SuperSearchFields(config=self.config).get_missing_fields()
Code example #37
0
File: supersearch.py Project: twobraids/socorro
class SuperSearch(SearchBase):

    def __init__(self, *args, **kwargs):
        self.config = kwargs.get('config')
        self.es_context = self.config.elasticsearch.elasticsearch_class(
            self.config.elasticsearch
        )

        self.all_fields = SuperSearchFields(config=self.config).get_fields()

        # Create a map to associate a field's name in the database to its
        # exposed name (in the results and facets).
        self.database_name_to_field_name_map = dict(
            (x['in_database_name'], x['name'])
            for x in self.all_fields.values()
        )

        kwargs.update(fields=self.all_fields)
        super(SuperSearch, self).__init__(
            *args, **kwargs
        )

    def get_connection(self):
        with self.es_context() as conn:
            return conn

    def get_list_of_indices(self, from_date, to_date, es_index=None):
        """Return the list of indices to query to access all the crash reports
        that were processed between from_date and to_date.

        The naming pattern for indices in elasticsearch is configurable; it is
        possible to have an index per day, per week, per month, and so on.

        Parameters:
        * from_date datetime object
        * to_date datetime object
        """
        if es_index is None:
            es_index = self.config.elasticsearch.elasticsearch_index

        indices = []
        current_date = from_date
        while current_date <= to_date:
            index = current_date.strftime(es_index)

            # Make sure no index is twice in the list
            # (for weekly or monthly indices for example)
            if index not in indices:
                indices.append(index)
            current_date += datetime.timedelta(days=1)

        return indices
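        # For illustration (hypothetical values): with an index pattern of
        # 'socorro%Y%W' (one index per week) and a range from 2016-01-04 to
        # 2016-01-12, this returns ['socorro201601', 'socorro201602'].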

    def get_indices(self, dates):
        """Return the list of indices to use for given dates. """
        start_date = None
        end_date = None
        for date in dates:
            if '>' in date.operator:
                start_date = date.value
            if '<' in date.operator:
                end_date = date.value

        return self.get_list_of_indices(start_date, end_date)

    def format_field_names(self, hit):
        """Return a hit with each field's database name replaced by its
        exposed name. """
        new_hit = {}
        for field in hit:
            new_field = field

            if '.' in new_field:
                # Remove the prefix ("processed_crash." or "raw_crash.").
                new_field = new_field.split('.')[-1]

            new_field = self.database_name_to_field_name_map.get(
                new_field, new_field
            )

            new_hit[new_field] = hit[field]

        return new_hit
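        # For illustration (hypothetical field): if a field is exposed as
        # 'platform' but stored as 'os_name', a hit key of
        # 'processed_crash.os_name' comes back as 'platform'.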

    def format_fields(self, hit):
        """Return a well formatted document.

        Elasticsearch returns values as lists when using the `fields` option.
        This function unwraps those lists when they contain zero or one element
        (the field becomes None or the single value, respectively).
        It also calls `format_field_names` to correct all the field names.
        """
        hit = self.format_field_names(hit)

        for field in hit:
            if isinstance(hit[field], (list, tuple)):
                if len(hit[field]) == 0:
                    hit[field] = None
                elif len(hit[field]) == 1:
                    hit[field] = hit[field][0]

        return hit

    def get_field_name(self, value, full=True):
        try:
            field_ = self.all_fields[value]
        except KeyError:
            raise BadArgumentError(
                value,
                msg='Unknown field "%s"' % value
            )

        if not field_['is_returned']:
            # Returning this field is not allowed.
            raise BadArgumentError(
                value,
                msg='Field "%s" is not allowed to be returned' % value
            )

        field_name = '%s.%s' % (
            field_['namespace'],
            field_['in_database_name']
        )

        if full and field_['has_full_version']:
            # If the param has a full version, that means what matters
            # is the full string, and not its individual terms.
            field_name += '.full'

        return field_name
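        # For illustration (hypothetical field): a field exposed as 'signature',
        # stored as 'signature' in the 'processed_crash' namespace and flagged
        # with has_full_version, yields 'processed_crash.signature.full'
        # (or 'processed_crash.signature' when full=False).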

    def format_aggregations(self, aggregations):
        """Return aggregations in a form that looks like facets.

        We used to expose the Elasticsearch facets directly. This is thus
        needed for backwards compatibility.
        """
        aggs = aggregations.to_dict()
        for agg in aggs:
            for i, bucket in enumerate(aggs[agg]['buckets']):
                sub_aggs = {}
                for key in bucket:
                    # Go through all sub aggregations. Those are contained in
                    # all the keys that are not 'key' or 'count'.
                    if key in ('key', 'key_as_string', 'doc_count'):
                        continue

                    sub_aggs[key] = [
                        {
                            # For date data, Elasticsearch exposes a timestamp
                            # in 'key' and a human-friendly string in
                            # 'key_as_string'. We thus check if the latter
                            # exists to expose it, and return the normal
                            # 'key' if not.
                            'term': x.get('key_as_string', x['key']),
                            'count': x['doc_count'],
                        }
                        for x in bucket[key]['buckets']
                    ]

                aggs[agg]['buckets'][i] = {
                    'term': bucket.get('key_as_string', bucket['key']),
                    'count': bucket['doc_count'],
                }

                if sub_aggs:
                    aggs[agg]['buckets'][i]['facets'] = sub_aggs

            aggs[agg] = aggs[agg]['buckets']

        return aggs
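        # For illustration (hypothetical data): an aggregation like
        #     {'product': {'buckets': [{'key': 'Firefox', 'doc_count': 42}]}}
        # comes out as
        #     {'product': [{'term': 'Firefox', 'count': 42}]}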

    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they can have only one value. For those
                    # parameters we know there is a single value, so we
                    # just extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, basestring) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value, full=False)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value)
                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=facets_size,
                )

        # Create signature aggregations.
        if params.get('_aggs.signature'):
            sig_bucket = A(
                'terms',
                field=self.get_field_name('signature'),
                size=facets_size,
            )
            for param in params['_aggs.signature']:
                for value in param.value:
                    if not value:
                        continue

                    if value.startswith('_histogram.'):
                        # This is a histogram aggregation we want to run,
                        # not a terms aggregation.
                        field_name = value[len('_histogram.'):]
                        if field_name not in self.histogram_fields:
                            continue

                        histogram_type = (
                            self.all_fields[field_name]['query_type'] == 'date'
                            and 'date_histogram' or 'histogram'
                        )
                        sig_bucket.bucket(
                            'histogram_%s' % field_name,
                            histogram_type,
                            field=self.get_field_name(field_name),
                            interval=histogram_intervals[field_name],
                        )
                    else:
                        sig_bucket.bucket(
                            value,
                            'terms',
                            field=self.get_field_name(value),
                            size=facets_size,
                        )

            search.aggs.bucket('signature', sig_bucket)

        # Create histograms.
        for f in self.histogram_fields:
            if params.get('_histogram.%s' % f):
                histogram_type = (
                    self.all_fields[f]['query_type'] == 'date'
                    and 'date_histogram' or 'histogram'
                )
                date_bucket = A(
                    histogram_type,
                    field=self.get_field_name(f),
                    interval=histogram_intervals[f],
                )
                for param in params['_histogram.%s' % f]:
                    for value in param.value:
                        if not value:
                            continue

                        field_name = self.get_field_name(value)
                        val_bucket = A(
                            'terms',
                            field=field_name,
                            size=facets_size,
                        )
                        date_bucket.bucket(value, val_bucket)

                search.aggs.bucket('histogram_%s' % f, date_bucket)

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError, e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break

        return {
            'hits': hits,
            'total': total,
            'facets': aggregations,
        }
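
To round off the listing, here is a hedged usage sketch of the service above. The parameter values are invented, `config` is assumed to be an already-built Socorro configuration object, and the field names ('product', 'platform') are only examples of what the super_search_fields service might expose.

# Illustrative call (hypothetical values); the accepted parameters live in the
# database and are described by the super_search_fields service, per the
# docstring of get() above.
api = SuperSearch(config=config)  # `config`: an existing Socorro configuration
results = api.get(
    product='Firefox',  # assumed field, for illustration only
    date=['>=2016-01-01', '<2016-01-08'],
    _columns=['signature', 'build_id', 'date'],
    _facets=['platform'],
    _results_offset=0,
    _results_number=100,
)
# `results` has the shape built at the end of get():
#   {'hits': [...], 'total': <int>, 'facets': {...}}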