class IntegrationTestSuperSearchFields(ElasticsearchTestCase):
    """Test SuperSearchFields with an elasticsearch database containing
    fake data.
    """

    def setUp(self):
        super(IntegrationTestSuperSearchFields, self).setUp()

        # Create the supersearch fields.
        self.index_super_search_fields()

        self.api = SuperSearchFields(config=self.config)

    def tearDown(self):
        # Clear the test indices.
        self.index_client.delete(
            self.config.elasticsearch.elasticsearch_default_index
        )

        super(IntegrationTestSuperSearchFields, self).tearDown()

    def test_get_fields(self):
        results = self.api.get_fields()
        eq_(results, SUPERSEARCH_FIELDS)

    def test_get_mapping(self):
        mapping = self.api.get_mapping()['mappings']
        doctype = self.config.elasticsearch.elasticsearch_doctype

        ok_(doctype in mapping)
        properties = mapping[doctype]['properties']

        ok_('processed_crash' in properties)
        ok_('raw_crash' in properties)

        # Check in_database_name is used.
        ok_('os_name' in properties['processed_crash']['properties'])
        ok_('platform' not in properties['processed_crash']['properties'])

        # Those fields have no `storage_mapping`.
        ok_('fake_field' not in properties['raw_crash']['properties'])

        # Test overwriting a field.
        mapping = self.api.get_mapping(overwrite_mapping={
            'name': 'fake_field',
            'storage_mapping': {
                'type': 'long'
            }
        })['mappings']
        properties = mapping[doctype]['properties']

        ok_('fake_field' in properties['raw_crash']['properties'])
        eq_(
            properties['raw_crash']['properties']['fake_field']['type'],
            'long'
        )
def __init__(self, *args, **kwargs):
    self.config = kwargs.get('config')
    self.es_context = self.config.elasticsearch.elasticsearch_class(
        self.config.elasticsearch
    )
    self.all_fields = SuperSearchFields(config=self.config).get_fields()

    # Create a map to associate a field's name in the database to its
    # exposed name (in the results and facets).
    self.database_name_to_field_name_map = dict(
        (x['in_database_name'], x['name'])
        for x in self.all_fields.values()
    )

    kwargs.update(fields=self.all_fields)
    super(SuperSearch, self).__init__(*args, **kwargs)
def create_index(self, index_name, mappings=None):
    """Create an index that will receive crash reports.

    :arg index_name: the name of the index to create
    :arg mappings: dict of doctype->ES mapping

    :returns: True if the index was created, False if it already existed

    """
    if mappings is None:
        mappings = SuperSearchFields(context=self).get_mapping()

    es_settings = self.get_socorro_index_settings(mappings)

    try:
        client = self.indices_client()
        client.create(index=index_name, body=es_settings)
        return True
    except elasticsearch.exceptions.RequestError as e:
        # If this index already exists, swallow the error.
        # NOTE: this is NOT what the error looks like in ES 2.x.
        if 'IndexAlreadyExistsException' not in str(e):
            raise
        return False
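# A minimal sketch, not part of the original code, of how the
# already-exists check above could tolerate both error spellings: ES 1.x
# reports "IndexAlreadyExistsException" while ES 2.x reports the
# snake_case "index_already_exists_exception". The helper name is
# hypothetical.
def _index_already_exists(error):
    message = str(error)
    return (
        'IndexAlreadyExistsException' in message or
        'index_already_exists_exception' in message
    )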
def setUp(self):
    super(IntegrationTestSuperSearchFields, self).setUp()

    # Create the supersearch fields.
    self.index_super_search_fields()

    self.api = SuperSearchFields(config=self.config)
def create_socorro_index(self, es_index, mappings=None):
    """Create an index that will receive crash reports. """
    if mappings is None:
        mappings = SuperSearchFields(config=self.config).get_mapping()

    es_settings = self.get_socorro_index_settings(mappings)

    self.create_index(es_index, es_settings)
def create_socorro_index(self, es_index, mappings=None):
    """Create an index that will receive crash reports. """
    if mappings is None:
        # Import at runtime to avoid dependency circle.
        from socorro.external.es.super_search_fields import (
            SuperSearchFields
        )
        mappings = SuperSearchFields(config=self.config).get_mapping()

    es_settings = self.get_socorro_index_settings(mappings)

    self.create_index(es_index, es_settings)
def create_socorro_index(self, es_index, mappings=None):
    """Create an index that will receive crash reports. """
    if mappings is None:
        mappings = SuperSearchFields(config=self.config).get_mapping()

    es_settings = self.get_socorro_index_settings(mappings)

    if self.config.elasticsearch.dry_run:
        print(json.dumps(es_settings, indent=2))
    else:
        self.create_index(es_index, es_settings)
def _get_all_fields(self):
    if (
        hasattr(self, '_all_fields') and
        hasattr(self, '_all_fields_timestamp')
    ):
        # we might have it cached
        age = time.time() - self._all_fields_timestamp
        if age < 60 * 60:
            # fresh enough
            return self._all_fields

    self._all_fields = SuperSearchFields(config=self.config).get()
    self._all_fields_timestamp = time.time()
    return self._all_fields
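# The method above hand-rolls a one-hour, per-instance cache. A minimal
# sketch, not from the original code, of the same pattern as a reusable
# decorator; the names are illustrative only and, like the method above,
# the cache is keyed purely by time (the wrapped callable takes no
# meaningful arguments).
import functools
import time


def cached_for(seconds):
    def decorator(func):
        state = {'value': None, 'timestamp': 0.0}

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            age = time.time() - state['timestamp']
            if state['value'] is None or age >= seconds:
                state['value'] = func(*args, **kwargs)
                state['timestamp'] = time.time()
            return state['value']
        return wrapper
    return decorator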
def create_socorro_index(self, index_name, mappings=None, log_result=False):
    """Create an index that will receive crash reports.

    Note: This function can get called in two contexts: when the
    processor is saving crash reports and also in the local dev
    environment scripts. The former wants to ignore index-existing
    errors quietly but the latter wants to log the result. Hence the
    fickle nature of this function.

    """
    if mappings is None:
        mappings = SuperSearchFields(context=self).get_mapping()

    es_settings = self.get_socorro_index_settings(mappings)

    self.create_index(index_name, es_settings, log_result)
def __init__(self, *args, **kwargs):
    self.config = kwargs.get('config')
    self.es_context = self.config.elasticsearch.elasticsearch_class(
        self.config.elasticsearch
    )
    self.all_fields = SuperSearchFields(config=self.config).get_fields()

    # Create a map to associate a field's name in the database to its
    # exposed name (in the results and facets).
    self.database_name_to_field_name_map = dict(
        (x['in_database_name'], x['name'])
        for x in self.all_fields.values()
    )

    kwargs.update(fields=self.all_fields)
    super(SuperSearch, self).__init__(
        *args,
        **kwargs
    )
def create_socorro_index(self, es_index):
    """Create an index that will receive crash reports. """
    es_settings = SuperSearchFields(config=self.config).get_mapping()

    self.create_index(es_index, es_settings)
class IntegrationTestSuperSearchFields(ElasticsearchTestCase): """Test SuperSearchFields with an elasticsearch database containing fake data. """ def setUp(self): super(IntegrationTestSuperSearchFields, self).setUp() self.api = SuperSearchFields(config=self.config) def test_get_fields(self): results = self.api.get_fields() eq_(results, SUPERSEARCH_FIELDS) def test_create_field(self): # Test with all parameters set. params = { 'name': 'plotfarm', 'data_validation_type': 'str', 'default_value': None, 'description': 'a plotfarm like Lunix or Wondiws', 'form_field_choices': ['lun', 'won', 'cam'], 'has_full_version': True, 'in_database_name': 'os_name', 'is_exposed': True, 'is_returned': True, 'is_mandatory': False, 'query_type': 'str', 'namespace': 'processed_crash', 'permissions_needed': ['view_plotfarm'], 'storage_mapping': {"type": "multi_field"}, } res = self.api.create_field(**params) ok_(res) field = self.connection.get( index=self.config.webapi.elasticsearch_default_index, doc_type='supersearch_fields', id='plotfarm', ) field = field['_source'] eq_(sorted(field.keys()), sorted(params.keys())) for key in field.keys(): eq_(field[key], params[key]) # Test default values. res = self.api.create_field( name='brand_new_field', in_database_name='brand_new_field', namespace='processed_crash', ) ok_(res) ok_( self.connection.get( index=self.config.webapi.elasticsearch_default_index, doc_type='supersearch_fields', id='brand_new_field', ) ) # Test errors. # `name` is missing. assert_raises( MissingArgumentError, self.api.create_field, in_database_name='something', ) # `in_database_name` is missing. assert_raises( MissingArgumentError, self.api.create_field, name='something', ) # Field already exists. assert_raises( BadArgumentError, self.api.create_field, name='product', in_database_name='product', namespace='processed_crash', ) # Test logging. res = self.api.create_field( name='what_a_field', in_database_name='what_a_field', namespace='processed_crash', storage_mapping='{"type": "long"}', ) ok_(res) self.api.config.logger.info.assert_called_with( 'elasticsearch mapping changed for field "%s", ' 'added new mapping "%s"', 'what_a_field', {u'type': u'long'}, ) def test_update_field(self): # Let's create a field first. assert self.api.create_field( name='super_field', in_database_name='super_field', namespace='superspace', description='inaccurate description', permissions_needed=['view_nothing'], storage_mapping={'type': 'boolean', 'null_value': False} ) # Now let's update that field a little. res = self.api.update_field( name='super_field', description='very accurate description', storage_mapping={'type': 'long', 'analyzer': 'keyword'}, ) ok_(res) # Test logging. self.api.config.logger.info.assert_called_with( 'elasticsearch mapping changed for field "%s", ' 'was "%s", now "%s"', 'super_field', {'type': 'boolean', 'null_value': False}, {'type': 'long', 'analyzer': 'keyword'}, ) field = self.connection.get( index=self.config.elasticsearch.elasticsearch_default_index, doc_type='supersearch_fields', id='super_field', ) field = field['_source'] # Verify the changes were taken into account. eq_(field['description'], 'very accurate description') eq_(field['storage_mapping'], {'type': 'long', 'analyzer': 'keyword'}) # Verify other values did not change. eq_(field['permissions_needed'], ['view_nothing']) eq_(field['in_database_name'], 'super_field') eq_(field['namespace'], 'superspace') # Test errors. 
assert_raises( MissingArgumentError, self.api.update_field, ) # `name` is missing assert_raises( ResourceNotFound, self.api.update_field, name='unkownfield', ) def test_delete_field(self): self.api.delete_field(name='product') ok_( self.connection.get( index=self.config.elasticsearch.elasticsearch_default_index, doc_type='supersearch_fields', id='signature', ) ) assert_raises( elasticsearch.exceptions.NotFoundError, self.connection.get, index=self.config.elasticsearch.elasticsearch_default_index, doc_type='supersearch_fields', id='product', ) @minimum_es_version('1.0') def test_get_missing_fields(self): config = self.get_mware_config( es_index='socorro_integration_test_%W' ) fake_mappings = [ { 'mappings': { config.elasticsearch.elasticsearch_doctype: { 'properties': { # Add a bunch of unknown fields. 'field_z': { 'type': 'string' }, 'namespace1': { 'type': 'object', 'properties': { 'field_a': { 'type': 'string' }, 'field_b': { 'type': 'long' } } }, 'namespace2': { 'type': 'object', 'properties': { 'subspace1': { 'type': 'object', 'properties': { 'field_b': { 'type': 'long' } } } } }, # Add a few known fields that should not appear. 'processed_crash': { 'type': 'object', 'properties': { 'signature': { 'type': 'string' }, 'product': { 'type': 'string' }, } } } } } }, { 'mappings': { config.elasticsearch.elasticsearch_doctype: { 'properties': { 'namespace1': { 'type': 'object', 'properties': { 'subspace1': { 'type': 'object', 'properties': { 'field_d': { 'type': 'long' } } } } } } } } }, ] now = datetimeutil.utc_now() indices = [] try: # Using "2" here means that an index will be missing, hence testing # that it swallows the subsequent error. for i in range(2): date = now - datetime.timedelta(weeks=i) index = date.strftime(config.elasticsearch.elasticsearch_index) mapping = fake_mappings[i % len(fake_mappings)] self.index_creator.create_index(index, mapping) indices.append(index) api = SuperSearchFields(config=config) missing_fields = api.get_missing_fields() expected = [ 'field_z', 'namespace1.field_a', 'namespace1.field_b', 'namespace1.subspace1.field_d', 'namespace2.subspace1.field_b', ] eq_(missing_fields['hits'], expected) eq_(missing_fields['total'], 5) finally: for index in indices: self.index_client.delete(index=index) def test_get_mapping(self): mapping = self.api.get_mapping()['mappings'] doctype = self.config.elasticsearch.elasticsearch_doctype ok_(doctype in mapping) properties = mapping[doctype]['properties'] ok_('processed_crash' in properties) ok_('raw_crash' in properties) processed_crash = properties['processed_crash']['properties'] # Check in_database_name is used. ok_('os_name' in processed_crash) ok_('platform' not in processed_crash) # Those fields have no `storage_mapping`. ok_('fake_field' not in properties['raw_crash']['properties']) # Those fields have a `storage_mapping`. eq_(processed_crash['release_channel'], {'type': 'string'}) # Test nested objects. ok_('json_dump' in processed_crash) ok_('properties' in processed_crash['json_dump']) ok_('write_combine_size' in processed_crash['json_dump']['properties']) eq_( processed_crash['json_dump']['properties']['write_combine_size'], {'type': 'long'} ) # Test overwriting a field. mapping = self.api.get_mapping(overwrite_mapping={ 'name': 'fake_field', 'storage_mapping': { 'type': 'long' } })['mappings'] properties = mapping[doctype]['properties'] ok_('fake_field' in properties['raw_crash']['properties']) eq_( properties['raw_crash']['properties']['fake_field']['type'], 'long' ) def test_test_mapping(self): """Much test. 
So meta. Wow test_test_. """ # First test a valid mapping. mapping = self.api.get_mapping() ok_(self.api.test_mapping(mapping) is None) # Insert an invalid storage mapping. mapping = self.api.get_mapping({ 'name': 'fake_field', 'storage_mapping': { 'type': 'unkwown' } }) assert_raises( BadArgumentError, self.api.test_mapping, mapping, ) # Test with a correct mapping but with data that cannot be indexed. self.index_crash({ 'date_processed': datetimeutil.utc_now(), 'product': 'WaterWolf', }) self.refresh_index() mapping = self.api.get_mapping({ 'name': 'product', 'storage_mapping': { 'type': 'long' } }) assert_raises( BadArgumentError, self.api.test_mapping, mapping, )
def setUp(self):
    super(IntegrationTestSuperSearchFields, self).setUp()

    self.api = SuperSearchFields(config=self.config)
    self.api.get_fields = lambda: copy.deepcopy(SUPERSEARCH_FIELDS)
def test_get_missing_fields(self): config = self.get_base_config(es_index='socorro_integration_test_%W') fake_mappings = [ { 'mappings': { config.elasticsearch.elasticsearch_doctype: { 'properties': { # Add a bunch of unknown fields. 'field_z': { 'type': 'string' }, 'namespace1': { 'type': 'object', 'properties': { 'field_a': { 'type': 'string' }, 'field_b': { 'type': 'long' } } }, 'namespace2': { 'type': 'object', 'properties': { 'subspace1': { 'type': 'object', 'properties': { 'field_b': { 'type': 'long' } } } } }, # Add a few known fields that should not appear. 'processed_crash': { 'type': 'object', 'properties': { 'signature': { 'type': 'string' }, 'product': { 'type': 'string' }, } } } } } }, { 'mappings': { config.elasticsearch.elasticsearch_doctype: { 'properties': { 'namespace1': { 'type': 'object', 'properties': { 'subspace1': { 'type': 'object', 'properties': { 'field_d': { 'type': 'long' } } } } } } } } }, ] now = datetimeutil.utc_now() indices = [] try: # Using "2" here means that an index will be missing, hence testing # that it swallows the subsequent error. for i in range(2): date = now - datetime.timedelta(weeks=i) index = date.strftime(config.elasticsearch.elasticsearch_index) mapping = fake_mappings[i % len(fake_mappings)] self.index_creator.create_index(index, mapping) indices.append(index) api = SuperSearchFields(config=config) missing_fields = api.get_missing_fields() expected = [ 'field_z', 'namespace1.field_a', 'namespace1.field_b', 'namespace1.subspace1.field_d', 'namespace2.subspace1.field_b', ] assert missing_fields['hits'] == expected assert missing_fields['total'] == 5 finally: for index in indices: self.index_client.delete(index=index)
class IntegrationTestSuperSearchFields(ElasticsearchTestCase): """Test SuperSearchFields with an elasticsearch database containing fake data. """ def setUp(self): super(IntegrationTestSuperSearchFields, self).setUp() self.api = SuperSearchFields(config=self.config) def test_get_fields(self): results = self.api.get_fields() eq_(results, SUPERSEARCH_FIELDS) def test_create_field(self): # Test with all parameters set. params = { "name": "plotfarm", "data_validation_type": "str", "default_value": None, "description": "a plotfarm like Lunix or Wondiws", "form_field_choices": ["lun", "won", "cam"], "has_full_version": True, "in_database_name": "os_name", "is_exposed": True, "is_returned": True, "is_mandatory": False, "query_type": "str", "namespace": "processed_crash", "permissions_needed": ["view_plotfarm"], "storage_mapping": {"type": "multi_field"}, } res = self.api.create_field(**params) ok_(res) field = self.connection.get( index=self.config.webapi.elasticsearch_default_index, doc_type="supersearch_fields", id="plotfarm" ) field = field["_source"] eq_(sorted(field.keys()), sorted(params.keys())) for key in field.keys(): eq_(field[key], params[key]) # Test default values. res = self.api.create_field( name="brand_new_field", in_database_name="brand_new_field", namespace="processed_crash" ) ok_(res) ok_( self.connection.get( index=self.config.webapi.elasticsearch_default_index, doc_type="supersearch_fields", id="brand_new_field", ) ) # Test errors. # `name` is missing. assert_raises(MissingArgumentError, self.api.create_field, in_database_name="something") # `in_database_name` is missing. assert_raises(MissingArgumentError, self.api.create_field, name="something") # Field already exists. assert_raises( InsertionError, self.api.create_field, name="product", in_database_name="product", namespace="processed_crash", ) # Test logging. res = self.api.create_field( name="what_a_field", in_database_name="what_a_field", namespace="processed_crash", storage_mapping='{"type": "long"}', ) ok_(res) self.api.config.logger.info.assert_called_with( 'elasticsearch mapping changed for field "%s", ' 'added new mapping "%s"', "what_a_field", {u"type": u"long"}, ) def test_update_field(self): # Let's create a field first. assert self.api.create_field( name="super_field", in_database_name="super_field", namespace="superspace", description="inaccurate description", permissions_needed=["view_nothing"], storage_mapping={"type": "boolean", "null_value": False}, ) # Now let's update that field a little. res = self.api.update_field( name="super_field", description="very accurate description", storage_mapping={"type": "long", "analyzer": "keyword"}, ) ok_(res) # Test logging. self.api.config.logger.info.assert_called_with( 'elasticsearch mapping changed for field "%s", ' 'was "%s", now "%s"', "super_field", {"type": "boolean", "null_value": False}, {"type": "long", "analyzer": "keyword"}, ) field = self.connection.get( index=self.config.elasticsearch.elasticsearch_default_index, doc_type="supersearch_fields", id="super_field" ) field = field["_source"] # Verify the changes were taken into account. eq_(field["description"], "very accurate description") eq_(field["storage_mapping"], {"type": "long", "analyzer": "keyword"}) # Verify other values did not change. eq_(field["permissions_needed"], ["view_nothing"]) eq_(field["in_database_name"], "super_field") eq_(field["namespace"], "superspace") # Test errors. 
assert_raises(MissingArgumentError, self.api.update_field) # `name` is missing assert_raises(ResourceNotFound, self.api.update_field, name="unkownfield") def test_delete_field(self): self.api.delete_field(name="product") ok_( self.connection.get( index=self.config.elasticsearch.elasticsearch_default_index, doc_type="supersearch_fields", id="signature", ) ) assert_raises( elasticsearch.exceptions.NotFoundError, self.connection.get, index=self.config.elasticsearch.elasticsearch_default_index, doc_type="supersearch_fields", id="product", ) @minimum_es_version("1.0") def test_get_missing_fields(self): config = self.get_mware_config(es_index="socorro_integration_test_%W") fake_mappings = [ { "mappings": { config.elasticsearch.elasticsearch_doctype: { "properties": { # Add a bunch of unknown fields. "field_z": {"type": "string"}, "namespace1": { "type": "object", "properties": {"field_a": {"type": "string"}, "field_b": {"type": "long"}}, }, "namespace2": { "type": "object", "properties": { "subspace1": {"type": "object", "properties": {"field_b": {"type": "long"}}} }, }, # Add a few known fields that should not appear. "processed_crash": { "type": "object", "properties": {"signature": {"type": "string"}, "product": {"type": "string"}}, }, } } } }, { "mappings": { config.elasticsearch.elasticsearch_doctype: { "properties": { "namespace1": { "type": "object", "properties": { "subspace1": {"type": "object", "properties": {"field_d": {"type": "long"}}} }, } } } } }, ] now = datetimeutil.utc_now() indices = [] try: # Using "2" here means that an index will be missing, hence testing # that it swallows the subsequent error. for i in range(2): date = now - datetime.timedelta(weeks=i) index = date.strftime(config.elasticsearch.elasticsearch_index) mapping = fake_mappings[i % len(fake_mappings)] self.index_creator.create_index(index, mapping) indices.append(index) api = SuperSearchFields(config=config) missing_fields = api.get_missing_fields() expected = [ "field_z", "namespace1.field_a", "namespace1.field_b", "namespace1.subspace1.field_d", "namespace2.subspace1.field_b", ] eq_(missing_fields["hits"], expected) eq_(missing_fields["total"], 5) finally: for index in indices: self.index_client.delete(index=index) def test_get_mapping(self): mapping = self.api.get_mapping()["mappings"] doctype = self.config.elasticsearch.elasticsearch_doctype ok_(doctype in mapping) properties = mapping[doctype]["properties"] ok_("processed_crash" in properties) ok_("raw_crash" in properties) processed_crash = properties["processed_crash"]["properties"] # Check in_database_name is used. ok_("os_name" in processed_crash) ok_("platform" not in processed_crash) # Those fields have no `storage_mapping`. ok_("fake_field" not in properties["raw_crash"]["properties"]) # Those fields have a `storage_mapping`. eq_(processed_crash["release_channel"], {"type": "string"}) # Test nested objects. ok_("json_dump" in processed_crash) ok_("properties" in processed_crash["json_dump"]) ok_("write_combine_size" in processed_crash["json_dump"]["properties"]) eq_(processed_crash["json_dump"]["properties"]["write_combine_size"], {"type": "long"}) # Test overwriting a field. mapping = self.api.get_mapping(overwrite_mapping={"name": "fake_field", "storage_mapping": {"type": "long"}})[ "mappings" ] properties = mapping[doctype]["properties"] ok_("fake_field" in properties["raw_crash"]["properties"]) eq_(properties["raw_crash"]["properties"]["fake_field"]["type"], "long") def test_test_mapping(self): """Much test. So meta. Wow test_test_. 
""" # First test a valid mapping. mapping = self.api.get_mapping() ok_(self.api.test_mapping(mapping) is None) # Insert an invalid storage mapping. mapping = self.api.get_mapping({"name": "fake_field", "storage_mapping": {"type": "unkwown"}}) assert_raises(elasticsearch.exceptions.RequestError, self.api.test_mapping, mapping) # Test with a correct mapping but with data that cannot be indexed. self.index_crash({"date_processed": datetimeutil.utc_now(), "product": "WaterWolf"}) self.refresh_index() mapping = self.api.get_mapping({"name": "product", "storage_mapping": {"type": "long"}}) # self.api.test_mapping(mapping) assert_raises(elasticsearch.exceptions.RequestError, self.api.test_mapping, mapping)
class IntegrationTestSuperSearchFields(ElasticsearchTestCase): """Test SuperSearchFields with an elasticsearch database containing fake data. """ def setUp(self): super(IntegrationTestSuperSearchFields, self).setUp() self.api = SuperSearchFields(config=self.config) self.api.get_fields = lambda: copy.deepcopy(SUPERSEARCH_FIELDS) def test_get_fields(self): results = self.api.get_fields() assert results == SUPERSEARCH_FIELDS @minimum_es_version('1.0') def test_get_missing_fields(self): config = self.get_base_config(es_index='socorro_integration_test_%W') fake_mappings = [ { 'mappings': { config.elasticsearch.elasticsearch_doctype: { 'properties': { # Add a bunch of unknown fields. 'field_z': { 'type': 'string' }, 'namespace1': { 'type': 'object', 'properties': { 'field_a': { 'type': 'string' }, 'field_b': { 'type': 'long' } } }, 'namespace2': { 'type': 'object', 'properties': { 'subspace1': { 'type': 'object', 'properties': { 'field_b': { 'type': 'long' } } } } }, # Add a few known fields that should not appear. 'processed_crash': { 'type': 'object', 'properties': { 'signature': { 'type': 'string' }, 'product': { 'type': 'string' }, } } } } } }, { 'mappings': { config.elasticsearch.elasticsearch_doctype: { 'properties': { 'namespace1': { 'type': 'object', 'properties': { 'subspace1': { 'type': 'object', 'properties': { 'field_d': { 'type': 'long' } } } } } } } } }, ] now = datetimeutil.utc_now() indices = [] try: # Using "2" here means that an index will be missing, hence testing # that it swallows the subsequent error. for i in range(2): date = now - datetime.timedelta(weeks=i) index = date.strftime(config.elasticsearch.elasticsearch_index) mapping = fake_mappings[i % len(fake_mappings)] self.index_creator.create_index(index, mapping) indices.append(index) api = SuperSearchFields(config=config) missing_fields = api.get_missing_fields() expected = [ 'field_z', 'namespace1.field_a', 'namespace1.field_b', 'namespace1.subspace1.field_d', 'namespace2.subspace1.field_b', ] assert missing_fields['hits'] == expected assert missing_fields['total'] == 5 finally: for index in indices: self.index_client.delete(index=index) def test_get_mapping(self): mapping = self.api.get_mapping() doctype = self.config.elasticsearch.elasticsearch_doctype assert doctype in mapping properties = mapping[doctype]['properties'] assert 'processed_crash' in properties assert 'raw_crash' in properties processed_crash = properties['processed_crash']['properties'] # Check in_database_name is used. assert 'os_name' in processed_crash assert 'platform' not in processed_crash # Those fields have no `storage_mapping`. assert 'fake_field' not in properties['raw_crash']['properties'] # Those fields have a `storage_mapping`. assert processed_crash['release_channel'] == {'type': 'string'} # Test nested objects. assert 'json_dump' in processed_crash assert 'properties' in processed_crash['json_dump'] assert 'write_combine_size' in processed_crash['json_dump'][ 'properties'] assert processed_crash['json_dump']['properties'][ 'write_combine_size'] == { 'type': 'long' } # Test overwriting a field. mapping = self.api.get_mapping(overwrite_mapping={ 'name': 'fake_field', 'storage_mapping': { 'type': 'long' } }) properties = mapping[doctype]['properties'] assert 'fake_field' in properties['raw_crash']['properties'] assert properties['raw_crash']['properties']['fake_field'][ 'type'] == 'long' def test_test_mapping(self): """Much test. So meta. Wow test_test_. """ # First test a valid mapping. 
mapping = self.api.get_mapping() assert self.api.test_mapping(mapping) is None # Insert an invalid storage mapping. mapping = self.api.get_mapping({ 'name': 'fake_field', 'storage_mapping': { 'type': 'unkwown' } }) with pytest.raises(BadArgumentError): self.api.test_mapping(mapping) # Test with a correct mapping but with data that cannot be indexed. self.index_crash({ 'date_processed': datetimeutil.utc_now(), 'product': 'WaterWolf', }) self.refresh_index() mapping = self.api.get_mapping({ 'name': 'product', 'storage_mapping': { 'type': 'long' } }) with pytest.raises(BadArgumentError): self.api.test_mapping(mapping)
def delete_field(self, **kwargs):
    return SuperSearchFields(config=self.config).delete_field(**kwargs)
def get(self, **kwargs):
    kwargs['_fields'] = SuperSearchFields(config=self.config).get_fields()
    return super(SuperSearchWithFields, self).get(**kwargs)
def setUp(self):
    super(IntegrationTestSuperSearchFields, self).setUp()

    self.api = SuperSearchFields(config=self.config)
    self.api.get_fields = lambda: copy.deepcopy(FIELDS)
def get_fields(self, **kwargs):
    return SuperSearchFields(config=self.config).get_fields(**kwargs)
def setUp(self):
    super(IntegrationTestSuperSearchFields, self).setUp()

    self.api = SuperSearchFields(config=self.config)
def test_index_crash_mapping_keys(self):
    """Test indexing a crash that has keys not in the mapping

    Indexing a crash that has keys that aren't in the mapping for the
    index should cause those keys to be removed from the crash.

    """
    # The test harness creates an index for this week and last week. So
    # let's create one for 4 weeks ago.
    now = utc_now()
    four_weeks_ago = now - timedelta(days=28)

    field = "user_comments"

    # We're going to use a mapping that's what SuperSearchFields gives us,
    # but remove the user_comments field.
    mappings = SuperSearchFields(context=self.es_context).get_mapping()
    doctype = self.es_context.get_doctype()
    del mappings[doctype]["properties"]["processed_crash"]["properties"][field]

    # Create the index for 4 weeks ago
    self.es_context.create_index(
        index_name=self.es_context.get_index_for_date(four_weeks_ago),
        mappings=mappings,
    )

    es_storage = ESCrashStorage(config=self.config)

    # Create a crash for this week and save it
    now_uuid = "00000000-0000-0000-0000-000000120408"
    raw_crash = {
        "BuildID": "20200506000000",
    }
    processed_crash = {
        field: "this week",
        "date_processed": date_to_string(now),
        "uuid": now_uuid,
    }

    es_storage.save_processed_crash(
        raw_crash=raw_crash,
        processed_crash=processed_crash,
    )

    # Create a crash for four weeks ago with the bum mapping and save it
    old_uuid = "11111111-1111-1111-1111-111111120408"
    raw_crash = {
        "BuildID": "20200506000000",
    }
    processed_crash = {
        field: "this week",
        "date_processed": date_to_string(now - timedelta(days=28)),
        "uuid": old_uuid,
    }

    es_storage.save_processed_crash(
        raw_crash=raw_crash,
        processed_crash=processed_crash,
    )

    self.es_context.refresh()

    # Retrieve the document from this week and verify it has the
    # user_comments field
    doc = self.conn.get(
        index=self.es_context.get_index_for_date(now),
        id=now_uuid,
    )
    assert field in doc["_source"]["processed_crash"]

    # Retrieve the document from four weeks ago and verify it doesn't
    # have the user_comments field
    doc = self.conn.get(
        index=self.es_context.get_index_for_date(four_weeks_ago),
        id=old_uuid,
    )
    assert field not in doc["_source"]["processed_crash"]
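# Illustration, not part of the original test or storage code: the
# behavior exercised above (keys absent from the index mapping are
# dropped from the saved crash) can be pictured with a small helper
# like this hypothetical one.
def strip_keys_not_in_mapping(document, mapping_properties):
    """Return a copy of document keeping only keys present in the
    mapping's properties."""
    return {
        key: value
        for key, value in document.items()
        if key in mapping_properties
    }


# Example: 'user_comments' is removed when the mapping lacks it.
assert strip_keys_not_in_mapping(
    {'user_comments': 'hi', 'product': 'WaterWolf'},
    {'product': {'type': 'keyword'}},
) == {'product': 'WaterWolf'}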
class SuperSearch(SearchBase): def __init__(self, *args, **kwargs): self.config = kwargs.get('config') self.es_context = self.config.elasticsearch.elasticsearch_class( self.config.elasticsearch) self.all_fields = SuperSearchFields(config=self.config).get_fields() # Create a map to associate a field's name in the database to its # exposed name (in the results and facets). self.database_name_to_field_name_map = dict( (x['in_database_name'], x['name']) for x in self.all_fields.values()) kwargs.update(fields=self.all_fields) super(SuperSearch, self).__init__(*args, **kwargs) def get_connection(self): with self.es_context() as conn: return conn def generate_list_of_indices(self, from_date, to_date, es_index=None): """Return the list of indices to query to access all the crash reports that were processed between from_date and to_date. The naming pattern for indices in elasticsearch is configurable, it is possible to have an index per day, per week, per month... Parameters: * from_date datetime object * to_date datetime object """ if es_index is None: es_index = self.config.elasticsearch_index indices = [] current_date = from_date while current_date <= to_date: index = current_date.strftime(es_index) # Make sure no index is twice in the list # (for weekly or monthly indices for example) if index not in indices: indices.append(index) current_date += datetime.timedelta(days=1) return indices def get_indices(self, dates): """Return the list of indices to use for given dates. """ start_date = None end_date = None for date in dates: if '>' in date.operator: start_date = date.value if '<' in date.operator: end_date = date.value return self.generate_list_of_indices(start_date, end_date) def format_field_names(self, hit): """Return a hit with each field's database name replaced by its exposed name. """ new_hit = {} for field in hit: new_field = field if '.' in new_field: # Remove the prefix ("processed_crash." or "raw_crash."). new_field = new_field.split('.')[-1] new_field = self.database_name_to_field_name_map.get( new_field, new_field) new_hit[new_field] = hit[field] return new_hit def format_fields(self, hit): """Return a well formatted document. Elasticsearch returns values as lists when using the `fields` option. This function removes the list when it contains zero or one element. It also calls `format_field_names` to correct all the field names. """ hit = self.format_field_names(hit) for field in hit: if isinstance(hit[field], (list, tuple)): if len(hit[field]) == 0: hit[field] = None elif len(hit[field]) == 1: hit[field] = hit[field][0] return hit def format_aggregations(self, aggregations): """Return aggregations in a form that looks like facets. We used to expose the Elasticsearch facets directly. This is thus needed for backwards compatibility. """ aggs = aggregations.to_dict() for agg in aggs: for i, row in enumerate(aggs[agg]['buckets']): aggs[agg]['buckets'][i] = { 'term': row['key'], 'count': row['doc_count'], } aggs[agg] = aggs[agg]['buckets'] return aggs def get(self, **kwargs): """Return a list of results and aggregations based on parameters. The list of accepted parameters (with types and default values) is in the database and can be accessed with the super_search_fields service. """ # Filter parameters and raise potential errors. params = self.get_parameters(**kwargs) # Find the indices to use to optimize the elasticsearch query. indices = self.get_indices(params['date']) # Create and configure the search object. 
search = Search( using=self.get_connection(), index=indices, doc_type=self.config.elasticsearch.elasticsearch_doctype, ) # Create filters. filters = None for field, sub_params in params.items(): sub_filters = None for param in sub_params: if param.name.startswith('_'): if param.name == '_results_offset': results_from = param.value[0] elif param.name == '_results_number': results_number = param.value[0] # Don't use meta parameters in the query. continue field_data = self.all_fields[param.name] name = '%s.%s' % (field_data['namespace'], field_data['in_database_name']) if param.data_type in ('date', 'datetime'): param.value = datetimeutil.date_to_string(param.value) elif param.data_type == 'enum': param.value = [x.lower() for x in param.value] elif param.data_type == 'str' and not param.operator: param.value = [x.lower() for x in param.value] args = {} filter_type = 'term' filter_value = None if not param.operator: # contains one of the terms if len(param.value) == 1: val = param.value[0] if not isinstance(val, basestring) or (isinstance( val, basestring) and ' ' not in val): filter_value = val # If the term contains white spaces, we want to perform # a phrase query. Thus we do nothing here and let this # value be handled later. else: filter_type = 'terms' filter_value = param.value elif param.operator == '=': # is exactly if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator == '>': # greater than filter_type = 'range' filter_value = {'gt': param.value} elif param.operator == '<': # lower than filter_type = 'range' filter_value = {'lt': param.value} elif param.operator == '>=': # greater than or equal to filter_type = 'range' filter_value = {'gte': param.value} elif param.operator == '<=': # lower than or equal to filter_type = 'range' filter_value = {'lte': param.value} elif param.operator == '__null__': # is null filter_type = 'missing' args['field'] = name if filter_value is not None: args[name] = filter_value if args: if param.operator_not: new_filter = ~F(filter_type, **args) else: new_filter = F(filter_type, **args) if sub_filters is None: sub_filters = new_filter elif param.data_type == 'enum': sub_filters |= new_filter else: sub_filters &= new_filter continue # These use a wildcard and thus need to be in a query # instead of a filter. operator_wildcards = { '~': '*%s*', # contains '$': '%s*', # starts with '^': '*%s' # ends with } if param.operator in operator_wildcards: if field_data['has_full_version']: name = '%s.full' % name query_type = 'wildcard' args[name] = (operator_wildcards[param.operator] % param.value) elif not param.operator: # This is a phrase that was passed down. query_type = 'simple_query_string' args['query'] = param.value[0] args['fields'] = [name] args['default_operator'] = 'and' if args: query = Q(query_type, **args) if param.operator_not: query = ~query search = search.query(query) else: # If we reach this point, that means the operator is # not supported, and we should raise an error about that. raise NotImplementedError('Operator %s is not supported' % param.operator) if filters is None: filters = sub_filters elif sub_filters is not None: filters &= sub_filters search = search.filter(filters) # Pagination. results_to = results_from + results_number search = search[results_from:results_to] # Create facets. for param in params['_facets']: for value in param.value: try: field_ = self.all_fields[value] except KeyError: # That is not a known field, we can't facet on it. 
raise BadArgumentError( value, msg='Unknown field "%s", cannot facet on it' % value) field_name = '%s.%s' % (field_['namespace'], field_['in_database_name']) if field_['has_full_version']: # If the param has a full version, that means what matters # is the full string, and not its individual terms. field_name += '.full' search.aggs.bucket(value, 'terms', field=field_name, size=self.config.facets_max_number) # Query and compute results. hits = [] fields = [ '%s.%s' % (x['namespace'], x['in_database_name']) for x in self.all_fields.values() if x['is_returned'] ] search = search.fields(*fields) if params['_return_query'][0].value[0]: # Return only the JSON query that would be sent to elasticsearch. return { 'query': search.to_dict(), 'indices': indices, } # We call elasticsearch with a computed list of indices, based on # the date range. However, if that list contains indices that do not # exist in elasticsearch, an error will be raised. We thus want to # remove all failing indices until we either have a valid list, or # an empty list in which case we return no result. while True: try: results = search.execute() for hit in results: hits.append(self.format_fields(hit.to_dict())) total = search.count() aggregations = self.format_aggregations(results.aggregations) break # Yay! Results! except NotFoundError, e: missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0] if missing_index in indices: del indices[indices.index(missing_index)] else: # Wait what? An error caused by an index that was not # in the request? That should never happen, but in case # it does, better know it. raise if indices: # Update the list of indices and try again. # Note: we need to first empty the list of indices before # updating it, otherwise the removed indices never get # actually removed. search = search.index().index(*indices) else: # There is no index left in the list, return an empty # result. hits = [] total = 0 aggregations = {} break return { 'hits': hits, 'total': total, 'facets': aggregations, }
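# Illustration, not from the original code, of the facet-style
# conversion performed by format_aggregations above: each elasticsearch
# terms aggregation's buckets become {'term', 'count'} rows. The sample
# data is invented for the example.
example_aggs = {
    'product': {
        'buckets': [
            {'key': 'WaterWolf', 'doc_count': 5},
            {'key': 'NightTrain', 'doc_count': 2},
        ]
    }
}
for agg in example_aggs:
    example_aggs[agg] = [
        {'term': row['key'], 'count': row['doc_count']}
        for row in example_aggs[agg]['buckets']
    ]
assert example_aggs == {
    'product': [
        {'term': 'WaterWolf', 'count': 5},
        {'term': 'NightTrain', 'count': 2},
    ]
}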
class IntegrationTestSuperSearchFields(ElasticsearchTestCase): """Test SuperSearchFields with an elasticsearch database containing fake data. """ def setUp(self): super(IntegrationTestSuperSearchFields, self).setUp() self.api = SuperSearchFields(config=self.config) def test_get_fields(self): results = self.api.get_fields() eq_(results, SUPERSEARCH_FIELDS) def test_create_field(self): # Test with all parameters set. params = { 'name': 'plotfarm', 'data_validation_type': 'str', 'default_value': None, 'description': 'a plotfarm like Lunix or Wondiws', 'form_field_choices': ['lun', 'won', 'cam'], 'has_full_version': False, 'in_database_name': 'os_name', 'is_exposed': True, 'is_returned': True, 'is_mandatory': False, 'query_type': 'str', 'namespace': 'processed_crash', 'permissions_needed': ['view_plotfarm'], 'storage_mapping': { "type": "keyword" }, } res = self.api.create_field(**params) ok_(res) field = self.connection.get( index=self.config.elasticsearch.elasticsearch_default_index, doc_type='supersearch_fields', id='plotfarm', ) field = field['_source'] eq_(sorted(field.keys()), sorted(params.keys())) for key in field.keys(): eq_(field[key], params[key]) # Test default values. res = self.api.create_field( name='brand_new_field', in_database_name='brand_new_field', namespace='processed_crash', ) ok_(res) ok_( self.connection.get( index=self.config.elasticsearch.elasticsearch_default_index, doc_type='supersearch_fields', id='brand_new_field', )) # Test errors. # `name` is missing. assert_raises( MissingArgumentError, self.api.create_field, in_database_name='something', ) # `in_database_name` is missing. assert_raises( MissingArgumentError, self.api.create_field, name='something', ) # Field already exists. assert_raises( BadArgumentError, self.api.create_field, name='product', in_database_name='product', namespace='processed_crash', ) # Test logging. res = self.api.create_field( name='what_a_field', in_database_name='what_a_field', namespace='processed_crash', storage_mapping='{"type": "long"}', ) ok_(res) self.api.config.logger.info.assert_called_with( 'elasticsearch mapping changed for field "%s", ' 'added new mapping "%s"', 'what_a_field', {u'type': u'long'}, ) def test_update_field(self): # Let's create a field first. assert self.api.create_field(name='super_field', in_database_name='super_field', namespace='superspace', description='inaccurate description', permissions_needed=['view_nothing'], storage_mapping={ 'type': 'boolean', 'null_value': False }) # Now let's update that field a little. res = self.api.update_field( name='super_field', description='very accurate description', storage_mapping={'type': 'long'}, ) ok_(res) # Test logging. self.api.config.logger.info.assert_called_with( 'Elasticsearch mapping changed for field "%s", ' 'was "%s", now "%s"', 'super_field', { 'type': 'boolean', 'null_value': False }, {'type': 'long'}, ) field = self.connection.get( index=self.config.elasticsearch.elasticsearch_default_index, doc_type='supersearch_fields', id='super_field', ) field = field['_source'] # Verify the changes were taken into account. eq_(field['description'], 'very accurate description') eq_(field['storage_mapping'], {'type': 'long'}) # Verify other values did not change. eq_(field['permissions_needed'], ['view_nothing']) eq_(field['in_database_name'], 'super_field') eq_(field['namespace'], 'superspace') # Test errors. 
assert_raises( MissingArgumentError, self.api.update_field, ) # `name` is missing assert_raises( ResourceNotFound, self.api.update_field, name='unkownfield', ) def test_delete_field(self): self.api.delete_field(name='product') ok_( self.connection.get( index=self.config.elasticsearch.elasticsearch_default_index, doc_type='supersearch_fields', id='signature', )) assert_raises( elasticsearch.exceptions.NotFoundError, self.connection.get, index=self.config.elasticsearch.elasticsearch_default_index, doc_type='supersearch_fields', id='product', ) def test_get_missing_fields(self): config = self.get_base_config(es_index='socorro_integration_test_%W') fake_mappings = [ { 'mappings': { config.elasticsearch.elasticsearch_doctype: { 'properties': { # Add a bunch of unknown fields. 'field_z': { 'type': 'string' }, 'namespace1': { 'type': 'object', 'properties': { 'field_a': { 'type': 'string' }, 'field_b': { 'type': 'long' } } }, 'namespace2': { 'type': 'object', 'properties': { 'subspace1': { 'type': 'object', 'properties': { 'field_b': { 'type': 'long' } } } } }, # Add a few known fields that should not appear. 'processed_crash': { 'type': 'object', 'properties': { 'signature': { 'type': 'string' }, 'product': { 'type': 'string' }, } } } } } }, { 'mappings': { config.elasticsearch.elasticsearch_doctype: { 'properties': { 'namespace1': { 'type': 'object', 'properties': { 'subspace1': { 'type': 'object', 'properties': { 'field_d': { 'type': 'long' } } } } } } } } }, ] now = datetimeutil.utc_now() indices = [] try: # Using "2" here means that an index will be missing, hence testing # that it swallows the subsequent error. for i in range(2): date = now - datetime.timedelta(weeks=i) index = date.strftime(config.elasticsearch.elasticsearch_index) mapping = fake_mappings[i % len(fake_mappings)] self.index_creator.create_index(index, mapping) indices.append(index) api = SuperSearchFields(config=config) missing_fields = api.get_missing_fields() expected = [ 'field_z', 'namespace1.field_a', 'namespace1.field_b', 'namespace1.subspace1.field_d', 'namespace2.subspace1.field_b', ] eq_(missing_fields['hits'], expected) eq_(missing_fields['total'], 5) finally: for index in indices: self.index_client.delete(index=index) def test_get_mapping(self): mapping = self.api.get_mapping() doctype = self.config.elasticsearch.elasticsearch_doctype ok_(doctype in mapping) properties = mapping[doctype]['properties'] ok_('processed_crash' in properties) ok_('raw_crash' in properties) processed_crash = properties['processed_crash']['properties'] # Check in_database_name is used. ok_('os_name' in processed_crash) ok_('platform' not in processed_crash) # Those fields have no `storage_mapping`. ok_('fake_field' not in properties['raw_crash']['properties']) # Those fields have a `storage_mapping`. eq_(processed_crash['signature'], { 'type': 'text', 'fields': { 'full': { 'type': 'keyword', } } }) # Test nested objects. ok_('json_dump' in processed_crash) ok_('properties' in processed_crash['json_dump']) ok_('write_combine_size' in processed_crash['json_dump']['properties']) eq_(processed_crash['json_dump']['properties']['write_combine_size'], {'type': 'long'}) # Test overwriting a field. mapping = self.api.get_mapping(overwrite_mapping={ 'name': 'fake_field', 'storage_mapping': { 'type': 'long' } }) properties = mapping[doctype]['properties'] ok_('fake_field' in properties['raw_crash']['properties']) eq_(properties['raw_crash']['properties']['fake_field']['type'], 'long') def test_test_mapping(self): """Much test. So meta. 
Wow test_test_. """ # First test a valid mapping. mapping = self.api.get_mapping() ok_(self.api.test_mapping(mapping) is None) # Insert an invalid storage mapping. mapping = self.api.get_mapping({ 'name': 'fake_field', 'storage_mapping': { 'type': 'unkwown' } }) assert_raises( BadArgumentError, self.api.test_mapping, mapping, ) # Test with a correct mapping but with data that cannot be indexed. self.index_crash({ 'date_processed': datetimeutil.utc_now(), 'product': 'WaterWolf', }) self.refresh_index() mapping = self.api.get_mapping({ 'name': 'product', 'storage_mapping': { 'type': 'long' } }) assert_raises( BadArgumentError, self.api.test_mapping, mapping, )
class TelemetryBotoS3CrashStorage(BotoS3CrashStorage): """Sends a subset of the processed crash to an S3 bucket The subset of the processed crash is based on the JSON Schema which is derived from "socorro/external/es/super_search_fields.py". """ required_config = Namespace() required_config.resource_class = change_default( BotoCrashStorage, 'resource_class', 'socorro.external.boto.connection_context.RegionalS3ConnectionContext' ) required_config.elasticsearch = Namespace() required_config.elasticsearch.add_option( 'elasticsearch_class', default='socorro.external.es.connection_context.ConnectionContext', from_string_converter=class_converter, reference_value_from='resource.elasticsearch', ) def __init__(self, config, *args, **kwargs): super(TelemetryBotoS3CrashStorage, self).__init__( config, *args, **kwargs ) self._all_fields = SuperSearchFields(config=self.config).get() def save_raw_and_processed( self, raw_crash, dumps, processed_crash, crash_id ): crash_report = {} # TODO Opportunity of optimization; # We could inspect CRASH_REPORT_JSON_SCHEMA and get a list # of all (recursive) keys that are in there and use that # to limit the two following loops to not bother # filling up `crash_report` with keys that will never be # needed. # Rename fields in raw_crash. raw_fields_map = dict( (x['in_database_name'], x['name']) for x in self._all_fields.values() if x['namespace'] == 'raw_crash' ) for key, val in raw_crash.items(): crash_report[raw_fields_map.get(key, key)] = val # Rename fields in processed_crash. processed_fields_map = dict( (x['in_database_name'], x['name']) for x in self._all_fields.values() if x['namespace'] == 'processed_crash' ) for key, val in processed_crash.items(): crash_report[processed_fields_map.get(key, key)] = val # Validate crash_report. crash_report = json_schema_reducer.make_reduced_dict( CRASH_REPORT_JSON_SCHEMA, crash_report ) self.save_processed(crash_report) @staticmethod def _do_save_processed(boto_connection, processed_crash): """Overriding this to change "name of thing" to crash_report""" crash_id = processed_crash['uuid'] processed_crash_as_string = boto_connection._convert_mapping_to_string( processed_crash ) boto_connection.submit( crash_id, "crash_report", processed_crash_as_string ) @staticmethod def _do_get_unredacted_processed(boto_connection, crash_id, json_object_hook): """Overriding this to change "name of thing" to crash_report""" try: processed_crash_as_string = boto_connection.fetch(crash_id, 'crash_report') return json.loads( processed_crash_as_string, object_hook=json_object_hook, ) except boto_connection.ResponseError as x: raise CrashIDNotFound( '%s not found: %s' % (crash_id, x) )
def __init__(self, config, *args, **kwargs):
    super(TelemetryBotoS3CrashStorage, self).__init__(
        config,
        *args,
        **kwargs
    )
    self._all_fields = SuperSearchFields(config=self.config).get()
class TelemetryBotoS3CrashStorage(BotoS3CrashStorage): """Sends a subset of the processed crash to an S3 bucket The subset of the processed crash is based on the JSON Schema which is derived from "socorro/external/es/super_search_fields.py". """ required_config = Namespace() required_config.resource_class = change_default( BotoCrashStorage, 'resource_class', 'socorro.external.boto.connection_context.RegionalS3ConnectionContext') required_config.elasticsearch = Namespace() required_config.elasticsearch.add_option( 'elasticsearch_class', default='socorro.external.es.connection_context.ConnectionContext', from_string_converter=class_converter, reference_value_from='resource.elasticsearch', ) def __init__(self, config, *args, **kwargs): super(TelemetryBotoS3CrashStorage, self).__init__(config, *args, **kwargs) self._all_fields = SuperSearchFields(config=self.config).get() def save_raw_and_processed(self, raw_crash, dumps, processed_crash, crash_id): crash_report = {} # TODO Opportunity of optimization; # We could inspect CRASH_REPORT_JSON_SCHEMA and get a list # of all (recursive) keys that are in there and use that # to limit the two following loops to not bother # filling up `crash_report` with keys that will never be # needed. # Rename fields in raw_crash. raw_fields_map = dict((x['in_database_name'], x['name']) for x in self._all_fields.values() if x['namespace'] == 'raw_crash') for key, val in raw_crash.items(): crash_report[raw_fields_map.get(key, key)] = val # Rename fields in processed_crash. processed_fields_map = dict((x['in_database_name'], x['name']) for x in self._all_fields.values() if x['namespace'] == 'processed_crash') for key, val in processed_crash.items(): crash_report[processed_fields_map.get(key, key)] = val # Validate crash_report. crash_report = json_schema_reducer.make_reduced_dict( CRASH_REPORT_JSON_SCHEMA, crash_report) self.save_processed(crash_report) @staticmethod def _do_save_processed(boto_connection, processed_crash): """Overriding this to change "name of thing" to crash_report""" crash_id = processed_crash['uuid'] processed_crash_as_string = boto_connection._convert_mapping_to_string( processed_crash) boto_connection.submit(crash_id, "crash_report", processed_crash_as_string) @staticmethod def _do_get_unredacted_processed(boto_connection, crash_id, json_object_hook): """Overriding this to change "name of thing" to crash_report""" try: processed_crash_as_string = boto_connection.fetch( crash_id, 'crash_report') return json.loads( processed_crash_as_string, object_hook=json_object_hook, ) except boto_connection.ResponseError as x: raise CrashIDNotFound('%s not found: %s' % (crash_id, x))
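# Illustration, not from the original code, of the renaming that
# save_raw_and_processed performs above: keys in the processed crash are
# looked up by in_database_name and replaced by their exposed names. The
# field definition below is a hypothetical example consistent with the
# os_name/platform pair used in the tests.
fields = {
    'platform': {
        'name': 'platform',
        'in_database_name': 'os_name',
        'namespace': 'processed_crash',
    },
}
processed_fields_map = dict(
    (x['in_database_name'], x['name'])
    for x in fields.values()
    if x['namespace'] == 'processed_crash'
)
processed_crash = {'os_name': 'Linux', 'uuid': 'some-crash-id'}
crash_report = {
    processed_fields_map.get(key, key): value
    for key, value in processed_crash.items()
}
assert crash_report == {'platform': 'Linux', 'uuid': 'some-crash-id'}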
def test_get_missing_fields(self): config = self.get_mware_config(es_index="socorro_integration_test_%W") fake_mappings = [ { "mappings": { config.elasticsearch.elasticsearch_doctype: { "properties": { # Add a bunch of unknown fields. "field_z": {"type": "string"}, "namespace1": { "type": "object", "properties": {"field_a": {"type": "string"}, "field_b": {"type": "long"}}, }, "namespace2": { "type": "object", "properties": { "subspace1": {"type": "object", "properties": {"field_b": {"type": "long"}}} }, }, # Add a few known fields that should not appear. "processed_crash": { "type": "object", "properties": {"signature": {"type": "string"}, "product": {"type": "string"}}, }, } } } }, { "mappings": { config.elasticsearch.elasticsearch_doctype: { "properties": { "namespace1": { "type": "object", "properties": { "subspace1": {"type": "object", "properties": {"field_d": {"type": "long"}}} }, } } } } }, ] now = datetimeutil.utc_now() indices = [] try: # Using "2" here means that an index will be missing, hence testing # that it swallows the subsequent error. for i in range(2): date = now - datetime.timedelta(weeks=i) index = date.strftime(config.elasticsearch.elasticsearch_index) mapping = fake_mappings[i % len(fake_mappings)] self.index_creator.create_index(index, mapping) indices.append(index) api = SuperSearchFields(config=config) missing_fields = api.get_missing_fields() expected = [ "field_z", "namespace1.field_a", "namespace1.field_b", "namespace1.subspace1.field_d", "namespace2.subspace1.field_b", ] eq_(missing_fields["hits"], expected) eq_(missing_fields["total"], 5) finally: for index in indices: self.index_client.delete(index=index)
class IntegrationTestSuperSearchFields(ElasticsearchTestCase): """Test SuperSearchFields with an elasticsearch database containing fake data. """ def setUp(self): super(IntegrationTestSuperSearchFields, self).setUp() self.api = SuperSearchFields(config=self.config) self.api.get_fields = lambda: copy.deepcopy(FIELDS) def test_get_fields(self): results = self.api.get_fields() assert results == FIELDS def test_get_missing_fields(self): config = self.get_base_config( es_index='socorro_integration_test_%W' ) fake_mappings = [ { 'mappings': { config.elasticsearch.elasticsearch_doctype: { 'properties': { # Add a bunch of unknown fields. 'field_z': { 'type': 'string' }, 'namespace1': { 'type': 'object', 'properties': { 'field_a': { 'type': 'string' }, 'field_b': { 'type': 'long' } } }, 'namespace2': { 'type': 'object', 'properties': { 'subspace1': { 'type': 'object', 'properties': { 'field_b': { 'type': 'long' } } } } }, # Add a few known fields that should not appear. 'processed_crash': { 'type': 'object', 'properties': { 'signature': { 'type': 'string' }, 'product': { 'type': 'string' }, } } } } } }, { 'mappings': { config.elasticsearch.elasticsearch_doctype: { 'properties': { 'namespace1': { 'type': 'object', 'properties': { 'subspace1': { 'type': 'object', 'properties': { 'field_d': { 'type': 'long' } } } } } } } } }, ] now = datetimeutil.utc_now() indices = [] try: # Using "2" here means that an index will be missing, hence testing # that it swallows the subsequent error. for i in range(2): date = now - datetime.timedelta(weeks=i) index = date.strftime(config.elasticsearch.elasticsearch_index) mapping = fake_mappings[i % len(fake_mappings)] self.index_creator.create_index(index, mapping) indices.append(index) api = SuperSearchFields(config=config) missing_fields = api.get_missing_fields() expected = [ 'field_z', 'namespace1.field_a', 'namespace1.field_b', 'namespace1.subspace1.field_d', 'namespace2.subspace1.field_b', ] assert missing_fields['hits'] == expected assert missing_fields['total'] == 5 finally: for index in indices: self.index_client.delete(index=index) def test_get_mapping(self): mapping = self.api.get_mapping() doctype = self.config.elasticsearch.elasticsearch_doctype assert doctype in mapping properties = mapping[doctype]['properties'] assert 'processed_crash' in properties assert 'raw_crash' in properties processed_crash = properties['processed_crash']['properties'] # Check in_database_name is used. assert 'os_name' in processed_crash assert 'platform' not in processed_crash # Those fields have no `storage_mapping`. assert 'fake_field' not in properties['raw_crash']['properties'] # Those fields have a `storage_mapping`. assert processed_crash['release_channel'] == {'analyzer': 'keyword', 'type': 'string'} # Test nested objects. assert 'json_dump' in processed_crash assert 'properties' in processed_crash['json_dump'] assert 'write_combine_size' in processed_crash['json_dump']['properties'] assert processed_crash['json_dump']['properties']['write_combine_size'] == {'type': 'long'} # Test overwriting a field. mapping = self.api.get_mapping(overwrite_mapping={ 'name': 'fake_field', 'namespace': 'raw_crash', 'in_database_name': 'fake_field', 'storage_mapping': { 'type': 'long' } }) properties = mapping[doctype]['properties'] assert 'fake_field' in properties['raw_crash']['properties'] assert properties['raw_crash']['properties']['fake_field']['type'] == 'long' def test_test_mapping(self): """Much test. So meta. Wow test_test_. """ # First test a valid mapping. 
        mapping = self.api.get_mapping()
        assert self.api.test_mapping(mapping) is None

        # Insert an invalid storage mapping.
        mapping = self.api.get_mapping({
            'name': 'fake_field',
            'namespace': 'raw_crash',
            'in_database_name': 'fake_field',
            'storage_mapping': {
                # Deliberately not a valid Elasticsearch field type.
                'type': 'unkwown'
            }
        })
        with pytest.raises(BadArgumentError):
            self.api.test_mapping(mapping)

        # Test with a correct mapping but with data that cannot be indexed.
        self.index_crash({
            'date_processed': datetimeutil.utc_now(),
            'product': 'WaterWolf',
        })
        self.refresh_index()
        mapping = self.api.get_mapping({
            'name': 'product',
            'storage_mapping': {
                'type': 'long'
            }
        })
        with pytest.raises(BadArgumentError):
            self.api.test_mapping(mapping)
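# Illustrative sketch (not part of the original source): the same
# get_mapping(overwrite_mapping=...) / test_mapping() pair exercised by the
# tests above can be used to validate a proposed field change before saving
# it. `storage_mapping_is_valid` is a hypothetical helper; BadArgumentError is
# the same exception class the tests expect.
def storage_mapping_is_valid(api, field):
    """Return True if merging `field` still yields an indexable mapping.

    `field` is a field definition dict with 'name', 'namespace',
    'in_database_name' and 'storage_mapping' keys, as in the tests above.
    """
    candidate_mapping = api.get_mapping(overwrite_mapping=field)
    try:
        # test_mapping() returns None when the mapping is acceptable and
        # raises BadArgumentError otherwise.
        api.test_mapping(candidate_mapping)
    except BadArgumentError:
        return False
    return True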
def __init__(self, config, *args, **kwargs):
    super(TelemetryBotoS3CrashStorage, self).__init__(
        config, *args, **kwargs
    )
    self._all_fields = SuperSearchFields(config=self.config).get()
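# Illustrative sketch (not part of the original source): one plausible use of
# self._all_fields in a crash storage class is to keep only declared fields
# when building the document to save. The helper and the `processed_crash`
# argument are hypothetical, and the shape of `all_fields` is assumed to match
# what get_fields() returns elsewhere in this code: a dict of field name to
# field definition with 'namespace', 'in_database_name' and 'name' keys.
def _trim_to_known_fields(all_fields, processed_crash):
    """Return a copy of processed_crash restricted to declared fields."""
    allowed = dict(
        (field['in_database_name'], field['name'])
        for field in all_fields.values()
        if field.get('namespace') == 'processed_crash'
    )
    return dict(
        (allowed[key], value)
        for key, value in processed_crash.items()
        if key in allowed
    )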
class SuperSearch(SearchBase): def __init__(self, *args, **kwargs): self.config = kwargs.get('config') self.es_context = self.config.elasticsearch.elasticsearch_class( self.config.elasticsearch ) self.all_fields = SuperSearchFields(config=self.config).get_fields() # Create a map to associate a field's name in the database to its # exposed name (in the results and facets). self.database_name_to_field_name_map = dict( (x['in_database_name'], x['name']) for x in self.all_fields.values() ) kwargs.update(fields=self.all_fields) super(SuperSearch, self).__init__( *args, **kwargs ) def get_connection(self): with self.es_context() as conn: return conn def generate_list_of_indices(self, from_date, to_date, es_index=None): """Return the list of indices to query to access all the crash reports that were processed between from_date and to_date. The naming pattern for indices in elasticsearch is configurable, it is possible to have an index per day, per week, per month... Parameters: * from_date datetime object * to_date datetime object """ if es_index is None: es_index = self.config.elasticsearch_index indices = [] current_date = from_date while current_date <= to_date: index = current_date.strftime(es_index) # Make sure no index is twice in the list # (for weekly or monthly indices for example) if index not in indices: indices.append(index) current_date += datetime.timedelta(days=1) return indices def get_indices(self, dates): """Return the list of indices to use for given dates. """ start_date = None end_date = None for date in dates: if '>' in date.operator: start_date = date.value if '<' in date.operator: end_date = date.value return self.generate_list_of_indices(start_date, end_date) def format_field_names(self, hit): """Return a hit with each field's database name replaced by its exposed name. """ new_hit = {} for field in hit: new_field = field if '.' in new_field: # Remove the prefix ("processed_crash." or "raw_crash."). new_field = new_field.split('.')[-1] new_field = self.database_name_to_field_name_map.get( new_field, new_field ) new_hit[new_field] = hit[field] return new_hit def format_fields(self, hit): """Return a well formatted document. Elasticsearch returns values as lists when using the `fields` option. This function removes the list when it contains zero or one element. It also calls `format_field_names` to correct all the field names. """ hit = self.format_field_names(hit) for field in hit: if isinstance(hit[field], (list, tuple)): if len(hit[field]) == 0: hit[field] = None elif len(hit[field]) == 1: hit[field] = hit[field][0] return hit def format_aggregations(self, aggregations): """Return aggregations in a form that looks like facets. We used to expose the Elasticsearch facets directly. This is thus needed for backwards compatibility. """ aggs = aggregations.to_dict() for agg in aggs: for i, row in enumerate(aggs[agg]['buckets']): aggs[agg]['buckets'][i] = { 'term': row['key'], 'count': row['doc_count'], } aggs[agg] = aggs[agg]['buckets'] return aggs def get(self, **kwargs): """Return a list of results and aggregations based on parameters. The list of accepted parameters (with types and default values) is in the database and can be accessed with the super_search_fields service. """ # Filter parameters and raise potential errors. params = self.get_parameters(**kwargs) # Find the indices to use to optimize the elasticsearch query. indices = self.get_indices(params['date']) # Create and configure the search object. 
search = Search( using=self.get_connection(), index=indices, doc_type=self.config.elasticsearch.elasticsearch_doctype, ) # Create filters. filters = None for field, sub_params in params.items(): sub_filters = None for param in sub_params: if param.name.startswith('_'): if param.name == '_results_offset': results_from = param.value[0] elif param.name == '_results_number': results_number = param.value[0] # Don't use meta parameters in the query. continue field_data = self.all_fields[param.name] name = '%s.%s' % ( field_data['namespace'], field_data['in_database_name'] ) if param.data_type in ('date', 'datetime'): param.value = datetimeutil.date_to_string(param.value) elif param.data_type == 'enum': param.value = [x.lower() for x in param.value] elif param.data_type == 'str' and not param.operator: param.value = [x.lower() for x in param.value] args = {} filter_type = 'term' filter_value = None if not param.operator: # contains one of the terms if len(param.value) == 1: val = param.value[0] if not isinstance(val, basestring) or ( isinstance(val, basestring) and ' ' not in val ): filter_value = val # If the term contains white spaces, we want to perform # a phrase query. Thus we do nothing here and let this # value be handled later. else: filter_type = 'terms' filter_value = param.value elif param.operator == '=': # is exactly if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator == '>': # greater than filter_type = 'range' filter_value = { 'gt': param.value } elif param.operator == '<': # lower than filter_type = 'range' filter_value = { 'lt': param.value } elif param.operator == '>=': # greater than or equal to filter_type = 'range' filter_value = { 'gte': param.value } elif param.operator == '<=': # lower than or equal to filter_type = 'range' filter_value = { 'lte': param.value } elif param.operator == '__null__': # is null filter_type = 'missing' args['field'] = name if filter_value is not None: args[name] = filter_value if args: if param.operator_not: new_filter = ~F(filter_type, **args) else: new_filter = F(filter_type, **args) if sub_filters is None: sub_filters = new_filter elif param.data_type == 'enum': sub_filters |= new_filter else: sub_filters &= new_filter continue # These use a wildcard and thus need to be in a query # instead of a filter. operator_wildcards = { '~': '*%s*', # contains '$': '%s*', # starts with '^': '*%s' # ends with } if param.operator in operator_wildcards: if field_data['has_full_version']: name = '%s.full' % name query_type = 'wildcard' args[name] = ( operator_wildcards[param.operator] % param.value ) elif not param.operator: # This is a phrase that was passed down. query_type = 'simple_query_string' args['query'] = param.value[0] args['fields'] = [name] args['default_operator'] = 'and' if args: query = Q(query_type, **args) if param.operator_not: query = ~query search = search.query(query) else: # If we reach this point, that means the operator is # not supported, and we should raise an error about that. raise NotImplementedError( 'Operator %s is not supported' % param.operator ) if filters is None: filters = sub_filters elif sub_filters is not None: filters &= sub_filters search = search.filter(filters) # Restricting returned fields. fields = [] for param in params['_columns']: for value in param.value: if not value: continue try: field_ = self.all_fields[value] except KeyError: # That is not a known field, we can't restrict on it. 
raise BadArgumentError( value, msg='Unknown field "%s", cannot return it' % value ) if not field_['is_returned']: # Returning this field is not allowed. raise BadArgumentError( value, msg='Field "%s" is not allowed to be returned' % value ) field_name = '%s.%s' % ( field_['namespace'], field_['in_database_name'] ) fields.append(field_name) search = search.fields(fields) # Sorting. sort_fields = [] for param in params['_sort']: for value in param.value: if not value: continue # Values starting with a '-' are sorted in descending order. # In order to retrieve the database name of the field, we # must first remove the '-' part and add it back later. # Example: given ['product', '-version'], the results will be # sorted by ascending product and descending version. desc = False if value.startswith('-'): desc = True value = value[1:] try: field_ = self.all_fields[value] except KeyError: # That is not a known field, we can't sort on it. raise BadArgumentError( value, msg='Unknown field "%s", cannot sort on it' % value ) field_name = '%s.%s' % ( field_['namespace'], field_['in_database_name'] ) if desc: # The underlying library understands that '-' means # sorting in descending order. field_name = '-' + field_name sort_fields.append(field_name) search = search.sort(*sort_fields) # Pagination. results_to = results_from + results_number search = search[results_from:results_to] # Create facets. for param in params['_facets']: for value in param.value: try: field_ = self.all_fields[value] except KeyError: # That is not a known field, we can't facet on it. raise BadArgumentError( value, msg='Unknown field "%s", cannot facet on it' % value ) field_name = '%s.%s' % ( field_['namespace'], field_['in_database_name'] ) if field_['has_full_version']: # If the param has a full version, that means what matters # is the full string, and not its individual terms. field_name += '.full' search.aggs.bucket( value, 'terms', field=field_name, size=self.config.facets_max_number ) # Query and compute results. hits = [] if params['_return_query'][0].value[0]: # Return only the JSON query that would be sent to elasticsearch. return { 'query': search.to_dict(), 'indices': indices, } # We call elasticsearch with a computed list of indices, based on # the date range. However, if that list contains indices that do not # exist in elasticsearch, an error will be raised. We thus want to # remove all failing indices until we either have a valid list, or # an empty list in which case we return no result. while True: try: results = search.execute() for hit in results: hits.append(self.format_fields(hit.to_dict())) total = search.count() aggregations = self.format_aggregations(results.aggregations) break # Yay! Results! except NotFoundError, e: missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0] if missing_index in indices: del indices[indices.index(missing_index)] else: # Wait what? An error caused by an index that was not # in the request? That should never happen, but in case # it does, better know it. raise if indices: # Update the list of indices and try again. # Note: we need to first empty the list of indices before # updating it, otherwise the removed indices never get # actually removed. search = search.index().index(*indices) else: # There is no index left in the list, return an empty # result. hits = [] total = 0 aggregations = {} break return { 'hits': hits, 'total': total, 'facets': aggregations, }
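# Illustrative sketch (not part of the original source): a typical call to the
# get() method above. 'WaterWolf' and the date range are made-up values; the
# meta parameter names ('_columns', '_sort', '_facets', '_results_number') and
# the response keys ('hits', 'total', 'facets') come from the code above.
api = SuperSearch(config=config)
results = api.get(
    product='WaterWolf',
    date=['>=2016-01-01', '<2016-01-08'],
    _columns=['signature', 'build_id', 'date'],
    _sort=['-date'],
    _facets=['signature'],
    _results_number=100,
)
# results['total'] is the number of matching crash reports,
# results['hits'] is a list of documents restricted to the requested columns,
# and results['facets']['signature'] is a list of {'term': ..., 'count': ...}
# entries produced by format_aggregations().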
def update_field(self, **kwargs):
    return SuperSearchFields(config=self.config).update_field(**kwargs)
def run(self, end_datetime):
    # Truncate to the hour
    end_datetime = end_datetime.replace(minute=0, second=0, microsecond=0)

    # Do a super search and get the signature, build id, and date processed
    # for every crash in the range
    all_fields = SuperSearchFields(config=self.config).get()
    api = SuperSearch(config=self.config)
    start_datetime = end_datetime - datetime.timedelta(
        minutes=self.config.period
    )
    self.config.logger.info(
        'Looking at %s to %s', start_datetime, end_datetime
    )

    params = {
        'date': [
            '>={}'.format(start_datetime.isoformat()),
            '<{}'.format(end_datetime.isoformat()),
        ],
        '_columns': ['signature', 'build_id', 'date'],
        '_facets_size': 0,
        '_fields': all_fields,

        # Set up first page
        '_results_offset': 0,
        '_results_number': MAX_PAGE,
    }

    results = {}
    crashids_count = 0

    while True:
        resp = api.get(**params)
        hits = resp['hits']
        for hit in hits:
            crashids_count += 1

            if not hit['build_id']:
                # Not all crashes have a build id, so skip the ones
                # that don't.
                continue

            if hit['signature'] in results:
                data = results[hit['signature']]
                data['build_id'] = min(data['build_id'], hit['build_id'])
                data['date'] = min(data['date'], hit['date'])
            else:
                data = {
                    'signature': hit['signature'],
                    'build_id': hit['build_id'],
                    'date': hit['date'],
                }
            results[hit['signature']] = data

        # If there are no more crash ids to get, we return
        total = resp['total']
        if not hits or crashids_count >= total:
            break

        # Get the next page, but only as many results as we need
        params['_results_offset'] += MAX_PAGE
        params['_results_number'] = min(
            # MAX_PAGE is the maximum we can request
            MAX_PAGE,
            # The number of results Super Search can return to us
            # that it hasn't returned so far
            total - crashids_count
        )

    signature_data = results.values()

    # Save signature data to the db
    for item in signature_data:
        if self.config.dry_run:
            self.config.logger.info(
                'Inserting/updating signature (%s, %s, %s)',
                item['signature'], item['date'], item['build_id']
            )
        else:
            self.update_crashstats_signature(
                signature=item['signature'],
                report_date=item['date'],
                report_build=item['build_id'],
            )

    self.config.logger.info(
        'Inserted/updated %d signatures.', len(signature_data)
    )
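# Illustrative sketch (not part of the original source): the loop above keeps,
# for each signature, the smallest build id and the earliest date seen across
# all hits. A tiny worked example with made-up hits:
hits = [
    {'signature': 'OOM | small', 'build_id': '20180103000000', 'date': '2018-01-05'},
    {'signature': 'OOM | small', 'build_id': '20180101000000', 'date': '2018-01-06'},
]
results = {}
for hit in hits:
    if hit['signature'] in results:
        data = results[hit['signature']]
        data['build_id'] = min(data['build_id'], hit['build_id'])
        data['date'] = min(data['date'], hit['date'])
    else:
        results[hit['signature']] = dict(hit)
# results['OOM | small'] is now:
# {'signature': 'OOM | small', 'build_id': '20180101000000', 'date': '2018-01-05'}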
def get_missing_fields(self):
    return SuperSearchFields(config=self.config).get_missing_fields()
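# Illustrative sketch (not part of the original source): per the
# test_get_missing_fields tests above, get_missing_fields() reports fields
# that exist in the live Elasticsearch mappings but are not declared as super
# search fields. `config` here stands for any valid configuration object.
missing = SuperSearchFields(config=config).get_missing_fields()
# missing['total'] is the number of undeclared fields and missing['hits'] is
# the list of their dotted paths, e.g. 'namespace1.subspace1.field_d'.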
class SuperSearch(SearchBase): def __init__(self, *args, **kwargs): self.config = kwargs.get('config') self.es_context = self.config.elasticsearch.elasticsearch_class( self.config.elasticsearch ) self.all_fields = SuperSearchFields(config=self.config).get_fields() # Create a map to associate a field's name in the database to its # exposed name (in the results and facets). self.database_name_to_field_name_map = dict( (x['in_database_name'], x['name']) for x in self.all_fields.values() ) kwargs.update(fields=self.all_fields) super(SuperSearch, self).__init__( *args, **kwargs ) def get_connection(self): with self.es_context() as conn: return conn def get_list_of_indices(self, from_date, to_date, es_index=None): """Return the list of indices to query to access all the crash reports that were processed between from_date and to_date. The naming pattern for indices in elasticsearch is configurable, it is possible to have an index per day, per week, per month... Parameters: * from_date datetime object * to_date datetime object """ if es_index is None: es_index = self.config.elasticsearch.elasticsearch_index indices = [] current_date = from_date while current_date <= to_date: index = current_date.strftime(es_index) # Make sure no index is twice in the list # (for weekly or monthly indices for example) if index not in indices: indices.append(index) current_date += datetime.timedelta(days=1) return indices def get_indices(self, dates): """Return the list of indices to use for given dates. """ start_date = None end_date = None for date in dates: if '>' in date.operator: start_date = date.value if '<' in date.operator: end_date = date.value return self.get_list_of_indices(start_date, end_date) def format_field_names(self, hit): """Return a hit with each field's database name replaced by its exposed name. """ new_hit = {} for field in hit: new_field = field if '.' in new_field: # Remove the prefix ("processed_crash." or "raw_crash."). new_field = new_field.split('.')[-1] new_field = self.database_name_to_field_name_map.get( new_field, new_field ) new_hit[new_field] = hit[field] return new_hit def format_fields(self, hit): """Return a well formatted document. Elasticsearch returns values as lists when using the `fields` option. This function removes the list when it contains zero or one element. It also calls `format_field_names` to correct all the field names. """ hit = self.format_field_names(hit) for field in hit: if isinstance(hit[field], (list, tuple)): if len(hit[field]) == 0: hit[field] = None elif len(hit[field]) == 1: hit[field] = hit[field][0] return hit def get_field_name(self, value, full=True): try: field_ = self.all_fields[value] except KeyError: raise BadArgumentError( value, msg='Unknown field "%s"' % value ) if not field_['is_returned']: # Returning this field is not allowed. raise BadArgumentError( value, msg='Field "%s" is not allowed to be returned' % value ) field_name = '%s.%s' % ( field_['namespace'], field_['in_database_name'] ) if full and field_['has_full_version']: # If the param has a full version, that means what matters # is the full string, and not its individual terms. field_name += '.full' return field_name def format_aggregations(self, aggregations): """Return aggregations in a form that looks like facets. We used to expose the Elasticsearch facets directly. This is thus needed for backwards compatibility. 
""" aggs = aggregations.to_dict() for agg in aggs: for i, bucket in enumerate(aggs[agg]['buckets']): sub_aggs = {} for key in bucket: # Go through all sub aggregations. Those are contained in # all the keys that are not 'key' or 'count'. if key in ('key', 'key_as_string', 'doc_count'): continue sub_aggs[key] = [ { # For date data, Elasticsearch exposes a timestamp # in 'key' and a human-friendly string in # 'key_as_string'. We thus check if the later # exists to expose it, and return the normal # 'key' if not. 'term': x.get('key_as_string', x['key']), 'count': x['doc_count'], } for x in bucket[key]['buckets'] ] aggs[agg]['buckets'][i] = { 'term': bucket.get('key_as_string', bucket['key']), 'count': bucket['doc_count'], } if sub_aggs: aggs[agg]['buckets'][i]['facets'] = sub_aggs aggs[agg] = aggs[agg]['buckets'] return aggs def get(self, **kwargs): """Return a list of results and aggregations based on parameters. The list of accepted parameters (with types and default values) is in the database and can be accessed with the super_search_fields service. """ # Filter parameters and raise potential errors. params = self.get_parameters(**kwargs) # Find the indices to use to optimize the elasticsearch query. indices = self.get_indices(params['date']) # Create and configure the search object. search = Search( using=self.get_connection(), index=indices, doc_type=self.config.elasticsearch.elasticsearch_doctype, ) # Create filters. filters = [] histogram_intervals = {} for field, sub_params in params.items(): sub_filters = None for param in sub_params: if param.name.startswith('_'): # By default, all param values are turned into lists, # even when they have and can have only one value. # For those we know there can only be one value, # so we just extract it from the made-up list. if param.name == '_results_offset': results_from = param.value[0] elif param.name == '_results_number': results_number = param.value[0] elif param.name == '_facets_size': facets_size = param.value[0] for f in self.histogram_fields: if param.name == '_histogram_interval.%s' % f: histogram_intervals[f] = param.value[0] # Don't use meta parameters in the query. continue field_data = self.all_fields[param.name] name = '%s.%s' % ( field_data['namespace'], field_data['in_database_name'] ) if param.data_type in ('date', 'datetime'): param.value = datetimeutil.date_to_string(param.value) elif param.data_type == 'enum': param.value = [x.lower() for x in param.value] elif param.data_type == 'str' and not param.operator: param.value = [x.lower() for x in param.value] # Operators needing wildcards, and the associated value # transformation with said wildcards. operator_wildcards = { '~': '*%s*', # contains '$': '%s*', # starts with '^': '*%s' # ends with } # Operators needing ranges, and the associated Elasticsearch # comparison operator. operator_range = { '>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte', } args = {} filter_type = 'term' filter_value = None if not param.operator: # contains one of the terms if len(param.value) == 1: val = param.value[0] if not isinstance(val, basestring) or ' ' not in val: # There's only one term and no white space, this # is a simple term filter. filter_value = val else: # If the term contains white spaces, we want to # perform a phrase query. filter_type = 'query' args = Q( 'simple_query_string', query=param.value[0], fields=[name], default_operator='and', ).to_dict() else: # There are several terms, this is a terms filter. 
filter_type = 'terms' filter_value = param.value elif param.operator == '=': # is exactly if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator in operator_range: filter_type = 'range' filter_value = { operator_range[param.operator]: param.value } elif param.operator == '__null__': filter_type = 'missing' args['field'] = name elif param.operator in operator_wildcards: filter_type = 'query' # Wildcard operations are better applied to a non-analyzed # field (called "full") if there is one. if field_data['has_full_version']: name = '%s.full' % name q_args = {} q_args[name] = ( operator_wildcards[param.operator] % param.value ) query = Q('wildcard', **q_args) args = query.to_dict() if filter_value is not None: args[name] = filter_value if args: new_filter = F(filter_type, **args) if param.operator_not: new_filter = ~new_filter if sub_filters is None: sub_filters = new_filter elif filter_type == 'range': sub_filters &= new_filter else: sub_filters |= new_filter continue if sub_filters is not None: filters.append(sub_filters) search = search.filter(F('bool', must=filters)) # Restricting returned fields. fields = [] for param in params['_columns']: for value in param.value: if not value: continue field_name = self.get_field_name(value, full=False) fields.append(field_name) search = search.fields(fields) # Sorting. sort_fields = [] for param in params['_sort']: for value in param.value: if not value: continue # Values starting with a '-' are sorted in descending order. # In order to retrieve the database name of the field, we # must first remove the '-' part and add it back later. # Example: given ['product', '-version'], the results will be # sorted by ascending product and descending version. desc = False if value.startswith('-'): desc = True value = value[1:] field_name = self.get_field_name(value, full=False) if desc: # The underlying library understands that '-' means # sorting in descending order. field_name = '-' + field_name sort_fields.append(field_name) search = search.sort(*sort_fields) # Pagination. results_to = results_from + results_number search = search[results_from:results_to] # Create facets. for param in params['_facets']: for value in param.value: if not value: continue field_name = self.get_field_name(value) search.aggs.bucket( value, 'terms', field=field_name, size=facets_size, ) # Create signature aggregations. if params.get('_aggs.signature'): sig_bucket = A( 'terms', field=self.get_field_name('signature'), size=facets_size, ) for param in params['_aggs.signature']: for value in param.value: if not value: continue if value.startswith('_histogram.'): # This is a histogram aggregation we want to run, # not a terms aggregation. field_name = value[len('_histogram.'):] if field_name not in self.histogram_fields: continue histogram_type = ( self.all_fields[field_name]['query_type'] == 'date' and 'date_histogram' or 'histogram' ) sig_bucket.bucket( 'histogram_%s' % field_name, histogram_type, field=self.get_field_name(field_name), interval=histogram_intervals[field_name], ) else: sig_bucket.bucket( value, 'terms', field=self.get_field_name(value), size=facets_size, ) search.aggs.bucket('signature', sig_bucket) # Create histograms. 
for f in self.histogram_fields: if params.get('_histogram.%s' % f): histogram_type = ( self.all_fields[f]['query_type'] == 'date' and 'date_histogram' or 'histogram' ) date_bucket = A( histogram_type, field=self.get_field_name(f), interval=histogram_intervals[f], ) for param in params['_histogram.%s' % f]: for value in param.value: if not value: continue field_name = self.get_field_name(value) val_bucket = A( 'terms', field=field_name, size=facets_size, ) date_bucket.bucket(value, val_bucket) search.aggs.bucket('histogram_%s' % f, date_bucket) # Query and compute results. hits = [] if params['_return_query'][0].value[0]: # Return only the JSON query that would be sent to elasticsearch. return { 'query': search.to_dict(), 'indices': indices, } # We call elasticsearch with a computed list of indices, based on # the date range. However, if that list contains indices that do not # exist in elasticsearch, an error will be raised. We thus want to # remove all failing indices until we either have a valid list, or # an empty list in which case we return no result. while True: try: results = search.execute() for hit in results: hits.append(self.format_fields(hit.to_dict())) total = search.count() aggregations = self.format_aggregations(results.aggregations) break # Yay! Results! except NotFoundError, e: missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0] if missing_index in indices: del indices[indices.index(missing_index)] else: # Wait what? An error caused by an index that was not # in the request? That should never happen, but in case # it does, better know it. raise if indices: # Update the list of indices and try again. # Note: we need to first empty the list of indices before # updating it, otherwise the removed indices never get # actually removed. search = search.index().index(*indices) else: # There is no index left in the list, return an empty # result. hits = [] total = 0 aggregations = {} break return { 'hits': hits, 'total': total, 'facets': aggregations, }
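# Illustrative sketch (not part of the original source): a call exercising the
# aggregation features of the newer get() implementation above. It assumes
# 'date' is one of self.histogram_fields and that 'product' and 'platform' are
# declared super search fields; the parameter names ('_facets_size',
# '_aggs.signature', '_histogram.date', '_histogram_interval.date') come from
# the code above, and the interval value 'day' is an assumption.
api = SuperSearch(config=config)
params = {
    'product': 'WaterWolf',
    'date': ['>=2016-01-01', '<2016-01-08'],
    '_facets': ['platform'],
    '_facets_size': 5,
    '_aggs.signature': ['platform'],
    '_histogram.date': ['product'],
    '_histogram_interval.date': 'day',
    # Ask for aggregations only, no individual hits.
    '_results_number': 0,
}
results = api.get(**params)
# results['facets']['platform'] is a list of {'term': ..., 'count': ...},
# results['facets']['signature'] has one bucket per signature, each carrying a
# nested 'facets' dict for the requested sub-aggregations, and
# results['facets']['histogram_date'] has one bucket per date interval.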