def create_socorro_index(self, es_index, mappings=None):
    """Create an index that will receive crash reports."""
    if mappings is None:
        mappings = SuperSearchFields(config=self.config).get_mapping()

    es_settings = self.get_socorro_index_settings(mappings)

    self.create_index(es_index, es_settings)

def create_index(self, index_name, mappings=None):
    """Create an index that will receive crash reports.

    :arg index_name: the name of the index to create
    :arg mappings: dict of doctype->ES mapping

    :returns: True if the index was created, False if it already existed

    """
    if mappings is None:
        mappings = SuperSearchFields(context=self).get_mapping()

    es_settings = self.get_socorro_index_settings(mappings)

    try:
        client = self.indices_client()
        client.create(index=index_name, body=es_settings)
        return True
    except elasticsearch.exceptions.RequestError as e:
        # If this index already exists, swallow the error.
        # NOTE! This is NOT what the error looks like in ES 2.x.
        if 'IndexAlreadyExistsException' not in str(e):
            raise
        return False

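For context, here is a minimal standalone sketch of the swallow-the-error pattern used in create_index above, run directly against an elasticsearch-py 1.x client. The host, index name, and settings body are made-up values for illustration, not taken from these snippets:

import elasticsearch

# Hypothetical connection and settings, for illustration only.
es = elasticsearch.Elasticsearch(hosts=['localhost:9200'])
es_settings = {'settings': {'number_of_shards': 1}}

try:
    es.indices.create(index='crash_reports_demo', body=es_settings)
    created = True
except elasticsearch.exceptions.RequestError as e:
    # ES 1.x reports an existing index as IndexAlreadyExistsException;
    # any other RequestError is a real failure and should propagate.
    if 'IndexAlreadyExistsException' not in str(e):
        raise
    created = False

This makes index creation idempotent: concurrent callers can race to create the same index and every caller still ends up with a usable index.
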
def create_socorro_index(self, es_index, mappings=None):
    """Create an index that will receive crash reports."""
    if mappings is None:
        mappings = SuperSearchFields(config=self.config).get_mapping()

    es_settings = self.get_socorro_index_settings(mappings)

    if self.config.elasticsearch.dry_run:
        print(json.dumps(es_settings, indent=2))
    else:
        self.create_index(es_index, es_settings)

def create_socorro_index(self, es_index, mappings=None):
    """Create an index that will receive crash reports."""
    if mappings is None:
        # Import at runtime to avoid a circular import.
        from socorro.external.es.super_search_fields import (
            SuperSearchFields
        )
        mappings = SuperSearchFields(config=self.config).get_mapping()

    es_settings = self.get_socorro_index_settings(mappings)

    self.create_index(es_index, es_settings)

def _get_all_fields(self):
    if (
        hasattr(self, '_all_fields') and
        hasattr(self, '_all_fields_timestamp')
    ):
        # we might have it cached
        age = time.time() - self._all_fields_timestamp
        if age < 60 * 60:
            # fresh enough
            return self._all_fields

    self._all_fields = SuperSearchFields(config=self.config).get()
    self._all_fields_timestamp = time.time()
    return self._all_fields

def __init__(self, *args, **kwargs):
    self.config = kwargs.get('config')
    self.es_context = self.config.elasticsearch.elasticsearch_class(
        self.config.elasticsearch
    )
    self.all_fields = SuperSearchFields(config=self.config).get_fields()

    # Create a map to associate a field's name in the database to its
    # exposed name (in the results and facets).
    self.database_name_to_field_name_map = dict(
        (x['in_database_name'], x['name'])
        for x in self.all_fields.values()
    )

    kwargs.update(fields=self.all_fields)
    super(SuperSearch, self).__init__(*args, **kwargs)

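To make the shape of that map concrete, here is a tiny sketch with made-up field entries; the real entries come from SuperSearchFields().get_fields():

# Hypothetical field definitions, for illustration only.
all_fields = {
    'build_id': {'in_database_name': 'build', 'name': 'build_id'},
    'platform': {'in_database_name': 'os_name', 'name': 'platform'},
}

database_name_to_field_name_map = dict(
    (x['in_database_name'], x['name']) for x in all_fields.values()
)
# => {'build': 'build_id', 'os_name': 'platform'}
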
def create_socorro_index(self, index_name, mappings=None, log_result=False):
    """Create an index that will receive crash reports.

    Note: This function can get called in two contexts: when the
    processor is saving crash reports and also in the local dev
    environment scripts. The former wants to quietly ignore
    index-already-exists errors, while the latter wants to log the
    result. Hence the fickle nature of this function.

    """
    if mappings is None:
        mappings = SuperSearchFields(context=self).get_mapping()

    es_settings = self.get_socorro_index_settings(mappings)

    self.create_index(index_name, es_settings, log_result)

def get(self, **kwargs):
    kwargs['_fields'] = SuperSearchFields(config=self.config).get_fields()
    return super(SuperSearchWithFields, self).get(**kwargs)

def create_socorro_index(self, es_index):
    """Create an index that will receive crash reports."""
    es_settings = SuperSearchFields(config=self.config).get_mapping()
    self.create_index(es_index, es_settings)

def setUp(self):
    super(IntegrationTestSuperSearchFields, self).setUp()
    self.api = SuperSearchFields(config=self.config)

def delete_field(self, **kwargs):
    return SuperSearchFields(config=self.config).delete_field(**kwargs)

def run(self, end_datetime):
    # Truncate to the hour
    end_datetime = end_datetime.replace(minute=0, second=0, microsecond=0)

    # Do a super search and get the signature, buildid, and date
    # processed for every crash in the range
    all_fields = SuperSearchFields(config=self.config).get()
    api = SuperSearch(config=self.config)
    start_datetime = end_datetime - datetime.timedelta(
        minutes=self.config.period
    )
    self.config.logger.info(
        'Looking at %s to %s', start_datetime, end_datetime
    )

    params = {
        'date': [
            '>={}'.format(start_datetime.isoformat()),
            '<{}'.format(end_datetime.isoformat()),
        ],
        '_columns': ['signature', 'build_id', 'date'],
        '_facets_size': 0,
        '_fields': all_fields,

        # Set up first page
        '_results_offset': 0,
        '_results_number': MAX_PAGE,
    }

    results = {}
    crashids_count = 0

    while True:
        resp = api.get(**params)
        hits = resp['hits']
        for hit in hits:
            crashids_count += 1

            if not hit['build_id']:
                # Not all crashes have a build id, so skip the ones
                # that don't.
                continue

            if hit['signature'] in results:
                data = results[hit['signature']]
                data['build_id'] = min(data['build_id'], hit['build_id'])
                data['date'] = min(data['date'], hit['date'])
            else:
                data = {
                    'signature': hit['signature'],
                    'build_id': hit['build_id'],
                    'date': hit['date']
                }
            results[hit['signature']] = data

        # If there are no more crash ids to get, stop
        total = resp['total']
        if not hits or crashids_count >= total:
            break

        # Get the next page, but only as many results as we need
        params['_results_offset'] += MAX_PAGE
        params['_results_number'] = min(
            # MAX_PAGE is the maximum we can request
            MAX_PAGE,
            # The number of results Super Search can return to us that
            # it hasn't returned so far
            total - crashids_count
        )

    signature_data = results.values()

    # Save signature data to the db
    for item in signature_data:
        if self.config.dry_run:
            self.config.logger.info(
                'Inserting/updating signature (%s, %s, %s)',
                item['signature'], item['date'], item['build_id']
            )
        else:
            self.update_crashstats_signature(
                signature=item['signature'],
                report_date=item['date'],
                report_build=item['build_id'],
            )

    self.config.logger.info(
        'Inserted/updated %d signatures.', len(signature_data)
    )

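The paging logic in run() above is the subtle part: it advances the offset by a full page but shrinks the final request to only what remains. Here is a standalone sketch of that arithmetic, with a hypothetical fake_get standing in for api.get and MAX_PAGE assumed rather than taken from the real module:

MAX_PAGE = 1000  # assumed page size; the real value is defined elsewhere

def fetch_all_hits(fake_get, params):
    """Collect every hit, never requesting past the reported total."""
    hits = []
    while True:
        resp = fake_get(**params)
        page = resp['hits']
        hits.extend(page)
        if not page or len(hits) >= resp['total']:
            return hits
        params['_results_offset'] += MAX_PAGE
        # Ask only for the results not yet returned, capped at MAX_PAGE.
        params['_results_number'] = min(
            MAX_PAGE, resp['total'] - len(hits)
        )

Capping the last request this way avoids asking Elasticsearch for rows beyond the result set when the total is not a multiple of the page size.
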
def __init__(self, config, *args, **kwargs):
    super(TelemetryBotoS3CrashStorage, self).__init__(
        config, *args, **kwargs
    )
    self._all_fields = SuperSearchFields(config=self.config).get()

def test_index_crash_mapping_keys(self):
    """Test indexing a crash that has keys not in the mapping

    Indexing a crash that has keys that aren't in the mapping for the
    index should cause those keys to be removed from the crash.

    """
    # The test harness creates an index for this week and last week. So
    # let's create one for 4 weeks ago.
    now = utc_now()
    four_weeks_ago = now - timedelta(days=28)

    field = "user_comments"

    # We're going to use a mapping that's what SuperSearchFields gives
    # us, but remove the user_comments field.
    mappings = SuperSearchFields(context=self.es_context).get_mapping()
    doctype = self.es_context.get_doctype()
    del mappings[doctype]["properties"]["processed_crash"]["properties"][field]

    # Create the index for 4 weeks ago
    self.es_context.create_index(
        index_name=self.es_context.get_index_for_date(four_weeks_ago),
        mappings=mappings,
    )

    es_storage = ESCrashStorage(config=self.config)

    # Create a crash for this week and save it
    now_uuid = "00000000-0000-0000-0000-000000120408"
    raw_crash = {
        "BuildID": "20200506000000",
    }
    processed_crash = {
        field: "this week",
        "date_processed": date_to_string(now),
        "uuid": now_uuid,
    }

    es_storage.save_processed_crash(
        raw_crash=raw_crash,
        processed_crash=processed_crash,
    )

    # Create a crash for four weeks ago with the bum mapping and save it
    old_uuid = "11111111-1111-1111-1111-111111120408"
    raw_crash = {
        "BuildID": "20200506000000",
    }
    processed_crash = {
        field: "this week",
        "date_processed": date_to_string(now - timedelta(days=28)),
        "uuid": old_uuid,
    }

    es_storage.save_processed_crash(
        raw_crash=raw_crash,
        processed_crash=processed_crash,
    )

    self.es_context.refresh()

    # Retrieve the document from this week and verify it has the
    # user_comments field
    doc = self.conn.get(
        index=self.es_context.get_index_for_date(now),
        id=now_uuid,
    )
    assert field in doc["_source"]["processed_crash"]

    # Retrieve the document from four weeks ago and verify it doesn't
    # have the user_comments field
    doc = self.conn.get(
        index=self.es_context.get_index_for_date(four_weeks_ago),
        id=old_uuid,
    )
    assert field not in doc["_source"]["processed_crash"]

def get_fields(self, **kwargs):
    return SuperSearchFields(config=self.config).get_fields(**kwargs)

def update_field(self, **kwargs):
    return SuperSearchFields(config=self.config).update_field(**kwargs)

def setUp(self):
    super(IntegrationTestSuperSearchFields, self).setUp()
    self.api = SuperSearchFields(config=self.config)
    self.api.get_fields = lambda: copy.deepcopy(SUPERSEARCH_FIELDS)

def get_missing_fields(self):
    return SuperSearchFields(config=self.config).get_missing_fields()

def test_get_missing_fields(self):
    config = self.get_base_config(es_index='socorro_integration_test_%W')

    fake_mappings = [
        {
            'mappings': {
                config.elasticsearch.elasticsearch_doctype: {
                    'properties': {
                        # Add a bunch of unknown fields.
                        'field_z': {
                            'type': 'string'
                        },
                        'namespace1': {
                            'type': 'object',
                            'properties': {
                                'field_a': {
                                    'type': 'string'
                                },
                                'field_b': {
                                    'type': 'long'
                                }
                            }
                        },
                        'namespace2': {
                            'type': 'object',
                            'properties': {
                                'subspace1': {
                                    'type': 'object',
                                    'properties': {
                                        'field_b': {
                                            'type': 'long'
                                        }
                                    }
                                }
                            }
                        },
                        # Add a few known fields that should not appear.
                        'processed_crash': {
                            'type': 'object',
                            'properties': {
                                'signature': {
                                    'type': 'string'
                                },
                                'product': {
                                    'type': 'string'
                                },
                            }
                        }
                    }
                }
            }
        },
        {
            'mappings': {
                config.elasticsearch.elasticsearch_doctype: {
                    'properties': {
                        'namespace1': {
                            'type': 'object',
                            'properties': {
                                'subspace1': {
                                    'type': 'object',
                                    'properties': {
                                        'field_d': {
                                            'type': 'long'
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        },
    ]

    now = datetimeutil.utc_now()
    indices = []

    try:
        # Using "2" here means that an index will be missing, hence
        # testing that it swallows the subsequent error.
        for i in range(2):
            date = now - datetime.timedelta(weeks=i)
            index = date.strftime(config.elasticsearch.elasticsearch_index)
            mapping = fake_mappings[i % len(fake_mappings)]

            self.index_creator.create_index(index, mapping)
            indices.append(index)

        api = SuperSearchFields(config=config)
        missing_fields = api.get_missing_fields()
        expected = [
            'field_z',
            'namespace1.field_a',
            'namespace1.field_b',
            'namespace1.subspace1.field_d',
            'namespace2.subspace1.field_b',
        ]

        assert missing_fields['hits'] == expected
        assert missing_fields['total'] == 5
    finally:
        for index in indices:
            self.index_client.delete(index=index)
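
The expected list in that test is just the fake mappings flattened into dotted field paths. Here is an illustrative sketch of such a flattening; this is not Socorro's actual get_missing_fields implementation, only a way to see where names like 'namespace1.subspace1.field_d' come from:

def flatten_properties(properties, prefix=''):
    """Yield dotted paths for every leaf field in an ES mapping."""
    for name, spec in sorted(properties.items()):
        path = prefix + name
        if spec.get('type') == 'object' and 'properties' in spec:
            for sub in flatten_properties(spec['properties'], path + '.'):
                yield sub
        else:
            yield path

# For the first fake mapping above, this yields 'field_z',
# 'namespace1.field_a', 'namespace1.field_b', and
# 'namespace2.subspace1.field_b', plus the 'processed_crash.*' paths,
# which are already-known fields that get_missing_fields filters out.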