def get_extra_as_options(input_str):
    if '|' not in input_str:
        raise JobDescriptionError('No frequency and/or time defined')
    metadata = input_str.split('|')[1:]
    if len(metadata) == 1:
        if ':' in metadata[0]:
            frequency = '1d'
            time_ = metadata[0]
        else:
            frequency = metadata[0]
            time_ = None
    else:
        frequency, time_ = metadata
    n = Namespace()
    n.add_option(
        'frequency',
        doc='frequency',
        default=frequency,
        #from_string_converter=int
        exclude_from_print_conf=True,
        exclude_from_dump_conf=True
    )
    n.add_option(
        'time',
        doc='time',
        default=time_,
        exclude_from_print_conf=True,
        exclude_from_dump_conf=True
    )
    return n
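# Illustrative usage sketch (not part of the original module): assuming a job
# spec of the form "job|frequency" or "job|frequency|time", the returned
# Namespace exposes the parsed pieces as option defaults. The job name below
# is hypothetical.
def _demo_get_extra_as_options():
    opts = get_extra_as_options('jobs.CleanupJob|1d|02:00')
    assert opts.frequency.default == '1d'
    assert opts.time.default == '02:00'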
def logging_required_config(app_name):
    lc = Namespace()
    lc.namespace("logging")
    lc.logging.add_option("syslog_host", doc="syslog hostname", default="localhost")
    lc.logging.add_option("syslog_port", doc="syslog port", default=514)
    lc.logging.add_option(
        "syslog_facility_string",
        doc='syslog facility string ("user", "local0", etc)',
        default="user"
    )
    lc.logging.add_option(
        "syslog_line_format_string",
        doc="python logging system format for syslog entries",
        default="%s (pid {process}): "
                "{asctime} {levelname} - {threadName} - "
                "{message}" % app_name,
    )
    lc.logging.add_option(
        "syslog_error_logging_level",
        doc="logging level for the log file (10 - DEBUG, 20 "
            "- INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)",
        default=40,
    )
    lc.logging.add_option(
        "stderr_line_format_string",
        doc="python logging system format for logging to stderr",
        default="{asctime} {levelname} - {threadName} - "
                "{message}",
    )
    lc.logging.add_option(
        "stderr_error_logging_level",
        doc="logging level for the logging to stderr (10 - "
            "DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, "
            "50 - CRITICAL)",
        default=10,
    )
    return lc
def test_classes_in_namespaces_converter_5(self):
    n = Namespace()
    n.add_option(
        'kls_list',
        default=(
            'socorro.unittest.lib.test_converters.Alpha, '
            'socorro.unittest.lib.test_converters.Alpha, '
            'socorro.unittest.lib.test_converters.Alpha'
        ),
        from_string_converter=str_to_classes_in_namespaces_converter(
            '%(name)s_%(index)02d'
        )
    )

    cm = ConfigurationManager(
        n,
        [{
            'kls_list': (
                'socorro.unittest.lib.test_converters.Alpha, '
                'socorro.unittest.lib.test_converters.Beta, '
                'socorro.unittest.lib.test_converters.Beta, '
                'socorro.unittest.lib.test_converters.Alpha'
            ),
            'Alpha_00.a': 21,
            'Beta_01.b': 38,
        }]
    )
    config = cm.get_config()

    self.assertEqual(len(config.kls_list.subordinate_namespace_names), 4)
    for i, (a_class_name, a_class, ns_name) in \
            enumerate(config.kls_list.class_list):
        self.assertTrue(isinstance(a_class_name, str))
        self.assertEqual(a_class_name, a_class.__name__)
        self.assertEqual(ns_name, "%s_%02d" % (a_class_name, i))
def define_config():
    definition = Namespace()
    definition.add_option(
        name='devowel',
        default=False
    )
    return definition
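# Minimal usage sketch (assumed, not from the original source): a definition
# like this is normally handed to configman's ConfigurationManager, which
# overlays values from other sources onto the defaults.
def _demo_define_config():
    from configman import ConfigurationManager
    cm = ConfigurationManager(
        define_config(),
        values_source_list=[{'devowel': True}],  # override the False default
        argv_source=[],
    )
    assert cm.get_config().devowel is True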
def main(initial_app, values_source_list=None):
    if isinstance(initial_app, basestring):
        initial_app = class_converter(initial_app)

    # the only config parameter is a special one that refers to a class or
    # module that defines an application.  In order to qualify, a class must
    # have a constructor that accepts a DotDict derivative as the sole
    # input parameter.  It must also have a 'main' function that accepts no
    # parameters.  For a module to be acceptable, it must have a main
    # function that accepts a DotDict derivative as its input parameter.
    app_definition = Namespace()
    app_definition.add_option(
        'application',
        doc='the fully qualified module or class of the '
            'application',
        default=initial_app,
        from_string_converter=class_converter
    )
    try:
        app_name = initial_app.app_name  # this will be used as the default
                                         # b
        app_version = initial_app.app_version
        app_description = initial_app.app_description
    except AttributeError as x:
        raise AppDetailMissingError(str(x))
def test_classes_in_namespaces_converter_4(self):
    n = Namespace()
    n.add_option(
        'kls_list',
        default='configman.tests.test_converters.Alpha, '
                'configman.tests.test_converters.Alpha, '
                'configman.tests.test_converters.Alpha',
        from_string_converter=converters.classes_in_namespaces_converter(
            'kls%d',
            'kls',
            instantiate_classes=True
        )
    )

    cm = ConfigurationManager(
        n,
        [{'kls_list': 'configman.tests.test_converters.Alpha, '
                      'configman.tests.test_converters.Beta, '
                      'configman.tests.test_converters.Beta, '
                      'configman.tests.test_converters.Alpha'}]
    )
    config = cm.get_config()

    self.assertEqual(len(config.kls_list.subordinate_namespace_names), 4)
    for x in config.kls_list.subordinate_namespace_names:
        self.assertTrue(x in config)
        self.assertTrue('kls_instance' in config[x])
        self.assertTrue(isinstance(config[x].kls_instance, config[x].kls))
def test_for_mapping_long_doc_in_write_conf(self):
    n = self._some_namespaces()
    n = Namespace(doc='top')
    n.add_option(
        'aaa',
        'Default Value Goes In Here',
        'This time the documentation string is really long. So long '
        'that we have to write it on multiple lines.',
    )
    cm = ConfigurationManager(
        n,
        values_source_list=[],
    )
    out = StringIO()
    cm.write_conf(for_mapping, opener=stringIO_context_wrapper(out))
    received = out.getvalue()
    out.close()
    for line in received.splitlines():
        self.assertTrue(len(line) < 80, line)
    expected = """
# This time the documentation string is really long. So long that we have to
# write it on multiple lines. (default: 'Default Value Goes In Here')
aaa='Default Value Goes In Here'
    """.strip()
    self.assertEqual(received.strip(), expected)
def test_basic_crashstorage(self):
    required_config = Namespace()

    mock_logging = Mock()
    required_config.add_option('logger', default=mock_logging)

    config_manager = ConfigurationManager(
        [required_config],
        app_name='testapp',
        app_version='1.0',
        app_description='app description',
        values_source_list=[{
            'logger': mock_logging,
        }]
    )

    with config_manager.context() as config:
        crashstorage = CrashStorageBase(
            config,
            quit_check_callback=fake_quit_check
        )
        crashstorage.save_raw_crash({}, 'payload', 'ooid')
        crashstorage.save_processed({})
        self.assertRaises(NotImplementedError,
                          crashstorage.get_raw_crash, 'ooid')
        self.assertRaises(NotImplementedError,
                          crashstorage.get_raw_dump, 'ooid')
        self.assertRaises(NotImplementedError,
                          crashstorage.get_processed, 'ooid')
        self.assertRaises(NotImplementedError,
                          crashstorage.remove, 'ooid')
        self.assertRaises(StopIteration, crashstorage.new_crashes)
        crashstorage.close()
def test_no_rollback_exception_with_postgres(self):
    required_config = Namespace()
    required_config.add_option(
        'transaction_executor_class',
        default=TransactionExecutor,
        doc='a class that will execute transactions'
    )
    mock_logging = MockLogging()
    required_config.add_option('logger', default=mock_logging)

    config_manager = ConfigurationManager(
        [required_config],
        app_name='testapp',
        app_version='1.0',
        app_description='app description',
        values_source_list=[{'database_class': MockConnectionContext}],
    )
    with config_manager.context() as config:
        executor = config.transaction_executor_class(config)

        def mock_function(connection):
            assert isinstance(connection, MockConnection)
            raise NameError('crap!')

        self.assertRaises(NameError, executor, mock_function)

        self.assertEqual(commit_count, 0)
        self.assertEqual(rollback_count, 0)
        self.assertTrue(mock_logging.errors)
def test_write_with_imported_module_with_regex(self):
    required_config = Namespace()
    required_config.add_option(
        'identifier',
        doc='just an identifier re',
        default=r'[a-zA-Z][a-zA-Z0-9]*',
        from_string_converter=re.compile
    )
    cm = ConfigurationManager(
        required_config,
        values_source_list=[],
    )
    config = cm.get_config()
    s = StringIO()

    @contextlib.contextmanager
    def s_opener():
        yield s

    cm.write_conf('py', s_opener)
    generated_python_module_text = s.getvalue()
    expected = """# generated Python configman file

# just an identifier re
identifier = "[a-zA-Z][a-zA-Z0-9]*"
"""
    self.assertEqual(generated_python_module_text, expected)
def test_classes_in_namespaces_converter_4(self):
    n = Namespace()
    n.add_option(
        "kls_list",
        default="configman.tests.test_converters.Alpha, "
        "configman.tests.test_converters.Alpha, "
        "configman.tests.test_converters.Alpha",
        from_string_converter=converters.classes_in_namespaces_converter(
            "kls%d", "kls", instantiate_classes=True
        ),
    )

    cm = ConfigurationManager(
        n,
        [
            {
                "kls_list": "configman.tests.test_converters.Alpha, "
                "configman.tests.test_converters.Beta, "
                "configman.tests.test_converters.Beta, "
                "configman.tests.test_converters.Alpha"
            }
        ],
    )
    config = cm.get_config()

    self.assertEqual(len(config.kls_list.subordinate_namespace_names), 4)
    for x in config.kls_list.subordinate_namespace_names:
        self.assertTrue(x in config)
        self.assertTrue("kls_instance" in config[x])
        self.assertTrue(isinstance(config[x].kls_instance, config[x].kls))
def run(*crash_ids):
    definition_source = Namespace()
    definition_source.namespace('queuing')
    definition_source.queuing.add_option(
        'rabbitmq_reprocessing_class',
        default=SingleCrashMQCrashStorage,
    )
    config_dict = {
        'resource': {
            'rabbitmq': {
                'host': 'localhost',
                'port': '5672',
                'virtual_host': '/'
            }
        },
        'secrets': {
            'rabbitmq': {
                'rabbitmq_password': '******',
                'rabbitmq_user': '******'
            }
        }
    }
    config = configuration(
        definition_source=definition_source,
        values_source_list=[config_dict],
    )
    config.queuing.logger = logger
    config.logger = logger
    storage = SingleCrashMQCrashStorage(config=config['queuing'])
    for crash_id in crash_ids:
        print(storage.submit(crash_id))
    return 0
def test_basic_usage_with_postgres(self):
    required_config = Namespace()
    required_config.add_option(
        'transaction_executor_class',
        #default=TransactionExecutorWithBackoff,
        default=TransactionExecutor,
        doc='a class that will execute transactions'
    )
    required_config.add_option(
        'database_class',
        default=MockConnectionContext,
        from_string_converter=class_converter
    )
    config_manager = ConfigurationManager(
        [required_config],
        app_name='testapp',
        app_version='1.0',
        app_description='app description',
        values_source_list=[],
    )
    with config_manager.context() as config:
        mocked_context = config.database_class(config)
        executor = config.transaction_executor_class(config, mocked_context)
        _function_calls = []  # some mutable

        def mock_function(connection):
            assert isinstance(connection, MockConnection)
            _function_calls.append(connection)

        executor(mock_function)
        self.assertTrue(_function_calls)
        self.assertEqual(commit_count, 1)
        self.assertEqual(rollback_count, 0)
def test_classes_in_namespaces_converter_4(self):
    n = Namespace()
    n.add_option(
        'kls_list',
        default=(
            'socorro.unittest.lib.test_converters.Alpha, '
            'socorro.unittest.lib.test_converters.Alpha, '
            'socorro.unittest.lib.test_converters.Alpha'
        ),
        from_string_converter=str_to_classes_in_namespaces_converter(
            '%(name)s_%(index)02d'
        )
    )

    cm = ConfigurationManager(
        n,
        [{
            'kls_list': (
                'socorro.unittest.lib.test_converters.Alpha, '
                'socorro.unittest.lib.test_converters.Beta, '
                'socorro.unittest.lib.test_converters.Beta, '
                'socorro.unittest.lib.test_converters.Alpha'
            ),
            'Alpha_00.a': 21,
            'Beta_01.b': 38,
        }]
    )
    config = cm.get_config()

    self.assertEqual(len(config.kls_list.subordinate_namespace_names), 4)
    for x in config.kls_list.subordinate_namespace_names:
        self.assertTrue(x in config)
    self.assertEqual(config.Alpha_00.a, 21)
    self.assertEqual(config.Beta_01.b, 38)
def main(initial_app, values_source_list=None, config_path=None):
    if isinstance(initial_app, basestring):
        initial_app = class_converter(initial_app)

    if config_path is None:
        default = './config'
        config_path = os.environ.get(
            'DEFAULT_SOCORRO_CONFIG_PATH',
            default
        )
        if config_path != default:
            # you tried to set it, then it must be a valid directory
            if not os.path.isdir(config_path):
                raise IOError('%s is not a valid directory' % config_path)

    # the only config parameter is a special one that refers to a class or
    # module that defines an application.  In order to qualify, a class must
    # have a constructor that accepts a DotDict derivative as the sole
    # input parameter.  It must also have a 'main' function that accepts no
    # parameters.  For a module to be acceptable, it must have a main
    # function that accepts a DotDict derivative as its input parameter.
    app_definition = Namespace()
    app_definition.add_option(
        'application',
        doc='the fully qualified module or class of the application',
        default=initial_app,
        from_string_converter=class_converter
    )
    try:
        app_name = initial_app.app_name  # this will be used as the default
                                         # b
        app_version = initial_app.app_version
        app_description = initial_app.app_description
    except AttributeError as x:
        raise AppDetailMissingError(str(x))
def test_poly_crash_storage_immutability_deeper(self):
    n = Namespace()
    n.add_option(
        'storage',
        default=PolyCrashStorage,
    )
    n.add_option(
        'logger',
        default=mock.Mock(),
    )
    value = {
        'storage_classes': (
            'socorro.unittest.external.test_crashstorage_base'
            '.MutatingProcessedCrashCrashStorage'
        ),
    }
    cm = ConfigurationManager(n, values_source_list=[value])
    with cm.context() as config:
        raw_crash = {'ooid': '12345'}
        dump = '12345'
        processed_crash = {
            'foo': DotDict({'other': 'thing'}),
            'bar': SocorroDotDict({'something': 'else'}),
        }
        poly_store = config.storage(config)

        poly_store.save_raw_and_processed(
            raw_crash,
            dump,
            processed_crash,
            'n'
        )
        eq_(processed_crash['foo']['other'], 'thing')
        eq_(processed_crash['bar']['something'], 'else')
def test_basic_crashstorage(self):
    required_config = Namespace()

    mock_logging = Mock()
    required_config.add_option("logger", default=mock_logging)
    required_config.update(CrashStorageBase.required_config)

    config_manager = ConfigurationManager(
        [required_config],
        app_name="testapp",
        app_version="1.0",
        app_description="app description",
        values_source_list=[{"logger": mock_logging}],
        argv_source=[],
    )

    with config_manager.context() as config:
        crashstorage = CrashStorageBase(config, quit_check_callback=fake_quit_check)
        crashstorage.save_raw_crash({}, "payload", "ooid")
        crashstorage.save_processed({})
        assert_raises(NotImplementedError, crashstorage.get_raw_crash, "ooid")
        assert_raises(NotImplementedError, crashstorage.get_raw_dump, "ooid")
        assert_raises(NotImplementedError, crashstorage.get_unredacted_processed, "ooid")
        assert_raises(NotImplementedError, crashstorage.remove, "ooid")
        eq_(crashstorage.new_crashes(), [])
        crashstorage.close()
def test_benchmarking_crashstore(self): required_config = Namespace() mock_logging = Mock() required_config.add_option("logger", default=mock_logging) required_config.update(BenchmarkingCrashStorage.get_required_config()) fake_crash_store = Mock() config_manager = ConfigurationManager( [required_config], app_name="testapp", app_version="1.0", app_description="app description", values_source_list=[ {"logger": mock_logging, "wrapped_crashstore": fake_crash_store, "benchmark_tag": "test"} ], argv_source=[], ) with config_manager.context() as config: crashstorage = BenchmarkingCrashStorage(config, quit_check_callback=fake_quit_check) crashstorage.start_timer = lambda: 0 crashstorage.end_timer = lambda: 1 fake_crash_store.assert_called_with(config, fake_quit_check) crashstorage.save_raw_crash({}, "payload", "ooid") crashstorage.wrapped_crashstore.save_raw_crash.assert_called_with({}, "payload", "ooid") mock_logging.debug.assert_called_with("%s save_raw_crash %s", "test", 1) mock_logging.debug.reset_mock() crashstorage.save_processed({}) crashstorage.wrapped_crashstore.save_processed.assert_called_with({}) mock_logging.debug.assert_called_with("%s save_processed %s", "test", 1) mock_logging.debug.reset_mock() crashstorage.get_raw_crash("uuid") crashstorage.wrapped_crashstore.get_raw_crash.assert_called_with("uuid") mock_logging.debug.assert_called_with("%s get_raw_crash %s", "test", 1) mock_logging.debug.reset_mock() crashstorage.get_raw_dump("uuid") crashstorage.wrapped_crashstore.get_raw_dump.assert_called_with("uuid") mock_logging.debug.assert_called_with("%s get_raw_dump %s", "test", 1) mock_logging.debug.reset_mock() crashstorage.get_raw_dumps("uuid") crashstorage.wrapped_crashstore.get_raw_dumps.assert_called_with("uuid") mock_logging.debug.assert_called_with("%s get_raw_dumps %s", "test", 1) mock_logging.debug.reset_mock() crashstorage.get_raw_dumps_as_files("uuid") crashstorage.wrapped_crashstore.get_raw_dumps_as_files.assert_called_with("uuid") mock_logging.debug.assert_called_with("%s get_raw_dumps_as_files %s", "test", 1) mock_logging.debug.reset_mock() crashstorage.get_unredacted_processed("uuid") crashstorage.wrapped_crashstore.get_unredacted_processed.assert_called_with("uuid") mock_logging.debug.assert_called_with("%s get_unredacted_processed %s", "test", 1) mock_logging.debug.reset_mock()
def main(app_object=None):
    if isinstance(app_object, six.string_types):
        app_object = class_converter(app_object)

    # the only config parameter is a special one that refers to a class or
    # module that defines an application.  In order to qualify, a class must
    # have a constructor that accepts a DotDict derivative as the sole
    # input parameter.  It must also have a 'main' function that accepts no
    # parameters.  For a module to be acceptable, it must have a main
    # function that accepts a DotDict derivative as its input parameter.
    app_definition = Namespace()
    app_definition.add_option(
        'application',
        doc='the fully qualified module or class of the '
            'application',
        default=app_object,
        from_string_converter=class_converter
    )
    app_name = getattr(app_object, 'app_name', 'unknown')
    app_version = getattr(app_object, 'app_version', '0.0')
    app_description = getattr(app_object, 'app_description', 'no idea')

    # create an iterable collection of value sources
    # the order is important as these will supply values for the sources
    # defined in the_definition_source.  The values will be overlain in turn.
    # First the os.environ values will be applied.  Then any values from an
    # ini file parsed by getopt.  Finally any values supplied on the command
    # line will be applied.
    value_sources = (
        ConfigFileFutureProxy,  # alias for allowing the user to specify a
                                # config file on the command line
        environment,            # alias for os.environ
        command_line            # alias for getopt
    )

    # set up the manager with the definitions and values.
    # it isn't necessary to provide the app_name because the app_object
    # passed in or loaded by the ConfigurationManager will already have
    # that information.
    config_manager = ConfigurationManager(
        app_definition,
        value_sources,
        app_name=app_name,
        app_version=app_version,
        app_description=app_description,
    )
    config = config_manager.get_config()

    app_object = config.admin.application

    if isinstance(app_object, type):
        # invocation of the app if the app_object was a class
        instance = app_object(config)
        instance.main()
    elif inspect.ismodule(app_object):
        # invocation of the app if the app_object was a module
        app_object.main(config)
    elif inspect.isfunction(app_object):
        # invocation of the app if the app_object was a function
        app_object(config)
def test_operation_error_with_postgres_with_backoff_with_rollback(self): required_config = Namespace() required_config.add_option( 'transaction_executor_class', default=TransactionExecutorWithBackoff, #default=TransactionExecutor, doc='a class that will execute transactions' ) mock_logging = MockLogging() required_config.add_option('logger', default=mock_logging) config_manager = ConfigurationManager( [required_config], app_name='testapp', app_version='1.0', app_description='app description', values_source_list=[{'database_class': MockConnectionContext, 'backoff_delays': [2, 4, 6, 10, 15]}], ) with config_manager.context() as config: executor = config.transaction_executor_class(config) _function_calls = [] # some mutable _sleep_count = [] def mock_function(connection): assert isinstance(connection, MockConnection) connection.transaction_status = \ psycopg2.extensions.TRANSACTION_STATUS_INTRANS _function_calls.append(connection) # the default sleep times are going to be, # 2, 4, 6, 10, 15 # so after 2 + 4 + 6 + 10 + 15 seconds # all will be exhausted if sum(_sleep_count) < sum([2, 4, 6, 10, 15]): raise psycopg2.OperationalError('Arh!') def mock_sleep(n): _sleep_count.append(n) # monkey patch the sleep function from inside transaction_executor _orig_sleep = socorro.database.transaction_executor.time.sleep socorro.database.transaction_executor.time.sleep = mock_sleep try: executor(mock_function) self.assertTrue(_function_calls) self.assertEqual(commit_count, 1) self.assertEqual(rollback_count, 5) self.assertTrue(mock_logging.warnings) self.assertEqual(len(mock_logging.warnings), 5) self.assertTrue(len(_sleep_count) > 10) finally: socorro.database.transaction_executor.time.sleep = _orig_sleep
def setup_configman_namespace(self):
    n = Namespace()
    n.add_option(
        'alpha',
        default=3,
        doc='the first parameter',
        is_argument=True
    )
    n.add_option(
        'beta',
        default='the second',
        doc='the first parameter',
        short_form='b',
    )
    n.add_option(
        'gamma',
        default="1 2 3",
        from_string_converter=quote_stripping_list_of_ints,
        to_string_converter=partial(
            list_to_str,
            delimiter=' '
        ),
        secret=True,
    )
    n.add_option(
        'delta',
        default=False,
        from_string_converter=boolean_converter
    )
    return n
def get_standard_config_manager(
    more_definitions=None,
    overrides=None,
):
    # MOCKED CONFIG DONE HERE
    required_config = Namespace()
    required_config.add_option(
        'logger',
        default=SilentFakeLogger(),
        doc='a logger',
    )
    required_config.add_option(
        'executor_identity',
        default=Mock()
    )

    if isinstance(more_definitions, Sequence):
        definitions = [required_config]
        definitions.extend(more_definitions)
    elif more_definitions is not None:
        definitions = [required_config, more_definitions]
    else:
        definitions = [required_config]

    local_overrides = [
        environment,
    ]

    if isinstance(overrides, Sequence):
        overrides.extend(local_overrides)
    elif overrides is not None:
        # build a new list; list.extend returns None, so don't assign from it
        overrides = [overrides] + local_overrides
    else:
        overrides = local_overrides

    config_manager = ConfigurationManager(
        definitions,
        values_source_list=overrides,
        app_name='test-crontabber',
        app_description=__doc__,
        argv_source=[]
    )
    # very useful debug
    # import contextlib
    # import sys
    # @contextlib.contextmanager
    # def stdout_opener():
    #     yield sys.stdout
    # config_manager.write_conf('conf', stdout_opener)
    return config_manager
def logging_required_config(app_name): lc = Namespace() lc.namespace('logging') lc.logging.add_option( 'syslog_host', doc='syslog hostname', default='localhost', reference_value_from='resource.logging', ) lc.logging.add_option( 'syslog_port', doc='syslog port', default=514, reference_value_from='resource.logging', ) lc.logging.add_option( 'syslog_facility_string', doc='syslog facility string ("user", "local0", etc)', default='user', reference_value_from='resource.logging', ) lc.logging.add_option( 'syslog_line_format_string', doc='python logging system format for syslog entries', default='%s (pid {process}): ' '{asctime} {levelname} - {threadName} - ' '{message}' % app_name, reference_value_from='resource.logging', ) lc.logging.add_option( 'syslog_error_logging_level', doc='logging level for the log file (10 - DEBUG, 20 ' '- INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)', default=40, reference_value_from='resource.logging', ) lc.logging.add_option( 'stderr_line_format_string', doc='python logging system format for logging to stderr', default='{asctime} {levelname} - {threadName} - ' '{message}', reference_value_from='resource.logging', ) lc.logging.add_option( 'stderr_error_logging_level', doc='logging level for the logging to stderr (10 - ' 'DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, ' '50 - CRITICAL)', default=10, reference_value_from='resource.logging', ) return lc
def define_config():
    definition = Namespace()
    definition.add_option(
        name='redmine-root',
        doc='Root url of redmine server',
        short_form='r'
    )
    definition.add_option(
        name='redmine-apikey',
        doc='Redmine API key',
        short_form='a'
    )
    return definition
def _common_config_setup(self):
    mock_logging = Mock()
    required_config = Namespace()
    required_config.namespace("hbase")
    required_config.hbase.hbase_class = crashstorage.HBaseCrashStorage
    required_config.hbase.add_option("logger", default=mock_logging)
    config_manager = ConfigurationManager(
        [required_config],
        app_name="testapp",
        app_version="1.0",
        app_description="app description",
        values_source_list=[{"hbase": {"logger": mock_logging}}],
    )
    return config_manager
def _get_config_manager():
    required_config = Namespace()

    webapi = Namespace()
    webapi.search_default_date_range = 7
    webapi.search_maximum_date_range = 365
    required_config.webapi = webapi

    config_manager = ConfigurationManager(
        [required_config],
        app_name="testapp",
        app_version="1.0",
        app_description="app description",
        argv_source=[]
    )
    return config_manager
def _common_config_setup(self):
    mock_logging = Mock()
    required_config = Namespace()
    required_config.namespace("filesystem")
    required_config.filesystem.filesystem_class = crashstorage.FSRadixTreeStorage
    required_config.filesystem.add_option("logger", default=mock_logging)
    config_manager = ConfigurationManager(
        [required_config],
        app_name="testapp",
        app_version="1.0",
        app_description="app description",
        values_source_list=[{"filesystem": {"logger": mock_logging, "fs_root": self.fs_root}}],
    )
    return config_manager
def _get_config_manager():
    required_config = Namespace()
    required_config.search_default_date_range = 7
    required_config.search_maximum_date_range = 365

    config_manager = ConfigurationManager(
        [required_config],
        app_name='testapp',
        app_version='1.0',
        app_description='app description',
        argv_source=[]
    )
    return config_manager
def define_config():
    definition = Namespace()
    definition.add_option(
        name='devowel',
        default=False,
        doc='Removes all vowels (including Y)',
        short_form='d'
    )
    definition.add_option(
        name='file',
        default='',
        doc='file name for the input text',
        short_form='f'
    )
    return definition
def _get_config_manager():
    required_config = Namespace()

    webapi = Namespace()
    webapi.search_default_date_range = 7
    required_config.webapi = webapi

    config_manager = ConfigurationManager(
        [required_config],
        app_name='testapp',
        app_version='1.0',
        app_description='app description',
    )
    return config_manager
class IndexCleaner(RequiredConfig): """Delete Elasticsearch indices from our databases.""" required_config = Namespace() required_config.add_option( 'retention_policy', default=26, doc='Number of weeks to keep an index alive. ', ) required_config.namespace('elasticsearch') required_config.elasticsearch.add_option( 'elasticsearch_class', default='socorro.external.es.connection_context.ConnectionContext', from_string_converter=class_converter, reference_value_from='resource.elasticsearch', ) required_config.elasticsearch.add_option( 'elasticsearch_index_regex', default='^socorro[0-9]{6}$', reference_value_from='resource.elasticsearch', ) def __init__(self, config): super().__init__() self.config = config def delete_indices(self, predicate=None): """Delete crash indices that match the given predicate. :arg callable predicate: A callable of the form ``predicate(index)``, where ``index`` is a string containing the name of the index. If the callable returns true, the index will be deleted. The default is None, which deletes all crash indices. :returns: List of indexes that were deleted """ es_class = self.config.elasticsearch.elasticsearch_class( self.config.elasticsearch) index_client = es_class.indices_client() status = index_client.status() indices = status['indices'].keys() aliases = index_client.get_aliases() deleted_indices = [] for index in indices: # Some indices look like 'socorro%Y%W_%Y%M%d', but they are # aliased to the expected format of 'socorro%Y%W'. In such cases, # replace the index with the alias. if index in aliases and 'aliases' in aliases[index]: index_aliases = list(aliases[index]['aliases'].keys()) if index_aliases: index = index_aliases[0] if not re.match( self.config.elasticsearch.elasticsearch_index_regex, index): # This index doesn't look like a crash index, let's skip it. continue if predicate is None or predicate(index): index_client.delete(index) deleted_indices.append(index) return deleted_indices def delete_old_indices(self): self.delete_indices(self.is_index_old) def is_index_old(self, index): now = utc_now() policy_delay = datetime.timedelta(weeks=self.config.retention_policy) time_limit = (now - policy_delay).replace(tzinfo=None) # strptime ignores week numbers if a day isn't specified, so we append # '-1' and '-%w' to specify Monday as the day. index_date = datetime.datetime.strptime( index + '-1', self.config.elasticsearch.elasticsearch_index + '-%w') return index_date < time_limit
def test_fallback_crash_storage(self):
    n = Namespace()
    n.add_option(
        'storage',
        default=FallbackCrashStorage,
    )
    n.add_option(
        'logger',
        default=mock.Mock(),
    )
    value = {
        'primary.storage_class':
            'socorro.unittest.external.test_crashstorage_base.A',
        'fallback.storage_class':
            'socorro.unittest.external.test_crashstorage_base.B',
    }
    cm = ConfigurationManager(n, values_source_list=[value])
    with cm.context() as config:
        self.assertEqual(config.primary.storage_class.foo, 'a')
        self.assertEqual(config.fallback.storage_class.foo, 'b')

        raw_crash = {'ooid': ''}
        crash_id = '1498dee9-9a45-45cc-8ec8-71bb62121203'
        dump = '12345'
        processed_crash = {'ooid': '', 'product': 17}
        fb_store = config.storage(config)

        # save_raw tests
        fb_store.primary_store.save_raw_crash = Mock()
        fb_store.fallback_store.save_raw_crash = Mock()
        fb_store.save_raw_crash(raw_crash, dump, crash_id)
        fb_store.primary_store.save_raw_crash.assert_called_with(
            raw_crash, dump, crash_id)
        self.assertEqual(fb_store.fallback_store.save_raw_crash.call_count, 0)

        fb_store.primary_store.save_raw_crash = Mock()
        fb_store.primary_store.save_raw_crash.side_effect = Exception('!')
        fb_store.save_raw_crash(raw_crash, dump, crash_id)
        fb_store.primary_store.save_raw_crash.assert_called_with(
            raw_crash, dump, crash_id)
        fb_store.fallback_store.save_raw_crash.assert_called_with(
            raw_crash, dump, crash_id)

        fb_store.fallback_store.save_raw_crash = Mock()
        fb_store.fallback_store.save_raw_crash.side_effect = Exception('!')
        self.assertRaises(PolyStorageError, fb_store.save_raw_crash,
                          raw_crash, dump, crash_id)
        fb_store.primary_store.save_raw_crash.assert_called_with(
            raw_crash, dump, crash_id)
        fb_store.fallback_store.save_raw_crash.assert_called_with(
            raw_crash, dump, crash_id)

        # save_processed tests
        fb_store.primary_store.save_processed = Mock()
        fb_store.fallback_store.save_processed = Mock()
        fb_store.save_processed(processed_crash)
        fb_store.primary_store.save_processed.assert_called_with(
            processed_crash)
        self.assertEqual(fb_store.fallback_store.save_processed.call_count, 0)

        fb_store.primary_store.save_processed = Mock()
        fb_store.primary_store.save_processed.side_effect = Exception('!')
        fb_store.save_processed(processed_crash)
        fb_store.primary_store.save_processed.assert_called_with(
            processed_crash)
        fb_store.fallback_store.save_processed.assert_called_with(
            processed_crash)

        fb_store.fallback_store.save_processed = Mock()
        fb_store.fallback_store.save_processed.side_effect = Exception('!')
        self.assertRaises(PolyStorageError, fb_store.save_processed,
                          processed_crash)
        fb_store.primary_store.save_processed.assert_called_with(
            processed_crash)
        fb_store.fallback_store.save_processed.assert_called_with(
            processed_crash)

        # close tests
        fb_store.primary_store.close = Mock()
        fb_store.fallback_store.close = Mock()
        fb_store.close()
        fb_store.primary_store.close.assert_called_with()
        fb_store.fallback_store.close.assert_called_with()

        fb_store.primary_store.close = Mock()
        fb_store.fallback_store.close = Mock()
        fb_store.fallback_store.close.side_effect = NotImplementedError()
        fb_store.close()
        fb_store.primary_store.close.assert_called_with()
        fb_store.fallback_store.close.assert_called_with()

        fb_store.primary_store.close = Mock()
        fb_store.primary_store.close.side_effect = Exception('!')
        fb_store.close()
        fb_store.primary_store.close.assert_called_with()
        fb_store.fallback_store.close.assert_called_with()

        fb_store.fallback_store.close = Mock()
        fb_store.fallback_store.close.side_effect = Exception('!')
        self.assertRaises(PolyStorageError, fb_store.close)
        fb_store.primary_store.close.assert_called_with()
        fb_store.fallback_store.close.assert_called_with()
class BugzillaCronApp(PostgresTransactionManagedCronApp):
    app_name = 'bugzilla-associations'
    app_description = 'Bugzilla Associations'
    app_version = '0.1'

    required_config = Namespace()
    required_config.add_option(
        'query',
        default=_URL,
        doc='Explanation of the option')
    required_config.add_option(
        'days_into_past',
        default=0,
        doc='number of days to look into the past for bugs (0 - use last '
            'run time)')

    def run(self, connection):
        # record_associations
        logger = self.config.logger

        try:
            # KeyError if it's never run successfully
            # TypeError if self.job_information is None
            last_run = self.job_information['last_success']
        except (KeyError, TypeError):
            last_run = utc_now() - datetime.timedelta(
                days=self.config.days_into_past and
                self.config.days_into_past or 30)
        last_run_formatted = last_run.strftime('%Y-%m-%d')
        query = self.config.query % last_run_formatted
        cursor = connection.cursor()
        for bug_id, status, resolution, short_desc, signature_set in \
                self._iterator(query):
            logger.debug(
                "bug %s (%s, %s) %s: %s",
                bug_id, status, resolution, short_desc, signature_set)
            if not signature_set:
                cursor.execute("""
                    DELETE FROM bugs WHERE id = %s
                """, (bug_id,))
                continue
            useful = False
            insert_made = False
            try:
                status_db, resolution_db, short_desc_db = singleRowSql(
                    cursor,
                    """
                    SELECT status, resolution, short_desc
                    FROM bugs
                    WHERE id = %s
                    """,
                    (bug_id,))
                if (status_db != status or resolution_db != resolution
                        or short_desc_db != short_desc):
                    cursor.execute("""
                        UPDATE bugs
                        SET status = %s, resolution = %s, short_desc = %s
                        WHERE id = %s""",
                        (status, resolution, short_desc, bug_id))
                    logger.info(
                        "bug status updated: %s - %s, %s",
                        bug_id, status, resolution)
                    useful = True

                cursor.execute("""
                    SELECT signature FROM bug_associations
                    WHERE bug_id = %s""", (bug_id,))
                signatures_db = [x[0] for x in cursor.fetchall()]

                for signature in signatures_db:
                    if signature not in signature_set:
                        cursor.execute("""
                            DELETE FROM bug_associations
                            WHERE signature = %s and bug_id = %s""",
                            (signature, bug_id))
                        logger.info(
                            'association removed: %s - "%s"',
                            bug_id, signature)
                        useful = True
            except SQLDidNotReturnSingleRow:
                cursor.execute("""
                    INSERT INTO bugs
                    (id, status, resolution, short_desc)
                    VALUES (%s, %s, %s, %s)""",
                    (bug_id, status, resolution, short_desc))
                insert_made = True
                signatures_db = []

            for signature in signature_set:
                if signature not in signatures_db:
                    if self._has_signature_report(signature, cursor):
                        cursor.execute("""
                            INSERT INTO bug_associations (signature, bug_id)
                            VALUES (%s, %s)""", (signature, bug_id))
                        logger.info(
                            'new association: %s - "%s"', bug_id, signature)
                        useful = True
                    else:
                        logger.info(
                            'rejecting association (no reports with this '
                            'signature): %s - "%s"', bug_id, signature)

            if useful:
                connection.commit()
                if insert_made:
                    logger.info(
                        'new bug: %s - %s, %s, "%s"',
                        bug_id, status, resolution, short_desc)
            else:
                connection.rollback()
                if insert_made:
                    logger.info(
                        'rejecting bug (no useful information): '
                        '%s - %s, %s, "%s"',
                        bug_id, status, resolution, short_desc)
                else:
                    logger.info(
                        'skipping bug (no new information): '
                        '%s - %s, %s, "%s"',
                        bug_id, status, resolution, short_desc)

    def _iterator(self, query):
        ##assert query.startswith('file://'), query## DEBUGGGINGG
        opener = urllib2.urlopen
        for report in csv.DictReader(opener(query)):
            yield (
                int(report['bug_id']),
                report['bug_status'],
                report['resolution'],
                report['short_desc'],
                self._signatures_found(report['cf_crash_signature']))

    def _signatures_found(self, signature):
        if not signature:
            return set()
        set_ = set()
        try:
            start = 0
            end = 0
            while True:
                start = signature.index("[@", end) + 2
                end = signature.index("]", end + 1)
                set_.add(signature[start:end].strip())
        except ValueError:
            # raised when index cannot find another signature; ignore
            pass
        return set_

    def _has_signature_report(self, signature, cursor):
        try:
            singleRowSql(cursor, """
                SELECT 1 FROM reports
                WHERE signature = %s LIMIT 1""", (signature,))
            return True
        except SQLDidNotReturnSingleRow:
            return False
class CSignatureTool(CSignatureToolBase): """This is a C/C++ signature generation class that gets its initialization from configuration.""" required_config = Namespace() required_config.add_option( 'signature_sentinels', doc='a list of frame signatures that should always be considered top ' 'of the stack if present in the stack', default="""['_purecall', ('mozilla::ipc::RPCChannel::Call(IPC::Message*, IPC::Message*)', lambda x: 'CrashReporter::CreatePairedMinidumps(void*, ' 'unsigned long, nsAString_internal*, nsILocalFile**, ' 'nsILocalFile**)' in x ), 'Java_org_mozilla_gecko_GeckoAppShell_reportJavaCrash', 'google_breakpad::ExceptionHandler::HandleInvalidParameter' '(wchar_t const*, wchar_t const*, wchar_t const*, unsigned ' 'int, unsigned int)' ]""", from_string_converter=eval) required_config.add_option( 'irrelevant_signature_re', doc='a regular expression matching frame signatures that should be ' 'ignored when generating an overall signature', default="""'|'.join([ '@0x[0-9a-fA-F]{2,}', '@0x[1-9a-fA-F]', 'ashmem', 'app_process@0x.*', 'core\.odex@0x.*', '_CxxThrowException', 'dalvik-heap', 'dalvik-jit-code-cache', 'dalvik-LinearAlloc', 'dalvik-mark-stack', 'data@app@org\.mozilla\.f.*-\d\.apk@classes\.dex@0x.*', 'framework\.odex@0x.*', 'google_breakpad::ExceptionHandler::HandleInvalidParameter.*', 'KiFastSystemCallRet', 'libandroid_runtime\.so@0x.*', 'libbinder\.so@0x.*', 'libc\.so@.*', 'libc-2\.5\.so@.*', 'libEGL\.so@.*', 'libdvm\.so\s*@\s*0x.*', 'libgui\.so@0x.*', 'libicudata.so@.*', 'libMali\.so@0x.*', 'libutils\.so@0x.*', 'libz\.so@0x.*', 'linux-gate\.so@0x.*', 'mnt@asec@org\.mozilla\.f.*-\d@pkg\.apk@classes\.dex@0x.*', 'MOZ_Assert', 'MOZ_Crash', 'mozcrt19.dll@0x.*', 'mozilla::ipc::RPCChannel::Call\(IPC::Message\*, IPC::Message\*\)', '_NSRaiseError', '(Nt|Zw)WaitForSingleObject(Ex)?', '(Nt|Zw)WaitForMultipleObjects(Ex)?', 'nvmap@0x.*', 'org\.mozilla\.f.*-\d\.apk@0x.*', 'RaiseException', 'RtlpAdjustHeapLookasideDepth', 'system@framework@.*\.jar@classes\.dex@0x.*', '___TERMINATING_DUE_TO_UNCAUGHT_EXCEPTION___', 'WaitForSingleObjectExImplementation', 'WaitForMultipleObjectsExImplementation', 'RealMsgWaitFor.*' '_ZdlPv', 'zero', ])""", from_string_converter=eval) required_config.add_option( 'prefix_signature_re', doc='a regular expression matching frame signatures that should always ' 'be coupled with the following frame signature when generating an ' 'overall signature', default="""'|'.join([ '@0x0', '.*CrashAtUnhandlableOOM', 'Abort', '.*abort', '_alloca_probe.*', '__android_log_assert', 'arena_.*', 'BaseGetNamedObjectDirectory', '.*calloc', 'cert_.*', 'CERT_.*', 'CFRelease', '_chkstk', 'CrashInJS', '__delayLoadHelper2', 'dlmalloc', 'dlmalloc_trim', 'dvm.*', 'EtwEventEnabled', 'extent_.*', 'fastcopy_I', 'fastzero_I', '_files_getaddrinfo', '.*free', 'GCGraphBuilder::NoteXPCOMChild', 'getanswer', 'huge_dalloc', 'ialloc', 'imalloc', 'init_library', 'isalloc', 'je_malloc', 'jemalloc_crash', 'je_realloc', 'JNI_CreateJavaVM', '_JNIEnv.*', 'JNI_GetCreatedJavaVM.*', 'js::AutoCompartment::AutoCompartment.*', 'JSAutoCompartment::JSAutoCompartment.*', 'JS_DHashTableEnumerate', 'JS_DHashTableOperate', 'kill', '__libc_android_abort', 'libobjc.A.dylib@0x1568.', '(libxul\.so|xul\.dll|XUL)@0x.*', 'LL_.*', 'malloc', '_MD_.*', 'memcmp', '__memcmp16', 'memcpy', 'memmove', 'memset', 'mozalloc_abort.*', 'mozalloc_handle_oom', 'moz_free', 'mozilla::AndroidBridge::AutoLocalJNIFrame::~AutoLocalJNIFrame', 'mozilla::ipc::RPCChannel::Call', 'mozilla::ipc::RPCChannel::CxxStackFrame::CxxStackFrame', 
'mozilla::ipc::RPCChannel::EnteredCxxStack', 'mozilla::ipc::RPCChannel::Send', 'mozilla.*FatalError', 'moz_xmalloc', 'moz_xrealloc', 'NP_Shutdown', 'nsACString_internal::Assign.*', 'nsCOMPtr.*', 'NS_ABORT_OOM.*', 'NS_DebugBreak.*', '[-+]\[NSException raise(:format:(arguments:)?)?\]', 'nsObjCExceptionLogAbort(\(.*?\)){0,1}', 'nsRefPtr.*', 'NSS.*', 'nss.*', 'nsTArray<.*', 'nsTArray_base<.*', 'NtUser.*', 'objc_exception_throw', 'objc_msgSend', 'operator new\([^,\)]+\)', 'PL_.*', 'port_.*', 'PORT_.*', '_PR_.*', 'PR_.*', 'pthread_mutex_lock', '_purecall', 'raise', 'realloc', 'recv', '_RTC_Terminate', 'Rtl.*', '_Rtl.*', '__Rtl.*', 'SEC_.*Item', 'seckey_.*', 'SECKEY_.*', '__security_check_cookie', 'send', 'setjmp', 'sigblock', 'sigprocmask', 'SocketAccept', 'SocketAcceptRead', 'SocketAvailable', 'SocketAvailable64', 'SocketBind', 'SocketClose', 'SocketConnect', 'SocketGetName', 'SocketGetPeerName', 'SocketListen', 'SocketPoll', 'SocketRead', 'SocketRecv', 'SocketSend', 'SocketShutdown', 'SocketSync', 'SocketTransmitFile', 'SocketWrite', 'SocketWritev', 'ssl_.*', 'SSL_.*', 'strcat', 'ssl3_.*', 'strchr', 'strcmp', 'strcpy', '.*strdup', 'strlen', 'strncpy', 'strzcmp16', 'strstr', '__swrite', 'TouchBadMemory', '_VEC_memcpy', '_VEC_memzero', '.*WaitFor.*', 'wcslen', '__wrap_realloc', 'WSARecv.*', 'WSASend.*', '_ZdaPvRKSt9nothrow_t\"', 'zzz_AsmCodeRange_.*', '.*DebugAbort.*', 'mozilla::ipc::MessageChannel::~MessageChannel.*', ])""", from_string_converter=eval) required_config.add_option( 'signatures_with_line_numbers_re', doc='any signatures that match this list should be combined with their ' 'associated source code line numbers', default='js_Interpret') #-------------------------------------------------------------------------- def __init__(self, config, quit_check_callback=None): super(CSignatureTool, self).__init__(config, quit_check_callback) self.irrelevant_signature_re = \ re.compile(self.config.irrelevant_signature_re) self.prefix_signature_re = \ re.compile(self.config.prefix_signature_re) self.signatures_with_line_numbers_re = \ re.compile(self.config.signatures_with_line_numbers_re) self.signature_sentinels = config.signature_sentinels
def test_processed_crash_storage(self): n = Namespace() n.add_option( 'storage', default=PrimaryDeferredProcessedStorage, ) n.add_option( 'logger', default=mock.Mock(), ) value = { 'primary.storage_class': 'socorro.unittest.external.test_crashstorage_base.A', 'deferred.storage_class': 'socorro.unittest.external.test_crashstorage_base.B', 'processed.storage_class': 'socorro.unittest.external.test_crashstorage_base.B', 'deferral_criteria': lambda x: x.get('foo') == 'foo' } cm = ConfigurationManager(n, values_source_list=[value]) with cm.context() as config: self.assertEqual(config.primary.storage_class.foo, 'a') self.assertEqual(config.deferred.storage_class.foo, 'b') self.assertEqual(config.processed.storage_class.foo, 'b') raw_crash = {'ooid': ''} crash_id = '1498dee9-9a45-45cc-8ec8-71bb62121203' dump = '12345' deferred_crash = {'ooid': '', 'foo': 'foo'} processed_crash = {'ooid': '', 'product': 17} pd_store = config.storage(config) # save_raw tests pd_store.primary_store.save_raw_crash = Mock() pd_store.deferred_store.save_raw_crash = Mock() pd_store.processed_store.save_raw_crash = Mock() pd_store.save_raw_crash(raw_crash, dump, crash_id) pd_store.primary_store.save_raw_crash.assert_called_with( raw_crash, dump, crash_id) self.assertEqual(pd_store.deferred_store.save_raw_crash.call_count, 0) pd_store.save_raw_crash(deferred_crash, dump, crash_id) pd_store.deferred_store.save_raw_crash.assert_called_with( deferred_crash, dump, crash_id) # save_processed tests pd_store.primary_store.save_processed = Mock() pd_store.deferred_store.save_processed = Mock() pd_store.processed_store.save_processed = Mock() pd_store.save_processed(processed_crash) pd_store.processed_store.save_processed.assert_called_with( processed_crash) self.assertEqual(pd_store.primary_store.save_processed.call_count, 0) pd_store.save_processed(deferred_crash) pd_store.processed_store.save_processed.assert_called_with( deferred_crash) # close tests pd_store.primary_store.close = Mock() pd_store.deferred_store.close = Mock() pd_store.close() pd_store.primary_store.close.assert_called_with() pd_store.deferred_store.close.assert_called_with() pd_store.primary_store.close = Mock() pd_store.deferred_store.close = Mock() pd_store.deferred_store.close.side_effect = NotImplementedError() pd_store.close() pd_store.primary_store.close.assert_called_with() pd_store.deferred_store.close.assert_called_with() pd_store.primary_store.close = Mock() pd_store.primary_store.close.side_effect = Exception('!') pd_store.close() pd_store.primary_store.close.assert_called_with() pd_store.deferred_store.close.assert_called_with() pd_store.deferred_store.close = Mock() pd_store.deferred_store.close.side_effect = Exception('!') self.assertRaises(PolyStorageError, pd_store.close) pd_store.primary_store.close.assert_called_with() pd_store.deferred_store.close.assert_called_with()
class ElasticSearchCrashStorage(CrashStorageBase): """This class sends processed crash reports to elasticsearch. It handles indices creation and type mapping. It cannot store raw dumps or raw crash reports as Socorro doesn't need those in elasticsearch at the moment. """ required_config = Namespace() required_config.add_option( 'transaction_executor_class', default="socorro.database.transaction_executor." "TransactionExecutorWithLimitedBackoff", doc='a class that will manage transactions', from_string_converter=class_converter, reference_value_from='resource.elasticsearch', ) required_config.add_option( 'elasticsearch_class', default='socorro.external.elasticsearch.connection_context.' 'ConnectionContext', from_string_converter=class_converter, reference_value_from='resource.elasticsearch', ) required_config.add_option( 'elasticsearch_base_settings', default='%s/mappings/socorro_index_settings.json' % DIRECTORY, doc='the file containing the mapping of the indexes receiving ' 'crash reports', reference_value_from='resource.elasticsearch', ) required_config.add_option( 'elasticsearch_emails_index_settings', default='%s/mappings/socorro_emails_index_settings.json' % DIRECTORY, doc='the file containing the mapping of the indexes receiving ' 'email addresses for the automatic-emails cron job', reference_value_from='resource.elasticsearch', ) required_config.add_option( 'elasticsearch_emails_index', default='socorro_emails', doc='the index that handles data about email addresses for ' 'the automatic-emails cron job', reference_value_from='resource.elasticsearch', ) required_config.add_option( 'use_mapping_file', default=True, doc='load the mapping from a file if true, load it from the database ' 'otherwise', reference_value_from='resource.elasticsearch', ) operational_exceptions = (pyelasticsearch.exceptions.ConnectionError, pyelasticsearch.exceptions.Timeout) conditional_exceptions = () indices_cache = set() #-------------------------------------------------------------------------- def __init__(self, config, quit_check_callback=None): super(ElasticSearchCrashStorage, self).__init__(config, quit_check_callback) self.transaction = config.transaction_executor_class( config, self, quit_check_callback) if self.config.elasticsearch_urls: self.es = pyelasticsearch.ElasticSearch( self.config.elasticsearch_urls, timeout=self.config.elasticsearch_timeout) else: config.logger.warning('elasticsearch crash storage is disabled.') #-------------------------------------------------------------------------- def save_processed(self, processed_crash): crash_id = processed_crash['uuid'] crash_document = { 'crash_id': crash_id, 'processed_crash': processed_crash, 'raw_crash': None } try: # Why is the function specified as unbound? The elastic search # crashstorage class serves as its own connection context object. # In otherwords, it has no actual connection class. The # transaction executor passes a connection object as the first # paremeter to the function that it calls. That means that it will # be passing the ElasticSearchCrashStorage instance as the self # parameter. A bound function would already have that input # parameter and thus an exception would be raised. By using an # unbound function, we avoid this problem. self.transaction(self.__class__._submit_crash_to_elasticsearch, crash_id, crash_document) except KeyError, x: if x == 'uuid': raise CrashIDNotFound raise
class ESBulkClassTemplate(base_class): required_config = Namespace() required_config.add_option( 'items_per_bulk_load', default=500, doc="the number of crashes that triggers a flush to ES" ) required_config.add_option( 'maximum_queue_size', default=512, doc='the maximum size of the internal queue' ) #---------------------------------------------------------------------- def __init__(self, config, quit_check_callback=None): super(ESBulkClassTemplate, self).__init__( config, quit_check_callback ) self.task_queue = QueueWrapper(config.maximum_queue_size) self.consuming_thread = Thread( name="ConsumingThread", target=self._consuming_thread_func ) # overwrites original self.transaction = config.transaction_executor_class( config, QueueContextSource(self.task_queue), quit_check_callback ) self.done = False self.consuming_thread.start() #---------------------------------------------------------------------- def _submit_crash_to_elasticsearch(self, queue, crash_document): # Massage the crash such that the date_processed field is formatted # in the fashion of our established mapping. # First create a datetime object from the string in the crash # report. self.reconstitute_datetimes(crash_document['processed_crash']) # Obtain the index name. es_index = self.get_index_for_crash( crash_document['processed_crash']['date_processed'] ) es_doctype = self.config.elasticsearch.elasticsearch_doctype crash_id = crash_document['crash_id'] # Attempt to create the index; it's OK if it already exists. if es_index not in self.indices_cache: index_creator = self.config.index_creator_class( config=self.config ) index_creator.create_socorro_index(es_index) action = { '_index': es_index, '_type': es_doctype, '_id': crash_id, '_source': crash_document, } queue.put(action) #---------------------------------------------------------------------- def _consumer_iter(self): while True: try: crash_document = self.task_queue.get() except Exception: self.config.logger.critical( "Failure in ES Bulktask_queue", exc_info=True ) crash_document = None if crash_document is None: self.done = True break yield crash_document # execute the task #---------------------------------------------------------------------- def close(self): self.task_queue.put(None) self.consuming_thread.join() #---------------------------------------------------------------------- def _consuming_thread_func(self): # execute the bulk load with self.es_context() as es: try: elasticsearch.helpers.bulk( es, self._consumer_iter(), chunk_size=self.config.items_per_bulk_load ) except Exception: self.config.logger.critical( "Failure in ES elasticsearch.helpers.bulk", exc_info=True )
class FetchADIFromHiveCronApp(BaseCronApp): """ This cron is our daily blocklist ping web logs query that rolls up all the browser checkins and let's us know how many browsers we think were active on the internet for a particular day """ app_name = 'fetch-adi-from-hive' app_description = 'Fetch ADI From Hive App' app_version = '0.1' required_config = Namespace() required_config.add_option( 'query', default=_QUERY, doc='Hive query for fetching ADI data') required_config.add_option( 'hive_host', default='localhost', doc='Hostname to run Hive query on') required_config.add_option( 'hive_port', default=10000, doc='Port to run Hive query on') required_config.add_option( 'hive_user', default='socorro', doc='User to connect to Hive with') required_config.add_option( 'hive_password', default='ignored', doc='Password to connect to Hive with') required_config.add_option( 'hive_database', default='default', doc='Database name to connect to Hive with') required_config.add_option( 'hive_auth_mechanism', default='PLAIN', doc='Auth mechanism for Hive') required_config.add_option( 'timeout', default=30 * 60, # 30 minutes doc='number of seconds to wait before timing out') @staticmethod def remove_control_characters(s): if isinstance(s, str): s = unicode(s, 'utf-8', errors='replace') return ''.join(c for c in s if unicodedata.category(c)[0] != "C") def run(self, connection, date): target_date = (date - datetime.timedelta(days=1)).strftime('%Y-%m-%d') raw_adi_logs_pathname = os.path.join( tempfile.gettempdir(), "%s.raw_adi_logs.TEMPORARY%s" % ( target_date, '.txt' ) ) try: with codecs.open(raw_adi_logs_pathname, 'w', 'utf-8') as f: hive = pyhs2.connect( host=self.config.hive_host, port=self.config.hive_port, authMechanism=self.config.hive_auth_mechanism, user=self.config.hive_user, password=self.config.hive_password, database=self.config.hive_database, # the underlying TSocket setTimeout() wants milliseconds timeout=self.config.timeout * 1000 ) cur = hive.cursor() query = self.config.query % target_date cur.execute(query) for row in cur: if None in row: continue f.write( "\t" .join( self.remove_control_characters( urllib2.unquote(v) ).replace('\\', '\\\\') if isinstance(v, basestring) else str(v) for v in row ) ) f.write("\n") with codecs.open(raw_adi_logs_pathname, 'r', 'utf-8') as f: pgcursor = connection.cursor() pgcursor.copy_from( f, 'raw_adi_logs', null='None', columns=[ 'report_date', 'product_name', 'product_os_platform', 'product_os_version', 'product_version', 'build', 'build_channel', 'product_guid', 'count' ] ) pgcursor.execute(_RAW_ADI_QUERY, (target_date,)) finally: if os.path.isfile(raw_adi_logs_pathname): os.remove(raw_adi_logs_pathname)
class BetaVersionRule(Rule):
    required_config = Namespace()
    required_config.add_option(
        'database_class',
        doc="the class of the database",
        default='socorro.external.postgresql.connection_context.'
                'ConnectionContext',
        from_string_converter=str_to_python_object,
        reference_value_from='resource.postgresql',
    )
    required_config.add_option(
        'transaction_executor_class',
        default="socorro.database.transaction_executor."
                "TransactionExecutorWithInfiniteBackoff",
        doc='a class that will manage transactions',
        from_string_converter=str_to_python_object,
        reference_value_from='resource.postgresql',
    )

    def __init__(self, config):
        super(BetaVersionRule, self).__init__(config)
        database = config.database_class(config)
        self.transaction = config.transaction_executor_class(
            config,
            database,
        )
        self._versions_data_cache = {}

    def version(self):
        return '1.0'

    def _get_version_data(self, product, version, build_id):
        """Return the real version number of a specific product, version and
        build.

        For example, beta builds of Firefox declare their version number as
        the major version (i.e. version 54.0b3 would say its version is 54.0).
        This database call returns the actual version number of said build
        (i.e. 54.0b3 for the previous example).
        """
        key = '%s:%s:%s' % (product, version, build_id)
        if key in self._versions_data_cache:
            return self._versions_data_cache[key]

        sql = """
            SELECT
                pv.version_string
            FROM product_versions pv
                LEFT JOIN product_version_builds pvb ON
                    (pv.product_version_id = pvb.product_version_id)
            WHERE pv.product_name = %(product)s
            AND pv.release_version = %(version)s
            AND pvb.build_id = %(build_id)s
        """
        params = {
            'product': product,
            'version': version,
            'build_id': build_id,
        }
        results = self.transaction(
            execute_query_fetchall,
            sql,
            params
        )
        for real_version, in results:
            self._versions_data_cache[key] = real_version

        return self._versions_data_cache.get(key)

    def _predicate(self, raw_crash, raw_dumps, processed_crash, proc_meta):
        try:
            # We apply this Rule only if the release channel is beta, because
            # beta versions are the only ones sending an "incorrect" version
            # number in their data.
            #
            # 2017-06-14: Ohai! This is not true anymore! With the removal of
            # the aurora channel, there is now a new type of build called
            # "DevEdition", that is released on the aurora channel, but has
            # the same version naming logic as builds on the beta channel.
            # We thus want to apply the same logic to aurora builds
            # as well now. Note that older crash reports won't be affected,
            # because they have a "correct" version number, usually containing
            # the letter 'a' (like '50.0a2').
            return processed_crash['release_channel'].lower() in (
                'beta', 'aurora'
            )
        except KeyError:
            # No release_channel.
            return False

    def _action(self, raw_crash, raw_dumps, processed_crash, processor_meta):
        try:
            # Sanitize the build id to avoid errors during the SQL query.
            try:
                build_id = int(processed_crash['build'])
            except ValueError:
                build_id = None

            real_version = self._get_version_data(
                processed_crash['product'],
                processed_crash['version'],
                build_id,
            )

            if real_version:
                processed_crash['version'] = real_version
            else:
                # This is a beta version but we do not have data about it. It
                # could be because we don't have it yet (if the cron jobs are
                # running late for example), so we mark this crash. This way,
                # we can reprocess it later to give it the correct version.
                processed_crash['version'] += 'b0'
                processor_meta.processor_notes.append(
                    'release channel is %s but no version data was found '
                    '- added "b0" suffix to version number' % (
                        processed_crash['release_channel'],
                    )
                )
        except KeyError:
            return False

        return True
def test_programming_error_with_postgres_with_backoff_with_rollback(self): required_config = Namespace() required_config.add_option( 'transaction_executor_class', default=TransactionExecutorWithInfiniteBackoff, doc='a class that will execute transactions') required_config.add_option('database_class', default=MockConnectionContext, from_string_converter=class_converter) mock_logging = MockLogging() required_config.add_option('logger', default=mock_logging) config_manager = ConfigurationManager( [required_config], app_name='testapp', app_version='1.0', app_description='app description', values_source_list=[{ 'backoff_delays': [2, 4, 6, 10, 15] }], argv_source=[]) with config_manager.context() as config: mocked_context = config.database_class(config) executor = config.transaction_executor_class( config, mocked_context) _function_calls = [] # some mutable _sleep_count = [] def mock_function_struggling(connection): assert isinstance(connection, MockConnection) connection.transaction_status = psycopg2.extensions.TRANSACTION_STATUS_INTRANS _function_calls.append(connection) # the default sleep times are going to be, # 2, 4, 6, 10, 15 # so after 2 + 4 + 6 + 10 + 15 seconds # all will be exhausted if sum(_sleep_count) < sum([2, 4, 6, 10, 15]): raise psycopg2.ProgrammingError( 'SSL SYSCALL error: EOF detected') def mock_sleep(n): _sleep_count.append(n) # monkey patch the sleep function from inside transaction_executor _orig_sleep = socorro.database.transaction_executor.time.sleep socorro.database.transaction_executor.time.sleep = mock_sleep try: executor(mock_function_struggling) assert _function_calls assert commit_count == 1 assert rollback_count == 5 assert mock_logging.criticals assert len(mock_logging.criticals) == 5 assert len(_sleep_count) > 10 finally: socorro.database.transaction_executor.time.sleep = _orig_sleep # this time, simulate an actual code bug where a callable function # raises a ProgrammingError() exception by, for example, a syntax error with config_manager.context() as config: mocked_context = config.database_class(config) executor = config.transaction_executor_class( config, mocked_context) def mock_function_developer_mistake(connection): assert isinstance(connection, MockConnection) connection.transaction_status = psycopg2.extensions.TRANSACTION_STATUS_INTRANS raise psycopg2.ProgrammingError("syntax error") with pytest.raises(psycopg2.ProgrammingError): executor(mock_function_developer_mistake)
class ConnectionContext(RequiredConfig): """Postgres Connection Context""" required_config = Namespace() required_config.add_option( name='database_hostname', default=get_field_from_pg_database_url('hostname', 'localhost'), doc='the hostname of the database', reference_value_from='resource.postgresql', ) required_config.add_option( name='database_name', default=get_field_from_pg_database_url('path', ' breakpad')[1:], doc='the name of the database', reference_value_from='resource.postgresql', ) required_config.add_option( name='database_port', default=get_field_from_pg_database_url('port', 5432), doc='the port for the database', reference_value_from='resource.postgresql', ) required_config.add_option( name='database_username', default=get_field_from_pg_database_url('username', 'breakpad_rw'), doc='the name of the user within the database', reference_value_from='secrets.postgresql', ) required_config.add_option( name='database_password', default=get_field_from_pg_database_url('password', 'aPassword'), doc="the user's database password", reference_value_from='secrets.postgresql', secret=True, ) RETRYABLE_EXCEPTIONS = (psycopg2.InterfaceError, socket.timeout) def __init__(self, config, local_config=None): """Initialize the parts needed to start making database connections parameters: config - the complete config for the app. If a real app, this would be where a logger or other resources could be found. local_config - this is the namespace within the complete config where the actual database parameters are found """ super().__init__() self.config = config self.logger = logging.getLogger(__name__ + '.' + self.__class__.__name__) if local_config is None: local_config = config if local_config['database_port'] is None: local_config['database_port'] = 5432 self.dsn = ("host=%(database_hostname)s " "dbname=%(database_name)s " "port=%(database_port)s " "user=%(database_username)s " "password=%(database_password)s") % local_config def connection(self, name_unused=None): return psycopg2.connect(self.dsn) @contextlib.contextmanager def __call__(self, name=None): """returns a database connection wrapped in a contextmanager. The context manager will assure that the connection is closed but will not try to commit or rollback lingering transactions. parameters: name - an optional name for the database connection """ conn = self.connection(name) try: yield conn finally: self.close_connection(conn) def close_connection(self, connection, force=False): """close the connection passed in. This function exists to allow derived classes to override the closing behavior. parameters: connection - the database connection object force - unused boolean to force closure; used in derived classes """ connection.close() def close(self): pass def is_retryable_exception(self, exc): """Return True if this is a retryable exception""" message = exc.args[0] if message in ('SSL SYSCALL error: EOF detected', ): # Ideally we'd like to check against exc.pgcode values # but certain odd ProgrammingError exceptions don't have # pgcodes so we have to rely on reading the pgerror :( return True if isinstance( exc, psycopg2.OperationalError) and message != 'out of memory': return True return isinstance(exc, self.RETRYABLE_EXCEPTIONS) def force_reconnect(self): pass
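# Illustration only (values are made up): how the DSN string above is filled
# in from the local_config mapping before being handed to psycopg2.connect().
local_config_example = {
    'database_hostname': 'localhost',
    'database_name': 'breakpad',
    'database_port': 5432,
    'database_username': 'breakpad_rw',
    'database_password': 'aPassword',
}
dsn_example = ("host=%(database_hostname)s "
               "dbname=%(database_name)s "
               "port=%(database_port)s "
               "user=%(database_username)s "
               "password=%(database_password)s") % local_config_example
# -> "host=localhost dbname=breakpad port=5432 user=breakpad_rw password=aPassword"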
class FileSystemThrottledCrashStorage(FileSystemRawCrashStorage): """This variant of file system storage segregates crashes based on the result of Collector throttling. When the collector receives a crash, it applies throttle rules and saves the result in the crash json under the key 'legacy_processing'. Only crashes that have a value of 0 in that field will eventually make it on to processing. legacy_processing == 0 : crashes stored in the filesystem rooted at 'std_fs_root' (standard file system storage) defined in the parent class legacy_processing == 1 : crashes stored in the filesystem rooted at 'def_fs_root' (deferred file system storage) defined in this class This class only implements raw crash storage and is not appropriate for storing processed crashes.""" required_config = Namespace() required_config.add_option( 'def_fs_root', doc='a path to a local file system', default='./deferredCrashStore', reference_value_from='resource.filesystem', ) #-------------------------------------------------------------------------- def __init__(self, config, quit_check_callback=None): super(FileSystemThrottledCrashStorage, self).__init__(config) self.def_crash_store = JsonDumpStorage( root=config.def_fs_root, maxDirectoryEntries=config.dump_dir_count, jsonSuffix=config.json_file_suffix, dumpSuffix=config.dump_file_suffix, dumpGID=config.dump_gid, dumpPermissions=config.dump_permissions, dirPermissions=config.dir_permissions, logger=config.logger) self._crash_store_tuple = (self.std_crash_store, self.def_crash_store) #-------------------------------------------------------------------------- def save_raw_crash(self, raw_crash, dump, crash_id): """save the raw crash and the dump in the appropriate file system based on the value of 'legacy_processing' within the raw_crash itself""" try: if raw_crash['legacy_processing'] == ACCEPT: self._do_save_raw(self.std_crash_store, raw_crash, dump, crash_id) else: self._do_save_raw(self.def_crash_store, raw_crash, dump, crash_id) except KeyError: # if 'legacy_processing' is missing, then it is assumed that this # crash should be processed.
Therefore save it into standard # storage self._do_save_raw(self.std_crash_store, raw_crash, dump, crash_id) #-------------------------------------------------------------------------- def get_raw_crash(self, crash_id): """fetch the raw_crash trying each file system in turn""" for a_crash_store in self._crash_store_tuple: try: pathname = a_crash_store.getJson(crash_id) return self._load_raw_crash_from_file(pathname) except OSError: # only raise the exception if we've got no more file systems # to look through if a_crash_store is self._crash_store_tuple[-1]: raise CrashIDNotFound(crash_id) #-------------------------------------------------------------------------- def get_raw_dump(self, crash_id, dump_name=None): """fetch the dump trying each file system in turn""" for a_crash_store in self._crash_store_tuple: try: job_pathname = a_crash_store.getDump(crash_id, dump_name) with open(job_pathname) as dump_file: dump = dump_file.read() return dump except OSError: # only raise the exception if we've got no more file systems # to look through if a_crash_store is self._crash_store_tuple[-1]: raise CrashIDNotFound(crash_id) #-------------------------------------------------------------------------- def get_raw_dumps(self, crash_id): """fetch the dump trying each file system in turn""" for a_crash_store in self._crash_store_tuple: try: return self._do_get_raw_dumps(crash_id, a_crash_store) except CrashIDNotFound: pass # try the next crash store raise CrashIDNotFound(crash_id) #-------------------------------------------------------------------------- def get_raw_dumps_as_files(self, crash_id): """fetch the dump trying each file system in turn""" for a_crash_store in self._crash_store_tuple: try: return a_crash_store.get_dumps(crash_id) except CrashIDNotFound: pass # try the next crash store raise CrashIDNotFound(crash_id) #-------------------------------------------------------------------------- def remove(self, crash_id): """try to remove the raw_crash and the dump from each """ for a_crash_store in self._crash_store_tuple: try: a_crash_store.remove(crash_id) # raises NoSuchUuidFound if # unsuccessful. return # break the loop as soon as we succeed except (NoSuchUuidFound, OSError): # only raise the exception if we've got no more file systems # to look through if a_crash_store is self._crash_store_tuple[-1]: raise CrashIDNotFound(crash_id)
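# A toy illustration (hypothetical helper, not part of the class above) of the
# routing rule described in the docstring: accepted crashes go to the standard
# store, throttled ones to the deferred store, and a crash missing the
# 'legacy_processing' key is treated as accepted. ACCEPT is assumed to be 0,
# matching the collector's "accept" throttle result.
ACCEPT = 0


def choose_crash_store(raw_crash, std_crash_store, def_crash_store):
    if raw_crash.get('legacy_processing', ACCEPT) == ACCEPT:
        return std_crash_store
    return def_crash_store


assert choose_crash_store({'legacy_processing': 0}, 'std', 'def') == 'std'
assert choose_crash_store({'legacy_processing': 1}, 'std', 'def') == 'def'
assert choose_crash_store({}, 'std', 'def') == 'std'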
class FileSystemRawCrashStorage(CrashStorageBase): """This crash storage class impements only the raw crash part of the api. Raw crashes (the json file and the binary dump) are stored in a file system. This class is appropriate for fast storage of crashes into a local file system. In 2011, a varient of this code base was adopted by the Socorro Collector for fast temporary storage as crashes came in.""" required_config = Namespace() required_config.add_option( 'std_fs_root', doc='a path to a local file system', default='./primaryCrashStore', reference_value_from='resource.filesystem', ) required_config.add_option( 'dump_dir_count', doc='the number of dumps to be stored in a single directory in the ' 'local file system', default=1024, reference_value_from='resource.filesystem', ) required_config.add_option( 'dump_gid', doc='the group ID for saved crashes in local file system (optional)', default='', reference_value_from='resource.filesystem', ) required_config.add_option( 'dump_permissions', doc='a number used for permissions crash dump files in the local ' 'file system', default=stat.S_IRGRP | stat.S_IWGRP | stat.S_IRUSR | stat.S_IWUSR, reference_value_from='resource.filesystem', ) required_config.add_option( 'dir_permissions', doc='a number used for permissions for directories in the local ' 'file system', default=(stat.S_IRGRP | stat.S_IXGRP | stat.S_IWGRP | stat.S_IRUSR | stat.S_IXUSR | stat.S_IWUSR), reference_value_from='resource.filesystem', ) required_config.add_option( 'json_file_suffix', doc='the suffix used to identify a json file', default='.json', reference_value_from='resource.filesystem', ) required_config.add_option( 'dump_file_suffix', doc='the suffix used to identify a dump file', default='.dump', reference_value_from='resource.filesystem', ) #-------------------------------------------------------------------------- def __init__(self, config, quit_check_callback=None): super(FileSystemRawCrashStorage, self).__init__(config) self.std_crash_store = JsonDumpStorage( root=config.std_fs_root, maxDirectoryEntries=config.dump_dir_count, jsonSuffix=config.json_file_suffix, dumpSuffix=config.dump_file_suffix, dumpGID=config.dump_gid, dumpPermissions=config.dump_permissions, dirPermissions=config.dir_permissions, logger=config.logger) self.hostname = os.uname()[1] #-------------------------------------------------------------------------- def _load_raw_crash_from_file(self, pathname): with open(pathname) as json_file: raw_crash = json.load(json_file, object_hook=DotDict) return raw_crash #-------------------------------------------------------------------------- def _do_save_raw(self, json_storage_system, raw_crash, dumps, crash_id): json_storage_system.new_entry(crash_id, raw_crash, dumps, self.hostname) #-------------------------------------------------------------------------- def save_raw_crash(self, raw_crash, dumps, crash_id): """forward the raw_crash and the dump to the underlying file system""" self._do_save_raw(self.std_crash_store, raw_crash, dumps, crash_id) def save_raw_and_processed(self, raw_crash, dumps, processed_crash, crash_id): """ bug 866973 - do not try to save dumps=None into the Filesystem We are doing this in lieu of a queuing solution that could allow us to operate an independent crashmover. When the queuing system is implemented, we could remove this, and have the raw crash saved by a crashmover that's consuming crash_ids the same way that the processor consumes them. 
Even though it is ok to resave the raw_crash in this case to the filesystem, the fs does not know what to do with a dumps=None when passed to save_raw, so we are going to avoid that. """ self.save_processed(processed_crash) #-------------------------------------------------------------------------- def get_raw_crash(self, crash_id): """fetch the raw crash from the underlying file system""" try: pathname = self.std_crash_store.getJson(crash_id) return self._load_raw_crash_from_file(pathname) except OSError: raise CrashIDNotFound(crash_id) except ValueError: # empty json file? return DotDict() #-------------------------------------------------------------------------- def get_raw_dump(self, crash_id, dump_name=None): """read the binary crash dump from the underlying file system by getting the pathname and then opening and reading the file.""" try: job_pathname = self.std_crash_store.getDump(crash_id, dump_name) with open(job_pathname) as dump_file: binary = dump_file.read() return binary except OSError: raise CrashIDNotFound(crash_id) #-------------------------------------------------------------------------- def _do_get_raw_dumps(self, crash_id, crash_store): try: dumpname_paths_map = crash_store.get_dumps(crash_id) dumpname_dump_map = {} for dump_name, dump_pathname in dumpname_paths_map.iteritems(): with open(dump_pathname, 'rb') as f: dumpname_dump_map[dump_name] = f.read() return dumpname_dump_map except OSError: raise CrashIDNotFound(crash_id) #-------------------------------------------------------------------------- def get_raw_dumps(self, crash_id): """read the all the binary crash dumps from the underlying file system by getting the pathnames and then opening and reading the files. returns a dict of dump names to binary dumps""" return self._do_get_raw_dumps(crash_id, self.std_crash_store) #-------------------------------------------------------------------------- def get_raw_dumps_as_files(self, crash_id): """read the all the binary crash dumps from the underlying file system by getting the pathnames and then opening and reading the files. returns a dict of dump names to binary dumps""" return self.std_crash_store.get_dumps(crash_id) #-------------------------------------------------------------------------- def new_crashes(self): """return an iterator that yields a list of crash_ids of raw crashes that were added to the file system since the last time this iterator was requested.""" # why is this called 'destructiveDateWalk'? The underlying code # that manages the filesystem uses a tree of radix date directories # and symbolic links to track "new" raw crashes. As the the crash_ids # are fetched from the file system, the symbolic links are removed and # directories are deleted. Essentially, the state of what is # considered to be new is saved within the file system by those links. return self.std_crash_store.destructiveDateWalk() #-------------------------------------------------------------------------- def remove(self, crash_id): """delegate removal of a raw crash to the underlying filesystem""" try: self.std_crash_store.quickDelete(crash_id) except NoSuchUuidFound: raise CrashIDNotFound(crash_id)
class SubmitterApp(FetchTransformSaveWithSeparateNewCrashSourceApp): app_name = 'submitter_app' app_version = '3.1' app_description = __doc__ required_config = Namespace() required_config.namespace('submitter') required_config.submitter.add_option( 'delay', doc="pause between submission queuing in milliseconds", default='0', from_string_converter=lambda x: float(x) / 1000.0 ) required_config.submitter.add_option( 'dry_run', doc="don't actually submit, just print product/version from raw crash", short_form='D', default=False ) #-------------------------------------------------------------------------- @staticmethod def get_application_defaults(): return { "source.crashstorage_class": SubmitterFileSystemWalkerSource, "destination.crashstorage_class": 'socorro.collector.breakpad_submitter_utilities' '.BreakpadPOSTDestination', "number_of_submissions": "all", } #-------------------------------------------------------------------------- def _action_between_each_iteration(self): if self.config.submitter.delay: time.sleep(self.config.submitter.delay) #-------------------------------------------------------------------------- def _action_after_iteration_completes(self): self.config.logger.info( 'the queuing iterator is exhausted - waiting to quit' ) self.task_manager.wait_for_empty_queue( 5, "waiting for the queue to drain before quitting" ) time.sleep(self.config.producer_consumer.number_of_threads * 2) #-------------------------------------------------------------------------- def _filter_disallowed_values(self, current_value): """in this base class there are no disallowed values coming from the iterators. Other users of these iterator may have some standards and can detect and reject them here""" return current_value is None #-------------------------------------------------------------------------- def _transform(self, crash_id): """this transform function only transfers raw data from the source to the destination without changing the data.""" if self.config.submitter.dry_run: print crash_id else: raw_crash = self.source.get_raw_crash(crash_id) dumps = self.source.get_raw_dumps_as_files(crash_id) self.destination.save_raw_crash_with_file_dumps( raw_crash, dumps, crash_id )
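# Quick check (not part of the app) of the 'delay' option's converter above:
# the value is given in milliseconds as a string and stored in seconds.
to_seconds = lambda x: float(x) / 1000.0
assert to_seconds('0') == 0.0
assert to_seconds('250') == 0.25
assert to_seconds('1500') == 1.5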
class PostgreSQLCrashStorage(CrashStorageBase): """this implementation of crashstorage saves processed crashes to an instance of Postgresql. It only saves certain key values to the partitioned reports table, therefore it is not a source for fetching complete processed reports and doesn't not implement any of the 'get' methods.""" required_config = Namespace() required_config.add_option( 'transaction_executor_class', default="socorro.database.transaction_executor." "TransactionExecutorWithInfiniteBackoff", doc='a class that will manage transactions', from_string_converter=class_converter, reference_value_from='resource.postgresql', ) required_config.add_option( 'database_class', default=ConnectionContext, doc='the class responsible for connecting to Postgres', reference_value_from='resource.postgresql', ) _reports_table_mappings = ( # processed name, reports table name ("addons_checked", "addons_checked"), ("address", "address"), ("app_notes", "app_notes"), ("build", "build"), ("client_crash_date", "client_crash_date"), ("completeddatetime", "completed_datetime"), ("cpu_info", "cpu_info"), ("cpu_name", "cpu_name"), ("date_processed", "date_processed"), ("distributor", "distributor"), ("distributor_version", "distributor_version"), ("email", "email"), ("exploitability", "exploitability"), # ("flash_process_dump", "flash_process_dump"), # future ("flash_version", "flash_version"), ("hangid", "hangid"), ("install_age", "install_age"), ("last_crash", "last_crash"), ("os_name", "os_name"), ("os_version", "os_version"), ("processor_notes", "processor_notes"), ("process_type", "process_type"), ("product", "product"), ("productid", "productid"), ("reason", "reason"), ("release_channel", "release_channel"), ("signature", "signature"), ("startedDateTime", "started_datetime"), ("success", "success"), ("topmost_filenames", "topmost_filenames"), ("truncated", "truncated"), ("uptime", "uptime"), ("user_comments", "user_comments"), ("user_id", "user_id"), ("url", "url"), ("uuid", "uuid"), ("version", "version"), ) #-------------------------------------------------------------------------- def __init__(self, config, quit_check_callback=None): super(PostgreSQLCrashStorage, self).__init__(config, quit_check_callback=quit_check_callback) self.database = config.database_class(config) self.transaction = config.transaction_executor_class( config, self.database, quit_check_callback=quit_check_callback) #-------------------------------------------------------------------------- def save_raw_crash(self, raw_crash, dumps, crash_id): """nota bene: this function does not save the dumps in PG, only the raw crash json is saved.""" self.transaction(self._save_raw_crash_transaction, raw_crash, crash_id) #------------------------------------------------------------------------- def _save_raw_crash_transaction(self, connection, raw_crash, crash_id): raw_crash_table_name = ('raw_crashes_%s' % self._table_suffix_for_crash_id(crash_id)) upsert_sql = """ WITH update_raw_crash AS ( UPDATE %(table)s SET raw_crash = %%(raw_crash)s, date_processed = %%(date_processed)s WHERE uuid = %%(crash_id)s RETURNING 1 ), insert_raw_crash AS ( INSERT into %(table)s (uuid, raw_crash, date_processed) ( SELECT %%(crash_id)s as uuid, %%(raw_crash)s as raw_crash, %%(date_processed)s as date_processed WHERE NOT EXISTS ( SELECT uuid from %(table)s WHERE uuid = %%(crash_id)s LIMIT 1 ) ) RETURNING 2 ) SELECT * from update_raw_crash UNION ALL SELECT * from insert_raw_crash """ % { 'table': raw_crash_table_name } values = { 'crash_id': crash_id, 
'raw_crash': json.dumps(raw_crash), 'date_processed': raw_crash["submitted_timestamp"] } execute_no_results(connection, upsert_sql, values) #-------------------------------------------------------------------------- def get_raw_crash(self, crash_id): """the default implementation of fetching a raw_crash parameters: crash_id - the id of a raw crash to fetch""" return self.transaction(self._get_raw_crash_transaction, crash_id) #-------------------------------------------------------------------------- def _get_raw_crash_transaction(self, connection, crash_id): raw_crash_table_name = ('raw_crash_%s' % self._table_suffix_for_crash_id(crash_id)) fetch_sql = 'select raw_crash from %s where uuid = %%s' % \ raw_crash_table_name try: return single_value_sql(connection, fetch_sql, (crash_id, )) except SQLDidNotReturnSingleValue: raise CrashIDNotFound(crash_id) #-------------------------------------------------------------------------- def save_processed(self, processed_crash): self.transaction(self._save_processed_transaction, processed_crash) #-------------------------------------------------------------------------- def _save_processed_transaction(self, connection, processed_crash): report_id = self._save_processed_report(connection, processed_crash) self._save_plugins(connection, processed_crash, report_id) self._save_extensions(connection, processed_crash, report_id) self._save_processed_crash(connection, processed_crash) def _save_processed_crash(self, connection, processed_crash): crash_id = processed_crash['uuid'] processed_crashes_table_name = ( 'processed_crashes_%s' % self._table_suffix_for_crash_id(crash_id)) upsert_sql = """ WITH update_processed_crash AS ( UPDATE %(table)s SET processed_crash = %%(processed_json)s, date_processed = %%(date_processed)s WHERE uuid = %%(uuid)s RETURNING 1 ), insert_processed_crash AS ( INSERT INTO %(table)s (uuid, processed_crash, date_processed) ( SELECT %%(uuid)s as uuid, %%(processed_json)s as processed_crash, %%(date_processed)s as date_processed WHERE NOT EXISTS ( SELECT uuid from %(table)s WHERE uuid = %%(uuid)s LIMIT 1 ) ) RETURNING 2 ) SELECT * from update_processed_crash UNION ALL SELECT * from insert_processed_crash """ % { 'table': processed_crashes_table_name, 'uuid': crash_id } values = { 'processed_json': json.dumps(processed_crash, cls=JsonDTEncoder), 'date_processed': processed_crash["date_processed"], 'uuid': crash_id } execute_no_results(connection, upsert_sql, values) #-------------------------------------------------------------------------- def _save_processed_report(self, connection, processed_crash): """ Here we INSERT or UPDATE a row in the reports table. This is the first stop before imported data gets into our normalized batch reporting (next table: reports_clean). At some point in the future, we will switch to using the raw_crash table and JSON transforms instead. This work will require an overhaul and optimization of the update_reports_clean() and update_reports_duplicates() stored procedures. We perform an UPSERT using a PostgreSQL CTE (aka WITH clause) that first tests whether a row exists and performs an UPDATE if it can, or it performs an INSERT. Because we're using raw SQL in this function, we've got a substantial parameterized query that requires two sets of parameters to be passed in via value_list. The value_list ends up having an extra crash_id added to the list, and being doubled before being passed to single_value_sql(). The SQL produced isn't beautiful, but a side effect of the CTE style of UPSERT-ing. 
We look forward to SQL UPSERT being adopted as a first-class citizen in PostgreSQL. Similar code is present for _save_raw_crash() and _save_processed_crash(), but is much simpler seeming because there are far fewer columns being passed into the parameterized query. """ column_list = [] placeholder_list = [] value_list = [] for pro_crash_name, report_name in self._reports_table_mappings: column_list.append(report_name) placeholder_list.append('%s') value_list.append(processed_crash[pro_crash_name]) def print_eq(a, b): # Helper for UPDATE SQL clause return a + ' = ' + b def print_as(a, b): # Helper for INSERT SQL clause return b + ' as ' + a crash_id = processed_crash['uuid'] reports_table_name = ('reports_%s' % self._table_suffix_for_crash_id(crash_id)) upsert_sql = """ WITH update_report AS ( UPDATE %(table)s SET %(joined_update_clause)s WHERE uuid = %%s RETURNING id ), insert_report AS ( INSERT INTO %(table)s (%(column_list)s) ( SELECT %(joined_select_clause)s WHERE NOT EXISTS ( SELECT uuid from %(table)s WHERE uuid = %%s LIMIT 1 ) ) RETURNING id ) SELECT * from update_report UNION ALL SELECT * from insert_report """ % { 'joined_update_clause': ", ".join(map(print_eq, column_list, placeholder_list)), 'table': reports_table_name, 'column_list': ', '.join(column_list), 'joined_select_clause': ", ".join(map(print_as, column_list, placeholder_list)), } value_list.append(crash_id) value_list.extend(value_list) report_id = single_value_sql(connection, upsert_sql, value_list) return report_id #-------------------------------------------------------------------------- def _save_plugins(self, connection, processed_crash, report_id): """ Electrolysis Support - Optional - processed_crash may contain a ProcessType of plugin. In the future this value would be default, content, maybe even Jetpack... This indicates which process was the crashing process. plugin - When set to plugin, the jsonDocument MUST calso contain PluginFilename, PluginName, and PluginVersion """ process_type = processed_crash['process_type'] if not process_type: return if process_type == "plugin": # Bug#543776 We actually will are relaxing the non-null policy... # a null filename, name, and version is OK. We'll use empty strings try: plugin_filename = processed_crash['PluginFilename'] plugin_name = processed_crash['PluginName'] plugin_version = processed_crash['PluginVersion'] except KeyError, x: self.config.logger.error( 'the crash is missing a required field: %s', str(x)) return find_plugin_sql = ('select id from plugins ' 'where filename = %s ' 'and name = %s') try: plugin_id = single_value_sql(connection, find_plugin_sql, (plugin_filename, plugin_name)) except SQLDidNotReturnSingleValue: insert_plugsins_sql = ("insert into plugins (filename, name) " "values (%s, %s) returning id") plugin_id = single_value_sql(connection, insert_plugsins_sql, (plugin_filename, plugin_name)) crash_id = processed_crash['uuid'] table_suffix = self._table_suffix_for_crash_id(crash_id) plugin_reports_table_name = 'plugins_reports_%s' % table_suffix plugins_reports_insert_sql = ( 'insert into %s ' ' (report_id, plugin_id, date_processed, version) ' 'values ' ' (%%s, %%s, %%s, %%s)' % plugin_reports_table_name) values_tuple = (report_id, plugin_id, processed_crash['date_processed'], plugin_version) execute_no_results(connection, plugins_reports_insert_sql, values_tuple)
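# A reduced illustration (made-up table and columns) of the CTE-style UPSERT
# described in the docstring above, showing why value_list gets the crash_id
# appended and is then doubled: the UPDATE branch and the INSERT branch each
# consume one full copy of the parameters.
columns = ['product', 'version']
placeholders = ['%s', '%s']
update_clause = ', '.join(c + ' = ' + p for c, p in zip(columns, placeholders))
select_clause = ', '.join(p + ' as ' + c for c, p in zip(columns, placeholders))
upsert_sql = """
    WITH update_report AS (
        UPDATE reports_example SET %(update)s WHERE uuid = %%s RETURNING id
    ),
    insert_report AS (
        INSERT INTO reports_example (%(columns)s)
        ( SELECT %(select)s
          WHERE NOT EXISTS (
              SELECT uuid FROM reports_example WHERE uuid = %%s LIMIT 1
          )
        )
        RETURNING id
    )
    SELECT * FROM update_report
    UNION ALL
    SELECT * FROM insert_report
""" % {
    'update': update_clause,
    'columns': ', '.join(columns),
    'select': select_clause,
}
values = ['Firefox', '54.0b3']   # one copy of the column values ...
values.append('a-crash-id')      # ... plus the uuid for the UPDATE's WHERE clause
values.extend(values)            # ... then the whole list again for the INSERT branch
# upsert_sql now contains six %s placeholders, matching len(values) == 6.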
class MissingSymbolsRule(Rule): required_config = Namespace() required_config.add_option( 'database_class', doc="the class of the database", default='socorro.external.postgresql.connection_context.' 'ConnectionContext', from_string_converter=str_to_python_object, reference_value_from='resource.postgresql', ) required_config.add_option( 'transaction_executor_class', default="socorro.database.transaction_executor." "TransactionExecutorWithInfiniteBackoff", doc='a class that will manage transactions', from_string_converter=str_to_python_object, reference_value_from='resource.postgresql', ) def __init__(self, config): super(MissingSymbolsRule, self).__init__(config) self.database = self.config.database_class(config) self.transaction = self.config.transaction_executor_class( config, self.database, ) self.sql = ( "INSERT INTO missing_symbols_%s" " (date_processed, debug_file, debug_id, code_file, code_id)" " VALUES (%%s, %%s, %%s, %%s, %%s)") def version(self): return '1.0' def _action(self, raw_crash, raw_dumps, processed_crash, processor_meta): try: date = processed_crash['date_processed'] # update partition information based on date processed sql = self.sql % datestring_to_weekly_partition(date) for module in processed_crash['json_dump']['modules']: try: # First of all, only bother if there are # missing_symbols in this module. # And because it's not useful if either of debug_file # or debug_id are empty, we filter on that here too. if (module['missing_symbols'] and module['debug_file'] and module['debug_id']): self.transaction( execute_no_results, sql, ( date, module['debug_file'], module['debug_id'], # These two use .get() because the keys # were added later in history. If it's # non-existent (or existant and None), it # will proceed and insert as a nullable. module.get('filename'), module.get('code_id'), )) except self.database.ProgrammingError: processor_meta.processor_notes.append( "WARNING: missing symbols rule failed for" " %s" % raw_crash.uuid) except KeyError: pass except KeyError: return False return True
class OutOfDateClassifier(SupportClassificationRule): """To satisfy Bug 956879, this rule will detect classify crashes as out of date if the version is less than the threshold 'firefox_out_of_date_version' found in the processor configuration""" required_config = Namespace() required_config.add_option( 'firefox_out_of_date_version', doc='the version of Firefox that is considered to be old enough ' 'to warrant a warning to the user', default='17', ) def version(self): return '1.0' def _predicate(self, raw_crash, raw_dumps, processed_crash, processor): try: return (raw_crash.ProductName == 'Firefox' and normalize(raw_crash.Version) < self.out_of_date_threshold) except AttributeError: try: self.out_of_date_threshold = normalize( self.config.firefox_out_of_date_version) except (AttributeError, KeyError): self.out_of_date_threshold = normalize( processor.config.firefox_out_of_date_version) return self._predicate(raw_crash, raw_dumps, processed_crash, processor) @staticmethod def _normalize_windows_version(version_str): ver_list = version_str.split('.')[:2] def as_int(x): try: return int(x) except ValueError: return maxint # get the first integer out of the last last token ver_list[-1] = ver_list[-1].split(' ')[0] ver_list_normalized = [as_int(x) for x in ver_list] if "Service" in version_str: try: # assume last space delimited field is service pack number ver_list_normalized.append(int(version_str.split(' ')[-1])) except ValueError: # appears to have been a bad assumption ver_list_normalized.append(0) return tuple(ver_list_normalized) def _windows_action(self, raw_crash, raw_dumps, processed_crash, processor): win_version_normalized = self._normalize_windows_version( processed_crash["json_dump"]["system_info"]["os_ver"]) if win_version_normalized[:2] == (5, 0): # Win2K return self._add_classification( processed_crash, 'firefox-no-longer-works-windows-2000', None, processor.config.logger) elif win_version_normalized < (5, 1, 3): # WinXP SP2 return self._add_classification( processed_crash, 'firefox-no-longer-works-some-versions-windows-xp', None, processor.config.logger) return self._add_classification(processed_crash, 'update-firefox-latest-version', None, processor.config.logger) @staticmethod def _normalize_osx_version(version_str): ver_list = version_str.split('.')[:2] def as_int(x): try: return int(x) except ValueError: return maxint return tuple(as_int(x) for x in ver_list) def _osx_action(self, raw_crash, raw_dumps, processed_crash, processor): osx_version_normalized = self._normalize_osx_version( processed_crash["json_dump"]["system_info"]["os_ver"]) if (osx_version_normalized <= (10, 4) or processed_crash["json_dump"]["system_info"]["cpu_arch"] == 'ppc'): return self._add_classification( processed_crash, 'firefox-no-longer-works-mac-os-10-4-or-powerpc', None, processor.config.logger) elif osx_version_normalized == (10, 5): return self._add_classification( processed_crash, 'firefox-no-longer-works-mac-os-x-10-5', None, processor.config.logger) return self._add_classification(processed_crash, 'update-firefox-latest-version', None, processor.config.logger) def _action(self, raw_crash, raw_dumps, processed_crash, processor): crashed_version = normalize(raw_crash.Version) if "Win" in processed_crash["json_dump"]["system_info"]['os']: return self._windows_action(raw_crash, raw_dumps, processed_crash, processor) elif processed_crash["json_dump"]["system_info"]['os'] == "Mac OS X": return self._osx_action(raw_crash, raw_dumps, processed_crash, processor) else: return 
self._add_classification(processed_crash, 'update-firefox-latest-version', None, processor.config.logger)
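# Worked examples (illustrative version strings) for the normalizers above;
# both are staticmethods, so they can be exercised directly.
assert OutOfDateClassifier._normalize_windows_version(
    "5.1.2600 Service Pack 3") == (5, 1, 3)   # Windows XP SP3
assert OutOfDateClassifier._normalize_windows_version("6.1.7601") == (6, 1)  # Windows 7
assert OutOfDateClassifier._normalize_osx_version("10.5.8") == (10, 5)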
class FlashVersionRule(Rule): required_config = Namespace() required_config.add_option( 'known_flash_identifiers', doc='A subset of the known "debug identifiers" for flash versions, ' 'associated to the version', default={ '7224164B5918E29AF52365AF3EAF7A500': '10.1.51.66', 'C6CDEFCDB58EFE5C6ECEF0C463C979F80': '10.1.51.66', '4EDBBD7016E8871A461CCABB7F1B16120': '10.1', 'D1AAAB5D417861E6A5B835B01D3039550': '10.0.45.2', 'EBD27FDBA9D9B3880550B2446902EC4A0': '10.0.45.2', '266780DB53C4AAC830AFF69306C5C0300': '10.0.42.34', 'C4D637F2C8494896FBD4B3EF0319EBAC0': '10.0.42.34', 'B19EE2363941C9582E040B99BB5E237A0': '10.0.32.18', '025105C956638D665850591768FB743D0': '10.0.32.18', '986682965B43DFA62E0A0DFFD7B7417F0': '10.0.23', '937DDCC422411E58EF6AD13710B0EF190': '10.0.23', '860692A215F054B7B9474B410ABEB5300': '10.0.22.87', '77CB5AC61C456B965D0B41361B3F6CEA0': '10.0.22.87', '38AEB67F6A0B43C6A341D7936603E84A0': '10.0.12.36', '776944FD51654CA2B59AB26A33D8F9B30': '10.0.12.36', '974873A0A6AD482F8F17A7C55F0A33390': '9.0.262.0', 'B482D3DFD57C23B5754966F42D4CBCB60': '9.0.262.0', '0B03252A5C303973E320CAA6127441F80': '9.0.260.0', 'AE71D92D2812430FA05238C52F7E20310': '9.0.246.0', '6761F4FA49B5F55833D66CAC0BBF8CB80': '9.0.246.0', '27CC04C9588E482A948FB5A87E22687B0': '9.0.159.0', '1C8715E734B31A2EACE3B0CFC1CF21EB0': '9.0.159.0', 'F43004FFC4944F26AF228334F2CDA80B0': '9.0.151.0', '890664D4EF567481ACFD2A21E9D2A2420': '9.0.151.0', '8355DCF076564B6784C517FD0ECCB2F20': '9.0.124.0', '51C00B72112812428EFA8F4A37F683A80': '9.0.124.0', '9FA57B6DC7FF4CFE9A518442325E91CB0': '9.0.115.0', '03D99C42D7475B46D77E64D4D5386D6D0': '9.0.115.0', '0CFAF1611A3C4AA382D26424D609F00B0': '9.0.47.0', '0F3262B5501A34B963E5DF3F0386C9910': '9.0.47.0', 'C5B5651B46B7612E118339D19A6E66360': '9.0.45.0', 'BF6B3B51ACB255B38FCD8AA5AEB9F1030': '9.0.28.0', '83CF4DC03621B778E931FC713889E8F10': '9.0.16.0', }, from_string_converter=ujson.loads) required_config.add_option( 'flash_re', doc='a regular expression to match Flash file names', default=(r'NPSWF32_?(.*)\.dll|' 'FlashPlayerPlugin_?(.*)\.exe|' 'libflashplayer(.*)\.(.*)|' 'Flash ?Player-?(.*)'), from_string_converter=re.compile) def version(self): return '1.0' def _get_flash_version(self, **kwargs): """If (we recognize this module as Flash and figure out a version): Returns version; else (None or '')""" filename = kwargs.get('filename', None) version = kwargs.get('version', None) debug_id = kwargs.get('debug_id', None) m = self.config.flash_re.match(filename) if m: if version: return version # we didn't get a version passed into us # try do deduce it groups = m.groups() if groups[0]: return groups[0].replace('_', '.') if groups[1]: return groups[1].replace('_', '.') if groups[2]: return groups[2] if groups[4]: return groups[4] return self.config.known_flash_identifiers.get(debug_id, None) return None def _action(self, raw_crash, raw_dumps, processed_crash, processor_meta): processed_crash.flash_version = '' flash_version = None modules = processed_crash.get('json_dump', {}).get('modules', []) if isinstance(modules, (tuple, list)): for index, a_module in enumerate(modules): flash_version = self._get_flash_version(**a_module) if flash_version: break if flash_version: processed_crash.flash_version = flash_version else: processed_crash.flash_version = '[blank]' return True
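# Quick illustration (made-up module filename) of how the default flash_re
# above yields a version when the module carries no version field: the first
# matching group is taken and underscores become dots.
import re

flash_re_example = re.compile(r'NPSWF32_?(.*)\.dll|'
                              r'FlashPlayerPlugin_?(.*)\.exe|'
                              r'libflashplayer(.*)\.(.*)|'
                              r'Flash ?Player-?(.*)')
match = flash_re_example.match('NPSWF32_11_2_202_235.dll')
assert match.groups()[0] == '11_2_202_235'
assert match.groups()[0].replace('_', '.') == '11.2.202.235'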
class FetchADIFromHiveCronApp(BaseCronApp): """ This cron is our daily blocklist ping web logs query that rolls up all the browser checkins and let's us know how many browsers we think were active on the internet for a particular day """ app_name = 'fetch-adi-from-hive' app_description = 'Fetch ADI From Hive App' app_version = '0.1' required_config = Namespace() required_config.add_option('query', default=_QUERY, doc='Hive query for fetching ADI data') required_config.add_option('hive_host', default='localhost', doc='Hostname to run Hive query on') required_config.add_option('hive_port', default=10000, doc='Port to run Hive query on') required_config.add_option('hive_user', default='socorro', doc='User to connect to Hive with') required_config.add_option('hive_password', default='ignored', doc='Password to connect to Hive with') required_config.add_option('hive_database', default='default', doc='Database name to connect to Hive with') required_config.add_option('hive_auth_mechanism', default='PLAIN', doc='Auth mechanism for Hive') def run(self, connection, date): target_date = (date - datetime.timedelta(days=1)).strftime('%Y-%m-%d') raw_adi_logs_pathname = os.path.join( tempfile.gettempdir(), "%s.raw_adi_logs.TEMPORARY%s" % (target_date, '.txt')) try: with open(raw_adi_logs_pathname, 'w') as f: hive = pyhs2.connect( host=self.config.hive_host, port=self.config.hive_port, authMechanism=self.config.hive_auth_mechanism, user=self.config.hive_user, password=self.config.hive_password, database=self.config.hive_database) cur = hive.cursor() query = self.config.query % target_date cur.execute(query) for row in cur: f.write("\t".join(str(v) for v in row)) f.write("\n") with open(raw_adi_logs_pathname, 'r') as f: pgcursor = connection.cursor() pgcursor.copy_from(f, 'raw_adi_logs', null='None', columns=[ 'report_date', 'product_name', 'product_os_platform', 'product_os_version', 'product_version', 'build', 'build_channel', 'product_guid', 'count' ]) finally: if os.path.isfile(raw_adi_logs_pathname): os.remove(raw_adi_logs_pathname)
class OutOfMemoryBinaryRule(Rule): required_config = Namespace() required_config.add_option( 'max_size_uncompressed', default=20 * 1024 * 1024, # 20 MB doc=("the maximum size, in bytes, that we accept for an " "uncompressed memory info (JSON) payload")) def version(self): return '1.0' def _predicate(self, raw_crash, raw_dumps, processed_crash, proc_meta): return 'memory_report' in raw_dumps def _extract_memory_info(self, dump_pathname, processor_notes): """Extract and return the JSON data from the .json.gz memory report file""" def error_out(error_message): processor_notes.append(error_message) return {"ERROR": error_message} try: fd = gzip.open(dump_pathname, "rb") except IOError as x: error_message = "error in gzip for %s: %r" % (dump_pathname, x) return error_out(error_message) try: memory_info_as_string = fd.read() if len(memory_info_as_string) > self.config.max_size_uncompressed: error_message = ( "Uncompressed memory info too large %d (max: %d)" % ( len(memory_info_as_string), self.config.max_size_uncompressed, )) return error_out(error_message) memory_info = ujson.loads(memory_info_as_string) except IOError as x: error_message = "error in gzip for %s: %r" % (dump_pathname, x) return error_out(error_message) except ValueError as x: error_message = "error in json for %s: %r" % (dump_pathname, x) return error_out(error_message) finally: fd.close() return memory_info def _action(self, raw_crash, raw_dumps, processed_crash, processor_meta): pathname = raw_dumps['memory_report'] with temp_file_context(pathname): memory_report = self._extract_memory_info( dump_pathname=pathname, processor_notes=processor_meta.processor_notes) if isinstance(memory_report, dict) and memory_report.get('ERROR'): processed_crash.memory_report_error = memory_report['ERROR'] else: processed_crash.memory_report = memory_report return True
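# A small self-contained check (temporary file, made-up report) of the
# gzip-then-JSON reading path used by _extract_memory_info above.
import gzip
import json
import os
import tempfile

report = {'version': 1, 'reports': []}
fd, path = tempfile.mkstemp(suffix='.json.gz')
os.close(fd)
with gzip.open(path, 'wb') as gz:
    gz.write(json.dumps(report).encode('utf-8'))
with gzip.open(path, 'rb') as gz:
    assert json.loads(gz.read()) == report
os.remove(path)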
class ESCrashStorage(CrashStorageBase): """This sends processed crash reports to Elasticsearch.""" required_config = Namespace() required_config.add_option( 'transaction_executor_class', default="socorro.database.transaction_executor." "TransactionExecutorWithLimitedBackoff", doc='a class that will manage transactions', from_string_converter=class_converter, ) required_config.add_option( 'index_creator_class', doc='a class that can create Elasticsearch indices', default='socorro.external.es.index_creator.IndexCreator', from_string_converter=class_converter ) required_config.elasticsearch = Namespace() required_config.elasticsearch.add_option( 'elasticsearch_class', default='socorro.external.es.connection_context.ConnectionContext', from_string_converter=class_converter, reference_value_from='resource.elasticsearch', ) # This cache reduces attempts to create indices, thus lowering overhead # each time a document is indexed. indices_cache = set() # These regex will catch field names from Elasticsearch exceptions. They # have been tested with Elasticsearch 1.4. field_name_string_error_re = re.compile(r'field=\"([\w\-.]+)\"') field_name_number_error_re = re.compile( r'\[failed to parse \[([\w\-.]+)]]' ) #-------------------------------------------------------------------------- def __init__(self, config, quit_check_callback=None): super(ESCrashStorage, self).__init__( config, quit_check_callback ) # Ok, it's sane, so let's continue. self.es_context = self.config.elasticsearch.elasticsearch_class( config=self.config.elasticsearch ) self.transaction = config.transaction_executor_class( config, self.es_context, quit_check_callback ) #-------------------------------------------------------------------------- def get_index_for_crash(self, crash_date): """Return the submission URL for a crash; based on the submission URL from config and the date of the crash. If the index name contains a datetime pattern (ex. %Y%m%d) then the crash_date will be parsed and appended to the index name. """ index = self.config.elasticsearch.elasticsearch_index if not index: return None elif '%' in index: # Note that crash_date must be a datetime object! index = crash_date.strftime(index) return index #-------------------------------------------------------------------------- def save_raw_and_processed(self, raw_crash, dumps, processed_crash, crash_id): """This is the only write mechanism that is actually employed in normal usage. """ crash_document = { 'crash_id': crash_id, 'processed_crash': processed_crash, 'raw_crash': raw_crash } self.transaction( self._submit_crash_to_elasticsearch, crash_document=crash_document ) #-------------------------------------------------------------------------- @staticmethod def reconstitute_datetimes(processed_crash): datetime_fields = [ 'submitted_timestamp', 'date_processed', 'client_crash_date', 'started_datetime', 'startedDateTime', 'completed_datetime', 'completeddatetime', ] for a_key in datetime_fields: try: processed_crash[a_key] = string_to_datetime( processed_crash[a_key] ) except KeyError: # not there? we don't care pass #-------------------------------------------------------------------------- def _submit_crash_to_elasticsearch(self, connection, crash_document): """Submit a crash report to elasticsearch. """ # Massage the crash such that the date_processed field is formatted # in the fashion of our established mapping. self.reconstitute_datetimes(crash_document['processed_crash']) # Obtain the index name. 
es_index = self.get_index_for_crash( crash_document['processed_crash']['date_processed'] ) es_doctype = self.config.elasticsearch.elasticsearch_doctype crash_id = crash_document['crash_id'] # Attempt to create the index; it's OK if it already exists. if es_index not in self.indices_cache: index_creator = self.config.index_creator_class(config=self.config) index_creator.create_socorro_index(es_index) # Submit the crash for indexing. # Don't retry more than 5 times. That is to avoid infinite loops in # case of an unhandled exception. times = range(5) while times.pop(-1): try: connection.index( index=es_index, doc_type=es_doctype, body=crash_document, id=crash_id ) break except elasticsearch.exceptions.TransportError as e: field_name = None if 'MaxBytesLengthExceededException' in e.error: # This is caused by a string that is way too long for # Elasticsearch. matches = self.field_name_string_error_re.findall(e.error) if matches: field_name = matches[0] elif 'NumberFormatException' in e.error: # This is caused by a number that is either too big for # Elasticsearch or just not a number. matches = self.field_name_number_error_re.findall(e.error) if matches: field_name = matches[0] if not field_name: # We are unable to parse which field to remove, we cannot # try to fix the document. Let it raise. self.config.logger.critical( 'Submission to Elasticsearch failed for %s (%s)', crash_id, e, exc_info=True ) raise if field_name.endswith('.full'): # Remove the `.full` at the end, that is a special mapping # construct that is not part of the real field name. field_name = field_name.rstrip('.full') # Now remove that field from the document before trying again. field_path = field_name.split('.') parent = crash_document for i, field in enumerate(field_path): if i == len(field_path) - 1: # This is the last level, so `field` contains the name # of the field that we want to remove from `parent`. del parent[field] else: parent = parent[field] # Add a note in the document that a field has been removed. if crash_document.get('removed_fields'): crash_document['removed_fields'] = '{} {}'.format( crash_document['removed_fields'], field_name ) else: crash_document['removed_fields'] = field_name except elasticsearch.exceptions.ElasticsearchException as e: self.config.logger.critical( 'Submission to Elasticsearch failed for %s (%s)', crash_id, e, exc_info=True ) raise
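# Stand-alone illustration (hypothetical document and field name) of the
# nested-field removal performed above when Elasticsearch rejects a value:
# walk the dotted path and delete the leaf from its parent.
def remove_field(document, dotted_name):
    parts = dotted_name.split('.')
    parent = document
    for part in parts[:-1]:
        parent = parent[part]
    del parent[parts[-1]]


doc = {'processed_crash': {'json_dump': {'largest_free_vm_block': 'x' * 1000}}}
remove_field(doc, 'processed_crash.json_dump.largest_free_vm_block')
assert doc == {'processed_crash': {'json_dump': {}}}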
class HBaseSingleConnectionContext(RequiredConfig): """a configman compliant class for setup of HBase connections DO NOT SHARE HBASE CONNECTIONS BETWEEN THREADS """ #-------------------------------------------------------------------------- # configman parameter definition section # here we're setting up the minimal parameters required for connecting required_config = Namespace() required_config.add_option( 'number_of_retries', doc='Max. number of retries when fetching from hbaseClient', default=0, reference_value_from='resource.hbase') required_config.add_option( 'hbase_host', doc='Host to HBase server', default='localhost', reference_value_from='resource.hbase', ) required_config.add_option( 'hbase_port', doc='Port to HBase server', default=9090, reference_value_from='resource.hbase', ) required_config.add_option( 'hbase_timeout', doc='timeout in milliseconds for an HBase connection', default=5000, reference_value_from='resource.hbase', ) required_config.add_option( 'temporary_file_system_storage_path', doc='a local filesystem path where dumps temporarily ' 'during processing', default='/home/socorro/temp', reference_value_from='resource.hbase', ) required_config.add_option( 'dump_file_suffix', doc='the suffix used to identify a dump file (for use in temp files)', default='.dump', reference_value_from='resource.hbase', ) #-------------------------------------------------------------------------- def __init__(self, config, local_config=None): """Initialize the parts needed to start making database connections parameters: config - the complete config for the app. If a real app, this would be where a logger or other resources could be found. local_config - this is the namespace within the complete config where the actual database parameters are found""" super(HBaseSingleConnectionContext, self).__init__() self.config = config if local_config is None: local_config = config dummy_connection = hbase_client.HBaseConnectionForCrashReports( local_config.hbase_host, local_config.hbase_port, local_config.hbase_timeout, logger=self.config.logger) dummy_connection.close() self.operational_exceptions = \ dummy_connection.hbaseThriftExceptions self.operational_exceptions += \ (hbase_client.NoConnectionException,) self.conditional_exceptions = () #-------------------------------------------------------------------------- def connection(self, name_unused=None): """return a new database connection parameters: name_unused - optional named connections. Used by the derived class """ #self.config.logger.debug('creating new HBase connection') return hbase_client.HBaseConnectionForCrashReports( self.config.hbase_host, self.config.hbase_port, self.config.hbase_timeout, logger=self.config.logger) #-------------------------------------------------------------------------- @contextlib.contextmanager def __call__(self, name=None): """returns a database connection wrapped in a contextmanager. The context manager will assure that the connection is closed but will not try to commit or rollback lingering transactions. parameters: name - an optional name for the database connection""" conn = self.connection(name) try: #self.config.logger.debug('connection HBase acquired') yield conn finally: self.close_connection(conn) #-------------------------------------------------------------------------- def close_connection(self, connection, force=False): """close the connection passed in. This function exists to allow derived classes to override the closing behavior. 
parameters: connection - the database connection object force - unused boolean to force closure; used in derived classes """ #self.config.logger.debug('connection HBase closed') connection.close() #-------------------------------------------------------------------------- def close(self): """close any pooled or cached connections. Since this base class object does no caching, there is no implementation required. Derived classes may implement it.""" pass #-------------------------------------------------------------------------- def is_operational_exception(self, msg): """return True if a conditional exception is actually an operational error. Return False if it's a genuine error that should probably be raised and propagate up. Some conditional exceptions might be actually be some form of operational exception "labelled" wrong by the psycopg2 code error handler. """ return False #-------------------------------------------------------------------------- def force_reconnect(self): pass
class DependencySecurityCheckCronApp(BaseCronApp): """Configuration values used by this app: crontabber.class-DependencySecurityCheckCronApp.nsp_path Path to the nsp binary for checking Node dependencies. crontabber.class-DependencySecurityCheckCronApp.safety_path Path to the PyUp Safety binary for checking Python dependencies. crontabber.class-DependencySecurityCheckCronApp.safety_api_key Optional API key to pass to Safety. crontabber.class-DependencySecurityCheckCronApp.package_json_path Path to the package.json file to run nsp against. secrets.sentry.dsn If specified, vulnerabilities will be reported to Sentry instead of logged to the console. """ app_name = 'dependency-security-check' app_description = ( 'Runs third-party tools that check for known security vulnerabilites in Socorro\'s ' 'dependencies.') app_version = '0.1' required_config = Namespace() required_config.add_option( 'nsp_path', doc='Path to the nsp binary', ) required_config.add_option( 'safety_path', doc='Path to the PyUp safety binary', ) required_config.add_option( 'safety_api_key', doc='API key for Safety to use latest Pyup vulnerability database', ) required_config.add_option( 'package_json_path', doc='Path to the package.json file to run nsp against', ) def run(self): self.validate_options() vulnerabilities = self.get_python_vulnerabilities( ) + self.get_javascript_vulnerabilities() if vulnerabilities: try: dsn = self.config.sentry.dsn except KeyError: dsn = None if dsn: self.alert_sentry(dsn, vulnerabilities) else: self.alert_log(vulnerabilities) def validate_options(self): # Validate file path options for option in ('nsp_path', 'safety_path', 'package_json_path'): value = self.config.get(option) if not value: raise OptionError('Required option "%s" is empty' % option) elif not os.path.exists(value): raise OptionError( 'Option "%s" points to a nonexistant file (%s)' % (option, value)) elif not os.path.isfile(value): raise OptionError('Option "%s" does not point to a file (%s)' % (option, value)) def alert_sentry(self, dsn, vulnerabilities): client = raven_client.get_client(dsn) client.context.activate() client.context.merge({ 'extra': { 'data': {vuln.key: vuln.summary for vuln in vulnerabilities}, }, }) client.captureMessage('Dependency security check failed') def alert_log(self, vulnerabilities): self.config.logger.error('Vulnerabilities found in dependencies!') for vuln in vulnerabilities: self.config.logger.error('%s: %s' % (vuln.key, vuln.summary)) def get_python_vulnerabilities(self): """Check Python dependencies via Pyup's safety command. :returns list(Vulnerability): :raises DependencySecurityCheckFailed: """ # Safety checks what's installed in the current virtualenv, so no need # for any paths. cmd = [self.config.safety_path, 'check', '--json'] if self.config.get('safety_api_key'): cmd += ['--key', self.config.safety_api_key] process = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE) output, error_output = process.communicate() if process.returncode == 0: return [] elif process.returncode == 255: try: results = json.loads(output) return [ Vulnerability( type='python', dependency=result[0], installed_version=result[2], affected_versions=result[1], description=result[3], ) for result in results ] except (ValueError, IndexError) as err: raise DependencySecurityCheckFailed( 'Could not parse pyup safety output', err, output, ) raise DependencySecurityCheckFailed(error_output) def get_javascript_vulnerabilities(self): """Check JavaScript dependencies via the nsp command. 
:returns list(Vulnerability): :raises DependencySecurityCheckFailed: """ process = Popen( [ self.config.nsp_path, 'check', '--reporter=json', ], stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=dirname(self.config.package_json_path), ) output, error_output = process.communicate() if process.returncode == 0: return [] elif process.returncode == 1: try: results = json.loads(output) return [ Vulnerability( type='javascript', dependency=result['module'], installed_version=result['version'], affected_versions=result['vulnerable_versions'], description=result['advisory'], ) for result in results ] except (ValueError, KeyError) as err: raise DependencySecurityCheckFailed( 'Could not parse nsp output', err, output) raise DependencySecurityCheckFailed(error_output)
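# Example (fabricated row contents; the positional layout matches how the
# rows are unpacked in get_python_vulnerabilities above) of turning one
# `safety check --json` entry into the fields used for a Vulnerability.
row = ['somepackage', '<1.2.3', '1.2.0', 'example advisory text']
vulnerability_fields = {
    'type': 'python',
    'dependency': row[0],
    'affected_versions': row[1],
    'installed_version': row[2],
    'description': row[3],
}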
class ESCrashStorage(CrashStorageBase):
    """This sends raw and processed crash reports to Elasticsearch."""

    required_config = Namespace()
    required_config.elasticsearch = Namespace()
    required_config.elasticsearch.add_option(
        'elasticsearch_class',
        default='socorro.external.es.connection_context.ConnectionContext',
        from_string_converter=class_converter,
        reference_value_from='resource.elasticsearch',
    )

    # These regexes catch field names from Elasticsearch exceptions. They
    # have been tested with Elasticsearch 1.4.
    field_name_string_error_re = re.compile(r'field=\"([\w\-.]+)\"')
    field_name_number_error_re = re.compile(r'\[failed to parse \[([\w\-.]+)]]')

    def __init__(self, config, namespace='', quit_check_callback=None):
        super().__init__(config, namespace=namespace, quit_check_callback=quit_check_callback)

        self.es_context = self.config.elasticsearch.elasticsearch_class(
            config=self.config.elasticsearch
        )
        self.metrics = markus.get_metrics(namespace)

    def get_index_for_crash(self, crash_date):
        """Return the submission URL for a crash; based on the submission URL
        from config and the date of the crash.

        If the index name contains a datetime pattern (ex. %Y%m%d) then the
        crash_date will be parsed and appended to the index name.

        """
        index = self.config.elasticsearch.elasticsearch_index

        if not index:
            return None
        elif '%' in index:
            # Note that crash_date must be a datetime object!
            index = crash_date.strftime(index)

        return index

    def save_raw_and_processed(self, raw_crash, dumps, processed_crash, crash_id):
        """Save raw and processed crash data to Elasticsearch"""
        # Massage the crash such that the date_processed field is formatted
        # in the fashion of our established mapping.
        reconstitute_datetimes(processed_crash)

        # Remove bad keys from the raw crash--these keys are essentially
        # user-provided and can contain junk data
        remove_bad_keys(raw_crash)

        # Truncate values that are too long
        truncate_keyword_field_values(FIELDS, raw_crash)
        truncate_keyword_field_values(FIELDS, processed_crash)

        # Convert pseudo-boolean values to boolean values
        convert_booleans(FIELDS, raw_crash)
        convert_booleans(FIELDS, processed_crash)

        # Capture crash data size metrics--do this only after we've cleaned up
        # the crash data
        self.capture_crash_metrics(raw_crash, processed_crash)

        crash_document = {
            'crash_id': crash_id,
            'processed_crash': processed_crash,
            'raw_crash': raw_crash
        }

        self._submit_crash_to_elasticsearch(crash_document)

    def capture_crash_metrics(self, raw_crash, processed_crash):
        """Capture metrics about crash data being saved to Elasticsearch"""
        try:
            self.metrics.histogram(
                'raw_crash_size',
                value=len(json.dumps(raw_crash, cls=JsonDTEncoder))
            )
        except Exception:
            # NOTE(willkg): An error here shouldn't screw up saving data. Log it so we can fix it
            # later.
            self.logger.exception('something went wrong when capturing raw_crash_size')

        try:
            self.metrics.histogram(
                'processed_crash_size',
                value=len(json.dumps(processed_crash, cls=JsonDTEncoder))
            )
        except Exception:
            # NOTE(willkg): An error here shouldn't screw up saving data. Log it so we can fix it
            # later.
            self.logger.exception('something went wrong when capturing processed_crash_size')

    def _index_crash(self, connection, es_index, es_doctype, crash_document, crash_id):
        try:
            start_time = time.time()
            connection.index(
                index=es_index,
                doc_type=es_doctype,
                body=crash_document,
                id=crash_id
            )
            index_outcome = 'successful'
        except Exception:
            index_outcome = 'failed'
            raise
        finally:
            elapsed_time = time.time() - start_time
            self.metrics.histogram(
                'index',
                value=elapsed_time * 1000.0,
                tags=['outcome:' + index_outcome]
            )

    def _submit_crash_to_elasticsearch(self, crash_document):
        """Submit a crash report to elasticsearch"""
        index_name = self.get_index_for_crash(crash_document['processed_crash']['date_processed'])
        es_doctype = self.config.elasticsearch.elasticsearch_doctype
        crash_id = crash_document['crash_id']

        # Attempt to create the index; it's OK if it already exists.
        self.es_context.create_index(index_name)

        # Submit the crash for indexing.
        # Don't retry more than 5 times. That is to avoid infinite loops in
        # case of an unhandled exception.
        for attempt in range(5):
            try:
                with self.es_context() as conn:
                    return self._index_crash(conn, index_name, es_doctype, crash_document, crash_id)

            except elasticsearch.exceptions.ConnectionError:
                # If this is a connection error, sleep a second and then try again
                time.sleep(1.0)

            except elasticsearch.exceptions.TransportError as e:
                # If this is a TransportError, we try to figure out what the error
                # is and fix the document and try again
                field_name = None

                if 'MaxBytesLengthExceededException' in e.error:
                    # This is caused by a string that is way too long for
                    # Elasticsearch.
                    matches = self.field_name_string_error_re.findall(e.error)
                    if matches:
                        field_name = matches[0]

                elif 'NumberFormatException' in e.error:
                    # This is caused by a number that is either too big for
                    # Elasticsearch or just not a number.
                    matches = self.field_name_number_error_re.findall(e.error)
                    if matches:
                        field_name = matches[0]

                if not field_name:
                    # We are unable to parse which field to remove, we cannot
                    # try to fix the document. Let it raise.
                    self.logger.critical(
                        'Submission to Elasticsearch failed for %s (%s)',
                        crash_id,
                        e,
                        exc_info=True
                    )
                    raise

                if field_name.endswith('.full'):
                    # Remove the `.full` at the end, that is a special mapping
                    # construct that is not part of the real field name. Slice
                    # off the suffix rather than using rstrip, which strips a
                    # set of characters and can eat the end of the field name.
                    field_name = field_name[:-len('.full')]

                # Now remove that field from the document before trying again.
                field_path = field_name.split('.')
                parent = crash_document
                for i, field in enumerate(field_path):
                    if i == len(field_path) - 1:
                        # This is the last level, so `field` contains the name
                        # of the field that we want to remove from `parent`.
                        del parent[field]
                    else:
                        parent = parent[field]

                # Add a note in the document that a field has been removed.
                if crash_document.get('removed_fields'):
                    crash_document['removed_fields'] = '{} {}'.format(
                        crash_document['removed_fields'],
                        field_name
                    )
                else:
                    crash_document['removed_fields'] = field_name

            except elasticsearch.exceptions.ElasticsearchException as exc:
                self.logger.critical(
                    'Submission to Elasticsearch failed for %s (%s)',
                    crash_id,
                    exc,
                    exc_info=True
                )
                raise
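# A quick, standalone illustration of the index-name behavior in
# get_index_for_crash(): when the configured elasticsearch_index contains
# strftime patterns, the crash date selects the concrete index. This is a
# simplified sketch, not the class method; the 'socorro%Y%W' template mirrors
# the default that appears later in this document.
import datetime

def index_for(index_template, crash_date):
    if not index_template:
        return None
    if '%' in index_template:
        # crash_date must be a datetime object
        return crash_date.strftime(index_template)
    return index_template

# index_for('socorro%Y%W', datetime.datetime(2019, 3, 14)) -> 'socorro201910'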
class B(A):
    foo = 'b'
    required_config = Namespace()
    required_config.add_option('z', default=2)
class ReprocessCrashlistApp(App):
    app_name = 'reprocess_crashlist'
    app_version = '1.0'
    app_description = __doc__

    required_config = Namespace()
    required_config.namespace('reprocesscrashlist')
    required_config.reprocesscrashlist.add_option(
        'host',
        doc='host to connect to for RabbitMQ',
        default='localhost',
        reference_value_from='resource.rabbitmq',
    )
    required_config.reprocesscrashlist.add_option(
        'port',
        doc='port to connect to for RabbitMQ',
        default=5672,
        reference_value_from='resource.rabbitmq',
    )
    required_config.reprocesscrashlist.add_option(
        'rabbitmq_user',
        doc='user to connect to for RabbitMQ',
        default='guest',
        reference_value_from='secrets.rabbitmq',
    )
    required_config.reprocesscrashlist.add_option(
        'rabbitmq_password',
        doc="the user's RabbitMQ password",
        default='guest',
        reference_value_from='secrets.rabbitmq',
        secret=True,
    )
    required_config.reprocesscrashlist.add_option(
        name='virtual_host',
        doc='the name of the RabbitMQ virtual host',
        default='/',
        reference_value_from='resource.rabbitmq',
    )
    required_config.reprocesscrashlist.add_option(
        'crashes',
        doc='File containing crash UUIDs, one per line',
        default='crashlist.txt'
    )

    def connect(self):
        logging.debug("connecting to rabbit")
        config = self.config.reprocesscrashlist
        try:
            connection = pika.BlockingConnection(
                pika.ConnectionParameters(
                    host=config.host,
                    port=config.port,
                    virtual_host=config.virtual_host,
                    credentials=pika.credentials.PlainCredentials(
                        config.rabbitmq_user,
                        config.rabbitmq_password
                    )
                )
            )
        except Exception:
            logging.error("Failed to connect")
            raise
        self.connection = connection

    def main(self):
        self.connect()
        channel = self.connection.channel()
        channel.queue_declare(queue='socorro.reprocessing', durable=True)

        with open(self.config.reprocesscrashlist.crashes, 'r') as file:
            for uuid in file.read().splitlines():
                channel.basic_publish(
                    exchange='',
                    routing_key="socorro.reprocessing",
                    body=uuid,
                    properties=pika.BasicProperties(delivery_mode=2)
                )
                logging.debug('submitted %s' % uuid)

        self.connection.close()
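# Hedged usage sketch: running the app with a generic_app-style runner, the
# same pattern used by the other App subclasses in this document. The import
# path below is an assumption for illustration; adjust it to wherever the
# runner's main() actually lives in your checkout.
if __name__ == '__main__':
    from socorro.app.generic_app import main  # assumed location
    main(ReprocessCrashlistApp)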
    def test_poly_crash_storage(self):
        n = Namespace()
        n.add_option(
            'storage',
            default=PolyCrashStorage,
        )
        n.add_option(
            'logger',
            default=mock.Mock(),
        )
        value = {
            'storage_classes': (
                'socorro.unittest.external.test_crashstorage_base.A,'
                'socorro.unittest.external.test_crashstorage_base.A,'
                'socorro.unittest.external.test_crashstorage_base.B'
            ),
            'storage1.y': 37,
        }
        cm = ConfigurationManager(n, values_source_list=[value])
        with cm.context() as config:
            self.assertEqual(config.storage0.crashstorage_class.foo, 'a')
            self.assertEqual(config.storage1.crashstorage_class.foo, 'a')
            self.assertEqual(config.storage1.y, 37)
            self.assertEqual(config.storage2.crashstorage_class.foo, 'b')

            poly_store = config.storage(config)
            l = len(poly_store.storage_namespaces)
            self.assertEqual(
                l, 3,
                'expected poly_store to have length of 3, '
                'but %d was found instead' % l)
            self.assertEqual(poly_store.storage_namespaces[0], 'storage0')
            self.assertEqual(poly_store.storage_namespaces[1], 'storage1')
            self.assertEqual(poly_store.storage_namespaces[2], 'storage2')

            l = len(poly_store.stores)
            self.assertEqual(
                l, 3,
                'expected poly_store.stores to have length of 3, '
                'but %d was found instead' % l)
            self.assertEqual(poly_store.stores.storage0.foo, 'a')
            self.assertEqual(poly_store.stores.storage1.foo, 'a')
            self.assertEqual(poly_store.stores.storage2.foo, 'b')

            raw_crash = {'ooid': ''}
            dump = '12345'
            processed_crash = {'ooid': '', 'product': 17}
            for v in poly_store.stores.itervalues():
                v.save_raw_crash = Mock()
                v.save_processed = Mock()
                v.close = Mock()

            poly_store.save_raw_crash(raw_crash, dump, '')
            for v in poly_store.stores.itervalues():
                v.save_raw_crash.assert_called_once_with(raw_crash, dump, '')

            poly_store.save_processed(processed_crash)
            for v in poly_store.stores.itervalues():
                v.save_processed.assert_called_once_with(processed_crash)

            poly_store.save_raw_and_processed(raw_crash, dump, processed_crash, 'n')
            for v in poly_store.stores.itervalues():
                v.save_raw_crash.assert_called_with(raw_crash, dump, 'n')
                v.save_processed.assert_called_with(processed_crash)

            raw_crash = {'ooid': 'oaeu'}
            dump = '5432'
            processed_crash = {'ooid': 'aoeu', 'product': 33}
            poly_store.stores['storage1'].save_raw_crash = Mock()
            poly_store.stores['storage1'].save_raw_crash.side_effect = \
                Exception('this is messed up')
            poly_store.stores['storage2'].save_processed = Mock()
            poly_store.stores['storage2'].save_processed.side_effect = \
                Exception('this is messed up')

            self.assertRaises(
                PolyStorageError,
                poly_store.save_raw_crash,
                raw_crash,
                dump,
                ''
            )
            for v in poly_store.stores.itervalues():
                v.save_raw_crash.assert_called_with(raw_crash, dump, '')

            self.assertRaises(
                PolyStorageError,
                poly_store.save_processed,
                processed_crash
            )
            for v in poly_store.stores.itervalues():
                v.save_processed.assert_called_with(processed_crash)

            poly_store.stores['storage2'].close.side_effect = Exception
            self.assertRaises(PolyStorageError, poly_store.close)
            for v in poly_store.stores.itervalues():
                v.close.assert_called_with()
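# The test above exercises PolyCrashStorage's behavior of fanning a single
# save call out to every configured store and rolling per-store failures into
# one PolyStorageError. A minimal, self-contained sketch of that pattern
# follows (hypothetical names; this is not the real PolyCrashStorage code).
class SketchAggregateError(Exception):
    """Stand-in for PolyStorageError: carries the collected exceptions."""

class SketchPolyStore(object):
    def __init__(self, stores):
        self.stores = stores  # objects exposing save_raw_crash()

    def save_raw_crash(self, raw_crash, dump, crash_id):
        errors = []
        for store in self.stores:
            try:
                store.save_raw_crash(raw_crash, dump, crash_id)
            except Exception as exc:
                # keep going so one broken backend cannot block the others
                errors.append(exc)
        if errors:
            raise SketchAggregateError(errors)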
class MiddlewareApp(App):
    app_name = 'middleware'
    app_version = '3.1'
    app_description = __doc__

    #--------------------------------------------------------------------------
    # in this section, define any configuration requirements
    required_config = Namespace()

    #--------------------------------------------------------------------------
    # implementations namespace
    #     the namespace is for external implementations of the services
    #--------------------------------------------------------------------------
    required_config.namespace('implementations')
    required_config.implementations.add_option(
        'implementation_list',
        doc='list of packages for service implementations',
        default='psql:socorro.external.postgresql, '
                'hbase:socorro.external.hb, '
                'es:socorro.external.elasticsearch, '
                'fs:socorro.external.fs, '
                'http:socorro.external.http, '
                'rabbitmq:socorro.external.rabbitmq',
        from_string_converter=items_list_decode,
        to_string_converter=items_list_encode
    )
    required_config.implementations.add_option(
        'service_overrides',
        doc='comma-separated list of class overrides, e.g. `Crashes: hbase`',
        default='CrashData: fs, '
                'Correlations: http, '
                'CorrelationsSignatures: http, '
                'SuperSearch: es, '
                'Priorityjobs: rabbitmq, '
                'Query: es',
        from_string_converter=items_list_decode,
        to_string_converter=items_list_encode
    )

    #--------------------------------------------------------------------------
    # database namespace
    #     the namespace is for external implementations of the services
    #--------------------------------------------------------------------------
    required_config.namespace('database')
    required_config.database.add_option(
        'database_class',
        default='socorro.external.postgresql.connection_context.'
                'ConnectionContext',
        from_string_converter=class_converter
    )

    #--------------------------------------------------------------------------
    # hbase namespace
    #     the namespace is for external implementations of the services
    #--------------------------------------------------------------------------
    required_config.namespace('hbase')
    required_config.hbase.add_option(
        'hbase_class',
        default='socorro.external.hb.crashstorage.HBaseCrashStorage',
        from_string_converter=class_converter
    )

    #--------------------------------------------------------------------------
    # filesystem namespace
    #     the namespace is for external implementations of the services
    #--------------------------------------------------------------------------
    required_config.namespace('filesystem')
    required_config.filesystem.add_option(
        'filesystem_class',
        default='socorro.external.fs.crashstorage.FSLegacyRadixTreeStorage',
        from_string_converter=class_converter
    )

    #--------------------------------------------------------------------------
    # rabbitmq namespace
    #     the namespace is for external implementations of the services
    #--------------------------------------------------------------------------
    required_config.namespace('rabbitmq')
    required_config.rabbitmq.add_option(
        'rabbitmq_class',
        default='socorro.external.rabbitmq.connection_context.'
                'ConnectionContext',
        from_string_converter=class_converter
    )

    #--------------------------------------------------------------------------
    # webapi namespace
    #     this is all config options that used to belong to webapiconfig.py
    #--------------------------------------------------------------------------
    required_config.namespace('webapi')
    required_config.webapi.add_option(
        'elasticSearchHostname',
        default='localhost',
        doc='String containing the URI of the Elastic Search instance.',
        reference_value_from='resource.elasticsearch',
    )
    required_config.webapi.add_option(
        'elasticSearchPort',
        default='9200',
        doc='String containing the port on which to call the Elastic '
            'Search instance.',
        reference_value_from='resource.elasticsearch',
    )
    required_config.webapi.add_option(
        'elasticsearch_urls',
        default=['http://localhost:9200'],
        doc='the urls to the elasticsearch instances',
        from_string_converter=string_to_list,
        reference_value_from='resource.elasticsearch',
    )
    required_config.webapi.add_option(
        'elasticsearch_default_index',
        default='socorro',
        doc='the default index used to store data',
        reference_value_from='resource.elasticsearch',
    )
    required_config.webapi.add_option(
        'elasticsearch_index',
        default='socorro%Y%W',
        doc='an index format to pull crashes from elasticsearch '
            "(use datetime's strftime format to have "
            'daily, weekly or monthly indexes)',
        reference_value_from='resource.elasticsearch',
    )
    required_config.webapi.add_option(
        'elasticsearch_doctype',
        default='crash_reports',
        doc='the default doctype to use in elasticsearch',
        reference_value_from='resource.elasticsearch',
    )
    required_config.webapi.add_option(
        'elasticsearch_timeout',
        default=30,
        doc='the time in seconds before a query to elasticsearch fails',
        reference_value_from='resource.elasticsearch',
    )
    required_config.webapi.add_option(
        'elasticsearch_timeout_extended',
        default=120,
        doc='the time in seconds before a query to elasticsearch fails in '
            'restricted sections',
        reference_value_from='resource.elasticsearch',
    )
    required_config.webapi.add_option(
        'facets_max_number',
        default=50,
        doc='the maximum number of results a facet will return in search'
    )
    required_config.webapi.add_option(
        'searchMaxNumberOfDistinctSignatures',
        default=1000,
        doc='Integer containing the maximum allowed number of distinct '
            'signatures the system should retrieve. Used mainly for '
            'performance in Elasticsearch'
    )
    required_config.webapi.add_option(
        'search_default_date_range',
        default=7,  # in days
        doc='the default date range for searches, in days'
    )
    required_config.webapi.add_option(
        'search_maximum_date_range',
        default=365,  # in days
        doc='the maximum date range for searches, in days'
    )
    required_config.webapi.add_option(
        'platforms',
        default=[
            {
                "id": "windows",
                "name": "Windows NT"
            },
            {
                "id": "mac",
                "name": "Mac OS X"
            },
            {
                "id": "linux",
                "name": "Linux"
            },
        ],
        doc='Array associating OS ids to full names.',
        from_string_converter=lambda x: json.loads(x)
    )
    required_config.webapi.add_option(
        'non_release_channels',
        default=['beta', 'aurora', 'nightly'],
        doc='List of channels, excluding the `release` one.',
        from_string_converter=string_to_list
    )
    required_config.webapi.add_option(
        'restricted_channels',
        default=['beta'],
        doc='List of channels to restrict based on build ids.',
        from_string_converter=string_to_list
    )

    #--------------------------------------------------------------------------
    # web_server namespace
    #     the namespace is for config parameters for the web server
    #--------------------------------------------------------------------------
    required_config.namespace('web_server')
    required_config.web_server.add_option(
        'wsgi_server_class',
        doc='a class implementing a wsgi web server',
        default='socorro.webapi.servers.CherryPy',
        from_string_converter=class_converter
    )

    #--------------------------------------------------------------------------
    # http namespace
    #     the namespace is for config parameters for the http modules
    #--------------------------------------------------------------------------
    required_config.namespace('http')
    required_config.http.namespace('correlations')
    required_config.http.correlations.add_option(
        'base_url',
        doc='Base URL where correlations text files are',
        default='https://crash-analysis.mozilla.com/crash_analysis/',
    )
    required_config.http.correlations.add_option(
        'save_download',
        doc='Whether files downloaded for correlations should be '
            'temporarily stored on disk',
        default=True,
    )
    required_config.http.correlations.add_option(
        'save_seconds',
        doc='Number of seconds that the downloaded .txt file is stored '
            'in a temporary place',
        default=60 * 10,
    )
    required_config.http.correlations.add_option(
        'save_root',
        doc='Directory where the temporary downloads are stored '
            "(if left empty this will become the system's tmp directory)",
        default='',
    )

    #--------------------------------------------------------------------------
    # sentry namespace
    #     the namespace is for Sentry error capturing with Raven
    #--------------------------------------------------------------------------
    required_config.namespace('sentry')
    required_config.sentry.add_option(
        'dsn',
        doc='DSN for Sentry via raven',
        default='',
        reference_value_from='secrets.sentry',
    )

    #--------------------------------------------------------------------------
    # laglog namespace
    #     the namespace for the replica lag log
    #--------------------------------------------------------------------------
    required_config.namespace('laglog')
    required_config.laglog.add_option(
        'max_bytes_warning',
        default=16 * 1024 * 1024,
        doc="Number of bytes that warrants a warning"
    )
    required_config.laglog.add_option(
        'max_bytes_critical',
        default=32 * 1024 * 1024,
        doc="Number of bytes that warrants a critical"
    )

    # because the socorro.webapi.servers classes bring up their own default
    # configurations like port number, the only way to override the default
    # is like this:
    from socorro.webapi.servers import StandAloneServer
    StandAloneServer.required_config.port.set_default(8883, force=True)

    #--------------------------------------------------------------------------
    def main(self):
        # Apache modwsgi requires a module-level name 'application'
        global application

        # 1 turn these names of classes into real references to classes
        def lookup(file_and_class):
            file_name, class_name = file_and_class.rsplit('.', 1)
            overrides = dict(self.config.implementations.service_overrides)
            _list = self.config.implementations.implementation_list
            for prefix, base_module_path in _list:
                if class_name in overrides:
                    if prefix != overrides[class_name]:
                        continue
                try:
                    module = __import__(
                        '%s.%s' % (base_module_path, file_name),
                        globals(),
                        locals(),
                        [class_name]
                    )
                except ImportError:
                    raise ImportError(
                        "Unable to import %s.%s.%s" %
                        (base_module_path, file_name, class_name)
                    )
                return getattr(module, class_name)
            raise ImplementationConfigurationError(file_and_class)

        # This mapping will hold the collection of url/service-implementations.
        # It is populated in the for loop a few lines lower in this file.
        # The mapping is used in the 'wrap' function so that all services have
        # a place to look up dependent services.
        all_services_mapping = {}

        # 2 wrap each service class with the ImplementationWrapper class
        def wrap(cls, file_and_class):
            return type(
                cls.__name__,
                (ImplementationWrapper,),
                {
                    'cls': cls,
                    'file_and_class': file_and_class,
                    # give lookup access of dependent services to all services
                    'all_services': all_services_mapping,
                }
            )

        services_list = []
        # populate the 'services_list' with the tuples that will define the
        # urls and services offered by the middleware.
        for url, impl_class in SERVICES_LIST:
            impl_instance = lookup(impl_class)
            wrapped_impl = wrap(impl_instance, impl_class)
            services_list.append((url, wrapped_impl))
            all_services_mapping[impl_instance.__name__] = wrapped_impl

        self.web_server = self.config.web_server.wsgi_server_class(
            self.config,  # needs the whole config not the local namespace
            services_list
        )

        # for modwsgi the 'run' method returns the wsgi function that Apache
        # will use. For other webservers, the 'run' method actually starts
        # the standalone web server.
        application = self.web_server.run()
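# Hedged sketch of how the implementation_list and service_overrides options
# above cooperate inside lookup(): an override (if present) pins a service
# class to one backend prefix, and that prefix is mapped to a base module
# path. This is a simplified stand-in using plain data, not the real lookup(),
# and the function name is illustrative only.
def resolve_base_module(class_name, implementation_list, service_overrides):
    """Return the base module path that should provide `class_name`."""
    preferred_prefix = service_overrides.get(class_name)
    for prefix, base_module_path in implementation_list:
        if preferred_prefix is not None and prefix != preferred_prefix:
            continue
        return base_module_path
    raise LookupError('no implementation configured for %s' % class_name)

# resolve_base_module(
#     'SuperSearch',
#     [('psql', 'socorro.external.postgresql'),
#      ('es', 'socorro.external.elasticsearch')],
#     {'SuperSearch': 'es'},
# ) -> 'socorro.external.elasticsearch'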
def config_from_configman():
    definition_source = Namespace()
    definition_source.namespace('logging')
    definition_source.logging = socorro_app.App.required_config.logging

    definition_source.namespace('metricscfg')
    definition_source.metricscfg = socorro_app.App.required_config.metricscfg

    definition_source.namespace('elasticsearch')
    definition_source.elasticsearch.add_option(
        'elasticsearch_class',
        default=ElasticsearchConfig,
    )
    definition_source.namespace('database')
    definition_source.database.add_option(
        'database_storage_class',
        default=PostgreSQLCrashStorage,
    )
    definition_source.namespace('queuing')
    definition_source.queuing.add_option(
        'rabbitmq_reprocessing_class',
        default=ReprocessingOneRabbitMQCrashStore,
    )
    definition_source.namespace('priority')
    definition_source.priority.add_option(
        'rabbitmq_priority_class',
        default=PriorityjobRabbitMQCrashStore,
    )
    definition_source.namespace('data')
    definition_source.data.add_option(
        'crash_data_class',
        default=socorro.external.boto.crash_data.SimplifiedCrashData,
    )
    config = configuration(
        definition_source=definition_source,
        values_source_list=[
            settings.SOCORRO_IMPLEMENTATIONS_CONFIG,
        ]
    )
    # The ReprocessingOneRabbitMQCrashStore crash storage needs to have
    # a "logger" in the config object. To avoid having to use the
    # logger set up by configman as an aggregate, we just use the
    # same logger as we have here in the webapp.
    config.queuing.logger = logger
    config.priority.logger = logger
    config.data.logger = logger
    return config
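# Hedged usage sketch: how webapp code might consume the config built above.
# The namespace attributes mirror the ones defined in config_from_configman();
# instantiating the stored classes with their own sub-namespace is an
# assumption based on how the other storage classes in this document are
# constructed, not a statement of the webapp's actual wiring.
config = config_from_configman()
crash_data_api = config.data.crash_data_class(config.data)          # assumed pattern
reprocessing_queue = config.queuing.rabbitmq_reprocessing_class(config.queuing)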
class FileSystemCrashStorage(FileSystemThrottledCrashStorage):
    """This storage class is the only file system based crash storage system
    appropriate for storing both raw and processed crashes.  This class uses
    the same segregating raw crash storage as the previous class and adds
    processed storage.  Processed crashes are stored in their own file system
    root, 'pro_fs_root' (processed file system root) using the same radix
    directory system as the raw crashes."""

    required_config = Namespace()
    required_config.add_option(
        'pro_fs_root',
        doc='a path to a local file system for processed storage',
        default='./processedCrashStore',
        reference_value_from='resource.filesystem',
    )
    required_config.add_option(
        'minutes_per_slot',
        doc='the number of minutes in the lowest date directory',
        default=1,
        reference_value_from='resource.filesystem',
    )
    required_config.add_option(
        'sub_slot_count',
        doc='distribute data evenly among this many sub timeslots',
        default=1,
        reference_value_from='resource.filesystem',
    )
    required_config.add_option(
        'index_name',
        doc='the relative path to the top of the name storage tree from '
            'root parameter',
        default='name',
        reference_value_from='resource.filesystem',
    )
    required_config.add_option(
        'date_name',
        doc='the relative path to the top of the date storage tree from '
            'root parameter',
        default='date',
        reference_value_from='resource.filesystem',
    )
    required_config.add_option(
        'processed_crash_file_suffix',
        doc='the processed crash filename suffix',
        default='.jsonz',
        reference_value_from='resource.filesystem',
    )
    required_config.add_option(
        'gzip_compression_level',
        doc='the level of compression to use',
        default=9,
        reference_value_from='resource.filesystem',
    )
    required_config.add_option(
        'storage_depth',
        doc='the length of branches in the radix storage tree',
        default=2,
        reference_value_from='resource.filesystem',
    )

    #--------------------------------------------------------------------------
    def __init__(self, config, quit_check_callback=None):
        super(FileSystemCrashStorage, self).__init__(config)
        self.pro_crash_store = ProcessedDumpStorage(
            root=config.pro_fs_root,
            minutesPerSlot=config.minutes_per_slot,
            subSlotCount=config.sub_slot_count,
            indexName=config.index_name,
            dateName=config.date_name,
            fileSuffix=config.processed_crash_file_suffix,
            gzipCompression=config.gzip_compression_level,
            storageDepth=config.storage_depth,
            dumpGID=config.dump_gid,
            dumpPermissions=config.dump_permissions,
            dirPermissions=config.dir_permissions,
        )

    #--------------------------------------------------------------------------
    def save_processed(self, processed_crash):
        """save a processed crash (in the form of a Mapping) into a json
        file.  It first gets the underlying file system to give it a file
        handle open for writing, then it uses the 'json' module to write the
        mapping to the open file handle."""
        try:
            crash_id = processed_crash['uuid']
        except KeyError:
            raise CrashIDNotFound("uuid missing from processed_crash")
        try:
            self._stringify_dates_in_dict(processed_crash)
            processed_crash_file_handle = \
                self.pro_crash_store.newEntry(crash_id)
            try:
                json.dump(processed_crash, processed_crash_file_handle)
            finally:
                processed_crash_file_handle.close()
            self.logger.debug('saved processed- %s', crash_id)
        except Exception:
            self.logger.critical(
                'processed file system storage has failed for: %s',
                crash_id,
                exc_info=True
            )
            raise

    #--------------------------------------------------------------------------
    def get_unredacted_processed(self, crash_id):
        """fetch a processed json file from the underlying file system"""
        try:
            return self.pro_crash_store.getDumpFromFile(crash_id)
        except OSError:
            raise CrashIDNotFound(crash_id)

    #--------------------------------------------------------------------------
    def remove(self, crash_id):
        """remove all traces of a crash, both raw and processed, from the
        file system."""
        try:
            super(FileSystemCrashStorage, self).remove(crash_id)
        except CrashIDNotFound:
            self.logger.warning(
                'raw crash not found for deletion: %s', crash_id)
        try:
            self.pro_crash_store.removeDumpFile(crash_id)
        except OSError:
            self.logger.warning(
                'processed crash not found for deletion: %s', crash_id)

    #--------------------------------------------------------------------------
    @staticmethod
    def _stringify_dates_in_dict(a_dict):
        for name, value in a_dict.iteritems():
            if isinstance(value, datetime.datetime):
                a_dict[name] = (
                    "%4d-%02d-%02d %02d:%02d:%02d.%d" %
                    (value.year, value.month, value.day,
                     value.hour, value.minute, value.second,
                     value.microsecond)
                )
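# A small, self-contained illustration of what _stringify_dates_in_dict does
# before the processed crash is serialized: datetime values are flattened to
# strings so json.dump does not choke on them. This sketch uses the Python 3
# spelling (items()); the class above targets Python 2 (iteritems()).
import datetime

def stringify_dates_in_dict(a_dict):
    for name, value in list(a_dict.items()):
        if isinstance(value, datetime.datetime):
            a_dict[name] = "%4d-%02d-%02d %02d:%02d:%02d.%d" % (
                value.year, value.month, value.day,
                value.hour, value.minute, value.second, value.microsecond
            )
    return a_dict

# stringify_dates_in_dict(
#     {'date_processed': datetime.datetime(2014, 2, 7, 12, 30, 5, 250)}
# ) -> {'date_processed': '2014-02-07 12:30:05.250'}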