def parse_config(self, config_file_path):
    """Load the main logfeeder config, then the app-specific config it names.

    Args:
        config_file_path (string): path to the main configuration file
    """
    # Main logfeeder configuration file.
    staticconf.YamlConfiguration(config_file_path)
    self.aws_config_filepath = staticconf.read(
        'logfeeder.aws_config_filepath')
    self.domain = staticconf.read('logfeeder.domain')
    app_config_path = staticconf.read('{0}.file'.format(self.APP_NAME))

    # App-specific configuration file; the loaded contents decide which
    # optional settings are applied below.
    app_settings = staticconf.YamlConfiguration(app_config_path)
    self.api_creds_filepath = staticconf.read('api_creds_filepath')

    # A rate limiter is only created when the app config asks for one.
    if 'rate_limiter_num_calls_per_timeunit' in app_settings:
        calls = staticconf.read_int('rate_limiter_num_calls_per_timeunit')
        seconds = staticconf.read_int(
            'rate_limiter_num_seconds_per_timeunit')
        self.rate_limiter = RateLimiter(
            calls_per_timeunit=calls,
            seconds_per_timeunit=seconds,
        )

    # Every "enable_<name>" key toggles the sub-API called <name>.
    enable_prefix = 'enable_'
    self.sub_apis = {}
    for setting in app_settings:
        if not setting.startswith(enable_prefix):
            continue
        sub_api_name = setting[len(enable_prefix):]
        self.sub_apis[sub_api_name] = staticconf.read_bool(setting)

    # An API without any sub-APIs registers its own APP_NAME in
    # self.sub_apis for code compatibility.
    if not self.sub_apis:
        self.sub_apis = {self.APP_NAME: True}
def __init__(self, db_obj, sqs_scanner_queue, sqs_worker_queue, emailer):
    """
    Store the collaborators and read the scanner settings from staticconf.

    :param db_obj: dynamodb table of scheduled jobs
    :type db_obj: boto.dynamodb2.table.Table

    :param sqs_scanner_queue: scanner queue which to send a feed back message
    :type sqs_scanner_queue: boto.sqs.queue.Queue

    :param sqs_worker_queue: worker queue which to receive job from
    :type sqs_worker_queue: boto.sqs.queue.Queue

    :param emailer: stored as ``self.emailer``; not used by the constructor
    """
    self.db = db_obj
    self.scanner_queue = sqs_scanner_queue
    self.worker_queue = sqs_worker_queue
    self.emailer = emailer

    self._should_run = True
    self._run_once = False

    self.worker_keepalive_sec = staticconf.read_int('scanner.worker_keepalive_sec')
    self.max_error_retries = staticconf.read_int('max_error_retries')

    queue_attributes = self.scanner_queue.get_queue_attributes()
    retention_sec = int(queue_attributes['MessageRetentionPeriod'])
    # Add an hour to give SQS enough time to delete the message.
    self.msg_max_retention_sec = retention_sec + 3600

    log("scanner initialization")
    # Log a stringified snapshot of every attribute set above.
    log({name: str(value) for name, value in vars(self).iteritems()})
def get_target_capacity_value(target_capacity: str, pool: str, scheduler: str) -> int:
    """Resolve a target capacity given as 'min', 'max' or a number to an int.

    :param target_capacity: 'min' or 'max' (case-insensitive), or a string
        that ``int()`` can parse
    :param pool: pool name used to build the pool config namespace
    :param scheduler: scheduler name used to build the pool config namespace
    :returns: the configured scaling limit for 'min'/'max', otherwise the
        parsed integer
    """
    requested = target_capacity.lower()
    pool_namespace = POOL_NAMESPACE.format(pool=pool, scheduler=scheduler)

    limit_keys = {
        'min': 'scaling_limits.min_capacity',
        'max': 'scaling_limits.max_capacity',
    }
    limit_key = limit_keys.get(requested)
    if limit_key is None:
        # Not a symbolic limit: treat the argument as a literal number.
        return int(requested)
    return staticconf.read_int(limit_key, namespace=pool_namespace)
def configure_initial(self):
    """Load configuration and set up the state used by the spot-price batch."""
    # Any keys in the env_config will override defaults in config.yaml.
    setup_config(self.options)

    self.logger = logger
    aws_region = staticconf.read_string('aws.region')
    self.region = aws_region
    self.last_time_called = self.options.start_time

    run_interval_seconds = staticconf.read_int('batches.spot_prices.run_interval_seconds')
    dedupe_interval_seconds = staticconf.read_int('batches.spot_prices.dedupe_interval_seconds')
    self.run_interval = run_interval_seconds
    self.dedupe_interval = dedupe_interval_seconds

    self.metrics_client = ClustermanMetricsBotoClient(region_name=aws_region)
def add_instance(self, instance):
    """Record an instance's CPUs now on the AWS side and, after a random join delay, on the Mesos side.

    Sets ``instance.join_time`` to the instance start time shifted by a
    delay drawn from a Gaussian whose mean and standard deviation come from
    configuration.
    """
    instance_cpus = instance.resources.cpus
    self.aws_cpus.add_delta(self.current_time, instance_cpus)

    mean_seconds = staticconf.read_int('join_delay_mean_seconds')
    stdev_seconds = staticconf.read_int('join_delay_stdev_seconds')
    delay_seconds = random.gauss(mean_seconds, stdev_seconds)

    instance.join_time = instance.start_time.shift(seconds=delay_seconds)
    self.mesos_cpus.add_delta(instance.join_time, instance_cpus)
def _populate_cluster_size_events(simulator, start_time, end_time):
    """Queue one ModifyClusterSizeEvent per recorded 'fulfilled_capacity' data point.

    :param simulator: object providing ``metrics_client``, ``metadata``,
        ``markets`` and ``add_event``
    :param start_time: start of the metrics window (its ``.timestamp`` is used)
    :param end_time: end of the metrics window (its ``.timestamp`` is used)
    """
    capacity_metrics = simulator.metrics_client.get_metric_values(
        'fulfilled_capacity',
        METADATA,
        start_time.timestamp,
        end_time.timestamp,
        use_cache=False,
        extra_dimensions=get_cluster_dimensions(
            simulator.metadata.cluster,
            simulator.metadata.pool,
            simulator.metadata.scheduler,
        ),
    )

    for i, (timestamp, data) in enumerate(capacity_metrics['fulfilled_capacity']):
        market_data = {}
        for market_str, value in data.items():
            market = InstanceMarket.parse(market_str)
            # NOTE(review): weight is 0 when a market has fewer CPUs than
            # cpus_per_weight, which would raise ZeroDivisionError on the
            # next line -- confirm that cannot happen with real data.
            weight = get_market_resources(market).cpus // staticconf.read_int('cpus_per_weight')
            market_data[market] = int(value) // weight
        simulator.markets |= set(market_data)

        # Want to start the cluster out at the expected capacity, so only
        # events after the first one use the join delay.
        use_join_delay = (i != 0)
        simulator.add_event(ModifyClusterSizeEvent(arrow.get(timestamp), market_data, use_join_delay))
def submit_host_for_termination(self, host: Host, delay: Optional[int] = None) -> None:
    """Send a message describing ``host`` to the termination queue.

    :param host: the host to terminate
    :param delay: delivery delay in seconds; when None, the per-sender
        ``drain_termination_timeout_seconds`` setting is used (default 90)
    """
    if delay is not None:
        delay_seconds = delay
    else:
        delay_seconds = staticconf.read_int(
            f'drain_termination_timeout_seconds.{host.sender}',
            default=90,
        )
    logger.info(f'Delaying terminating {host.instance_id} for {delay_seconds} seconds')

    sender_attribute = {
        'DataType': 'String',
        'StringValue': host.sender,
    }
    host_details = {
        'instance_id': host.instance_id,
        'ip': host.ip,
        'hostname': host.hostname,
        'group_id': host.group_id,
        'scheduler': host.scheduler,
    }
    # NOTE(review): annotated as returning None, but the client's response
    # is passed back to the caller.
    return self.client.send_message(
        QueueUrl=self.termination_queue_url,
        DelaySeconds=delay_seconds,
        MessageAttributes={'Sender': sender_attribute},
        MessageBody=json.dumps(host_details),
    )
def __init__(self, config_loc, config_override_loc, emailer,
             num_processes=1, wait_timeout_sec=60):
    """
    Store the configuration paths and set up the database connections.

    :param config_loc: path of config.yaml
    :type config_loc: string

    :param config_override_loc: path of config-env-dev.yaml
    :type config_override_loc: string

    :param emailer: stored as ``self.emailer``; not used by the constructor

    :param num_processes: number of worker processes to use for sqs request
    :type num_processes: int

    :param wait_timeout_sec: A timeout passed to conditional variable wait
        function. If thread is woken up on timeout, do some maintenance work.
        Values below 60 are raised to 60.
    :type wait_timeout_sec: int
    """
    # Plain inputs first.
    self._config_loc, self._config_override_loc = config_loc, config_override_loc
    self._stop_requested = False
    self._run_once = False

    # Settings and backing stores.
    self.max_error_retries = staticconf.read_int('max_error_retries')
    self.etl_helper = ETLStatusHelper()
    self.jobs_db = TableConnection.get_connection('ScheduledJobs')
    self.runs_db = TableConnection.get_connection('ETLRecords')

    # Worker coordination.
    self._num_processes = num_processes
    condition_lock = threading.Lock()
    self._cond = threading.Condition(condition_lock)
    self._wait_timeout_sec = max(wait_timeout_sec, 60)

    self.emailer = emailer
def configure_initial(self) -> None:
    """Load configuration for every pool in the cluster and set up the metrics client."""
    # Since we want to collect metrics for all the pools, setup_config must
    # run first to load the cluster config path; the entries in that
    # directory are read afterwards.
    setup_config(self.options)
    cluster = self.options.cluster

    self.pools: MutableMapping[str, List[str]] = {
        scheduler: get_pool_name_list(cluster, scheduler)
        for scheduler in {'mesos', 'kubernetes'}
    }

    for scheduler, pool_names in self.pools.items():
        for pool in pool_names:
            # Register a watcher for the pool's config file, then load it.
            watcher = {f'{pool}.{scheduler}': get_pool_config_path(cluster, pool, scheduler)}
            self.config.watchers.append(watcher)
            load_cluster_pool_config(cluster, pool, scheduler, None)

    self.region = staticconf.read_string('aws.region')
    self.run_interval = staticconf.read_int('batches.cluster_metrics.run_interval_seconds')
    self.logger = logger
    self.metrics_client = ClustermanMetricsBotoClient(region_name=self.region)
def __init__(
    self,
    region_name: str,
    app_identifier: Optional[str] = None,
    ttl_days: Optional[int] = None,
) -> None:
    """
    :param region_name: name of AWS region to use instead of the default.
    :param app_identifier: prefix for all application metric names. Required from client applications to
        avoid name collisions.
    :param ttl_days: number of days after which data written by this client should expire. Use -1 if data
        should never expire, and leave as None to use the default value.
    """
    self.region_name = region_name

    # Only None selects the configured default. A plain truthiness test
    # would also discard an explicit ttl_days=0.
    if ttl_days is None:
        ttl_days = staticconf.read_int('dynamodb.ttl_days', namespace=CONFIG_NAMESPACE)

    self.ddb = get_metrics_session().resource(
        'dynamodb',
        region_name=self.region_name,
    )
    self.app_identifier = app_identifier

    if ttl_days == -1:
        # Never expire
        self.ttl_seconds = None
    else:
        self.ttl_seconds = int(timedelta(days=ttl_days).total_seconds())

    self._cache: MutableMapping[str, CacheEntry] = defaultdict(CacheEntry)
def mock_autoscaler():
    """Build an Autoscaler for pool 'bar' on 'mesos' with its collaborators patched out."""
    autoscaling_settings = {
        'default_signal_role': 'clusterman',
        'setpoint': 0.7,
        'target_capacity_margin': 0.1,
    }
    autoscaler_module = 'clusterman.autoscaler.autoscaler'

    with mock.patch(
        'clusterman.autoscaler.autoscaler.ClustermanMetricsBotoClient',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.PoolManager',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.Autoscaler._get_signal_for_app',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.get_monitoring_client',
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.ExternalSignal',
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.PendingPodsSignal',
    ), staticconf.testing.PatchConfiguration(
        {'autoscaling': autoscaling_settings},
    ):
        autoscaler = Autoscaler('mesos-test', 'bar', 'mesos', ['bar'], monitoring_enabled=False)

        # Pool manager state comes from the pool's own config namespace.
        pool_namespace = POOL_NAMESPACE.format(pool='bar', scheduler='mesos')
        pool_manager = autoscaler.pool_manager
        pool_manager.cluster_connector = mock.Mock()
        pool_manager.target_capacity = 300
        pool_manager.min_capacity = staticconf.read_int(
            'scaling_limits.min_capacity', namespace=pool_namespace,
        )
        pool_manager.max_capacity = staticconf.read_int(
            'scaling_limits.max_capacity', namespace=pool_namespace,
        )
        pool_manager.non_orphan_fulfilled_capacity = 0

        # Replace every gauge with a spec'd mock.
        autoscaler.target_capacity_gauge = mock.Mock(spec=GaugeProtocol)
        autoscaler.non_orphan_capacity_gauge = mock.Mock(spec=GaugeProtocol)
        autoscaler.resource_request_gauges = {
            'mem': mock.Mock(spec=GaugeProtocol),
            'cpus': mock.Mock(spec=GaugeProtocol),
            'disk': mock.Mock(spec=GaugeProtocol),
            'gpus': mock.Mock(spec=GaugeProtocol),
        }

    return autoscaler
def test_load_cluster_pool_config(cluster, pool, pool_other_config, mock_config_files):
    """Loading a pool config makes its values readable from the pool's namespace."""
    config.load_cluster_pool_config(cluster, pool, 'mesos', None)

    pool_namespace = POOL_NAMESPACE.format(pool=pool, scheduler='mesos')
    assert staticconf.read_int('other_config', namespace=pool_namespace) == pool_other_config
    # Plain literal: the key has no placeholders, so no f-string is needed.
    assert staticconf.read_string('resource_groups', namespace=pool_namespace) == cluster
def dates_from_rs_status(status_helper, db, logstream, retry_on_err, single_date=None):
    """
    dates_from_rs_status gets the jobs that have completed the et step, but
    have not started the load step, and have no jobs before them running or
    in error

    Args:
    status_helper -- a wrapper around a backing store to aid in CRUD
    db -- is the database we query
    logstream -- a PipelineStreamLogger
    retry_on_err -- a boolean, True if we're retrying on errors
    single_date -- date string of the form YYYY-MM-DD if we're
        only looking for one

    Returns:
    a list of dates to catch up on formatted as strings YYYY/MM/DD
    """
    versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())

    if single_date is None:
        # Look back over the configured window plus one extra day.
        days_back = read_int('pipeline.load_step.days_to_check') + 1
        start_datetime = datetime.utcnow() - timedelta(days=days_back)
        status_tuples = status_helper.query_et_complete_jobs(
            db, versions, start_datetime)
        queried_date = start_datetime
    else:
        data_date = get_formatted_date(single_date)
        if data_date is None:
            handle_error("bad input date: {0}".format(single_date), logstream)
        start_datetime = datetime.strptime(data_date, "%Y/%m/%d")
        status_tuples = status_helper.query_et_complete_job(
            db, versions, data_date)
        queried_date = data_date

    if status_tuples is False:
        failure_msg = "query for complete et job failed, version={0}, date={1}".format(
            versions, queried_date)
        handle_error(failure_msg, logstream)

    candidates = []
    last_date = (start_datetime - timedelta(days=1)).strftime("%Y/%m/%d")
    for ddate, ld_status in status_tuples:
        # Stop as soon as one_day_greater rejects the step from the
        # previously seen date.
        if not one_day_greater(ddate, last_date):
            break
        # An errored load ends the scan unless errors are being retried.
        if ld_status == 'error' and not retry_on_err:
            break
        if ld_status is None or ld_status == 'error':
            candidates.append(ddate)
        last_date = ddate

    logstream.write_msg(
        status='running',
        extra_msg="candidates dates for load: {0}".format(candidates))
    return candidates
def dates_from_rs_status(status_helper, db, logstream, retry_on_err, single_date=None):
    """
    Collect the dates whose et step is complete and whose load step still
    needs to run, stopping at the first date that cannot be loaded.

    Args:
    status_helper -- a wrapper around a backing store to aid in CRUD
    db -- is the database we query
    logstream -- a PipelineStreamLogger
    retry_on_err -- a boolean, True if we're retrying on errors
    single_date -- date string of the form YYYY-MM-DD if we're
        only looking for one

    Returns:
    a list of dates to catch up on formatted as strings YYYY/MM/DD
    """
    table_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    one_date_only = single_date is not None

    if one_date_only:
        data_date = get_formatted_date(single_date)
        if data_date is None:
            handle_error("bad input date: {0}".format(single_date), logstream)
        start_datetime = datetime.strptime(data_date, "%Y/%m/%d")
        status_tuples = status_helper.query_et_complete_job(db, table_versions, data_date)
    else:
        lookback_days = read_int('pipeline.load_step.days_to_check') + 1
        start_datetime = datetime.utcnow() - timedelta(days=lookback_days)
        status_tuples = status_helper.query_et_complete_jobs(db, table_versions, start_datetime)

    # The helper signals a failed query by returning False.
    if status_tuples is False:
        reported_date = data_date if one_date_only else start_datetime
        handle_error(
            "query for complete et job failed, version={0}, date={1}".format(
                table_versions, reported_date
            ),
            logstream
        )

    day_before_start = start_datetime - timedelta(days=1)
    previous_date = day_before_start.strftime("%Y/%m/%d")
    candidates = []
    for ddate, ld_status in status_tuples:
        if not one_day_greater(ddate, previous_date):
            break
        is_error = ld_status == 'error'
        if ld_status is None or (is_error and retry_on_err):
            candidates.append(ddate)
        elif is_error:
            # Errors are not being retried, so nothing later is eligible.
            break
        previous_date = ddate

    candidate_string = "candidates dates for load: {0}".format(candidates)
    logstream.write_msg(status='running', extra_msg=candidate_string)
    return candidates
def fetch_creds():
    '''
    Return a dictionary holding temporary credentials from the metadata server.
    This function will block up to the timeout specified in config file.

    You may not call this method unless config.yaml is loaded
    '''
    url = '{url_root}/{name}'.format(
        url_root=staticconf.read_string('instance_profile_creds_url'),
        name=staticconf.read_string('instance_profile_name'))
    timeout = staticconf.read_int(
        'instance_profile_creds_timeout_in_seconds', default=4)

    in_stream = urllib2.urlopen(url, timeout=timeout)
    try:
        return simplejson.load(in_stream)
    finally:
        # Always release the HTTP connection, even if the body is not valid JSON.
        in_stream.close()
def fetch_creds():
    '''
    Return a dictionary holding temporary credentials from the metadata server.
    This function will block up to the timeout specified in config file.

    You may not call this method unless config.yaml is loaded
    '''
    url = '{url_root}/{name}'.format(
        url_root=staticconf.read_string('instance_profile_creds_url'),
        name=staticconf.read_string('instance_profile_name')
    )
    in_stream = urllib2.urlopen(
        url,
        timeout=staticconf.read_int(
            'instance_profile_creds_timeout_in_seconds', default=4
        )
    )
    # The response is closed once parsed so the connection is not left open.
    try:
        creds = simplejson.load(in_stream)
    finally:
        in_stream.close()
    return creds
def __init__(self, logstrm, psql_auth_file, run_local=False):
    """Read connection settings and credentials, then set up AWS auth.

    logstrm -- stored as self.log_stream
    psql_auth_file -- YAML file providing 'redshift_user' and
        'redshift_password'
    run_local -- stored as self.run_local
    """
    self.run_local = run_local
    self.log_stream = logstrm

    # Connection endpoint comes from the already-loaded configuration.
    self.host = staticconf.read_string('redshift_host')
    self.port = staticconf.read_int('redshift_port')

    # Credentials live in a separate file.
    credentials = YamlConfiguration(psql_auth_file)
    self.user = credentials['redshift_user']
    self.password = credentials['redshift_password']

    # AWS auth starts out empty and is filled in by _set_aws_auth().
    self._aws_key = self._aws_secret = self._aws_token = ''
    self._aws_token_expiry = datetime.utcnow()
    self._whitelist = ['select', 'create', 'insert', 'update']
    self._set_aws_auth()

    psycopg2.extensions.set_wait_callback(wait_select_inter)
def process_drain_queue(
    self,
    mesos_operator_client: Optional[Callable[..., Callable[[str], Callable[..., None]]]],
    kube_operator_client: Optional[KubernetesClusterConnector],
) -> None:
    """Take one host off the drain queue and hand it on for termination.

    Mesos hosts are drained first and submitted with the default delay
    even when draining fails. All other hosts are submitted with no delay.
    A host that is already in the draining cache is skipped. In every case
    where a host was received, its drain message is deleted.

    :param mesos_operator_client: passed to ``drain`` for Mesos hosts
    :param kube_operator_client: not used by this method
    """
    host = self.get_host_to_drain()
    if not host:
        return

    if host.instance_id in self.draining_host_ttl_cache:
        logger.warning(f'Host: {host.hostname} already being processed, skipping...')
        self.delete_drain_messages([host])
        return

    self.draining_host_ttl_cache[host.instance_id] = arrow.now().shift(seconds=DRAIN_CACHE_SECONDS)

    if host.scheduler == 'mesos':
        logger.info(f'Mesos host to drain and submit for termination: {host}')
        try:
            # drain is given the start time and the maintenance timeout,
            # both multiplied by 1000000000.
            drain(
                mesos_operator_client,
                [f'{host.hostname}|{host.ip}'],
                arrow.now().timestamp * 1000000000,
                staticconf.read_int('mesos_maintenance_timeout_seconds', default=600) * 1000000000,
            )
        except Exception as err:
            logger.error(f'Failed to drain {host.hostname} continuing to terminate anyway: {err}')
        finally:
            self.submit_host_for_termination(host)
    elif host.scheduler == 'kubernetes':
        logger.info(f'Kubernetes host to drain and submit for termination: {host}')
        self.submit_host_for_termination(host, delay=0)
    else:
        logger.info(f'Host to submit for termination immediately: {host}')
        self.submit_host_for_termination(host, delay=0)

    self.delete_drain_messages([host])
def process_drain_queue(
    self,
    mesos_operator_client: Callable[..., Callable[[str], Callable[..., None]]],
) -> None:
    """Take one host off the drain queue and hand it on for termination.

    Hosts without a hostname are submitted with no delay. Hosts with one
    are drained first and then submitted even when draining fails. A host
    that is already in the draining cache is skipped. In every case where a
    host was received, its drain message is deleted.

    :param mesos_operator_client: passed to ``drain``
    """
    host = self.get_host_to_drain()
    if not host:
        return

    if host.instance_id in self.draining_host_ttl_cache:
        logger.warning(f'Host: {host.hostname} already being processed, skipping...')
        self.delete_drain_messages([host])
        return

    self.draining_host_ttl_cache[host.instance_id] = arrow.now().shift(seconds=DRAIN_CACHE_SECONDS)

    if host.hostname:
        logger.info(f'Host to drain and submit for termination: {host}')
        try:
            drain(
                mesos_operator_client,
                [f'{host.hostname}|{host.ip}'],
                arrow.now().timestamp * 1000000000,
                staticconf.read_int('mesos_maintenance_timeout_seconds', default=600) * 1000000000,
            )
        except Exception as err:
            logger.error(f'Failed to drain {host.hostname} continuing to terminate anyway: {err}')
        finally:
            self.submit_host_for_termination(host)
    else:
        # if hosts do not have hostname it means they are likely not in mesos and don't need draining
        # so instead we send them to terminate straight away
        logger.info(f'Host to submit for termination immediately: {host}')
        self.submit_host_for_termination(host, delay=0)

    self.delete_drain_messages([host])
def copy_table(psql_helper, db_name, ddate, log_tuple, ttl_days, logstream):
    """Expire old rows if a TTL is set, then COPY one S3 log into its table.

    psql_helper -- object whose run_sql executes the COPY statement
    db_name -- database to operate on
    ddate -- not used by this function
    log_tuple -- (s3 log location, redshift table name)
    ttl_days -- days of data to keep, or None to skip expiry and compaction
    logstream -- logger receiving the progress messages

    Returns whatever run_sql returned for the COPY.
    """
    s3_log, rs_table = log_tuple
    namespaced_table_name = get_namespaced_tablename(rs_table)
    table_start = time.time()
    extra_msg = "from s3 log: {0}".format(s3_log)
    logstream.write_msg('starting', extra_msg=extra_msg)

    if ttl_days is not None:
        # about to load new day, remove oldest
        rows_deleted = delete_old_data(psql_helper, db_name, rs_table, ttl_days - 1)
        if rows_deleted:
            logstream.write_msg('delete_ok',
                                extra_msg="{0} rows".format(rows_deleted))

        # Try to reclaim disk space. If not needed, it will be fast.
        # This deliberately runs whether or not rows were deleted: if rows
        # were deleted but the compact failed, a retry would find nothing
        # to delete, and without reclaiming the space there may not be
        # enough room for a new load, resulting in failure forever.
        compact_table(psql_helper, db_name, namespaced_table_name)

    raw_delimiter = read_string('redshift_column_delimiter')
    delimiter = raw_delimiter.decode("string_escape")
    if delimiter not in string.printable:
        # Non-printable delimiters are written as a backslash plus octal code.
        delimiter = '\\' + oct(ord(delimiter))

    copy_sql = LOAD % (namespaced_table_name, s3_log, delimiter)
    result = psql_helper.run_sql(
        copy_sql,
        db_name,
        " copying from " + s3_log,
        s3_needed=True,
        time_est_secs=read_int('pipeline.load_step.copy_time_est_secs'))

    if result is not False:
        logstream.write_msg('complete', job_start_secs=table_start,
                            extra_msg=extra_msg)
    return result
def _get_agent_metadata(self, instance_ip: str) -> AgentMetadata:
    """Build agent metadata for the simulated instance that has ``instance_ip``.

    An instance whose join time is still in the future is reported as
    ORPHANED, otherwise as IDLE. An IP that matches no instance yields a
    bare ORPHANED record.
    """
    for cluster in self.simulator.aws_clusters:
        for instance in cluster.instances.values():
            if instance_ip != instance.ip_address:
                continue

            agent_id = str(uuid.uuid4())
            if self.simulator.current_time < instance.join_time:
                state = AgentState.ORPHANED
            else:
                state = AgentState.IDLE

            resources = instance.resources
            # Fall back to the configured EBS volume size (default 0) when
            # the instance reports no disk.
            disk = resources.disk or staticconf.read_int('ebs_volume_size', 0)
            return AgentMetadata(
                agent_id=agent_id,
                state=state,
                total_resources=ClustermanResources(
                    cpus=resources.cpus,
                    mem=resources.mem * 1000,
                    disk=disk * 1000,
                    gpus=resources.gpus,
                ),
            )

    # if we don't know the given IP then it's orphaned
    return AgentMetadata(state=AgentState.ORPHANED)
def copy_table(psql_helper, db_name, ddate, log_tuple, ttl_days, logstream):
    """Load one S3 log into its redshift table, expiring old data first.

    psql_helper -- object whose run_sql executes the COPY statement
    db_name -- database to operate on
    ddate -- not used by this function
    log_tuple -- (s3 log location, redshift table name)
    ttl_days -- days of data to keep, or None to skip expiry and compaction
    logstream -- logger receiving the progress messages

    Returns whatever run_sql returned for the COPY.
    """
    s3_log, rs_table = log_tuple
    target_table = get_namespaced_tablename(rs_table)
    started_at = time.time()
    source_msg = "from s3 log: {0}".format(s3_log)
    logstream.write_msg('starting', extra_msg=source_msg)

    expire_old_data = ttl_days is not None

    # about to load new day, remove oldest
    rows_deleted = None
    if expire_old_data:
        rows_deleted = delete_old_data(
            psql_helper, db_name, rs_table, ttl_days - 1
        )
    if rows_deleted:
        logstream.write_msg(
            'delete_ok', extra_msg="{0} rows".format(rows_deleted)
        )

    # Try to reclaim disk space. If not needed, it will be fast.
    # Compaction is not tied to rows having been deleted in this call: a
    # previous attempt may have deleted rows and then failed to compact,
    # leaving too little space for a new load on every retry.
    if expire_old_data:
        compact_table(psql_helper, db_name, target_table)

    delimiter = read_string('redshift_column_delimiter').decode("string_escape")
    if delimiter not in string.printable:
        delimiter = '\\' + oct(ord(delimiter))

    copy_sql = LOAD % (target_table, s3_log, delimiter)
    description = " copying from " + s3_log
    time_estimate = read_int('pipeline.load_step.copy_time_est_secs')
    result = psql_helper.run_sql(
        copy_sql,
        db_name,
        description,
        s3_needed=True,
        time_est_secs=time_estimate
    )

    # run_sql reports failure with False; anything else counts as success.
    if result is not False:
        logstream.write_msg(
            'complete', job_start_secs=started_at, extra_msg=source_msg
        )
    return result
def _get_timeout(self):
    """Return the integer configured under 'scanner.et_timeout'."""
    et_timeout = read_int("scanner.et_timeout")
    return et_timeout
from pyfiglet import Figlet


def _get_memos_dir():
    """Return the configured memos directory as a Path, or None if it is not a directory.

    The ``dir_memos`` setting is tried as given first, then relative to
    ``PATH_CWD``.
    """
    configured_dir = staticconf.read_string('dir_memos', default='')

    # NOTE(review): Path('') is the current directory, so an unset
    # ``dir_memos`` resolves to '.' here rather than None -- confirm that
    # is the intended default.
    as_given = Path(configured_dir)
    if as_given.exists() and as_given.is_dir():
        return as_given

    # Fall back to interpreting the setting relative to the working directory.
    under_cwd = PATH_CWD / configured_dir
    if under_cwd.exists() and under_cwd.is_dir():
        return under_cwd

    return None


# global utilities
FIGLET = Figlet(font='big')

# Well-known paths, all anchored at the current working directory.
PATH_CWD = Path.cwd()
PATH_CONFIG = PATH_CWD / 'config.yaml'
PATH_RECENT_OPEN = PATH_CWD / 'recent_open.yaml'

# The configuration must be loaded before any setting below is read.
staticconf.YamlConfiguration(PATH_CONFIG.as_posix())
PATH_MEMOS = _get_memos_dir()
EDITOR = staticconf.read_string(
    'default_editor',
    default=os.getenv('EDITOR', 'vi'),
)
SEARCH_FUZZY = staticconf.read_string(
    'default_search_fuzzy',
    default='True',
).lower() == 'true'
RECENT_OPEN_LIST_SIZE = staticconf.read_int(
    'recent_open_list_size',
    default=10,
)
def __load_data_from_s3(
    status_helper, prefixes, date_with_slashes,
    mrjob_path, local, db_name, logstream, force_et=False
):
    """
    load_data_from_s3 iterates over prefixes and loads data for a particular
    date for the first prefix where the data exists.  It also checks whether
    data has already been loaded for a date and if so, skips the load

    Args:
    status_helper -- An object handle to interact with status table
    prefixes -- a list of s3 prefixes for input data
    date_with_slashes -- a date string of the form 'YYYY/MM/DD'
    mrjob_path -- module.entry_point of the job to extract and
        transform the data
    local -- True if we're running locally (i.e., devc) False for aws instance
    db_name -- passed through to every status_helper call
    logstream -- a PipelineStreamLogger
    force_et -- passed through to get_next_dir_to_load

    Returns:
    None

    Raises:
    Exception -- if no input prefix is available or the mrjob fails
    """
    start_time = time.time()

    table_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    conditions = {
        'table_versions': table_versions,
        'data_date': date_with_slashes
    }
    if status_helper.et_started(conditions, db_name):
        logstream.write_msg(
            "complete",
            extra_msg="skipping: et_step already started"
        )
        return

    prefix_for_this_data = get_next_dir_to_load(
        prefixes, date_with_slashes, local, logstream, force_et
    )
    if not prefix_for_this_data:
        # Nothing ran, so the failure is recorded with a job time of 0.
        jobtime = 0
        err_msg = "no prefix available date={0}, prefixes={1}".format(
            date_with_slashes, prefixes
        )
        logstream.write_msg("error", error_msg=err_msg)
        status_helper.log_status_result(
            conditions,
            jobtime,
            db_name,
            failed=True, err_msg=err_msg
        )
        raise Exception(err_msg)

    # check if mrjob is already done
    data_we_check = "{0} {1} {2}".format(
        get_s3_output_user_prefix(),
        date_with_slashes,
        local
    )
    logstream.write_msg("running", extra_msg=data_we_check)
    if data_available(
        get_s3_output_user_prefix(),
        date_with_slashes,
        local,
        done_file_name='_SUCCESS'
    ):
        logstream.write_msg(
            "complete",
            extra_msg="skipping: et_step already done"
        )
        return

    mrjob_args = create_emr_args(
        date_with_slashes,
        read_int('pipeline.et_step.cores'),
        prefix_for_this_data, local
    )
    status_helper.insert_et(conditions, db_name)
    logstream.write_msg("running", extra_msg=mrjob_args)

    result, err_reason = __run_mr_job(mrjob_path, mrjob_args, logstream)
    failed = not result

    # The recorded job time covers the whole call, not just the mrjob.
    jobtime = time.time() - start_time
    status_helper.log_status_result(
        conditions, jobtime, db_name,
        failed=failed, err_msg=err_reason
    )
    if failed:
        raise Exception(err_reason)
    return
"""
Everything related to creating a WSGI application to actually serve the
application.

Also exposes a __main__ function to actually serve the application using a
simple default server.
"""
import staticconf
from pyramid.config import Configurator
from wsgiref.simple_server import make_server

from pyramid_api_example.config import load_configuration
from pyramid_api_example.routes import register_routes


def create_application():
    """Return a WSGI app with the routes registered and the views package scanned."""
    configurator = Configurator()
    register_routes(configurator)
    configurator.scan('pyramid_api_example.views')
    wsgi_app = configurator.make_wsgi_app()
    return wsgi_app


if __name__ == '__main__':
    # Configuration must be loaded before the bind address can be read.
    load_configuration()
    host = staticconf.read_string('application_bind_host')
    port = staticconf.read_int('application_bind_port')

    app = create_application()

    print('Starting up server on http://{0}:{1}'.format(host, port))
    server = make_server(host, port, app)
    server.serve_forever()
def __load_data_from_s3(status_helper, prefixes, date_with_slashes,
                        mrjob_path, local, db_name, logstream, force_et=False):
    """
    load_data_from_s3 iterates over prefixes and loads data for a particular
    date for the first prefix where the data exists.  It also checks whether
    data has already been loaded for a date and if so, skips the load

    Args:
    status_helper -- An object handle to interact with status table
    prefixes -- a list of s3 prefixes for input data
    date_with_slashes -- a date string of the form 'YYYY/MM/DD'
    mrjob_path -- module.entry_point of the job to extract and
        transform the data
    local -- True if we're running locally (i.e., devc) False for aws instance
    db_name -- passed through to every status_helper call
    logstream -- a PipelineStreamLogger
    force_et -- passed through to get_next_dir_to_load

    Returns:
    None

    Raises:
    Exception -- if no input prefix is available or the mrjob fails
    """
    start_time = time.time()

    table_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    conditions = {
        'table_versions': table_versions,
        'data_date': date_with_slashes
    }
    if status_helper.et_started(conditions, db_name):
        logstream.write_msg("complete", extra_msg="skipping: et_step already started")
        return

    prefix_for_this_data = get_next_dir_to_load(prefixes, date_with_slashes,
                                                local, logstream, force_et)
    if not prefix_for_this_data:
        # Nothing ran, so the failure is recorded with a job time of 0.
        jobtime = 0
        err_msg = "no prefix available date={0}, prefixes={1}".format(
            date_with_slashes, prefixes)
        logstream.write_msg("error", error_msg=err_msg)
        status_helper.log_status_result(conditions, jobtime, db_name,
                                        failed=True, err_msg=err_msg)
        raise Exception(err_msg)

    # check if mrjob is already done
    data_we_check = "{0} {1} {2}".format(get_s3_output_user_prefix(),
                                         date_with_slashes, local)
    logstream.write_msg("running", extra_msg=data_we_check)
    if data_available(get_s3_output_user_prefix(), date_with_slashes, local,
                      done_file_name='_SUCCESS'):
        logstream.write_msg("complete", extra_msg="skipping: et_step already done")
        return

    mrjob_args = create_emr_args(date_with_slashes,
                                 read_int('pipeline.et_step.cores'),
                                 prefix_for_this_data, local)
    status_helper.insert_et(conditions, db_name)
    logstream.write_msg("running", extra_msg=mrjob_args)

    result, err_reason = __run_mr_job(mrjob_path, mrjob_args, logstream)
    failed = not result

    # The recorded job time covers the whole call, not just the mrjob.
    jobtime = time.time() - start_time
    status_helper.log_status_result(conditions, jobtime, db_name,
                                    failed=failed, err_msg=err_reason)
    if failed:
        raise Exception(err_reason)
    return