def _build_raw_model_flow(self, schema, dag):
    """
    Populate the given DAG with the raw model flow of a single schema.

    The tasks are chained as:
    feature aggregation buckets short circuit >> feature aggregation buckets >>
    raw model short circuit >> raw model

    :param schema: The schema the flow is built for (also the suffix of the short circuit task ids)
    :param dag: The DAG to populate
    """

    def _should_build_buckets(**kwargs):
        # Gate the buckets task on the configured feature aggregation buckets interval.
        return is_execution_date_valid(
            kwargs['execution_date'],
            FeatureAggregationBucketsOperatorBuilder.get_feature_aggregation_buckets_interval(
                PresidioDagBuilder.conf_reader),
            get_schedule_interval(dag))

    def _should_build_raw_model(**kwargs):
        interval_ok = is_execution_date_valid(
            kwargs['execution_date'],
            RawModelOperatorBuilder.get_build_raw_model_interval(PresidioDagBuilder.conf_reader),
            get_schedule_interval(dag))
        gap_ok = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
            dag,
            RawModelOperatorBuilder.get_min_gap_from_dag_start_date_to_start_raw_modeling(
                PresidioDagBuilder.conf_reader),
            kwargs['execution_date'],
            get_schedule_interval(dag))
        # "&" (not "and") is kept, so both checks are always evaluated.
        return interval_ok & gap_ok

    buckets_gate = self._create_infinite_retry_short_circuit_operator(
        task_id=('feature_aggregation_buckets_short_circuit_{0}'.format(schema)),
        dag=dag,
        python_callable=_should_build_buckets)
    buckets_task = FeatureAggregationBucketsOperatorBuilder(schema).build(dag)

    raw_model_gate = self._create_infinite_retry_short_circuit_operator(
        task_id='raw_model_short_circuit_{0}'.format(schema),
        dag=dag,
        python_callable=_should_build_raw_model)
    raw_model_task = RawModelOperatorBuilder(schema).build(dag)

    buckets_gate >> buckets_task >> raw_model_gate >> raw_model_task
def _build_aggr_model_flow(self, schema, dag):
    """
    Populate the given DAG with the aggregation model flow of a single schema.

    The tasks are chained as:
    aggr accumulate short circuit >> accumulate aggregations >>
    aggr model short circuit >> aggr model

    :param schema: The schema the flow is built for (also the suffix of the short circuit task ids)
    :param dag: The DAG to populate
    """

    def _should_accumulate(**kwargs):
        # Gate the accumulation task on the configured accumulate interval.
        return is_execution_date_valid(
            kwargs['execution_date'],
            AccumulateAggregationsOperatorBuilder.get_accumulate_interval(PresidioDagBuilder.conf_reader),
            get_schedule_interval(dag))

    def _should_build_aggr_model(**kwargs):
        interval_ok = is_execution_date_valid(
            kwargs['execution_date'],
            AggrModelOperatorBuilder.get_aggr_model_interval(PresidioDagBuilder.conf_reader),
            get_schedule_interval(dag))
        gap_ok = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
            dag,
            AggrModelOperatorBuilder.get_min_gap_from_dag_start_date_to_start_aggr_modeling(
                PresidioDagBuilder.conf_reader),
            kwargs['execution_date'],
            get_schedule_interval(dag))
        # "&" (not "and") is kept, so both checks are always evaluated.
        return interval_ok & gap_ok

    accumulate_task = AccumulateAggregationsOperatorBuilder(schema, FIX_DURATION_STRATEGY_HOURLY).build(dag)

    accumulate_gate = self._create_infinite_retry_short_circuit_operator(
        task_id='aggr_accumulate_short_circuit_{0}'.format(schema),
        dag=dag,
        python_callable=_should_accumulate)

    aggr_model_gate = self._create_infinite_retry_short_circuit_operator(
        task_id='aggr_model_short_circuit_{0}'.format(schema),
        dag=dag,
        python_callable=_should_build_aggr_model)
    aggr_model_task = AggrModelOperatorBuilder(schema).build(dag)

    accumulate_gate >> accumulate_task >> aggr_model_gate >> aggr_model_task
def _build_output_operator(self, smart_record_conf_name, entity_type, dag, smart_operator):
    """
    Populate the given DAG with the hourly output task, the entity score task and their short circuits.

    :param smart_record_conf_name: Passed to the output operator and to the entity score operator builder
    :param entity_type: Passed to the output operator, the entity score operator builder and the forwarder
    :param dag: The DAG to populate
    :param smart_operator: The upstream task; the output short circuit is set as its downstream
    :return: The entity score operator
    """

    def _hourly_output_may_run(**kwargs):
        hourly_ok = is_execution_date_valid(
            kwargs['execution_date'],
            FIX_DURATION_STRATEGY_HOURLY,
            dag.schedule_interval)
        gap_ok = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
            dag,
            self._min_gap_from_dag_start_date_to_start_scoring,
            kwargs['execution_date'],
            dag.schedule_interval)
        # "&" (not "and") is kept, so both checks are always evaluated.
        return hourly_ok & gap_ok

    def _daily_entity_score_may_run(**kwargs):
        daily_ok = is_execution_date_valid(
            kwargs['execution_date'],
            FIX_DURATION_STRATEGY_DAILY,
            dag.schedule_interval)
        gap_ok = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
            dag,
            EntityScoreOperatorBuilder.get_min_gap_from_dag_start_date_to_start_modeling(
                PresidioDagBuilder.conf_reader),
            kwargs['execution_date'],
            dag.schedule_interval)
        return daily_ok & gap_ok

    self.log.debug("populating the %s dag with output tasks", dag.dag_id)

    # Hourly output processing
    sensor_service = TaskSensorService()

    # The short circuit lets the output run only on (at least) hourly boundaries, and only once the
    # configured gap from the dag start date has passed.
    output_gate = self._create_infinite_retry_short_circuit_operator(
        task_id='output_short_circuit',
        dag=dag,
        python_callable=_hourly_output_may_run)

    hourly_output_operator = OutputOperator(
        fixed_duration_strategy=timedelta(hours=1),
        command=PresidioDagBuilder.presidio_command,
        smart_record_conf_name=smart_record_conf_name,
        entity_type=entity_type,
        dag=dag,
    )
    sensor_service.add_task_sequential_sensor(hourly_output_operator)
    sensor_service.add_task_short_circuit(hourly_output_operator, output_gate)

    # Entity score
    entity_score_operator = EntityScoreOperatorBuilder(smart_record_conf_name, entity_type).build(dag)

    # The daily short circuit connects the output processing to the entity score recalculation.
    daily_gate = self._create_infinite_retry_short_circuit_operator(
        task_id='output_daily_short_circuit',
        dag=dag,
        python_callable=_daily_entity_score_may_run)

    daily_gate >> entity_score_operator
    self._push_forwarding(hourly_output_operator, daily_gate, dag, entity_type)
    smart_operator >> output_gate

    return entity_score_operator
def _push_forwarding(self, hourly_output_operator, daily_short_circuit_operator, dag, entity_type):
    """
    Connect the hourly output task to the daily short circuit, optionally through the output forwarder.

    When the dag default argument "enable_output_forwarder" equals the string 'true', the chain is:
    hourly output >> output forward short circuit >> output forwarder >> daily short circuit.
    Otherwise the hourly output is connected directly to the daily short circuit.

    :param hourly_output_operator: The hourly output task
    :param daily_short_circuit_operator: The daily short circuit task that ends the chain
    :param dag: The DAG to populate; its default_args are read for the "enable_output_forwarder" flag
    :param entity_type: Passed to the output forwarder operator
    """

    def _forwarding_may_run(**kwargs):
        hourly_ok = is_execution_date_valid(
            kwargs['execution_date'],
            FIX_DURATION_STRATEGY_HOURLY,
            dag.schedule_interval)
        gap_ok = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
            dag,
            EntityScoreOperatorBuilder.get_min_gap_from_dag_start_date_to_start_modeling(
                PresidioDagBuilder.conf_reader),
            kwargs['execution_date'],
            dag.schedule_interval)
        # "&" (not "and") is kept, so both checks are always evaluated.
        return hourly_ok & gap_ok

    self.log.debug("creating the forwarder task")
    forwarder_flag = dag.default_args.get("enable_output_forwarder")
    self.log.debug("enable_output_forwarder=%s ", forwarder_flag)

    # The flag is compared against the literal string 'true'; any other value disables forwarding.
    if forwarder_flag != 'true':
        hourly_output_operator >> daily_short_circuit_operator
        return

    forwarder = OutputForwarderOperator(
        command=PresidioDagBuilder.presidio_command,
        entity_type=entity_type,
        run_clean_command_before_retry=False,
        dag=dag)
    forwarder_gate = self._create_infinite_retry_short_circuit_operator(
        task_id='output_forward_short_circuit',
        dag=dag,
        python_callable=_forwarding_may_run)

    hourly_output_operator >> forwarder_gate >> forwarder >> daily_short_circuit_operator
def add_java_args(context):
    """
    Build the java arguments string (start_date and end_date) for the execution date found in the context.

    The start date is the execution date floored to the fixed duration strategy. The end date is the
    execution date plus the schedule interval, floored the same way. Both are converted to UTC.

    If the execution date is not valid for the fixed duration strategy, a message is logged and the
    arguments are still built (nothing is raised or skipped here).

    :param context: The task context. context['params']['retry_extra_params'] must hold the keys
                    'fixed_duration_strategy' and 'schedule_interval'
    :return: The arguments joined by a single space, each formatted as
             SpringBootJarOperator.java_args_prefix + '<key> <value>'
    :rtype: str
    """
    retry_extra_params = context['params']['retry_extra_params']
    fixed_duration_strategy = retry_extra_params['fixed_duration_strategy']
    interval = retry_extra_params['schedule_interval']

    execution_date = ContextWrapper(context).get_execution_date()
    if not is_execution_date_valid(execution_date, fixed_duration_strategy, interval):
        logging.info(
            'The execution date {} is not the last interval of fixed duration {}.'
            .format(execution_date, fixed_duration_strategy))

    start_date = floor_time(execution_date, time_delta=fixed_duration_strategy)
    end_date = floor_time(execution_date + interval, time_delta=fixed_duration_strategy)
    java_args = {
        'start_date': convert_to_utc(start_date),
        'end_date': convert_to_utc(end_date),
    }

    # dict.items() exists on both Python 2 and Python 3; dict.iteritems() was removed in Python 3.
    return ' '.join(
        SpringBootJarOperator.java_args_prefix + '%s %s' % (key, val)
        for key, val in java_args.items())
def execute(self, context):
    """
    Compute the start_date / end_date java args for the execution date and run the parent operator.

    The start date is the execution date floored to the fixed duration strategy. The end date is the
    execution date plus the interval, floored the same way. Both are converted to UTC before being
    handed to the parent's update_java_args.

    If the execution date is not valid for the fixed duration strategy, a message is logged and the
    execution carries on (nothing is raised or skipped here).

    :param context: The task context, from which the execution date is taken
    """
    execution_date = ContextWrapper(context).get_execution_date()
    duration = self.fixed_duration_strategy

    date_is_valid = is_execution_date_valid(execution_date, duration, self.interval)
    if not date_is_valid:
        # e.g: execution_date = datetime(2014, 11, 28, 13, 50, 0)
        #      interval = timedelta(minutes=5)
        #      fixed_duration = timedelta(days=1)
        self.log.info(
            'The execution date {} is not the last interval of fixed duration {}.'
            .format(execution_date, self.fixed_duration_strategy))

    window_start = floor_time(execution_date, time_delta=self.fixed_duration_strategy)
    window_end = floor_time(execution_date + self.interval, time_delta=self.fixed_duration_strategy)
    java_args = {
        'start_date': convert_to_utc(window_start),
        'end_date': convert_to_utc(window_end),
    }

    super(FixedDurationJarOperator, self).update_java_args(java_args)
    super(FixedDurationJarOperator, self).execute(context)
def build(self, dag):
    """
    Populate the given "Smart Model DAG" with the smart accumulating task followed by the smart model task.

    Each of the two tasks is preceded by a short circuit task, so the chain is:
    smart accumulate short circuit >> smart model accumulate >> smart model short circuit >> smart model

    Both short circuits check the execution date against a configured interval and against a
    configured minimal gap from the dag start date.

    :param dag: The smart_model DAG to populate
    :type dag: airflow.models.DAG
    :return: The smart model DAG, after it has been populated
    :rtype: airflow.models.DAG
    """

    def _should_accumulate(**kwargs):
        interval_ok = is_execution_date_valid(
            kwargs['execution_date'],
            SmartModelAccumulateOperatorBuilder.get_accumulate_interval(PresidioDagBuilder.conf_reader),
            get_schedule_interval(dag))
        gap_ok = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
            dag,
            SmartModelAccumulateOperatorBuilder.get_min_gap_from_dag_start_date_to_start_accumulating(
                PresidioDagBuilder.conf_reader),
            kwargs['execution_date'],
            get_schedule_interval(dag))
        # "&" (not "and") is kept, so both checks are always evaluated.
        return interval_ok & gap_ok

    def _should_build_model(**kwargs):
        interval_ok = is_execution_date_valid(
            kwargs['execution_date'],
            SmartModelOperatorBuilder.get_build_model_interval(PresidioDagBuilder.conf_reader),
            get_schedule_interval(dag))
        gap_ok = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
            dag,
            SmartModelOperatorBuilder.get_min_gap_from_dag_start_date_to_start_modeling(
                PresidioDagBuilder.conf_reader),
            kwargs['execution_date'],
            get_schedule_interval(dag))
        return interval_ok & gap_ok

    accumulate_gate = self._create_infinite_retry_short_circuit_operator(
        task_id='smart_accumulate_short_circuit',
        dag=dag,
        python_callable=_should_accumulate)
    accumulate_task = SmartModelAccumulateOperatorBuilder().build(dag)

    model_gate = self._create_infinite_retry_short_circuit_operator(
        task_id='smart_model_short_circuit',
        dag=dag,
        python_callable=_should_build_model)
    model_task = SmartModelOperatorBuilder().build(dag)

    accumulate_gate >> accumulate_task >> model_gate >> model_task

    return dag
def _build_model_trigger_operator(self, dag, schema):
    """
    Create the task that triggers the model DAG of the given schema.

    The trigger callable hands back the dag run object only when the execution date is valid for the
    daily fixed duration strategy; otherwise it hands back None. The schedule interval of the model DAG
    id is also set to the daily strategy.

    :param dag: The DAG the trigger task is added to
    :param schema: The schema whose model DAG is triggered (also the prefix of the trigger task id)
    :return: The trigger task
    """
    model_dag_id = ModelDagFactory.get_dag_id(schema)

    def _daily_dag_run_or_none(context, dag_run_obj):
        is_daily_boundary = is_execution_date_valid(
            context['execution_date'],
            FIX_DURATION_STRATEGY_DAILY,
            get_schedule_interval(dag))
        if is_daily_boundary:
            return dag_run_obj
        return None

    trigger_task_id = '{0}_{1}'.format(schema, 'model_trigger_dagrun')
    model_trigger = self._create_expanded_trigger_dag_run_operator(
        trigger_task_id,
        model_dag_id,
        dag,
        _daily_dag_run_or_none)

    set_schedule_interval(model_dag_id, FIX_DURATION_STRATEGY_DAILY)

    return model_trigger
def _build_input_pre_processing_trigger_operator(self, dag, schema):
    """
    Create the task that triggers the input pre processing DAG of the given schema.

    The trigger callable hands back the dag run object only when the execution date is valid for the
    daily fixed duration strategy; otherwise it hands back None.

    :param dag: The DAG the trigger task is added to
    :param schema: The schema whose input pre processing DAG is triggered (also the prefix of the task id)
    :return: The trigger task
    """
    target_dag_id = InputPreProcessingDagFactory.get_dag_id(schema)

    def _daily_dag_run_or_none(context, dag_run_obj):
        is_daily_boundary = is_execution_date_valid(
            context['execution_date'],
            FIX_DURATION_STRATEGY_DAILY,
            get_schedule_interval(dag))
        if is_daily_boundary:
            return dag_run_obj
        return None

    trigger_task_id = "{0}_input_pre_processing_trigger_dag_run".format(schema)
    return self._create_expanded_trigger_dag_run_operator(
        trigger_task_id,
        target_dag_id,
        dag,
        _daily_dag_run_or_none)
def _is_execution_date_valid(self, context):
    """
    Verify that the execution date in the context is valid for this operator's fixed duration strategy.

    Nothing is returned when the date is valid.

    :param context: The task context; its 'execution_date' entry is checked
    :raise InvalidExecutionDateError: If the execution date is not valid for the fixed duration
                                      strategy and interval of this operator
    """
    execution_date = context['execution_date']

    if is_execution_date_valid(execution_date, self.fixed_duration_strategy, self.interval):
        return

    # e.g: execution_date = datetime(2014, 11, 28, 13, 50, 0)
    #      interval = timedelta(minutes=5)
    #      fixed_duration = timedelta(days=1)
    message = 'The execution date {} is not the last interval of fixed duration {}.'.format(
        execution_date, self.fixed_duration_strategy)
    logging.error(message)
    raise InvalidExecutionDateError(execution_date, self.fixed_duration_strategy)
def _build_smart(self, root_dag_gap_sensor_operator, smart_dag, smart_record_conf_name):
    """
    Populate the smart DAG with the smart events task, its hourly short circuit and the smart model trigger.

    The given gap sensor is set upstream of the short circuit, and the smart model trigger is set
    downstream of the smart events task. The schedule interval of the smart model DAG id is set to the
    daily strategy.

    :param root_dag_gap_sensor_operator: The task placed upstream of the hourly short circuit
    :param smart_dag: The DAG to populate
    :param smart_record_conf_name: Passed to the smart events operator and used to resolve the smart
                                   model DAG id
    :return: The smart events operator
    """

    def _hourly_smart_may_run(**kwargs):
        hourly_ok = is_execution_date_valid(
            kwargs['execution_date'],
            FIX_DURATION_STRATEGY_HOURLY,
            smart_dag.schedule_interval)
        gap_ok = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
            smart_dag,
            self._min_gap_from_dag_start_date_to_start_scoring,
            kwargs['execution_date'],
            smart_dag.schedule_interval)
        # "&" (not "and") is kept, so both checks are always evaluated.
        return hourly_ok & gap_ok

    def _daily_dag_run_or_none(context, dag_run_obj):
        is_daily_boundary = is_execution_date_valid(
            context['execution_date'],
            FIX_DURATION_STRATEGY_DAILY,
            smart_dag.schedule_interval)
        if is_daily_boundary:
            return dag_run_obj
        return None

    sensor_service = TaskSensorService()

    smart_gate = self._create_infinite_retry_short_circuit_operator(
        task_id='ade_scoring_hourly_short_circuit',
        dag=smart_dag,
        python_callable=_hourly_smart_may_run)

    smart_operator = SmartEventsOperator(
        command=SmartEventsOperator.liors_special_run_command,
        fixed_duration_strategy=FIX_DURATION_STRATEGY_HOURLY,
        smart_events_conf=smart_record_conf_name,
        dag=smart_dag,
    )
    sensor_service.add_task_sequential_sensor(smart_operator)
    sensor_service.add_task_short_circuit(smart_operator, smart_gate)

    root_dag_gap_sensor_operator >> smart_gate

    smart_model_dag_id = SmartModelDagFactory.get_dag_id(smart_record_conf_name)
    smart_model_trigger = self._create_expanded_trigger_dag_run_operator(
        "smart_model_trigger",
        smart_model_dag_id,
        smart_dag,
        _daily_dag_run_or_none)
    set_schedule_interval(smart_model_dag_id, FIX_DURATION_STRATEGY_DAILY)

    smart_operator >> smart_model_trigger

    return smart_operator
def _build_alert_retention_operator(self, dag, entity_score_operator, entity_type):
    """
    Populate the given DAG with the alert retention task, guarded by a short circuit.

    The tasks are chained as:
    entity score >> alert retention short circuit >> alert retention

    :param dag: The DAG to populate
    :param entity_score_operator: The task placed upstream of the alert retention short circuit
    :param entity_type: Passed to the alert retention operator builder
    """

    def _retention_may_run(**kwargs):
        interval_ok = is_execution_date_valid(
            kwargs['execution_date'],
            AlertRetentionOperatorBuilder.get_alert_retention_interval_in_hours(
                PresidioDagBuilder.conf_reader),
            dag.schedule_interval)
        # The configured minimal time to start retention is given in days.
        min_gap = timedelta(
            days=AlertRetentionOperatorBuilder.get_alert_min_time_to_start_retention_in_days(
                PresidioDagBuilder.conf_reader))
        gap_ok = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
            dag,
            min_gap,
            kwargs['execution_date'],
            dag.schedule_interval)
        # "&" (not "and") is kept, so both checks are always evaluated.
        return interval_ok & gap_ok

    retention_gate = self._create_infinite_retry_short_circuit_operator(
        task_id='alert_retention_short_circuit',
        dag=dag,
        python_callable=_retention_may_run)

    retention_task = AlertRetentionOperatorBuilder().build(dag, entity_type)

    entity_score_operator >> retention_gate >> retention_task
def build(self, dag):
    """
    Populate the given indicator DAG with the adapter, input and scoring tasks and wire them together.

    The schema is read from the dag default arguments. For schemas registered in the input pre
    processing DAG factory, an input pre processing gap sensor is placed upstream of the input task
    and an input pre processing trigger downstream of it. A model trigger is always placed downstream
    of the input task.

    :param dag: The indicator DAG to populate
    :type dag: airflow.models.DAG
    :return: The given indicator DAG, after it has been populated
    :rtype: airflow.models.DAG
    """

    def _hourly_scoring_may_run(**kwargs):
        hourly_ok = is_execution_date_valid(
            kwargs['execution_date'],
            FIX_DURATION_STRATEGY_HOURLY,
            get_schedule_interval(dag))
        gap_ok = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
            dag,
            self._min_gap_from_dag_start_date_to_start_scoring,
            kwargs['execution_date'],
            get_schedule_interval(dag))
        # "&" (not "and") is kept, so both checks are always evaluated.
        return hourly_ok & gap_ok

    # ---- input tasks ----
    self.log.debug("populating the %s dag with input tasks", dag.dag_id)
    schema = dag.default_args.get('schema')
    adapter_operator = AdapterOperatorBuilder(schema).build(dag)

    input_sensors = TaskSensorService()
    input_operator = InputOperator(
        fixed_duration_strategy=timedelta(hours=1),
        command=PresidioDagBuilder.presidio_command,
        schema=schema,
        dag=dag)
    input_sensors.add_task_sequential_sensor(input_operator)

    # ---- scoring tasks ----
    self.log.debug("populating the %s dag with scoring tasks", dag.dag_id)
    scoring_sensors = TaskSensorService()

    feature_aggregations_operator = FeatureAggregationsOperator(
        fixed_duration_strategy=FIX_DURATION_STRATEGY_HOURLY,
        command=PresidioDagBuilder.presidio_command,
        data_source=schema,
        dag=dag)
    scoring_sensors.add_task_sequential_sensor(feature_aggregations_operator)

    score_aggregations_operator = ScoreAggregationsOperator(
        fixed_duration_strategy=FIX_DURATION_STRATEGY_HOURLY,
        command=PresidioDagBuilder.presidio_command,
        data_source=schema,
        dag=dag)
    scoring_sensors.add_task_sequential_sensor(score_aggregations_operator)

    hourly_gate = self._create_infinite_retry_short_circuit_operator(
        task_id='ade_scoring_hourly_short_circuit',
        dag=dag,
        python_callable=_hourly_scoring_may_run)

    # ---- optional input pre processing ----
    if schema in InputPreProcessingDagFactory.get_registered_schemas():
        pre_processing_trigger = self._build_input_pre_processing_trigger_operator(dag, schema)
        pre_processing_gap_sensor = DagIntervalGapSequentialSensorOperator(
            dag=dag,
            task_id='input_pre_processing_gap_sensor_{0}'.format(schema),
            dag_ids=[InputPreProcessingDagFactory.get_dag_id(schema)],
            interval=timedelta(hours=1),
            start_time=dag.start_date,
            fixed_duration_strategy=FIX_DURATION_STRATEGY_DAILY,
            poke_interval=5)
        pre_processing_gap_sensor >> input_operator >> pre_processing_trigger

    # ---- wiring ----
    adapter_operator >> input_operator >> hourly_gate
    scoring_sensors.add_task_short_circuit(feature_aggregations_operator, hourly_gate)
    scoring_sensors.add_task_short_circuit(score_aggregations_operator, hourly_gate)

    model_trigger = self._build_model_trigger_operator(dag, schema)
    input_operator >> model_trigger

    return dag