Example #1
    def __init__(self,
                 pipeline,
                 schedule_interval='@daily',
                 extra_default_args={},
                 extra_config={},
                 base_config={}):
        self.pipeline = pipeline

        loaded_config = config_tools.load_config(pipeline)
        self.config = base_config.copy()
        self.config.update(loaded_config)
        self.config.update(extra_config)

        self.default_args = config_tools.default_args(self.config)
        self.default_args.update(extra_default_args)

        self.schedule_interval = schedule_interval

        self.flexible_operator = Variable.get('FLEXIBLE_OPERATOR')
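
The constructor above layers configuration with a fixed precedence: values in base_config are overridden by the pipeline configuration returned by config_tools.load_config(pipeline), which is in turn overridden by extra_config; extra_default_args overrides the generated default_args the same way. A minimal, self-contained sketch of that merge order (every dictionary and key below is hypothetical):

# Hypothetical illustration of the config precedence used in the __init__ above.
base_config = {'temp_bucket': 'gs://default-temp', 'events_dataset': 'events'}
loaded_config = {'events_dataset': 'pipe_events'}      # stands in for config_tools.load_config(pipeline)
extra_config = {'temp_bucket': 'gs://override-temp'}   # caller-supplied overrides

config = base_config.copy()
config.update(loaded_config)   # loaded pipeline config wins over base_config
config.update(extra_config)    # extra_config wins over everything else

assert config == {'events_dataset': 'pipe_events', 'temp_bucket': 'gs://override-temp'}
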
                    'dag':
                    dag,
                    'arguments': [
                        'publish_postgres', '{start_date}'.format(**config),
                        '{end_date}'.format(**config),
                        '{project_id}:{events_dataset}.{events_table}'.format(
                            **config), '{temp_bucket}'.format(**config),
                        '{project_id}'.format(**config),
                        '{postgres_database_region}'.format(**config),
                        '{postgres_db_instance_name}'.format(**config),
                        '{postgres_database}'.format(**config),
                        '{postgres_db_user}'.format(**config),
                        '{postgres_db_password}'.format(**config),
                        '{postgres_db_table}'.format(**config),
                        'encounter'
                    ]
                })
                publish_events_bigquery >> publish_events_postgres

            return dag


encounters_config = config_tools.load_config('pipe_events.encounters')
events_encounters_daily_dag = PipelineDagFactory(encounters_config).build(
    'pipe_events_daily.encounters')
events_encounters_monthly_dag = PipelineDagFactory(
    encounters_config,
    schedule_interval='@monthly').build('pipe_events_monthly.encounters')
events_encounters_yearly_dag = PipelineDagFactory(
    encounters_config,
    schedule_interval='@yearly').build('pipe_events_yearly.encounters')
    def build(self, dag_id):
        """
        Override of build method.

        :@param dag_id: The id of the DAG.
        :@type dag_id: str.
        """

        config = self.config
        config['source_dataset'] = config['pipeline_dataset']
        config['source_tables'] = config['normalized_tables']

        default_args = self.default_args

        subdag_default_args = dict(
            start_date=default_args['start_date'],
            end_date=default_args['end_date']
        )
        subdag_config = dict(
            pipeline_dataset=config['pipeline_dataset'],
            source_dataset=config['pipeline_dataset'],
            events_dataset=config['events_dataset'],
            dataflow_runner='{dataflow_runner}'.format(**config),
            temp_shards_per_day="3",
        )
        config['source_paths'] = ','.join(self.source_table_paths())
        config['source_dates'] = ','.join(self.source_date_range())

        with DAG(dag_id, schedule_interval=self.schedule_interval, default_args=self.default_args) as dag:

            source_sensors = self.source_table_sensors(dag)

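            # TriggerRule.ONE_SUCCESS lets the segment sub-DAG start as soon as any one
            # of the source sensors succeeds, while depends_on_past=True makes each run
            # wait for the previous schedule's segment run to complete successfully.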
            segment = SubDagOperator(
                subdag=pipe_segment.PipeSegmentDagFactory(
                    schedule_interval=dag.schedule_interval,
                    extra_default_args=subdag_default_args,
                    extra_config=dict(
                        pipeline_dataset=config['pipeline_dataset'],
                        source_dataset=config['pipeline_dataset'],
                        source_tables='{normalized_tables}'.format(**config),
                        dataflow_runner='{dataflow_runner}'.format(**config),
                        temp_shards_per_day="3",
                    )
                ).build(dag_id=dag_id+'.segment'),
                trigger_rule=TriggerRule.ONE_SUCCESS,
                depends_on_past=True,
                task_id='segment'
            )

            measures = SubDagOperator(
                subdag=pipe_measures.PipeMeasuresDagFactory(
                    schedule_interval=dag.schedule_interval,
                    extra_default_args=subdag_default_args,
                    extra_config=subdag_config
                ).build(dag_id=dag_id+'.measures'),
                task_id='measures'
            )

            port_events = SubDagOperator(
                subdag=pipe_anchorages.PipeAnchoragesPortEventsDagFactory(
                    schedule_interval=dag.schedule_interval,
                    extra_default_args=subdag_default_args,
                    extra_config=subdag_config
                ).build(dag_id=dag_id+'.port_events'),
                task_id='port_events'
            )

            port_visits = SubDagOperator(
                subdag=pipe_anchorages.PipeAnchoragesPortVisitsDagFactory(
                    schedule_interval=dag.schedule_interval,
                    extra_default_args=subdag_default_args,
                    extra_config=subdag_config
                ).build(dag_id=dag_id+'.port_visits'),
                task_id='port_visits'
            )

            encounters = SubDagOperator(
                subdag=pipe_encounters.PipeEncountersDagFactory(
                    schedule_interval=dag.schedule_interval,
                    extra_default_args=subdag_default_args,
                    extra_config=subdag_config
                ).build(dag_id=dag_id+'.encounters'),
                task_id='encounters'
            )


            for sensor in source_sensors:
                dag >> sensor >> segment >> measures

            measures >> port_events >> port_visits
            measures >> encounters

            if config.get('enable_features_events', False):

                features = SubDagOperator(
                    subdag=pipe_features.PipeFeaturesDagFactory(
                        schedule_interval=dag.schedule_interval,
                        extra_default_args=subdag_default_args,
                        extra_config=subdag_config
                    ).build(dag_id=dag_id+'.features'),
                    depends_on_past=True,
                    task_id='features'
                )

                events_anchorages = SubDagOperator(
                    subdag=pipe_events_anchorages.PipelineDagFactory(
                        config_tools.load_config('pipe_events.anchorages'),
                        schedule_interval=dag.schedule_interval,
                        extra_default_args=subdag_default_args,
                        extra_config=subdag_config
                    ).build(dag_id=dag_id+'.pipe_events_anchorages'),
                    depends_on_past=True,
                    task_id='pipe_events_anchorages'
                )

                events_encounters = SubDagOperator(
                    subdag=pipe_events_encounters.PipelineDagFactory(
                        config_tools.load_config('pipe_events.encounters'),
                        schedule_interval=dag.schedule_interval,
                        extra_default_args=subdag_default_args,
                        extra_config=subdag_config
                    ).build(dag_id=dag_id+'.pipe_events_encounters'),
                    depends_on_past=True,
                    task_id='pipe_events_encounters'
                )

                events_fishing = SubDagOperator(
                    subdag=pipe_events_fishing.PipelineDagFactory(
                        config_tools.load_config('pipe_events.fishing'),
                        schedule_interval=dag.schedule_interval,
                        extra_default_args=subdag_default_args,
                        extra_config=subdag_config
                    ).build(dag_id=dag_id+'.pipe_events_fishing'),
                    depends_on_past=True,
                    task_id='pipe_events_fishing'
                )

                port_visits >> features
                encounters >> features

                # Points to each independent event
                features >> events_anchorages
                features >> events_encounters
                features >> events_fishing

        return dag
                             '{postgres_connection_string}'.format(**config),
                             '{postgres_table_tracks}'.format(**config)]
            })


            check_source_existance = config.get('check_source_existance', None)
            # Connect directly when source-existence checking is disabled or unset;
            # otherwise gate the pipeline behind the source table sensors.
            if check_source_existance is None or not check_source_existance:
                dag >> aggregate_tracks
                dag >> publish_vessel_info
            else:
                source_sensors = self.source_table_sensors(dag)
                for sensor in source_sensors:
                    dag >> sensor
                    sensor >> aggregate_tracks
                    sensor >> publish_vessel_info
            aggregate_tracks >> publish_postgres_tracks

            return dag


modes = ['daily', 'monthly']
vessels_configurations = config_tools.load_config(PIPELINE)['configurations']
for mode in modes:
    for vessels_configuration in vessels_configurations:
        dag_factory = VesselsPipelineDagFactory(
            vessels_configuration, schedule_interval='@{}'.format(mode))
        dag_id = dag_factory.get_dag_id(
            '{}_{}'.format(PIPELINE, mode),
            vessels_configuration['name']
        )
        globals()[dag_id] = dag_factory.build(dag_id=dag_id)

def validateJson(data):
    """
    Validates the configuration with a JSON schema.

    :@param data: The data to be validated.
    :@type data: dict.
    :raise: ValidationError if the data does not match the schema.
    """
    folder = os.path.abspath(os.path.dirname(__file__))
    with open('{}/{}'.format(folder, 'schemas/vms_list_schema.json')) as vms_schema:
        validate(instance=data, schema=json.loads(vms_schema.read()))

variables = config_tools.load_config(PIPELINE)
validateJson(variables)
for vms in variables['vms_list']:
    for mode in ['daily', 'monthly', 'yearly']:
        print('>>>>>> VMS: {}'.format(vms))
        pipeline_start_date = datetime.strptime(vms['start_date'].strip(), "%Y-%m-%d")
        dag_id = '{}_{}_{}'.format(PIPELINE, vms['name'], mode)
        globals()[dag_id] = VMSGenericDagFactory(
            vms['name'],
            schedule_interval='@{}'.format(mode),
            extra_default_args={'start_date': pipeline_start_date},
            extra_config=vms
        ).build(dag_id)
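
A minimal sketch of the document shape that validateJson and the loop above expect: each vms_list entry is read for a name and a YYYY-MM-DD start_date. The inline schema and sample entry below are hypothetical illustrations, not the actual contents of schemas/vms_list_schema.json, which may require additional fields:

# Hypothetical sketch; the real schemas/vms_list_schema.json may differ.
from jsonschema import validate

vms_list_schema_sketch = {
    "type": "object",
    "required": ["vms_list"],
    "properties": {
        "vms_list": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["name", "start_date"],
                "properties": {
                    "name": {"type": "string"},
                    "start_date": {"type": "string", "pattern": r"^\d{4}-\d{2}-\d{2}$"},
                },
            },
        },
    },
}

# A document shaped like the one the loop above iterates over (values are made up).
validate(instance={"vms_list": [{"name": "example_vms", "start_date": "2019-01-01"}]},
         schema=vms_list_schema_sketch)
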
                    '{docker_run}'.format(**config),
                    'image':
                    '{docker_image}'.format(**config),
                    'name':
                    'fishing-publish-events-postgres',
                    'dag':
                    dag,
                    'arguments': [
                        'publish_postgres', '{date_range}'.format(**config),
                        '{project_id}:{events_dataset}.{events_table}'.format(
                            **config), '{temp_bucket}'.format(**config),
                        '{postgres_instance}'.format(**config),
                        '{postgres_connection_string}'.format(**config),
                        '{postgres_table}'.format(**config), 'fishing'
                    ]
                })
                publish_events_bigquery >> publish_events_postgres

            return dag


fishing_config = config_tools.load_config('pipe_events.fishing')
events_fishing_daily_dag = PipelineDagFactory(fishing_config).build(
    'pipe_events_daily.fishing')
events_fishing_monthly_dag = PipelineDagFactory(
    fishing_config,
    schedule_interval='@monthly').build('pipe_events_monthly.fishing')
events_fishing_yearly_dag = PipelineDagFactory(
    fishing_config,
    schedule_interval='@yearly').build('pipe_events_yearly.fishing')
Example #7
                    'dag':
                    dag,
                    'arguments': [
                        'publish_postgres', '{start_date}'.format(**config),
                        '{end_date}'.format(**config),
                        '{project_id}:{events_dataset}.{events_table}'.format(
                            **config), '{temp_bucket}'.format(**config),
                        '{project_id}'.format(**config),
                        '{postgres_database_region}'.format(**config),
                        '{postgres_db_instance_name}'.format(**config),
                        '{postgres_database}'.format(**config),
                        '{postgres_db_user}'.format(**config),
                        '{postgres_db_password}'.format(**config),
                        '{postgres_db_table}'.format(**config), 'port'
                    ]
                })
                publish_events_bigquery >> publish_events_postgres

            return dag


anchorages_config = config_tools.load_config('pipe_events.anchorages')
events_anchorages_daily_dag = PipelineDagFactory(anchorages_config).build(
    'pipe_events_daily.anchorages')
events_anchorages_monthly_dag = PipelineDagFactory(
    anchorages_config,
    schedule_interval='@monthly').build('pipe_events_monthly.anchorages')
events_anchorages_yearly_dag = PipelineDagFactory(
    anchorages_config,
    schedule_interval='@yearly').build('pipe_events_yearly.anchorages')
Example #8
    def __init__(self, interval):
        subpipeline_config_key = '{}.{}'.format(PIPELINE, SUBPIPELINE)
        super(DagFactory, self).__init__(
            pipeline=PIPELINE,
            extra_config=config_tools.load_config(subpipeline_config_key),
            interval=interval)
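
This __init__ only pins the pipeline and merges in the sub-pipeline configuration before delegating to the parent factory. A hedged sketch of the module-level names it assumes, with hypothetical values:

# Hypothetical constants assumed by the __init__ above.
PIPELINE = 'pipe_events'
SUBPIPELINE = 'fishing'

# The factory would then be created once per schedule interval, for example:
# dag_factory = DagFactory(interval='@daily')
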
import posixpath as pp
from datetime import datetime, timedelta, date
import logging

from airflow import DAG
from airflow.contrib.sensors.bigquery_sensor import BigQueryTableSensor
from airflow.operators.bash_operator import BashOperator
from airflow.models import Variable

from airflow_ext.gfw.operators.bigquery_operator import BigQueryCreateEmptyTableOperator
from airflow_ext.gfw.operators.dataflow_operator import DataFlowDirectRunnerOperator
from airflow_ext.gfw.config import load_config
from airflow_ext.gfw.config import default_args


CONFIG = load_config('pipe_anchorages')
DEFAULT_ARGS = default_args(CONFIG)


def table_sensor(dataset_id, table_id, date):
    return BigQueryTableSensor(
        task_id='source_exists',
        dataset_id=dataset_id,
        table_id='{}{}'.format(table_id, date),
        poke_interval=10,   # check every 10 seconds for a minute
        timeout=60,
        retries=24*7,       # retry once per hour for a week
        retry_delay=timedelta(minutes=60),
        retry_exponential_backoff=False
    )
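
A minimal usage sketch for the table_sensor helper above, wiring it into a daily DAG that reuses DEFAULT_ARGS. The dag_id, dataset, table prefix, and the use of a templated ds_nodash date are all assumptions for illustration:

# Hypothetical usage sketch for table_sensor; names and values are illustrative only.
with DAG('pipe_anchorages_source_sensor_example',
         schedule_interval='@daily',
         default_args=DEFAULT_ARGS) as dag:
    source_exists = table_sensor(
        dataset_id='pipeline_dataset',     # hypothetical dataset name
        table_id='position_messages_',     # hypothetical sharded-table prefix
        date='{{ ds_nodash }}'             # assumed to be rendered by Airflow templating
    )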