    def test_get_dag_runs_success_with_state_no_result(self):
        url_template = '/api/experimental/dags/{}/dag_runs?state=dummy'
        dag_id = 'example_bash_operator'
        # Create DagRun
        trigger_dag(dag_id=dag_id, run_id='test_get_dag_runs_success')

        response = self.app.get(url_template.format(dag_id))
        self.assertEqual(200, response.status_code)
        data = json.loads(response.data.decode('utf-8'))

        self.assertIsInstance(data, list)
        self.assertEqual(len(data), 0)
 def execute(self, context):
     dro = DagRunOrder(run_id='trig__' + timezone.utcnow().isoformat())
     if self.python_callable is not None:
         dro = self.python_callable(context, dro)
     if dro:
         trigger_dag(dag_id=self.trigger_dag_id,
                     run_id=dro.run_id,
                     conf=json.dumps(dro.payload),
                     execution_date=self.execution_date,
                     replace_microseconds=False)
     else:
         self.log.info("Criteria not met, moving on")
    def test_task_instance_info(self):
        url_template = '/api/experimental/dags/{}/dag_runs/{}/tasks/{}'
        dag_id = 'example_bash_operator'
        task_id = 'also_run_this'
        execution_date = utcnow().replace(microsecond=0)
        datetime_string = quote_plus(execution_date.isoformat())
        wrong_datetime_string = quote_plus(
            datetime(1990, 1, 1, 1, 1, 1).isoformat()
        )

        # Create DagRun
        trigger_dag(dag_id=dag_id,
                    run_id='test_task_instance_info_run',
                    execution_date=execution_date)

        # Test Correct execution
        response = self.client.get(
            url_template.format(dag_id, datetime_string, task_id)
        )
        self.assertEqual(200, response.status_code)
        self.assertIn('state', response.data.decode('utf-8'))
        self.assertNotIn('error', response.data.decode('utf-8'))

        # Test error for nonexistent dag
        response = self.client.get(
            url_template.format('does_not_exist_dag', datetime_string,
                                task_id),
        )
        self.assertEqual(404, response.status_code)
        self.assertIn('error', response.data.decode('utf-8'))

        # Test error for nonexistent task
        response = self.client.get(
            url_template.format(dag_id, datetime_string, 'does_not_exist_task')
        )
        self.assertEqual(404, response.status_code)
        self.assertIn('error', response.data.decode('utf-8'))

        # Test error for nonexistent dag run (wrong execution_date)
        response = self.client.get(
            url_template.format(dag_id, wrong_datetime_string, task_id)
        )
        self.assertEqual(404, response.status_code)
        self.assertIn('error', response.data.decode('utf-8'))

        # Test error for bad datetime format
        response = self.client.get(
            url_template.format(dag_id, 'not_a_datetime', task_id)
        )
        self.assertEqual(400, response.status_code)
        self.assertIn('error', response.data.decode('utf-8'))
Example #4
def trigger_dag(dag_id):
    """
    Trigger a new dag run for a Dag
    """
    data = request.get_json(force=True)

    run_id = None
    if 'run_id' in data:
        run_id = data['run_id']

    conf = None
    if 'conf' in data:
        conf = data['conf']

    try:
        dr = trigger.trigger_dag(dag_id, run_id, conf)
    except AirflowException as err:
        logging.error(err)
        response = jsonify(error="{}".format(err))
        response.status_code = 404
        return response

    if getattr(g, 'user', None):
        logging.info("User {} created {}".format(g.user, dr))

    response = jsonify(message="Created {}".format(dr))
    return response
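This handler reads only run_id and conf from the posted JSON. A minimal client sketch, assuming the view is mounted at the experimental API's usual route (POST /api/experimental/dags/<dag_id>/dag_runs) and a webserver on localhost:8080 (both placeholders):

import requests

payload = {
    "run_id": "manual_run_1",        # optional; if omitted, the handler passes None
    "conf": {"process_id": "1"},      # optional; forwarded to the DagRun
}
resp = requests.post(
    "http://localhost:8080/api/experimental/dags/example_bash_operator/dag_runs",
    json=payload,
)
print(resp.status_code, resp.json())  # expect a {"message": "Created <DagRun ...>"} body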
    def test_get_dag_runs_success_with_capital_state_parameter(self):
        url_template = '/api/experimental/dags/{}/dag_runs?state=RUNNING'
        dag_id = 'example_bash_operator'
        # Create DagRun
        dag_run = trigger_dag(dag_id=dag_id, run_id='test_get_dag_runs_success')

        response = self.app.get(url_template.format(dag_id))
        self.assertEqual(200, response.status_code)
        data = json.loads(response.data.decode('utf-8'))

        self.assertIsInstance(data, list)
        self.assertEqual(len(data), 1)
        self.assertEqual(data[0]['dag_id'], dag_id)
        self.assertEqual(data[0]['id'], dag_run.id)
    def test_get_dag_runs_success_with_state_parameter(self):
        url_template = '/api/experimental/dags/{}/dag_runs?state=running'
        dag_id = 'example_bash_operator'
        # Create DagRun
        dag_run = trigger_dag(dag_id=dag_id,
                              run_id='test_get_dag_runs_success')

        response = self.app.get(url_template.format(dag_id))
        self.assertEqual(200, response.status_code)
        data = json.loads(response.data.decode('utf-8'))

        self.assertIsInstance(data, list)
        self.assertEqual(len(data), 1)
        self.assertEqual(data[0]['dag_id'], dag_id)
        self.assertEqual(data[0]['id'], dag_run.id)
Example #7
def trigger_dag(dag_id):
    """
    Trigger a new dag run for a Dag with an execution date of now unless
    specified in the data.
    """
    data = request.get_json(force=True)

    run_id = None
    if 'run_id' in data:
        run_id = data['run_id']

    conf = None
    if 'conf' in data:
        conf = data['conf']

    execution_date = None
    if 'execution_date' in data and data['execution_date'] is not None:
        execution_date = data['execution_date']

        # Convert string datetime into actual datetime
        try:
            execution_date = datetime.strptime(execution_date,
                                               '%Y-%m-%dT%H:%M:%S')
        except ValueError:
            error_message = (
                'Given execution date, {}, could not be identified '
                'as a date. Example date format: 2015-11-16T14:34:15'
                .format(execution_date))
            _log.info(error_message)
            response = jsonify({'error': error_message})
            response.status_code = 400

            return response

    try:
        dr = trigger.trigger_dag(dag_id, run_id, conf, execution_date)
    except AirflowException as err:
        _log.error(err)
        response = jsonify(error="{}".format(err))
        response.status_code = 404
        return response

    if getattr(g, 'user', None):
        _log.info("User {} created {}".format(g.user, dr))

    response = jsonify(message="Created {}".format(dr))
    return response
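Compared to the previous handler, this one also accepts an execution_date string and parses it with the naive '%Y-%m-%dT%H:%M:%S' format. A sketch of the corresponding request body, using the same placeholder host and route as above:

import requests

payload = {
    "run_id": "backfill_2015_11_16",
    "conf": {"process_id": "1"},
    "execution_date": "2015-11-16T14:34:15",  # must match %Y-%m-%dT%H:%M:%S or a 400 is returned
}
resp = requests.post(
    "http://localhost:8080/api/experimental/dags/example_bash_operator/dag_runs",
    json=payload,
)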
Example #8
def trigger_dag(dag_id):
    """
    Trigger a new dag run for a Dag with an execution date of now unless
    specified in the data.
    """
    data = request.get_json(force=True)

    run_id = None
    if 'run_id' in data:
        run_id = data['run_id']

    conf = None
    if 'conf' in data:
        conf = data['conf']

    execution_date = None
    if 'execution_date' in data and data['execution_date'] is not None:
        execution_date = data['execution_date']

        # Convert string datetime into actual datetime
        try:
            execution_date = datetime.strptime(execution_date,
                                               '%Y-%m-%dT%H:%M:%S')
        except ValueError:
            error_message = (
                'Given execution date, {}, could not be identified '
                'as a date. Example date format: 2015-11-16T14:34:15'.format(
                    execution_date))
            _log.info(error_message)
            response = jsonify({'error': error_message})
            response.status_code = 400

            return response

    try:
        dr = trigger.trigger_dag(dag_id, run_id, conf, execution_date)
    except AirflowException as err:
        _log.error(err)
        response = jsonify(error="{}".format(err))
        response.status_code = 404
        return response

    if getattr(g, 'user', None):
        _log.info("User {} created {}".format(g.user, dr))

    response = jsonify(message="Created {}".format(dr))
    return response
    def test_get_dag_runs_success_with_capital_state_parameter(self):
        with conf_vars(
            {("core", "store_serialized_dags"): self.dag_serialzation}
        ):
            url_template = '/api/experimental/dags/{}/dag_runs?state=RUNNING'
            dag_id = 'example_bash_operator'
            # Create DagRun
            dag_run = trigger_dag(
                dag_id=dag_id, run_id='test_get_dag_runs_success')

            response = self.app.get(url_template.format(dag_id))
            self.assertEqual(200, response.status_code)
            data = json.loads(response.data.decode('utf-8'))

            self.assertIsInstance(data, list)
            self.assertEqual(len(data), 1)
            self.assertEqual(data[0]['dag_id'], dag_id)
            self.assertEqual(data[0]['id'], dag_run.id)
Example #10
def trigger_dag_for_date(dag_id, execution_date):
    """
    Trigger a new dag run for a Dag with the given execution date. The
    format for the execution date is expected to be "YYYY-mm-DDTHH:MM:SS",
    for example: "2016-11-16T11:34:15". Within the URL, the colons should be
    escaped to %3A; Flask automatically unescapes them before the value is
    passed into this method.
    """
    data = request.get_json(force=True)

    run_id = None
    if 'run_id' in data:
        run_id = data['run_id']

    conf = None
    if 'conf' in data:
        conf = data['conf']

    # Convert string datetime into actual datetime
    try:
        execution_date = datetime.strptime(execution_date, '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        error_message = ('Given execution date, {}, could not be identified '
                         'as a date. Example date format: 2015-11-16T14:34:15'.
                         format(execution_date))
        _log.info(error_message)
        response = jsonify({'error': error_message})
        response.status_code = 400

        return response

    try:
        dr = trigger.trigger_dag(dag_id, run_id, conf, execution_date)
    except AirflowException as err:
        logging.error(err)
        response = jsonify(error="{}".format(err))
        response.status_code = 404
        return response

    if getattr(g, 'user', None):
        _log.info("User {} created {}".format(g.user, dr))

    response = jsonify(message="Created {}".format(dr))
    return response
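Here the execution date travels in the URL, so its colons need escaping. A sketch of building such a request; the host and the exact route are placeholders, since they depend on how this view is registered:

from urllib.parse import quote_plus
import requests

execution_date = quote_plus("2016-11-16T11:34:15")  # colons become %3A
url = ("http://localhost:8080/api/experimental/dags/example_bash_operator"
       f"/dag_runs/{execution_date}")
resp = requests.post(url, json={"conf": {"process_id": "1"}})
print(resp.status_code, resp.json())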
Example #11
def trigger_dag(dag_id):
    """
    Trigger a new dag run for a Dag with an execution date of now unless
    specified in the data.
    """
    data = request.get_json(force=True)

    run_id = None
    if "run_id" in data:
        run_id = data["run_id"]

    conf = None
    if "conf" in data:
        conf = data["conf"]

    execution_date = None
    if "execution_date" in data and data["execution_date"] is not None:
        execution_date = data["execution_date"]

        # Convert string datetime into actual datetime
        try:
            execution_date = timezone.parse(execution_date)
        except ValueError:
            error_message = (
                "Given execution date, {}, could not be identified "
                "as a date. Example date format: 2015-11-16T14:34:15+00:00".
                format(execution_date))
            response = jsonify({"error": error_message})
            response.status_code = 400

            return response

    try:
        dr = trigger.trigger_dag(dag_id, run_id, conf, execution_date)
    except AirflowException as err:
        response = jsonify(error="{}".format(err))
        response.status_code = err.status_code
        return response

    response = jsonify(message="Created {}".format(dr))
    return response
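This variant parses the execution date with timezone.parse, so offset-aware strings are accepted, and it propagates err.status_code rather than a fixed 404. The only change to the request body sketched earlier is the date format (values illustrative):

payload = {
    "run_id": "manual_tz_run",
    "conf": {"process_id": "1"},
    "execution_date": "2015-11-16T14:34:15+00:00",  # offset-aware, per the error message above
}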
Example #12
 def start_pipeline(self, id):
     from airflow.api.common.experimental.trigger_dag import trigger_dag
     from airflow import models
     models.DagModel.get_dagmodel(id).set_is_paused(is_paused=False)
     run = trigger_dag(id)
     return dict(result=run.run_id if run else None) 
    def execute(self, context: Dict):
        if isinstance(self.execution_date, datetime.datetime):
            execution_date = self.execution_date
        elif isinstance(self.execution_date, str):
            execution_date = timezone.parse(self.execution_date)
            self.execution_date = execution_date
        else:
            execution_date = timezone.utcnow()

        run_id = DagRun.generate_run_id(DagRunType.MANUAL, execution_date)
        try:
            # Ignore MyPy type for self.execution_date
            # because it doesn't pick up the timezone.parse() for strings
            dag_run = trigger_dag(
                dag_id=self.trigger_dag_id,
                run_id=run_id,
                conf=self.conf,
                execution_date=self.execution_date,
                replace_microseconds=False,
            )

        except DagRunAlreadyExists as e:
            if self.reset_dag_run:
                self.log.info("Clearing %s on %s", self.trigger_dag_id,
                              self.execution_date)

                # Get target dag object and call clear()

                dag_model = DagModel.get_current(self.trigger_dag_id)
                if dag_model is None:
                    raise DagNotFound(
                        f"Dag id {self.trigger_dag_id} not found in DagModel")

                dag_bag = DagBag(dag_folder=dag_model.fileloc,
                                 read_dags_from_db=True)

                dag = dag_bag.get_dag(self.trigger_dag_id)

                dag.clear(start_date=self.execution_date,
                          end_date=self.execution_date)

                dag_run = DagRun.find(dag_id=dag.dag_id, run_id=run_id)[0]
            else:
                raise e

        if self.wait_for_completion:
            # wait for dag to complete
            while True:
                self.log.info(
                    'Waiting for %s on %s to become allowed state %s ...',
                    self.trigger_dag_id,
                    dag_run.execution_date,
                    self.allowed_states,
                )
                time.sleep(self.poke_interval)

                dag_run.refresh_from_db()
                state = dag_run.state
                if state in self.failed_states:
                    raise AirflowException(
                        f"{self.trigger_dag_id} failed with failed states {state}"
                    )
                if state in self.allowed_states:
                    self.log.info("%s finished with allowed state %s",
                                  self.trigger_dag_id, state)
                    return
 def trigger_dag(self, dag_id, run_id=None, conf=None, execution_date=None):
     dag_run = trigger_dag.trigger_dag(dag_id=dag_id,
                                       run_id=run_id,
                                       conf=conf,
                                       execution_date=execution_date)
     return "Created {}".format(dag_run)
Example #15
def production_trigger__callable(
    *, dag_run: DagRun, files_path: Path, cdr_type_config: dict, **kwargs
):
    """
    Function that determines which files in files/ should be processed
    and triggers the correct ETL dag with config based on filename.

    Parameters
    ----------
    dag_run : DagRun
        Passed as part of the Dag context - contains the config.
    files_path : Path
        Location of files directory
    cdr_type_config : dict
        ETL config for each cdr type
    """
    session = get_session()

    for cdr_type, cfg in cdr_type_config.items():
        cdr_type = CDRType(cdr_type)

        source_type = cfg["source"]["source_type"]
        logger.info(f"Config for {cdr_type!r} ({source_type}): {cfg}")

        if source_type == "csv":
            filename_pattern = cfg["source"]["filename_pattern"]
            logger.info(f"Filename pattern: {filename_pattern!r}")
            all_files_found = find_files_matching_pattern(
                files_path=files_path, filename_pattern=filename_pattern
            )
            dates_found = {
                filename: extract_date_from_filename(filename, filename_pattern)
                for filename in all_files_found
            }
            unprocessed_files_and_dates = {
                filename: date
                for filename, date in dates_found.items()
                if ETLRecord.can_process(
                    cdr_type=cdr_type, cdr_date=date, session=session
                )
            }
            for file, cdr_date in unprocessed_files_and_dates.items():
                uuid = uuid1()
                cdr_date_str = cdr_date.strftime("%Y%m%d")
                execution_date = pendulum.Pendulum(
                    cdr_date.year, cdr_date.month, cdr_date.day
                )
                config = {
                    "cdr_type": cdr_type,
                    "cdr_date": cdr_date,
                    "file_name": file,
                    "template_path": f"etl/{cdr_type}",
                }
                trigger_dag(
                    f"etl_{cdr_type}",
                    execution_date=execution_date,
                    run_id=f"{cdr_type.upper()}_{cdr_date_str}-{str(uuid)}",
                    conf=config,
                    replace_microseconds=False,
                )
        elif source_type == "sql":
            source_table = cfg["source"]["table_name"]

            # Extract unprocessed dates from source_table

            # TODO: this requires a full parse of the existing data, so it may not be
            # the most efficient approach if a lot of data is present (esp. data that
            # has already been processed). If it turns out to be too sluggish, it might
            # be good to think about a more efficient way to determine dates with
            # unprocessed data.
            dates_present = find_distinct_dates_in_table(
                session, source_table, event_time_col="event_time"
            )
            unprocessed_dates = [
                date
                for date in dates_present
                if ETLRecord.can_process(
                    cdr_type=cdr_type, cdr_date=date, session=session
                )
            ]
            logger.info(f"Dates found: {dates_present}")
            logger.info(f"Unprocessed dates: {unprocessed_dates}")

            for cdr_date in unprocessed_dates:
                uuid = uuid1()
                cdr_date_str = cdr_date.strftime("%Y%m%d")
                execution_date = pendulum.Pendulum(
                    cdr_date.year, cdr_date.month, cdr_date.day
                )
                config = {
                    "cdr_type": cdr_type,
                    "cdr_date": cdr_date,
                    "source_table": source_table,
                }
                trigger_dag(
                    f"etl_{cdr_type}",
                    execution_date=execution_date,
                    run_id=f"{cdr_type.upper()}_{cdr_date_str}-{str(uuid)}",
                    conf=config,
                    replace_microseconds=False,
                )
        else:
            raise ValueError(f"Invalid source type: '{source_type}'")
Example #16
 def trigger_dag(self, dag_id, run_id, conf):
     return trigger_dag.trigger_dag(dag_id=dag_id,
                                    run_id=run_id,
                                    conf=conf,
                                    replace_microseconds=False)
Example #17
def request_ingest():
    authorization = request.headers.get('authorization')
    LOGGER.info('top of request_ingest.')
    assert authorization[:len('BEARER')].lower() == 'bearer', \
        'authorization is not BEARER'
    substr = authorization[len('BEARER'):].strip()
    if 'nexus' in substr:
        auth_dct = ast.literal_eval(authorization[len('BEARER'):].strip())
        LOGGER.info('auth_dct: %s', auth_dct)
        assert 'nexus_token' in auth_dct, 'authorization has no nexus_token'
        auth_tok = auth_dct['nexus_token']
    else:
        auth_tok = substr
    #LOGGER.info('auth_tok: %s', auth_tok)  # reduce visibility of auth_tok

    # decode input
    data = request.get_json(force=True)

    LOGGER.debug('request_ingest data: {}'.format(str(data)))
    # Test and extract required parameters
    try:
        provider = _get_required_string(data, 'provider')
        submission_id = _get_required_string(data, 'submission_id')
        process = _get_required_string(data, 'process')
        full_path = _get_required_string(data, 'full_path')
    except HubmapApiInputException as e:
        return HubmapApiResponse.bad_request(
            'Must specify {} to request data be ingested'.format(str(e)))

    # necessary because the config parser has made the corresponding string lower case
    process = process.lower()

    try:
        dag_id = config('ingest_map', process)
    except HubmapApiConfigException:
        return HubmapApiResponse.bad_request(
            '{} is not a known ingestion process'.format(process))

    try:
        check_ingest_parms(provider, submission_id, process, full_path)

        session = settings.Session()

        dagbag = DagBag('dags')

        if dag_id not in dagbag.dags:
            LOGGER.warning('Requested dag {} not among {}'.format(
                dag_id, [did for did in dagbag.dags]))
            LOGGER.warning('Dag dir full path {}'.format(os.path.abspath('dags')))
            return HubmapApiResponse.not_found(
                "Dag id {} not found".format(dag_id))

        dag = dagbag.get_dag(dag_id)

        # Produce one and only one run
        tz = pytz.timezone(config('core', 'timezone'))
        execution_date = datetime.now(tz)
        LOGGER.info('starting {} with execution_date: {}'.format(
            dag_id, execution_date))

        run_id = '{}_{}_{}'.format(submission_id, process,
                                   execution_date.isoformat())
        ingest_id = run_id
        fernet = Fernet(config('core', 'fernet_key').encode())
        crypt_auth_tok = fernet.encrypt(auth_tok.encode()).decode()

        conf = {
            'provider': provider,
            'submission_id': submission_id,
            'process': process,
            'dag_id': dag_id,
            'run_id': run_id,
            'ingest_id': ingest_id,
            'crypt_auth_tok': crypt_auth_tok,
            'src_path': config('connections', 'src_path'),
            'lz_path': full_path
        }

        if find_dag_runs(session, dag_id, run_id, execution_date):
            # The run already happened??
            raise AirflowException('The request happened twice?')

        try:
            dr = trigger_dag.trigger_dag(dag_id,
                                         run_id,
                                         conf,
                                         execution_date=execution_date)
        except AirflowException as err:
            LOGGER.error(err)
            raise AirflowException(
                "Attempt to trigger run produced an error: {}".format(err))
        LOGGER.info('dagrun follows: {}'.format(dr))

        #             dag.create_dagrun(
        #                 run_id=run['run_id'],
        #                 execution_date=run['execution_date'],
        #                 state=State.RUNNING,
        #                 conf=conf,
        #                 external_trigger=True
        #             )
        #            results.append(run['run_id'])

        session.close()
    except HubmapApiInputException as e:
        return HubmapApiResponse.bad_request(str(e))
    except ValueError as e:
        return HubmapApiResponse.server_error(str(e))
    except AirflowException as e:
        return HubmapApiResponse.server_error(str(e))
    except Exception as e:
        return HubmapApiResponse.server_error(str(e))

    return HubmapApiResponse.success({
        'ingest_id': ingest_id,
        'run_id': run_id
    })
Example #18
 def trigger_event(event, pipeline):
     from airflow.api.common.experimental.trigger_dag import trigger_dag
     from airflow import models
     dag_id = f'event_handler_{event}_pipeline_dag'
     models.DagModel.get_dagmodel(dag_id).set_is_paused(is_paused=False)
     trigger_dag(dag_id, conf=pipeline)
Example #19
 def trigger_dag(self, dag_id, run_id=None, conf=None):
     dr = trigger_dag.trigger_dag(dag_id=dag_id, run_id=run_id, conf=conf)
     return "Created {}".format(dr)
Example #20
 def trigger_dag(self, dag_id, execution_date):
     """
     Trigger a new dag run for a Dag with an execution date.
     """
     execution_date = timezone.parse(execution_date)
     trigger.trigger_dag(dag_id, None, None, execution_date)
Example #21
def trigger_dag_python(**context):
    trigger_dag(dag_id='tutorial', run_id="triggered__" + str(datetime.utcnow()), conf={'process_id': '1'},
                replace_microseconds=False)
Example #22
    def execute(self, context: Dict):
        if isinstance(self.execution_date, datetime.datetime):
            execution_date = self.execution_date
        elif isinstance(self.execution_date, str):
            execution_date = timezone.parse(self.execution_date)
            self.execution_date = execution_date
        else:
            execution_date = timezone.utcnow()

        if self.trigger_run_id:
            run_id = self.trigger_run_id
        else:
            run_id = DagRun.generate_run_id(DagRunType.MANUAL, execution_date)

        try:
            dag_run = trigger_dag(
                dag_id=self.trigger_dag_id,
                run_id=run_id,
                conf=self.conf,
                execution_date=self.execution_date,
                replace_microseconds=False,
            )

        except DagRunAlreadyExists as e:
            if self.reset_dag_run:
                self.log.info("Clearing %s on %s", self.trigger_dag_id, self.execution_date)

                # Get target dag object and call clear()

                dag_model = DagModel.get_current(self.trigger_dag_id)
                if dag_model is None:
                    raise DagNotFound(f"Dag id {self.trigger_dag_id} not found in DagModel")

                dag_bag = DagBag(dag_folder=dag_model.fileloc, read_dags_from_db=True)
                dag = dag_bag.get_dag(self.trigger_dag_id)
                dag.clear(start_date=self.execution_date, end_date=self.execution_date)
                dag_run = DagRun.find(dag_id=dag.dag_id, run_id=run_id)[0]
            else:
                raise e

        # Store the execution date from the dag run (either created or found above) to
        # be used when creating the extra link on the webserver.
        ti = context['task_instance']
        ti.xcom_push(key=XCOM_EXECUTION_DATE_ISO, value=dag_run.execution_date.isoformat())
        ti.xcom_push(key=XCOM_RUN_ID, value=dag_run.run_id)

        if self.wait_for_completion:
            # wait for dag to complete
            while True:
                self.log.info(
                    'Waiting for %s on %s to become allowed state %s ...',
                    self.trigger_dag_id,
                    dag_run.execution_date,
                    self.allowed_states,
                )
                time.sleep(self.poke_interval)

                dag_run.refresh_from_db()
                state = dag_run.state
                if state in self.failed_states:
                    raise AirflowException(f"{self.trigger_dag_id} failed with failed states {state}")
                if state in self.allowed_states:
                    self.log.info("%s finished with allowed state %s", self.trigger_dag_id, state)
                    return
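This execute() appears to be from Airflow's TriggerDagRunOperator. A brief usage sketch, assuming the Airflow 2.x import path; dag ids and values are placeholders:

from airflow.operators.trigger_dagrun import TriggerDagRunOperator

trigger = TriggerDagRunOperator(
    task_id="trigger_target",
    trigger_dag_id="target_dag",   # DAG to start
    conf={"message": "hello"},     # becomes dag_run.conf in the target DAG
    reset_dag_run=True,            # on DagRunAlreadyExists: clear and reuse the run
    wait_for_completion=True,      # poll until the run reaches an allowed/failed state
    poke_interval=30,
)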
Example #23
def pollForFiles(**kwargs):
    # Create some local scope variables for use later in proc
    sftpName = kwargs['SFTP_Name']
    sftpConnName = kwargs['SFTP_Connection_Name']
    feedGroups = kwargs['Feed_Groups']
    
    # Connect to SFTP site using provided credentials - should be saved in Connections
    sourceHook = SFTPHook(ftp_conn_id = sftpConnName)

    # Create empty dictionary for storing files that match file masks
    fileMatches = {}

    # Loop through feed locations and their regex for this SFTP site.
    for i in feedGroups:
        fullPath = i['Feed_Group_Location']
        filePattern = i['Feed_Group_Regex']
        feedGroupName = i['Feed_Group_Name']

        try:
            directory = sourceHook.describe_directory(path = fullPath)
            for file in directory.keys():
                if re.match(filePattern, file):
                    fileMatches[os.path.join(fullPath, file)] = directory[file]
        except Exception as e:
            logging.error('Error attempting to poll feed group {} in directory {}'.format(feedGroupName, fullPath))
            raise e

    # If we do not find a file that matches a file mask in any of the directories, exit.
    if not fileMatches:
        return 0

    # If the client does not use trigger files or renaming when placing files on the SFTP
    #   site, we have to resort to polling: wait for a period and then compare the
    #   size/modified time to see whether the files are ready to pull down.
    time.sleep(SLEEP_TIME)

    for j in feedGroups:
        fullPath = j['Feed_Group_Location']
        filePattern = j['Feed_Group_Regex']
        feedGroupName = j['Feed_Group_Name']
        newFileMatches = {}

        try:
            newDirResults = sourceHook.describe_directory(fullPath)
            # Add only the files that match regular expression for this feed group
            for file in newDirResults.keys():
                if re.match(filePattern, file):
                    newFileMatches[os.path.join(fullPath, file)] = newDirResults[file]

            for file in newFileMatches.keys():
                # fullFilePath = os.path.join(fullPath, file)

                if file in fileMatches.keys():
                    if newFileMatches[file]['size'] == fileMatches[file]['size'] and \
                            newFileMatches[file]['modify'] == fileMatches[file]['modify']:
                        
                        readyFile = file + '.ready'
                        
                        # If file hasn't changed size or modified time since first look, set to ready for another process to pick up and transfer.
                        sourceHook.conn.rename(file, readyFile)
                        logging.info('SFTP: {} FeedGroup: {} File: {} is ready.'.format(sftpName, feedGroupName, os.path.basename(file)))
                        
                        triggerConfig = {
                            'SFTP_Name': sftpName, 
                            'SFTP_Connection_Name': sftpConnName,
                            'File_Name': readyFile,
                        }

                        triggerConfig.update(j)
                        
                        trigger_dag(
                            dag_id = 'SingleFileTransferJob',
                            run_id = 'trig_{}'.format(timezone.utcnow().isoformat()),
                            conf = json.dumps(triggerConfig),
                            execution_date = None,
                            replace_microseconds = False
                        )
        except Exception as e:
            logging.error('Error attempting to rename files in feed group {} in directory {}'.format(feedGroupName, fullPath))
            raise e
Example #24
 def trigger_dag(self, dag_id, run_id=None, conf=None, execution_date=None):
     dr = trigger_dag.trigger_dag(dag_id=dag_id,
                                  run_id=run_id,
                                  conf=conf,
                                  execution_date=execution_date)
     return "Created {}".format(dr)
def get_list_mongo_meta(**kwargs):
    meta_base = MongoClient("mongodb://" + globals()["META_MONGO_IP"] + ":" +
                            globals()["MONGO_PORT"] + "/")
    # meta_base = MongoHook(globals()["MONGO_META_CONN_ID"])
    data_to_process_list = meta_base.stats["swift"].find_one(
        {"type": "data_to_process_list"})
    swift_data_list = data_to_process_list["data_to_process"]

    for data_doc in swift_data_list:
        run_id = '%s_%s_%s:%s' % (
            data_doc["swift_user"], data_doc["swift_container"],
            data_doc["swift_id"],
            # assumes `import datetime` (consistent with datetime.date.today() below)
            datetime.datetime.utcnow().replace(microsecond=0).isoformat())
        trigger_dag(dag_id=data_account[data_doc["swift_container"]][
            data_doc["content_type"]],
                    run_id=run_id,
                    conf=data_doc)
        logging.info('triggering dag %s with %s' % (run_id, data_doc))
        meta_base.stats["swift"].find_one_and_update(
            {"type": "data_to_process_list"},
            {"$pop": {
                "data_to_process": -1
            }})
        today = datetime.date.today()
        tomorrow = today + datetime.timedelta(days=1)
        today = datetime.datetime(today.year, today.month, today.day)
        tomorrow = datetime.datetime(tomorrow.year, tomorrow.month,
                                     tomorrow.day)
        if meta_base.stats["data_log"].find_one(
            {"date": {
                "$gte": today,
                "$lt": tomorrow
            }}) is None:
            meta_base.stats["data_log"].insert_one({
                "date":
                today,
                "data_processed": [{
                    "swift_id":
                    data_doc["swift_id"],
                    "swift_container":
                    data_doc["swift_container"],
                    "content_type":
                    data_doc["content_type"]
                }]
            })
        else:
            meta_base.stats["data_log"].find_one_and_update(
                {"date": {
                    "$gte": today,
                    "$lt": tomorrow
                }}, {
                    "$push": {
                        "data_processed": {
                            "swift_id": data_doc["swift_id"],
                            "swift_container": data_doc["swift_container"],
                            "content_type": data_doc["content_type"]
                        }
                    }
                })
        return
    raise AirflowSkipException('No external dags triggered')