Beispiel #1
0
    def get_newest_task_instances(self):
        newest_task_instances_sql = '''
        SELECT dr.dag_id, dr.execution_date, dag_state, task_id, ti.state AS task_state, duration, start_date, end_date FROM (
            SELECT dag_run.dag_id, execution_date, state AS dag_state, ROW_NUMBER() OVER (PARTITION BY dag_run.dag_id ORDER BY execution_date DESC) AS age 
            FROM dag_run
            JOIN dag ON dag.dag_id = dag_run.dag_id AND is_active = 1 AND is_paused = 0) dr 
        JOIN task_instance ti ON ti.dag_id = dr.dag_id AND ti.execution_date = dr.execution_date
        WHERE dr.age = 1'''.replace("\n", "")

        data = self.client.query(newest_task_instances_sql)
        result = {}

        for row in data:
            row['dag_name'] = clean_dag_id(row['dag_id'])
            key = row['dag_name'] + row['task_id']
            if key in result and row[
                    'end_date'] and result[key].end_date > row['end_date']:
                continue  # duplicate with dag old version

            if row['dag_name'] in self.config.get('TECHNICAL_ETLS', set()):
                continue  # task instance from the technical ETL

            result[key] = TaskInstance(**row)

        return list(result.values())
Beispiel #2
0
    def get_tables_graph(self, dag_id, execution_date):
        name_without_version = clean_dag_id(dag_id)
        dag_tables = self.get_tables_by_dag(name_without_version)
        dag_progress = {
            d.task_id: d
            for d in self.airflow.get_dag_tasks(dag_id, execution_date)
        }

        # root node - dag name
        yield GraphVertex(id='main',
                          name=name_without_version,
                          state=self.airflow.get_dag_state(
                              dag_id, execution_date))

        # tables
        for table in dag_tables.values():
            yield GraphVertex(
                id=table.id,
                name=table.name +
                (' ({})'.format(table.period.name) if table.period else ''),
                state=dag_progress[table.task_id].task_state,
                tooltip='Finished at: {}, duration: {}'.format(
                    dag_progress[table.task_id].end_date,
                    dag_progress[table.task_id].duration),
                # workaround for this entire method not being able to reference table managed by other DAG in table.uses
                # see https://github.com/Wikia/discreETLy/issues/22
                parent='main' if table.uses is None
                or table.uses not in dag_tables.keys() else table.uses)
Beispiel #3
0
    def get_history(self, days):
        # used to skip the dag version while sorting in order to get only the most recent ones
        extracted_dag_name_regex = '''SUBSTRING(dag_id FROM 1 FOR (REGEXP_INSTR(dag_id, '_v?[0-9]+.[0-9]+$') - 1))'''

        SQL = f'''
        SELECT dag_id, date, state FROM (
            SELECT dag_id, execution_date as date, state, ROW_NUMBER() OVER (PARTITION BY {extracted_dag_name_regex} ORDER BY execution_date DESC) AS age
            FROM dag_run
        ) t WHERE age <= {days}
        '''
        data = self.client.query(SQL)

        dag_names = set(map(lambda row: clean_dag_id(row['dag_id']), data))

        return {dag_name: reversed([DagRun(**row) for row in data if clean_dag_id(row['dag_id']) == dag_name])
                for dag_name in dag_names}
Beispiel #4
0
    def get_tables_graph(self, dag_id, execution_date):
        name_without_version = clean_dag_id(dag_id)
        dag_tables = self.get_tables_by_dag(name_without_version)
        dag_progess = {
            d.task_id: d
            for d in self.airflow.get_dag_tasks(dag_id, execution_date)
        }

        # root node - dag name
        yield GraphVertex(id='main',
                          name=name_without_version,
                          state=self.airflow.get_dag_state(
                              dag_id, execution_date))

        # tables
        for table in dag_tables:
            yield GraphVertex(
                id=table.id,
                name=table.name +
                (' ({})'.format(table.period.name) if table.period else ''),
                state=dag_progess[table.task_id].task_state,
                tooltip='Finished at: {}, duration: {}'.format(
                    dag_progess[table.task_id].end_date,
                    dag_progess[table.task_id].duration),
                parent=table.get_parent())
Beispiel #5
0
    def get_dags_status(self):
        '''
        For each non-technical DAG returns the name and the state of last run
        :return:
        '''

        latest_dags_status_sql = '''
        SELECT * FROM (
            SELECT dag.dag_id, state, ROW_NUMBER() OVER (PARTITION BY dag.dag_id ORDER BY execution_date DESC) AS age 
            FROM dag_run
            JOIN dag ON dag.dag_id = dag_run.dag_id AND is_active = 1 AND is_paused = 0) dr 
        WHERE age = 1'''.replace("\n", "")
        return [{
            'name': clean_dag_id(dag['dag_id']),
            'state': dag['state']}
            for dag in self.client.query(latest_dags_status_sql)
            if clean_dag_id(dag['dag_id']) not in self.config.get('TECHNICAL_ETLS', set())]
Beispiel #6
0
    def get_history(self, days):
        SQL = f'''
        SELECT dag_id, date, state FROM (
            SELECT dag_id, execution_date as date, state, ROW_NUMBER() OVER (PARTITION BY dag_id ORDER BY execution_date DESC) AS age
            FROM dag_run
        ) t WHERE age <= {days}
        '''
        data = self.client.query(SQL)

        dag_names = set(map(lambda row: clean_dag_id(row['dag_id']), data))

        return {
            dag_name: reversed([
                DagRun(**row) for row in data
                if clean_dag_id(row['dag_id']) == dag_name
            ])
            for dag_name in dag_names
        }
Beispiel #7
0
def etl_details(dag_id, execution_date):
    name_without_version = clean_dag_id(dag_id)
    data = app.table_data_provider.get_tables_graph(dag_id, execution_date)
    return render_template('etl_details.html', name=name_without_version, data=data, raw=json.dumps(
        tree_diagram([{
            'id': v.id,
            'name': v.name,
            'success': v.get_graph_color(),
            'parent': v.parent} for v in data], width=800, height=800, padding=8)
    ))
Beispiel #8
0
    def get_last_successful_tasks(self):
        last_successful_task_end_date = '''
        SELECT dag_id, task_id, max(end_date) as end_date
        FROM task_instance
        WHERE state = "success" AND end_date is not null
        GROUP BY dag_id, task_id
        '''

        data = self.client.query(last_successful_task_end_date)
        result = {}

        for row in data:
            row['dag_name'] = clean_dag_id(row['dag_id'])
            key = row['dag_name'] + row['task_id']
            if key in result and result[key].end_date > row['end_date']:
                continue  # duplicate with dag old version
            result[key] = TaskInstance(**row)

        return list(result.values())