Example #1
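All three snippets below share the same set of imports; a plausible header, assuming Airflow 1.10-era module paths, would be:

import json
import logging
import time

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import Variable
from jinja2 import Template
from psycopg2 import sql

# `constants`, `get_redis_client` and `get_available_page_var` are
# project-local helpers assumed to be defined in the surrounding module.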
def insert_from_copy_table_if_needed(target_db,
                                     table_name=None,
                                     task_instance=None,
                                     run_fetch_task_id=None,
                                     **kwargs):
    """
    Inserts from copy table to target table when all tasks succeed, if target table
    already exists. The rational behind is not doing any modification on target table
    before we make sure fetching from source and insertation is successful.
    """
    insert_from_copy_sql = """
        DELETE FROM {table_name};
        INSERT INTO {table_name}
        SELECT * FROM {copy_table_name};
        DROP TABLE {copy_table_name};
    """
    table_exists = task_instance.xcom_pull(
        task_ids='check-if-table-exists')[0][0]
    fetcher_state = task_instance.xcom_pull(key='state',
                                            task_ids=run_fetch_task_id)
    inserter_state = True
    for index in range(constants.INGEST_TASK_CONCURRENCY):
        inserter_state = (inserter_state and task_instance.xcom_pull(
            key='state',
            task_ids=f'execute-insert-into-{index}',
        ))
    if ((table_exists and table_exists != 'None') and fetcher_state is True
            and inserter_state is True):
        logging.info(f'Inserting from {table_name}_copy table to {table_name}')
        target_db_conn = None
        target_db_cursor = None
        try:
            target_db_conn = PostgresHook(
                postgres_conn_id=target_db).get_conn()
            target_db_cursor = target_db_conn.cursor()
            target_db_cursor.execute(
                insert_from_copy_sql.format(
                    table_name=sql.Identifier(table_name).as_string(
                        target_db_conn),
                    copy_table_name=sql.Identifier(
                        f'{table_name}_copy').as_string(target_db_conn)))
            target_db_conn.commit()

        # TODO: Gotta catch 'em all (narrow this down to the exceptions we expect)
        except Exception as e:
            logging.error(f'Exception: {e}')
            if target_db_conn:
                target_db_conn.rollback()
            raise

        finally:
            if target_db_cursor:
                target_db_cursor.close()
            if target_db_conn:
                target_db_conn.close()

    else:
        logging.info('Target table newly created. No need for a copy table.')
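For context, a minimal sketch of how a callable like this might be wired into a DAG; the DAG id, connection id, table name and task ids below are all hypothetical:

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG('example-ingest', schedule_interval=None,
          start_date=datetime(2020, 1, 1))

insert_from_copy = PythonOperator(
    task_id='insert-from-copy-table-if-needed',
    python_callable=insert_from_copy_table_if_needed,
    op_kwargs={
        'target_db': 'target-db',          # hypothetical connection id
        'table_name': 'my_table',          # hypothetical table name
        'run_fetch_task_id': 'run-fetch',  # hypothetical fetcher task id
    },
    provide_context=True,  # injects task_instance into **kwargs
    dag=dag,
)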
Example #2
def create_target_table(
    target_db,
    table_name=None,
    field_mapping=None,
    task_instance=None,
    **kwargs,
):
    """Create target database table or delete all from it.

    If target table exists, create a copy table as a back up to be used in case of failure.
    If target table doesn't exits, create one.
    """
    create_table_sql = """
        CREATE TABLE {{ table_name }} (
        {% for _, tt_field_name, tt_field_constraints in field_mapping %}
            {{ tt_field_name }} {{ tt_field_constraints }}{{ "," if not loop.last }}
        {% endfor %}
        );
    """

    table_exists = task_instance.xcom_pull(
        task_ids='check-if-table-exists')[0][0]
    target_db_conn = None
    target_db_cursor = None
    try:
        target_db_conn = PostgresHook(postgres_conn_id=target_db).get_conn()
        target_db_cursor = target_db_conn.cursor()

        # If the table already exists in the target database, create a copy
        # table to be used for rollback in case of failure. This is the
        # approach until incremental loads become possible.
        if table_exists and table_exists != 'None':
            table_name = f'{table_name}_copy'

        logging.info(f'Creating a target table {table_name}')
        rendered_create_table_sql = Template(create_table_sql).render(
            table_name=sql.Identifier(table_name).as_string(target_db_conn),
            field_mapping=field_mapping,
        )
        target_db_cursor.execute(rendered_create_table_sql)
        target_db_conn.commit()

    # TODO: Gotta catch 'em all (narrow this down to the exceptions we expect)
    except Exception as e:
        logging.error(f'Exception: {e}')
        if target_db_conn:
            target_db_conn.rollback()
        raise

    finally:
        if target_db_cursor:
            target_db_cursor.close()
        if target_db_conn:
            target_db_conn.close()
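The field_mapping argument is iterated as (source_field, target_field, constraints) triples; a short sketch with hypothetical fields:

field_mapping = [
    # (source field name, target column name, target column definition)
    ('id', 'id', 'INTEGER PRIMARY KEY'),
    ('companyName', 'company_name', 'TEXT NOT NULL'),
]

Rendered against create_table_sql, this produces a CREATE TABLE statement with one column per triple, the trailing comma omitted after the last one by the `loop.last` check.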
Example #3
def execute_insert_into(target_db,
                        table_name=None,
                        run_fetch_task_id=None,
                        field_mapping=None,
                        task_instance=None,
                        **kwargs):
    """Inserts each paginated response data into target database table.
    Polls to find variable hasn't been processed, generates regarding sql statement to
    insert data in, incrementally waits for new variables.
    Success depends on fetcher task completion.
    """

    insert_into_sql = """
        INSERT INTO {{ table_name }} (
        {% for _, tt_field_name, _ in field_mapping %}
            {{ tt_field_name }}{{ "," if not loop.last }}
        {% endfor %}
        )
        VALUES
        {% for record in record_subset %}
        (
            {% for st_field_name, _, _ in field_mapping %}
                {% if not record[st_field_name] or record[st_field_name] == 'None' %}
                    NULL
                {% else %}
                    {{ record[st_field_name] }}
                {% endif %}
                {{ "," if not loop.last }}
            {% endfor %}
        {{ ")," if not loop.last }}
        {% endfor %}
        );
    """
    # Give the fetch task some initial time to get a page and save it into a variable
    time.sleep(3)
    # Used for providing incremental wait
    sleep_time = 5
    number_of_run = 1
    redis_client = get_redis_client()
    table_exists = task_instance.xcom_pull(
        task_ids='check-if-table-exists')[0][0]
    if table_exists and table_exists != 'None':
        table_name = f'{table_name}_copy'
    target_db_conn = None
    target_db_cursor = None
    try:
        target_db_conn = PostgresHook(postgres_conn_id=target_db).get_conn()
        target_db_cursor = target_db_conn.cursor()

        while True:
            var_name = get_available_page_var(redis_client, run_fetch_task_id)
            if var_name:
                logging.info(f'Got the unprocessed var_name {var_name}')
                sleep_time = 5
                var_name = var_name.decode('utf-8')
                try:
                    record_subset = json.loads(Variable.get(var_name))
                except KeyError:
                    logging.info(
                        f'Var {var_name} no longer exists; it was processed '
                        'by another worker. Moving on.'
                    )
                    continue

                escaped_record_subset = []
                for record in record_subset:
                    escaped_record = {}
                    for key, value in record.items():
                        if value and value != 'None':
                            escaped_record[key] = sql.Literal(value).as_string(
                                target_db_conn)
                        else:
                            escaped_record[key] = sql.Literal(None).as_string(
                                target_db_conn)
                    escaped_record_subset.append(escaped_record)

                exec_sql = Template(insert_into_sql).render(
                    table_name=sql.Identifier(table_name).as_string(
                        target_db_conn),
                    field_mapping=field_mapping,
                    record_subset=escaped_record_subset,
                )
                target_db_cursor.execute(exec_sql)
                logging.info(f'Deleting the var_name {var_name}')
                Variable.delete(var_name)
            else:
                # Check whether the fetch task completed successfully. If it
                # has, break out of the loop and commit the transaction,
                # because there are no more pages to process. If it failed,
                # raise an exception so that the transaction is rolled back.
                state = task_instance.xcom_pull(key='state',
                                                task_ids=run_fetch_task_id)
                logging.info(f'Checking the state of fetcher task {state}')
                if state is False:
                    raise Exception('Fetcher task failed!')
                elif state is True:
                    logging.info(
                        'Fetcher task successfully completed and there is no more variable to process.'
                    )
                    break
                else:
                    sleep_time = sleep_time * number_of_run
                    logging.info(
                        f'Sleeping for {sleep_time}s to let the fetcher task '
                        'catch up')
                    time.sleep(sleep_time)
                    number_of_run += 1

        target_db_conn.commit()
        task_instance.xcom_push(key='state', value=True)

    # TODO: Gotta catch 'em all (narrow this down to the exceptions we expect)
    except Exception as e:
        logging.error(f'Exception: {e}')
        if target_db_conn:
            target_db_conn.rollback()
        task_instance.xcom_push(key='state', value=False)
        raise

    finally:
        if target_db_cursor:
            target_db_cursor.close()
        if target_db_conn:
            target_db_conn.close()
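The project-local get_available_page_var helper isn't shown in these examples. One plausible shape, assuming the fetcher task pushes the names of the Airflow variables it creates onto a Redis list keyed by its task id:

def get_available_page_var(redis_client, run_fetch_task_id):
    """Atomically claim the next unprocessed page variable, if any.

    LPOP is atomic, so concurrent insert workers never claim the same
    page. Returns the variable name as bytes, or None when the list is
    empty (matching the .decode('utf-8') call at the call site).
    """
    return redis_client.lpop(run_fetch_task_id)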