Example #1
def update_datahub_contact_consent(
    target_db: str,
    table: sqlalchemy.Table,
    **kwargs,
):
    """
    Updates Contacts temp table with email marketing consent data from Consent dataset.
    """
    table = get_temp_table(table, kwargs['ts_nodash'])
    update_consent_query = f"""
        UPDATE {table.schema}.{table.name} AS contacts_temp
        SET email_marketing_consent = consent.email_marketing_consent
        FROM {ConsentPipeline.fq_table_name()} AS consent
        WHERE lower(contacts_temp.email) = lower(consent.email)
    """
    engine = sqlalchemy.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
        echo=config.DEBUG,
    )
    with engine.begin() as conn:
        conn.execute(sqlalchemy.text(update_consent_query))

    logger.info(
        'Updated Contacts temp table with email consent from Consent dataset')
Example #2
def _get_csvfile_for_each_period(data_for_each_period):
    for zip_bytes in data_for_each_period:
        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
            name = archive.namelist()[0]
            logger.info('Opening csv file %s in zip', name)
            with archive.open(name, "r") as file:
                yield file
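Note that each yielded file handle is only valid until the generator is advanced again, because the enclosing zipfile context manager closes the archive at that point. A minimal consumption sketch, assuming data_for_each_period is an iterable of raw zip bytes and process is a hypothetical row handler:

import codecs
import csv

for csv_file in _get_csvfile_for_each_period(data_for_each_period):
    # Read each file fully before the outer loop advances the generator;
    # once it advances, the underlying archive is already closed.
    for row in csv.DictReader(codecs.iterdecode(csv_file, 'utf-8')):
        process(row)  # hypothetical row handler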
Example #3
def fetch_apple_mobility_data(
    table_name: str,
    base_url: str,
    config_path: str,
    df_transform: Callable[[pd.DataFrame], pd.DataFrame],
    page_size: int = 1000,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs['ts_nodash'])

    api_config = requests.get(base_url + config_path).json()
    source_url = (
        base_url + api_config['basePath'] + api_config['regions']['en-us']['csvPath']
    )
    logger.info(f'Fetching csv from {source_url}')
    response = requests.get(source_url)
    df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    df = df_transform(df)

    page = 1
    for i in range((len(df) // page_size) + 1):
        results = df.iloc[page_size * i : page_size * (i + 1)].to_json(
            orient="records", date_format="iso"
        )
        s3.write_key(f"{page:010}.json", results, jsonify=False)
        page += 1

    logger.info('Fetching from source completed')
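The df_transform callable gives each pipeline a chance to reshape the raw CSV before it is paginated into S3. A minimal sketch of such a callable, assuming the wide per-date columns of the Apple mobility CSV are melted into rows (the column names below are illustrative, not taken from the source):

import pandas as pd

def transform_apple_mobility(df: pd.DataFrame) -> pd.DataFrame:
    # Illustrative only: keep the identifying columns and melt the
    # remaining per-date columns into (date, value) rows.
    id_columns = ['geo_type', 'region', 'transportation_type']  # assumed names
    return df.melt(id_vars=id_columns, var_name='date', value_name='value')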
Example #4
def fetch_from_hosted_csv(
    table_name: str,
    source_url: str,
    page_size: int = 1000,
    allow_empty_strings: bool = True,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    results = []
    page = 1
    with closing(requests.get(source_url, stream=True)) as request:
        reader = csv.DictReader(
            codecs.iterdecode(request.iter_lines(), 'utf-8'))
        for row in reader:
            if not allow_empty_strings:
                row = {k: v if v != '' else None
                       for k, v in row.items()}  # type: ignore
            results.append(row)
            if len(results) >= page_size:
                s3.write_key(f"{page:010}.json", results)
                results = []
                page += 1
        if results:
            s3.write_key(f"{page:010}.json", results)

    logger.info("Fetching from source completed")
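A sketch of how a fetch task like this might be wired into an Airflow 1.x DAG; the dag object, table name and URL are placeholders:

from airflow.operators.python_operator import PythonOperator

fetch_task = PythonOperator(
    task_id='fetch-from-hosted-csv',
    python_callable=fetch_from_hosted_csv,
    op_kwargs={
        'table_name': 'example_table',                 # placeholder
        'source_url': 'https://example.com/data.csv',  # placeholder
    },
    provide_context=True,  # Airflow 1.x: passes ts_nodash etc. in **kwargs
    dag=dag,
)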
Example #5
def _check_table(engine, conn, temp: sa.Table, target: sa.Table,
                 allow_null_columns: bool):
    logger.info("Checking %s", temp.name)

    if engine.dialect.has_table(conn, target.name, schema=target.schema):
        logger.info("Checking record counts")
        temp_count = conn.execute(
            sa.select([sa.func.count()]).select_from(temp)).fetchone()[0]
        target_count = conn.execute(
            sa.select([sa.func.count()]).select_from(target)).fetchone()[0]

        logger.info("Current records count %s, new import count %s",
                    target_count, temp_count)

        if target_count > 0 and temp_count / target_count < 0.9:
            raise MissingDataError(
                "New record count is less than 90% of current data")

    logger.info("Checking for empty columns")
    for col in temp.columns:
        row = conn.execute(
            sa.select([temp]).select_from(temp).where(
                col.isnot(None)).limit(1)).fetchone()
        if row is None:
            error = f"Column {col} only contains NULL values"
            if allow_null_columns or config.ALLOW_NULL_DATASET_COLUMNS:
                logger.warning(error)
            else:
                raise UnusedColumnError(error)
    logger.info("All columns are used")
Example #6
def send_dataset_update_emails(update_emails_data_environment_variable):
    if update_emails_data_environment_variable not in os.environ:
        raise ValueError(
            f"Could not find data in environment for `{update_emails_data_environment_variable}`"
        )

    dataset_info = json.loads(
        os.environ[update_emails_data_environment_variable])

    dataset_url = dataset_info['dataset_url']
    dataset_name = dataset_info['dataset_name']
    emails = dataset_info['emails']

    client = NotificationsAPIClient(os.environ['NOTIFY_API_KEY'])

    logger.info(
        f"Sending `dataset updated` emails to subscribers for "
        f"this pipeline (`{update_emails_data_environment_variable}`).")
    for email in emails:
        client.send_email_notification(
            email_address=email,
            template_id=os.environ['NOTIFY_TEMPLATE_ID__DATASET_UPDATED'],
            personalisation={
                "dataset_name": dataset_name,
                "dataset_url": dataset_url
            },
        )
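The function expects the named environment variable to hold a JSON document with dataset_url, dataset_name and emails keys. A sketch of providing it for a local run, with placeholder values (NOTIFY_API_KEY and NOTIFY_TEMPLATE_ID__DATASET_UPDATED must also be set):

import json
import os

os.environ['EXAMPLE_PIPELINE_UPDATE_EMAILS'] = json.dumps({
    'dataset_url': 'https://example.com/datasets/example',  # placeholder
    'dataset_name': 'Example dataset',                       # placeholder
    'emails': ['analyst@example.com'],                       # placeholder
})

send_dataset_update_emails('EXAMPLE_PIPELINE_UPDATE_EMAILS')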
Example #7
def _download(source_url, params=()):
    logger.info(
        'Downloading %s %s',
        source_url,
        [(key, value) for (key, value) in params if key != 'token'],
    )
    response = requests.get(source_url, stream=True, params=params)
    response.raise_for_status()
    return response
Example #8
def _download(source_url):
    logger.info('Downloading %s', source_url)
    response = requests.get(source_url)

    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        logger.error("Request failed: %s", response.text)
        raise

    return response.content
Example #9
def _hawk_api_request(
    url: str,
    credentials: dict,
    results_key: Optional[str],
    next_key: Optional[str],
    validate_response: Optional[bool] = True,
    force_http: Optional[bool] = False,
):
    sender = Sender(
        credentials,
        # Currently data workspace denies hawk requests signed with https urls.
        # Once fixed the protocol replacement can be removed.
        url.replace('https', 'http') if force_http else url,
        "get",
        content="",
        content_type="",
        always_hash_content=True,
    )

    logger.info(f"Fetching page {url}")
    response = requests.get(
        url,
        headers={
            "Authorization": sender.request_header,
            "Content-Type": ""
        },
        timeout=300,
    )

    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        logger.warning(f"Request failed: {response.text}")
        raise

    if validate_response:
        try:
            sender.accept_response(
                response.headers["Server-Authorization"],
                content=response.content,
                content_type=response.headers["Content-Type"],
            )
        except HawkFail as e:
            logger.error(f"HAWK Authentication failed {str(e)}")
            raise

    response_json = response.json()

    if (next_key and next_key not in response_json) or (
            results_key and results_key not in response_json):
        raise ValueError("Unexpected response structure")

    return response_json
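The Sender and HawkFail names suggest the mohawk library, which expects Hawk credentials as a dict with id, key and algorithm entries. A hedged usage sketch with placeholder values:

credentials = {
    'id': 'data-flow',         # placeholder Hawk client id
    'key': 'secret-hawk-key',  # placeholder shared secret
    'algorithm': 'sha256',
}

data = _hawk_api_request(
    'http://example.com/api/v1/records',  # placeholder URL
    credentials=credentials,
    results_key='results',
    next_key='next',
)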
Example #10
def create_views(
    target_db: str,
    schema_name: str,
    table_name: str,
    **kwargs,
):
    """
    Create views for available publication dates
    """
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )
    with engine.begin() as conn:
        fq_table_name = f'"{schema_name}"."{table_name}"'
        result_set = conn.execute(f"""
                select
                    distinct publication_date
                from {fq_table_name}
            """)
        rows = result_set.fetchall()
        if rows:
            dates = [row[0] for row in rows]
            for date in dates:
                postfix = date.strftime("%b%Y").lower()
                fq_view_name = f'"{schema_name}"."{table_name}__{postfix}"'
                logger.info(f'Creating materialized view {fq_view_name}')
                conn.execute(f"""
                    create materialized view if not exists {fq_view_name} as (
                        select
                            *
                        from {fq_table_name}
                        where publication_date = '{date.strftime("%Y-%m-%d")}'
                    )
                """)
            fq_view_name = f'"{schema_name}"."{table_name}__latest"'
            logger.info(f'Creating materialized view {fq_view_name}')
            conn.execute(f"""
                create materialized view if not exists {fq_view_name} as (
                    select
                        *
                    from {fq_table_name}
                    where publication_date = (
                        select
                            max(publication_date)
                        from {fq_table_name}
                    )
                );
                refresh materialized view {fq_view_name}
            """)
Example #11
def cleanup_old_s3_files(*args, **kwargs):
    s3 = S3Hook("DEFAULT_S3")
    bucket = s3.get_bucket(config.S3_IMPORT_DATA_BUCKET)

    current_time = datetime.strptime(kwargs["ts_nodash"], "%Y%m%dT%H%M%S")
    logger.info(
        f"Retention period is {config.S3_RETENTION_PERIOD_DAYS} days before {current_time}"
    )

    pipelines = s3.list_prefixes(
        config.S3_IMPORT_DATA_BUCKET, prefix="import-data/", delimiter="/"
    )

    for pipeline in pipelines:
        run_ids = sorted(
            s3.list_prefixes(
                config.S3_IMPORT_DATA_BUCKET, prefix=pipeline, delimiter="/"
            )
        )

        for run_id in run_ids:
            run_dt = datetime.strptime(run_id.split("/")[-2], "%Y%m%dT%H%M%S")
            if run_id == run_ids[-1]:
                logger.info(
                    f"Keeping {pipeline} run {run_id} ({run_dt}) - always retain the last run."
                )
            elif current_time - run_dt >= timedelta(
                days=config.S3_RETENTION_PERIOD_DAYS
            ):
                logger.info(
                    f"Deleting {pipeline} run {run_id} ({run_dt}) older than retention period"
                )
                bucket.objects.filter(Prefix=run_id).delete()
            else:
                logger.info(f"Keeping {pipeline} run {run_id} ({run_dt})")
Example #12
def get_lines(files):
    for file, source_name in files:
        logger.info('Parsing file %s', file)
        for line in _without_first_and_last(file):
            data = line.strip().decode('utf-8', errors='replace').split("|")
            if min_fields <= len(data) < max_fields:
                yield data + [source_name]
            else:
                logger.warning(
                    "Ignoring row with %s fields, expected between %s and %s: %s",
                    len(data),
                    min_fields,
                    max_fields,
                    line,
                )
Example #13
    def get_file_linked_from(url, filename):
        logger.info('Looking on %s for links to %s', url, filename)
        html = _download(url)
        soup = BeautifulSoup(html, "html.parser")
        links = [link.get('href') for link in soup.find_all('a') if link.get('href')]

        logger.info("Found links %s", links)
        matching_links = [link for link in links if link.endswith(filename)]
        if not matching_links:
            raise Exception(f'Unable to find link to {filename}')
        if len(matching_links) > 1:
            raise Exception(f'Too many links for {filename}')

        return _download(urljoin(url, matching_links[0]))
Example #14
def report_metric_per_model(actual, predict, average_type='binary'):
    precisions = precision_score(actual, predict, average=average_type)
    recalls = recall_score(actual, predict, average=average_type)
    f1 = f1_score(actual, predict, average=average_type)
    accuracy = accuracy_score(actual, predict)
    auc = roc_auc_score(actual, predict)
    logger.info(f"Precision = {precisions}")
    logger.info(f"Recall = {recalls}")
    logger.info(f"f1 = {f1}")
    logger.info(f"Accuracy = {accuracy}")
    logger.info(f"AUC = {auc}")

    return precisions, recalls, f1, accuracy, auc
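A quick sanity check of the metric reporting, assuming the scikit-learn metric functions are imported and a module-level logger exists; with these toy labels every reported metric works out to roughly 0.67:

actual = [0, 1, 1, 0, 1, 0]
predicted = [0, 1, 0, 0, 1, 1]

precision, recall, f1, accuracy, auc = report_metric_per_model(actual, predicted)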
Example #15
def create_csv(
    target_db: str, base_file_name: str, timestamp_output: bool, query: str, **kwargs,
):
    """
    Given a db, view name and a query create a csv file and upload it to s3.
    """
    if timestamp_output:
        file_name = (
            f'{base_file_name}-{kwargs["next_execution_date"].strftime("%Y-%m-%d")}.csv'
        )
    else:
        file_name = f'{base_file_name}.csv'

    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
        echo=config.DEBUG,
    )
    row_count = 0
    run_date = kwargs.get('run_date', kwargs.get('execution_date'))
    with engine.begin() as conn:
        result = conn.execution_options(stream_results=True).execute(
            sa.text(query), run_date=run_date.date()
        )
        with tempfile.NamedTemporaryFile('w', encoding='utf8') as fh:
            writer = csv.writer(fh, quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(result.keys())
            while True:
                chunk = result.fetchmany(1000)
                if not chunk:
                    break
                row_count += len(chunk)
                for row in chunk:
                    writer.writerow(row)
            fh.flush()

            logger.info(f'Wrote {row_count} rows to file {file_name}')

            s3_client = S3Hook('DATA_WORKSPACE_S3')
            s3_output_path = f's3://csv-pipelines/{base_file_name}/{file_name}'
            s3_client.load_file(
                fh.name,
                s3_output_path,
                bucket_name=config.DATA_WORKSPACE_S3_BUCKET,
                replace=True,
            )

            logger.info(f"Uploaded {file_name} to {s3_output_path}")
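The query is executed with a run_date bind parameter available, so the SQL passed in can reference :run_date. A hedged sketch of a direct call (connection id, table and column names are placeholders; inside a DAG the context values would come from Airflow rather than being passed by hand):

from datetime import datetime

example_query = """
    SELECT id, name, updated_on
    FROM example_schema.example_table  -- placeholder
    WHERE updated_on <= :run_date
"""

create_csv(
    target_db='datasets_db',  # placeholder connection id
    base_file_name='example-extract',
    timestamp_output=True,
    query=example_query,
    next_execution_date=datetime(2020, 1, 1),
    run_date=datetime(2020, 1, 1),
)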
Example #16
def drop_swap_tables(target_db: str, *tables, **kwargs):
    """Delete temporary swap dataset DB tables.

    Given a dataset table `table`, deletes any related swap tables
    containing the previous version of the dataset.

    """
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )
    with engine.begin() as conn:
        conn.execute("SET statement_timeout = 600000")
        for table in tables:
            swap_table = get_temp_table(table, kwargs["ts_nodash"] + "_swap")
            logger.info("Removing %s", swap_table.name)
            swap_table.drop(conn, checkfirst=True)
Example #17
def drop_temp_tables(target_db: str, *tables, **kwargs):
    """Delete temporary dataset DB tables.

    Given a dataset table `table`, deletes any related temporary
    tables created during the DAG run.

    """
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )
    with engine.begin() as conn:
        conn.execute("SET statement_timeout = 600000")
        for table in tables:
            temp_table = get_temp_table(table, kwargs["ts_nodash"])
            logger.info("Removing %s", temp_table.name)
            temp_table.drop(conn, checkfirst=True)
Example #18
def _fetch(s3, trade_type, expected_keys):
    years = _get_years(trade_type)

    def paginate(items, num_per_page):
        page = []
        for item in items:
            page.append(item)
            if len(page) == num_per_page:
                yield page
                page = []
        if page:
            yield page

    def file_from_zip(zip_bytes):
        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
            name = archive.namelist()[0]
            logger.info('Opening csv file %s in zip', name)
            with archive.open(name, "r") as file:
                yield from file

    def get_files(trade_type, expected_keys, periods):
        frequency = 'A'
        classification = 'HS' if trade_type == 'C' else 'EB02'
        for period in periods:
            yield file_from_zip(
                _download(
                    f'https://comtrade.un.org/api/get/bulk/{trade_type}/{frequency}/{period}/all/{classification}',
                    params=(('token', config.COMTRADE_TOKEN), ),
                ).content)

    def get_dicts(files):
        for f in files:
            for row in csv.DictReader(codecs.iterdecode(f, 'utf-8-sig')):
                if list(row.keys()) != expected_keys:
                    raise Exception('Unexpected columns {}'.format(row.keys()))
                yield {k: v if v else None for k, v in row.items()}

    files = get_files(trade_type, expected_keys, years)
    result_records = get_dicts(files)
    results_pages = paginate(result_records, 10000)

    for i, page in enumerate(results_pages):
        output_filename = f"{i:010}.json"
        logger.info('Saving file to S3 %s', output_filename)
        s3.write_key(output_filename, page)
Example #19
def _predict(X_to_predict, tokenizer, tags_to_predict, model_path):
    import tensorflow as tf

    logger.info("Start making prediction")

    ids = X_to_predict['id']
    X_to_predict = X_to_predict['sentence']
    text_to_predict = X_to_predict.copy()

    X_to_predict = transform_X(X_to_predict.values, tokenizer)
    Y_test_predict = np.zeros((X_to_predict.shape[0], len(tags_to_predict)))
    Y_test_predict_prob = np.zeros(
        (X_to_predict.shape[0], len(tags_to_predict)))

    for ind, tag_i in enumerate(
        ['_'.join(j.split(' ')) for j in tags_to_predict]):
        logger.info(f"Predicting for tag {ind}, {tag_i}")
        m = tf.keras.models.load_model(model_path + tag_i)
        test_predictions_prob_tag = m.predict(X_to_predict)
        test_predictions_class_tag = (test_predictions_prob_tag >
                                      probability_threshold) + 0
        Y_test_predict_prob[:, ind] = np.concatenate(test_predictions_prob_tag)
        Y_test_predict[:, ind] = np.concatenate(test_predictions_class_tag)

    predict = []
    sentence = []
    predict_prob = []

    for i in np.arange(0, X_to_predict.shape[0]):
        sentence.append(X_to_predict[i])
        predict.append(
            list(
                compress(tags_to_predict,
                         Y_test_predict_prob[i] > probability_threshold)))
        predict_prob.append(dict(zip(tags_to_predict, Y_test_predict_prob[i])))

    prediction_on_data = pd.DataFrame({
        'id': ids,
        'sentence': text_to_predict,
        'prediction': predict,
        'prediction_prob': predict_prob,
    })

    return prediction_on_data
Example #20
def make_prediction(target_db: str, query: str, table_name, **context):

    with TemporaryDirectory() as tempdir:
        os.chdir(tempdir)
        os.mkdir('the_models')
        os.chdir(tempdir + '/the_models')
        logger.info(f"working dir: {os.getcwd()}")

        logger.info("step 1: fetch model")
        tags_to_predict = fetch_model()

        logger.info("step 2: fetch data")
        df = fetch_interaction_data(target_db, query)

        logger.info("step 3: make prediction")
        predictions = predict_tags(df, tags_to_predict)

        logger.info("step 4: write prediction to S3")
        write_prediction(table_name, predictions, context)
Example #21
def cleanup_old_datasets_db_tables(*args, **kwargs):
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=config.DATASETS_DB_NAME).get_conn,
    )

    current_time = datetime.strptime(kwargs["ts_nodash"], "%Y%m%dT%H%M%S")
    logger.info(
        f"Retention period is {config.DB_TEMP_TABLE_RETENTION_PERIOD_DAYS} days before {current_time}"
    )

    with engine.begin() as conn:
        tables = [
            table
            for table in conn.execute(
                '''
SELECT schemaname, tablename
FROM pg_catalog.pg_tables
WHERE schemaname NOT IN ('dataflow', 'information_schema')
AND schemaname NOT LIKE '\\_%%'
AND schemaname NOT LIKE 'pg_%%'
'''
            )
        ]

        for table in tables:
            schema, table_name = table
            table_match = re.match(r"(.*)_(\d{8}t\d{6})(?:_swap)?", table_name)
            if not table_match:
                logger.info(f"Skipping {schema}.{table_name}")
                continue

            table_dt = datetime.strptime(table_match.groups()[1], "%Y%m%dt%H%M%S")

            if current_time - table_dt >= timedelta(
                days=config.DB_TEMP_TABLE_RETENTION_PERIOD_DAYS
            ):
                if table_match.groups()[0] not in [table[1] for table in tables]:
                    logger.warning(
                        f"Main table {table_match.groups()[0]} missing for {schema}.{table_name}, skipping"
                    )
                else:
                    logger.info(
                        f"Deleting temporary table {schema}.{table_name} ({table_dt}) older than retention period"
                    )
                    conn.execute(
                        "DROP TABLE {}.{}".format(
                            engine.dialect.identifier_preparer.quote(schema),
                            engine.dialect.identifier_preparer.quote(table_name),
                        )
                    )
            else:
                logger.info(f"Keeping table {schema}.{table_name}")
Example #22
def update_table(target_db: str, target_table: sa.Table, update_query: str, **kwargs):
    """
    Run a query to update an existing table from a temporary table.
    """
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )
    with engine.begin() as conn:
        from_table = get_temp_table(target_table, kwargs["ts_nodash"])
        logger.info(f'Updating {target_table.name} from {from_table.name}')
        conn.execute(
            update_query.format(
                schema=engine.dialect.identifier_preparer.quote(target_table.schema),
                target_table=engine.dialect.identifier_preparer.quote(
                    target_table.name
                ),
                from_table=engine.dialect.identifier_preparer.quote(from_table.name),
            )
        )
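The update_query is a plain str.format template whose schema, target_table and from_table placeholders are filled with already-quoted identifiers. An illustrative template (the column names are made up; the real queries live alongside each pipeline):

update_query = """
    UPDATE {schema}.{target_table} AS target
    SET email_marketing_consent = source.email_marketing_consent
    FROM {schema}.{from_table} AS source
    WHERE target.id = source.id
"""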
Example #23
def fetch_companies_house_companies(
    table_name: str,
    source_url: str,
    number_of_files: int,
    page_size: int = 10000,
    **kwargs,
):
    """
    Loop through `number_of_files`, build the url, download the zip file,
    extract and write data in batches of `page_size` to s3
    """
    s3 = S3Data(table_name, kwargs['ts_nodash'])
    page = 1
    results = []
    publish_date = datetime(kwargs['next_execution_date'].year,
                            kwargs['next_execution_date'].month,
                            1).strftime('%Y-%m-01')
    for file_num in range(1, number_of_files + 1):
        url = source_url.format(
            file_date=publish_date,
            file_num=file_num,
            num_files=number_of_files,
        )
        logger.info('Fetching zip file from %s', url)
        with zipfile.ZipFile(io.BytesIO(_download(url))) as archive:
            with archive.open(archive.namelist()[0], 'r') as f:
                reader = csv.DictReader(codecs.iterdecode(f, 'utf-8'))
                if reader.fieldnames is not None:
                    reader.fieldnames = [x.strip() for x in reader.fieldnames]
                for row in reader:
                    row['publish_date'] = publish_date
                    results.append(row)
                    if len(results) >= page_size:
                        s3.write_key(f'{page:010}.json', results)
                        results = []
                        page += 1

    if results:
        s3.write_key(f'{page:010}.json', results)

    logger.info('Fetching from source completed')
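The source_url here is a str.format template with file_date, file_num and num_files placeholders. As an illustration only (the exact pattern is an assumption based on how Companies House names its monthly multi-part BasicCompanyData files, so verify it against the download page):

source_url = (
    'http://download.companieshouse.gov.uk/'
    'BasicCompanyData-{file_date}-part{file_num}_{num_files}.zip'  # assumed pattern
)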
Example #24
def fetch_from_gtr_api(table_name: str, resource_type: str, **kwargs):
    source_url = 'https://gtr.ukri.org/gtr/api'

    s3 = S3Data(table_name, kwargs["ts_nodash"])
    page = 1

    while True:
        response = requests.get(
            f'{source_url}/{resource_type}s',
            params={
                'p': page,
                's': 100
            },
            headers={'Accept': 'application/json'},
        )

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            logger.error(f"Request failed: {response.text}")
            raise

        response_json = response.json()
        total_pages = response_json['totalPages']
        total_number_of_results = response_json['totalSize']

        results = response_json[resource_type]

        s3.write_key(f"{page:010}.json", results)

        logger.info(
            f"Fetched {len(results) * page} out of {total_number_of_results} {resource_type} records"
        )

        page += 1
        if page > total_pages:
            break

    logger.info("Fetching from source completed")
Example #25
def branch_on_modified_date(target_db: str, table_config: TableConfig,
                            **context):
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )

    with engine.begin() as conn:
        res = conn.execute(
            """
            SELECT source_data_modified_utc
            FROM dataflow.metadata
            WHERE table_schema = %s and table_name = %s
            """,
            [table_config.schema, table_config.table_name],
        ).fetchall()

        if len(res) == 0:
            return 'continue'
        if len(res) > 1:
            raise AirflowException(
                f"Multiple rows in the dataflow metadata table for {table_config.schema}.{table_config.table_name}"
            )
        if not res[0][0]:
            return 'continue'

        old_modified_utc = res[0][0]

    new_modified_utc = context['task_instance'].xcom_pull(
        task_ids='get-source-modified-date')
    context['task_instance'].xcom_push('source-modified-date-utc',
                                       new_modified_utc)

    logger.info("Old: %s. New: %s", old_modified_utc, new_modified_utc)

    if new_modified_utc > old_modified_utc:
        return 'continue'

    return 'stop'
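Because the callable returns the task ids 'continue' or 'stop', it is meant to sit behind a BranchPythonOperator. A wiring sketch for an Airflow 1.x DAG; the dag object, table_config and downstream tasks are placeholders, and an upstream get-source-modified-date task is assumed for the xcom_pull:

from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator

branch = BranchPythonOperator(
    task_id='branch-on-modified-date',
    python_callable=branch_on_modified_date,
    op_kwargs={'target_db': 'datasets_db', 'table_config': table_config},  # placeholders
    provide_context=True,
    dag=dag,
)
branch >> DummyOperator(task_id='continue', dag=dag)
branch >> DummyOperator(task_id='stop', dag=dag)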
Example #26
def nested_files_from_zip(zip_bytes):
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        for name in archive.namelist():
            if not name.lower().startswith(base_filename):
                # Some sesx16 files seem to contain unrelated data
                logger.info('Skipping file %s', name)
                continue
            logger.info('Opening file in zip %s', name)
            with archive.open(name, "r") as file:
                with zipfile.ZipFile(file) as inner_archive:
                    inner_name = inner_archive.namelist()[0]
                    logger.info('Opening inner file in zip %s', inner_name)
                    with inner_archive.open(inner_name, "r") as inner_file:
                        logger.info('Opened inner file in zip %s', inner_name)
                        yield inner_file, inner_name
Example #27
def scrape_load_and_check_data(
    target_db: str,
    table_config: TableConfig,
    pipeline_instance: "_PandasPipelineWithPollingSupport",
    **kwargs,
):
    create_temp_tables(target_db, *table_config.tables, **kwargs)

    temp_table = get_temp_table(table_config.table, suffix=kwargs['ts_nodash'])

    data_frames = pipeline_instance.__class__.data_getter()

    parsed_uri = urlparse(os.environ['AIRFLOW_CONN_DATASETS_DB'])
    host, port, dbname, user, password = (
        parsed_uri.hostname,
        parsed_uri.port or 5432,
        parsed_uri.path.strip('/'),
        parsed_uri.username,
        parsed_uri.password,
    )
    # Psycopg3 is still under active development, but crucially has support for generating data and pushing it to
    # postgres efficiently via `cursor.copy` and the COPY protocol.
    with psycopg3.connect(
            f'host={host} port={port} dbname={dbname} user={user} password={password}'
    ) as connection:
        with connection.cursor() as cursor:
            logger.info("Starting streaming copy to DB")

            records_num = 0
            df_num = 0
            with cursor.copy(
                    f'COPY "{temp_table.schema}"."{temp_table.name}" FROM STDIN'
            ) as copy:
                for data_frame in data_frames:
                    df_num += 1
                    df_len = len(data_frame)
                    records_num += df_len

                    logger.info(
                        "Copying data frame #%s (records %s - %s)",
                        df_num,
                        records_num - df_len,
                        records_num,
                    )
                    copy.write(
                        data_frame.to_csv(
                            index=False,
                            header=False,
                            sep='\t',
                            na_rep=r'\N',
                            columns=[
                                data_column for data_column, sa_column in
                                table_config.columns
                            ],
                        ))
                    del data_frame

            logger.info("Copy complete.")
Example #28
def create_temp_tables(target_db: str, *tables: sa.Table, **kwargs):
    """
    Create a temporary table for the current DAG run for each of the given dataset
    tables.


    Table names are unique for each DAG run and use target table name as a prefix
    and current DAG execution timestamp as a suffix.
    """

    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )

    with engine.begin() as conn:
        conn.execute("SET statement_timeout = 600000")
        for table in tables:
            table = get_temp_table(table, kwargs["ts_nodash"])
            logger.info("Creating schema %s if not exists", table.schema)
            conn.execute(f"CREATE SCHEMA IF NOT EXISTS {table.schema}")
            logger.info("Creating %s", table.name)
            table.create(conn, checkfirst=True)
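get_temp_table itself is not shown in these examples. A minimal sketch consistent with how it is used here, copying the target table's columns under a suffixed name in the same schema (this is an assumption, not the project's actual implementation):

import sqlalchemy as sa

def get_temp_table(table: sa.Table, suffix: str) -> sa.Table:
    # Sketch only: a copy of `table` named <table>_<suffix> in the same schema.
    return sa.Table(
        f"{table.name}_{suffix}".lower(),
        table.metadata,
        *[column.copy() for column in table.columns],
        schema=table.schema,
    )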
Example #29
def fetch_from_hawk_api(
    table_name: str,
    source_url: str,
    hawk_credentials: dict,
    results_key: str = "results",
    next_key: Optional[str] = "next",
    validate_response: Optional[bool] = True,
    force_http: Optional[bool] = False,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    total_records = 0
    page = 1

    while True:
        data = _hawk_api_request(
            source_url,
            credentials=hawk_credentials,
            results_key=results_key,
            next_key=next_key,
            validate_response=validate_response,
            force_http=force_http,
        )

        results = get_nested_key(data, results_key)
        s3.write_key(f"{page:010}.json", results)

        total_records += len(results)
        logger.info(f"Fetched {total_records} records")

        source_url = get_nested_key(data, next_key) if next_key else None
        if not source_url:
            break

        page += 1

    logger.info("Fetching from source completed")
Example #30
def fetch_mapped_hosted_csvs(
    table_name: str,
    source_urls: Dict[str, str],
    df_transform: Callable[[pd.DataFrame], pd.DataFrame],
    page_size: int = 10000,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs["ts_nodash"])

    page = 1
    for type_, source_url in source_urls.items():
        logger.info(f"Fetching {source_url}")
        response = requests.get(source_url)
        df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
        df = df_transform(df)
        df["source_url_key"] = type_

        for i in range((len(df) // page_size) + 1):
            results = df.iloc[page_size * i:page_size * (i + 1)].to_json(
                orient="records", date_format="iso")
            s3.write_key(f"{page:010}.json", results, jsonify=False)
            page += 1

    logger.info("Fetching from source completed")