Example #1
import time
import tarfile
import boto3
import botocore
from botocore.exceptions import ClientError
import uuid
import json
from pprint import pformat
from configparser import ConfigParser
from shutil import copyfile
import site
import yaetos.etl_utils as eu
from yaetos.git_utils import Git_Config_Manager
from yaetos.logger import setup_logging

logger = setup_logging('Deploy')


class DeployPySparkScriptOnAws(object):
    """
    Programmatically deploy a local PySpark script on an AWS cluster
    """
    SCRIPTS = 'yaetos/scripts/'  # TODO: move to etl_utils.py
    TMP = 'tmp/files_to_ship/'

    def __init__(self, deploy_args, app_args):

        logger.info("etl deploy_args: \n{}".format(pformat(deploy_args)))
        logger.info("etl app_args: \n{}".format(pformat(app_args)))
        aws_setup = deploy_args['aws_setup']
        config = ConfigParser()
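
A minimal sketch of how the class above might be instantiated; since the snippet stops early in the constructor, every key and value below other than 'aws_setup' is hypothetical.

deploy_args = {'aws_setup': 'dev'}                         # placeholder config section name
app_args = {'job_name': 'examples/some_job.py'}            # hypothetical key/value
deployer = DeployPySparkScriptOnAws(deploy_args, app_args)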
Example #2
"""Helper functions for kafka.
"""

import json
import jsonschema
import kafka
import requests
from yaetos.logger import setup_logging
logger = setup_logging('Kafka_push')


class KafkaProducer(object):
    def __init__(self,
                 broker_address,
                 topic,
                 send_timeout,
                 check_schema=False,
                 schema_uri=None,
                 connect_kafka=True):
        # TODO: validate the schema once up front so that send() only validates individual messages, instead of re-validating the schema for every record.
        self.__send_timeout = send_timeout
        self.__TOPIC = topic
        self.check_schema = check_schema
        if check_schema:
            self.schema_init(schema_uri)
        if connect_kafka:
            self.__producer = kafka.KafkaProducer(
                bootstrap_servers=[broker_address])
            logger.info("Producer connection started")

    def schema_init(self, schema_uri):
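
A minimal usage sketch for the producer above; the broker address and topic are placeholders, and connect_kafka=False can be used to build the object without a live broker (e.g. in tests).

producer = KafkaProducer(
    broker_address='localhost:9092',  # placeholder broker
    topic='some_topic',               # placeholder topic
    send_timeout=10,                  # seconds
    check_schema=False,
    connect_kafka=True)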
Example #3
"""Helper functions for oracle."""

from libs.python_db_connectors.query_oracle import connect
import numpy as np
from yaetos.logger import setup_logging
logger = setup_logging('Oracle')


def create_table(df, connection_profile, name_tb, types, creds_or_file,
                 is_incremental):
    """
    Creates table in oracle, full drop or incremental drop.
    types should be of sqlalchemy type. Ex: types.Date(), types.Integer()
    """
    user = creds.get(connection_profile, 'user')
    assert schema == user
    if_exist = 'replace' if not is_incremental else 'append'
    connection = connect(db=connection_profile,
                         connection_type='sqlalchemy',
                         creds_or_file=creds_or_file)
    chunksize = 500000
    logger.info(
        'Sending table "{}" to oracle, mode "{}", size "{}", and chunksize "{}".'
        .format(name_tb, if_exist, len(df), chunksize))
    df.to_sql(
        name=name_tb,
        con=connection,
        if_exists=if_exist,
        dtype=types,
        index=False,
        chunksize=chunksize)
Example #4
"""Helper functions for redshift."""

from libs.python_db_connectors.query_redshift import connect
import numpy as np
from yaetos.logger import setup_logging
logger = setup_logging('Redshift')


def create_table(df, connection_profile, name_tb, schema, types, creds_or_file,
                 is_incremental):
    """
    Creates table in redshift, full drop or incremental drop. Functional but very very slow.
    types should be of sqlalchemy type. Ex: types.Date(), types.Integer()
    """
    if_exist = 'replace' if not is_incremental else 'append'
    connection = connect(db=connection_profile, creds_or_file=creds_or_file)
    chunksize = 500000
    logger.info(
        'Sending table "{}" to redshift in schema "{}", mode "{}", size "{}", and chunksize "{}".'
        .format(name_tb, schema, if_exist, len(df), chunksize))
    df.to_sql(name=name_tb,
              schema=schema,
              con=connection,
              if_exists=if_exist,
              dtype=types,
              index=False,
              chunksize=chunksize)
    # TODO: check df.to_sql above for long integers. Noticed long numbers were rounded.
    logger.info(
        "Copied table to redshift '{}.{}', using connection profile '{}'".
        format(schema, name_tb, connection_profile))
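
A minimal usage sketch for the create_table helpers above (the Oracle variant in Example #3 takes the same arguments minus schema); the profile name, credentials file and column types are placeholders.

import pandas as pd
from sqlalchemy import types

df = pd.DataFrame({'id': [1, 2], 'created_at': ['2021-01-01', '2021-01-02']})
create_table(df,
             connection_profile='some_redshift_profile',  # placeholder profile name
             name_tb='my_table',
             schema='public',
             types={'id': types.Integer(), 'created_at': types.Date()},
             creds_or_file='conf/connections.cfg',        # placeholder credentials file
             is_incremental=False)                        # True appends instead of replacing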
Example #5
import pandas as pd
from yaetos.etl_utils import Path_Handler
from yaetos.db_utils import pdf_to_sdf
from yaetos.logger import setup_logging
logger = setup_logging('Job')


def load_excel(jargs, input_name, output_types, sc, sc_sql, **xls_args):

    path = jargs.inputs[input_name]['path']
    path = path.replace('s3://',
                        's3a://') if jargs.mode == 'dev_local' else path
    logger.info("Input '{}' to be loaded from files '{}'.".format(
        input_name, path))
    path = Path_Handler(path, jargs.base_path).expand_later(jargs.storage)
    logger.info("Input '{}' loaded from files '{}'.".format(input_name, path))

    pdf = pd.read_excel(io=path, engine='openpyxl', **xls_args)
    sdf = pdf_to_sdf(pdf, output_types, sc, sc_sql)
    return sdf
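
A minimal sketch of calling load_excel from inside a yaetos job, assuming jargs, sc and sc_sql are provided by the framework; the input name, column types and sheet name are placeholders (the output_types layout follows the db_utils conventions shown in Example #7 below).

from sqlalchemy import types as db_types

output_types = {'id': db_types.Integer(), 'name': db_types.String()}  # hypothetical columns
sdf = load_excel(jargs, 'some_excel_input', output_types, sc, sc_sql,
                 sheet_name='Sheet1')  # extra kwargs are passed through to pd.read_excel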
Example #6
"""Helper functions for clickhouse. Using postgres connector."""

from yaetos.logger import setup_logging
logger = setup_logging('Clickhouse')


def create_table(df, connection_profile, name_tb, schema, creds_or_file,
                 is_incremental):
    """
    Creates table in Clickhouse, full drop or incremental drop, using spark connector. Implies pushing data to S3 first.
    """
    load_type = 'overwrite' if not is_incremental else 'append'
    db = creds_or_file[connection_profile]
    url = 'jdbc:postgresql://{host}/{service}'.format(host=db['host'],
                                                      service=db['service'])
    dbtable = '{}.{}'.format(schema, name_tb)

    logger.info(
        'Sending table "{}" to clickhouse in schema "{}", load type "{}", size "{}".'
        .format(name_tb, schema, load_type, df.count()))

    df.write \
        .format('jdbc') \
        .option('driver', "org.postgresql.Driver") \
        .option("url", url) \
        .option("user", db['user']) \
        .option("password", db['password']) \
        .option("dbtable", dbtable)\
        .mode(load_type) \
        .save()
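
A minimal usage sketch, assuming sdf is a Spark DataFrame and that the credentials dictionary is laid out the way the function reads it; every value below is a placeholder.

creds = {'some_clickhouse_profile': {'host': 'clickhouse.example.com',
                                     'service': 'default',
                                     'user': 'loader',
                                     'password': '...'}}
create_table(sdf, 'some_clickhouse_profile', 'my_table', 'analytics',
             creds_or_file=creds, is_incremental=False)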
Example #7
"""Helper functions for databases, based on sqlalchemy, mostly for oracle for now.
Some code to be run in worker nodes, so can't rely on libraries only on master (ex db connections)."""

import pandas as pd
from sqlalchemy import types as db_types
from pyspark.sql import types as spk_types
from datetime import datetime, date
from yaetos.logger import setup_logging
logger = setup_logging('DB_Utils')


def cast_rec(rec, output_types):
    new_rec = {}
    for field in output_types.keys():
        new_rec[field] = cast_value(rec[field], output_types[field], field)
    return new_rec


def cast_value(value, required_type, field_name):
    # TODO: clean this up, or avoid using pandas so this casting isn't required.
    try:
        if isinstance(required_type, type(db_types.DATE())):
            if isinstance(value, str):
                return datetime.strptime(value,
                                         "%Y-%m-%d")  # assuming iso format
            elif isinstance(value, pd.Timestamp):  # == datetime
                return value.to_pydatetime().date()
            elif isinstance(value, date):
                return value
            elif pd.isnull(value):
                return None
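
A quick illustration of cast_rec against the date branch shown above; the record and type mapping are hypothetical.

from sqlalchemy import types as db_types

rec = {'order_date': '2021-03-01'}             # e.g. one element of df.to_dict('records')
output_types = {'order_date': db_types.DATE()}
cast_rec(rec, output_types)                    # -> {'order_date': datetime.datetime(2021, 3, 1, 0, 0)}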