def insert_into_staging_table(**kwargs):
    """
    reads teh temp file and inserts into postgres using
    python for better error handling.
    """
    # connect to db
    connection_id = "postgres_default"  # the connection ID defined in Airflow Connections
    dbhook = PostgresHook(postgres_conn_id=connection_id)
    pg_conn = dbhook.get_connection(conn_id=connection_id)
    db_url = "postgresql://{c.login}:{c.password}@{c.host}:{c.port}/{c.schema}".format(
        c=pg_conn
    )
    engine = create_engine(db_url)
    meta = MetaData()
    meta.bind = engine
    logging.info("got db connection")
    # read data; `filename` is expected to be defined elsewhere in the module (path to the temp file)
    df = pd.read_csv(filename)
    # write to db. name (myla311_staging) is the table in the schema
    df.to_sql(
        name="myla311_staging",
        schema="public",
        con=meta.bind,
        if_exists="append",
        index=False,
    )
    return "done"
Example #2
def csv_to_pg(**kwargs):
    conn = PostgresHook.get_connection('etl_postgres')
    # SQLAlchemy expects the postgresql:// scheme; no credentials in the URL, so local auth is assumed
    engine = create_engine(f"postgresql://{conn.host}/{conn.schema}")

    df = pd.read_csv(f"/tmp/{kwargs['name']}", low_memory=False)
    df.rename(columns=lambda x: clean_cols(x), inplace=True)
    df.to_sql(name='contracts', con=engine, schema='ocp', if_exists='append')
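
clean_cols is not defined in the snippet; it is presumably a small column-name normalizer applied before loading. A purely illustrative version might look like this:

import re

def clean_cols(col):
    # Hypothetical helper: lowercase, trim, and replace non-alphanumerics with underscores
    # so DataFrame columns map onto Postgres-friendly identifiers.
    return re.sub(r"[^a-z0-9]+", "_", col.strip().lower()).strip("_")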
Example #3
    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        con = redshift.get_connection(self.redshift_conn_id)
        iam_role = con.extra_dejson['aws_iam_role']

        if self.overwrite:
            self.log.info("Clearing stage table")
            redshift.run(f"Truncate Table {self.table}")

        self.log.info("Copying data from s3 to stage table")
        s3_path = f"s3://{self.s3_bucket}/{self.s3_key}"

        format_parms = {
            'table':self.table,
            's3_path':s3_path,
            'iam_role':iam_role
        }
        
        if self.file_format == 'csv':
            format_parms.update({'ignore_header':self.ignore_header,
                                 'delimiter':self.delimiter,
                                 'region':self.region})
        
        exec_copy_sql = SqlQueries.copy_from_s3[self.file_format].format(**format_parms)
        redshift.run(exec_copy_sql)
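
SqlQueries.copy_from_s3 is defined elsewhere in that project; judging from the parameters formatted above, the templates presumably wrap standard Redshift COPY statements, roughly along these lines (a sketch, not the original class):

class SqlQueries:
    # Illustrative only: keyed by file format, formatted with the parameters built in execute().
    copy_from_s3 = {
        'csv': """
            COPY {table}
            FROM '{s3_path}'
            IAM_ROLE '{iam_role}'
            REGION '{region}'
            IGNOREHEADER {ignore_header}
            DELIMITER '{delimiter}'
            CSV
        """,
        'json': """
            COPY {table}
            FROM '{s3_path}'
            IAM_ROLE '{iam_role}'
            FORMAT AS JSON 'auto'
        """,
    }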
Example #4
def csv_to_pg(**kwargs):
    conn = PostgresHook.get_connection('etl_postgres')
    engine = create_engine(f"postgresql://{conn.host}/{conn.schema}")  # postgresql:// scheme, as in Example #2
    df = pd.read_csv(f"/tmp/{kwargs['name']}.csv",
                     delimiter='|',
                     encoding='latin1',
                     low_memory=False)
    df.to_sql(f"{kwargs['name'].lower().replace('_', '')}_update",
              con=engine,
              schema='accela',
              if_exists='replace')
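
Both csv_to_pg variants read the file name from kwargs['name'], so the callable is presumably invoked with op_kwargs. A sketch of that wiring, assuming an existing dag object and an Airflow 1.x-style import; the task id and name value are illustrative:

from airflow.operators.python_operator import PythonOperator

load_accela_update = PythonOperator(
    task_id="csv_to_pg_accela",       # illustrative
    python_callable=csv_to_pg,
    op_kwargs={"name": "PERMITS"},    # becomes kwargs['name'] inside the callable
    dag=dag,
)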
Example #5
def create_connection_object(connection_id: str):

    try:
        pg_hook = PostgresHook(postgres_conn_id=connection_id)
        connection = pg_hook.get_connection(connection_id)
        logging.info("Connection: {0}".format(connection))
        connection_uri = 'postgresql+psycopg2://{c.login}:{c.password}@{c.host}:{c.port}/{c.schema}'.format(c=connection)
        return connection_uri
    except Exception as e:
        logging.error("Failed to initialize db connection. {}".format(e))
        logging.error(traceback.format_exc())
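
The returned URI is meant to be handed to SQLAlchemy; a minimal usage sketch, assuming the same module-level imports as above (connection id and query are illustrative):

uri = create_connection_object("postgres_default")
engine = create_engine(uri)
df = pd.read_sql("SELECT 1 AS ok", con=engine)  # quick round-trip to confirm the connection works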
Example #6
def get_connection_dict_from_airflow(conn_id):
    """Look up `conn_id` and return a dictionary usable with `psycopg2.connect`.
    """
    from airflow.hooks.postgres_hook import PostgresHook
    conn = PostgresHook.get_connection(conn_id)
    return {
        'host': conn.host,
        'dbname': conn.schema,
        'port': conn.port,
        'user': conn.login,
        'password': conn.password
    }
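
As the docstring says, the dictionary maps directly onto psycopg2.connect keyword arguments; a short usage sketch (connection id and query are illustrative):

import psycopg2

params = get_connection_dict_from_airflow("etl_postgres")
with psycopg2.connect(**params) as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT version()")
        print(cur.fetchone())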
Example #7
def connect_database(connection_id):
    logging.info("connect_database called")
    dbhook = PostgresHook(postgres_conn_id=connection_id)
    logging.info("got hook")
    pgconn = dbhook.get_connection(conn_id=connection_id)
    logging.info("got connection")
    #	DATABASE = database_dict
    #	db_url = URL(**DATABASE)
    db_url = 'postgresql://{c.login}:{c.password}@{c.host}:{c.port}/{c.schema}'.format(
        c=pgconn)
    #logging.info ("url: "+db_url)
    engine = create_engine(db_url)
    meta = MetaData()
    meta.bind = engine

    return meta
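
The bound MetaData returned here can be used to reflect existing tables, or its meta.bind passed as the con argument to to_sql as Example #1 does. A reflection sketch in the same SQLAlchemy 1.x style, with an illustrative table name:

from sqlalchemy import Table, select, func

meta = connect_database("postgres_default")
staging = Table("myla311_staging", meta, autoload=True, autoload_with=meta.bind)
with meta.bind.connect() as conn:
    row_count = conn.execute(select([func.count()]).select_from(staging)).scalar()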
Example #8
def create_connection_object(connection_id: str):
    """
    :param connection_id: The identifier in the Airflow Connections menu
    :return:
    """
    try:
        pg_hook = PostgresHook(postgres_conn_id=connection_id)
        connection = pg_hook.get_connection(connection_id)
        logging.info("Connection: {0}".format(connection))
        # In Airflow connections, the `schema` field actually stores the database name
        connection_uri = 'postgresql+psycopg2://{c.login}:{c.password}@{c.host}:{c.port}/{c.schema}'.format(
            c=connection)
        return connection_uri
    except Exception as e:
        logging.error("Failed to initialize database connection. {}".format(e))
        logging.error(traceback.format_exc())
Example #9
def get_connection():
    hook = PostgresHook(postgres_conn_id='tsdb')
    conn = hook.get_connection('tsdb')
    SQLALCHEMY_DATABASE_URI = 'postgresql://{db_user}:{db_password}@{db_host}:5432/{db_name}'.format(
        db_host='34.69.215.94',
        db_name='snpiao_data',
        db_password='******',
        db_user='******',
    )

    # db_host = '34.69.215.94',
    # db_name = 'snpiao_data',
    # db_password = '******',
    # db_user = '******',

    # db_host = conn.host,
    # db_name = conn.schema,
    # db_password = conn.password,
    # db_user = conn.login,

    engine = create_engine(SQLALCHEMY_DATABASE_URI)
    return engine.connect()
Example #10
    def execute(self, context):
        """
        Create a Redshift cluster using parameters stored in the aws.cfg file
        """
        aws_connection = BaseHook.get_connection(self.aws_credentials)
        redshift = PostgresHook(self.conn_id)
        redshift_connection = redshift.get_connection(self.conn_id)

        config = configparser.ConfigParser()
        config.read_file(open('/usr/local/airflow/plugins/operators/aws.cfg'))
        KEY = aws_connection.login
        SECRET = aws_connection.password
        REGION = config.get('AWS', 'REGION')
        VPC_SECUTIRY_GROUPS = config.get('AWS', 'VPC_SECUTIRY_GROUPS')
        REDSHIFT_CLUSTER_TYPE = config.get("REDSHIFT", "CLUSTER_TYPE")
        REDSHIFT_NODE_TYPE = config.get("REDSHIFT", "NODE_TYPE")
        REDSHIFT_CLUSTER_IDENTIFIER = config.get("REDSHIFT", "CLUSTER_ID")
        REDSHIFT_DB = redshift_connection.schema  # the connection's `schema` field holds the database name
        REDSHIFT_DB_USER = redshift_connection.login
        REDSHIFT_DB_PASSWORD = redshift_connection.password
        REDSHIFT_ROLE_NAME = config.get("REDSHIFT", "S3_ROLE")

        redshift = boto3.client('redshift',
                                region_name=REGION,
                                aws_access_key_id=KEY,
                                aws_secret_access_key=SECRET)

        response = redshift.create_cluster(
            ClusterType=REDSHIFT_CLUSTER_TYPE,
            NodeType=REDSHIFT_NODE_TYPE,
            DBName=REDSHIFT_DB,
            ClusterIdentifier=REDSHIFT_CLUSTER_IDENTIFIER,
            MasterUsername=REDSHIFT_DB_USER,
            MasterUserPassword=REDSHIFT_DB_PASSWORD,
            IamRoles=[REDSHIFT_ROLE_NAME],
            VpcSecurityGroupIds=[VPC_SECUTIRY_GROUPS])
        self.log.info(response)
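
The aws.cfg file itself is not shown; based on the config.get calls above, it presumably has this shape (section and key names are taken from the code, including its spelling of VPC_SECUTIRY_GROUPS; all values are placeholders):

[AWS]
REGION = us-west-2
VPC_SECUTIRY_GROUPS = sg-0123456789abcdef0

[REDSHIFT]
CLUSTER_TYPE = multi-node
NODE_TYPE = dc2.large
CLUSTER_ID = example-redshift-cluster
S3_ROLE = arn:aws:iam::123456789012:role/example-redshift-s3-role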
dag = DAG(
    "replace_secmar_database",
    default_args=default_args,
    max_active_runs=1,
    concurrency=2,
    catchup=False,
    schedule_interval=None,
)
dag.doc_md = __doc__

tables = SECMAR_TABLES + ["operations_stats", "moyens_snsm"]

template = "sudo -u postgres pg_dump -c --no-owner {tables} {schema} > {output}"
dump_command = template.format(
    schema=PostgresHook.get_connection("postgresql_local").schema,
    output=OUTPUT_PATH,
    tables=" ".join(["-t " + t for t in tables]),
)

dump_local_database = BashOperator(
    task_id="dump_local_database", bash_command=dump_command, dag=dag
)

template = "psql -U {user} -h {host} {schema} < {input}"
target_connection = PostgresHook.get_connection("target_secmar")
import_command = template.format(
    user=target_connection.login,
    host=target_connection.host,
    schema=target_connection.schema,
    input=OUTPUT_PATH,