def insert_into_staging_table(**kwargs):
    """Read the temp file and insert into Postgres using Python for better error handling."""
    # Connect to the database; this is the connection name configured in Airflow Connections.
    connection_id = "postgres_default"
    dbhook = PostgresHook(postgres_conn_id=connection_id)
    pg_conn = dbhook.get_connection(conn_id=connection_id)
    db_url = "postgresql://{c.login}:{c.password}@{c.host}:{c.port}/{c.schema}".format(
        c=pg_conn
    )
    engine = create_engine(db_url)
    meta = MetaData()
    meta.bind = engine
    logging.info("got db connection")

    # Read the data; `filename` is expected to be defined elsewhere (e.g. at module level).
    df = pd.read_csv(filename)

    # Write to the database; myla311_staging is the target table in the public schema.
    df.to_sql(
        name="myla311_staging",
        schema="public",
        con=meta.bind,
        if_exists="append",
        index=False,
    )
    return "done"
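# A minimal sketch of how the callable above could be wired into a DAG with a
# PythonOperator. The DAG id, task id, and schedule here are assumptions for
# illustration, not part of the original snippet (Airflow 1.x style imports).
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

example_dag = DAG(
    "myla311_staging_example",  # hypothetical DAG id
    start_date=datetime(2020, 1, 1),
    schedule_interval=None,
)

load_staging = PythonOperator(
    task_id="insert_into_staging_table",
    python_callable=insert_into_staging_table,
    provide_context=True,  # Airflow 1.x: pass the task context into **kwargs
    dag=example_dag,
)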
def csv_to_pg(**kwargs):
    conn = PostgresHook.get_connection('etl_postgres')
    engine = create_engine(f"postgresql://{conn.host}/{conn.schema}")
    df = pd.read_csv(f"/tmp/{kwargs['name']}", low_memory=False)
    df.rename(columns=lambda x: clean_cols(x), inplace=True)
    df.to_sql(name='contracts', con=engine, schema='ocp', if_exists='append')
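# clean_cols is referenced above but not defined in the snippet. A minimal sketch of
# what such a column-cleaning helper typically looks like; this exact implementation
# is an assumption, not the original:
def clean_cols(col: str) -> str:
    # lowercase, trim, and replace spaces so the names are safe Postgres identifiers
    return col.strip().lower().replace(' ', '_')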
def execute(self, context):
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    con = redshift.get_connection(self.redshift_conn_id)
    iam_role = con.extra_dejson['aws_iam_role']

    if self.overwrite:
        self.log.info("Clearing stage table")
        redshift.run(f"TRUNCATE TABLE {self.table}")

    self.log.info("Copying data from S3 to stage table")
    s3_path = f"s3://{self.s3_bucket}/{self.s3_key}"
    format_params = {
        'table': self.table,
        's3_path': s3_path,
        'iam_role': iam_role,
    }
    if self.file_format == 'csv':
        format_params.update({
            'ignore_header': self.ignore_header,
            'delimiter': self.delimiter,
            'region': self.region,
        })
    exec_copy_sql = SqlQueries.copy_from_s3[self.file_format].format(**format_params)
    redshift.run(exec_copy_sql)
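# The operator above looks up a COPY template from SqlQueries.copy_from_s3, which is
# not shown in this snippet. One plausible shape for that mapping, assuming a 'csv'
# entry that consumes the format keys built in execute() (a sketch, not the project's
# actual SqlQueries class):
class SqlQueries:
    copy_from_s3 = {
        'csv': """
            COPY {table}
            FROM '{s3_path}'
            IAM_ROLE '{iam_role}'
            REGION '{region}'
            IGNOREHEADER {ignore_header}
            DELIMITER '{delimiter}'
            CSV;
        """,
        'json': """
            COPY {table}
            FROM '{s3_path}'
            IAM_ROLE '{iam_role}'
            FORMAT AS JSON 'auto';
        """,
    }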
def csv_to_pg(**kwargs):
    conn = PostgresHook.get_connection('etl_postgres')
    engine = create_engine(f"postgresql://{conn.host}/{conn.schema}")
    df = pd.read_csv(
        f"/tmp/{kwargs['name']}.csv",
        delimiter='|',
        encoding='latin1',
        low_memory=False,
    )
    df.to_sql(
        f"{kwargs['name'].lower().replace('_', '')}_update",
        con=engine,
        schema='accela',
        if_exists='replace',
    )
def create_connection_object(connection_id: str):
    try:
        pg_hook = PostgresHook(postgres_conn_id=connection_id)
        connection = pg_hook.get_connection(connection_id)
        logging.info("Connection: {0}".format(connection))
        connection_uri = 'postgresql+psycopg2://{c.login}:{c.password}@{c.host}:{c.port}/{c.schema}'.format(
            c=connection)
        return connection_uri
    except Exception as e:
        logging.error("Failed to initialize db connection. {}".format(e))
        logging.error(traceback.format_exc())
def get_connection_dict_from_airflow(conn_id):
    """Look up `conn_id` and return a dictionary usable with `psycopg2.connect`."""
    from airflow.hooks.postgres_hook import PostgresHook

    conn = PostgresHook.get_connection(conn_id)
    return {
        'host': conn.host,
        'dbname': conn.schema,
        'port': conn.port,
        'user': conn.login,
        'password': conn.password,
    }
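# Usage sketch for the helper above; the connection id and query are assumptions for
# illustration:
import psycopg2

params = get_connection_dict_from_airflow('postgres_default')
with psycopg2.connect(**params) as pg_conn:
    with pg_conn.cursor() as cur:
        cur.execute("SELECT 1")
        print(cur.fetchone())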
def connect_database(connection_id):
    logging.info("connect_database called")
    dbhook = PostgresHook(postgres_conn_id=connection_id)
    logging.info("got hook")
    pgconn = dbhook.get_connection(conn_id=connection_id)
    logging.info("got connection")
    db_url = 'postgresql://{c.login}:{c.password}@{c.host}:{c.port}/{c.schema}'.format(
        c=pgconn)
    engine = create_engine(db_url)
    meta = MetaData()
    meta.bind = engine
    return meta
def create_connection_object(connection_id: str):
    """
    :param connection_id: The identifier in the Airflow Connections menu
    :return: A SQLAlchemy connection URI for the connection
    """
    try:
        pg_hook = PostgresHook(postgres_conn_id=connection_id)
        connection = pg_hook.get_connection(connection_id)
        logging.info("Connection: {0}".format(connection))
        # In an Airflow connection, the schema field actually holds the database name.
        connection_uri = 'postgresql+psycopg2://{c.login}:{c.password}@{c.host}:{c.port}/{c.schema}'.format(
            c=connection)
        return connection_uri
    except Exception as e:
        logging.error("Failed to initialize database connection. {}".format(e))
        logging.error(traceback.format_exc())
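# Usage sketch for the URI helper above, assuming a connection id of
# "postgres_default" and a trivial query against the target database:
from sqlalchemy import create_engine
import pandas as pd

engine = create_engine(create_connection_object("postgres_default"))
sample = pd.read_sql("SELECT 1 AS ok", con=engine)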
def get_connection():
    hook = PostgresHook(postgres_conn_id='tsdb')
    conn = hook.get_connection('tsdb')
    # Credentials are currently hard-coded (and masked here); the commented-out
    # alternative below reads them from the Airflow connection instead.
    SQLALCHEMY_DATABASE_URI = 'postgresql://{db_user}:{db_password}@{db_host}:5432/{db_name}'.format(
        db_host='34.69.215.94',
        db_name='snpiao_data',
        db_password='******',
        db_user='******',
    )
    # db_host=conn.host,
    # db_name=conn.schema,
    # db_password=conn.password,
    # db_user=conn.login,
    engine = create_engine(SQLALCHEMY_DATABASE_URI)
    return engine.connect()
def execute(self, context):
    """Create a Redshift cluster using parameters stored in the aws.cfg file."""
    aws_connection = BaseHook.get_connection(self.aws_credentials)
    redshift = PostgresHook(self.conn_id)
    redshift_connection = redshift.get_connection(self.conn_id)

    config = configparser.ConfigParser()
    config.read_file(open('/usr/local/airflow/plugins/operators/aws.cfg'))

    KEY = aws_connection.login
    SECRET = aws_connection.password
    REGION = config.get('AWS', 'REGION')
    VPC_SECURITY_GROUPS = config.get('AWS', 'VPC_SECURITY_GROUPS')
    REDSHIFT_CLUSTER_TYPE = config.get("REDSHIFT", "CLUSTER_TYPE")
    REDSHIFT_NODE_TYPE = config.get("REDSHIFT", "NODE_TYPE")
    REDSHIFT_CLUSTER_IDENTIFIER = config.get("REDSHIFT", "CLUSTER_ID")
    # The Airflow connection's schema field holds the database name.
    REDSHIFT_DB = redshift_connection.schema
    REDSHIFT_DB_USER = redshift_connection.login
    REDSHIFT_DB_PASSWORD = redshift_connection.password
    REDSHIFT_ROLE_NAME = config.get("REDSHIFT", "S3_ROLE")

    redshift = boto3.client(
        'redshift',
        region_name=REGION,
        aws_access_key_id=KEY,
        aws_secret_access_key=SECRET,
    )
    response = redshift.create_cluster(
        ClusterType=REDSHIFT_CLUSTER_TYPE,
        NodeType=REDSHIFT_NODE_TYPE,
        DBName=REDSHIFT_DB,
        ClusterIdentifier=REDSHIFT_CLUSTER_IDENTIFIER,
        MasterUsername=REDSHIFT_DB_USER,
        MasterUserPassword=REDSHIFT_DB_PASSWORD,
        IamRoles=[REDSHIFT_ROLE_NAME],
        VpcSecurityGroupIds=[VPC_SECURITY_GROUPS],
    )
    self.log.info(response)
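    # create_cluster returns before the cluster is usable. A hedged continuation of
    # execute() above (same boto3 client and local variables) that blocks until the
    # cluster reports an available status:
    waiter = redshift.get_waiter('cluster_available')
    waiter.wait(
        ClusterIdentifier=REDSHIFT_CLUSTER_IDENTIFIER,
        WaiterConfig={'Delay': 30, 'MaxAttempts': 40},
    )
    self.log.info("Cluster %s is available", REDSHIFT_CLUSTER_IDENTIFIER)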
dag = DAG(
    "replace_secmar_database",
    default_args=default_args,
    max_active_runs=1,
    concurrency=2,
    catchup=False,
    schedule_interval=None,
)
dag.doc_md = __doc__

tables = SECMAR_TABLES + ["operations_stats", "moyens_snsm"]

template = "sudo -u postgres pg_dump -c --no-owner {tables} {schema} > {output}"
dump_command = template.format(
    schema=PostgresHook.get_connection("postgresql_local").schema,
    output=OUTPUT_PATH,
    tables=" ".join(["-t " + t for t in tables]),
)

dump_local_database = BashOperator(
    task_id="dump_local_database", bash_command=dump_command, dag=dag
)

template = "psql -U {user} -h {host} {schema} < {input}"
target_connection = PostgresHook.get_connection("target_secmar")
import_command = template.format(
    user=target_connection.login,
    host=target_connection.host,
    schema=target_connection.schema,
    input=OUTPUT_PATH,