def sql_task(execution_date, input_file, **kwargs):
    if os.path.exists(os.path.join(reports_folder, input_file)):
        pg_hook = PostgresHook("postgres")
        pg_cursor = pg_hook.get_cursor()
        with open(os.path.join(reports_folder, input_file)) as sql_file:
            sql = sql_file.read()
        pg_cursor.execute(sql)
        result = pg_cursor.fetchall()
        with TemporaryDirectory() as tempdir:
            with open(os.path.join(tempdir, input_file + "-report.csv"), "w") as out_file:
                wr = csv.writer(out_file, quoting=csv.QUOTE_ALL)
                wr.writerows(result)
            copy_to_report_path(
                os.path.join(tempdir, input_file + "-report.csv"),
                report_path(execution_date),
            )
    else:
        raise FileNotFoundError(input_file)
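# A minimal sketch of how sql_task above might be registered in a DAG
# (Airflow 1.x style, matching the hook usage above). The DAG object `dag`,
# the task id and the report file name are assumptions for illustration only.
from airflow.operators.python_operator import PythonOperator

run_sql_report = PythonOperator(
    task_id="run_sql_report",                      # hypothetical task id
    python_callable=sql_task,
    op_kwargs={"input_file": "daily_report.sql"},  # hypothetical report file
    provide_context=True,  # passes execution_date and other context as kwargs
    dag=dag,               # assumes a DAG defined elsewhere in the module
)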
def execute(self, context):
    hook = PostgresHook(
        postgres_conn_id=self.postgres_conn_id, schema=self.database
    )

    # Start a list to hold the tables to drop or truncate
    table_drops = [self.table_name]

    # Find the cross-tables for n-m relations, we assume they have
    # a name that starts with f"{table_name}_"
    with hook.get_cursor() as cursor:
        cursor.execute(
            """
                SELECT tablename FROM pg_tables
                WHERE schemaname = 'public' AND tablename like %s
            """,
            (f"{self.table_name}_%",),
        )
        cross_tables = [row["tablename"] for row in cursor.fetchall()]

    # Define the SQL to execute by the super class.
    # This supports executing multiple statements in a single transaction:
    init_operation = "DROP TABLE IF EXISTS" if self.drop_table else "TRUNCATE TABLE"
    self.sql = [
        f"{init_operation} {table_name} CASCADE"
        for table_name in table_drops + cross_tables
    ]

    return super().execute(context)
def execute(self, context=None):
    """Look up the geometry where no geometry is present."""
    # get location data without geometry
    rows = self.get_non_geometry_records()

    # get BAG verblijfsobject ID from typeahead
    for record in rows:
        typeahead_result = self.prepare_typeahead_call(record)

        record_key = None
        bag_url = None
        bag_id = None
        for key, value in typeahead_result.items():
            record_key = key
            bag_url = value

        # extract the BAG id from the url, which is the last
        # series of numbers before the last forward-slash
        try:
            get_uri = urlparse(bag_url)
            if not isinstance(get_uri, ParseResult):
                self.log.info(f"No BAG id found for {record}")
                continue
            else:
                bag_id = get_uri.path.rsplit("/")[-2]
                self.log.info(f"BAG id found for {record_key}: {bag_id}")
        except AttributeError:
            self.log.error(
                f"No BAG id found for {record_key} {bag_id} {bag_url}, empty result..."
            )
            continue

        pg_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        with pg_hook.get_cursor() as cursor:
            # update record with found geometry
            cursor.execute(
                f"""
                WITH BAG_VBO_GEOM AS (
                    SELECT geometrie
                    FROM public.bag_verblijfsobjecten
                    WHERE 1=1
                    AND identificatie = %s
                )
                UPDATE {self.source_table}
                SET {self.geometry_column} = BAG_VBO_GEOM.geometrie
                FROM BAG_VBO_GEOM
                WHERE 1=1
                AND {self.source_key_column} = %s;
                COMMIT;
                """,
                (
                    bag_id,
                    record_key,
                ),
            )
def execute(self, context):
    hook = PostgresHook(postgres_conn_id=self.postgres_conn_id, schema=self.database)

    # Start a list to hold copy information
    table_copies = [
        (
            self.source_table_name,
            self.target_table_name,
        )
    ]

    # Find the cross-tables for n-m relations, we assume they have
    # a name that starts with f"{source_table_name}_"
    with hook.get_cursor() as cursor:
        # the underscore must be escaped because of its special meaning in a LIKE;
        # the exclamation mark is used as an escape character because
        # a backslash was not interpreted as an escape
        cursor.execute(
            """
                SELECT tablename AS name FROM pg_tables
                WHERE schemaname = 'public' AND tablename like %s ESCAPE '!'
            """,
            (f"{self.source_table_name}!_%",),
        )
        cross_tables = cursor.fetchall()

    copies = []
    for row in cross_tables:
        source_table_name = row["name"]
        target_table_name = source_table_name.replace("_new", "")
        copies.append((source_table_name, target_table_name))

    # Define the SQL to execute by the super class.
    # This supports executing multiple statements in a single transaction
    self.sql = []
    for source_table_name, target_table_name in table_copies + copies:
        lookup = dict(
            source_table_name=source_table_name,
            target_table_name=target_table_name,
        )
        for sql in (
            "CREATE TABLE IF NOT EXISTS {target_table_name} (LIKE {source_table_name} "
            "INCLUDING CONSTRAINTS INCLUDING INDEXES)",
            "TRUNCATE TABLE {target_table_name} CASCADE",
            "INSERT INTO {target_table_name} SELECT * FROM {source_table_name}",
            "DROP TABLE IF EXISTS {source_table_name} CASCADE" if self.ind_drop else None,
        ):
            # skip the DROP statement when ind_drop is not set
            if sql is None:
                continue
            self.sql.append(sql.format(**lookup))

    return super().execute(context)
def _load_from_database(**context):
    params = context['params']
    postgres_conn_id = params['postgres_conn_id']

    pg_hook = PostgresHook(postgres_conn_id=postgres_conn_id)
    cur = pg_hook.get_cursor()

    table_name = 'repositories'
    constraint_col0 = 'processed'
    constraint_col1 = 'contains_logging'
    raw_query = f"""SELECT * from {table_name}
                    WHERE {constraint_col0} = %s
                    AND {constraint_col1} = %s
                    LIMIT 20"""
    # render the parameters into the query up front; mogrify returns bytes
    query = cur.mogrify(raw_query, (True, True)).decode()

    repos = pg_hook.get_pandas_df(query)

    task_instance = context['task_instance']
    task_instance.xcom_push('target_repositories', repos)
    return True
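# A hedged sketch of a downstream task consuming the DataFrame pushed above via
# XCom; the upstream task id "load_from_database" is an assumption and has to
# match whatever PythonOperator actually runs _load_from_database.
def _process_repositories(**context):
    task_instance = context['task_instance']
    repos = task_instance.xcom_pull(
        task_ids='load_from_database',   # hypothetical upstream task id
        key='target_repositories',
    )
    # iterate over the pulled pandas DataFrame row by row
    for _, repo in repos.iterrows():
        print(repo.to_dict())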
def execute(self, context):
    try:
        # Executing the Copy command
        redshift_hook = PostgresHook(self.redshift_conn_id)
        cur = redshift_hook.get_cursor()
        cur.execute(self.sql)
        int_row_count = cur.rowcount
        cur.execute("END TRANSACTION;")
        self.logger.info(
            ' {} Records have been Merged.'.format(int_row_count))
    except Exception as e:
        self.logger.error('Failed to load data, {}'.format(e))
        raise AirflowException('Failed to load data, {}'.format(e))
def get_non_geometry_records(self):
    """Get location values from table (for records with no geometry)."""
    pg_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    with pg_hook.get_cursor() as cursor:
        cursor.execute(
            f"""
            SELECT {self.source_location_column}
                 , {self.source_key_column}
            FROM {self.source_table}
            WHERE 1=1
            AND {self.geometry_column} is NULL
            """
        )
        rows = cursor.fetchall()
    return rows
def index_jobs(**context):
    global algolia_conn_id

    pgsql = PostgresHook(postgres_conn_id="pgsql")
    cur = pgsql.get_cursor()

    algolia_conn = BaseHook.get_connection('algolia')
    client = SearchClient.create(algolia_conn.login, algolia_conn.password)
    index = client.init_index('jobs')

    jobs_sql_query = """
        SELECT
            j.id AS objectID,
            j.provider_id AS provider_id,
            j.remote_id_on_provider AS remote_id_on_provider,
            j.remote_url AS remote_url,
            j.location AS location,
            j.currency_code AS currency_code,
            j.company_id AS company_id,
            j.company_name AS company_name,
            j.title AS title,
            j.description AS description,
            j.tags AS tags,
            j.salary AS salary,
            j.salary_max AS salary_max,
            j.salary_frequency AS salary_frequency,
            j.has_relocation_package AS has_relocation_package,
            j.expires_at AS expires_at,
            j.published_at AS published_at,
            c.id AS child_company_id,
            c.name AS child_company_name,
            c.remote_url AS child_company_remote_url
        FROM job_vacancies j
        LEFT JOIN companies c ON (c.id = j.company_id)
        WHERE CAST(j.published_at AS DATE) = '{}'::DATE
    """.format(context['execution_date'])

    cur.execute(jobs_sql_query)
    rows = cur.fetchall()

    index.save_objects(rows)
def execute(self, context):
    postgres_hook = PostgresHook(postgres_conn_id=self._postgres_conn_id)
    s3_hook = S3Hook(aws_conn_id=self._s3_conn_id)

    with postgres_hook.get_cursor() as cursor:
        cursor.execute(self._query)
        results = cursor.fetchall()
        headers = [_[0] for _ in cursor.description]

    data_buffer = io.StringIO()
    csv_writer = csv.writer(data_buffer, quoting=csv.QUOTE_ALL, lineterminator=os.linesep)
    csv_writer.writerow(headers)
    csv_writer.writerows(results)
    data_buffer_binary = io.BytesIO(data_buffer.getvalue().encode())

    s3_hook.load_file_obj(
        file_obj=data_buffer_binary,
        bucket_name=self._s3_bucket,
        key=self._s3_key,
        replace=True,
    )
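# A minimal usage sketch, assuming the execute() above belongs to a custom
# operator called PostgresToS3Operator whose constructor stores the underscored
# attributes used above. The class name, constructor argument names, connection
# ids, bucket and key below are all assumptions for illustration.
download_orders = PostgresToS3Operator(
    task_id="download_orders",
    postgres_conn_id="my_postgres",   # hypothetical connection id
    s3_conn_id="my_s3",               # hypothetical connection id
    query="SELECT * FROM orders WHERE order_date = '{{ ds }}'",  # example query
    s3_bucket="my-orders-bucket",     # hypothetical bucket
    s3_key="orders/{{ ds }}.csv",     # templated key per execution date
    dag=dag,                          # assumes a DAG defined elsewhere
)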
def execute(self, context):
    # First get all index names, so it's known which indices to rename
    hook = PostgresHook(postgres_conn_id=self.postgres_conn_id, schema=self.database)
    with hook.get_cursor() as cursor:
        cursor.execute(
            "SELECT indexname FROM pg_indexes"
            " WHERE schemaname = 'public' AND indexname like %s"
            " ORDER BY indexname;",
            (f"%{self.old_table_name}%",),
        )
        indexes = list(cursor.fetchall())

    index_renames = [
        (
            row["indexname"],
            re.sub(
                pattern=_get_complete_word_pattern(self.old_table_name),
                repl=self.new_table_name,
                string=row["indexname"],
                count=1,
            ),
        )
        for row in indexes
    ]

    backup_table = f"{self.new_table_name}_old"

    # Define the SQL to execute by the super class.
    # This supports executing multiple statements in a single transaction:
    self.sql = [
        f"ALTER TABLE IF EXISTS {self.new_table_name} RENAME TO {backup_table}",
        f"ALTER TABLE {self.old_table_name} RENAME TO {self.new_table_name}",
        f"DROP TABLE IF EXISTS {backup_table}",
    ] + [
        f"ALTER INDEX {old_index} RENAME TO {new_index}"
        for old_index, new_index in index_renames
    ]

    return super().execute(context)
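# _get_complete_word_pattern is referenced above but not shown; a plausible
# sketch, assuming it simply wraps the table name in word boundaries so the
# substitution only hits complete occurrences of the name inside index names:
import re

def _get_complete_word_pattern(word: str) -> str:
    """Return a regex pattern matching `word` only as a complete word."""
    return rf"\b{re.escape(word)}\b"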
def load_data(**kwargs):
    params = kwargs['params']
    stock_index_name = params['stock_index_name']
    from_s3 = params['from_s3']

    ti = kwargs['ti']
    filename = ti.xcom_pull(key='return_value', task_ids="clean_and_merge_industries")

    scraper = StockIndexScraper(stock_index_name, from_s3=from_s3, load_all=False)
    scraper.df = pd.read_csv(filename, index_col='Symbol')
    scraper.data = scraper.create_data()

    pghook = PostgresHook('postgres_db')
    cur = pghook.get_cursor()

    # delete old data
    delete_stmt = ("DELETE FROM visuals.index_component_stocks "
                   "WHERE stock_index_name = %s")
    cur.execute(delete_stmt, (stock_index_name, ))

    # insert new data
    row_count = 0
    for row in scraper.data_to_tuples():
        insert_stmt = ("INSERT INTO visuals.index_component_stocks "
                       "VALUES (%s,%s,%s,%s,%s,%s,%s)")
        cur.execute(insert_stmt, row)
        row_count += 1

    # commit via the cursor's underlying connection
    cur.connection.commit()
    return {'row_count': row_count}
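# A hedged alternative to the row-by-row insert loop above: psycopg2's
# execute_values sends all rows in one batched statement, which is usually
# faster for longer component lists. Only the helper below is new; the table
# and data come from the snippet above, e.g.
#   bulk_insert_component_stocks(cur, list(scraper.data_to_tuples()))
from psycopg2.extras import execute_values

def bulk_insert_component_stocks(cur, rows):
    """Insert all component-stock rows in a single batched INSERT."""
    execute_values(
        cur,
        "INSERT INTO visuals.index_component_stocks VALUES %s",
        rows,
    )
    return len(rows)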
def execute(self, context):
    # First get all index names, so it's known which indices to rename
    hook = PostgresHook(postgres_conn_id=self.postgres_conn_id, schema=self.database)

    # Start a list to hold rename information
    table_renames = [
        (
            self.old_table_name,
            self.new_table_name,
            f"{self.new_table_name}_old",
        )
    ]

    # Find the cross-tables for n-m relations, we assume they have
    # a name that starts with f"{old_table_name}_"
    with hook.get_cursor() as cursor:
        # the underscore must be escaped because of its special meaning in a LIKE;
        # the exclamation mark is used as an escape character because
        # a backslash was not interpreted as an escape
        cursor.execute(
            """
                SELECT tablename AS name FROM pg_tables
                WHERE schemaname = 'public' AND tablename like %s ESCAPE '!'
            """,
            (f"{self.old_table_name}!_%",),
        )
        cross_tables = cursor.fetchall()

        cursor.execute(
            """
                SELECT indexname AS name FROM pg_indexes
                WHERE schemaname = 'public' AND indexname LIKE %s ESCAPE '!'
                ORDER BY indexname
            """,
            (f"%{self.old_table_name}!_%",),
        )
        indexes = cursor.fetchall()

    renames = []
    for row in cross_tables:
        old_table_name = row["name"]
        new_table_name = old_table_name.replace("_new", "")
        backup_table_name = f"{new_table_name}_old"
        renames.append((old_table_name, new_table_name, backup_table_name))

    idx_renames = [
        (
            row["name"],
            row["name"].replace(self.old_table_name, self.new_table_name),
        )
        for row in indexes
    ]

    # Define the SQL to execute by the super class.
    # This supports executing multiple statements in a single transaction:
    self.sql = []
    for sql in (
        "ALTER TABLE IF EXISTS {new_table_name} RENAME TO {backup_table_name}",
        "ALTER TABLE IF EXISTS {old_table_name} RENAME TO {new_table_name}",
        "DROP TABLE IF EXISTS {backup_table_name}",
    ):
        for old_table_name, new_table_name, backup_table_name in (
                table_renames + renames):
            lookup = dict(
                old_table_name=old_table_name,
                new_table_name=new_table_name,
                backup_table_name=backup_table_name,
            )
            self.sql.append(sql.format(**lookup))

    for old_name, new_name in idx_renames:
        self.sql.append(
            f"ALTER INDEX IF EXISTS {old_name} RENAME TO {new_name}")

    return super().execute(context)
def execute(self, context):
    '''
    :param context: This is an Airflow standard; it contains metadata about
        the job and task. I use it here to generate descriptive filenames.
    '''
    # Decide where we'll put the results.
    # The file is temporarily constructed within /tmp and copied
    # at the end to a unique path in Google Cloud Storage based
    # on the job and task names as well as the job date.
    # We need a temp file since we can't append directly
    # to GCS files.
    target_csv = task_output_file_path(self.output_basedir, context)
    temp_csv = task_temp_file(context)
    log.info('Temp output: {}'.format(temp_csv))
    log.info('Final output: {}'.format(target_csv))

    with open(temp_csv, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)

        # We store the database credentials as a cluster configuration
        # rather than in code, similar to the Variables feature already
        # described in onethree_dag.py. In this case, you can view the
        # credentials 'pg_onethree_demo' in the Airflow interface by
        # clicking on Admin->Connections and searching for that name.
        #
        # PostgresHook is a helper class that takes care of getting
        # those credentials and giving us the corresponding DB cursor.
        #
        # The other benefit of doing it this way is that connections
        # are reused across tasks, to avoid overloading the DB.
        pg_hook = PostgresHook('pg_onethree_demo', supports_autocommit=True)

        # Pull the list of drugbank IDs from the database for this batch.
        #
        # A comment on database structure: I'm using a surrogate key (id)
        # and treating drugbank_provided_id just as a property.
        # The table where targets are stored (drugbank_targets) has a
        # foreign key referencing this id column in drugbank_drugs. Among
        # other benefits this makes indexes smaller and joins faster, since
        # an integer is a compact data type compared to the character ID.
        cursor = pg_hook.get_cursor()
        cursor.execute(
            'SELECT id, drugbank_provided_id FROM public.drugbank_drugs '
            'WHERE id % {} = {}'.format(self.num_batches, self.batch_num))

        drugbank_id = cursor.fetchone()
        while drugbank_id:
            # Get the contents of the specific page
            url = 'https://www.drugbank.ca/drugs/' + drugbank_id[1]
            r = requests.get(url)

            # Parse the HTML of the page using BeautifulSoup with html5lib,
            # which works the same way a browser does.
            soup = BeautifulSoup(r.text, 'html5lib')

            # Isolate our search to just the 'targets' div (i.e. what you'd
            # see visually when browsing to the TARGETS section of the page).
            #
            # This uses a CSS selector to get the div with both the
            # 'bond-list-container' and 'targets' CSS classes.
            # "bond-list-container" basically specifies "a section" and
            # "targets" indicates which one.
            #
            # NOTE: HTML parsing is a bit brittle. I try to address that by
            # harnessing the logic in their design (for example they
            # specifically identify the target section with a class) and
            # making as few other assumptions about the structure as I can.
            # However, an additional and far more robust safeguard that I'd
            # use in production would be size checks: defining the range of
            # data quantities that we expect relative to past observations,
            # and alerting me when the real quantity is outside that range.
            # In my experience this catches 90% of issues.
            targets_div = soup.select('div.bond-list-container.targets')

            # If the page is a stub or otherwise lacks targets, we are unable
            # to pull any, so continue.
            if len(targets_div) == 0:
                drugbank_id = cursor.fetchone()
                continue
            targets_div = targets_div[0]

            # Find the <dt> tags which contain the words 'Gene Name'.
            # Handle trivial changes to letter casing or surrounding whitespace.
            def ci_match_gn(string):
                return string and 'GENE NAME' in string.upper()

            gene_strings = targets_div.find_all('dt', string=ci_match_gn)
            for t in gene_strings:
                # Our target should be the next sibling of <dt>, i.e. <dd>.
                # If it isn't, they might have introduced a line break which
                # would become a sibling in between the two elements, so
                # handle that case.
                target = t.next_sibling
                if target.name != 'dd':
                    target = target.next_sibling
                    if target.name != 'dd':
                        continue

                drug_target = target.string

                # Targets will be stored in a separate table, with a foreign
                # key back to the drug table's id column.
                csvwriter.writerow([drugbank_id[0], drug_target])

            drugbank_id = cursor.fetchone()

    # Move the CSV to its final location.
    #
    # Since the same code is run locally during development and in
    # production, we use an Airflow variable 'environment' that we have
    # set to distinguish environments. The helper function uses it to
    # determine whether we are copying to a local filesystem location or a
    # Cloud Storage location, since we will need different commands.
    #
    # The 'environment' variable has many other uses, for example
    # deciding whether to page the engineering team for unrecoverable
    # errors (yes in 'production', usually no in 'dev').
    env = Variable.get('environment')
    move_to_output_loc(env, temp_csv, target_csv)

    # Pass the name of the output CSV file to whatever
    # downstream task might need it. It's pulled similarly
    # using xcom_pull (shown later).
    #
    # XCom is another Airflow feature, this time allowing message
    # passing (of small data like this) between tasks.
    context['task_instance'].xcom_push('csv_path', target_csv)