def fetch_partition_boundaries(
    self, conn: connection, table_name: TableName, partition_key: str, num_partitions: int
) -> List[Tuple[int, int]]:
    """
    Fetch (lower, upper) bound ranges of the partition key that split the table into roughly equal-sized partitions.
    """
    stmt = """
        SELECT MIN(pkey) AS lower_bound
             , MAX(pkey) AS upper_bound
             , COUNT(pkey) AS count
          FROM (
                SELECT "{partition_key}" AS pkey
                     , NTILE({num_partitions}) OVER (ORDER BY "{partition_key}") AS part
                  FROM {table_name}
               ) t
         GROUP BY part
         ORDER BY part
        """
    with Timer() as timer:
        rows = etl.db.query(
            conn, stmt.format(partition_key=partition_key, num_partitions=num_partitions, table_name=table_name)
        )
    row_count = sum(row["count"] for row in rows)
    self.logger.info(
        "Calculated %d partition boundaries for %d rows in '%s' using partition key '%s' (%s)",
        num_partitions,
        row_count,
        table_name.identifier,
        partition_key,
        timer,
    )
    lower_bounds = (row["lower_bound"] for row in rows)
    upper_bounds = (row["upper_bound"] for row in rows)
    return [(low, high) for low, high in zip(lower_bounds, upper_bounds)]
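# A minimal sketch (not from the source) of how the (lower, upper) pairs returned above
# might be turned into per-partition predicates for parallel extraction. The column name
# "id" and the boundary values are made up for illustration.
boundaries = [(1, 250_000), (250_001, 500_000), (500_001, 750_000)]  # e.g. num_partitions=3
predicates = ['"id" BETWEEN {:d} AND {:d}'.format(low, high) for low, high in boundaries]
# Each predicate can then bound one partition's SELECT so the partitions can be read in parallel.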
def execute(cx, stmt, args=(), return_result=False):
    """
    Execute query in 'stmt' over connection 'cx' (with parameters in 'args').

    Be careful with query statements that have a '%' in them (say, for LIKE)
    since this will interfere with psycopg2 interpreting parameters.

    AWS credentials are redacted from the logged query only if they match
    "CREDENTIALS '[^']*'", so be careful or you'll end up sending your
    credentials to the log file.
    """
    with cx.cursor() as cursor:
        executable_statement = mogrify(cursor, stmt, args)
        printable_stmt = remove_password(executable_statement.decode())
        logger.debug("QUERY:\n%s\n;", printable_stmt)
        with Timer() as timer:
            cursor.execute(executable_statement)
        if cursor.rowcount is not None and cursor.rowcount > 0:
            logger.debug("QUERY STATUS: %s [rowcount=%d] (%s)", cursor.statusmessage, cursor.rowcount, timer)
        else:
            logger.debug("QUERY STATUS: %s (%s)", cursor.statusmessage, timer)
        if cx.notices and logger.isEnabledFor(logging.DEBUG):
            for msg in cx.notices:
                logger.debug("QUERY " + msg.rstrip("\n"))
            del cx.notices[:]
        if return_result:
            return cursor.fetchall()
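# Hedged usage sketch for execute(): 'cx' is assumed to be an open psycopg2 connection, and
# the table and column names are hypothetical. Parameters are passed separately so that
# psycopg2 handles the quoting (note the '%' caveat in the docstring above).
rows = execute(
    cx,
    "SELECT step, target FROM events WHERE step = %s AND event LIKE %s",
    ("extract", "fin%"),
    return_result=True,
)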
def extract_source(
    self, source: DataWarehouseSchema, relations: List[RelationDescription]
) -> List[RelationDescription]:
    """
    For a given upstream source, iterate through given relations to extract the relations' data.
    """
    self.logger.info("Extracting %d relation(s) from source '%s'", len(relations), source.name)
    failed = []
    extract_retries = etl.config.get_config_int("arthur_settings.extract_retries")
    with Timer() as timer:
        for i, relation in enumerate(relations):
            try:
                extract_func = partial(self.extract_table, source, relation)
                with etl.monitor.Monitor(
                    relation.identifier,
                    "extract",
                    options=self.options_info(),
                    source=self.source_info(source, relation),
                    destination={
                        "bucket_name": relation.bucket_name,
                        "object_key": relation.manifest_file_name,
                    },
                    index={"current": i + 1, "final": len(relations), "name": source.name},
                    dry_run=self.dry_run,
                ):
                    retry(extract_retries, extract_func, self.logger)
            except ETLRuntimeError:
                self.failed_sources.add(source.name)
                failed.append(relation)
                if not relation.is_required:
                    self.logger.warning(
                        "Extract failed for non-required relation '%s':", relation.identifier, exc_info=True
                    )
                elif self.keep_going:
                    self.logger.warning(
                        "Ignoring failure of required relation '%s' and proceeding as requested:",
                        relation.identifier,
                        exc_info=True,
                    )
                else:
                    self.logger.error("Extract failed for required relation '%s'", relation.identifier)
                    raise
    self.logger.info(
        "Finished extract from source '%s': %d succeeded, %d failed (%s)",
        source.name,
        len(relations) - len(failed),
        len(failed),
        timer,
    )
    return failed
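# Sketch of the retry-with-partial pattern used above, assuming 'retry' re-invokes the
# zero-argument callable up to the given number of times on failure and logs in between.
# The helper function and its arguments are hypothetical.
import logging
from functools import partial

def extract_one(source_name: str, table_name: str) -> None:
    ...  # stand-in for work that may raise a transient error

extract_func = partial(extract_one, "www", "orders")          # bind the arguments up front
retry(3, extract_func, logging.getLogger(__name__))           # mirrors retry(extract_retries, extract_func, self.logger)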
def validate_designs(
    relations: List[RelationDescription],
    keep_going=False,
    skip_sources=False,
    skip_dependencies=False,
) -> None:
    """
    Make sure that all table design files pass the validation checks.

    See module documentation for list of checks.
    """
    config = etl.config.get_dw_config()
    _error_occurred.clear()

    valid_descriptions = validate_semantics(relations, keep_going=keep_going)
    ordered_descriptions = validate_execution_order(valid_descriptions, keep_going=keep_going)
    validate_reload(config.schemas, valid_descriptions, keep_going=keep_going)

    if skip_sources:
        logger.info("Skipping validation of designs against upstream sources")
    else:
        with Timer() as timer:
            validate_upstream_sources(config.schemas, ordered_descriptions, keep_going=keep_going)
            logger.info("Validated designs against upstream sources (%s)", timer)

    if skip_dependencies:
        logger.info("Skipping validation of transforms against data warehouse")
    else:
        with Timer() as timer:
            validate_transforms(config.dsn_etl, ordered_descriptions, keep_going=keep_going)
            logger.info("Validated transforms against data warehouse (%s)", timer)

    if _error_occurred.is_set():
        raise ETLDelayedExit("At least one error occurred while validating with 'keep going' option")
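# Hypothetical invocation sketch: 'relations' would normally come from the project's relation
# loader; the flag combination shown collects all validation errors before exiting and skips
# the slower checks against upstream sources.
try:
    validate_designs(relations, keep_going=True, skip_sources=True)
except ETLDelayedExit:
    logger.error("One or more table designs failed validation")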
def tail_events(
    relations, start_time, update_interval=None, idle_time_out=None, step: Optional[str] = None
) -> None:
    """
    Tail the events table and show latest events coming in (which are not start events, just fail or finish).
    """
    targets = [relation.identifier for relation in relations]
    query = EventsQuery(step)
    consumer_queue = queue.Queue()  # type: ignore
    epoch_seconds = timegm(start_time.utctimetuple())
    thread = BackgroundQueriesRunner(
        targets, query, consumer_queue, epoch_seconds, update_interval, idle_time_out, daemon=True
    )
    thread.start()

    events = []
    n_printed = 0
    done = False
    while not done:
        progress = Timer()
        while progress.elapsed < 10:
            try:
                event = consumer_queue.get(timeout=10)
                if event is None:
                    done = True
                    break
                event["timestamp"] = datetime.utcfromtimestamp(event["timestamp"]).isoformat()  # timestamp to isoformat
                events.append(event)
            except queue.Empty:
                break
        # Keep printing tail of table that accumulates the events.
        if len(events) > n_printed:
            lines = etl.text.format_lines(
                [[event[header] for header in query.keys] for event in events], header_row=query.keys
            ).split("\n")
            if n_printed:
                print("\n".join(lines[n_printed + 2:-1]))  # skip header and final "(x rows)" line
            else:
                print("\n".join(lines[:-1]))  # only skip the "(x rows)" line
            n_printed = len(lines) - 3  # header, separator, final = 3 extra rows
            if done:
                print(lines[-1])
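# Hypothetical call: tail fail/finish events of the "load" step for the given relations,
# starting an hour ago, re-querying every 30 seconds and giving up after 10 minutes of silence.
tail_events(
    relations,
    start_time=datetime.utcnow() - timedelta(hours=1),
    update_interval=30,
    idle_time_out=600,
    step="load",
)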
def run(self):
    """
    Poll the events table for each target until all targets have reported an event (or we
    hit the idle time-out), pushing every event found onto the queue, followed by None.
    """
    ddb = DynamoDBStorage.factory()
    table = ddb.get_table(create_if_not_exists=False)
    targets = self.targets
    start_time = self.start_time
    idle = Timer()
    while targets:
        logger.debug(
            "Waiting for events for %d target(s), start time = '%s'",
            len(targets),
            datetime.utcfromtimestamp(start_time).isoformat(),
        )
        new_start_time = datetime.utcnow() - timedelta(seconds=1)  # avoid rounding errors
        query_loop = Timer()
        retired = set()
        for target in targets:
            latest_event = self.query(table, target, start_time)
            if latest_event:
                self.queue.put(latest_event)
                retired.add(latest_event["target"])
        targets = [t for t in targets if t not in retired]
        start_time = timegm(new_start_time.utctimetuple())
        if self.update_interval is None or not targets:
            break
        if retired:
            idle = Timer()
        elif self.idle_time_out and idle.elapsed > self.idle_time_out:
            logger.info(
                "Idle time-out: Waited for %d seconds but no events arrived, %d target(s) remaining",
                self.idle_time_out,
                len(targets),
            )
            break
        if query_loop.elapsed < self.update_interval:
            time.sleep(self.update_interval - query_loop.elapsed)

    logger.info("Found events for %d out of %d target(s)", len(self.targets) - len(targets), len(self.targets))
    self.queue.put(None)
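# The epoch-seconds conversion used above, shown in isolation: timegm() interprets the
# struct_time as UTC, so it round-trips with datetime.utcfromtimestamp (modulo microseconds).
from calendar import timegm
from datetime import datetime

now = datetime.utcnow()
epoch_seconds = timegm(now.utctimetuple())
assert datetime.utcfromtimestamp(epoch_seconds) == now.replace(microsecond=0)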
def sync_with_s3(
    relations: List[RelationDescription], bucket_name: str, prefix: str, dry_run: bool = False
) -> None:
    """
    Copy (validated) table design and SQL files from the local directory to the S3 bucket.
    """
    logger.info("Validating %d table design(s) before upload", len(relations))
    RelationDescription.load_in_parallel(relations)

    files = []  # type: List[Tuple[str, str]]
    for relation in relations:
        relation_files = [relation.design_file_name]
        if relation.is_transformation:
            if relation.sql_file_name:
                relation_files.append(relation.sql_file_name)
            else:
                raise MissingQueryError("Missing matching SQL file for '%s'" % relation.design_file_name)
        for file_name in relation_files:
            local_filename = relation.norm_path(file_name)
            remote_filename = os.path.join(prefix, local_filename)
            files.append((local_filename, remote_filename))

    uploader = etl.s3.S3Uploader(bucket_name, dry_run=dry_run)
    with Timer() as timer:
        futures = []  # type: List[concurrent.futures.Future]
        # TODO With Python 3.6, we should pass in a thread_name_prefix
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            for local_filename, remote_filename in files:
                futures.append(executor.submit(uploader.__call__, local_filename, remote_filename))
        errors = 0
        for future in concurrent.futures.as_completed(futures):
            exception = future.exception()
            if exception is not None:
                logger.error("Failed to upload file: %s", exception)
                errors += 1
    if not dry_run:
        logger.info(
            "Uploaded %d of %d file(s) to 's3://%s/%s' (%s)", len(files) - errors, len(files), bucket_name, prefix, timer
        )
    if errors:
        raise ETLRuntimeError("There were {:d} error(s) during upload".format(errors))
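# Sketch of the TODO above, assuming Python 3.6+: ThreadPoolExecutor accepts a
# thread_name_prefix so the upload threads show up with recognizable names in logs and
# thread dumps; submitting 'uploader' directly is equivalent to submitting uploader.__call__.
with concurrent.futures.ThreadPoolExecutor(max_workers=8, thread_name_prefix="sync-upload") as executor:
    futures = [
        executor.submit(uploader, local_filename, remote_filename)
        for local_filename, remote_filename in files
    ]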
def extract_source(self, source: DataWarehouseSchema, relations: List[RelationDescription]) -> List[str]:
    """
    Iterate through the given relations to extract data from (upstream) source schemas.

    This returns a list of tables that failed to extract, or raises an exception as soon as
    a required relation fails and "keep going" is not active.
    """
    self.logger.info("Extracting %d relation(s) from source '%s'", len(relations), source.name)
    failed_tables = []
    with Timer() as timer:
        for i, relation in enumerate(relations):
            if not self.extract_table_with_retry(source, relation, i + 1, len(relations)):
                failed_tables.append(relation.identifier)
    self.logger.info(
        "Finished extract from source '%s': %d succeeded, %d failed (%s)",
        source.name,
        len(relations) - len(failed_tables),
        len(failed_tables),
        timer,
    )
    return failed_tables