Example 1
 def fetch_partition_boundaries(
         self, conn: connection, table_name: TableName, partition_key: str,
         num_partitions: int) -> List[Tuple[int, int]]:
     """
     Fetch value ranges of the partition key that split the table into roughly equal-sized partitions.
     """
     stmt = """
         SELECT MIN(pkey) AS lower_bound
              , MAX(pkey) AS upper_bound
              , COUNT(pkey) AS count
           FROM (
                   SELECT "{partition_key}" AS pkey
                        , NTILE({num_partitions}) OVER (ORDER BY "{partition_key}") AS part
                     FROM {table_name}
                ) t
          GROUP BY part
          ORDER BY part
     """
     with Timer() as timer:
         rows = etl.db.query(
             conn,
             stmt.format(partition_key=partition_key,
                         num_partitions=num_partitions,
                         table_name=table_name))
     row_count = sum(row["count"] for row in rows)
     self.logger.info(
         "Calculated %d partition boundaries for %d rows in '%s' using partition key '%s' (%s)",
         num_partitions, row_count, table_name.identifier, partition_key,
         timer)
     lower_bounds = (row["lower_bound"] for row in rows)
     upper_bounds = (row["upper_bound"] for row in rows)
     return [(low, high) for low, high in zip(lower_bounds, upper_bounds)]
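
Every example on this page times its work with a Timer object, used both as a context manager ("with Timer() as timer:") and directly (Timer() followed by reading .elapsed), and formatted into log messages via "%s". The class itself is never shown; below is a minimal sketch of what it might look like under those assumptions, not the actual implementation from the source repository.

# Hypothetical sketch of the Timer helper used throughout these examples.
import time


class Timer:
    """Measure elapsed wall-clock time; usable directly or as a context manager."""

    def __init__(self):
        self._start = time.monotonic()

    def __enter__(self):
        self._start = time.monotonic()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        return False  # never suppress exceptions from the body

    @property
    def elapsed(self):
        return time.monotonic() - self._start

    def __str__(self):
        return "%.2f s" % self.elapsed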
Example 2
def execute(cx, stmt, args=(), return_result=False):
    """
    Execute query in 'stmt' over connection 'cx' (with parameters in 'args').

    Be careful with query statements that have a '%' in them (say for LIKE)
    since this will interfere with psycopg2 interpreting parameters.

    Printing the query will not print AWS credentials, provided the string used matches "CREDENTIALS '[^']*'".
    So be careful or you'll end up sending your credentials to the log file.
    """
    with cx.cursor() as cursor:
        executable_statement = mogrify(cursor, stmt, args)
        printable_stmt = remove_password(executable_statement.decode())
        logger.debug("QUERY:\n%s\n;", printable_stmt)
        with Timer() as timer:
            cursor.execute(executable_statement)
        if cursor.rowcount is not None and cursor.rowcount > 0:
            logger.debug("QUERY STATUS: %s [rowcount=%d] (%s)",
                         cursor.statusmessage, cursor.rowcount, timer)
        else:
            logger.debug("QUERY STATUS: %s (%s)", cursor.statusmessage, timer)
        if cx.notices and logger.isEnabledFor(logging.DEBUG):
            for msg in cx.notices:
                logger.debug("QUERY " + msg.rstrip("\n"))
            del cx.notices[:]
        if return_result:
            return cursor.fetchall()
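
As the docstring warns, psycopg2 only interprets "%" placeholders when a parameter sequence is passed, so a literal "%" (for example in a LIKE pattern) must either live inside a bound parameter or be doubled. A short usage sketch; the connection cx and the table name are hypothetical:

# Literal '%' inside the bound parameter is safe:
execute(cx, "SELECT name FROM users WHERE name LIKE %s", ("foo%",))
# A literal '%' written directly into the statement must be escaped as '%%' once args are given:
execute(cx, "SELECT name FROM users WHERE name LIKE 'foo%%' AND id = %s", (42,))
# Pass return_result=True to get the rows back:
rows = execute(cx, "SELECT name FROM users WHERE id = %s", (42,), return_result=True)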
Example 3
 def extract_source(
         self, source: DataWarehouseSchema,
         relations: List[RelationDescription]) -> List[RelationDescription]:
     """
     For a given upstream source, iterate through the given relations and extract their data.
     """
     self.logger.info("Extracting %d relation(s) from source '%s'",
                      len(relations), source.name)
     failed = []
     extract_retries = etl.config.get_config_int(
         "arthur_settings.extract_retries")
     with Timer() as timer:
         for i, relation in enumerate(relations):
             try:
                 extract_func = partial(self.extract_table, source,
                                        relation)
                 with etl.monitor.Monitor(
                         relation.identifier,
                         "extract",
                         options=self.options_info(),
                         source=self.source_info(source, relation),
                         destination={
                             "bucket_name": relation.bucket_name,
                             "object_key": relation.manifest_file_name
                         },
                         index={
                             "current": i + 1,
                             "final": len(relations),
                             "name": source.name
                         },
                         dry_run=self.dry_run,
                 ):
                     retry(extract_retries, extract_func, self.logger)
             except ETLRuntimeError:
                 self.failed_sources.add(source.name)
                 failed.append(relation)
                 if not relation.is_required:
                     self.logger.warning(
                         "Extract failed for non-required relation '%s':",
                         relation.identifier,
                         exc_info=True)
                 elif self.keep_going:
                     self.logger.warning(
                         "Ignoring failure of required relation '%s' and proceeding as requested:",
                         relation.identifier,
                         exc_info=True,
                     )
                 else:
                     self.logger.error(
                         "Extract failed for required relation '%s'",
                         relation.identifier)
                     raise
         self.logger.info(
             "Finished extract from source '%s': %d succeeded, %d failed (%s)",
             source.name,
             len(relations) - len(failed),
             len(failed),
             timer,
         )
     return failed
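
The call retry(extract_retries, extract_func, self.logger) refers to a helper that is not shown here. A minimal sketch, assuming only the visible call signature (maximum number of retries, a zero-argument callable, a logger) and a simple exponential backoff; the real helper may differ:

import time


def retry(max_retries, func, logger):
    """Call func(), retrying up to max_retries additional times on ETLRuntimeError."""
    for attempt in range(max_retries + 1):
        try:
            return func()
        except ETLRuntimeError:  # exception type from the surrounding codebase (see the except clause above)
            if attempt == max_retries:
                raise
            logger.warning("Attempt %d of %d failed, retrying", attempt + 1, max_retries + 1, exc_info=True)
            time.sleep(2 ** attempt)  # backoff policy is an assumption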
Example 4
def validate_designs(
    relations: List[RelationDescription],
    keep_going=False,
    skip_sources=False,
    skip_dependencies=False,
) -> None:
    """
    Make sure that all table design files pass the validation checks.

    See module documentation for list of checks.
    """
    config = etl.config.get_dw_config()
    _error_occurred.clear()

    valid_descriptions = validate_semantics(relations, keep_going=keep_going)
    ordered_descriptions = validate_execution_order(valid_descriptions,
                                                    keep_going=keep_going)

    validate_reload(config.schemas, valid_descriptions, keep_going=keep_going)

    if skip_sources:
        logger.info("Skipping validation of designs against upstream sources")
    else:
        with Timer() as timer:
            validate_upstream_sources(config.schemas,
                                      ordered_descriptions,
                                      keep_going=keep_going)
            logger.info("Validated designs against upstream sources (%s)",
                        timer)

    if skip_dependencies:
        logger.info("Skipping validation of transforms against data warehouse")
    else:
        with Timer() as timer:
            validate_transforms(config.dsn_etl,
                                ordered_descriptions,
                                keep_going=keep_going)
            logger.info("Validated transforms against data warehouse (%s)",
                        timer)

    if _error_occurred.is_set():
        raise ETLDelayedExit(
            "At least one error occurred while validating with 'keep going' option"
        )
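
The module-level _error_occurred event is what turns individual validation failures into the single delayed ETLDelayedExit at the end when running with "keep going". A sketch of that pattern as it appears to be used here; the helper name _log_or_raise is made up for illustration and relies on the module-level logger already used above:

import threading

# Module-level flag: validators set it instead of raising when "keep going" is active.
_error_occurred = threading.Event()


def _log_or_raise(error, keep_going=False):
    """Either raise the validation error immediately or remember it for a delayed exit."""
    if keep_going:
        logger.warning("Ignoring validation error: %s", error)
        _error_occurred.set()
    else:
        raise error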
Example 5
def tail_events(relations,
                start_time,
                update_interval=None,
                idle_time_out=None,
                step: Optional[str] = None) -> None:
    """
    Tail the events table and show the latest events as they come in (only fail or finish events, not start events).
    """
    targets = [relation.identifier for relation in relations]
    query = EventsQuery(step)
    consumer_queue = queue.Queue()  # type: ignore
    epoch_seconds = timegm(start_time.utctimetuple())

    thread = BackgroundQueriesRunner(targets,
                                     query,
                                     consumer_queue,
                                     epoch_seconds,
                                     update_interval,
                                     idle_time_out,
                                     daemon=True)
    thread.start()

    events = []
    n_printed = 0
    done = False
    while not done:
        progress = Timer()
        while progress.elapsed < 10:
            try:
                event = consumer_queue.get(timeout=10)
                if event is None:
                    done = True
                    break
                event["timestamp"] = datetime.utcfromtimestamp(
                    event["timestamp"]).isoformat()  # timestamp to isoformat
                events.append(event)
            except queue.Empty:
                break
        # Keep printing tail of table that accumulates the events.
        if len(events) > n_printed:
            lines = etl.text.format_lines(
                [[event[header] for header in query.keys] for event in events],
                header_row=query.keys,
            ).split("\n")
            if n_printed:
                # Skip the header, the separator, the rows already printed, and the final "(x rows)" line.
                print("\n".join(lines[n_printed + 2:-1]))
            else:
                # Only skip the final "(x rows)" line.
                print("\n".join(lines[:-1]))
            n_printed = len(lines) - 3  # header, separator, and final line are the 3 extra rows
            if done:
                print(lines[-1])
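
The slicing in the loop above implies a particular layout for the output of etl.text.format_lines. That layout is inferred from the index arithmetic, not from the function's documentation:

# Assumed structure of format_lines(...).split("\n"):
#   lines[0]      header row
#   lines[1]      separator row
#   lines[2:-1]   one line per event
#   lines[-1]     trailing "(x rows)" summary
# With n_printed data rows already shown, lines[n_printed + 2:-1] is exactly the new rows,
# and len(lines) - 3 is the total number of data rows printed so far.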
Example 6
 def run(self):
     ddb = DynamoDBStorage.factory()
     table = ddb.get_table(create_if_not_exists=False)
     targets = self.targets
     start_time = self.start_time
     idle = Timer()
     while targets:
         logger.debug(
             "Waiting for events for %d target(s), start time = '%s'",
             len(targets),
             datetime.utcfromtimestamp(start_time).isoformat(),
         )
         new_start_time = datetime.utcnow() - timedelta(seconds=1)  # avoid rounding errors
         query_loop = Timer()
         retired = set()
         for target in targets:
             latest_event = self.query(table, target, start_time)
             if latest_event:
                 self.queue.put(latest_event)
                 retired.add(latest_event["target"])
         targets = [t for t in targets if t not in retired]
         start_time = timegm(new_start_time.utctimetuple())
         if self.update_interval is None or not targets:
             break
         if retired:
             idle = Timer()
         elif self.idle_time_out and idle.elapsed > self.idle_time_out:
             logger.info(
                 "Idle time-out: Waited for %d seconds but no events arrived, "
                 "%d target(s) remaining",
                 self.idle_time_out,
                 len(targets),
             )
             break
         if query_loop.elapsed < self.update_interval:
             time.sleep(self.update_interval - query_loop.elapsed)
     logger.info("Found events for %d out of %d target(s)",
                 len(self.targets) - len(targets), len(self.targets))
     self.queue.put(None)
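
The run() method above uses attributes (self.targets, self.query, self.queue, self.start_time, self.update_interval, self.idle_time_out) that must be set in the constructor. Matching them against the call in Example 5 suggests an __init__ roughly like this sketch, which is inferred from the two call sites rather than taken from the source:

import threading


class BackgroundQueriesRunner(threading.Thread):
    def __init__(self, targets, query, queue, start_time, update_interval, idle_time_out, **kwargs):
        super().__init__(**kwargs)  # passes through daemon=True
        self.targets = list(targets)
        self.query = query
        self.queue = queue
        self.start_time = start_time
        self.update_interval = update_interval
        self.idle_time_out = idle_time_out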
Example 7
def sync_with_s3(relations: List[RelationDescription], bucket_name: str, prefix: str, dry_run: bool = False) -> None:
    """
    Copy (validated) table design and SQL files from local directory to S3 bucket.
    """
    logger.info("Validating %d table design(s) before upload", len(relations))
    RelationDescription.load_in_parallel(relations)

    files = []  # type: List[Tuple[str, str]]
    for relation in relations:
        relation_files = [relation.design_file_name]
        if relation.is_transformation:
            if relation.sql_file_name:
                relation_files.append(relation.sql_file_name)
            else:
                raise MissingQueryError("Missing matching SQL file for '%s'" % relation.design_file_name)
        for file_name in relation_files:
            local_filename = relation.norm_path(file_name)
            remote_filename = os.path.join(prefix, local_filename)
            files.append((local_filename, remote_filename))

    uploader = etl.s3.S3Uploader(bucket_name, dry_run=dry_run)
    with Timer() as timer:
        futures = []  # type: List[concurrent.futures.Future]
        # TODO With Python 3.6, we should pass in a thread_name_prefix
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            for local_filename, remote_filename in files:
                futures.append(executor.submit(uploader, local_filename, remote_filename))
        errors = 0
        for future in concurrent.futures.as_completed(futures):
            exception = future.exception()
            if exception is not None:
                logger.error("Failed to upload file: %s", exception)
                errors += 1
    if not dry_run:
        logger.info(
            "Uploaded %d of %d file(s) to 's3://%s/%s (%s)", len(files) - errors, len(files), bucket_name, prefix, timer
        )
    if errors:
        raise ETLRuntimeError("There were {:d} error(s) during upload".format(errors))
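
The TODO above refers to the thread_name_prefix argument that concurrent.futures.ThreadPoolExecutor accepts since Python 3.6; it only changes the worker threads' names in logs and tracebacks. A drop-in sketch of that change (the prefix string itself is arbitrary):

        with concurrent.futures.ThreadPoolExecutor(max_workers=8, thread_name_prefix="sync-with-s3") as executor:
            for local_filename, remote_filename in files:
                futures.append(executor.submit(uploader, local_filename, remote_filename))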
Example 8
    def extract_source(self, source: DataWarehouseSchema,
                       relations: List[RelationDescription]) -> List[str]:
        """
        Iterate through given relations to extract data from (upstream) source schemas.

        This returns a list of tables that failed to extract, or raises an exception
        if a required relation failed and "keep going" was not active.
        """
        self.logger.info("Extracting %d relation(s) from source '%s'",
                         len(relations), source.name)
        failed_tables = []
        with Timer() as timer:
            for i, relation in enumerate(relations):
                if not self.extract_table_with_retry(source, relation, i + 1,
                                                     len(relations)):
                    failed_tables.append(relation.identifier)
            self.logger.info(
                "Finished extract from source '%s': %d succeeded, %d failed (%s)",
                source.name,
                len(relations) - len(failed_tables),
                len(failed_tables),
                timer,
            )
        return failed_tables