Beispiel #1
0
def run(
    ctx: click.Context,
    config: str,
    dry_run: bool,
    preview: bool,
    strict_warnings: bool,
    preview_workunits: int,
) -> None:
    """Ingest metadata into DataHub."""

    logger.info("DataHub CLI version: %s", datahub_package.nice_version_name())

    pipeline_config = load_config_file(pathlib.Path(config))

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(
            pipeline_config, dry_run, preview, preview_workunits
        )
    except ValidationError as e:
        # Config validation errors are user-facing: print them and fail.
        click.echo(e, err=True)
        sys.exit(1)
    except Exception as e:
        # The raw config may embed credentials, so wrap the original error to
        # keep variable-level detail out of the logs.
        raise SensitiveError() from e

    logger.info("Starting metadata ingestion")
    pipeline.run()
    logger.info("Finished metadata ingestion")
    exit_code = pipeline.pretty_print_summary(warnings_as_failure=strict_warnings)
    pipeline.log_ingestion_stats()
    sys.exit(exit_code)
Beispiel #2
0
class SourceReport(Report):
    # Counters / ids for the units emitted by the source.
    workunits_produced: int = 0
    workunit_ids: List[str] = field(default_factory=list)

    # Warnings and failures grouped by entity key.
    warnings: Dict[str, List[str]] = field(default_factory=dict)
    failures: Dict[str, List[str]] = field(default_factory=dict)
    # Environment snapshot captured for diagnostics.
    cli_version: str = datahub.nice_version_name()
    cli_entry_location: str = datahub.__file__
    py_version: str = sys.version
    py_exec_path: str = sys.executable
    os_details: str = platform.platform()

    def report_workunit(self, wu: WorkUnit) -> None:
        """Record one emitted workunit and remember its id."""
        self.workunit_ids.append(wu.id)
        self.workunits_produced += 1

    def report_warning(self, key: str, reason: str) -> None:
        """Append a warning reason under the given entity key."""
        self.warnings.setdefault(key, []).append(reason)

    def report_failure(self, key: str, reason: str) -> None:
        """Append a failure reason under the given entity key."""
        self.failures.setdefault(key, []).append(reason)
Beispiel #3
0
    def __init__(self):
        # First run: mint a fresh client id and persist it; on subsequent
        # runs reuse whatever is already stored on disk.
        if CONFIG_FILE.exists():
            self.load_config()
        else:
            self.client_id = str(uuid.uuid4())
            self.update_config()

        # Push updated user-level properties to Mixpanel (best effort).
        self.mp = None
        if self.enabled:
            try:
                consumer = Consumer(request_timeout=int(TIMEOUT))
                self.mp = Mixpanel(MIXPANEL_TOKEN, consumer=consumer)
                self.mp.people_set(
                    self.client_id,
                    {
                        "datahub_version": datahub_package.nice_version_name(),
                        "os": platform.system(),
                        "python_version": platform.python_version(),
                    },
                )
            except Exception as e:
                # Telemetry is never allowed to break the CLI.
                logger.debug(f"Error connecting to mixpanel: {e}")
Beispiel #4
0
    def ping(
        self,
        category: str,
        action: str,
        label: Optional[str] = None,
        value: Optional[int] = None,
    ) -> None:
        """
        Ping Google Analytics with a single event.

        Args:
            category (str): category for the event
            action (str): action taken
            label (Optional[str], optional): label for the event
            value (Optional[int], optional): value for the event
        """

        if not self.enabled:
            return

        req_url = "https://www.google-analytics.com/collect"

        params: Dict[str, Union[str, int]] = {
            "an": "datahub-cli",  # app name
            "av": datahub_package.nice_version_name(),  # app version
            "t": "event",  # event type
            "v": GA_VERSION,  # Google Analytics version
            "tid": GA_TID,  # tracking id
            "cid": self.client_id,  # client id
            "ec": category,  # event category
            "ea": action,  # event action
            # use custom dimensions to capture OS and Python version
            # see https://developers.google.com/analytics/devguides/collection/protocol/v1/parameters#cd_
            "cd1": platform.system(),  # OS
            "cd2": platform.python_version(),  # Python version
        }

        if label:
            params["el"] = label

        # This has to be a non-negative int, otherwise the request will fail.
        # BUG FIX: the previous truthiness check (`if value:`) silently dropped
        # a legitimate value of 0; explicitly compare against None instead.
        if value is not None:
            params["ev"] = value

        try:
            # Bound the request so a slow/unreachable GA endpoint can never
            # hang the CLI; failures are swallowed below as best-effort.
            requests.post(
                req_url,
                data=params,
                headers={
                    "user-agent":
                    f"datahub {datahub_package.nice_version_name()}"
                },
                timeout=5,
            )
        except Exception as e:
            logger.debug(f"Error reporting telemetry: {e}")
Beispiel #5
0
def run(
    ctx: click.Context,
    config: str,
    dry_run: bool,
    preview: bool,
    strict_warnings: bool,
    preview_workunits: int,
    suppress_error_logs: bool,
) -> None:
    """Ingest metadata into DataHub."""

    logger.info("DataHub CLI version: %s", datahub_package.nice_version_name())

    pipeline_config = load_config_file(pathlib.Path(config))

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(
            pipeline_config, dry_run, preview, preview_workunits
        )
    except ValidationError as e:
        # Config validation errors are user-facing: print them and fail.
        click.echo(e, err=True)
        sys.exit(1)
    except Exception as e:
        # The pipeline_config may contain sensitive information, so we wrap the exception
        # in a SensitiveError to prevent detailed variable-level information from being logged.
        raise SensitiveError() from e

    logger.info("Starting metadata ingestion")
    try:
        pipeline.run()
    except Exception as e:
        # Surface the per-component reports before deciding how to re-raise.
        source_report = pipeline.source.get_report().as_string()
        sink_report = pipeline.sink.get_report().as_string()
        logger.info(
            f"Source ({pipeline.config.source.type}) report:\n{source_report}"
        )
        logger.info(f"Sink ({pipeline.config.sink.type}) report:\n{sink_report}")
        # With --suppress-error-logs, wrap the failure so sensitive variable
        # state stays out of the traceback; otherwise propagate unchanged.
        if not suppress_error_logs:
            raise e
        raise SensitiveError() from e
    else:
        logger.info("Finished metadata pipeline")
        pipeline.log_ingestion_stats()
        exit_code = pipeline.pretty_print_summary(warnings_as_failure=strict_warnings)
        sys.exit(exit_code)
Beispiel #6
0
    def init_tracking(self) -> None:
        """Send the one-time "init" user-properties update to Mixpanel.

        No-op when telemetry is disabled, when the Mixpanel client was never
        created, or when the init ping has already been sent.
        """
        if not self.enabled or self.mp is None or self.tracking_init is True:
            return

        logger.debug("Sending init Telemetry")
        try:
            self.mp.people_set(
                self.client_id,
                {
                    "datahub_version": datahub_package.nice_version_name(),
                    "os": platform.system(),
                    "python_version": platform.python_version(),
                },
            )
        except Exception as e:
            # Telemetry failures are best-effort only; never raise.
            logger.debug(f"Error reporting telemetry: {e}")
        # BUG FIX: previously this wrote `self.init_track`, an attribute the
        # guard above never reads, so the init event was re-sent on every
        # call. Write the flag the guard actually checks.
        self.tracking_init = True
def run(config: str) -> None:
    """Ingest metadata into DataHub."""
    logger.debug("DataHub CLI version: %s",
                 datahub_package.nice_version_name())

    pipeline_config = load_config_file(pathlib.Path(config))

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config)
    except ValidationError as e:
        # Config validation errors are user-facing: print them and fail.
        click.echo(e, err=True)
        sys.exit(1)

    logger.info("Starting metadata ingestion")
    pipeline.run()
    logger.info("Finished metadata ingestion")
    # Summary return value doubles as the process exit status.
    sys.exit(pipeline.pretty_print_summary())
Beispiel #8
0
def run(config: str, dry_run: bool, preview: bool,
        strict_warnings: bool) -> None:
    """Ingest metadata into DataHub."""

    logger.debug("DataHub CLI version: %s",
                 datahub_package.nice_version_name())

    pipeline_config = load_config_file(pathlib.Path(config))

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config, dry_run, preview)
    except ValidationError as e:
        # Config validation errors are user-facing: print them and fail.
        click.echo(e, err=True)
        sys.exit(1)

    logger.info("Starting metadata ingestion")
    pipeline.run()
    logger.info("Finished metadata ingestion")
    exit_code = pipeline.pretty_print_summary(warnings_as_failure=strict_warnings)
    pipeline.log_ingestion_stats()
    sys.exit(exit_code)
# Configure logger.
# Format includes timestamp, level, logger name and line number for traceability.
BASE_LOGGING_FORMAT = (
    "[%(asctime)s] %(levelname)-8s {%(name)s:%(lineno)d} - %(message)s")
logging.basicConfig(format=BASE_LOGGING_FORMAT)

# Width cap passed to click so help text is not truncated on wide terminals.
MAX_CONTENT_WIDTH = 120


@click.group(context_settings=dict(
    # Avoid truncation of help text.
    # See https://github.com/pallets/click/issues/486.
    max_content_width=MAX_CONTENT_WIDTH, ))
@click.option("--debug/--no-debug", default=False)
@click.version_option(
    version=datahub_package.nice_version_name(),
    prog_name=datahub_package.__package_name__,
)
def datahub(debug: bool) -> None:
    """Root CLI group: set log verbosity before dispatching to subcommands."""
    # NOTE(review): os.getenv returns a string, so ANY non-empty value of
    # DATAHUB_DEBUG (even "false") enables debug logging — confirm intended.
    if debug or os.getenv("DATAHUB_DEBUG", False):
        logging.getLogger().setLevel(logging.INFO)
        logging.getLogger("datahub").setLevel(logging.DEBUG)
    else:
        # Quiet third-party loggers; keep datahub's own logger at INFO.
        logging.getLogger().setLevel(logging.WARNING)
        logging.getLogger("datahub").setLevel(logging.INFO)


@datahub.command()
def version() -> None: