Python Dataset Beispiele, jx_bigquery.bigquery.Dataset Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: main.py Projekt: mozilla/cia-tasks

    def __init__(self, config):
        self.config = config = wrap(config)
        config.range.min = Date(config.range.min)
        config.range.max = Date(config.range.max)
        config.start = Date(config.start)
        config.interval = Duration(config.interval)
        config.branches = listwrap(config.branches)
        self.destination = bigquery.Dataset(
            config.destination).get_or_create_table(config.destination)

        # CALCULATE THE PREVIOUS RUN
        mozci_version = self.version("mozci")
        prev_done = self.get_state()
        if prev_done and prev_done.mozci_version == mozci_version:
            self.done = Data(
                mozci_version=mozci_version,
                min=Date(coalesce(prev_done.min, config.start, "today-2day")),
                max=Date(coalesce(prev_done.max, config.start, "today-2day")),
            )
        else:
            self.done = Data(
                mozci_version=mozci_version,
                min=Date(coalesce(config.start, "today-2day")),
                max=Date(coalesce(config.start, "today-2day")),
            )
            self.set_state()

Beispiel #2

0

Datei anzeigen

    def __init__(self, config):
        self.config = config = wrap(config)
        config.range.min = Date(config.range.min)
        config.range.max = Date(config.range.max)
        config.start = Date(config.start)
        config.interval = Duration(config.interval)
        config.branches = listwrap(config.branches)
        self.destination = bigquery.Dataset(config.destination).get_or_create_table(
            config.destination
        )

        # CALCULATE THE PREVIOUS RUN
        mozci_version = self.version("mozci")
        self.etl_config_table = jx_sqlite.Container(
            config.config_db
        ).get_or_create_facts("etl-range")
        done_result = wrap(self.etl_config_table.query()).data
        prev_done = done_result[0]
        if len(done_result) and prev_done.mozci_version == mozci_version:
            self.done = Data(
                mozci_version=mozci_version,
                min=Date(coalesce(prev_done.min, config.start, "today-2day")),
                max=Date(coalesce(prev_done.max, config.start, "today-2day")),
            )
        else:
            self.done = Data(
                mozci_version=mozci_version,
                min=Date(coalesce(config.start, "today-2day")),
                max=Date(coalesce(config.start, "today-2day")),
            )
            self.etl_config_table.add(self.done)

Beispiel #3

0

Datei anzeigen

def main():
    since = Date.today() - Duration(SCATTER_RANGE)

    if config.database.host not in listwrap(
            config.analysis.expected_database_host):
        Log.error("Expecting database to be one of {{expected}}",
                  expected=config.analysis.expected_database_host)
    if not config.analysis.interesting:
        Log.alert(
            "Expecting config file to have `analysis.interesting` with a json expression.  All series are included."
        )

    # SETUP DESTINATION
    deviant_summary = bigquery.Dataset(
        config.deviant_summary).get_or_create_table(
            read_only=True, kwargs=config.deviant_summary)

    if config.args.id:
        # EXIT EARLY AFTER WE GOT THE SPECIFIC IDS
        if len(config.args.id) < 4:
            step_detector.SHOW_CHARTS = True
        for signature_hash in config.args.id:
            process(
                signature_hash,
                since=since,
                source=config.database,
                deviant_summary=deviant_summary,
                show=True,
            )
        return

    # DOWNLOAD
    if config.args.download:
        # GET INTERESTING SERIES
        where_clause = BQLang[jx_expression(
            config.analysis.interesting)].to_bq(deviant_summary.schema)

        # GET ALL KNOWN SERIES
        docs = list(
            deviant_summary.sql_query(f"""
                SELECT * EXCEPT (_rank, values) 
                FROM (
                  SELECT 
                    *, 
                    row_number() over (partition by id order by last_updated desc) as _rank 
                  FROM  
                    {quote_column(deviant_summary.full_name)}
                  ) a 
                WHERE _rank=1 and {sql_iso(where_clause)}
                LIMIT {quote_value(DOWNLOAD_LIMIT)}
            """))
        if len(docs) == DOWNLOAD_LIMIT:
            Log.warning("Not all signatures downloaded")
        File(config.args.download).write(list2tab(docs, separator=","))

    # DEVIANT
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": {
                "abs": "overall_dev_score"
            },
            "sort": "desc"
        },
        limit=config.args.deviant,
        show_old=False,
        show_distribution=True,
    )

    # MODAL
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort="overall_dev_score",
        limit=config.args.modal,
        where={"eq": {
            "overall_dev_status": "MODAL"
        }},
        show_distribution=True,
    )

    # OUTLIERS
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": "overall_dev_score",
            "sort": "desc"
        },
        limit=config.args.outliers,
        where={"eq": {
            "overall_dev_status": "OUTLIERS"
        }},
        show_distribution=True,
    )

    # SKEWED
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": {
                "abs": "overall_dev_score"
            },
            "sort": "desc"
        },
        limit=config.args.skewed,
        where={"eq": {
            "overall_dev_status": "SKEWED"
        }},
        show_distribution=True,
    )

    # OK
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": {
                "abs": "overall_dev_score"
            },
            "sort": "desc"
        },
        limit=config.args.ok,
        where={"eq": {
            "overall_dev_status": "OK"
        }},
        show_distribution=True,
    )

    # NOISE
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": {
                "abs": "relative_noise"
            },
            "sort": "desc"
        },
        where={"gte": {
            "num_pushes": 30
        }},
        limit=config.args.noise,
    )

    # EXTRA
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": {
                "abs": "max_extra_diff"
            },
            "sort": "desc"
        },
        where={"lte": {
            "num_new_segments": 7
        }},
        limit=config.args.extra,
    )

    # MISSING
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": {
                "abs": "max_missing_diff"
            },
            "sort": "desc"
        },
        where={"lte": {
            "num_old_segments": 6
        }},
        limit=config.args.missing,
    )

    # PATHOLOGICAL
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": "num_segments",
            "sort": "desc"
        },
        limit=config.args.pathological,
    )

Beispiel #4

0

Datei anzeigen

Datei: extract_jobs.py Projekt: jaskeerat789/treeherder

    def run(self, force=False, restart=False, merge=False):
        # SETUP LOGGING
        settings = startup.read_settings(filename=CONFIG_FILE)
        constants.set(settings.constants)
        Log.start(settings.debug)

        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name, kwargs=settings.destination
        ).get_or_create_table(settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis()
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key, value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, job_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                    last_modified=last_modified,
                    job_id=job_id,
                )

                # Example: job.id ==283890114
                # get_ids = ConcatSQL(
                #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
                # )
                # get_ids = sql_query(
                #     {
                #         "from": "job",
                #         "select": ["id"],
                #         "where": {
                #             "or": [
                #                 {"gt": {"last_modified": parse(last_modified)}},
                #                 {
                #                     "and": [
                #                         {"eq": {"last_modified": parse(last_modified)}},
                #                         {"gt": {"id": job_id}},
                #                     ]
                #                 },
                #             ]
                #         },
                #         "sort": ["last_modified", "id"],
                #         "limit": settings.extractor.chunk_size,
                #     }
                # )

                get_ids = SQL(str(
                    (
                        Job.objects.filter(
                            Q(last_modified__gt=parse(last_modified).datetime)
                            | (
                                Q(last_modified=parse(last_modified).datetime)
                                & Q(id__gt=job_id)
                            )
                        )
                        .annotate()
                        .values("id")
                        .order_by("last_modified", "id")[
                            : settings.extractor.chunk_size
                        ]
                    ).query
                ))

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, job_id = last_doc.last_modified, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, job_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done job extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done job merge")

Beispiel #5

0

Datei anzeigen

Datei: extract_jobs.py Projekt: soumyalahiri/treeherder

    def extract(self, settings, force, restart, start, merge):
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state = redis.get(settings.extractor.key)

            if start:
                state = start, 0
            elif restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, job_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                    last_modified=last_modified,
                    job_id=job_id,
                )

                # Example: job.id ==283890114
                # get_ids = ConcatSQL(
                #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
                # )
                get_ids = sql_query({
                    "from": "job",
                    "select": ["id"],
                    "where": {
                        "or": [
                            {
                                "gt": {
                                    "last_modified": Date(last_modified)
                                }
                            },
                            {
                                "and": [
                                    {
                                        "eq": {
                                            "last_modified":
                                            Date(last_modified)
                                        }
                                    },
                                    {
                                        "gt": {
                                            "id": job_id
                                        }
                                    },
                                ]
                            },
                        ]
                    },
                    "sort": ["last_modified", "id"],
                    "limit": settings.extractor.chunk_size,
                })
                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break

                # SOME LIMITS PLACES ON STRING SIZE
                for fl in jx.drill(acc, "job_log.failure_line"):
                    fl.message = strings.limit(fl.message, 10000)
                for r in acc:
                    r.etl.timestamp = Date.now()
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, job_id = last_doc.last_modified, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, job_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done job extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done job merge")

Beispiel #6

0

Datei anzeigen

    def extract(self, settings, force, restart, merge):
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, alert_id = state
            last_modified = Date(last_modified)

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                    last_modified=last_modified,
                    alert_id=alert_id,
                )
                last_year = Date.today(
                ) - YEAR + DAY  # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY

                get_ids = SQL(
                    "SELECT s.id " +
                    "\nFROM treeherder.performance_alert_summary s" +
                    "\nLEFT JOIN treeherder.performance_alert a ON s.id=a.summary_id"
                    + "\nWHERE s.created>" + quote_value(last_year).sql +
                    " AND (s.last_updated > " +
                    quote_value(last_modified).sql + "\nOR a.last_updated > " +
                    quote_value(last_modified).sql + ")" + "\nGROUP BY s.id" +
                    "\nORDER BY s.id" + "\nLIMIT " +
                    quote_value(settings.extractor.chunk_size).sql)
                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, alert_id = last_doc.created, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, alert_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done alert extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done alert merge")
        Log.stop()

Beispiel #7

0

Datei anzeigen

Datei: extract_perf.py Projekt: soumyalahiri/treeherder

    def extract(self, settings, force, restart, merge):
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = 916850000
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            perf_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note("Extracting perfs for perf.id={{perf_id}}",
                         perf_id=perf_id)

                # get_ids = sql_query(
                #     {
                #         "from": "performance_datum",
                #         "select": ["id"],
                #         "where": {"gt": {"id": perf_id}},
                #         "sort": ["id"],
                #         "limit": settings.extractor.chunk_size,
                #     }
                # )
                get_ids = SQL(
                    str((PerformanceDatum.objects.filter(
                        id__gt=perf_id).values("id").order_by("id")
                         [:settings.extractor.chunk_size]).query))

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break

                # TODO: Remove me July 2021
                # OLD PERF RECORDS HAVE NO CORRESPONDING JOB
                # ADD job.submit_time FOR PARTITIONING
                for a in acc:
                    if not a.job.submit_time:
                        a.job.submit_time = a.push_timestamp
                    a.etl.timestamp = Date.now()
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                perf_id = last_doc.id
                redis.set(settings.extractor.key,
                          value2json(perf_id).encode("utf8"))

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done perf extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done perf merge")

Beispiel #8

0

Datei anzeigen

    def run(self, force=False, restart=False, merge=False):
        # SETUP LOGGING
        settings = startup.read_settings(filename=CONFIG_FILE)
        constants.set(settings.constants)
        Log.start(settings.debug)

        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name, kwargs=settings.destination
        ).get_or_create_table(settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis()
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key, value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, alert_id = state
            last_modified = parse(last_modified)

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                    last_modified=last_modified,
                    alert_id=alert_id,
                )
                last_year = (
                    Date.today() - YEAR + DAY
                )  # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY

                # SELECT
                #     s.od
                # FROM
                #     treeherder.performance_alert_summary s
                # LEFT JOIN
                #     treeherder.performance_alert a ON s.id=a.summary_id
                # WHERE
                #     s.created>{last_year} AND (s.last_updated>{last_modified} OR a.last_updated>{last_modified})
                # GROUP BY
                #     s.id
                # ORDER BY
                #     s.id
                # LIMIT
                #     {settings.extractor.chunk_size}
                get_ids = SQL(
                    str(
                        (
                            PerformanceAlertSummary.objects.filter(
                                Q(created__gt=last_year.datetime)
                                & (
                                    Q(last_updated__gt=last_modified.datetime)
                                    | Q(alerts__last_updated__gt=last_modified.datetime)
                                )
                            )
                            .annotate()
                            .values("id")
                            .order_by("id")[: settings.extractor.chunk_size]
                        ).query
                    )
                )

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, alert_id = last_doc.created, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, alert_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done alert extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done alert merge")
        Log.stop()