Esempio n. 1
0
def test_extract_alert(extract_alert_settings, test_perf_alert_summary,
                       test_perf_alert):
    """
    Extract one alert summary through the snowflake extractor and compare it with ALERT.

    If you find this test failing, then copy the JSON in the test failure into the test_extract_alerts.json file,
    then you may use the diff to review the changes.
    """
    now = datetime.datetime.now()
    source = MySQL(extract_alert_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_alert_settings.source)
    id_query = SQL("SELECT " + text(test_perf_alert_summary.id) + " as id")
    sql = extractor.get_sql(id_query)

    docs = []
    with source.transaction():
        rows = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(rows, docs.append, False)

    # TESTS ARE RUN WITH CURRENT TIMESTAMPS, SO EVERY TIMESTAMP IS OVERWRITTEN WITH THE SAME VALUE
    summary = docs[0]
    summary.created = now
    summary.last_updated = now
    for detail in summary.details:
        detail.created = now
        detail.last_updated = now
        detail.series_signature.last_updated = now

    # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    assertAlmostEqual(docs, ALERT, places=3)
Esempio n. 2
0
    def __init__(self, kwargs=None):
        """
        Normalize the extractor settings, then connect to MySQL and scan the schema.

        :param kwargs: settings object; `exclude`, `exclude_path`, `show_foreign_keys`,
                       `name_relations` and `database` are read from it, and the
                       normalized values are written back onto the same object
        """
        self.settings = kwargs
        settings = self.settings

        # SINGLE-STEP NAMES STAY IN exclude; MULTI-STEP NAMES BECOME TUPLES IN exclude_columns
        excludes = listwrap(settings.exclude)
        settings.exclude = {e for e in excludes if len(split_field(e)) == 1}
        split_excludes = (tuple(split_field(e)) for e in excludes)
        settings.exclude_columns = {p for p in split_excludes if len(p) > 1}
        settings.exclude_path = [
            split_field(path) for path in listwrap(settings.exclude_path)
        ]
        settings.show_foreign_keys = coalesce(settings.show_foreign_keys, True)
        self.name_relations = unwrap(coalesce(settings.name_relations, {}))

        # NOT ASSIGNED A REAL VALUE IN THIS METHOD
        self.all_nested_paths = None
        self.nested_path_to_join = None
        self.columns = None

        with Explanation("scan database", debug=DEBUG):
            self.db = MySQL(**kwargs.database)
            # THE SCHEMA REPORTED BY THE CONNECTION REPLACES THE CONFIGURED ONE
            settings.database.schema = self.db.settings.schema
            with self.db.transaction():
                self._scan_database()

        if not settings.database.schema:
            Log.error("you must provide a `database.schema`")
Esempio n. 3
0
def get_all_signatures(db_config, sql):
    """
    RETURN ALL SIGNATURES FROM PERFHERDER DATABASE

    :param db_config: settings handed to MySQL()
    :param sql: the query to run
    :return: whatever `MySQL.query` returns for `sql`
    """
    connection = MySQL(db_config)
    with connection:
        signatures = connection.query(sql)
        return signatures
Esempio n. 4
0
def test_extract_job(complex_job, extract_job_settings):
    """
    Extract one job through the snowflake extractor and compare it with JOB.

    If you find this test failing, then copy the JSON in the test failure into the test_extract_job.json file,
    then you may use the diff to review the changes.
    """
    source_settings = extract_job_settings.source
    with MySQL(source_settings.database) as source, \
            MySqlSnowflakeExtractor(source_settings) as extractor:
        id_query = SQL("SELECT " + text(complex_job.id) + " as id")
        sql = extractor.get_sql(id_query)

        docs = []
        with source.transaction():
            rows = list(source.query(sql, stream=True, row_tuples=True))
            extractor.construct_docs(rows, docs.append, False)

    # THE GUID IS NEW EACH TIME, SO COPY THE EXPECTED ONE OVER
    job = first(docs)
    job.guid = first(JOB).guid

    expected_guid = first(jx.drill(JOB, "job_log.failure_line.job_guid"))
    for failure_line in jx.drill(job, "job_log.failure_line"):
        failure_line.job_guid = expected_guid

    # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    assertAlmostEqual(docs, JOB, places=4)
Esempio n. 5
0
def test_make_failure_class(failure_class, extract_job_settings):
    """
    TEST I CAN MAKE AN OBJECT IN THE DATABASE
    """
    source = MySQL(extract_job_settings.source.database)
    select_all = SQL("SELECT * from failure_classification")
    with source.transaction():
        rows = source.query(select_all)

    # verify the first failure classification is the one we expect
    first_row = rows[0]
    assert first_row.name == "not classified"
Esempio n. 6
0
def test_make_repository(test_repository, extract_job_settings):
    """
    TEST EXISTING FIXTURE MAKES AN OBJECT IN THE DATABASE
    """
    source = MySQL(extract_job_settings.source.database)
    select_all = SQL("SELECT * from repository")
    with source.transaction():
        rows = source.query(select_all)

    # verify the repository object is the one we expect
    assert rows[0].id == test_repository.id
    assert rows[0].tc_root_url == test_repository.tc_root_url
Esempio n. 7
0
    def run_compare(self, config, id_sql, expected):
        """
        Extract the documents selected by `id_sql` and assert they equal `expected`.

        :param config: extractor settings; `config.database` is used to open the connection
        :param id_sql: SQL text selecting the ids to extract
        :param expected: the documents the extraction must produce
        """
        db = MySQL(**config.database)
        extractor = MySqlSnowflakeExtractor(kwargs=config)
        extraction_sql = extractor.get_sql(SQL(id_sql))

        docs = []
        with db.transaction():
            # MATERIALIZE THE STREAM BEFORE BUILDING DOCUMENTS
            rows = list(db.query(extraction_sql, stream=True, row_tuples=True))
            extractor.construct_docs(rows, docs.append, False)

        # COMPARE IN BOTH DIRECTIONS
        self.assertEqual(docs, expected, "expecting identical")
        self.assertEqual(expected, docs, "expecting identical")
Esempio n. 8
0
    def __init__(self,
                 host,
                 port=3306,
                 username=None,
                 password=None,
                 debug=False,
                 schema=None,
                 preamble=None,
                 readonly=False,
                 kwargs=None):
        """
        Keep the settings and build a MySQL object from them.

        Only `kwargs` is read in this body; `host`, `port`, `username`, `password`,
        `debug`, `schema`, `preamble` and `readonly` are not used directly here.
        NOTE(review): presumably a decorator outside this view folds the named
        parameters into `kwargs` -- confirm, otherwise a call without `kwargs`
        passes None to MySQL().
        """
        # imported at call time rather than at module level
        from jx_mysql.mysql import MySQL

        self.settings = kwargs
        self._db = MySQL(kwargs)
Esempio n. 9
0
 def setUpClass(cls):
     """
     Start logging, drop the `testing` database if possible, then load the test schema file.
     """
     Log.start(settings.debug)
     with Timer("setup database"):
         try:
             with MySQL(schema=None, kwargs=settings.database) as db:
                 db.query("drop database testing")
         except Exception as cause:
             # A FAILED DROP IS IGNORED WHEN THE ERROR MENTIONS "Can't drop database "; ANYTHING ELSE IS REPORTED
             if "Can't drop database " not in cause:
                 Log.warning("problem removing db", cause=cause)
         execute_file(
             "tests/resources/database.sql",
             schema=None,
             kwargs=settings.database,
         )
Esempio n. 10
0
def test_django_cannot_encode_datetime(extract_job_settings):
    """
    DJANGO DOES NOT ENCODE THE DATETIME PROPERLY

    The SQL text rendered from the Django query is expected to fail when sent to MySQL.
    """
    epoch = Date(Date.EPOCH).datetime
    modified_later = Q(last_modified__gt=epoch)
    same_time_higher_id = Q(last_modified=epoch) & Q(id__gt=0)
    queryset = (
        Job.objects.filter(modified_later | same_time_higher_id)
        .annotate()
        .values("id")
        .order_by("last_modified", "id")[:2000]
    )
    get_ids = SQL(str(queryset.query))
    source = MySQL(extract_job_settings.source.database)

    with pytest.raises(Exception), source.transaction():
        list(source.query(get_ids, stream=True, row_tuples=True))
Esempio n. 11
0
def test_django_cannot_encode_datetime_strings(extract_job_settings):
    """
    DJANGO/MYSQL DATETIME MATH WORKS WHEN STRINGS

    NOTE(review): despite the title above, this test passes only when running the
    rendered query raises an exception -- confirm which behaviour is intended.
    """
    epoch_string = Date.EPOCH.format()
    modified_later = Q(last_modified__gt=epoch_string)
    same_time_higher_id = Q(last_modified=epoch_string) & Q(id__gt=0)
    queryset = (
        Job.objects.filter(modified_later | same_time_higher_id)
        .annotate()
        .values("id")
        .order_by("last_modified", "id")[:2000]
    )
    sql_query = SQL(str(queryset.query))
    source = MySQL(extract_job_settings.source.database)

    with pytest.raises(Exception), source.transaction():
        list(source.query(sql_query, stream=True, row_tuples=True))
Esempio n. 12
0
def get_signature(db_config, signature_id):
    """
    Return the first performance_signature row matching `signature_id`.

    :param db_config: settings handed to MySQL()
    :param signature_id: one id, or a list of ids (wrapped with listwrap)
    :return: first() of the query result, rows ordered by last_updated descending
    """
    connection = MySQL(db_config)
    with connection:
        # THE QUERY TEXT IS BUILT INSIDE THE `with`, AS BEFORE
        query = f"""
                SELECT
                    t1.id , 
                    t1.signature_hash, 
                    t1.suite ,
                    t1.test ,
                    UNIX_TIMESTAMP(t1.last_updated) as last_updated,
                    t1.lower_is_better,
                    t1.has_subtests ,
                    t1.alert_threshold,
                    t1.fore_window ,
                    t1.max_back_window,
                    t1.min_back_window ,
                    t1.should_alert, 
                    t1.extra_options ,
                    t1.alert_change_type,
                    t1.measurement_unit,
                    t1.application ,
                    t1.suite_public_name,
                    t1.test_public_name ,
                    t1.tags,
                    t3.option_collection_hash as `option_collection.hash`,
                    t1.framework_id,
                    t4.name AS framework, 
                    t5.platform AS platform, 
                    t6.name AS `repository`
                FROM
                    performance_signature t1
                LEFT JOIN
                    performance_signature AS t2 ON t2.id = t1.parent_signature_id
                LEFT JOIN
                    option_collection AS t3 ON t3.id = t1.option_collection_id
                LEFT JOIN
                    performance_framework AS t4 ON t4.id = t1.framework_id
                LEFT JOIN
                    machine_platform AS t5 ON t5.id = t1.platform_id
                LEFT JOIN
                   repository AS t6 ON t6.id = t1.repository_id
                WHERE
                    t1.id in {quote_list(listwrap(signature_id))}
                ORDER BY 
                    t1.last_updated DESC
            """
        rows = connection.query(query)
        return first(rows)
Esempio n. 13
0
def env_setup():
    """
    Fill in the environment variables the code under test expects to find.

    Three placeholder values are always set. DATABASE_URL is set to each candidate
    URL for which MySQL(host=...) does not raise, so the last such URL wins.
    """
    # These values are not directly accessed during testing, but the code requires that they be present.
    placeholders = {
        "NEW_RELIC_APP_NAME": "testing",
        "BIGQUERY_PRIVATE_KEY_ID": "1",
        "BIGQUERY_PRIVATE_KEY": "1",
    }
    for name, value in placeholders.items():
        os.environ[name] = value

    # THE DOCKER ENV IS DIFFERENT FROM THE DEV ENVIRONMENT
    # NOTE(review): the first URL looks like an obfuscated "user@host" -- confirm the intended address
    candidate_urls = (
        "mysql://[email protected]:3306/test_treeherder",
        "mysql://root@mysql:3306/test_treeherder",
    )
    for url in candidate_urls:
        try:
            MySQL(host=url)
        except Exception:
            # THIS CANDIDATE IS NOT USABLE; TRY THE NEXT ONE
            continue
        os.environ["DATABASE_URL"] = url
Esempio n. 14
0
def test_extract_job(complex_job, extract_job_settings, now):
    """
    Extract one job through the snowflake extractor and compare it with JOB.
    """
    source_settings = extract_job_settings.source
    source = MySQL(source_settings.database)
    extractor = MySqlSnowflakeExtractor(source_settings)
    id_query = SQL("SELECT " + text(complex_job.id) + " as id")
    sql = extractor.get_sql(id_query)

    docs = []
    with source.transaction():
        rows = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(rows, docs.append, False)

    # COPY THE FIXTURE'S guid AND last_modified ONTO THE EXTRACTED DOCUMENT BEFORE COMPARING
    job = docs[0]
    job.guid = complex_job.guid
    job.last_modified = complex_job.last_modified

    # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    assertAlmostEqual(docs, JOB, places=3)
Esempio n. 15
0
def test_extract_alert(extract_alert_settings, test_perf_alert_summary,
                       test_perf_alert):
    """
    Extract one alert summary through the snowflake extractor and compare it with ALERT.

    If you find this test failing, then copy the JSON in the test failure into the test_extract_alerts.json file,
    then you may use the diff to review the changes.
    """
    source_settings = extract_alert_settings.source
    with MySQL(source_settings.database) as source, \
            MySqlSnowflakeExtractor(source_settings) as extractor:
        id_query = SQL("SELECT " + text(test_perf_alert_summary.id) + " as id")
        sql = extractor.get_sql(id_query)

        docs = []
        with source.transaction():
            rows = list(source.query(sql, stream=True, row_tuples=True))
            extractor.construct_docs(rows, docs.append, False)

    # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    assertAlmostEqual(docs, ALERT, places=3)
Esempio n. 16
0
def test_extract_job(complex_job, extract_job_settings, now):
    """
    Extract one job through the snowflake extractor and compare it with JOB.

    If you find this test failing, then copy the JSON in the test failure into the test_extract_job.json file,
    then you may use the diff to review the changes.
    """
    source_settings = extract_job_settings.source
    source = MySQL(source_settings.database)
    extractor = MySqlSnowflakeExtractor(source_settings)
    id_query = SQL("SELECT " + text(complex_job.id) + " as id")
    sql = extractor.get_sql(id_query)

    docs = []
    with source.transaction():
        rows = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(rows, docs.append, False)

    # COPY THE FIXTURE'S guid ONTO THE EXTRACTED DOCUMENT BEFORE COMPARING
    job = docs[0]
    job.guid = complex_job.guid

    # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    assertAlmostEqual(docs, JOB, places=4)
Esempio n. 17
0
    def extract(self, settings, force, restart, start, merge):
        """
        Copy job records from the MySQL source to the BigQuery destination in chunks.

        :param settings: configuration; `extractor` (app_name, key, sql, chunk_size),
                         `source` and `destination` are read from it
        :param force: when truthy, a changed extraction SQL is only logged as a warning
        :param restart: when truthy, the saved position is reset to (0, 0)
        :param start: when truthy, used as the starting last_modified (with job id 0);
                      takes precedence over `restart` and the saved state
        :param merge: when truthy, shards are merged before extraction begins

        Any exception raised during extraction is reported with Log.warning, after
        which shards are merged regardless.
        """
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state = redis.get(settings.extractor.key)

            if start:
                # explicit start point; note it is not written back to redis here
                state = start, 0
            elif restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            # the position is the pair (last_modified, id) of the last job written
            last_modified, job_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            # the SQL generated for a constant id query is compared across runs below
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                    last_modified=last_modified,
                    job_id=job_id,
                )

                # Example: job.id ==283890114
                # get_ids = ConcatSQL(
                #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
                # )
                # next chunk: jobs strictly after (last_modified, job_id), in that sort order
                get_ids = sql_query({
                    "from": "job",
                    "select": ["id"],
                    "where": {
                        "or": [
                            {
                                "gt": {
                                    "last_modified": Date(last_modified)
                                }
                            },
                            {
                                "and": [
                                    {
                                        "eq": {
                                            "last_modified":
                                            Date(last_modified)
                                        }
                                    },
                                    {
                                        "gt": {
                                            "id": job_id
                                        }
                                    },
                                ]
                            },
                        ]
                    },
                    "sort": ["last_modified", "id"],
                    "limit": settings.extractor.chunk_size,
                })
                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break

                # SOME LIMITS PLACED ON STRING SIZE
                for fl in jx.drill(acc, "job_log.failure_line"):
                    fl.message = strings.limit(fl.message, 10000)
                # stamp every record with the time of this extraction
                for r in acc:
                    r.etl.timestamp = Date.now()
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, job_id = last_doc.last_modified, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, job_id)).encode("utf8"),
                )

                # a short chunk means there is nothing more to read
                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done job extraction")

        # merge runs whether or not extraction succeeded
        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done job merge")
Esempio n. 18
0
    def run(self, force=False, restart=False, merge=False):
        """
        Copy performance alert summaries from the MySQL source to the BigQuery destination in chunks.

        Settings are read from CONFIG_FILE; logging is started here and stopped at the end.

        :param force: when truthy, a changed extraction SQL is only logged as a warning
        :param restart: when truthy, the saved position is reset to (0, 0)
        :param merge: when truthy, shards are merged before extraction begins

        Any exception raised during extraction is reported with Log.warning, after
        which shards are merged regardless.
        """
        # SETUP LOGGING
        settings = startup.read_settings(filename=CONFIG_FILE)
        constants.set(settings.constants)
        Log.start(settings.debug)

        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name, kwargs=settings.destination
        ).get_or_create_table(settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis()
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key, value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, alert_id = state
            # converted once here; `.datetime` is read from the result in the query below
            last_modified = parse(last_modified)

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            # the SQL generated for a constant id query is compared across runs below
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                    last_modified=last_modified,
                    alert_id=alert_id,
                )
                last_year = (
                    Date.today() - YEAR + DAY
                )  # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY

                # SELECT
                #     s.id
                # FROM
                #     treeherder.performance_alert_summary s
                # LEFT JOIN
                #     treeherder.performance_alert a ON s.id=a.summary_id
                # WHERE
                #     s.created>{last_year} AND (s.last_updated>{last_modified} OR a.last_updated>{last_modified})
                # GROUP BY
                #     s.id
                # ORDER BY
                #     s.id
                # LIMIT
                #     {settings.extractor.chunk_size}
                # NOTE(review): alert_id is tracked and saved but is not part of this filter -- confirm intended
                get_ids = SQL(
                    str(
                        (
                            PerformanceAlertSummary.objects.filter(
                                Q(created__gt=last_year.datetime)
                                & (
                                    Q(last_updated__gt=last_modified.datetime)
                                    | Q(alerts__last_updated__gt=last_modified.datetime)
                                )
                            )
                            .annotate()
                            .values("id")
                            .order_by("id")[: settings.extractor.chunk_size]
                        ).query
                    )
                )

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                # NOTE(review): last_modified is rebound to last_doc.created without parse(), while the
                # filter above compares last_updated and reads last_modified.datetime -- confirm this
                # value supports `.datetime` and that `created` is the intended watermark
                last_modified, alert_id = last_doc.created, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, alert_id)).encode("utf8"),
                )

                # a short chunk means there is nothing more to read
                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done alert extraction")

        # merge runs whether or not extraction succeeded
        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done alert merge")
        Log.stop()
Esempio n. 19
0
def test_make_job(complex_job, extract_job_settings):
    """
    The fixtures requested by this test are expected to leave four rows in job_detail.
    """
    source = MySQL(extract_job_settings.source.database)
    count_query = SQL("SELECT count(1) as num from job_detail")
    with source.transaction():
        rows = source.query(count_query)

    first_row = rows[0]
    assert first_row.num == 4
Esempio n. 20
0
def get_dataum(db_config, signature_id, since, limit):
    """
    Return performance_datum rows for the given signature(s), newest push first.

    :param db_config: settings handed to MySQL()
    :param signature_id: one id, or a list of ids (wrapped with listwrap)
    :param since: only rows whose push time is greater than this are returned
    :param limit: the query asks for `limit + 1` rows
    :return: whatever `MySQL.query` returns for the statement
    """
    connection = MySQL(db_config)
    with connection:
        # THE STATEMENT IS BUILT INSIDE THE `with`, AS BEFORE
        statement = SQL(f"""
        SELECT
            d.id,
            d.value,
            t3.id AS `job.id`,
            t3.guid AS `job.guid`,
            p.revision AS `push.revision`,
            UNIX_TIMESTAMP(p.time) AS `push.time`,
            r.name AS `push.repository`,
            s.created AS `summary.created`,
            s.status AS `summary.status`,
            s.bug_number AS `summary.bug_number`,
            s.manually_created AS `summary.manually_created`,
            s.prev_push_id AS `summary.prev_push`,
            s.issue_tracker_id AS `summary.issue_tracker`,
            s.notes AS `summary.notes`,
            s.first_triaged AS `summary.first_triaged`,
            s.last_updated AS `summary.last_updated`,
            s.bug_updated AS `summary.bug_updated`,
            a.id AS `alert.id`,
            a.`is_regression` AS `alert.isregression`,
            a.`status` AS `alert.status`,
            a.`amount_pct` AS `alert.amount_pct`,
            a.`amount_abs` AS `alert.amount_abs`,
            a.`prev_value` AS `alert.prev_value`,
            a.`new_value` AS `alert.new_value`,
            a.`t_value` AS `alert.t_value`,
            a.`manually_created` AS `alert.manually_created`,
            a.`classifier_id` AS `alert.classifier_id`,
            a.`starred` AS `alert.starred`,
            a.`created` AS `alert.created`,
            a.`first_triaged` AS `alert.first_triaged`,
            a.`last_updated` AS `alert.last_updated`
        FROM
            performance_datum AS d
        JOIN 
            performance_signature sig on sig.id=d.signature_id
        LEFT JOIN
            job AS t3 ON t3.id = d.job_id
        LEFT JOIN
            push AS p ON p.id = d.push_id
        LEFT JOIN
            repository AS r ON r.id = p.repository_id
        LEFT JOIN
            performance_alert_summary s on s.repository_id = p.repository_id and s.push_id=p.id
        LEFT JOIN
            performance_alert a 
        ON 
            a.summary_id = s.id AND 
            a.series_signature_id = d.signature_id AND 
            a.manually_created=0
        WHERE
            p.time > {quote_value(since)} AND
            d.signature_id in {quote_list(listwrap(signature_id))}
        ORDER BY
            p.time DESC
        LIMIT
            {quote_value(limit + 1)}
        """)
        rows = connection.query(statement)
        return rows
Esempio n. 21
0
    def extract(self, settings, force, restart, merge):
        """
        Copy performance_datum records from the MySQL source to the BigQuery destination in chunks.

        :param settings: configuration; `extractor` (app_name, key, sql, chunk_size),
                         `source` and `destination` are read from it
        :param force: when truthy, a changed extraction SQL is only logged as a warning
        :param restart: when truthy, the saved position is reset to the hard-coded starting id
        :param merge: when truthy, shards are merged before extraction begins

        Any exception raised during extraction is reported with Log.warning, after
        which shards are merged regardless.
        """
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state = redis.get(settings.extractor.key)

            if restart or not state:
                # NOTE(review): hard-coded starting id; why this value was chosen is not visible here
                state = 916850000
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            # the position is the id of the last performance_datum written
            perf_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            # the SQL generated for a constant id query is compared across runs below
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note("Extracting perfs for perf.id={{perf_id}}",
                         perf_id=perf_id)

                # get_ids = sql_query(
                #     {
                #         "from": "performance_datum",
                #         "select": ["id"],
                #         "where": {"gt": {"id": perf_id}},
                #         "sort": ["id"],
                #         "limit": settings.extractor.chunk_size,
                #     }
                # )
                # next chunk: ids greater than perf_id, ascending, rendered to SQL text by Django
                get_ids = SQL(
                    str((PerformanceDatum.objects.filter(
                        id__gt=perf_id).values("id").order_by("id")
                         [:settings.extractor.chunk_size]).query))

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break

                # TODO: Remove me July 2021
                # OLD PERF RECORDS HAVE NO CORRESPONDING JOB
                # ADD job.submit_time FOR PARTITIONING
                for a in acc:
                    if not a.job.submit_time:
                        a.job.submit_time = a.push_timestamp
                    a.etl.timestamp = Date.now()
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                perf_id = last_doc.id
                redis.set(settings.extractor.key,
                          value2json(perf_id).encode("utf8"))

                # a short chunk means there is nothing more to read
                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done perf extraction")

        # merge runs whether or not extraction succeeded
        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done perf merge")
Esempio n. 22
0
    def run(self, force=False, restart=False, merge=False):
        """
        Copy job records from the MySQL source to the BigQuery destination in chunks.

        Settings are read from CONFIG_FILE and logging is started here.
        NOTE(review): unlike Log.start, no Log.stop call appears in this method -- confirm intended.

        :param force: when truthy, a changed extraction SQL is only logged as a warning
        :param restart: when truthy, the saved position is reset to (0, 0)
        :param merge: when truthy, shards are merged before extraction begins

        Any exception raised during extraction is reported with Log.warning, after
        which shards are merged regardless.
        """
        # SETUP LOGGING
        settings = startup.read_settings(filename=CONFIG_FILE)
        constants.set(settings.constants)
        Log.start(settings.debug)

        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name, kwargs=settings.destination
        ).get_or_create_table(settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis()
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key, value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            # the position is the pair (last_modified, id) of the last job written
            last_modified, job_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            # the SQL generated for a constant id query is compared across runs below
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                    last_modified=last_modified,
                    job_id=job_id,
                )

                # Example: job.id ==283890114
                # get_ids = ConcatSQL(
                #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
                # )
                # get_ids = sql_query(
                #     {
                #         "from": "job",
                #         "select": ["id"],
                #         "where": {
                #             "or": [
                #                 {"gt": {"last_modified": parse(last_modified)}},
                #                 {
                #                     "and": [
                #                         {"eq": {"last_modified": parse(last_modified)}},
                #                         {"gt": {"id": job_id}},
                #                     ]
                #                 },
                #             ]
                #         },
                #         "sort": ["last_modified", "id"],
                #         "limit": settings.extractor.chunk_size,
                #     }
                # )

                # next chunk: jobs strictly after (last_modified, job_id), rendered to SQL text by Django
                get_ids = SQL(str(
                    (
                        Job.objects.filter(
                            Q(last_modified__gt=parse(last_modified).datetime)
                            | (
                                Q(last_modified=parse(last_modified).datetime)
                                & Q(id__gt=job_id)
                            )
                        )
                        .annotate()
                        .values("id")
                        .order_by("last_modified", "id")[
                            : settings.extractor.chunk_size
                        ]
                    ).query
                ))

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, job_id = last_doc.last_modified, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, job_id)).encode("utf8"),
                )

                # a short chunk means there is nothing more to read
                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done job extraction")

        # merge runs whether or not extraction succeeded
        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done job merge")
Esempio n. 23
0
    def extract(self, settings, force, restart, merge):
        """
        Copy alert summaries from the MySQL source to the BigQuery destination,
        one chunk at a time.

        The position reached is stored in Redis under `settings.extractor.key`
        and is read back at the start of a run; the generated extraction SQL is
        stored under `settings.extractor.sql` and compared with the previous
        run to notice schema changes.

        Problems raised during extraction or during the final merge are logged
        as warnings and are not propagated to the caller.

        :param settings: configuration; this method reads `extractor.app_name`,
            `extractor.key`, `extractor.sql`, `extractor.chunk_size`, `source`,
            `source.database` and `destination`
        :param force: when truthy, a changed schema is only logged as a warning
            (otherwise it is reported through `Log.error`)
        :param restart: when truthy, ignore any saved state and start from (0, 0)
        :param merge: when truthy, merge the destination shards before extracting
        """
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        dataset = bigquery.Dataset(
            dataset=settings.extractor.app_name, kwargs=settings.destination
        )
        destination = dataset.get_or_create_table(settings.destination)

        try:
            # OPTIONAL MERGE BEFORE ANY NEW RECORDS ARE ADDED
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state_key = settings.extractor.key
            saved = redis.get(state_key)

            if saved and not restart:
                checkpoint = json2value(saved.decode("utf8"))
            else:
                # NO USABLE STATE (OR RESTART REQUESTED): BEGIN AT THE START
                checkpoint = (0, 0)
                redis.set(state_key, value2json(checkpoint).encode("utf8"))

            since, alert_id = checkpoint
            last_modified = Date(since)

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            previous_sql = redis.get(settings.extractor.sql)
            if previous_sql and previous_sql.decode("utf8") != canonical_sql.sql:
                if not force:
                    Log.error("Schema has changed")
                else:
                    Log.warning("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)
            chunk_size = settings.extractor.chunk_size

            while True:
                Log.note(
                    "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                    last_modified=last_modified,
                    alert_id=alert_id,
                )
                # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY
                youngest_allowed = Date.today() - YEAR + DAY
                since_sql = quote_value(last_modified).sql

                # IDS OF SUMMARIES THAT (OR WHOSE ALERTS) CHANGED AFTER last_modified
                get_ids = SQL("".join([
                    "SELECT s.id ",
                    "\nFROM treeherder.performance_alert_summary s",
                    "\nLEFT JOIN treeherder.performance_alert a ON s.id=a.summary_id",
                    "\nWHERE s.created>",
                    quote_value(youngest_allowed).sql,
                    " AND (s.last_updated > ",
                    since_sql,
                    "\nOR a.last_updated > ",
                    since_sql,
                    ")",
                    "\nGROUP BY s.id",
                    "\nORDER BY s.id",
                    "\nLIMIT ",
                    quote_value(chunk_size).sql,
                ]))
                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                docs = []
                with source.transaction():
                    rows = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(rows, docs.append, False)
                if not docs:
                    # NOTHING LEFT TO EXTRACT
                    break
                destination.extend(docs)

                # RECORD THE STATE
                # NOTE(review): the checkpoint uses `created` of the last document,
                # while the query above filters on `last_updated` - confirm intended
                newest = docs[-1]
                last_modified, alert_id = newest.created, newest.id
                redis.set(
                    state_key,
                    value2json((last_modified, alert_id)).encode("utf8"),
                )

                # A SHORT CHUNK MEANS THE SOURCE HAS BEEN EXHAUSTED
                if len(docs) < chunk_size:
                    break

        except Exception as cause:
            Log.warning("problem with extraction", cause=cause)

        Log.note("done alert extraction")

        # ALWAYS ATTEMPT A MERGE, EVEN WHEN THE EXTRACTION ABOVE FAILED
        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as cause:
            Log.warning("problem with merge", cause=cause)

        Log.note("done alert merge")
        Log.stop()