def get_all_signatures(db_config, sql):
    """
    RETURN ALL SIGNATURES FROM PERFHERDER DATABASE
    """
    db = MySQL(db_config)
    with db:
        return db.query(sql)
Exemple #2
0
def test_extract_alert(extract_alert_settings, test_perf_alert_summary,
                       test_perf_alert):
    """
    If you find this test failing, then copy the JSON in the test failure into the test_extract_alerts.json file,
    then you may use the diff to review the changes.
    """
    now = datetime.datetime.now()
    source = MySQL(extract_alert_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_alert_settings.source)
    sql = extractor.get_sql(
        SQL("SELECT " + text(test_perf_alert_summary.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    # TEST ARE RUN WITH CURRENT TIMESTAMPS
    doc.created = now
    doc.last_updated = now
    for d in doc.details:
        d.created = now
        d.last_updated = now
        d.series_signature.last_updated = now

    assertAlmostEqual(
        acc, ALERT, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
Exemple #3
0
def test_make_failure_class(failure_class, extract_job_settings):
    # TEST I CAN MAKE AN OBJECT IN THE DATABASE
    source = MySQL(extract_job_settings.source.database)
    with source.transaction():
        result = source.query(SQL("SELECT * from failure_classification"))

    # verify the repository object is the one we expect
    assert result[0].name == "not classified"
Exemple #4
0
def test_make_repository(test_repository, extract_job_settings):
    # TEST EXISTING FIXTURE MAKES AN OBJECT IN THE DATABASE
    source = MySQL(extract_job_settings.source.database)
    with source.transaction():
        result = source.query(SQL("SELECT * from repository"))

    # verify the repository object is the one we expect
    assert result[0].id == test_repository.id
    assert result[0].tc_root_url == test_repository.tc_root_url
Exemple #5
0
    def run_compare(self, config, id_sql, expected):
        db = MySQL(**config.database)
        extractor = MySqlSnowflakeExtractor(kwargs=config)

        sql = extractor.get_sql(SQL(id_sql))

        result = []
        with db.transaction():
            cursor = db.query(sql, stream=True, row_tuples=True)
            cursor = list(cursor)
            extractor.construct_docs(cursor, result.append, False)

        self.assertEqual(result, expected, "expecting identical")
        self.assertEqual(expected, result, "expecting identical")
Exemple #6
0
def test_django_cannot_encode_datetime(extract_job_settings):
    """
    DJANGO DOES NOT ENCODE THE DATETIME PROPERLY
    """
    epoch = Date(Date.EPOCH).datetime
    get_ids = SQL(
        str((Job.objects.filter(
            Q(last_modified__gt=epoch)
            | (Q(last_modified=epoch)
               & Q(id__gt=0))).annotate().values("id").order_by(
                   "last_modified", "id")[:2000]).query))
    source = MySQL(extract_job_settings.source.database)

    with pytest.raises(Exception):
        with source.transaction():
            list(source.query(get_ids, stream=True, row_tuples=True))
Exemple #7
0
def test_django_cannot_encode_datetime_strings(extract_job_settings):
    """
    DJANGO/MYSQL DATETIME MATH WORKS WHEN STRINGS
    """
    epoch_string = Date.EPOCH.format()
    sql_query = SQL(
        str((Job.objects.filter(
            Q(last_modified__gt=epoch_string)
            | (Q(last_modified=epoch_string)
               & Q(id__gt=0))).annotate().values("id").order_by(
                   "last_modified", "id")[:2000]).query))
    source = MySQL(extract_job_settings.source.database)

    with pytest.raises(Exception):
        with source.transaction():
            list(source.query(sql_query, stream=True, row_tuples=True))
def get_signature(db_config, signature_id):
    db = MySQL(db_config)
    with db:
        return first(
            db.query(f"""
                SELECT
                    t1.id , 
                    t1.signature_hash, 
                    t1.suite ,
                    t1.test ,
                    UNIX_TIMESTAMP(t1.last_updated) as last_updated,
                    t1.lower_is_better,
                    t1.has_subtests ,
                    t1.alert_threshold,
                    t1.fore_window ,
                    t1.max_back_window,
                    t1.min_back_window ,
                    t1.should_alert, 
                    t1.extra_options ,
                    t1.alert_change_type,
                    t1.measurement_unit,
                    t1.application ,
                    t1.suite_public_name,
                    t1.test_public_name ,
                    t1.tags,
                    t3.option_collection_hash as `option_collection.hash`,
                    t1.framework_id,
                    t4.name AS framework, 
                    t5.platform AS platform, 
                    t6.name AS `repository`
                FROM
                    performance_signature t1
                LEFT JOIN
                    performance_signature AS t2 ON t2.id = t1.parent_signature_id
                LEFT JOIN
                    option_collection AS t3 ON t3.id = t1.option_collection_id
                LEFT JOIN
                    performance_framework AS t4 ON t4.id = t1.framework_id
                LEFT JOIN
                    machine_platform AS t5 ON t5.id = t1.platform_id
                LEFT JOIN
                   repository AS t6 ON t6.id = t1.repository_id
                WHERE
                    t1.id in {quote_list(listwrap(signature_id))}
                ORDER BY 
                    t1.last_updated DESC
            """))
Exemple #9
0
def test_extract_job(complex_job, extract_job_settings, now):
    source = MySQL(extract_job_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_job_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    doc.guid = complex_job.guid
    doc.last_modified = complex_job.last_modified

    assertAlmostEqual(
        acc, JOB, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
Exemple #10
0
def test_extract_job(complex_job, extract_job_settings, now):
    """
    If you find this test failing, then copy the JSON in the test failure into the test_extract_job.json file,
    then you may use the diff to review the changes.
    """
    source = MySQL(extract_job_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_job_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    doc.guid = complex_job.guid

    assertAlmostEqual(
        acc,
        JOB,
        places=
        4,  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    )
Exemple #11
0
def get_dataum(db_config, signature_id, since, limit):
    db = MySQL(db_config)
    with db:
        return db.query(
            SQL(f"""
        SELECT
            d.id,
            d.value,
            t3.id AS `job.id`,
            t3.guid AS `job.guid`,
            p.revision AS `push.revision`,
            UNIX_TIMESTAMP(p.time) AS `push.time`,
            r.name AS `push.repository`,
            s.created AS `summary.created`,
            s.status AS `summary.status`,
            s.bug_number AS `summary.bug_number`,
            s.manually_created AS `summary.manually_created`,
            s.prev_push_id AS `summary.prev_push`,
            s.issue_tracker_id AS `summary.issue_tracker`,
            s.notes AS `summary.notes`,
            s.first_triaged AS `summary.first_triaged`,
            s.last_updated AS `summary.last_updated`,
            s.bug_updated AS `summary.bug_updated`,
            a.id AS `alert.id`,
            a.`is_regression` AS `alert.isregression`,
            a.`status` AS `alert.status`,
            a.`amount_pct` AS `alert.amount_pct`,
            a.`amount_abs` AS `alert.amount_abs`,
            a.`prev_value` AS `alert.prev_value`,
            a.`new_value` AS `alert.new_value`,
            a.`t_value` AS `alert.t_value`,
            a.`manually_created` AS `alert.manually_created`,
            a.`classifier_id` AS `alert.classifier_id`,
            a.`starred` AS `alert.starred`,
            a.`created` AS `alert.created`,
            a.`first_triaged` AS `alert.first_triaged`,
            a.`last_updated` AS `alert.last_updated`
        FROM
            performance_datum AS d
        JOIN 
            performance_signature sig on sig.id=d.signature_id
        LEFT JOIN
            job AS t3 ON t3.id = d.job_id
        LEFT JOIN
            push AS p ON p.id = d.push_id
        LEFT JOIN
            repository AS r ON r.id = p.repository_id
        LEFT JOIN
            performance_alert_summary s on s.repository_id = p.repository_id and s.push_id=p.id
        LEFT JOIN
            performance_alert a 
        ON 
            a.summary_id = s.id AND 
            a.series_signature_id = d.signature_id AND 
            a.manually_created=0
        WHERE
            p.time > {quote_value(since)} AND
            d.signature_id in {quote_list(listwrap(signature_id))}
        ORDER BY
            p.time DESC
        LIMIT
            {quote_value(limit + 1)}
        """))
Exemple #12
0
def test_make_job(complex_job, extract_job_settings):
    source = MySQL(extract_job_settings.source.database)
    with source.transaction():
        result = source.query(SQL("SELECT count(1) as num from job_detail"))

    assert result[0].num == 4
    def extract(self, settings, force, restart, merge):
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = 916850000
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            perf_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note("Extracting perfs for perf.id={{perf_id}}",
                         perf_id=perf_id)

                # get_ids = sql_query(
                #     {
                #         "from": "performance_datum",
                #         "select": ["id"],
                #         "where": {"gt": {"id": perf_id}},
                #         "sort": ["id"],
                #         "limit": settings.extractor.chunk_size,
                #     }
                # )
                get_ids = SQL(
                    str((PerformanceDatum.objects.filter(
                        id__gt=perf_id).values("id").order_by("id")
                         [:settings.extractor.chunk_size]).query))

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break

                # TODO: Remove me July 2021
                # OLD PERF RECORDS HAVE NO CORRESPONDING JOB
                # ADD job.submit_time FOR PARTITIONING
                for a in acc:
                    if not a.job.submit_time:
                        a.job.submit_time = a.push_timestamp
                    a.etl.timestamp = Date.now()
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                perf_id = last_doc.id
                redis.set(settings.extractor.key,
                          value2json(perf_id).encode("utf8"))

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done perf extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done perf merge")
Exemple #14
0
    def extract(self, settings, force, restart, merge):
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, alert_id = state
            last_modified = Date(last_modified)

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                    last_modified=last_modified,
                    alert_id=alert_id,
                )
                last_year = Date.today(
                ) - YEAR + DAY  # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY

                get_ids = SQL(
                    "SELECT s.id " +
                    "\nFROM treeherder.performance_alert_summary s" +
                    "\nLEFT JOIN treeherder.performance_alert a ON s.id=a.summary_id"
                    + "\nWHERE s.created>" + quote_value(last_year).sql +
                    " AND (s.last_updated > " +
                    quote_value(last_modified).sql + "\nOR a.last_updated > " +
                    quote_value(last_modified).sql + ")" + "\nGROUP BY s.id" +
                    "\nORDER BY s.id" + "\nLIMIT " +
                    quote_value(settings.extractor.chunk_size).sql)
                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, alert_id = last_doc.created, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, alert_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done alert extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done alert merge")
        Log.stop()
    def extract(self, settings, force, restart, start, merge):
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state = redis.get(settings.extractor.key)

            if start:
                state = start, 0
            elif restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, job_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                    last_modified=last_modified,
                    job_id=job_id,
                )

                # Example: job.id ==283890114
                # get_ids = ConcatSQL(
                #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
                # )
                get_ids = sql_query({
                    "from": "job",
                    "select": ["id"],
                    "where": {
                        "or": [
                            {
                                "gt": {
                                    "last_modified": Date(last_modified)
                                }
                            },
                            {
                                "and": [
                                    {
                                        "eq": {
                                            "last_modified":
                                            Date(last_modified)
                                        }
                                    },
                                    {
                                        "gt": {
                                            "id": job_id
                                        }
                                    },
                                ]
                            },
                        ]
                    },
                    "sort": ["last_modified", "id"],
                    "limit": settings.extractor.chunk_size,
                })
                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break

                # SOME LIMITS PLACES ON STRING SIZE
                for fl in jx.drill(acc, "job_log.failure_line"):
                    fl.message = strings.limit(fl.message, 10000)
                for r in acc:
                    r.etl.timestamp = Date.now()
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, job_id = last_doc.last_modified, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, job_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done job extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done job merge")
    def run(self, force=False, restart=False, merge=False):
        # SETUP LOGGING
        settings = startup.read_settings(filename=CONFIG_FILE)
        constants.set(settings.constants)
        Log.start(settings.debug)

        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name, kwargs=settings.destination
        ).get_or_create_table(settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis()
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key, value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, job_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                    last_modified=last_modified,
                    job_id=job_id,
                )

                # Example: job.id ==283890114
                # get_ids = ConcatSQL(
                #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
                # )
                # get_ids = sql_query(
                #     {
                #         "from": "job",
                #         "select": ["id"],
                #         "where": {
                #             "or": [
                #                 {"gt": {"last_modified": parse(last_modified)}},
                #                 {
                #                     "and": [
                #                         {"eq": {"last_modified": parse(last_modified)}},
                #                         {"gt": {"id": job_id}},
                #                     ]
                #                 },
                #             ]
                #         },
                #         "sort": ["last_modified", "id"],
                #         "limit": settings.extractor.chunk_size,
                #     }
                # )

                get_ids = SQL(str(
                    (
                        Job.objects.filter(
                            Q(last_modified__gt=parse(last_modified).datetime)
                            | (
                                Q(last_modified=parse(last_modified).datetime)
                                & Q(id__gt=job_id)
                            )
                        )
                        .annotate()
                        .values("id")
                        .order_by("last_modified", "id")[
                            : settings.extractor.chunk_size
                        ]
                    ).query
                ))

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, job_id = last_doc.last_modified, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, job_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done job extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done job merge")
Exemple #17
0
class MySqlSnowflakeExtractor(object):
    @override
    def __init__(self, kwargs=None):
        self.settings = kwargs
        excludes = listwrap(self.settings.exclude)
        self.settings.exclude = set(e for e in excludes
                                    if len(split_field(e)) == 1)
        self.settings.exclude_columns = set(p for e in excludes
                                            for p in [tuple(split_field(e))]
                                            if len(p) > 1)
        self.settings.exclude_path = list(
            map(split_field, listwrap(self.settings.exclude_path)))
        self.settings.show_foreign_keys = coalesce(
            self.settings.show_foreign_keys, True)
        self.name_relations = unwrap(coalesce(self.settings.name_relations,
                                              {}))

        self.all_nested_paths = None
        self.nested_path_to_join = None
        self.columns = None

        with Explanation("scan database", debug=DEBUG):
            self.db = MySQL(**kwargs.database)
            self.settings.database.schema = self.db.settings.schema
            with self.db.transaction():
                self._scan_database()
        if not self.settings.database.schema:
            Log.error("you must provide a `database.schema`")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        self.db.close()

    def get_sql(self, get_ids):
        sql = self._compose_sql(get_ids)

        # ORDERING
        sort = []
        ordering = []
        for ci, c in enumerate(self.columns):
            if c.sort:
                sort.append(quote_column(c.column_alias) + SQL_IS_NOT_NULL)
                sort.append(quote_column(c.column_alias))
                ordering.append(ci)

        union_all_sql = SQL_UNION_ALL.join(sql)
        union_all_sql = ConcatSQL(
            SQL_SELECT,
            SQL_STAR,
            SQL_FROM,
            sql_alias(sql_iso(union_all_sql), "a"),
            SQL_ORDERBY,
            sql_list(sort),
        )
        if DEBUG:
            Log.note("{{sql}}", sql=union_all_sql)

        return union_all_sql

    def path_not_allowed(self, path):
        return path != "." and any(
            path_in_path(e, p) for p in [split_field(path)]
            for e in self.settings.exclude_path)

    def _scan_database(self):
        # GET ALL RELATIONS
        raw_relations = self.db.query(
            """
            SELECT
                table_schema,
                table_name,
                referenced_table_schema,
                referenced_table_name,
                referenced_column_name,
                constraint_name,
                column_name,
                ordinal_position
            FROM
                information_schema.key_column_usage
            WHERE
                referenced_column_name IS NOT NULL
            """,
            param=self.settings.database,
        )

        if not raw_relations:
            Log.error("No relations in the database")

        for r in self.settings.add_relations:
            try:
                lhs, rhs = map(strings.trim, r.split("->"))
                lhs = lhs.split(".")
                if len(lhs) == 2:
                    lhs = [self.settings.database.schema] + lhs
                rhs = rhs.split(".")
                if len(rhs) == 2:
                    rhs = [self.settings.database.schema] + rhs
                to_add = Data(
                    ordinal_position=1,  # CAN ONLY HANDLE 1-COLUMN RELATIONS
                    table_schema=lhs[0],
                    table_name=lhs[1],
                    column_name=lhs[2],
                    referenced_table_schema=rhs[0],
                    referenced_table_name=rhs[1],
                    referenced_column_name=rhs[2],
                )

                # CHECK IF EXISTING
                if jx.filter(raw_relations, {"eq": to_add}):
                    Log.note("Relation {{relation}} already exists",
                             relation=r)
                    continue

                to_add.constraint_name = Random.hex(20)
                raw_relations.append(to_add)
            except Exception as e:
                Log.error("Could not parse {{line|quote}}", line=r, cause=e)

        relations = jx.select(
            raw_relations,
            [
                {
                    "name": "constraint.name",
                    "value": "constraint_name"
                },
                {
                    "name": "table.schema",
                    "value": "table_schema"
                },
                {
                    "name": "table.name",
                    "value": "table_name"
                },
                {
                    "name": "column.name",
                    "value": "column_name"
                },
                {
                    "name": "referenced.table.schema",
                    "value": "referenced_table_schema"
                },
                {
                    "name": "referenced.table.name",
                    "value": "referenced_table_name"
                },
                {
                    "name": "referenced.column.name",
                    "value": "referenced_column_name"
                },
                {
                    "name": "ordinal_position",
                    "value": "ordinal_position"
                },
            ],
        )

        # GET ALL TABLES
        raw_tables = self.db.query("""
            SELECT
                t.table_schema,
                t.table_name,
                c.constraint_name,
                c.constraint_type,
                k.column_name,
                k.ordinal_position
            FROM
                information_schema.tables t
            LEFT JOIN
                information_schema.table_constraints c on c.table_name=t.table_name AND c.table_schema=t.table_schema and (constraint_type='UNIQUE' or constraint_type='PRIMARY KEY')
            LEFT JOIN
                information_schema.key_column_usage k on k.constraint_name=c.constraint_name AND k.table_name=t.table_name and k.table_schema=t.table_schema
            ORDER BY
                t.table_schema,
                t.table_name,
                c.constraint_name,
                k.ordinal_position,
                k.column_name
            """)

        # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING
        tables = UniqueIndex(keys=["name", "schema"])
        for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]):
            c = wrap(list(c))
            best_index = Null
            is_referenced = False
            is_primary = False
            for g, w in jx.groupby(c, "constraint_name"):
                if not g.constraint_name:
                    continue
                w = list(w)
                ref = False
                for r in relations:
                    if (r.table.name == t.table_name
                            and r.table.schema == t.table_schema
                            and r.constraint.name == g.constraint_name):
                        ref = True
                is_prime = w[0].constraint_type == "PRIMARY"

                reasons_this_one_is_better = [
                    best_index == None,  # WE DO NOT HAVE A CANDIDATE YET
                    is_prime
                    and not is_primary,  # PRIMARY KEYS ARE GOOD TO HAVE
                    is_primary == is_prime and ref and
                    not is_referenced,  # REFERENCED UNIQUE TUPLES ARE GOOD TOO
                    is_primary == is_prime and ref == is_referenced and len(w)
                    < len(best_index),  # THE SHORTER THE TUPLE, THE BETTER
                ]
                if any(reasons_this_one_is_better):
                    is_primary = is_prime
                    is_referenced = ref
                    best_index = w

            tables.add({
                "name": t.table_name,
                "schema": t.table_schema,
                "id": [b.column_name for b in best_index],
            })

        fact_table = tables[self.settings.fact_table,
                            self.settings.database.schema]
        ids_table = {
            "alias": "t0",
            "name": "__ids__",
            "schema": fact_table.schema,
            "id": fact_table.id,
        }
        relations.extend(
            wrap({
                "constraint": {
                    "name": "__link_ids_to_fact_table__"
                },
                "table": ids_table,
                "column": {
                    "name": c
                },
                "referenced": {
                    "table": fact_table,
                    "column": {
                        "name": c
                    }
                },
                "ordinal_position": i,
            }) for i, c in enumerate(fact_table.id))
        tables.add(ids_table)

        # GET ALL COLUMNS
        raw_columns = self.db.query("""
            SELECT
                column_name,
                table_schema,
                table_name,
                ordinal_position,
                data_type
            FROM
                information_schema.columns
            """)

        reference_only_tables = [
            r.split(".")[0] for r in self.settings.reference_only
            if len(r.split(".")) == 2
        ]
        reference_all_tables = [
            r.split(".")[0] for r in self.settings.reference_only
            if len(r.split(".")) == 1
        ]
        foreign_column_table_schema_triples = {(r.column.name, r.table.name,
                                                r.table.schema)
                                               for r in relations}
        referenced_column_table_schema_triples = {(
            r.referenced.column.name,
            r.referenced.table.name,
            r.referenced.table.schema,
        )
                                                  for r in relations}
        related_column_table_schema_triples = (
            foreign_column_table_schema_triples
            | referenced_column_table_schema_triples)

        columns = UniqueIndex(["column.name", "table.name", "table.schema"])
        for c in raw_columns:
            if c.table_name in reference_only_tables:
                if c.table_name + "." + c.column_name in self.settings.reference_only:
                    include = True
                    reference = True
                    foreign = False
                elif c.column_name in tables[(c.table_name,
                                              c.table_schema)].id:
                    include = self.settings.show_foreign_keys
                    reference = False
                    foreign = False
                else:
                    include = False
                    reference = False
                    foreign = False
            elif c.table_name in reference_all_tables:
                # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED
                if c.column_name in tables[(c.table_name, c.table_schema)].id:
                    include = self.settings.show_foreign_keys
                    reference = True
                    foreign = False
                elif (
                        c.column_name,
                        c.table_name,
                        c.table_schema,
                ) in foreign_column_table_schema_triples:
                    include = False
                    reference = False
                    foreign = True
                else:
                    include = True
                    reference = False
                    foreign = False
            elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            elif (
                    c.column_name,
                    c.table_name,
                    c.table_schema,
            ) in foreign_column_table_schema_triples:
                include = False
                reference = False
                foreign = True
            elif (
                    c.column_name,
                    c.table_name,
                    c.table_schema,
            ) in referenced_column_table_schema_triples:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            else:
                include = True
                reference = False
                foreign = False

            rel = {
                "column": {
                    "name": c.column_name,
                    "type": c.data_type
                },
                "table": {
                    "name": c.table_name,
                    "schema": c.table_schema
                },
                "ordinal_position": c.ordinal_position,
                "is_id": c.column_name
                in tables[(c.table_name, c.table_schema)].id,
                "include": include,  # TRUE IF THIS COLUMN IS OUTPUTTED
                "reference":
                reference,  # TRUE IF THIS COLUMN REPRESENTS THE ROW
                "foreign":
                foreign,  # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW
            }
            columns.add(rel)

        # ITERATE OVER ALL PATHS
        todo = FlatList()
        output_columns = FlatList()
        nested_path_to_join = {}
        all_nested_paths = [["."]]

        def follow_paths(position, path, nested_path, done_relations,
                         no_nested_docs):
            if position.name in self.settings.exclude:
                return

            if self.path_not_allowed(path):
                return
            if DEBUG:
                Log.note("Trace {{path}}", path=path)
            if position.name != "__ids__":
                # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR WHEN IF IT FAILS)
                self.db.query(
                    ConcatSQL(
                        SQL_SELECT,
                        SQL_STAR,
                        SQL_FROM,
                        quote_column(position.schema, position.name),
                        SQL_LIMIT,
                        SQL_ONE,
                    ))

            if position.name in reference_all_tables:
                no_nested_docs = True
            if position.name in reference_only_tables:
                return
            curr_join_list = copy(nested_path_to_join[nested_path[0]])

            ###############################################################################
            # INNER OBJECTS
            ###############################################################################
            referenced_tables = list(
                sort_using_key(
                    jx.groupby(
                        jx.filter(
                            relations,
                            {
                                "eq": {
                                    "table.name": position.name,
                                    "table.schema": position.schema,
                                }
                            },
                        ),
                        "constraint.name",
                    ),
                    key=lambda p: first(p[1]).column.name,
                ))
            for g, constraint_columns in referenced_tables:
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                if any(cc for cc in constraint_columns
                       if cc.referenced.table.name in self.settings.exclude):
                    continue

                done_relations.add(g["constraint.name"])

                many_to_one_joins = nested_path_to_join[nested_path[0]]
                index = len(many_to_one_joins)

                alias = "t" + text(index)
                for c in constraint_columns:
                    c.referenced.table.alias = alias
                    c.table = position
                many_to_one_joins.append({
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path,
                })

                # HANDLE THE COMMON *id SUFFIX
                name = []
                for cname, tname in zip(
                        constraint_columns.column.name,
                        constraint_columns.referenced.table.name,
                ):
                    if cname.startswith(tname):
                        name.append(tname)
                    elif cname.endswith("_id"):
                        name.append(cname[:-3])
                    else:
                        name.append(cname)

                relation_string = many_to_one_string(constraint_columns[0])
                step = "/".join(name)
                if len(constraint_columns) == 1:
                    step = self.name_relations.get(relation_string, step)

                referenced_column_path = concat_field(path, step)
                if self.path_not_allowed(referenced_column_path):
                    continue

                if referenced_column_path in reference_only_tables:
                    continue

                col_pointer_name = relative_field(referenced_column_path,
                                                  nested_path[0])
                for col in columns:
                    if (col.table.name
                            == constraint_columns[0].referenced.table.name
                            and col.table.schema
                            == constraint_columns[0].referenced.table.schema):
                        col_full_name = concat_field(
                            col_pointer_name, literal_field(col.column.name))

                        if (col.is_id and len(nested_path) == 1
                                and col.table.name == fact_table.name
                                and col.table.schema == fact_table.schema):
                            # ALWAYS SHOW THE ID OF THE FACT
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                True,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name,
                            })
                        elif col.column.name == constraint_columns[
                                0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                                if self.settings.show_foreign_keys else None,
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                                if self.settings.show_foreign_keys else None,
                            })
                        elif col.reference:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_pointer_name
                                if not self.settings.show_foreign_keys else
                                col_full_name,  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                            })
                        elif col.include:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name,
                            })

                if position.name in reference_only_tables:
                    continue

                todo.append(
                    Data(
                        position=copy(constraint_columns[0].referenced.table),
                        path=referenced_column_path,
                        nested_path=nested_path,
                        done_relations=copy(done_relations),
                        no_nested_docs=no_nested_docs,
                    ))
            ###############################################################################
            # NESTED OBJECTS
            ###############################################################################
            if not no_nested_docs:
                nesting_tables = list(
                    sort_using_key(
                        jx.groupby(
                            jx.filter(
                                relations,
                                {
                                    "eq": {
                                        "referenced.table.name": position.name,
                                        "referenced.table.schema":
                                        position.schema,
                                    }
                                },
                            ),
                            "constraint.name",
                        ),
                        key=lambda p: [(r.table.name, r.column.name)
                                       for r in [first(p[1])]][0],
                    ))

                for g, constraint_columns in nesting_tables:
                    g = unwrap(g)
                    constraint_columns = deepcopy(constraint_columns)
                    if g["constraint.name"] in done_relations:
                        continue
                    done_relations.add(g["constraint.name"])

                    many_table = set(constraint_columns.table.name)
                    if not (many_table - self.settings.exclude):
                        continue

                    relation_string = one_to_many_string(constraint_columns[0])
                    step = "/".join(many_table)
                    if len(constraint_columns) == 1:
                        step = self.name_relations.get(relation_string, step)

                    referenced_column_path = concat_field(path, step)
                    if self.path_not_allowed(referenced_column_path):
                        continue

                    new_nested_path = [referenced_column_path] + nested_path
                    all_nested_paths.append(new_nested_path)

                    if referenced_column_path in nested_path_to_join:
                        Log.error(
                            "{{path}} already exists, try adding entry to name_relations",
                            path=referenced_column_path,
                        )
                    one_to_many_joins = nested_path_to_join[
                        referenced_column_path] = copy(curr_join_list)
                    index = len(one_to_many_joins)
                    alias = "t" + text(index)
                    for c in constraint_columns:
                        c.table.alias = alias
                        c.referenced.table = position
                    one_to_many_joins.append(
                        set_default(
                            {},
                            g,
                            {
                                "children": True,
                                "join_columns": constraint_columns,
                                "path": path,
                                "nested_path": nested_path,
                            },
                        ))
                    for col in columns:
                        if (col.table.name == constraint_columns[0].table.name
                                and col.table.schema
                                == constraint_columns[0].table.schema):
                            col_full_name = join_field(
                                split_field(referenced_column_path)
                                [len(split_field(new_nested_path[0])):] +
                                [literal_field(col.column.name)])

                            if col.column.name == constraint_columns[
                                    0].column.name:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if
                                    self.settings.show_foreign_keys else None,
                                })
                            elif col.is_id:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if
                                    self.settings.show_foreign_keys else None,
                                })
                            else:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if col.include else None,
                                })

                    todo.append(
                        Data(
                            position=constraint_columns[0].table,
                            path=referenced_column_path,
                            nested_path=new_nested_path,
                            done_relations=copy(done_relations),
                            no_nested_docs=no_nested_docs,
                        ))

        path = "."
        nested_path = [path]
        nested_path_to_join["."] = [{
            "path":
            path,
            "join_columns": [{
                "referenced": {
                    "table": ids_table
                }
            }],
            "nested_path":
            nested_path,
        }]

        todo.append(
            Data(
                position=ids_table,
                path=path,
                nested_path=nested_path,
                done_relations=set(),
                no_nested_docs=False,
            ))

        while todo:
            item = todo.pop(0)
            follow_paths(**item)

        self.all_nested_paths = all_nested_paths
        self.nested_path_to_join = nested_path_to_join
        self.columns = output_columns

    def _compose_sql(self, get_ids):
        """
        :param get_ids: SQL to get the ids, and used to select the documents returned
        :return:
        """
        if not isinstance(get_ids, SQL):
            Log.error("Expecting SQL to get some primary ids")

        sql = []
        for nested_path in self.all_nested_paths:
            # MAKE THE REQUIRED JOINS
            sql_joins = []

            for i, curr_join in enumerate(
                    self.nested_path_to_join[nested_path[0]]):
                curr_join = wrap(curr_join)
                rel = curr_join.join_columns[0]
                if i == 0:
                    sql_joins.append(
                        ConcatSQL(
                            SQL_FROM,
                            sql_alias(sql_iso(get_ids),
                                      rel.referenced.table.alias),
                        ))
                elif curr_join.children:
                    full_name = quote_column(rel.table.schema, rel.table.name)
                    sql_joins.append(
                        ConcatSQL(
                            SQL_INNER_JOIN,
                            sql_alias(full_name, rel.table.alias),
                            SQL_ON,
                            SQL_AND.join(
                                ConcatSQL(
                                    quote_column(rel.table.alias,
                                                 const_col.column.name),
                                    SQL_EQ,
                                    quote_column(
                                        rel.referenced.table.alias,
                                        const_col.referenced.column.name,
                                    ),
                                ) for const_col in curr_join.join_columns),
                        ))
                else:
                    full_name = quote_column(rel.referenced.table.schema,
                                             rel.referenced.table.name)
                    sql_joins.append(
                        ConcatSQL(
                            SQL_LEFT_JOIN,
                            sql_alias(full_name, rel.referenced.table.alias),
                            SQL_ON,
                            SQL_AND.join(
                                ConcatSQL(
                                    quote_column(
                                        rel.referenced.table.alias,
                                        const_col.referenced.column.name,
                                    ),
                                    SQL_EQ,
                                    quote_column(rel.table.alias,
                                                 const_col.column.name),
                                ) for const_col in curr_join.join_columns),
                        ))

            # ONLY SELECT WHAT WE NEED, NULL THE REST
            selects = []
            not_null_column_seen = False
            for c in self.columns:
                if (
                        c.column.table.name,
                        c.column.column.name,
                ) in self.settings.exclude_columns:
                    selects.append(sql_alias(SQL_NULL, c.column_alias))
                elif c.nested_path[0] == nested_path[0]:
                    s = sql_alias(
                        quote_column(c.table_alias, c.column.column.name),
                        c.column_alias,
                    )
                    selects.append(s)
                    not_null_column_seen = True
                elif startswith_field(nested_path[0], c.path):
                    # PARENT ID REFERENCES
                    if c.column.is_id:
                        s = sql_alias(
                            quote_column(c.table_alias, c.column.column.name),
                            c.column_alias,
                        )
                        selects.append(s)
                        not_null_column_seen = True
                    else:
                        selects.append(sql_alias(SQL_NULL, c.column_alias))
                else:
                    selects.append(sql_alias(SQL_NULL, c.column_alias))

            if not_null_column_seen:
                sql.append(SQL_SELECT + sql_list(selects) +
                           SQL("").join(sql_joins))

        return sql

    def construct_docs(self, cursor, append, please_stop):
        """
        :param cursor: ITERATOR OF RECORD TUPLES
        :param append: METHOD TO CALL WITH CONSTRUCTED DOCUMENT
        :return: (count, first, next, next_key)
        number of documents added
        the first document in the batch
        the first document of the next batch
        """
        null_values = set(self.settings.null_values) | {None}

        doc_count = 0

        columns = tuple(wrap(c) for c in self.columns)
        with Timer("Downloading from MySQL", verbose=DEBUG):
            curr_doc = Null
            row_count = 0
            if DEBUG:
                cursor = list(cursor)
                Log.note("{{data|json|limit(1000)}}", data=cursor)
            for row in cursor:
                row_count += 1
                if please_stop:
                    Log.error("Got `please_stop` signal")

                nested_path = []
                next_object = Data()

                for c, value in zip(columns, row):
                    # columns ARE IN ORDER, FROM FACT ['.'] TO EVER-DEEPER-NESTED
                    if value in null_values:
                        # EVERY COLUMN THAT'S NOT NEEDED IS None
                        continue
                    if len(nested_path) < len(c.nested_path):
                        # EACH COLUMN IS DEEPER THAN THE NEXT
                        # THESE WILL BE THE id COLUMNS, WHICH ARE ALWAYS INCLUDED AND BEFORE ALL OTHER VALUES
                        nested_path = unwrap(c.nested_path)
                        next_object = Data()
                    next_object[c.put] = value

                if len(nested_path) == 1:
                    # TOP LEVEL DOCUMENT, EMIT THE curr_doc AND ADVANCE
                    if curr_doc:
                        append(curr_doc["id"])
                        doc_count += 1
                    curr_doc = next_object
                    continue

                # LET'S PLACE next_object AT THE CORRECT NESTED LEVEL
                children = [curr_doc]
                for parent_path, path in jx.pairs(reversed(nested_path)):
                    relative_path = relative_field(path, parent_path)
                    try:
                        parent = children[-1]
                        children = unwrap(parent[relative_path])
                        if not children:
                            children = parent[relative_path] = []
                    except Exception as e:
                        Log.error(
                            "Document construction error: path={{path}}\nsteps={{steps}}\ndoc={{curr_doc}}\nnext={{next_object}}",
                            path=path,
                            steps=nested_path,
                            curr_doc=curr_doc,
                            next_object=next_object,
                            cause=e)

                children.append(next_object)

            # DEAL WITH LAST RECORD
            if curr_doc:
                append(curr_doc["id"])
                doc_count += 1

        Log.note(
            "{{doc_count}} documents ({{row_count}} db records)",
            doc_count=doc_count,
            row_count=row_count,
        )
Exemple #18
0
    def run(self, force=False, restart=False, merge=False):
        # SETUP LOGGING
        settings = startup.read_settings(filename=CONFIG_FILE)
        constants.set(settings.constants)
        Log.start(settings.debug)

        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name, kwargs=settings.destination
        ).get_or_create_table(settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis()
            state = redis.get(settings.extractor.key)

            if restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key, value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, alert_id = state
            last_modified = parse(last_modified)

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                    last_modified=last_modified,
                    alert_id=alert_id,
                )
                last_year = (
                    Date.today() - YEAR + DAY
                )  # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY

                # SELECT
                #     s.od
                # FROM
                #     treeherder.performance_alert_summary s
                # LEFT JOIN
                #     treeherder.performance_alert a ON s.id=a.summary_id
                # WHERE
                #     s.created>{last_year} AND (s.last_updated>{last_modified} OR a.last_updated>{last_modified})
                # GROUP BY
                #     s.id
                # ORDER BY
                #     s.id
                # LIMIT
                #     {settings.extractor.chunk_size}
                get_ids = SQL(
                    str(
                        (
                            PerformanceAlertSummary.objects.filter(
                                Q(created__gt=last_year.datetime)
                                & (
                                    Q(last_updated__gt=last_modified.datetime)
                                    | Q(alerts__last_updated__gt=last_modified.datetime)
                                )
                            )
                            .annotate()
                            .values("id")
                            .order_by("id")[: settings.extractor.chunk_size]
                        ).query
                    )
                )

                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, alert_id = last_doc.created, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, alert_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done alert extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done alert merge")
        Log.stop()