コード例 #1
0
ファイル: bigquery.py プロジェクト: mozilla/cia-tasks
def _extract_primary_shard_name(view_sql):
    """
    Pull the name of the primary shard out of the SQL that defines a view.

    The text following the first "from " (found case-insensitively) is
    treated as the table reference; its last dot-separated segment, with
    surrounding backticks removed, is returned wrapped in an ApiName.

    :param view_sql: SQL text defining the view
    :return: ApiName built from the last segment of the referenced table
    """
    # TODO: USE REAL PARSER
    keyword = "from "
    start = view_sql.lower().find(keyword) + len(keyword)
    reference = view_sql[start:].strip()
    shard = reference.rpartition(".")[2]
    return ApiName(shard.strip("`"))
コード例 #2
0
ファイル: bigquery.py プロジェクト: mozilla/cia-tasks
    def merge_shards(self):
        """
        Copy the rows of every shard into one primary shard, delete the
        shards that were copied, and re-create the view over the primary.

        The shard the view currently points at is kept as the destination
        only when its schema already equals the merged schema; otherwise a
        new table with a random suffix is created to receive the rows.
        """
        container = self.container
        shards = []
        listing = list(container.client.list_tables(container.dataset))
        current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
        primary_shard_name = None  # PRIMARY SHARD
        api_name = escape_name(self.short_name)
        prefix = text(api_name)

        # CLASSIFY EVERY TABLE IN THE DATASET: THE VIEW, A SHARD, OR UNRELATED
        for item in listing:
            ref = item.reference
            candidate = ApiName(ref.table_id)
            candidate_text = text(candidate)
            if not candidate_text.startswith(prefix):
                continue

            if candidate == api_name:
                if item.table_type != "VIEW":
                    Log.error("expecting {{table}} to be a view",
                              table=api_name)
                current_view = container.client.get_table(ref)
                primary_shard_name = _extract_primary_shard_name(
                    current_view.view_query)
                continue

            if not SUFFIX_PATTERN.match(candidate_text[len(prefix):]):
                continue

            try:
                shards.append(container.client.get_table(ref))
            except Exception as cause:
                Log.warning("could not merge table {{table}}",
                            table=ref,
                            cause=cause)

        if not current_view:
            Log.error("expecting {{table}} to be a view pointing to a table",
                      table=api_name)

        # ONE SNOWFLAKE PER SHARD, THEN THE SCHEMA THAT COVERS THEM ALL
        shard_flakes = []
        for shard in shards:
            shard_flakes.append(
                Snowflake.parse(
                    big_query_schema=shard.schema,
                    es_index=text(container.full_name +
                                  ApiName(shard.table_id)),
                    top_level_fields=self.top_level_fields,
                    partition=self.partition,
                ))
        total_flake = snowflakes.merge(
            shard_flakes,
            es_index=text(self.full_name),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )

        # LOOK FOR THE CURRENT PRIMARY SHARD, USABLE ONLY IF ITS SCHEMA
        # ALREADY EQUALS THE MERGED SCHEMA
        reusable = None
        for position, shard in enumerate(shards):
            if (ApiName(shard.table_id) == primary_shard_name
                    and total_flake == shard_flakes[position]):
                reusable = position
                break

        if reusable is not None:
            # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
            del shards[reusable]
            del shard_flakes[reusable]
        else:
            name = self.short_name + "_" + "".join(Random.sample(ALLOWED, 20))
            primary_shard_name = escape_name(name)
            container.create_table(
                table=name,
                schema=total_flake.schema,
                sharded=False,
                read_only=False,
                kwargs=self.config,
            )

        primary_full_name = container.full_name + primary_shard_name

        # ONE RESHAPING SELECT PER REMAINING SHARD
        selects = [
            ConcatSQL(
                SQL_SELECT,
                JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR),
                        gen_select(total_flake, shard_flake)),
                SQL_FROM,
                quote_column(ApiName(source.dataset_id, source.table_id)),
            ) for shard_flake, source in zip(shard_flakes, shards)
        ]

        Log.note("inserting into table {{table}}",
                 table=text(primary_shard_name))

        # SPLIT THE SHARDS BY WHETHER THEIR SCHEMA EQUALS THE MERGED SCHEMA
        matched = []
        unmatched = []
        for triple in zip(selects, shards, shard_flakes):
            bucket = matched if triple[2] == total_flake else unmatched
            bucket.append(triple)

        # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
        if matched:
            for _group, merge_chunk in jx.chunk(matched, MAX_MERGE):
                command = ConcatSQL(
                    SQL_INSERT,
                    quote_column(primary_full_name),
                    JoinSQL(
                        SQL_UNION_ALL,
                        (sql_query({
                            "from":
                            container.full_name + ApiName(member.table_id)
                        }) for _, member, _ in merge_chunk),
                    ),
                )
                if DEBUG:
                    Log.note("{{sql}}", sql=text(command))
                job = container.query_and_wait(command)
                Log.note("job {{id}} state = {{state}}",
                         id=job.job_id,
                         state=job.state)

                if job.errors:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )
                # THE ROWS ARE IN THE PRIMARY NOW; DROP THE SOURCE SHARDS
                for _, member, _ in merge_chunk:
                    container.client.delete_table(member)

        # ALL OTHER SCHEMAS MISMATCH
        for select, shard, _ in unmatched:
            try:
                command = ConcatSQL(SQL_INSERT,
                                    quote_column(primary_full_name), select)
                if DEBUG:
                    Log.note("{{sql}}", sql=text(command))
                job = container.query_and_wait(command)
                Log.note(
                    "from {{shard}}, job {{id}}, state {{state}}",
                    id=job.job_id,
                    shard=shard.table_id,
                    state=job.state,
                )

                # A SHARD WITHOUT A SCHEMA HAS NOTHING TO COPY; ANY OTHER
                # ERROR IS A REAL FAILURE
                if job.errors and not all(
                        " does not have a schema." in m
                        for m in wrap(job.errors).message):
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )

                container.client.delete_table(shard)
            except Exception as cause:
                Log.warning("failure to merge {{shard}}",
                            shard=shard,
                            cause=cause)

        # REMOVE OLD VIEW
        view_full_name = container.full_name + api_name
        if current_view:
            container.client.delete_table(current_view)

        # CREATE NEW VIEW
        container.create_view(view_full_name, primary_full_name)
コード例 #3
0
ファイル: bigquery.py プロジェクト: mozilla/cia-tasks
    def __init__(
        self,
        table,
        typed,
        read_only,
        sharded,
        container,
        id=Null,
        partition=Null,
        cluster=Null,
        top_level_fields=Null,
        kwargs=None,
    ):
        """
        Bind this object to an existing table, or sharded view, in `container`.

        :param table: short (unescaped) table name
        :param typed: stored on the instance and in self.config
        :param read_only: when falsy, a non-sharded target must not be a view
        :param sharded: when truthy, `table` must name a view; the table that
                        view selects from becomes self.shard
        :param container: supplies `full_name`, `client` and `create_view()`
        :param id: stored on the instance and in self.config
        :param partition: stored, and passed to Snowflake.parse
        :param cluster: stored on the instance and in self.config
        :param top_level_fields: stored, and passed to Snowflake.parse
        :param kwargs: not read by this method
        """
        self.short_name = table
        self.typed = typed
        self.read_only = read_only
        self.cluster = cluster
        self.id = id
        self.top_level_fields = top_level_fields
        self.config = Data(  # USED TO REPLICATE THIS
            typed=typed,
            read_only=read_only,
            sharded=sharded,
            id=id,
            partition=partition,
            cluster=cluster,
            top_level_fields=top_level_fields,
        )

        esc_name = escape_name(table)
        self.full_name = container.full_name + esc_name
        self.alias_view = alias_view = container.client.get_table(
            text(self.full_name))
        self.partition = partition
        self.container = container

        if not sharded:
            if not read_only and alias_view.table_type == "VIEW":
                Log.error("Expecting a table, not a view")
            self.shard = alias_view
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
        else:
            if alias_view.table_type != "VIEW":
                Log.error("Sharded tables require a view")

            # alias_view IS ALREADY THE VIEW; NO NEED TO FETCH IT A SECOND TIME
            shard_name = _extract_primary_shard_name(alias_view.view_query)

            # THE SCHEMA COMES FROM THE VIEW, NOT THE SHARD, SO PARSE IT ONCE
            # UP FRONT; A PARSE FAILURE IS THEN NOT REPORTED AS AN INVALID VIEW
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
            try:
                self.shard = container.client.get_table(
                    text(container.full_name + shard_name))
            except Exception as e:
                Log.warning("view {{name}} is invalid",
                            name=shard_name,
                            cause=e)
                # REMOVE STALE VIEW
                container.client.delete_table(alias_view)

                # MAKE NEW VIEW POINTING TO NEW SHARD
                # (_create_new_shard() IS EXPECTED TO ASSIGN self.shard)
                self._create_new_shard()
                container.create_view(
                    self.full_name,
                    self.container.full_name + ApiName(self.shard.table_id),
                )

        self.last_extend = Date.now() - EXTEND_LIMIT
コード例 #4
0
def create_dataset(project_id, dataset, client, location="US"):
    """
    Create a dataset in the given project.

    :param project_id: name of the project that will own the dataset
    :param dataset: dataset name; passed through escape_name()
    :param client: client whose create_dataset() performs the request
    :param location: value assigned to the dataset's `location`; defaults to
                     "US", which was previously hard-coded
    :return: whatever client.create_dataset() returns
    """
    full_name = ApiName(project_id) + escape_name(dataset)

    _dataset = bigquery.Dataset(text(full_name))
    _dataset.location = location
    return client.create_dataset(_dataset)
コード例 #5
0
    def _gen_select(source_path, source_tops, source_flake, total_path,
                    total_tops, total_flake):
        """
        Recursively build the SELECT expressions that reshape data laid out
        as `source_flake` into the layout described by `total_flake`.

        :param source_path: path prefix used when reading source columns
        :param source_tops: top-level-field info for the source at this depth;
                            either text or something indexable by property name
        :param source_flake: source schema at this depth (read with .get())
        :param total_path: path prefix for the destination at this depth
        :param total_tops: top-level-field info for the destination, same
                           shape as `source_tops`
        :param total_flake: destination schema at this depth
        :return: list of SQL expressions, one per selected column
        """
        # IDENTICAL SCHEMAS AND NO DESTINATION TOP-LEVEL FIELDS:
        # SELECT THE SOURCE COLUMNS DIRECTLY, IN SORTED KEY ORDER
        if total_flake == source_flake and not total_tops:
            return [
                quote_column(source_path + escape_name(k))
                for k in jx.sort(total_flake.keys())
            ]

        if NESTED_TYPE in total_flake:
            # PROMOTE EVERYTHING TO REPEATED
            v = source_flake.get(NESTED_TYPE)
            t = total_flake.get(NESTED_TYPE)

            if not v:
                # CONVERT INNER OBJECT TO ARRAY OF ONE STRUCT
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(
                            ConcatSQL(SQL_COMMA, SQL_CR),
                            _gen_select(
                                source_path,
                                Null,
                                source_flake,
                                total_path + REPEATED,
                                Null,
                                t,
                            ),
                        ),
                    )
                ]
            else:
                # SOURCE IS ALREADY REPEATED: UNNEST IT AND REBUILD EACH ROW;
                # THE ALIASES ARE SUFFIXED WITH THE LENGTH OF source_path.values
                row_name = "row" + text(len(source_path.values))
                ord_name = "ordering" + text(len(source_path.values))
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(
                            ConcatSQL(SQL_COMMA, SQL_CR),
                            _gen_select(ApiName(row_name), Null, v,
                                        ApiName(row_name), Null, t),
                        ),
                        SQL_FROM,
                        sql_call("UNNEST",
                                 quote_column(source_path + REPEATED)),
                        SQL_AS,
                        SQL(row_name),
                        SQL(" WITH OFFSET AS "),
                        SQL(ord_name),
                        SQL_ORDERBY,
                        SQL(ord_name),
                    )
                ]

            # A NESTED DESTINATION ALWAYS YIELDS EXACTLY ONE ARRAY COLUMN
            return [sql_alias(sql_call("ARRAY", *inner), REPEATED)]

        selection = []
        # NOTE(review): the second argument to jx.sort presumably selects the
        # key (item 0) as the sort field - confirm against jx.sort
        for k, t in jx.sort(total_flake.items(), 0):
            # NARROW THE TOP-LEVEL-FIELD INFO TO THIS PROPERTY, UNLESS IT IS
            # ALREADY TEXT
            k_total_tops = total_tops if is_text(total_tops) else total_tops[k]
            k_tops = source_tops if is_text(source_tops) else source_tops[k]
            v = source_flake.get(k)
            if is_text(k_total_tops):
                # DO NOT INCLUDE TOP_LEVEL_FIELDS
                pass
            elif t == v and not k_total_tops and not k_tops:
                # SAME TYPE ON BOTH SIDES, NO TOP-LEVEL FIELDS: COPY AS-IS
                selection.append(
                    ConcatSQL(
                        quote_column(source_path + escape_name(k)),
                        SQL_AS,
                        quote_column(escape_name(k)),
                    ))
            elif is_data(t):
                # DESTINATION IS AN OBJECT: RECURSE, THEN WRAP AS A STRUCT
                if not v:
                    # SOURCE LACKS THIS PROPERTY; RECURSE WITH AN EMPTY SCHEMA
                    selects = _gen_select(
                        source_path + escape_name(k),
                        source_tops,
                        {},
                        total_path + escape_name(k),
                        k_total_tops,
                        t,
                    )
                elif is_data(v):
                    selects = _gen_select(
                        source_path + escape_name(k),
                        source_tops,
                        v,
                        total_path + escape_name(k),
                        k_total_tops,
                        t,
                    )
                else:
                    raise Log.error(
                        "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                        field=join_field(source_path + escape_name(k)),
                        type=v,
                        main=t,
                    )
                # AN EMPTY SELECT LIST ADDS NO COLUMN FOR THIS PROPERTY
                if selects:
                    inner = [
                        ConcatSQL(
                            SQL_SELECT_AS_STRUCT,
                            JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), selects),
                        )
                    ]
                    selection.append(
                        sql_alias(sql_call("", *inner), escape_name(k)))
            elif is_text(t):
                if is_text(k_tops):
                    # THE SOURCE HAS THIS PROPERTY AS A TOP_LEVEL_FIELD
                    selection.append(
                        ConcatSQL(SQL(k_tops), SQL_AS,
                                  quote_column(escape_name(k))))
                elif v == t:
                    # NOTE(review): this reads from total_path while the other
                    # copy branch reads from source_path - confirm intended
                    selection.append(
                        ConcatSQL(
                            quote_column(total_path + escape_name(k)),
                            SQL_AS,
                            quote_column(escape_name(k)),
                        ))
                else:
                    # TYPE DIFFERS OR IS ABSENT IN THE SOURCE: EMIT A TYPED
                    # NULL; A PRESENT-BUT-DIFFERENT TYPE IS LOGGED ONLY
                    if v:
                        Log.note(
                            "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                            field=join_field(source_path + escape_name(k)),
                            type=v,
                            main=t,
                        )
                    selection.append(
                        ConcatSQL(
                            sql_call(
                                "CAST",
                                ConcatSQL(SQL_NULL, SQL_AS,
                                          SQL(json_type_to_bq_type[t])),
                            ),
                            SQL_AS,
                            quote_column(escape_name(k)),
                        ))
            else:
                Log.error("not expected")
        return selection
コード例 #6
0
def gen_select(total_flake, flake):
    """
    Build the SELECT expressions that reshape rows stored under `flake`
    into the layout described by `total_flake`.

    :param total_flake: destination snowflake; its `schema` and
                        `top_level_fields` are read
    :param flake: source snowflake; its `schema`, `top_level_fields` and
                  `leaves()` are read
    :return: list of SQL expressions: the top-level fields first, followed
             by the columns derived from the schema
    """
    def _gen_select(path, column_prefix, dest_tops, dest, src_tops, src):
        """
        Recursive worker.

        :param path: list of property names leading to this depth
        :param column_prefix: path prefix used when quoting source columns
        :param dest_tops: destination top-level-field info at this depth
        :param dest: destination schema at this depth
        :param src_tops: source top-level-field info at this depth
        :param src: source schema at this depth
        :return: list of SQL expressions
        """
        separator_free_struct = SQL_SELECT_AS_STRUCT

        if dest == src and dest_tops == src_tops:
            if path:
                Log.error("should not happen")
            else:
                # TOP LEVEL FIELDS
                columns = []
                for name in dest.keys():
                    if is_text(dest_tops[name]):
                        continue
                    columns.append(quote_column(escape_name(name)))
                return columns

        if NESTED_TYPE in dest:
            k = NESTED_TYPE
            # PROMOTE EVERYTHING TO REPEATED
            v = src.get(k)
            t = dest.get(k)

            if v:
                # SOURCE IS ALREADY REPEATED: UNNEST AND REBUILD EACH ROW
                depth = text(len(path))
                row_name = "row" + depth
                ord_name = "ordering" + text(len(path))
                struct_query = ConcatSQL(
                    separator_free_struct,
                    JoinSQL(
                        ConcatSQL(SQL_COMMA, SQL_CR),
                        _gen_select([row_name], ApiName(row_name), Null, t,
                                    Null, v),
                    ),
                    SQL_FROM,
                    sql_call("UNNEST",
                             quote_column(column_prefix + escape_name(k))),
                    SQL_AS,
                    SQL(row_name),
                    SQL(" WITH OFFSET AS "),
                    SQL(ord_name),
                    SQL_ORDERBY,
                    SQL(ord_name),
                )
            else:
                # CONVERT INNER OBJECT TO ARRAY OF ONE STRUCT
                struct_query = ConcatSQL(
                    separator_free_struct,
                    JoinSQL(
                        ConcatSQL(SQL_COMMA, SQL_CR),
                        _gen_select(path, column_prefix + REPEATED, Null, t,
                                    Null, src),
                    ),
                )

            return [sql_alias(sql_call("ARRAY", struct_query), escape_name(k))]

        selection = []
        for k, t in jx.sort(dest.items(), 0):
            k_total_tops = dest_tops if is_text(dest_tops) else dest_tops[k]
            k_tops = src_tops if is_text(src_tops) else src_tops[k]
            v = src.get(k)

            if is_text(k_total_tops):
                # DO NOT INCLUDE TOP_LEVEL_FIELDS
                continue

            if t == v and k_total_tops == k_tops:
                # NOTHING DIFFERS FOR THIS PROPERTY: COPY THE COLUMN
                selection.append(
                    ConcatSQL(
                        quote_column(column_prefix + escape_name(k)),
                        SQL_AS,
                        quote_column(escape_name(k)),
                    ))
                continue

            if is_data(t):
                # DESTINATION IS AN OBJECT: PICK THE SOURCE SCHEMA TO RECURSE
                # WITH, EMPTY WHEN THE SOURCE LACKS THE PROPERTY
                if not v:
                    child = {}
                elif is_data(v):
                    child = v
                else:
                    raise Log.error(
                        "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                        field=join_field(path + [k]),
                        type=v,
                        main=t,
                    )
                selects = _gen_select(
                    path + [k],
                    column_prefix + escape_name(k),
                    k_total_tops,
                    t,
                    src_tops,
                    child,
                )
                struct_query = ConcatSQL(
                    SQL_SELECT_AS_STRUCT,
                    JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), selects),
                )
                selection.append(
                    sql_alias(sql_call("", struct_query), escape_name(k)))
                continue

            if not is_text(t):
                Log.error("not expected")
                continue

            if is_text(k_tops):
                # THE SOURCE HAS THIS PROPERTY AS A TOP_LEVEL_FIELD
                selection.append(
                    ConcatSQL(SQL(k_tops), SQL_AS,
                              quote_column(escape_name(k))))
            elif v == t:
                selection.append(
                    ConcatSQL(
                        quote_column(column_prefix + escape_name(k)),
                        SQL_AS,
                        quote_column(escape_name(k)),
                    ))
            else:
                # TYPE DIFFERS OR IS ABSENT IN THE SOURCE: EMIT A TYPED NULL
                if v:
                    Log.note(
                        "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                        field=join_field(path + [k]),
                        type=v,
                        main=t,
                    )
                typed_null = sql_call(
                    "CAST",
                    ConcatSQL(SQL_NULL, SQL_AS, SQL(json_type_to_bq_type[t])),
                )
                selection.append(
                    ConcatSQL(
                        typed_null,
                        SQL_AS,
                        quote_column(escape_name(k)),
                    ))
        return selection

    output = _gen_select(
        [],
        ApiName(),
        total_flake.top_level_fields,
        total_flake.schema,
        flake.top_level_fields,
        flake.schema,
    )

    tops = []
    for path, name in total_flake.top_level_fields.leaves():
        existing = flake.top_level_fields[path]
        if existing:
            # ALREADY TOP LEVEL FIELD
            expression = SQL(existing)
        else:
            # PULL OUT TOP LEVEL FIELD
            column = first(flake.leaves(path))
            expression = SQL(column.es_column)

        tops.append(ConcatSQL(expression, SQL_AS, quote_column(ApiName(name))))

    return tops + output