Example #1
    def condense(self):
        """
        REMOVE DUPLICATE RECORDS: FOR EACH id, KEEP ONLY THE ROW WITH THE LATEST version
        :return:
        """
        # MAKE NEW SHARD
        partition = JoinSQL(
            SQL_COMMA,
            [
                quote_column(c.es_field) for f in listwrap(self.id.field)
                for c in self.flake.leaves(f)
            ],
        )
        order_by = JoinSQL(
            SQL_COMMA,
            [
                ConcatSQL(quote_column(c.es_field), SQL_DESC)
                for f in listwrap(self.id.version)
                for c in self.flake.leaves(f)
            ],
        )
        # WRAP WITH etl.timestamp BEST SELECTION

        self.container.query_and_wait(
            ConcatSQL(
                SQL(  # SOME KEYWORDS: ROWNUM RANK
                    "SELECT * EXCEPT (_rank) FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY "
                ),
                partition,
                SQL_ORDERBY,
                order_by,
                SQL(") AS _rank FROM "),
                quote_column(self.full_name),
                SQL(") a WHERE _rank=1"),
            ))
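
For orientation, a minimal sketch of the statement condense() assembles: a ROW_NUMBER() window partitioned by the id columns and ordered by the version columns descending, keeping only rank 1. The dataset, table, and column names below are hypothetical.

# SKETCH ONLY: doc_id, etl_timestamp, AND my_dataset.my_table ARE HYPOTHETICAL NAMES
DEDUP_SQL = """
SELECT * EXCEPT (_rank)
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY `doc_id` ORDER BY `etl_timestamp` DESC) AS _rank
    FROM `my_dataset.my_table`
) a
WHERE _rank = 1
"""
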
Example #2
    def to_bq(self, schema, not_null=False, boolean=False, many=True):
        """
        RETURN {"name", "sql", "nested_path"} RECORDS DESCRIBING THE BQ COLUMNS FOR THIS VARIABLE
        """
        var_name = self.var
        if var_name == GUID:
            return wrap(
                [{"name": ".", "sql": {"s": quote_column(ApiName(GUID))}, "nested_path": ROOT_PATH}]
            )
        cols = schema.leaves(var_name)
        if not cols:
            # DOES NOT EXIST
            return wrap(
                [{"name": ".", "sql": {"0": SQL_NULL}, "nested_path": ROOT_PATH}]
            )
        acc = {}
        if boolean:
            for col in cols:
                cname = relative_field(col.name, var_name)
                nested_path = col.nested_path[0]
                if col.type == OBJECT:
                    value = SQL_TRUE
                elif col.type == BOOLEAN:
                    value = quote_column(col.es_column)
                else:
                    value = quote_column(col.es_column) + SQL_IS_NOT_NULL
                tempa = acc.setdefault(nested_path, {})
                tempb = tempa.setdefault(get_property_name(cname), {})
                tempb["b"] = value
        else:
            for col in cols:
                cname = relative_field(col.name, var_name)
                if col.jx_type == OBJECT:
                    prefix = self.var + "."
                    for cn, cs in schema.items():
                        if cn.startswith(prefix):
                            for child_col in cs:
                                tempa = acc.setdefault(child_col.nested_path[0], {})
                                tempb = tempa.setdefault(get_property_name(cname), {})
                                tempb[json_type_to_bq_type[col.type]] = quote_column(
                                    child_col.es_column
                                )
                else:
                    nested_path = col.nested_path[0]
                    tempa = acc.setdefault(nested_path, {})
                    tempb = tempa.setdefault(get_property_name(cname), {})
                    tempb[json_type_to_bq_type[col.jx_type]] = quote_column(
                        col.es_column
                    )

        return wrap(
            [
                {"name": cname, "sql": types, "nested_path": nested_path}
                for nested_path, pairs in acc.items()
                for cname, types in pairs.items()
            ]
        )
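
As a hedged illustration of the return shape (the column name and the exact value of ROOT_PATH are assumptions), each record pairs a relative property name with a dict that maps a one-letter BigQuery type code ("s", "b", "0", ...) to the quoted column holding that type:

# HYPOTHETICAL SHAPE ONLY; ASSUMES ROOT_PATH == ["."] AND A SINGLE STRING LEAF `status__s`
example_output = [
    {"name": ".", "sql": {"s": "`status__s`"}, "nested_path": ["."]},
]
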
Example #3
 def create_view(self, view_api_name, shard_api_name):
     job = self.query_and_wait(
         ConcatSQL(
             SQL("CREATE VIEW\n"),
             quote_column(view_api_name),
             SQL_AS,
             sql_query({"from": shard_api_name}),
         ))
Example #4
 def to_bq(self, schema, not_null=False, boolean=False, many=True):
     var_name = self.var
     if var_name == GUID:
         return BQLScript(data_type=STRING,
                          expr=quote_column(escape_name(GUID)),
                          frum=self,
                          miss=FALSE,
                          many=False,
                          schema=schema)
     cols = schema.leaves(var_name)
     if not cols:
         # DOES NOT EXIST
         return BQLScript(data_type=OBJECT,
                          expr=SQL_NULL,
                          frum=self,
                          miss=TRUE,
                          many=False,
                          schema=schema)
     elif len(cols) == 1:
         col = first(cols)
         return BQLScript(data_type=col.jx_type,
                          expr=quote_column(
                              ApiName(*split_field(col.es_column))),
                          frum=self,
                          miss=MissingOp(self),
                          many=False,
                          schema=schema)
     else:
         coalesce = []
         for col in cols:
             rel_path = untype_path(relative_field(col.name, var_name))
             if rel_path == '.':
                 coalesce.append(Variable(col.name))
             else:
                 Log.error("structure not supported")
         return CoalesceOp(coalesce).to_bq(schema)
Example #5
 def create_view(self, view_api_name, shard_api_name):
     sql = ConcatSQL(
         SQL("CREATE VIEW\n"),
         quote_column(view_api_name),
         SQL_AS,
         sql_query({"from": shard_api_name}),
     )
     job = self.query_and_wait(sql)
     if job.errors:
         Log.error(
             "Can not create view\n{{sql}}\n{{errors|json|indent}}",
             sql=sql,
             errors=job.errors,
         )
     pass
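
Roughly, and with hypothetical names, the statement sent to query_and_wait() looks like the following (the exact SELECT text emitted by sql_query() is an assumption):

# HYPOTHETICAL ILLUSTRATION ONLY
CREATE_VIEW_SQL = """
CREATE VIEW
`my_dataset.my_view`
AS
SELECT * FROM `my_dataset.my_view_shard`
"""
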
Example #6
    def _gen_select(source_path, source_tops, source_flake, total_path,
                    total_tops, total_flake):
        if total_flake == source_flake and not total_tops:
            return [
                quote_column(source_path + escape_name(k))
                for k in jx.sort(total_flake.keys())
            ]

        if NESTED_TYPE in total_flake:
            # PROMOTE EVERYTHING TO REPEATED
            v = source_flake.get(NESTED_TYPE)
            t = total_flake.get(NESTED_TYPE)

            if not v:
                # CONVERT INNER OBJECT TO ARRAY OF ONE STRUCT
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(
                            ConcatSQL(SQL_COMMA, SQL_CR),
                            _gen_select(
                                source_path,
                                Null,
                                source_flake,
                                total_path + REPEATED,
                                Null,
                                t,
                            ),
                        ),
                    )
                ]
            else:
                row_name = "row" + text(len(source_path.values))
                ord_name = "ordering" + text(len(source_path.values))
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(
                            ConcatSQL(SQL_COMMA, SQL_CR),
                            _gen_select(ApiName(row_name), Null, v,
                                        ApiName(row_name), Null, t),
                        ),
                        SQL_FROM,
                        sql_call("UNNEST",
                                 quote_column(source_path + REPEATED)),
                        SQL_AS,
                        SQL(row_name),
                        SQL(" WITH OFFSET AS "),
                        SQL(ord_name),
                        SQL_ORDERBY,
                        SQL(ord_name),
                    )
                ]

            return [sql_alias(sql_call("ARRAY", *inner), REPEATED)]

        selection = []
        for k, t in jx.sort(total_flake.items(), 0):
            k_total_tops = total_tops if is_text(total_tops) else total_tops[k]
            k_tops = source_tops if is_text(source_tops) else source_tops[k]
            v = source_flake.get(k)
            if is_text(k_total_tops):
                # DO NOT INCLUDE TOP_LEVEL_FIELDS
                pass
            elif t == v and not k_total_tops and not k_tops:
                selection.append(
                    ConcatSQL(
                        quote_column(source_path + escape_name(k)),
                        SQL_AS,
                        quote_column(escape_name(k)),
                    ))
            elif is_data(t):
                if not v:
                    selects = _gen_select(
                        source_path + escape_name(k),
                        source_tops,
                        {},
                        total_path + escape_name(k),
                        k_total_tops,
                        t,
                    )
                elif is_data(v):
                    selects = _gen_select(
                        source_path + escape_name(k),
                        source_tops,
                        v,
                        total_path + escape_name(k),
                        k_total_tops,
                        t,
                    )
                else:
                    raise Log.error(
                        "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                        field=join_field(source_path + escape_name(k)),
                        type=v,
                        main=t,
                    )
                if selects:
                    inner = [
                        ConcatSQL(
                            SQL_SELECT_AS_STRUCT,
                            JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), selects),
                        )
                    ]
                    selection.append(
                        sql_alias(sql_call("", *inner), escape_name(k)))
            elif is_text(t):
                if is_text(k_tops):
                    # THE SOURCE HAS THIS PROPERTY AS A TOP_LEVEL_FIELD
                    selection.append(
                        ConcatSQL(SQL(k_tops), SQL_AS,
                                  quote_column(escape_name(k))))
                elif v == t:
                    selection.append(
                        ConcatSQL(
                            quote_column(total_path + escape_name(k)),
                            SQL_AS,
                            quote_column(escape_name(k)),
                        ))
                else:
                    if v:
                        Log.note(
                            "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                            field=join_field(source_path + escape_name(k)),
                            type=v,
                            main=t,
                        )
                    selection.append(
                        ConcatSQL(
                            sql_call(
                                "CAST",
                                ConcatSQL(SQL_NULL, SQL_AS,
                                          SQL(json_type_to_bq_type[t])),
                            ),
                            SQL_AS,
                            quote_column(escape_name(k)),
                        ))
            else:
                Log.error("not expected")
        return selection
Example #7
    def merge_shards(self):
        """
        MERGE ALL SHARDS INTO A SINGLE PRIMARY SHARD, THEN POINT THE VIEW AT IT
        """
        shards = []
        tables = list(self.container.client.list_tables(
            self.container.dataset))
        current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
        primary_shard_name = None  # PRIMARY SHARD
        api_name = escape_name(self.short_name)

        for table_item in tables:
            table = table_item.reference
            table_api_name = ApiName(table.table_id)
            if text(table_api_name).startswith(text(api_name)):
                if table_api_name == api_name:
                    if table_item.table_type != "VIEW":
                        Log.error("expecting {{table}} to be a view",
                                  table=api_name)
                    current_view = self.container.client.get_table(table)
                    view_sql = current_view.view_query
                    primary_shard_name = _extract_primary_shard_name(view_sql)
                elif SUFFIX_PATTERN.match(
                        text(table_api_name)[len(text(api_name)):]):
                    try:
                        known_table = self.container.client.get_table(table)
                        shards.append(known_table)
                    except Exception as e:
                        Log.warning("could not merge table {{table}}",
                                    table=table,
                                    cause=e)

        if not current_view:
            Log.error("expecting {{table}} to be a view pointing to a table",
                      table=api_name)

        shard_flakes = [
            Snowflake.parse(
                big_query_schema=shard.schema,
                es_index=text(self.container.full_name +
                              ApiName(shard.table_id)),
                top_level_fields=self.top_level_fields,
                partition=self.partition,
            ) for shard in shards
        ]
        total_flake = snowflakes.merge(
            shard_flakes,
            es_index=text(self.full_name),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )

        for i, s in enumerate(shards):
            if ApiName(s.table_id) == primary_shard_name:
                if total_flake == shard_flakes[i]:
                    # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
                    del shards[i]
                    del shard_flakes[i]
                    break
        else:
            name = self.short_name + "_" + "".join(Random.sample(ALLOWED, 20))
            primary_shard_name = escape_name(name)
            self.container.create_table(
                table=name,
                schema=total_flake.schema,
                sharded=False,
                read_only=False,
                kwargs=self.config,
            )

        primary_full_name = self.container.full_name + primary_shard_name

        selects = []
        for flake, table in zip(shard_flakes, shards):
            q = ConcatSQL(
                SQL_SELECT,
                JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR),
                        gen_select(total_flake, flake)),
                SQL_FROM,
                quote_column(ApiName(table.dataset_id, table.table_id)),
            )
            selects.append(q)

        Log.note("inserting into table {{table}}",
                 table=text(primary_shard_name))
        matched = []
        unmatched = []
        for sel, shard, flake in zip(selects, shards, shard_flakes):
            if flake == total_flake:
                matched.append((sel, shard, flake))
            else:
                unmatched.append((sel, shard, flake))

        # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
        if matched:
            for g, merge_chunk in jx.chunk(matched, MAX_MERGE):
                command = ConcatSQL(
                    SQL_INSERT,
                    quote_column(primary_full_name),
                    JoinSQL(
                        SQL_UNION_ALL,
                        (sql_query({
                            "from":
                            self.container.full_name + ApiName(shard.table_id)
                        }) for _, shard, _ in merge_chunk),
                    ),
                )
                DEBUG and Log.note("{{sql}}", sql=text(command))
                job = self.container.query_and_wait(command)
                Log.note("job {{id}} state = {{state}}",
                         id=job.job_id,
                         state=job.state)

                if job.errors:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )
                for _, shard, _ in merge_chunk:
                    self.container.client.delete_table(shard)

        # ALL OTHER SCHEMAS MISMATCH
        for s, shard, _ in unmatched:
            try:
                command = ConcatSQL(SQL_INSERT,
                                    quote_column(primary_full_name), s)
                DEBUG and Log.note("{{sql}}", sql=text(command))
                job = self.container.query_and_wait(command)
                Log.note(
                    "from {{shard}}, job {{id}}, state {{state}}",
                    id=job.job_id,
                    shard=shard.table_id,
                    state=job.state,
                )

                if job.errors:
                    if all(" does not have a schema." in m
                           for m in wrap(job.errors).message):
                        pass  # NOTHING TO DO
                    else:
                        Log.error(
                            "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                            sql=command.sql,
                            reason=job.errors,
                        )

                self.container.client.delete_table(shard)
            except Exception as e:
                Log.warning("failure to merge {{shard}}", shard=shard, cause=e)

        # REMOVE OLD VIEW
        view_full_name = self.container.full_name + api_name
        if current_view:
            self.container.client.delete_table(current_view)

        # CREATE NEW VIEW
        self.container.create_view(view_full_name, primary_full_name)
Example #8
def gen_select(total_flake, flake):
    """
    RETURN SELECT CLAUSES THAT MAP A SHARD'S SCHEMA (flake) ONTO THE MERGED SCHEMA (total_flake)
    """
    def _gen_select(jx_path, es_path, total_tops, total_flake, source_tops,
                    source_flake):
        if total_flake == source_flake and total_tops == source_tops:
            if not jx_path:  # TOP LEVEL FIELDS
                return [
                    quote_column(escape_name(k)) for k in total_flake.keys()
                    if not is_text(total_tops[k])
                ]
            else:
                Log.error("should not happen")

        if NESTED_TYPE in total_flake:
            k = NESTED_TYPE
            # PROMOTE EVERYTHING TO REPEATED
            v = source_flake.get(k)
            t = total_flake.get(k)

            if not v:
                # CONVERT INNER OBJECT TO ARRAY OF ONE STRUCT
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(
                            ConcatSQL(SQL_COMMA, SQL_CR),
                            _gen_select(jx_path, es_path + REPEATED, Null, t,
                                        Null, source_flake),
                        ),
                    )
                ]
            else:
                row_name = "row" + text(len(jx_path))
                ord_name = "ordering" + text(len(jx_path))
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(
                            ConcatSQL(SQL_COMMA, SQL_CR),
                            _gen_select([row_name], ApiName(row_name), Null, t,
                                        Null, v),
                        ),
                        SQL_FROM,
                        sql_call("UNNEST",
                                 quote_column(es_path + escape_name(k))),
                        SQL_AS,
                        SQL(row_name),
                        SQL(" WITH OFFSET AS "),
                        SQL(ord_name),
                        SQL_ORDERBY,
                        SQL(ord_name),
                    )
                ]

            return [sql_alias(sql_call("ARRAY", *inner), escape_name(k))]

        selection = []
        for k, t in jx.sort(total_flake.items(), 0):
            k_total_tops = total_tops if is_text(total_tops) else total_tops[k]
            k_tops = source_tops if is_text(source_tops) else source_tops[k]
            v = source_flake.get(k)
            if is_text(k_total_tops):
                # DO NOT INCLUDE TOP_LEVEL_FIELDS
                pass
            elif t == v and k_total_tops == k_tops:
                selection.append(
                    ConcatSQL(
                        quote_column(es_path + escape_name(k)),
                        SQL_AS,
                        quote_column(escape_name(k)),
                    ))
            elif is_data(t):
                if not v:
                    selects = _gen_select(
                        jx_path + [k],
                        es_path + escape_name(k),
                        k_total_tops,
                        t,
                        source_tops,
                        {},
                    )
                elif is_data(v):
                    selects = _gen_select(
                        jx_path + [k],
                        es_path + escape_name(k),
                        k_total_tops,
                        t,
                        source_tops,
                        v,
                    )
                else:
                    raise Log.error(
                        "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                        field=join_field(jx_path + [k]),
                        type=v,
                        main=t,
                    )
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), selects),
                    )
                ]
                selection.append(
                    sql_alias(sql_call("", *inner), escape_name(k)))
            elif is_text(t):
                if is_text(k_tops):
                    # THE SOURCE HAS THIS PROPERTY AS A TOP_LEVEL_FIELD
                    selection.append(
                        ConcatSQL(SQL(k_tops), SQL_AS,
                                  quote_column(escape_name(k))))
                elif v == t:
                    selection.append(
                        ConcatSQL(
                            quote_column(es_path + escape_name(k)),
                            SQL_AS,
                            quote_column(escape_name(k)),
                        ))
                else:
                    if v:
                        Log.note(
                            "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                            field=join_field(jx_path + [k]),
                            type=v,
                            main=t,
                        )
                    selection.append(
                        ConcatSQL(
                            sql_call(
                                "CAST",
                                ConcatSQL(SQL_NULL, SQL_AS,
                                          SQL(json_type_to_bq_type[t])),
                            ),
                            SQL_AS,
                            quote_column(escape_name(k)),
                        ))
            else:
                Log.error("not expected")
        return selection

    output = _gen_select(
        [],
        ApiName(),
        total_flake.top_level_fields,
        total_flake.schema,
        flake.top_level_fields,
        flake.schema,
    )
    tops = []

    for path, name in total_flake.top_level_fields.leaves():
        source = flake.top_level_fields[path]
        if source:
            # ALREADY TOP LEVEL FIELD
            source = SQL(source)
        else:
            # PULL OUT TOP LEVEL FIELD
            column = first(flake.leaves(path))
            source = SQL(column.es_column)

        tops.append(ConcatSQL(source, SQL_AS, quote_column(ApiName(name))))

    return tops + output
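
A hedged usage sketch, mirroring how merge_shards() in Example #7 calls gen_select(); the table names are hypothetical, and the SQL helpers (ConcatSQL, JoinSQL, SQL_SELECT, ...) are assumed to be in scope as in the examples above.

# SKETCH ONLY: MAP ONE SHARD'S SCHEMA ONTO THE MERGED SCHEMA, THEN SELECT FROM THE SHARD
selects = gen_select(total_flake, shard_flake)  # shard_flake PARSED FROM THE SHARD'S BQ SCHEMA
insert_source = ConcatSQL(
    SQL_SELECT,
    JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), selects),
    SQL_FROM,
    quote_column(ApiName("my_dataset", "my_shard")),  # HYPOTHETICAL NAMES
)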