Example #1
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        # FILL IN DEFAULTS FOR TIMEOUT AND RETRY SETTINGS
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
        kwargs.host = Random.sample(listwrap(host), 1)[0]  # PICK ONE HOST AT RANDOM

        # ENSURE THE TYPED ~N~ PROPERTY IS MAPPED AS A nested TYPE
        schema = json2value(value2json(SCHEMA), leaves=True)
        schema.mappings[type].properties["~N~"].type = "nested"
        self.es = Cluster(kwargs).get_or_create_index(
            schema=schema,
            limit_replicas=True,
            typed=True,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)

        # BACKGROUND THREAD DRAINS THE QUEUE INTO ELASTICSEARCH
        self.worker = Thread.run("add debug logs to es", self._insert_loop)
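
The constructor above fills missing settings with a coalesce-style rule: the first non-None value wins (timeout defaults to 30 seconds, retries to 3, retry sleep to one minute). A minimal sketch of that pattern in plain Python; first_not_none is a hypothetical stand-in for the library's coalesce:

def first_not_none(*values):
    # RETURN THE FIRST ARGUMENT THAT IS NOT None (THE "coalesce" PATTERN)
    for v in values:
        if v is not None:
            return v
    return None

timeout_seconds = first_not_none(None, 30)   # CALLER GAVE NOTHING -> 30
retry_times = first_not_none(5, 3)           # CALLER'S VALUE WINS -> 5
print(timeout_seconds, retry_times)          # 30 5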
Example #2
    def _create_new_shard(self):
        # MAKE AN UNSHARDED TABLE WITH A RANDOM SUFFIX TO SERVE AS THE NEW PRIMARY SHARD
        primary_shard = self.container.create_table(
            table=self.short_name + "_" + "".join(Random.sample(ALLOWED, 20)),
            sharded=False,
            schema=self._flake.schema,
            kwargs=self.config,
        )
        self.shard = primary_shard.shard
Example #3
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep,
                                               MINUTE)).seconds
        kwargs.host = Random.sample(listwrap(host), 1)[0]

        # ROLLOVER INTERVAL AND MAX DEFAULT TO EACH OTHER, AND FINALLY TO ONE YEAR
        rollover_interval = coalesce(kwargs.rollover.interval,
                                     kwargs.rollover.max, "year")
        rollover_max = coalesce(kwargs.rollover.max, kwargs.rollover.interval,
                                "year")

        schema = set_default(kwargs.schema, {
            "mappings": {
                kwargs.type: {
                    "properties": {
                        "~N~": {
                            "type": "nested"
                        }
                    }
                }
            }
        }, json2value(value2json(SCHEMA), leaves=True))

        self.es = RolloverIndex(
            # EXPRESSION SELECTING EACH RECORD'S timestamp, USED TO PICK ITS ROLLOVER INDEX
            rollover_field={"get": [{"first": "."}, {"literal": "timestamp"}]},
            rollover_interval=rollover_interval,
            rollover_max=rollover_max,
            schema=schema,
            limit_replicas=True,
            typed=True,
            read_only=False,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)

        self.worker = Thread.run("add debug logs to es", self._insert_loop)
Example #4
    def test_id_vs_id(self):
        # COMPARE LOOKUP SPEED: DICT KEYED BY id(o) VERSUS LIST INDEXED BY o.id

        ops = [Op() for _ in range(200)]
        lang1 = {id(o): o for o in ops}

        sample = Random.sample(ops, 1000 * 1000)
        with Timer("using id()"):
            result1 = [lang1[id(o)] for o in sample]

        lang2 = [None] * (max(o.id for o in ops) + 1)
        for o in ops:
            lang2[o.id] = o
        # lang2 = tuple(lang2)

        with Timer("using o.id"):
            result2 = [lang2[o.id] for o in sample]
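
The test above compares two ways to map an object back to itself: a dict keyed by id(o) versus a plain list indexed by a small, dense integer attribute. A self-contained version of the same comparison using only the standard library; the Op class and sizes here are illustrative stand-ins, not the ones from the test:

import timeit

class Op:
    _counter = 0
    def __init__(self):
        self.id = Op._counter   # SMALL, DENSE INTEGER ID
        Op._counter += 1

ops = [Op() for _ in range(200)]
by_identity = {id(o): o for o in ops}   # DICT KEYED BY THE OBJECT'S id()
by_index = [None] * len(ops)            # LIST INDEXED BY THE o.id ATTRIBUTE
for o in ops:
    by_index[o.id] = o

sample = ops * 5000                     # ONE MILLION LOOKUPS PER PASS
print("id() :", timeit.timeit(lambda: [by_identity[id(o)] for o in sample], number=10))
print("o.id :", timeit.timeit(lambda: [by_index[o.id] for o in sample], number=10))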
Example #5
    def merge_shards(self):
        # COMBINE ALL SHARDS INTO A SINGLE PRIMARY SHARD, THEN POINT THE VIEW AT IT
        shards = []
        tables = list(self.container.client.list_tables(
            self.container.dataset))
        current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
        primary_shard_name = None  # PRIMARY SHARD
        api_name = escape_name(self.short_name)

        for table_item in tables:
            table = table_item.reference
            table_api_name = ApiName(table.table_id)
            if text(table_api_name).startswith(text(api_name)):
                if table_api_name == api_name:
                    if table_item.table_type != "VIEW":
                        Log.error("expecting {{table}} to be a view",
                                  table=api_name)
                    current_view = self.container.client.get_table(table)
                    view_sql = current_view.view_query
                    primary_shard_name = _extract_primary_shard_name(view_sql)
                elif SUFFIX_PATTERN.match(
                        text(table_api_name)[len(text(api_name)):]):
                    try:
                        known_table = self.container.client.get_table(table)
                        shards.append(known_table)
                    except Exception as e:
                        Log.warning("could not merge table {{table}}",
                                    table=table,
                                    cause=e)

        if not current_view:
            Log.error("expecting {{table}} to be a view pointing to a table",
                      table=api_name)

        shard_flakes = [
            Snowflake.parse(
                big_query_schema=shard.schema,
                es_index=text(self.container.full_name +
                              ApiName(shard.table_id)),
                top_level_fields=self.top_level_fields,
                partition=self.partition,
            ) for shard in shards
        ]
        total_flake = snowflakes.merge(
            shard_flakes,
            es_index=text(self.full_name),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )

        for i, s in enumerate(shards):
            if ApiName(s.table_id) == primary_shard_name:
                if total_flake == shard_flakes[i]:
                    # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
                    del shards[i]
                    del shard_flakes[i]
                    break
        else:
            # NO EXISTING SHARD MATCHES THE MERGED SCHEMA; MAKE A NEW PRIMARY SHARD
            name = self.short_name + "_" + "".join(Random.sample(ALLOWED, 20))
            primary_shard_name = escape_name(name)
            self.container.create_table(
                table=name,
                schema=total_flake.schema,
                sharded=False,
                read_only=False,
                kwargs=self.config,
            )

        primary_full_name = self.container.full_name + primary_shard_name

        selects = []
        for flake, table in zip(shard_flakes, shards):
            q = ConcatSQL(
                SQL_SELECT,
                JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR),
                        gen_select(total_flake, flake)),
                SQL_FROM,
                quote_column(ApiName(table.dataset_id, table.table_id)),
            )
            selects.append(q)

        DEBUG and Log.note("inserting into table {{table}}",
                           table=text(primary_shard_name))
        matched = []
        unmatched = []
        for sel, shard, flake in zip(selects, shards, shard_flakes):
            if flake == total_flake:
                matched.append((sel, shard, flake))
            else:
                unmatched.append((sel, shard, flake))

        # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
        if matched:
            for g, merge_chunk in jx.chunk(matched, MAX_MERGE):
                command = ConcatSQL(
                    SQL_INSERT,
                    quote_column(primary_full_name),
                    JoinSQL(
                        SQL_UNION_ALL,
                        (sql_query(
                            {
                                "from":
                                text(self.container.full_name +
                                     ApiName(shard.table_id))
                            },
                            schema,
                        ) for _, shard, schema in merge_chunk),
                    ),
                )
                DEBUG and Log.note("{{sql}}", sql=text(command))
                job = self.container.query_and_wait(command)
                DEBUG and Log.note("job {{id}} state = {{state}}",
                                   id=job.job_id,
                                   state=job.state)

                if job.errors:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )
                for _, shard, _ in merge_chunk:
                    self.container.client.delete_table(shard)

        # ALL OTHER SCHEMAS MISMATCH
        for s, shard, _ in unmatched:
            try:
                command = ConcatSQL(SQL_INSERT,
                                    quote_column(primary_full_name), s)
                DEBUG and Log.note("{{sql}}", sql=text(command))
                job = self.container.query_and_wait(command)
                DEBUG and Log.note(
                    "from {{shard}}, job {{id}}, state {{state}}",
                    id=job.job_id,
                    shard=shard.table_id,
                    state=job.state,
                )

                if job.errors:
                    if all(" does not have a schema." in m
                           for m in wrap(job.errors).message):
                        pass  # NOTHING TO DO
                    else:
                        Log.error(
                            "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                            sql=command.sql,
                            reason=job.errors,
                        )

                self.container.client.delete_table(shard)
            except Exception as e:
                Log.warning("failure to merge {{shard}}", shard=shard, cause=e)

        # REMOVE OLD VIEW
        view_full_name = self.container.full_name + api_name
        if current_view:
            self.container.client.delete_table(current_view)

        # CREATE NEW VIEW
        self.container.create_view(view_full_name, primary_full_name)
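
The shard-selection loop in merge_shards relies on Python's for/else: the else block runs only when the loop completes without hitting break, which is how the code detects that no existing shard matches the merged schema and a fresh primary shard must be created. A minimal illustration of that control flow:

candidates = [3, 5, 7]
target = 4

for c in candidates:
    if c == target:
        print("reuse existing", c)
        break
else:
    # RUNS ONLY WHEN THE LOOP FINISHES WITHOUT break: NOTHING MATCHED
    print("no match; create a new one")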
Example #6
    def create_table(
        self,
        table,
        schema=None,
        typed=True,
        read_only=True,  # TO PREVENT ACCIDENTAL WRITING
        sharded=False,
        partition=Null,  # PARTITION RULES
        cluster=None,  # TUPLE OF FIELDS TO SORT DATA
        top_level_fields=Null,
        kwargs=None,
    ):
        # ONLY A schema IS ACCEPTED HERE; REJECT CONFIGS THAT CARRY lookup OR flake
        if kwargs.lookup != None or kwargs.flake != None:
            Log.error("expecting schema, not lookup")
        full_name = self.full_name + escape_name(table)
        if not schema:
            # WE MUST HAVE SOMETHING
            if typed:
                schema = copy(DEFAULT_TYPED_SCHEMA)
            else:
                schema = copy(DEFAULT_SCHEMA)

        flake = Snowflake(text(full_name),
                          top_level_fields,
                          partition,
                          schema=schema)

        if read_only:
            Log.error("Can not create a table for read-only use")

        if sharded:
            # CREATE A SHARD WITH A RANDOM SUFFIX, PLUS A VIEW THAT POINTS TO IT
            shard_name = escape_name(table + "_" +
                                     "".join(Random.sample(ALLOWED, 20)))
            shard_api_name = self.full_name + shard_name
            _shard = bigquery.Table(text(shard_api_name),
                                    schema=flake.to_bq_schema())
            _shard.time_partitioning = unwrap(
                flake._partition.bq_time_partitioning)
            _shard.clustering_fields = [
                c.es_column for f in listwrap(cluster)
                for c in [first(flake.leaves(f))] if c
            ] or None
            self.shard = self.client.create_table(_shard)
            self.create_view(full_name, shard_api_name)
        else:
            _table = bigquery.Table(text(full_name),
                                    schema=flake.to_bq_schema())
            _table.time_partitioning = unwrap(
                flake._partition.bq_time_partitioning)
            _table.clustering_fields = [
                l.es_column for f in listwrap(cluster) for l in flake.leaves(f)
            ] or None
            self.client.create_table(_table)
            DEBUG and Log.note("created table {{table}}",
                               table=_table.table_id)

        return Table(
            table=table,
            typed=typed,
            read_only=read_only,
            sharded=sharded,
            partition=partition,
            top_level_fields=top_level_fields,
            kwargs=kwargs,
            container=self,
        )
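
A usage sketch for the method above, based only on the parameters visible in its signature; the container instance and the field names are assumptions for illustration:

# HYPOTHETICAL CALL; ONLY ARGUMENTS SHOWN IN THE SIGNATURE ABOVE ARE USED
new_table = container.create_table(
    table="debug_logs",
    typed=True,
    read_only=False,         # MUST BE False, OTHERWISE Log.error IS RAISED
    sharded=True,            # CREATE A SHARD PLUS A VIEW POINTING AT IT
    cluster=("timestamp",),  # FIELDS USED AS CLUSTERING COLUMNS
)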