def test_normal(self):
    num = 1000
    results = []
    for i in range(0, num):
        samples = numpy.random.normal(size=20)
        desc, score = deviance(samples)
        results.append(desc)

    self.assertEqual(
        list(jx.groupby(["ok", "not ok"])),
        [("not ok", ["not ok"]), ("ok", ["ok"])],
        "expecting version >=2.53",
    )

    # normals are OK
    for desc, n in jx.groupby(results):
        Log.note("{{desc}}: {{count}}", desc=desc, count=len(n))
        if desc == "OK":
            self.assertLessEqual(num * 0.99, len(n))

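# A rough pure-Python equivalent of the jx.groupby behavior the assertion
# above relies on: grouping whole (scalar) values yields (key, group) pairs
# in key order. This is only a sketch for orientation, not the jx code.
from itertools import groupby

def groupby_values(values):
    for key, group in groupby(sorted(values)):
        yield key, list(group)

assert list(groupby_values(["ok", "not ok"])) == [("not ok", ["not ok"]), ("ok", ["ok"])]
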
def _execute_backlog(self):
    if not self.backlog:
        return

    (backlog, self.backlog) = (self.backlog, [])
    if self.db.__module__.startswith("pymysql"):
        # BUG IN PYMYSQL: CAN NOT HANDLE MULTIPLE STATEMENTS
        # https://github.com/PyMySQL/PyMySQL/issues/157
        for b in backlog:
            sql = self.preamble + b
            try:
                if self.debug:
                    Log.note("Execute SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(b)
            except Exception as e:
                Log.error("Can not execute sql:\n{{sql}}", sql=sql, cause=e)
        self.cursor.close()
        self.cursor = self.db.cursor()
    else:
        for i, g in jx.groupby(backlog, size=MAX_BATCH_SIZE):
            sql = self.preamble + ";\n".join(g)
            try:
                if self.debug:
                    Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(sql)
                self.cursor.close()
                self.cursor = self.db.cursor()
            except Exception as e:
                Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

def _execute_backlog(self):
    if not self.backlog:
        return

    backlog, self.backlog = self.backlog, []
    if self.db.__module__.startswith("pymysql"):
        # BUG IN PYMYSQL: CAN NOT HANDLE MULTIPLE STATEMENTS
        # https://github.com/PyMySQL/PyMySQL/issues/157
        for b in backlog:
            sql = self.preamble + b
            try:
                self.debug and Log.note("Execute SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(b)
            except Exception as e:
                Log.error("Can not execute sql:\n{{sql}}", sql=sql, cause=e)
        self.cursor.close()
        self.cursor = self.db.cursor()
    else:
        for i, g in jx.groupby(backlog, size=MAX_BATCH_SIZE):
            sql = self.preamble + ";\n".join(g)
            try:
                self.debug and Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(sql)
                self.cursor.close()
                self.cursor = self.db.cursor()
            except Exception as e:
                Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

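# Minimal sketch of the two execution strategies used above, with the
# driver machinery stripped away. All names here are stand-ins for the
# surrounding class; only the control flow is the point.
def chunks(items, size):
    # Yield consecutive slices of at most `size` items.
    for i in range(0, len(items), size):
        yield items[i:i + size]

def execute_backlog(cursor, backlog, batch_size, supports_multi_statement):
    if supports_multi_statement:
        # Most drivers accept a ";"-joined batch per execute() call.
        for group in chunks(backlog, batch_size):
            cursor.execute(";\n".join(group))
    else:
        # PyMySQL (issue #157) gets exactly one statement per call.
        for statement in backlog:
            cursor.execute(statement)
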
def add_cset_entries(self, ordered_rev_list, timestamp=False, number_forward=True):
    '''
    Adds a list of revisions to the table. Assumes ordered_rev_list is
    ordered based on how changesets are found in the changelog. Going
    forwards or backwards is dealt with by flipping the list.

    :param ordered_rev_list: Order given from changeset log searching.
    :param timestamp: If False, records are kept indefinitely, but if holes
        exist, e.g. (delete, None, delete, None), those deletes with Nones
        around them will not be deleted.
    :param number_forward: If True, number the revision list going forward
        from max(revnum); otherwise go backwards from min(revnum), then add
        X to all revnums and self.next_revnum, where X is the length of
        ordered_rev_list.
    :return:
    '''
    with self.conn.transaction() as t:
        current_min = t.get_one("SELECT min(revnum) FROM csetlog")[0]
        current_max = t.get_one("SELECT max(revnum) FROM csetlog")[0]
        if not current_min or not current_max:
            current_min = 0
            current_max = 0

        direction = -1
        start = current_min - 1
        if number_forward:
            direction = 1
            start = current_max + 1
            ordered_rev_list = ordered_rev_list[::-1]

        insert_list = [
            (start + direction * count, rev, int(time.time()) if timestamp else -1)
            for count, rev in enumerate(ordered_rev_list)
        ]

        # In case of overlapping requests
        fmt_insert_list = []
        for cset_entry in insert_list:
            tmp = self._get_one_revision(t, cset_entry)
            if not tmp:
                fmt_insert_list.append(cset_entry)

        for _, tmp_insert_list in jx.groupby(fmt_insert_list, size=SQL_CSET_BATCH_SIZE):
            t.execute(
                "INSERT INTO csetLog (revnum, revision, timestamp)" +
                " VALUES " +
                sql_list(
                    quote_set((revnum, revision, timestamp))
                    for revnum, revision, timestamp in tmp_insert_list
                )
            )

        # Move the revision numbers forward if needed
        self.recompute_table_revnums()

    # Start a maintenance run if needed
    if self.check_for_maintenance():
        Log.note("Scheduling maintenance run on clogger.")
        self.maintenance_signal.go()

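# The numbering rule above, isolated (names are illustrative): counting
# forward starts at max(revnum)+1 over the reversed list; counting backward
# starts at min(revnum)-1 in the given order.
def number_revisions(ordered_rev_list, current_min, current_max, number_forward=True):
    if number_forward:
        direction, start = 1, current_max + 1
        ordered_rev_list = ordered_rev_list[::-1]
    else:
        direction, start = -1, current_min - 1
    return [(start + direction * i, rev) for i, rev in enumerate(ordered_rev_list)]
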
def insert_into_db_chunked(transaction, data, cmd, sql_chunk_size=500):
    # For the `cmd` object, we expect something like (don't forget the whitespace at the end):
    # "INSERT INTO temporal (tuid, file, revision, line) VALUES "
    #
    # `data` must be a list of tuples.
    for _, inserts_list in jx.groupby(data, size=sql_chunk_size):
        transaction.execute(
            cmd + sql_list(quote_set(entry) for entry in inserts_list)
        )

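# Hypothetical usage of the helper above; `transaction` comes from the
# surrounding codebase and the row values are invented for illustration.
rows = [
    (1, "taskcluster/ci/test.py", "ffa6e5c62e7f", 10),
    (2, "taskcluster/ci/test.py", "ffa6e5c62e7f", 11),
]
insert_into_db_chunked(
    transaction,
    rows,
    "INSERT INTO temporal (tuid, file, revision, line) VALUES ",  # trailing space matters
)
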
def test_groupby_values(self):
    self.assertAlmostEqual(
        list(jx.groupby(["ok", "not ok"])),
        [
            ("not ok", ("not ok",)),
            ("ok", ("ok",))
        ],
        "expecting to group by whole value"
    )

def _execute_backlog(self):
    if not self.backlog:
        return

    backlog, self.backlog = self.backlog, []
    for i, g in jx.groupby(backlog, size=MAX_BATCH_SIZE):
        sql = self.preamble + ";\n".join(g)
        try:
            self.debug and Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
            self.cursor.execute(sql)
            self.cursor.close()
            self.cursor = self.db.cursor()
        except Exception as e:
            Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

def _insert_loop(self, please_stop=None):
    bad_count = 0
    while not please_stop:
        try:
            messages = wrap(self.queue.pop_all())
            if not messages:
                Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                continue

            for g, mm in jx.groupby(messages, size=self.batch_size):
                scrubbed = []
                for i, message in enumerate(mm):
                    if message is THREAD_STOP:
                        please_stop.go()
                        continue
                    try:
                        messages = flatten_causal_chain(message.value)
                        scrubbed.append({
                            "value": [_deep_json_to_string(m, depth=3) for m in messages]
                        })
                    except Exception as e:
                        Log.warning("Problem adding to scrubbed list", cause=e)
                self.es.extend(scrubbed)
                bad_count = 0
        except Exception as f:
            Log.warning("Problem inserting logs into ES", cause=f)
            bad_count += 1
            if bad_count > MAX_BAD_COUNT:
                Log.warning(
                    "Given up trying to write debug logs to ES index {{index}}",
                    index=self.es.settings.index,
                )
            Till(seconds=PAUSE_AFTER_BAD_INSERT).wait()

    self.es.flush()

    # CONTINUE TO DRAIN THIS QUEUE
    while not please_stop:
        try:
            Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
            self.queue.pop_all()
        except Exception as e:
            Log.warning("Should not happen", cause=e)

def _insert_loop(self, please_stop=None):
    bad_count = 0
    while not please_stop:
        try:
            Till(seconds=1).wait()
            messages = wrap(self.queue.pop_all())
            if not messages:
                continue

            for g, mm in jx.groupby(messages, size=self.batch_size):
                scrubbed = []
                try:
                    for i, message in enumerate(mm):
                        if message is THREAD_STOP:
                            please_stop.go()
                            return
                        scrubbed.append(_deep_json_to_string(message, depth=3))
                finally:
                    self.es.extend(scrubbed)
                bad_count = 0
        except Exception as e:
            Log.warning("Problem inserting logs into ES", cause=e)
            bad_count += 1
            if bad_count > MAX_BAD_COUNT:
                Log.warning(
                    "Given up trying to write debug logs to ES index {{index}}",
                    index=self.es.settings.index)
            Till(seconds=30).wait()

    # CONTINUE TO DRAIN THIS QUEUE
    while not please_stop:
        try:
            Till(seconds=1).wait()
            self.queue.pop_all()
        except Exception as e:
            Log.warning("Should not happen", cause=e)

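# The drain-and-batch pattern used by both _insert_loop variants above,
# sketched with stdlib pieces. Every name here is a stand-in for the
# Thread/Till/es machinery; only the shape of the loop is the point.
import queue
import threading

def drain_in_batches(q, batch_size, send, stop):
    # Pull whatever is available, then ship it in batches of at most batch_size.
    while not stop.is_set():
        batch = []
        try:
            while len(batch) < batch_size:
                batch.append(q.get(timeout=1))
        except queue.Empty:
            pass
        if batch:
            send(batch)  # e.g. es.extend(batch)
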
def process(
    sig_id,
    since,
    source,
    destination,
):
    """
    :param sig_id: The performance hash
    :param since: Only data after this date
    :param source: Database to read from
    :param destination: Container accepting the summary records
    :return:
    """
    if not isinstance(sig_id, int):
        Log.error("expecting id")

    # GET SIGNATURE DETAILS
    sig = get_signature(source, sig_id)

    # GET THE DATA
    pushes = get_dataum(source, sig_id, since, LIMIT)
    pushes = jx.sort(
        [
            {
                "value": median(rows.value),
                "runs": rows,
                "push": {"time": unwrap(t)["push.time"]},
            }
            for t, rows in jx.groupby(pushes, "push.time")
            if t["push\\.time"] > since
        ],
        "push.time",
    )

    values = list(pushes.value)
    title = "-".join(
        map(
            str,
            [
                sig.framework,
                sig.suite,
                sig.test,
                sig.platform,
                sig.repository,
            ],
        )
    )
    Log.note("With {{title}}", title=title)

    if len(values) > LIMIT:
        Log.alert(
            "Too many values for {{title}} (at least {{num}}), choosing last {{limit}}",
            title=title,
            num=len(values),
            limit=LIMIT,
        )
        values = values[-LIMIT:]

    with Timer("find segments"):
        new_segments, new_diffs = find_segments(
            values, sig.alert_change_type, sig.alert_threshold
        )

    if len(new_segments) == 1:
        overall_dev_status = None
        overall_dev_score = None
        last_mean = None
        last_std = None
        last_dev_status = None
        last_dev_score = None
        relative_noise = None
    else:
        # NOISE OF LAST SEGMENT
        s, e = new_segments[-2], new_segments[-1]
        last_segment = np.array(values[s:e])
        trimmed_segment = last_segment
        last_mean = np.mean(trimmed_segment)
        last_std = np.std(trimmed_segment)
        last_dev_status, last_dev_score = deviance(trimmed_segment)
        relative_noise = last_std / last_mean

        # FOR EACH SEGMENT, NORMALIZE MEAN AND VARIANCE
        normalized = []
        for s, e in jx.pairs(new_segments):
            data = np.array(values[s:e])
            norm = (data + last_mean - np.mean(data)) * last_std / np.std(data)
            normalized.extend(norm)

        overall_dev_status, overall_dev_score = deviance(normalized)
        Log.note(
            "\n\tdeviance = {{deviance}}\n\tnoise={{std}}\n\tpushes={{pushes}}\n\tsegments={{num_segments}}",
            title=title,
            deviance=(overall_dev_status, overall_dev_score),
            std=relative_noise,
            pushes=len(values),
            num_segments=len(new_segments) - 1,
        )

    destination.add(
        Data(
            id=sig_id,
            title=title,
            num_pushes=len(values),
            num_segments=len(new_segments) - 1,
            relative_noise=relative_noise,
            overall_dev_status=overall_dev_status,
            overall_dev_score=overall_dev_score,
            last_mean=last_mean,
            last_std=last_std,
            last_dev_status=last_dev_status,
            last_dev_score=last_dev_score,
            last_updated=Date.now(),
            values=values,
        )
        | scrub(sig)
    )

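# For comparison: the textbook way to map a segment onto a target mean and
# standard deviation. The snippet above uses the more compact expression
# (data + last_mean - np.mean(data)) * last_std / np.std(data), which shifts
# before scaling; this sketch shows the conventional shift-after-scale form.
import numpy as np

def normalize_segment(data, target_mean, target_std):
    # Shift to zero mean, scale to unit std, then map onto the target stats.
    data = np.asarray(data, dtype=float)
    return (data - data.mean()) / data.std() * target_std + target_mean
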
def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if column.es_index in self.index_does_not_exist:
        return

    if column.jx_type in STRUCT:
        Log.error("not supported")
    try:
        if column.es_index == "meta.columns":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.columns, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.columns),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        if column.es_index == "meta.tables":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.tables, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.tables),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return

        es_index = column.es_index.split(".")[0]

        is_text = [
            cc
            for cc in self.meta.columns
            if cc.es_column == column.es_column and cc.es_type == "text"
        ]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": {"filter": {"match_all": {}}}},
                "size": 0
            })
            count = result.hits.total
            cardinality = max(1001, count)
            multi = 1001
        elif column.es_column == "_id":
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "query": {"match_all": {}},
                "size": 0
            })
            count = cardinality = result.hits.total
            multi = 1
        elif column.es_type == BOOLEAN:
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": _counting_query(column)},
                "size": 0
            })
            count = result.hits.total
            cardinality = 2
            multi = 1
        else:
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {
                    "count": _counting_query(column),
                    "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                },
                "size": 0
            })
            agg_results = result.aggregations
            count = result.hits.total
            cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
            multi = int(coalesce(agg_results.multi.value, 1))

        if cardinality == None:
            Log.error("logic error")

        query = Data(size=0)

        if column.es_column == "_id":
            self.meta.columns.update({
                "set": {
                    "count": cardinality,
                    "cardinality": cardinality,
                    "multi": 1,
                    "last_updated": Date.now()
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": Date.now()
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": Date.now()
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif len(column.nested_path) != 1:
            query.aggs["_"] = {
                "nested": {"path": column.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": column.es_column}}}
            }
        elif cardinality == 0:
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

        result = self.es_cluster.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        self.meta.columns.update({
            "set": {
                "count": count,
                "cardinality": cardinality,
                "multi": multi,
                "partitions": parts,
                "last_updated": Date.now()
            },
            "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
        })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
        # from tests.test_jx import TEST_TABLE
        e = Except.wrap(e)
        TEST_TABLE = "testdata"
        is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
        if is_missing_index and is_test_table:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index}}
            })
            self.index_does_not_exist.add(column.es_index)
        else:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    now = Date.now()
    if column.es_index in self.index_does_not_exist:
        return

    if column.jx_type in STRUCT:
        Log.error("not supported")
    try:
        if column.es_index == "meta.columns":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.columns, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.columns),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        if column.es_index == "meta.tables":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.tables, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.tables),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return

        es_index = column.es_index.split(".")[0]

        is_text = [
            cc
            for cc in self.meta.columns
            if cc.es_column == column.es_column and cc.es_type == "text"
        ]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": {"filter": {"match_all": {}}}},
                "size": 0
            })
            count = result.hits.total
            cardinality = max(1001, count)
            multi = 1001
        elif column.es_column == "_id":
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "query": {"match_all": {}},
                "size": 0
            })
            count = cardinality = result.hits.total
            multi = 1
        elif column.es_type == BOOLEAN:
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": _counting_query(column)},
                "size": 0
            })
            count = result.hits.total
            cardinality = 2

            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": [False, True],
                    "multi": 1,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        else:
            es_query = {
                "aggs": {
                    "count": _counting_query(column),
                    "_filter": {
                        "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}},
                        "filter": {"bool": {"should": [
                            {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}},
                            {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}}
                        ]}}
                    }
                },
                "size": 0
            }

            result = self.es_cluster.post("/" + es_index + "/_search", data=es_query)
            agg_results = result.aggregations
            count = result.hits.total
            cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
            multi = int(coalesce(agg_results._filter.multi.value, 1))

        if cardinality == None:
            Log.error("logic error")

        query = Data(size=0)

        if column.es_column == "_id":
            self.meta.columns.update({
                "set": {
                    "count": cardinality,
                    "cardinality": cardinality,
                    "multi": 1,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif len(column.nested_path) != 1:
            query.aggs["_"] = {
                "nested": {"path": column.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": column.es_column}}}
            }
        elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

        result = self.es_cluster.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        DEBUG and Log.note(
            "update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}",
            id=id(column), column=column, time=now)
        self.meta.columns.update({
            "set": {
                "count": count,
                "cardinality": cardinality,
                "multi": multi,
                "partitions": parts,
                "last_updated": now
            },
            "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
        })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
        # from tests.test_jx import TEST_TABLE
        e = Except.wrap(e)
        TEST_TABLE = "testdata"
        is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
        if is_missing_index:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            Log.warning("Missing index {{col.es_index}}", col=column, cause=e)
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index}}
            })
            self.index_does_not_exist.add(column.es_index)
        elif "No field found for" in e:
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
        else:
            self.meta.columns.update({
                "set": {"last_updated": now},
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

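# Illustrative partitions request for a simple (non-nested) column whose
# cardinality is already known, matching the terms aggregation the code
# above builds ("build.type" is a stand-in field name):
query = {
    "aggs": {"_": {"terms": {"field": "build.type", "size": 3}}},
    "size": 0,
}
# The code then reads the bucket keys, roughly:
# parts = jx.sort(result.aggregations._.buckets.key)
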
def groupby(self, keys):
    return jx.groupby(self.__iter__(), keys)

def process(
    about_deviant,
    since,
    source,
    deviant_summary,
    show=False,
    show_limit=MAX_POINTS,
    show_old=False,
    show_distribution=None,
):
    """
    :param about_deviant: The performance signature being processed
    :param since: Only data after this date
    :param show:
    :param show_limit:
    :param show_old:
    :param show_distribution:
    :return:
    """
    sig_id = about_deviant.id
    if not isinstance(sig_id, int):
        Log.error("expecting id")

    # GET SIGNATURE DETAILS
    sig = get_signature(db_config=source, signature_id=sig_id)

    # GET THE DATA
    data = get_dataum(source, sig.id, since=since, limit=show_limit)

    min_date = since.unix
    pushes = jx.sort(
        [
            {
                "value": median(rows.value),
                "runs": rows,
                "push": {"time": unwrap(t)["push.time"]},
            }
            for t, rows in jx.groupby(data, "push.time")
            if t["push\\.time"] > min_date
        ],
        "push.time",
    )

    values = list(pushes.value)
    title = "-".join(
        map(
            str,
            [
                sig.id,
                sig.framework,
                sig.suite,
                sig.test,
                sig.repository,
                sig.platform,
                about_deviant.overall_dev_status,
            ],
        )
    )
    # EG https://treeherder.mozilla.org/perf.html#/graphs?highlightAlerts=1&series=mozilla-central,fee739b45f7960e4a520d8e0bd781dd9d0a3bec4,1,10&timerange=31536000
    url = "https://treeherder.mozilla.org/perf.html#/graphs?" + value2url_param({
        "highlightAlerts": 1,
        "series": [sig.repository, sig.id, 1, coalesce(sig.framework_id, sig.framework)],
        "timerange": Duration(TREEHERDER_RANGE).seconds,
    })
    Log.note("With {{title}}: {{url}}", title=title, url=url)

    with Timer("find segments"):
        new_segments, new_diffs = find_segments(
            values, sig.alert_change_type, sig.alert_threshold
        )

    # USE PERFHERDER ALERTS TO IDENTIFY OLD SEGMENTS
    old_segments = tuple(
        sorted(
            set(
                [i for i, p in enumerate(pushes) if any(r.alert.id for r in p.runs)]
                + [0, len(pushes)]
            )
        )
    )
    old_medians = [0.0] + [
        np.median(values[s:e]) for s, e in zip(old_segments[:-1], old_segments[1:])
    ]
    old_diffs = np.array(
        [b / a - 1 for a, b in zip(old_medians[:-1], old_medians[1:])] + [0]
    )

    if len(new_segments) == 1:
        overall_dev_status = None
        overall_dev_score = None
        last_mean = None
        last_std = None
        last_dev_status = None
        last_dev_score = None
        relative_noise = None
        Log.note("not ")
    else:
        # NOISE OF LAST SEGMENT
        s, e = new_segments[-2], new_segments[-1]
        last_segment = np.array(values[s:e])
        ignore = IGNORE_TOP
        trimmed_segment = last_segment[np.argsort(last_segment)[ignore:-ignore]]
        last_mean = np.mean(trimmed_segment)
        last_std = np.std(trimmed_segment)
        last_dev_status, last_dev_score = deviance(trimmed_segment)
        relative_noise = last_std / last_mean

        # FOR EACH SEGMENT, NORMALIZE MEAN AND VARIANCE
        normalized = []
        for s, e in jx.pairs(new_segments):
            data = np.array(values[s:e])
            norm = (data + last_mean - np.mean(data)) * last_std / np.std(data)
            normalized.extend(norm)

        overall_dev_status, overall_dev_score = deviance(normalized)
        Log.note(
            "\n\tdeviance = {{deviance}}\n\tnoise={{std}}\n\tpushes={{pushes}}\n\tsegments={{num_segments}}",
            title=title,
            deviance=(overall_dev_status, overall_dev_score),
            std=relative_noise,
            pushes=len(values),
            num_segments=len(new_segments) - 1,
        )

        if show_distribution:
            histogram(trimmed_segment, title=last_dev_status + "=" + text(last_dev_score))

    max_extra_diff = None
    max_missing_diff = None
    _is_diff = is_diff(new_segments, old_segments)
    if _is_diff:
        # FOR MISSING POINTS, CALC BIGGEST DIFF
        max_extra_diff = mo_math.MAX(
            abs(d)
            for s, d in zip(new_segments, new_diffs)
            if all(not (s - TOLERANCE <= o <= s + TOLERANCE) for o in old_segments)
        )
        max_missing_diff = mo_math.MAX(
            abs(d)
            for s, d in zip(old_segments, old_diffs)
            if all(not (s - TOLERANCE <= n <= s + TOLERANCE) for n in new_segments)
        )
        Log.alert(
            "Disagree max_extra_diff={{max_extra_diff|round(places=3)}}, max_missing_diff={{max_missing_diff|round(places=3)}}",
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
        )
        Log.note("old={{old}}, new={{new}}", old=old_segments, new=new_segments)
    else:
        Log.note("Agree")

    if show and len(pushes):
        show_old and assign_colors(values, old_segments, title="OLD " + title)
        assign_colors(values, new_segments, title="NEW " + title)
        if url:
            webbrowser.open(url)

    if isinstance(deviant_summary, bigquery.Table):
        Log.note("BigQuery summary not updated")
        return

    deviant_summary.upsert(
        where={"eq": {"id": sig.id}},
        doc=Data(
            id=sig_id,
            title=title,
            num_pushes=len(values),
            num_segments=len(new_segments) - 1,
            relative_noise=relative_noise,
            overall_dev_status=overall_dev_status,
            overall_dev_score=overall_dev_score,
            last_mean=last_mean,
            last_std=last_std,
            last_dev_status=last_dev_status,
            last_dev_score=last_dev_score,
            last_updated=Date.now(),
            is_diff=_is_diff,
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
            num_new_segments=len(new_segments),
            num_old_segments=len(old_segments),
        ),
    )

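# The boundary-matching rule behind max_extra_diff/max_missing_diff above,
# in isolation. The TOLERANCE value here is an assumption for illustration;
# the real constant is defined elsewhere in the module.
TOLERANCE = 2

def unmatched(boundaries, others, tolerance=TOLERANCE):
    # A boundary is unmatched when no counterpart lies within `tolerance`.
    return [
        b for b in boundaries
        if all(not (b - tolerance <= o <= b + tolerance) for o in others)
    ]
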
def _scan_database(self):
    # GET ALL RELATIONS
    raw_relations = self.db.query(
        """
        SELECT
            table_schema,
            table_name,
            referenced_table_schema,
            referenced_table_name,
            referenced_column_name,
            constraint_name,
            column_name,
            ordinal_position
        FROM
            information_schema.key_column_usage
        WHERE
            referenced_column_name IS NOT NULL
        """,
        param=self.settings.database,
    )

    if not raw_relations:
        Log.error("No relations in the database")

    for r in self.settings.add_relations:
        try:
            lhs, rhs = map(strings.trim, r.split("->"))
            lhs = lhs.split(".")
            if len(lhs) == 2:
                lhs = [self.settings.database.schema] + lhs
            rhs = rhs.split(".")
            if len(rhs) == 2:
                rhs = [self.settings.database.schema] + rhs
            to_add = Data(
                ordinal_position=1,  # CAN ONLY HANDLE 1-COLUMN RELATIONS
                table_schema=lhs[0],
                table_name=lhs[1],
                column_name=lhs[2],
                referenced_table_schema=rhs[0],
                referenced_table_name=rhs[1],
                referenced_column_name=rhs[2],
            )

            # CHECK IF EXISTING
            if jx.filter(raw_relations, {"eq": to_add}):
                Log.note("Relation {{relation}} already exists", relation=r)
                continue

            to_add.constraint_name = Random.hex(20)
            raw_relations.append(to_add)
        except Exception as e:
            Log.error("Could not parse {{line|quote}}", line=r, cause=e)

    relations = jx.select(
        raw_relations,
        [
            {"name": "constraint.name", "value": "constraint_name"},
            {"name": "table.schema", "value": "table_schema"},
            {"name": "table.name", "value": "table_name"},
            {"name": "column.name", "value": "column_name"},
            {"name": "referenced.table.schema", "value": "referenced_table_schema"},
            {"name": "referenced.table.name", "value": "referenced_table_name"},
            {"name": "referenced.column.name", "value": "referenced_column_name"},
            {"name": "ordinal_position", "value": "ordinal_position"},
        ],
    )

    # GET ALL TABLES
    raw_tables = self.db.query("""
        SELECT
            t.table_schema,
            t.table_name,
            c.constraint_name,
            c.constraint_type,
            k.column_name,
            k.ordinal_position
        FROM
            information_schema.tables t
        LEFT JOIN
            information_schema.table_constraints c on c.table_name=t.table_name AND c.table_schema=t.table_schema and (constraint_type='UNIQUE' or constraint_type='PRIMARY KEY')
        LEFT JOIN
            information_schema.key_column_usage k on k.constraint_name=c.constraint_name AND k.table_name=t.table_name and k.table_schema=t.table_schema
        ORDER BY
            t.table_schema,
            t.table_name,
            c.constraint_name,
            k.ordinal_position,
            k.column_name
    """)

    # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING
    tables = UniqueIndex(keys=["name", "schema"])
    for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]):
        c = wrap(list(c))
        best_index = Null
        is_referenced = False
        is_primary = False
        for g, w in jx.groupby(c, "constraint_name"):
            if not g.constraint_name:
                continue
            w = list(w)
            ref = False
            for r in relations:
                if (
                    r.table.name == t.table_name
                    and r.table.schema == t.table_schema
                    and r.constraint.name == g.constraint_name
                ):
                    ref = True
            is_prime = w[0].constraint_type == "PRIMARY"

            reasons_this_one_is_better = [
                best_index == None,  # WE DO NOT HAVE A CANDIDATE YET
                is_prime and not is_primary,  # PRIMARY KEYS ARE GOOD TO HAVE
                is_primary == is_prime and ref and not is_referenced,  # REFERENCED UNIQUE TUPLES ARE GOOD TOO
                is_primary == is_prime and ref == is_referenced and len(w) < len(best_index),  # THE SHORTER THE TUPLE, THE BETTER
            ]
            if any(reasons_this_one_is_better):
                is_primary = is_prime
                is_referenced = ref
                best_index = w

        tables.add({
            "name": t.table_name,
            "schema": t.table_schema,
            "id": [b.column_name for b in best_index],
        })

    fact_table = tables[self.settings.fact_table, self.settings.database.schema]
    ids_table = {
        "alias": "t0",
        "name": "__ids__",
        "schema": fact_table.schema,
        "id": fact_table.id,
    }
    relations.extend(
        wrap({
            "constraint": {"name": "__link_ids_to_fact_table__"},
            "table": ids_table,
            "column": {"name": c},
            "referenced": {"table": fact_table, "column": {"name": c}},
            "ordinal_position": i,
        })
        for i, c in enumerate(fact_table.id)
    )
    tables.add(ids_table)

    # GET ALL COLUMNS
    raw_columns = self.db.query("""
        SELECT
            column_name,
            table_schema,
            table_name,
            ordinal_position,
            data_type
        FROM
            information_schema.columns
    """)

    reference_only_tables = [
        r.split(".")[0]
        for r in self.settings.reference_only
        if len(r.split(".")) == 2
    ]
    reference_all_tables = [
        r.split(".")[0]
        for r in self.settings.reference_only
        if len(r.split(".")) == 1
    ]
    foreign_column_table_schema_triples = {
        (r.column.name, r.table.name, r.table.schema) for r in relations
    }
    referenced_column_table_schema_triples = {
        (r.referenced.column.name, r.referenced.table.name, r.referenced.table.schema)
        for r in relations
    }
    related_column_table_schema_triples = (
        foreign_column_table_schema_triples | referenced_column_table_schema_triples
    )

    columns = UniqueIndex(["column.name", "table.name", "table.schema"])
    for c in raw_columns:
        if c.table_name in reference_only_tables:
            if c.table_name + "." + c.column_name in self.settings.reference_only:
                include = True
                reference = True
                foreign = False
            elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            else:
                include = False
                reference = False
                foreign = False
        elif c.table_name in reference_all_tables:
            # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED
            if c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = True
                foreign = False
            elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
                include = False
                reference = False
                foreign = True
            else:
                include = True
                reference = False
                foreign = False
        elif c.column_name in tables[(c.table_name, c.table_schema)].id:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
            include = False
            reference = False
            foreign = True
        elif (c.column_name, c.table_name, c.table_schema) in referenced_column_table_schema_triples:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        else:
            include = True
            reference = False
            foreign = False

        rel = {
            "column": {"name": c.column_name, "type": c.data_type},
            "table": {"name": c.table_name, "schema": c.table_schema},
            "ordinal_position": c.ordinal_position,
            "is_id": c.column_name in tables[(c.table_name, c.table_schema)].id,
            "include": include,  # TRUE IF THIS COLUMN IS OUTPUTTED
            "reference": reference,  # TRUE IF THIS COLUMN REPRESENTS THE ROW
            "foreign": foreign,  # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW
        }
        columns.add(rel)

    # ITERATE OVER ALL PATHS
    todo = FlatList()
    output_columns = FlatList()
    nested_path_to_join = {}
    all_nested_paths = [["."]]

    def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
        if position.name in self.settings.exclude:
            return
        if self.path_not_allowed(path):
            return
        if DEBUG:
            Log.note("Trace {{path}}", path=path)
        if position.name != "__ids__":
            # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR IF IT FAILS)
            self.db.query(
                ConcatSQL(
                    SQL_SELECT,
                    SQL_STAR,
                    SQL_FROM,
                    quote_column(position.schema, position.name),
                    SQL_LIMIT,
                    SQL_ONE,
                )
            )

        if position.name in reference_all_tables:
            no_nested_docs = True
        if position.name in reference_only_tables:
            return

        curr_join_list = copy(nested_path_to_join[nested_path[0]])

        ###############################################################################
        # INNER OBJECTS
        ###############################################################################
        referenced_tables = list(
            sort_using_key(
                jx.groupby(
                    jx.filter(
                        relations,
                        {"eq": {"table.name": position.name, "table.schema": position.schema}},
                    ),
                    "constraint.name",
                ),
                key=lambda p: first(p[1]).column.name,
            )
        )
        for g, constraint_columns in referenced_tables:
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude):
                continue

            done_relations.add(g["constraint.name"])

            many_to_one_joins = nested_path_to_join[nested_path[0]]
            index = len(many_to_one_joins)
            alias = "t" + text(index)
            for c in constraint_columns:
                c.referenced.table.alias = alias
                c.table = position
            many_to_one_joins.append({
                "join_columns": constraint_columns,
                "path": path,
                "nested_path": nested_path,
            })

            # HANDLE THE COMMON *id SUFFIX
            name = []
            for cname, tname in zip(
                constraint_columns.column.name,
                constraint_columns.referenced.table.name,
            ):
                if cname.startswith(tname):
                    name.append(tname)
                elif cname.endswith("_id"):
                    name.append(cname[:-3])
                else:
                    name.append(cname)

            relation_string = many_to_one_string(constraint_columns[0])
            step = "/".join(name)
            if len(constraint_columns) == 1:
                step = self.name_relations.get(relation_string, step)

            referenced_column_path = concat_field(path, step)
            if self.path_not_allowed(referenced_column_path):
                continue

            if referenced_column_path in reference_only_tables:
                continue

            col_pointer_name = relative_field(referenced_column_path, nested_path[0])
            for col in columns:
                if (
                    col.table.name == constraint_columns[0].referenced.table.name
                    and col.table.schema == constraint_columns[0].referenced.table.schema
                ):
                    col_full_name = concat_field(col_pointer_name, literal_field(col.column.name))

                    if (
                        col.is_id
                        and len(nested_path) == 1
                        and col.table.name == fact_table.name
                        and col.table.schema == fact_table.schema
                    ):
                        # ALWAYS SHOW THE ID OF THE FACT
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": True,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name,
                        })
                    elif col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None,
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None,
                        })
                    elif col.reference:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_pointer_name
                            if not self.settings.show_foreign_keys
                            else col_full_name,  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                        })
                    elif col.include:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name,
                        })

            if position.name in reference_only_tables:
                continue

            todo.append(
                Data(
                    position=copy(constraint_columns[0].referenced.table),
                    path=referenced_column_path,
                    nested_path=nested_path,
                    done_relations=copy(done_relations),
                    no_nested_docs=no_nested_docs,
                )
            )

        ###############################################################################
        # NESTED OBJECTS
        ###############################################################################
        if not no_nested_docs:
            nesting_tables = list(
                sort_using_key(
                    jx.groupby(
                        jx.filter(
                            relations,
                            {"eq": {"referenced.table.name": position.name, "referenced.table.schema": position.schema}},
                        ),
                        "constraint.name",
                    ),
                    key=lambda p: [(r.table.name, r.column.name) for r in [first(p[1])]][0],
                )
            )

            for g, constraint_columns in nesting_tables:
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                done_relations.add(g["constraint.name"])

                many_table = set(constraint_columns.table.name)
                if not (many_table - self.settings.exclude):
                    continue

                relation_string = one_to_many_string(constraint_columns[0])
                step = "/".join(many_table)
                if len(constraint_columns) == 1:
                    step = self.name_relations.get(relation_string, step)

                referenced_column_path = concat_field(path, step)
                if self.path_not_allowed(referenced_column_path):
                    continue

                new_nested_path = [referenced_column_path] + nested_path
                all_nested_paths.append(new_nested_path)

                if referenced_column_path in nested_path_to_join:
                    Log.error(
                        "{{path}} already exists, try adding entry to name_relations",
                        path=referenced_column_path,
                    )
                one_to_many_joins = nested_path_to_join[referenced_column_path] = copy(curr_join_list)
                index = len(one_to_many_joins)
                alias = "t" + text(index)
                for c in constraint_columns:
                    c.table.alias = alias
                    c.referenced.table = position
                one_to_many_joins.append(
                    set_default(
                        {},
                        g,
                        {
                            "children": True,
                            "join_columns": constraint_columns,
                            "path": path,
                            "nested_path": nested_path,
                        },
                    )
                )

                for col in columns:
                    if (
                        col.table.name == constraint_columns[0].table.name
                        and col.table.schema == constraint_columns[0].table.schema
                    ):
                        col_full_name = join_field(
                            split_field(referenced_column_path)[len(split_field(new_nested_path[0])):]
                            + [literal_field(col.column.name)]
                        )

                        if col.column.name == constraint_columns[0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None,
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None,
                            })
                        else:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if col.include else None,
                            })

                todo.append(
                    Data(
                        position=constraint_columns[0].table,
                        path=referenced_column_path,
                        nested_path=new_nested_path,
                        done_relations=copy(done_relations),
                        no_nested_docs=no_nested_docs,
                    )
                )

    path = "."
    nested_path = [path]
    nested_path_to_join["."] = [{
        "path": path,
        "join_columns": [{"referenced": {"table": ids_table}}],
        "nested_path": nested_path,
    }]

    todo.append(
        Data(
            position=ids_table,
            path=path,
            nested_path=nested_path,
            done_relations=set(),
            no_nested_docs=False,
        )
    )

    while todo:
        item = todo.pop(0)
        follow_paths(**item)

    self.all_nested_paths = all_nested_paths
    self.nested_path_to_join = nested_path_to_join
    self.columns = output_columns

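# The "common *id suffix" rule from follow_paths, isolated: a foreign-key
# column named after its target table keeps the table name, a trailing
# "_id" is dropped, and anything else is used as-is.
def relation_step(column_name, referenced_table_name):
    if column_name.startswith(referenced_table_name):
        return referenced_table_name
    if column_name.endswith("_id"):
        return column_name[:-3]
    return column_name

assert relation_step("person_id", "person") == "person"
assert relation_step("owner_id", "person") == "owner"
assert relation_step("code", "person") == "code"
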
def groupby(self, keys):
    with self.locker:
        self._update_meta()
        return jx.groupby(self.__iter__(), keys)

def process(sig_id, show=False, show_limit=MAX_POINTS, show_old=True, show_distribution=None):
    if not mo_math.is_integer(sig_id):
        Log.error("expecting integer id")
    sig = first(get_signature(config.database, sig_id))
    data = get_dataum(config.database, sig_id)

    min_date = (Date.today() - 3 * MONTH).unix
    pushes = jx.sort(
        [
            {
                "value": median(rows.value),
                "runs": rows,
                "push": {"time": unwrap(t)["push.time"]},
            }
            for t, rows in jx.groupby(data, "push.time")
            if t["push\\.time"] > min_date
        ],
        "push.time",
    )

    values = pushes.value
    title = "-".join(
        map(
            text,
            [
                sig.id,
                sig.framework,
                sig.suite,
                sig.test,
                sig.platform,
                sig.repository.name,
            ],
        )
    )
    Log.note("With {{title}}", title=title)

    with Timer("find segments"):
        new_segments, new_diffs = find_segments(values, sig.alert_change_type, sig.alert_threshold)

    # USE PERFHERDER ALERTS TO IDENTIFY OLD SEGMENTS
    old_segments = tuple(
        sorted(
            set(
                [i for i, p in enumerate(pushes) if any(r.alert.id for r in p.runs)]
                + [0, len(pushes)]
            )
        )
    )
    old_medians = [0] + [
        np.median(values[s:e]) for s, e in zip(old_segments[:-1], old_segments[1:])
    ]
    old_diffs = np.array(
        [b / a - 1 for a, b in zip(old_medians[:-1], old_medians[1:])] + [0]
    )

    if len(new_segments) == 1:
        dev_status = None
        dev_score = None
        relative_noise = None
    else:
        # MEASURE DEVIANCE (USE THE LAST SEGMENT)
        s, e = new_segments[-2], new_segments[-1]
        last_segment = np.array(values[s:e])
        trimmed_segment = last_segment[np.argsort(last_segment)[IGNORE_TOP:-IGNORE_TOP]]
        dev_status, dev_score = deviance(trimmed_segment)
        relative_noise = np.std(trimmed_segment) / np.mean(trimmed_segment)
        Log.note(
            "\n\tdeviance = {{deviance}}\n\tnoise={{std}}",
            title=title,
            deviance=(dev_status, dev_score),
            std=relative_noise,
        )

        if show_distribution:
            histogram(last_segment, title=dev_status + "=" + text(dev_score))

    max_extra_diff = None
    max_missing_diff = None
    _is_diff = is_diff(new_segments, old_segments)
    if _is_diff:
        # FOR MISSING POINTS, CALC BIGGEST DIFF
        max_extra_diff = mo_math.MAX(
            abs(d)
            for s, d in zip(new_segments, new_diffs)
            if all(not (s - TOLLERANCE <= o <= s + TOLLERANCE) for o in old_segments)
        )
        max_missing_diff = mo_math.MAX(
            abs(d)
            for s, d in zip(old_segments, old_diffs)
            if all(not (s - TOLLERANCE <= n <= s + TOLLERANCE) for n in new_segments)
        )
        Log.alert(
            "Disagree max_extra_diff={{max_extra_diff|round(places=3)}}, max_missing_diff={{max_missing_diff|round(places=3)}}",
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
        )
        Log.note("old={{old}}, new={{new}}", old=old_segments, new=new_segments)
        if show and len(pushes):
            show_old and assign_colors(values, old_segments, title="OLD " + title)
            assign_colors(values, new_segments, title="NEW " + title)
    else:
        Log.note("Agree")
        if show and len(pushes):
            show_old and assign_colors(values, old_segments, title="OLD " + title)
            assign_colors(values, new_segments, title="NEW " + title)

    summary_table.upsert(
        where={"eq": {"id": sig.id}},
        doc=Data(
            id=sig.id,
            title=title,
            num_pushes=len(pushes),
            is_diff=_is_diff,
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
            num_new_segments=len(new_segments),
            num_old_segments=len(old_segments),
            relative_noise=relative_noise,
            dev_status=dev_status,
            dev_score=dev_score,
            last_updated=Date.now(),
        ),
    )

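# The outlier trim used above, isolated: drop the `ignore` smallest and
# largest samples before measuring noise. The IGNORE_TOP value here is an
# assumption for illustration; the real constant is defined in the module.
import numpy as np

IGNORE_TOP = 3

def trim_extremes(segment, ignore=IGNORE_TOP):
    segment = np.asarray(segment)
    return segment[np.argsort(segment)[ignore:-ignore]]
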
def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
    if position.name in self.settings.exclude:
        return
    if DEBUG:
        Log.note("Trace {{path}}", path=path)
    if position.name != "__ids__":
        # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR IF IT FAILS)
        self.db.query("SELECT * FROM " + quote_column(position.name, position.schema) + " LIMIT 1")

    if position.name in reference_all_tables:
        no_nested_docs = True
    if position.name in reference_only_tables:
        return

    curr_join_list = copy(nested_path_to_join[nested_path[0]])

    # INNER OBJECTS
    referenced_tables = list(
        jx.groupby(
            jx.filter(
                relations,
                {"eq": {"table.name": position.name, "table.schema": position.schema}}),
            "constraint.name"))
    for g, constraint_columns in referenced_tables:
        g = unwrap(g)
        constraint_columns = deepcopy(constraint_columns)
        if g["constraint.name"] in done_relations:
            continue
        if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude):
            continue

        done_relations.add(g["constraint.name"])

        many_to_one_joins = nested_path_to_join[nested_path[0]]
        index = len(many_to_one_joins)
        alias = "t" + text_type(index)
        for c in constraint_columns:
            c.referenced.table.alias = alias
            c.table = position
        many_to_one_joins.append({
            "join_columns": constraint_columns,
            "path": path,
            "nested_path": nested_path
        })

        # referenced_table_path = join_field(split_field(path) + ["/".join(constraint_columns.referenced.table.name)])
        # HANDLE THE COMMON *id SUFFIX
        name = []
        for a, b in zip(constraint_columns.column.name, constraint_columns.referenced.table.name):
            if a.startswith(b):
                name.append(b)
            elif a.endswith("_id"):
                name.append(a[:-3])
            else:
                name.append(a)

        referenced_column_path = join_field(split_field(path) + ["/".join(name)])
        col_pointer_name = relative_field(referenced_column_path, nested_path[0])
        # insert into nested1 VALUES (100, 10, 'aaa', -1);
        # id.about.time.nested1 .ref=10
        # id.about.time.nested1 .ref.name
        for col in columns:
            if col.table.name == constraint_columns[0].referenced.table.name and col.table.schema == constraint_columns[0].referenced.table.schema:
                col_full_name = concat_field(col_pointer_name, literal_field(col.column.name))

                if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                    # ALWAYS SHOW THE ID OF THE FACT
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": True,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name
                    })
                elif col.column.name == constraint_columns[0].column.name:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name if self.settings.show_foreign_keys else None
                    })
                elif col.is_id:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name if self.settings.show_foreign_keys else None
                    })
                elif col.reference:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                    })
                elif col.include:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name
                    })

        if position.name in reference_only_tables:
            continue

        todo.append(
            Data(position=copy(constraint_columns[0].referenced.table),
                 path=referenced_column_path,
                 nested_path=nested_path,
                 done_relations=copy(done_relations),
                 no_nested_docs=no_nested_docs))

    # NESTED OBJECTS
    if not no_nested_docs:
        for g, constraint_columns in jx.groupby(
                jx.filter(
                    relations,
                    {"eq": {"referenced.table.name": position.name, "referenced.table.schema": position.schema}}),
                "constraint.name"):
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            done_relations.add(g["constraint.name"])

            many_table = set(constraint_columns.table.name)
            if not (many_table - self.settings.exclude):
                continue

            referenced_column_path = join_field(split_field(path) + ["/".join(many_table)])
            new_nested_path = [referenced_column_path] + nested_path
            all_nested_paths.append(new_nested_path)

            # if new_path not in self.settings.include:
            #     Log.note("Exclude nested path {{path}}", path=new_path)
            #     continue
            one_to_many_joins = nested_path_to_join[referenced_column_path] = copy(curr_join_list)
            index = len(one_to_many_joins)
            alias = "t" + text_type(index)
            for c in constraint_columns:
                c.table.alias = alias
                c.referenced.table = position
            one_to_many_joins.append(
                set_default({}, g, {
                    "children": True,
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path
                }))

            # insert into nested1 VALUES (100, 10, 'aaa', -1);
            # id.about.time.nested1 .ref=10
            # id.about.time.nested1 .ref.name
            for col in columns:
                if col.table.name == constraint_columns[0].table.name and col.table.schema == constraint_columns[0].table.schema:
                    col_full_name = join_field(
                        split_field(referenced_column_path)[len(split_field(new_nested_path[0])):]
                        + [literal_field(col.column.name)])

                    if col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    else:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if col.include else None
                        })

            todo.append(
                Data(position=constraint_columns[0].table,
                     path=referenced_column_path,
                     nested_path=new_nested_path,
                     done_relations=copy(done_relations),
                     no_nested_docs=no_nested_docs))

def _scan_database(self):
    # GET ALL RELATIONS
    raw_relations = self.db.query(
        """
        SELECT
            table_schema,
            table_name,
            referenced_table_schema,
            referenced_table_name,
            referenced_column_name,
            constraint_name,
            column_name,
            ordinal_position
        FROM
            information_schema.key_column_usage
        WHERE
            referenced_column_name IS NOT NULL
        """,
        param=self.settings.database
    )

    if not raw_relations:
        Log.error("No relations in the database")

    for r in self.settings.add_relations:
        try:
            a, b = map(strings.trim, r.split("->"))
            a = a.split(".")
            b = b.split(".")
            raw_relations.append(Data(
                table_schema=a[0],
                table_name=a[1],
                referenced_table_schema=b[0],
                referenced_table_name=b[1],
                referenced_column_name=b[2],
                constraint_name=Random.hex(20),
                column_name=a[2],
                ordinal_position=1
            ))
        except Exception as e:
            Log.error("Could not parse {{line|quote}}", line=r, cause=e)

    relations = jx.select(raw_relations, [
        {"name": "constraint.name", "value": "constraint_name"},
        {"name": "table.schema", "value": "table_schema"},
        {"name": "table.name", "value": "table_name"},
        {"name": "column.name", "value": "column_name"},
        {"name": "referenced.table.schema", "value": "referenced_table_schema"},
        {"name": "referenced.table.name", "value": "referenced_table_name"},
        {"name": "referenced.column.name", "value": "referenced_column_name"},
        {"name": "ordinal_position", "value": "ordinal_position"}
    ])

    # GET ALL TABLES
    raw_tables = self.db.query(
        """
        SELECT
            t.table_schema,
            t.table_name,
            c.constraint_name,
            c.constraint_type,
            k.column_name,
            k.ordinal_position
        FROM
            information_schema.tables t
        LEFT JOIN
            information_schema.table_constraints c ON c.table_name = t.table_name AND c.table_schema = t.table_schema AND (constraint_type = 'UNIQUE' OR constraint_type = 'PRIMARY KEY')
        LEFT JOIN
            information_schema.key_column_usage k ON k.constraint_name = c.constraint_name AND k.table_name = t.table_name AND k.table_schema = t.table_schema
        ORDER BY
            t.table_schema,
            t.table_name,
            c.constraint_name,
            k.ordinal_position,
            k.column_name
        """,
        param=self.settings.database
    )

    # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING
    tables = UniqueIndex(keys=["name", "schema"])
    for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]):
        c = wrap(list(c))
        best_index = Null
        is_referenced = False
        is_primary = False
        for g, w in jx.groupby(c, "constraint_name"):
            if not g.constraint_name:
                continue
            w = list(w)
            ref = False
            for r in relations:
                if r.table.name == t.table_name and r.table.schema == t.table_schema and r.constraint.name == g.constraint_name:
                    ref = True
            # constraint_type IS 'PRIMARY KEY' OR 'UNIQUE' (SEE QUERY ABOVE)
            is_prime = w[0].constraint_type == "PRIMARY KEY"
            reasons_this_one_is_better = [
                best_index == None,  # WE DO NOT HAVE A CANDIDATE YET
                is_prime and not is_primary,  # PRIMARY KEYS ARE GOOD TO HAVE
                is_primary == is_prime and ref and not is_referenced,  # REFERENCED UNIQUE TUPLES ARE GOOD TOO
                is_primary == is_prime and ref == is_referenced and len(w) < len(best_index)  # THE SHORTER THE TUPLE, THE BETTER
            ]
            if any(reasons_this_one_is_better):
                is_primary = is_prime
                is_referenced = ref
                best_index = w

        tables.add({
            "name": t.table_name,
            "schema": t.table_schema,
            "id": [b.column_name for b in best_index]
        })

    fact_table = tables[self.settings.fact_table, self.settings.database.schema]
    ids_table = {
        "alias": "t0",
        "name": "__ids__",
        "schema": fact_table.schema,
        "id": fact_table.id
    }
    relations.extend(
        wrap({
            "constraint": {"name": "__link_ids_to_fact_table__"},
            "table": ids_table,
            "column": {"name": c},
            "referenced": {"table": fact_table, "column": {"name": c}},
            "ordinal_position": i
        })
        for i, c in enumerate(fact_table.id)
    )
    tables.add(ids_table)

    # GET ALL COLUMNS
    raw_columns = self.db.query(
        """
        SELECT
            column_name,
            table_schema,
            table_name,
            ordinal_position,
            data_type
        FROM
            information_schema.columns
        """,
        param=self.settings.database
    )

    reference_only_tables = [
        r.split(".")[0]
        for r in self.settings.reference_only
        if len(r.split(".")) == 2
    ]
    reference_all_tables = [
        r.split(".")[0]
        for r in self.settings.reference_only
        if len(r.split(".")) == 1
    ]
    foreign_column_table_schema_triples = {
        (r.column.name, r.table.name, r.table.schema)
        for r in relations
    }
    referenced_column_table_schema_triples = {
        (r.referenced.column.name, r.referenced.table.name, r.referenced.table.schema)
        for r in relations
    }
    related_column_table_schema_triples = (
        foreign_column_table_schema_triples | referenced_column_table_schema_triples
    )

    columns = UniqueIndex(["column.name", "table.name", "table.schema"])
    for c in raw_columns:
        if c.table_name in reference_only_tables:
            if c.table_name + "." + c.column_name in self.settings.reference_only:
                include = True
                reference = True
                foreign = False
            elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            else:
                include = False
                reference = False
                foreign = False
        elif c.table_name in reference_all_tables:
            # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED
            if c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = True
                foreign = False
            elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
                include = False
                reference = False
                foreign = True
            else:
                include = True
                reference = False
                foreign = False
        elif c.column_name in tables[(c.table_name, c.table_schema)].id:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
            include = False
            reference = False
            foreign = True
        elif (c.column_name, c.table_name, c.table_schema) in referenced_column_table_schema_triples:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        else:
            include = True
            reference = False
            foreign = False

        rel = {
            "column": {"name": c.column_name, "type": c.data_type},
            "table": {"name": c.table_name, "schema": c.table_schema},
            "ordinal_position": c.ordinal_position,
            "is_id": c.column_name in tables[(c.table_name, c.table_schema)].id,
            "include": include,      # TRUE IF THIS COLUMN IS OUTPUT
            "reference": reference,  # TRUE IF THIS COLUMN REPRESENTS THE ROW
            "foreign": foreign       # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW
        }
        columns.add(rel)

    # ITERATE OVER ALL PATHS
    todo = FlatList()
    output_columns = FlatList()
    nested_path_to_join = {}
    all_nested_paths = [["."]]

    def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
        if position.name in self.settings.exclude:
            return
        if DEBUG:
            Log.note("Trace {{path}}", path=path)
        if position.name != "__ids__":
            # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR IF IT FAILS)
            self.db.query("SELECT * FROM " + quote_column(position.name, position.schema) + " LIMIT 1")

        if position.name in reference_all_tables:
            no_nested_docs = True
        if position.name in reference_only_tables:
            return

        curr_join_list = copy(nested_path_to_join[nested_path[0]])

        # INNER OBJECTS
        referenced_tables = list(jx.groupby(
            jx.filter(
                relations,
                {"eq": {"table.name": position.name, "table.schema": position.schema}}
            ),
            "constraint.name"
        ))
        for g, constraint_columns in referenced_tables:
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude):
                continue

            done_relations.add(g["constraint.name"])

            many_to_one_joins = nested_path_to_join[nested_path[0]]
            index = len(many_to_one_joins)
            alias = "t" + text_type(index)
            for c in constraint_columns:
                c.referenced.table.alias = alias
                c.table = position
            many_to_one_joins.append({
                "join_columns": constraint_columns,
                "path": path,
                "nested_path": nested_path
            })

            # referenced_table_path = join_field(split_field(path) + ["/".join(constraint_columns.referenced.table.name)])
            # HANDLE THE COMMON *id SUFFIX
            name = []
            for a, b in zip(constraint_columns.column.name, constraint_columns.referenced.table.name):
                if a.startswith(b):
                    name.append(b)
                elif a.endswith("_id"):
                    name.append(a[:-3])
                else:
                    name.append(a)
            referenced_column_path = join_field(split_field(path) + ["/".join(name)])
            col_pointer_name = relative_field(referenced_column_path, nested_path[0])

            # insert into nested1 VALUES (100, 10, 'aaa', -1);
            # id.about.time.nested1 .ref=10
            # id.about.time.nested1 .ref.name
            for col in columns:
                if col.table.name == constraint_columns[0].referenced.table.name and col.table.schema == constraint_columns[0].referenced.table.schema:
                    col_full_name = concat_field(col_pointer_name, literal_field(col.column.name))

                    if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                        # ALWAYS SHOW THE ID OF THE FACT
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": True,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name
                        })
                    elif col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    elif col.reference:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                        })
                    elif col.include:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name
                        })

            if position.name in reference_only_tables:
                continue

            todo.append(Data(
                position=copy(constraint_columns[0].referenced.table),
                path=referenced_column_path,
                nested_path=nested_path,
                done_relations=copy(done_relations),
                no_nested_docs=no_nested_docs
            ))

        # NESTED OBJECTS
        if not no_nested_docs:
            for g, constraint_columns in jx.groupby(
                jx.filter(
                    relations,
                    {"eq": {"referenced.table.name": position.name, "referenced.table.schema": position.schema}}
                ),
                "constraint.name"
            ):
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                done_relations.add(g["constraint.name"])

                many_table = set(constraint_columns.table.name)
                if not (many_table - self.settings.exclude):
                    continue

                referenced_column_path = join_field(split_field(path) + ["/".join(many_table)])
                new_nested_path = [referenced_column_path] + nested_path
                all_nested_paths.append(new_nested_path)

                # if new_path not in self.settings.include:
                #     Log.note("Exclude nested path {{path}}", path=new_path)
                #     continue
                one_to_many_joins = nested_path_to_join[referenced_column_path] = copy(curr_join_list)
                index = len(one_to_many_joins)
                alias = "t" + text_type(index)
                for c in constraint_columns:
                    c.table.alias = alias
                    c.referenced.table = position
                one_to_many_joins.append(set_default({}, g, {
                    "children": True,
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path
                }))

                # insert into nested1 VALUES (100, 10, 'aaa', -1);
                # id.about.time.nested1 .ref=10
                # id.about.time.nested1 .ref.name
                for col in columns:
                    if col.table.name == constraint_columns[0].table.name and col.table.schema == constraint_columns[0].table.schema:
                        col_full_name = join_field(
                            split_field(referenced_column_path)[len(split_field(new_nested_path[0])):]
                            + [literal_field(col.column.name)]
                        )

                        if col.column.name == constraint_columns[0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None
                            })
                        else:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if col.include else None
                            })

                todo.append(Data(
                    position=constraint_columns[0].table,
                    path=referenced_column_path,
                    nested_path=new_nested_path,
                    done_relations=copy(done_relations),
                    no_nested_docs=no_nested_docs
                ))

    path = "."
    nested_path = [path]
    nested_path_to_join["."] = [{
        "path": path,
        "join_columns": [{"referenced": {"table": ids_table}}],
        "nested_path": nested_path
    }]
    todo.append(Data(
        position=ids_table,
        path=path,
        nested_path=nested_path,
        done_relations=set(),
        no_nested_docs=False
    ))

    while todo:
        item = todo.pop(0)
        follow_paths(**item)

    self.all_nested_paths = all_nested_paths
    self.nested_path_to_join = nested_path_to_join
    self.columns = output_columns
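# A minimal, standalone sketch of the "*id suffix" rule follow_paths uses above
# to pick the property name a single foreign-key column contributes to the
# document path. The column and table names here are hypothetical, for
# illustration only.
def derive_name(column_name, referenced_table_name):
    if column_name.startswith(referenced_table_name):
        return referenced_table_name  # "person_id" REFERENCING "person" -> "person"
    if column_name.endswith("_id"):
        return column_name[:-3]       # "owner_id" REFERENCING "person" -> "owner"
    return column_name                # ANY OTHER NAME IS KEPT AS-IS


assert derive_name("person_id", "person") == "person"
assert derive_name("owner_id", "person") == "owner"
assert derive_name("manager", "person") == "manager"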
def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if column.es_index in self.index_does_not_exist:
        return

    if column.type in STRUCT:
        Log.error("not supported")
    try:
        if column.es_index == "meta.columns":
            with self.meta.columns.locker:
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
            return
        if column.es_index == "meta.tables":
            with self.meta.columns.locker:
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
            return

        es_index = column.es_index.split(".")[0]

        is_text = [
            cc
            for cc in self.abs_columns
            if cc.es_column == column.es_column and cc.type == "text"
        ]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {"count": {"filter": {"match_all": {}}}},
                "size": 0
            })
            count = result.hits.total
            cardinality = 1001
            multi = 1001
        elif column.es_column == "_id":
            result = self.default_es.post("/" + es_index + "/_search", data={
                "query": {"match_all": {}},
                "size": 0
            })
            count = cardinality = result.hits.total
            multi = 1
        else:
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {
                    "count": _counting_query(column),
                    "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                },
                "size": 0
            })
            r = result.aggregations.count
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, r.doc_count)
            multi = coalesce(r.multi.value, 1)

        if cardinality == None:
            Log.error("logic error")

        query = Data(size=0)

        if column.es_column == "_id":
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            if DEBUG:
                Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
            return
        elif column.type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            if DEBUG:
                Log.note("{{field}} has {{num}} parts", field=column.es_index, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
            return
        elif len(column.nested_path) != 1:
            query.aggs["_"] = {
                "nested": {"path": column.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": column.es_column}}}
            }
        elif cardinality == 0:
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

        result = self.default_es.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        if DEBUG:
            Log.note("{{field}} has {{parts}}", field=column.names["."], parts=parts)
        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULES SET UP LOGGING
        # from tests.test_jx import TEST_TABLE
        TEST_TABLE = "testdata"
        is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = any(column.es_index.startswith(t) for t in [TEST_TABLE_PREFIX, TEST_TABLE])
        if is_missing_index and is_test_table:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
            self.index_does_not_exist.add(column.es_index)
        else:
            self.meta.columns.update({
                "set": {"last_updated": Date.now()},
                "clear": ["count", "cardinality", "multi", "partitions"],
                "where": {"eq": {"names.\\.": ".", "es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
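# A minimal sketch (not the method above) of the thresholds _update_cardinality
# uses to decide whether a column's partitions are worth fetching and storing.
# The function and parameter names are hypothetical, for illustration only.
def should_store_partitions(count, cardinality, is_numeric, is_id):
    if is_id:
        return False  # _id IS UNIQUE BY DEFINITION
    if cardinality > 1000:
        return False  # TOO MANY PARTS TO BE USEFUL
    if count >= 30 and cardinality == count:
        return False  # EVERY VALUE IS DISTINCT
    if count >= 1000 and cardinality / count > 0.99:
        return False  # EFFECTIVELY UNIQUE
    if is_numeric and cardinality > 30:
        return False  # CONTINUOUS VALUES, NOT CATEGORICAL
    return True


assert should_store_partitions(count=500, cardinality=12, is_numeric=False, is_id=False)
assert not should_store_partitions(count=50, cardinality=50, is_numeric=False, is_id=False)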