Example #1
    def test_date_literal(self):
        expr = {"date": {"literal": "today-month"}}

        from jx_python.expression_compiler import compile_expression
        result = compile_expression(Python[jx_expression(expr).partial_eval()].to_python())(None)
        expected = (Date.today()-MONTH).unix
        self.assertEqual(result, expected)
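
For readers unfamiliar with the mo-times package this test exercises, the literal "today-month" resolves to midnight (UTC) at the start of today, minus one month. A minimal standalone sketch of the same arithmetic; the import paths are my assumption about the package layout:

from mo_times.dates import Date        # assumed module path
from mo_times.durations import MONTH   # assumed module path

# Midnight at the start of today, minus one calendar month,
# expressed as a unix timestamp (float seconds).
expected = (Date.today() - MONTH).unix
print(expected)
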
Example #2
def update_local_database(config, deviant_summary, candidates, since):
    if isinstance(deviant_summary, bigquery.Table):
        Log.note("Only the ETL process should fill the bigquery table")
        return

    # GET EVERYTHING WE HAVE SO FAR
    exists = deviant_summary.query({
        "select": ["signature_hash", "last_updated"],
        "where": {
            "and": [
                {
                    "in": {
                        "signature_hash": candidates.signature_hash
                    }
                },
                {
                    "exists": "num_pushes"
                },
            ]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data
    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.signature_hash) - set(exists.signature_hash))

    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e.signature_hash for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update",
              num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.display.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series",
              num=len(limited_update))

    with Timer("Updating local database"):

        def loop(please_stop):
            while not please_stop:
                signature_hash = limited_update.pop_one()
                if not signature_hash:
                    return
                process(
                    signature_hash,
                    since,
                    source=config.database,
                    deviant_summary=deviant_summary,
                )

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
Example #3
    def last_deploy(self):
        setup_file = File.new_instance(self.directory, 'setup.py')
        if not setup_file.exists:
            Log.note("Not a pypi project: {{dir}}", dir=self.directory)
            return Date.today()
        setup = setup_file.read()
        version = json2value(strings.between(setup, "version=", ",")).split(".")[-1]
        date = unicode2Date(version, format="%y%j")
        Log.note("PyPi last deployed {{date|datetime}}",
                 date=date,
                 dir=self.directory)
        return date
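
The version suffix is decoded with the strftime code "%y%j": a two-digit year followed by a three-digit day-of-year. unicode2Date is mo-times' parser; the same round trip with the standard library:

from datetime import datetime

stamp = datetime(2018, 2, 14).strftime("%y%j")   # '18045': year 18, day 45
print(stamp)
print(datetime.strptime(stamp, "%y%j"))          # 2018-02-14 00:00:00
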
Example #4
def update_local_database():
    # GET EVERYTHING WE HAVE SO FAR
    exists = summary_table.query({
        "select": ["id", "last_updated"],
        "where": {
            "and": [{
                "in": {
                    "id": candidates.id
                }
            }, {
                "exists": "num_pushes"
            }]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data
    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.id) - set(exists.id))

    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e.id for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update",
              num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.analysis.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series",
              num=len(limited_update))

    with Timer("Updating local database"):

        def loop(please_stop):
            while not please_stop:
                sig_id = limited_update.pop_one()
                if not sig_id:
                    return
                process(sig_id)

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
Example #5
def main():
    since = Date.today() - Duration(SCATTER_RANGE)

    if config.database.host not in listwrap(
            config.analysis.expected_database_host):
        Log.error("Expecting database to be one of {{expected}}",
                  expected=config.analysis.expected_database_host)
    if not config.analysis.interesting:
        Log.alert(
            "Expecting config file to have `analysis.interesting` with a json expression. All series will be included."
        )

    # SETUP DESTINATION
    deviant_summary = bigquery.Dataset(
        config.deviant_summary).get_or_create_table(
            read_only=True, kwargs=config.deviant_summary)

    if config.args.id:
        # EXIT EARLY AFTER WE GOT THE SPECIFIC IDS
        if len(config.args.id) < 4:
            step_detector.SHOW_CHARTS = True
        for signature_hash in config.args.id:
            process(
                signature_hash,
                since=since,
                source=config.database,
                deviant_summary=deviant_summary,
                show=True,
            )
        return

    # DOWNLOAD
    if config.args.download:
        # GET INTERESTING SERIES
        where_clause = BQLang[jx_expression(
            config.analysis.interesting)].to_bq(deviant_summary.schema)

        # GET ALL KNOWN SERIES
        docs = list(
            deviant_summary.sql_query(f"""
                SELECT * EXCEPT (_rank, values) 
                FROM (
                  SELECT 
                    *, 
                    row_number() over (partition by id order by last_updated desc) as _rank 
                  FROM  
                    {quote_column(deviant_summary.full_name)}
                  ) a 
                WHERE _rank=1 and {sql_iso(where_clause)}
                LIMIT {quote_value(DOWNLOAD_LIMIT)}
            """))
        if len(docs) == DOWNLOAD_LIMIT:
            Log.warning("Not all signatures downloaded")
        File(config.args.download).write(list2tab(docs, separator=","))

    # DEVIANT
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": {
                "abs": "overall_dev_score"
            },
            "sort": "desc"
        },
        limit=config.args.deviant,
        show_old=False,
        show_distribution=True,
    )

    # MODAL
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort="overall_dev_score",
        limit=config.args.modal,
        where={"eq": {
            "overall_dev_status": "MODAL"
        }},
        show_distribution=True,
    )

    # OUTLIERS
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": "overall_dev_score",
            "sort": "desc"
        },
        limit=config.args.outliers,
        where={"eq": {
            "overall_dev_status": "OUTLIERS"
        }},
        show_distribution=True,
    )

    # SKEWED
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": {
                "abs": "overall_dev_score"
            },
            "sort": "desc"
        },
        limit=config.args.skewed,
        where={"eq": {
            "overall_dev_status": "SKEWED"
        }},
        show_distribution=True,
    )

    # OK
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": {
                "abs": "overall_dev_score"
            },
            "sort": "desc"
        },
        limit=config.args.ok,
        where={"eq": {
            "overall_dev_status": "OK"
        }},
        show_distribution=True,
    )

    # NOISE
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": {
                "abs": "relative_noise"
            },
            "sort": "desc"
        },
        where={"gte": {
            "num_pushes": 30
        }},
        limit=config.args.noise,
    )

    # EXTRA
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": {
                "abs": "max_extra_diff"
            },
            "sort": "desc"
        },
        where={"lte": {
            "num_new_segments": 7
        }},
        limit=config.args.extra,
    )

    # MISSING
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": {
                "abs": "max_missing_diff"
            },
            "sort": "desc"
        },
        where={"lte": {
            "num_old_segments": 6
        }},
        limit=config.args.missing,
    )

    # PATHOLOGICAL
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={
            "value": "num_segments",
            "sort": "desc"
        },
        limit=config.args.pathological,
    )
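
The download step deduplicates the summary table with a window function: row_number() partitioned by id, keeping only _rank=1 (the row with the newest last_updated). A self-contained sketch of that pattern against sqlite3, which supports window functions since SQLite 3.25 (the table and values here are made up for illustration):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE summary (id INTEGER, last_updated INTEGER, score REAL);
    INSERT INTO summary VALUES (1, 100, 0.5), (1, 200, 0.9), (2, 150, 0.1);
""")
# Keep only the newest row per id, as the BigQuery query above does.
rows = conn.execute("""
    SELECT id, last_updated, score
    FROM (
        SELECT *,
               row_number() OVER (PARTITION BY id ORDER BY last_updated DESC) AS _rank
        FROM summary
    )
    WHERE _rank = 1
""").fetchall()
print(rows)  # e.g. [(1, 200, 0.9), (2, 150, 0.1)]
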
Example #6
def process(sig_id,
            show=False,
            show_limit=MAX_POINTS,
            show_old=True,
            show_distribution=None):
    if not mo_math.is_integer(sig_id):
        Log.error("expecting integer id")
    sig = first(get_signature(config.database, sig_id))
    data = get_dataum(config.database, sig_id)

    min_date = (Date.today() - 3 * MONTH).unix
    pushes = jx.sort(
        [{
            "value": median(rows.value),
            "runs": rows,
            "push": {
                "time": unwrap(t)["push.time"]
            },
        } for t, rows in jx.groupby(data, "push.time")
         if t["push\\.time"] > min_date],
        "push.time",
    )

    values = pushes.value
    title = "-".join(
        map(
            text,
            [
                sig.id,
                sig.framework,
                sig.suite,
                sig.test,
                sig.platform,
                sig.repository.name,
            ],
        ))
    Log.note("With {{title}}", title=title)

    with Timer("find segments"):
        new_segments, new_diffs = find_segments(values, sig.alert_change_type,
                                                sig.alert_threshold)

    # USE PERFHERDER ALERTS TO IDENTIFY OLD SEGMENTS
    old_segments = tuple(
        sorted(
            set([
                i for i, p in enumerate(pushes) if any(r.alert.id
                                                       for r in p.runs)
            ] + [0, len(pushes)])))
    old_medians = [0] + [
        np.median(values[s:e])
        for s, e in zip(old_segments[:-1], old_segments[1:])
    ]
    old_diffs = np.array(
        [b / a - 1 for a, b in zip(old_medians[:-1], old_medians[1:])] + [0])

    if len(new_segments) == 1:
        dev_status = None
        dev_score = None
        relative_noise = None
    else:
        # MEASURE DEVIANCE (USE THE LAST SEGMENT)
        s, e = new_segments[-2], new_segments[-1]
        last_segment = np.array(values[s:e])
        trimmed_segment = last_segment[np.argsort(last_segment)
                                       [IGNORE_TOP:-IGNORE_TOP]]
        dev_status, dev_score = deviance(trimmed_segment)
        relative_noise = np.std(trimmed_segment) / np.mean(trimmed_segment)
        Log.note(
            "\n\tdeviance = {{deviance}}\n\tnoise={{std}}",
            title=title,
            deviance=(dev_status, dev_score),
            std=relative_noise,
        )

        if show_distribution:
            histogram(last_segment, title=dev_status + "=" + text(dev_score))

    max_extra_diff = None
    max_missing_diff = None
    _is_diff = is_diff(new_segments, old_segments)
    if _is_diff:
        # FOR EXTRA AND MISSING POINTS, CALC THE BIGGEST DIFFS
        max_extra_diff = mo_math.MAX(
            abs(d) for s, d in zip(new_segments, new_diffs)
            if all(not (s - TOLLERANCE <= o <= s + TOLLERANCE)
                   for o in old_segments))
        max_missing_diff = mo_math.MAX(
            abs(d) for s, d in zip(old_segments, old_diffs)
            if all(not (s - TOLLERANCE <= n <= s + TOLLERANCE)
                   for n in new_segments))

        Log.alert(
            "Disagree max_extra_diff={{max_extra_diff|round(places=3)}}, max_missing_diff={{max_missing_diff|round(places=3)}}",
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
        )
        Log.note("old={{old}}, new={{new}}",
                 old=old_segments,
                 new=new_segments)
        if show and len(pushes):
            show_old and assign_colors(
                values, old_segments, title="OLD " + title)
            assign_colors(values, new_segments, title="NEW " + title)
    else:
        Log.note("Agree")
        if show and len(pushes):
            show_old and assign_colors(
                values, old_segments, title="OLD " + title)
            assign_colors(values, new_segments, title="NEW " + title)

    summary_table.upsert(
        where={"eq": {
            "id": sig.id
        }},
        doc=Data(
            id=sig.id,
            title=title,
            num_pushes=len(pushes),
            is_diff=_is_diff,
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
            num_new_segments=len(new_segments),
            num_old_segments=len(old_segments),
            relative_noise=relative_noise,
            dev_status=dev_status,
            dev_score=dev_score,
            last_updated=Date.now(),
        ),
    )
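
The deviance measurement above reduces each segment to summary statistics. A minimal sketch of the two core computations, per-segment medians and relative noise, with plain numpy (the data here is invented):

import numpy as np

values = np.array([10.0, 11, 10, 30, 29, 31, 30])
segments = (0, 3, 7)  # boundaries: [0:3) is one segment, [3:7) the next

# Median of each segment, as in old_medians above.
medians = [np.median(values[s:e]) for s, e in zip(segments[:-1], segments[1:])]
print(medians)  # [10.0, 30.0]

# Relative noise of the last segment: std / mean (coefficient of variation).
last = values[segments[-2]:segments[-1]]
print(np.std(last) / np.mean(last))
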
Example #7
    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        now = Date.now()
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [
                cc for cc in self.meta.columns
                if cc.es_column == column.es_column and cc.es_type == "text"
            ]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count": {
                                                          "filter": {
                                                              "match_all": {}
                                                          }
                                                      }
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "query": {
                                                      "match_all": {}
                                                  },
                                                  "size": 0
                                              })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count":
                                                      _counting_query(column)
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = 2

                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": [False, True],
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            else:
                es_query = {
                    "aggs": {
                        "count": _counting_query(column),
                        "_filter": {
                            "aggs": {
                                "multi": {
                                    "max": {
                                        "script":
                                        "doc[" + quote(column.es_column) +
                                        "].values.size()"
                                    }
                                }
                            },
                            "filter": {
                                "bool": {
                                    "should": [{
                                        "range": {
                                            "etl.timestamp.~n~": {
                                                "gte": (Date.today() - WEEK)
                                            }
                                        }
                                    }, {
                                        "bool": {
                                            "must_not": {
                                                "exists": {
                                                    "field":
                                                    "etl.timestamp.~n~"
                                                }
                                            }
                                        }
                                    }]
                                }
                            }
                        }
                    },
                    "size": 0
                }

                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data=es_query)
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value,
                                       agg_results.count._nested.value,
                                       agg_results.count.doc_count)
                multi = int(coalesce(agg_results._filter.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count
                                        ) or (count >= 1000
                                              and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {
                        "path": column.nested_path[0]
                    },
                    "aggs": {
                        "_nested": {
                            "terms": {
                                "field": column.es_column
                            }
                        }
                    }
                }
            elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {
                    "terms": {
                        "field": column.es_column,
                        "size": cardinality
                    }
                }

            result = self.es_cluster.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            DEBUG and Log.note(
                "update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}",
                id=id(column),
                column=column,
                time=now)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": now
                },
                "where": {
                    "eq": {
                        "es_index": column.es_index,
                        "es_column": column.es_column
                    }
                }
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(
                w in e for w in
                ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith(
                (TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                Log.warning("Missing index {{col.es_index}}",
                            col=column,
                            cause=e)
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index
                        }
                    }
                })
                self.index_does_not_exist.add(column.es_index)
            elif "No field found for" in e:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get column {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": now
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)
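
The long elif chain encodes a heuristic: columns that look like unique keys or continuous numeric values get no partition list. A distilled sketch of that decision rule as a pure function; the thresholds are copied from the code, the function name is mine:

def should_skip_partitions(count, cardinality, is_numeric=False):
    # Too many distinct values to enumerate.
    if cardinality > 1000:
        return True
    # Every value distinct: looks like a unique id.
    if count >= 30 and cardinality == count:
        return True
    # Almost every value distinct.
    if count >= 1000 and cardinality / count > 0.99:
        return True
    # Numeric columns with many distinct values: treat as continuous.
    if is_numeric and cardinality > 30:
        return True
    return False

print(should_skip_partitions(count=50, cardinality=50))   # True
print(should_skip_partitions(count=100, cardinality=5))   # False
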
Example #8
    def _get_spot_prices_from_aws(self):
        with Timer("Read no capacity file"):
            try:
                # FILE IS LIST OF {instance_type, last_failure} OBJECTS
                content = self.no_capacity_file.read()
                self.no_capacity = dict(
                    (r.instance_type, r.last_failure)
                    for r in convert.json2value(
                        content, flexible=False, leaves=False))
            except Exception as e:
                self.no_capacity = {}

        with Timer("Read pricing file"):
            try:
                content = File(self.settings.price_file).read()
                cache = convert.json2value(content,
                                           flexible=False,
                                           leaves=False)
            except Exception as e:
                cache = FlatList()

        cache = ListContainer(name=None, data=cache)
        most_recents = jx.run({
            "from": cache,
            "edges": ["instance_type", "availability_zone"],
            "select": {
                "value": "timestamp",
                "aggregate": "max"
            }
        })

        zones = self._get_valid_availability_zones()
        prices = set(cache)
        with Timer("Get pricing from AWS"):
            for instance_type in self.settings.utility.keys():
                for zone in zones:
                    if cache:
                        most_recent = most_recents[{
                            "instance_type": instance_type,
                            "availability_zone": zone
                        }].timestamp
                        start_at = MAX(
                            [Date(most_recent),
                             Date.today() - WEEK])
                    else:
                        start_at = Date.today() - WEEK

                    if DEBUG_PRICING:
                        Log.note(
                            "get pricing for {{instance_type}} starting at {{start_at}}",
                            instance_type=instance_type,
                            start_at=start_at)

                    next_token = None
                    while True:
                        resultset = self.ec2_conn.get_spot_price_history(
                            product_description=coalesce(
                                self.settings.product,
                                "Linux/UNIX (Amazon VPC)"),
                            instance_type=instance_type,
                            availability_zone=zone,
                            start_time=start_at.format(ISO8601),
                            next_token=next_token)
                        next_token = resultset.next_token

                        for p in resultset:
                            prices.add(
                                wrap({
                                    "availability_zone": p.availability_zone,
                                    "instance_type": p.instance_type,
                                    "price": p.price,
                                    "product_description":
                                    p.product_description,
                                    "region": p.region.name,
                                    "timestamp": Date(p.timestamp).unix
                                }))

                        if not next_token:
                            break

        with Timer("Save prices to file"):
            new_prices = jx.filter(
                prices, {"gte": {
                    "timestamp": {
                        "date": "today-2day"
                    }
                }})

            def stream():  # IT'S A LOT OF PRICES, STREAM THEM TO FILE
                prefix = "[\n"
                for p in new_prices:
                    yield prefix
                    yield convert.value2json(p)
                    prefix = ",\n"
                yield "]"

            File(self.settings.price_file).write(stream())

        return ListContainer(name="prices", data=prices)
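
The stream() generator avoids building one huge string by yielding the JSON array piecewise. The same trick in isolation, with the stdlib json module standing in for convert.value2json, and the empty-list edge case handled:

import json

def stream_json_array(items):
    # Yield a JSON array piece by piece, so the whole document
    # never has to exist in memory at once.
    yield "["
    first = True
    for item in items:
        if not first:
            yield ",\n"
        first = False
        yield json.dumps(item)
    yield "]"

with open("prices.json", "w") as f:
    for chunk in stream_json_array([{"price": 0.013}, {"price": 0.017}]):
        f.write(chunk)
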
Example #9
def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        # SHUNT PYTHON LOGGING TO MAIN LOGGING
        capture_logging()
        # SHUNT ADR LOGGING TO MAIN LOGGING
        # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.add
        capture_loguru()

        if config.taskcluster:
            inject_secrets(config)

        @extend(Configuration)
        def update(self, config):
            """
            Update the configuration object with new parameters
            :param config: dict of configuration
            """
            for k, v in config.items():
                if v != None:
                    self._config[k] = v

            self._config["sources"] = sorted(
                map(os.path.expanduser, set(self._config["sources"])))

            # Use the NullStore by default. This allows us to control whether
            # caching is enabled or not at runtime.
            self._config["cache"].setdefault("stores",
                                             {"null": {
                                                 "driver": "null"
                                             }})
            object.__setattr__(self, "cache", CustomCacheManager(self._config))
            for _, store in self._config["cache"]["stores"].items():
                if store.path and not store.path.endswith("/"):
                    # REQUIRED, OTHERWISE FileStore._create_cache_directory() WILL LOOK AT PARENT DIRECTORY
                    store.path = store.path + "/"

        if SHOW_S3_CACHE_HIT:
            s3_get = S3Store._get

            @extend(S3Store)
            def _get(self, key):
                with Timer("get {{key}} from S3", {"key": key},
                           verbose=False) as timer:
                    output = s3_get(self, key)
                    if output is not None:
                        timer.verbose = True
                    return output

        # UPDATE ADR CONFIGURATION
        with Repeat("waiting for ADR", every="10second"):
            adr.config.update(config.adr)
            # DUMMY TO TRIGGER CACHE
            make_push_objects(from_date=Date.today().format(),
                              to_date=Date.now().format(),
                              branch="autoland")

        outatime = Till(seconds=Duration(MAX_RUNTIME).total_seconds())
        outatime.then(lambda: Log.alert("Out of time, exit early"))
        Schedulers(config).process(outatime)
    except Exception as e:
        Log.warning("Problem with etl! Shutting down.", cause=e)
    finally:
        Log.stop()
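
The @extend(Configuration) decorator monkey-patches a method onto an already-defined class. A minimal sketch of such a decorator; this is my own reconstruction, not necessarily the library's implementation:

def extend(cls):
    # Attach the decorated function to cls under the function's own name.
    def decorator(func):
        setattr(cls, func.__name__, func)
        return func
    return decorator

class Configuration:
    pass

@extend(Configuration)
def update(self, config):
    print("patched update called with", config)

Configuration().update({"x": 1})
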
Example #10
    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        now = Date.now()
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2

                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": [False, True],
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            else:
                es_query = {
                    "aggs": {
                        "count": _counting_query(column),
                        "_filter": {
                            "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}},
                            "filter": {"bool": {"should": [
                                {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}},
                                {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}}
                            ]}}
                        }
                    },
                    "size": 0
                }

                result = self.es_cluster.post("/" + es_index + "/_search", data=es_query)
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results._filter.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            DEBUG and Log.note("update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}", id=id(column), column=column, time=now)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                Log.warning("Missing index {{col.es_index}}", col=column, cause=e)
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            elif "No field found for" in e:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": now
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
Example #11
    def _get_spot_prices_from_aws(self):
        with Timer("Read pricing file"):
            try:
                content = File(self.settings.price_file).read()
                cache = convert.json2value(content, flexible=False, leaves=False)
            except Exception as e:
                cache = FlatList()

        most_recents = jx.run({
            "from": cache,
            "edges": ["instance_type", "availability_zone"],
            "select": {"value": "timestamp", "aggregate": "max"}
        })

        zones = self._get_valid_availability_zones()
        prices = set(cache)
        with Timer("Get pricing from AWS"):
            for instance_type in self.settings.utility.keys():
                for zone in zones:
                    if cache:
                        most_recent = most_recents[{
                            "instance_type": instance_type,
                            "availability_zone": zone
                        }].timestamp
                        start_at = MAX([Date(most_recent), Date.today() - WEEK])
                    else:
                        start_at = Date.today() - WEEK

                    if DEBUG_PRICING:
                        Log.note("get pricing for {{instance_type}} starting at {{start_at}}",
                            instance_type=instance_type,
                            start_at=start_at
                        )

                    next_token = None
                    while True:
                        resultset = self.ec2_conn.get_spot_price_history(
                            product_description=coalesce(self.settings.product, "Linux/UNIX (Amazon VPC)"),
                            instance_type=instance_type,
                            availability_zone=zone,
                            start_time=start_at.format(ISO8601),
                            next_token=next_token
                        )
                        next_token = resultset.next_token

                        for p in resultset:
                            prices.add(wrap({
                                "availability_zone": p.availability_zone,
                                "instance_type": p.instance_type,
                                "price": p.price,
                                "product_description": p.product_description,
                                "region": p.region.name,
                                "timestamp": Date(p.timestamp).unix
                            }))

                        if not next_token:
                            break

        with Timer("Save prices to file"):
            new_prices = jx.filter(prices, {"gte": {"timestamp": {"date": "today-2day"}}})
            def stream():  # IT'S A LOT OF PRICES, STREAM THEM TO FILE
                prefix = "[\n"
                for p in new_prices:
                    yield prefix
                    yield convert.value2json(p)
                    prefix = ",\n"
                yield "]"
            File(self.settings.price_file).write(stream())

        return prices
Example #12
    def pypi(self):
        if Date.today() <= self.last_deploy():
            Log.note("Can not upload to pypi")
            return False

        lib_name = self.directory.name
        source_readme = File.new_instance(self.directory, 'README.md').abspath
        dest_readme = File.new_instance(self.directory, 'README.txt').abspath
        pypandoc.convert(source_readme, to=b'rst', outputfile=dest_readme)
        setup_file = File.new_instance(self.directory, 'setup.py')
        req_file = File.new_instance(self.directory, 'requirements.txt')

        if not setup_file.exists:
            Log.warning("Not a PyPi project!  No setup.py file.")

        setup = setup_file.read()
        # UPDATE THE VERSION NUMBER
        curr = (datetime.datetime.utcnow() +
                datetime.timedelta(days=1)).strftime("%y%j")
        setup = re.sub(r'(version\s*=\s*\"\d*\.\d*\.)\d*(\")',
                       r'\g<1>%s\2' % curr, setup)

        # UPDATE THE REQUIREMENTS
        if not req_file.exists:
            Log.error("Expecting a requirements.txt file")
        req = req_file.read()
        setup_req = re.findall(r'install_requires\s*=\s*\[.*\]\s*,', setup)
        setup = setup.replace(
            setup_req[0], 'install_requires=' + value2json(
                d for d in sorted(map(strings.trim, req.split("\n"))) if d))
        setup_file.write(setup)

        File.new_instance(self.directory, "build").delete()
        File.new_instance(self.directory, "dist").delete()
        File.new_instance(self.directory,
                          lib_name.replace("-", "_") + ".egg-info").delete()

        process, stdout, stderr = self.local(
            "pypi",
            ["C:/Python27/python.exe", "setup.py", "bdist_egg", "upload"],
            raise_on_error=False)
        if "Upload failed (400): File already exists." in stderr:
            Log.warning("Not uploaded")
        elif process.returncode == 0:
            pass
        else:
            Log.error("not expected")
        process, stdout, stderr = self.local(
            "pypi", ["C:/Python27/python.exe", "setup.py", "sdist", "upload"],
            raise_on_error=False)
        if "Upload failed (400): File already exists." in stderr:
            Log.warning("Not uploaded")
        elif process.returncode == 0:
            pass
        else:
            Log.error("not expected")

        File.new_instance(self.directory, "README.txt").delete()
        File.new_instance(self.directory, "build").delete()
        File.new_instance(self.directory, "dist").delete()
        File.new_instance(self.directory,
                          lib_name.replace("-", "_") + ".egg-info").delete()
        return True
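
The version bump relies on one regex substitution: keep the major.minor prefix (group 1) and the closing quote (group 2), and replace only the final component with the %y%j date stamp. In isolation, with an invented setup.py snippet:

import re
from datetime import datetime, timedelta

setup = 'setup(name="mo-times", version="3.46.19999",)'
# Tomorrow's date as YYDDD, matching the deploy-date versioning scheme.
curr = (datetime.utcnow() + timedelta(days=1)).strftime("%y%j")
bumped = re.sub(r'(version\s*=\s*"\d*\.\d*\.)\d*(")', r'\g<1>%s\2' % curr, setup)
print(bumped)  # e.g. setup(name="mo-times", version="3.46.25046",)
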