def test_date_literal(self):
    expr = {"date": {"literal": "today-month"}}
    from jx_python.expression_compiler import compile_expression

    result = compile_expression(Python[jx_expression(expr).partial_eval()].to_python())(None)
    expected = (Date.today() - MONTH).unix
    self.assertEqual(result, expected)
def update_local_database(config, deviant_summary, candidates, since):
    if isinstance(deviant_summary, bigquery.Table):
        Log.note("Only the ETL process should fill the bigquery table")
        return

    # GET EVERYTHING WE HAVE SO FAR
    exists = deviant_summary.query({
        "select": ["signature_hash", "last_updated"],
        "where": {"and": [
            {"in": {"signature_hash": candidates.signature_hash}},
            {"exists": "num_pushes"},
        ]},
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data

    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.signature_hash) - set(exists.signature_hash))
    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e.signature_hash for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update", num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.display.download_limit, 100))
    )
    Log.alert("Updating local database with {{num}} series", num=len(limited_update))

    with Timer("Updating local database"):
        def loop(please_stop):
            while not please_stop:
                signature_hash = limited_update.pop_one()
                if not signature_hash:
                    return
                process(
                    signature_hash,
                    since,
                    source=config.database,
                    deviant_summary=deviant_summary,
                )

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
def last_deploy(self):
    setup_file = File.new_instance(self.directory, 'setup.py')
    if not setup_file.exists:
        Log.note("Not a pypi project: {{dir}}", dir=self.directory)
        return Date.today()
    setup = setup_file.read()
    version = json2value(strings.between(setup, "version=", ",")).split(".")[-1]
    date = unicode2Date(version, format="%y%j")
    Log.note("PyPi last deployed {{date|datetime}}", date=date, dir=self.directory)
    return date
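# The version suffix parsed above uses "%y%j": a two-digit year followed by a day-of-year
# number. A minimal sketch of that decoding with only the standard library; the stamp
# "21045" below is an invented example, not a value from the source.
from datetime import datetime

stamp = "21045"                                  # hypothetical version suffix
deploy_date = datetime.strptime(stamp, "%y%j")   # -> datetime(2021, 2, 14)
print(deploy_date.date())                        # 2021-02-14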
def update_local_database():
    # GET EVERYTHING WE HAVE SO FAR
    exists = summary_table.query({
        "select": ["id", "last_updated"],
        "where": {"and": [
            {"in": {"id": candidates.id}},
            {"exists": "num_pushes"},
        ]},
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data

    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.id) - set(exists.id))
    too_old = Date.today() - parse(LOCAL_RETENTION)
    # use the id (not the whole record) so the queue holds integer ids, as process() expects
    needs_update = missing + [e.id for e in exists if e.last_updated < too_old.unix]
    Log.alert("{{num}} series are candidates for local update", num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.analysis.download_limit, 100))
    )
    Log.alert("Updating local database with {{num}} series", num=len(limited_update))

    with Timer("Updating local database"):
        def loop(please_stop):
            while not please_stop:
                sig_id = limited_update.pop_one()
                if not sig_id:
                    return
                process(sig_id)

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
def main():
    since = Date.today() - Duration(SCATTER_RANGE)

    if config.database.host not in listwrap(config.analysis.expected_database_host):
        Log.error(
            "Expecting database to be one of {{expected}}",
            expected=config.analysis.expected_database_host,
        )
    if not config.analysis.interesting:
        Log.alert(
            "Expecting config file to have `analysis.interesting` with a json expression. All series are included."
        )

    # SETUP DESTINATION
    deviant_summary = bigquery.Dataset(config.deviant_summary).get_or_create_table(
        read_only=True, kwargs=config.deviant_summary
    )

    if config.args.id:
        # EXIT EARLY AFTER WE GOT THE SPECIFIC IDS
        if len(config.args.id) < 4:
            step_detector.SHOW_CHARTS = True
        for signature_hash in config.args.id:
            process(
                signature_hash,
                since=since,
                source=config.database,
                deviant_summary=deviant_summary,
                show=True,
            )
        return

    # DOWNLOAD
    if config.args.download:
        # GET INTERESTING SERIES
        where_clause = BQLang[jx_expression(config.analysis.interesting)].to_bq(
            deviant_summary.schema
        )
        # GET ALL KNOWN SERIES
        docs = list(deviant_summary.sql_query(f"""
            SELECT * EXCEPT (_rank, values)
            FROM (
                SELECT *, row_number() over (partition by id order by last_updated desc) as _rank
                FROM {quote_column(deviant_summary.full_name)}
            ) a
            WHERE _rank=1 and {sql_iso(where_clause)}
            LIMIT {quote_value(DOWNLOAD_LIMIT)}
        """))
        if len(docs) == DOWNLOAD_LIMIT:
            Log.warning("Not all signatures downloaded")
        File(config.args.download).write(list2tab(docs, separator=","))

    # DEVIANT
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "overall_dev_score"}, "sort": "desc"},
        limit=config.args.deviant,
        show_old=False,
        show_distribution=True,
    )

    # MODAL
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort="overall_dev_score",
        limit=config.args.modal,
        where={"eq": {"overall_dev_status": "MODAL"}},
        show_distribution=True,
    )

    # OUTLIERS
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": "overall_dev_score", "sort": "desc"},
        limit=config.args.outliers,
        where={"eq": {"overall_dev_status": "OUTLIERS"}},
        show_distribution=True,
    )

    # SKEWED
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "overall_dev_score"}, "sort": "desc"},
        limit=config.args.skewed,
        where={"eq": {"overall_dev_status": "SKEWED"}},
        show_distribution=True,
    )

    # OK
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "overall_dev_score"}, "sort": "desc"},
        limit=config.args.ok,
        where={"eq": {"overall_dev_status": "OK"}},
        show_distribution=True,
    )

    # NOISE
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "relative_noise"}, "sort": "desc"},
        where={"gte": {"num_pushes": 30}},
        limit=config.args.noise,
    )

    # EXTRA
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "max_extra_diff"}, "sort": "desc"},
        where={"lte": {"num_new_segments": 7}},
        limit=config.args.extra,
    )

    # MISSING
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "max_missing_diff"}, "sort": "desc"},
        where={"lte": {"num_old_segments": 6}},
        limit=config.args.missing,
    )

    # PATHOLOGICAL
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": "num_segments", "sort": "desc"},
        limit=config.args.pathological,
    )
def process(sig_id, show=False, show_limit=MAX_POINTS, show_old=True, show_distribution=None):
    if not mo_math.is_integer(sig_id):
        Log.error("expecting integer id")
    sig = first(get_signature(config.database, sig_id))
    data = get_dataum(config.database, sig_id)

    min_date = (Date.today() - 3 * MONTH).unix
    pushes = jx.sort(
        [
            {
                "value": median(rows.value),
                "runs": rows,
                "push": {"time": unwrap(t)["push.time"]},
            }
            for t, rows in jx.groupby(data, "push.time")
            if t["push\\.time"] > min_date
        ],
        "push.time",
    )

    values = pushes.value
    title = "-".join(
        map(
            text,
            [
                sig.id,
                sig.framework,
                sig.suite,
                sig.test,
                sig.platform,
                sig.repository.name,
            ],
        )
    )
    Log.note("With {{title}}", title=title)

    with Timer("find segments"):
        new_segments, new_diffs = find_segments(
            values, sig.alert_change_type, sig.alert_threshold
        )

    # USE PERFHERDER ALERTS TO IDENTIFY OLD SEGMENTS
    old_segments = tuple(
        sorted(
            set(
                [i for i, p in enumerate(pushes) if any(r.alert.id for r in p.runs)]
                + [0, len(pushes)]
            )
        )
    )
    old_medians = [0] + [
        np.median(values[s:e]) for s, e in zip(old_segments[:-1], old_segments[1:])
    ]
    old_diffs = np.array(
        [b / a - 1 for a, b in zip(old_medians[:-1], old_medians[1:])] + [0]
    )

    if len(new_segments) == 1:
        dev_status = None
        dev_score = None
        relative_noise = None
    else:
        # MEASURE DEVIANCE (USE THE LAST SEGMENT)
        s, e = new_segments[-2], new_segments[-1]
        last_segment = np.array(values[s:e])
        trimmed_segment = last_segment[np.argsort(last_segment)[IGNORE_TOP:-IGNORE_TOP]]
        dev_status, dev_score = deviance(trimmed_segment)
        relative_noise = np.std(trimmed_segment) / np.mean(trimmed_segment)
        Log.note(
            "\n\tdeviance = {{deviance}}\n\tnoise={{std}}",
            title=title,
            deviance=(dev_status, dev_score),
            std=relative_noise,
        )
        if show_distribution:
            histogram(last_segment, title=dev_status + "=" + text(dev_score))

    max_extra_diff = None
    max_missing_diff = None
    _is_diff = is_diff(new_segments, old_segments)
    if _is_diff:
        # FOR MISSING POINTS, CALC BIGGEST DIFF
        max_extra_diff = mo_math.MAX(
            abs(d)
            for s, d in zip(new_segments, new_diffs)
            if all(not (s - TOLLERANCE <= o <= s + TOLLERANCE) for o in old_segments)
        )
        max_missing_diff = mo_math.MAX(
            abs(d)
            for s, d in zip(old_segments, old_diffs)
            if all(not (s - TOLLERANCE <= n <= s + TOLLERANCE) for n in new_segments)
        )
        Log.alert(
            "Disagree max_extra_diff={{max_extra_diff|round(places=3)}}, max_missing_diff={{max_missing_diff|round(places=3)}}",
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
        )
        Log.note("old={{old}}, new={{new}}", old=old_segments, new=new_segments)
        if show and len(pushes):
            show_old and assign_colors(values, old_segments, title="OLD " + title)
            assign_colors(values, new_segments, title="NEW " + title)
    else:
        Log.note("Agree")
        if show and len(pushes):
            show_old and assign_colors(values, old_segments, title="OLD " + title)
            assign_colors(values, new_segments, title="NEW " + title)

    summary_table.upsert(
        where={"eq": {"id": sig.id}},
        doc=Data(
            id=sig.id,
            title=title,
            num_pushes=len(pushes),
            is_diff=_is_diff,
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
            num_new_segments=len(new_segments),
            num_old_segments=len(old_segments),
            relative_noise=relative_noise,
            dev_status=dev_status,
            dev_score=dev_score,
            last_updated=Date.now(),
        ),
    )
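# The deviance step above trims the IGNORE_TOP smallest and largest points before measuring
# noise. A minimal numpy sketch of that argsort-based trim; the sample values and k=2 are
# invented for illustration, and the helper name `trim_extremes` is hypothetical.
import numpy as np

def trim_extremes(values, k=2):
    # drop the k smallest and k largest values; the result is in ascending order,
    # which does not matter for mean/std
    values = np.asarray(values, dtype=float)
    return values[np.argsort(values)[k:-k]]

segment = np.array([3.0, 50.0, 4.0, 5.0, 6.0, 0.1, 5.5, 4.8])
trimmed = trim_extremes(segment, k=2)
print(np.std(trimmed) / np.mean(trimmed))  # relative noise without the extremes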
def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    now = Date.now()
    if column.es_index in self.index_does_not_exist:
        return

    if column.jx_type in STRUCT:
        Log.error("not supported")
    try:
        if column.es_index == "meta.columns":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.columns, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.columns),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        if column.es_index == "meta.tables":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.tables, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.tables),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return

        es_index = column.es_index.split(".")[0]

        is_text = [
            cc
            for cc in self.meta.columns
            if cc.es_column == column.es_column and cc.es_type == "text"
        ]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": {"filter": {"match_all": {}}}},
                "size": 0
            })
            count = result.hits.total
            cardinality = max(1001, count)
            multi = 1001
        elif column.es_column == "_id":
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "query": {"match_all": {}},
                "size": 0
            })
            count = cardinality = result.hits.total
            multi = 1
        elif column.es_type == BOOLEAN:
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": _counting_query(column)},
                "size": 0
            })
            count = result.hits.total
            cardinality = 2

            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": [False, True],
                    "multi": 1,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        else:
            es_query = {
                "aggs": {
                    "count": _counting_query(column),
                    "_filter": {
                        "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}},
                        "filter": {"bool": {"should": [
                            {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}},
                            {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}}
                        ]}}
                    }
                },
                "size": 0
            }

            result = self.es_cluster.post("/" + es_index + "/_search", data=es_query)
            agg_results = result.aggregations
            count = result.hits.total
            cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
            multi = int(coalesce(agg_results._filter.multi.value, 1))
            if cardinality == None:
                Log.error("logic error")

        query = Data(size=0)

        if column.es_column == "_id":
            self.meta.columns.update({
                "set": {
                    "count": cardinality,
                    "cardinality": cardinality,
                    "multi": 1,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif len(column.nested_path) != 1:
            query.aggs["_"] = {
                "nested": {"path": column.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": column.es_column}}}
            }
        elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

        result = self.es_cluster.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        DEBUG and Log.note("update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}", id=id(column), column=column, time=now)
        self.meta.columns.update({
            "set": {
                "count": count,
                "cardinality": cardinality,
                "multi": multi,
                "partitions": parts,
                "last_updated": now
            },
            "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
        })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
        # from tests.test_jx import TEST_TABLE
        e = Except.wrap(e)
        TEST_TABLE = "testdata"
        is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
        if is_missing_index:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            Log.warning("Missing index {{col.es_index}}", col=column, cause=e)
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index}}
            })
            self.index_does_not_exist.add(column.es_index)
        elif "No field found for" in e:
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
        else:
            self.meta.columns.update({
                "set": {"last_updated": now},
                "clear": ["count", "cardinality", "multi", "partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
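# The branch above decides when a column has too many distinct values to bother enumerating
# partitions. A minimal standalone sketch of that heuristic, using the same thresholds; the
# helper name `too_many_parts` is hypothetical, for illustration only.
def too_many_parts(count, cardinality):
    """Return True when partitions should NOT be enumerated."""
    return (
        cardinality > 1000                                  # too many distinct values
        or (count >= 30 and cardinality == count)           # every document distinct
        or (count >= 1000 and cardinality / count > 0.99)   # nearly every document distinct
    )

assert too_many_parts(count=50, cardinality=50)         # all-distinct column -> skip enumeration
assert not too_many_parts(count=10000, cardinality=12)  # a dozen values -> enumerate them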
def _get_spot_prices_from_aws(self):
    with Timer("Read no capacity file"):
        try:
            # FILE IS LIST OF {instance_type, last_failure} OBJECTS
            content = self.no_capacity_file.read()
            self.no_capacity = dict(
                (r.instance_type, r.last_failure)
                for r in convert.json2value(content, flexible=False, leaves=False)
            )
        except Exception as e:
            self.no_capacity = {}

    with Timer("Read pricing file"):
        try:
            content = File(self.settings.price_file).read()
            cache = convert.json2value(content, flexible=False, leaves=False)
        except Exception as e:
            cache = FlatList()

    cache = ListContainer(name=None, data=cache)
    most_recents = jx.run({
        "from": cache,
        "edges": ["instance_type", "availability_zone"],
        "select": {"value": "timestamp", "aggregate": "max"}
    })

    zones = self._get_valid_availability_zones()
    prices = set(cache)
    with Timer("Get pricing from AWS"):
        for instance_type in self.settings.utility.keys():
            for zone in zones:
                if cache:
                    most_recent = most_recents[{
                        "instance_type": instance_type,
                        "availability_zone": zone
                    }].timestamp
                    start_at = MAX([Date(most_recent), Date.today() - WEEK])
                else:
                    start_at = Date.today() - WEEK

                if DEBUG_PRICING:
                    Log.note(
                        "get pricing for {{instance_type}} starting at {{start_at}}",
                        instance_type=instance_type,
                        start_at=start_at
                    )

                next_token = None
                while True:
                    resultset = self.ec2_conn.get_spot_price_history(
                        product_description=coalesce(self.settings.product, "Linux/UNIX (Amazon VPC)"),
                        instance_type=instance_type,
                        availability_zone=zone,
                        start_time=start_at.format(ISO8601),
                        next_token=next_token
                    )
                    next_token = resultset.next_token

                    for p in resultset:
                        prices.add(wrap({
                            "availability_zone": p.availability_zone,
                            "instance_type": p.instance_type,
                            "price": p.price,
                            "product_description": p.product_description,
                            "region": p.region.name,
                            "timestamp": Date(p.timestamp).unix
                        }))

                    if not next_token:
                        break

    with Timer("Save prices to file"):
        new_prices = jx.filter(prices, {"gte": {"timestamp": {"date": "today-2day"}}})

        def stream():  # IT'S A LOT OF PRICES, STREAM THEM TO FILE
            prefix = "[\n"
            for p in new_prices:
                yield prefix
                yield convert.value2json(p)
                prefix = ",\n"
            yield "]"

        File(self.settings.price_file).write(stream())

    return ListContainer(name="prices", data=prices)
def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        # SHUNT PYTHON LOGGING TO MAIN LOGGING
        capture_logging()
        # SHUNT ADR LOGGING TO MAIN LOGGING
        # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.add
        capture_loguru()

        if config.taskcluster:
            inject_secrets(config)

        @extend(Configuration)
        def update(self, config):
            """
            Update the configuration object with new parameters
            :param config: dict of configuration
            """
            for k, v in config.items():
                if v != None:
                    self._config[k] = v

            self._config["sources"] = sorted(
                map(os.path.expanduser, set(self._config["sources"]))
            )

            # Use the NullStore by default. This allows us to control whether
            # caching is enabled or not at runtime.
            self._config["cache"].setdefault("stores", {"null": {"driver": "null"}})
            object.__setattr__(self, "cache", CustomCacheManager(self._config))
            for _, store in self._config["cache"]["stores"].items():
                if store.path and not store.path.endswith("/"):
                    # REQUIRED, OTHERWISE FileStore._create_cache_directory() WILL LOOK AT PARENT DIRECTORY
                    store.path = store.path + "/"

        if SHOW_S3_CACHE_HIT:
            s3_get = S3Store._get

            @extend(S3Store)
            def _get(self, key):
                with Timer("get {{key}} from S3", {"key": key}, verbose=False) as timer:
                    output = s3_get(self, key)
                    if output is not None:
                        timer.verbose = True
                    return output

        # UPDATE ADR CONFIGURATION
        with Repeat("waiting for ADR", every="10second"):
            adr.config.update(config.adr)

        # DUMMY TO TRIGGER CACHE
        make_push_objects(
            from_date=Date.today().format(), to_date=Date.now().format(), branch="autoland"
        )

        outatime = Till(seconds=Duration(MAX_RUNTIME).total_seconds())
        outatime.then(lambda: Log.alert("Out of time, exit early"))
        Schedulers(config).process(outatime)
    except Exception as e:
        Log.warning("Problem with etl! Shutting down.", cause=e)
    finally:
        Log.stop()
def _get_spot_prices_from_aws(self):
    with Timer("Read pricing file"):
        try:
            content = File(self.settings.price_file).read()
            cache = convert.json2value(content, flexible=False, leaves=False)
        except Exception as e:
            cache = FlatList()

    most_recents = jx.run({
        "from": cache,
        "edges": ["instance_type", "availability_zone"],
        "select": {"value": "timestamp", "aggregate": "max"}
    })

    zones = self._get_valid_availability_zones()
    prices = set(cache)
    with Timer("Get pricing from AWS"):
        for instance_type in self.settings.utility.keys():
            for zone in zones:
                if cache:
                    most_recent = most_recents[{
                        "instance_type": instance_type,
                        "availability_zone": zone
                    }].timestamp
                    start_at = MAX([Date(most_recent), Date.today() - WEEK])
                else:
                    start_at = Date.today() - WEEK

                if DEBUG_PRICING:
                    Log.note(
                        "get pricing for {{instance_type}} starting at {{start_at}}",
                        instance_type=instance_type,
                        start_at=start_at
                    )

                next_token = None
                while True:
                    resultset = self.ec2_conn.get_spot_price_history(
                        product_description=coalesce(self.settings.product, "Linux/UNIX (Amazon VPC)"),
                        instance_type=instance_type,
                        availability_zone=zone,
                        start_time=start_at.format(ISO8601),
                        next_token=next_token
                    )
                    next_token = resultset.next_token

                    for p in resultset:
                        prices.add(wrap({
                            "availability_zone": p.availability_zone,
                            "instance_type": p.instance_type,
                            "price": p.price,
                            "product_description": p.product_description,
                            "region": p.region.name,
                            "timestamp": Date(p.timestamp).unix
                        }))

                    if not next_token:
                        break

    with Timer("Save prices to file"):
        new_prices = jx.filter(prices, {"gte": {"timestamp": {"date": "today-2day"}}})

        def stream():  # IT'S A LOT OF PRICES, STREAM THEM TO FILE
            prefix = "[\n"
            for p in new_prices:
                yield prefix
                yield convert.value2json(p)
                prefix = ",\n"
            yield "]"

        File(self.settings.price_file).write(stream())

    return prices
def pypi(self):
    if Date.today() <= self.last_deploy():
        Log.note("Can not upload to pypi")
        return False

    lib_name = self.directory.name
    source_readme = File.new_instance(self.directory, 'README.md').abspath
    dest_readme = File.new_instance(self.directory, 'README.txt').abspath
    pypandoc.convert(source_readme, to=b'rst', outputfile=dest_readme)
    setup_file = File.new_instance(self.directory, 'setup.py')
    req_file = File.new_instance(self.directory, 'requirements.txt')

    if not setup_file.exists:
        Log.warning("Not a PyPi project! No setup.py file.")
    setup = setup_file.read()

    # UPDATE THE VERSION NUMBER
    curr = (datetime.datetime.utcnow() + datetime.timedelta(days=1)).strftime("%y%j")
    setup = re.sub(r'(version\s*=\s*\"\d*\.\d*\.)\d*(\")', r'\g<1>%s\2' % curr, setup)

    # UPDATE THE REQUIREMENTS
    if not req_file.exists:
        Log.error("Expecting a requirements.txt file")
    req = req_file.read()
    setup_req = re.findall(r'install_requires\s*=\s*\[.*\]\s*,', setup)
    # strings are immutable; keep the result of the replacement
    setup = setup.replace(
        setup_req[0],
        'install_requires=' + value2json(
            d for d in sorted(map(strings.trim, req.split("\n"))) if d
        )
    )
    setup_file.write(setup)

    File.new_instance(self.directory, "build").delete()
    File.new_instance(self.directory, "dist").delete()
    File.new_instance(self.directory, lib_name.replace("-", "_") + ".egg-info").delete()

    process, stdout, stderr = self.local(
        "pypi",
        ["C:/Python27/python.exe", "setup.py", "bdist_egg", "upload"],
        raise_on_error=False
    )
    if "Upload failed (400): File already exists." in stderr:
        Log.warning("Not uploaded")
    elif process.returncode == 0:
        pass
    else:
        Log.error("not expected")

    process, stdout, stderr = self.local(
        "pypi",
        ["C:/Python27/python.exe", "setup.py", "sdist", "upload"],
        raise_on_error=False
    )
    if "Upload failed (400): File already exists." in stderr:
        Log.warning("Not uploaded")
    elif process.returncode == 0:
        pass
    else:
        Log.error("not expected")

    File.new_instance(self.directory, "README.txt").delete()
    File.new_instance(self.directory, "build").delete()
    File.new_instance(self.directory, "dist").delete()
    File.new_instance(self.directory, lib_name.replace("-", "_") + ".egg-info").delete()
    return True
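# The version bump above rewrites only the last component of version="X.Y.Z" in setup.py.
# A minimal illustration of that substitution; the setup() line below is an invented
# example, not taken from the source.
import re

curr = "21045"  # hypothetical %y%j stamp: 2021, day 45
setup = 'setup(name="mo-dots", version="3.46.20031",)'
bumped = re.sub(r'(version\s*=\s*\"\d*\.\d*\.)\d*(\")', r'\g<1>%s\2' % curr, setup)
print(bumped)  # setup(name="mo-dots", version="3.46.21045",)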