def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    kwargs=None,
):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX
    """
    kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
    kwargs.host = Random.sample(listwrap(host), 1)[0]

    schema = json2value(value2json(SCHEMA), leaves=True)
    schema.mappings[type].properties["~N~"].type = "nested"
    self.es = Cluster(kwargs).get_or_create_index(
        schema=schema,
        limit_replicas=True,
        typed=True,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
    self.queue = Queue("debug logs to es", max=queue_size, silent=True)

    self.worker = Thread.run("add debug logs to es", self._insert_loop)
def _range_composer(edge, domain, es_query, to_float, schema):
    # USE RANGES
    _min = coalesce(domain.min, MIN(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if edge.allowNulls:
        missing_filter = set_default(
            {
                "filter": NotOp("not", AndOp("and", [
                    edge.value.exists(),
                    InequalityOp("gte", [edge.value, Literal(None, to_float(_min))]),
                    InequalityOp("lt", [edge.value, Literal(None, to_float(_max))])
                ]).partial_eval()).to_esfilter(schema)
            },
            es_query
        )
    else:
        missing_filter = None

    if isinstance(edge.value, Variable):
        calc = {"field": schema.leaves(edge.value.var)[0].es_column}
    else:
        calc = {"script": edge.value.to_es_script(schema).script(schema)}

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": missing_filter
    }})
def __init__(self, instance_manager, disable_prices=False, kwargs=None):
    self.settings = kwargs
    self.instance_manager = instance_manager
    aws_args = dict(
        region_name=kwargs.aws.region,
        aws_access_key_id=unwrap(kwargs.aws.aws_access_key_id),
        aws_secret_access_key=unwrap(kwargs.aws.aws_secret_access_key)
    )
    self.ec2_conn = boto.ec2.connect_to_region(**aws_args)
    self.vpc_conn = boto.vpc.connect_to_region(**aws_args)
    self.price_locker = Lock()
    self.prices = None
    self.price_lookup = None
    self.done_spot_requests = Signal()
    self.net_new_locker = Lock()
    self.net_new_spot_requests = UniqueIndex(("id",))  # SPOT REQUESTS FOR THIS SESSION
    self.watcher = None
    self.active = None

    self.settings.uptime.bid_percentile = coalesce(self.settings.uptime.bid_percentile, self.settings.bid_percentile)
    self.settings.uptime.history = coalesce(Date(self.settings.uptime.history), DAY)
    self.settings.uptime.duration = coalesce(Duration(self.settings.uptime.duration), Date("5minute"))
    self.settings.max_percent_per_type = coalesce(self.settings.max_percent_per_type, 1)

    if ENABLE_SIDE_EFFECTS and instance_manager and instance_manager.setup_required():
        self._start_life_cycle_watcher()
    if not disable_prices:
        self.pricing()
def _open(self):
    """ DO NOT USE THIS UNLESS YOU close() FIRST"""
    try:
        self.db = connect(
            host=self.settings.host,
            port=self.settings.port,
            user=coalesce(self.settings.username, self.settings.user),
            passwd=coalesce(self.settings.password, self.settings.passwd),
            db=coalesce(self.settings.schema, self.settings.db),
            read_timeout=coalesce(self.settings.read_timeout, (EXECUTE_TIMEOUT / 1000) - 10 if EXECUTE_TIMEOUT else None, 5 * 60),
            charset=u"utf8",
            use_unicode=True,
            ssl=coalesce(self.settings.ssl, None),
            cursorclass=cursors.SSCursor
        )
    except Exception as e:
        if self.settings.host.find("://") == -1:
            Log.error(
                u"Failure to connect to {{host}}:{{port}}",
                host=self.settings.host,
                port=self.settings.port,
                cause=e
            )
        else:
            Log.error(u"Failure to connect. PROTOCOL PREFIX IS PROBABLY BAD", e)

    self.cursor = None
    self.partial_rollback = False
    self.transaction_level = 0
    self.backlog = []  # accumulate the write commands so they are sent at once
    if self.readonly:
        self.begin()
def __init__(self, value, port=None, path=None, query=None, fragment=None):
    try:
        self.scheme = None
        self.host = None
        self.port = port
        self.path = path
        self.query = query
        self.fragment = fragment

        if value == None:
            return

        if value.startswith("file://") or value.startswith("//"):
            # urlparse DOES NOT WORK IN THESE CASES
            scheme, suffix = value.split("//", 2)
            self.scheme = scheme.rstrip(":")
            parse(self, suffix, 0, 1)
            self.query = wrap(url_param2value(self.query))
        else:
            output = urlparse(value)
            self.scheme = output.scheme
            self.port = coalesce(port, output.port)
            self.host = output.netloc.split(":")[0]
            self.path = coalesce(path, output.path)
            self.query = coalesce(query, wrap(url_param2value(output.query)))
            self.fragment = coalesce(fragment, output.fragment)
    except Exception as e:
        Log.error(u"problem parsing {{value}} to URL", value=value, cause=e)
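# Hypothetical illustration of the constructor above (values are made up, not from the source):
#
#   u = URL("http://example.com:8080/path?a=1#frag")
#   u.scheme == "http";  u.host == "example.com";  u.port == 8080
#   u.path == "/path";   u.query holds the decoded parameters ({"a": ...});  u.fragment == "frag"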
def Stats2ZeroMoment(stats):
    # MODIFIED FROM http://statsmodels.sourceforge.net/devel/_modules/statsmodels/stats/moment_helpers.html
    # ADDED count
    mc0, mc1, mc2, skew, kurt = stats.count, coalesce(stats.mean, 0), coalesce(stats.variance, 0), coalesce(stats.skew, 0), coalesce(stats.kurtosis, 0)

    mz0 = mc0
    mz1 = mc1 * mc0
    mz2 = (mc2 + mc1 * mc1) * mc0
    mc3 = coalesce(skew, 0) * (mc2 ** 1.5)  # 3rd central moment
    mz3 = (mc3 + 3 * mc1 * mc2 + mc1 ** 3) * mc0  # 3rd non-central moment
    mc4 = (coalesce(kurt, 0) + 3.0) * (mc2 ** 2.0)  # 4th central moment
    mz4 = (mc4 + 4 * mc1 * mc3 + 6 * mc1 * mc1 * mc2 + mc1 ** 4) * mc0

    m = ZeroMoment(mz0, mz1, mz2, mz3, mz4)
    if DEBUG:
        from mo_testing.fuzzytestcase import assertAlmostEqualValue

        globals()["DEBUG"] = False
        try:
            v = ZeroMoment2Stats(m)
            assertAlmostEqualValue(v.count, stats.count, places=10)
            assertAlmostEqualValue(v.mean, stats.mean, places=10)
            assertAlmostEqualValue(v.variance, stats.variance, places=10)
            assertAlmostEqualValue(v.skew, stats.skew, places=10)
            assertAlmostEqualValue(v.kurtosis, stats.kurtosis, places=10)
        except Exception as e:
            v = ZeroMoment2Stats(m)
            Log.error("programmer error")
        globals()["DEBUG"] = True

    return m
def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None):
    self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD)
    self.rate = coalesce(rate, HG_REQUEST_PER_SECOND)
    self.cache_locker = Lock()
    self.cache = {}  # MAP FROM url TO (ready, headers, response, timestamp) PAIR
    self.no_cache = {}  # VERY SHORT TERM CACHE
    self.workers = []
    self.todo = Queue(APP_NAME + " todo")
    self.requests = Queue(APP_NAME + " requests", max=int(self.rate * self.amortization_period.seconds))
    self.url = URL(source.url)
    self.db = Sqlite(database)
    self.inbound_rate = RateLogger("Inbound")
    self.outbound_rate = RateLogger("hg.mo")

    if not self.db.query("SELECT name FROM sqlite_master WHERE type='table'").data:
        with self.db.transaction() as t:
            t.execute(
                "CREATE TABLE cache ("
                "   path TEXT PRIMARY KEY, "
                "   headers TEXT, "
                "   response TEXT, "
                "   timestamp REAL "
                ")"
            )

    self.threads = [
        Thread.run(APP_NAME + " worker" + text_type(i), self._worker)
        for i in range(CONCURRENCY)
    ]
    self.limiter = Thread.run(APP_NAME + " limiter", self._rate_limiter)
    self.cleaner = Thread.run(APP_NAME + " cleaner", self._cache_cleaner)
def __init__(self, **desc):
    desc = wrap(desc)
    self._set_slots_to_null(self.__class__)
    set_default(self, desc)
    self.name = coalesce(desc.name, desc.type)
    self.isFacet = coalesce(desc.isFacet, False)
    self.dimension = Null
    self.limit = desc.limit
def get_decoders_by_depth(query):
    """
    RETURN A LIST OF DECODER ARRAYS, ONE ARRAY FOR EACH NESTED DEPTH
    """
    schema = query.frum.schema
    output = FlatList()

    if query.edges:
        if query.sort and query.format != "cube":
            # REORDER EDGES/GROUPBY TO MATCH THE SORT
            query.edges = sort_edges(query, "edges")
    elif query.groupby:
        if query.sort and query.format != "cube":
            query.groupby = sort_edges(query, "groupby")

    for edge in wrap(coalesce(query.edges, query.groupby, [])):
        limit = coalesce(edge.domain.limit, query.limit, DEFAULT_LIMIT)
        if edge.value != None and not isinstance(edge.value, NullOp):
            edge = edge.copy()
            vars_ = edge.value.vars()
            for v in vars_:
                if not schema.leaves(v.var):
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.range:
            vars_ = edge.range.min.vars() | edge.range.max.vars()
            for v in vars_:
                if not schema[v.var]:
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.domain.dimension:
            vars_ = edge.domain.dimension.fields
            edge.domain.dimension = edge.domain.dimension.copy()
            edge.domain.dimension.fields = [schema[v].es_column for v in vars_]
        elif all(edge.domain.partitions.where):
            vars_ = set()
            for p in edge.domain.partitions:
                vars_ |= p.where.vars()

        try:
            vars_ |= edge.value.vars()
            depths = set(len(c.nested_path) - 1 for v in vars_ for c in schema.leaves(v.var))
            if -1 in depths:
                Log.error(
                    "Do not know of column {{column}}",
                    column=unwraplist([v for v in vars_ if schema[v] == None])
                )
            if len(depths) > 1:
                Log.error("expression {{expr|quote}} spans tables, can not handle", expr=edge.value)
            max_depth = MAX(depths)
            while len(output) <= max_depth:
                output.append([])
        except Exception as e:
            # USUALLY THE SCHEMA IS EMPTY, SO WE ASSUME THIS IS A SIMPLE QUERY
            max_depth = 0
            output.append([])

        output[max_depth].append(AggsDecoder(edge, query, limit))
    return output
def single(col, r): min = coalesce(r["gte"], r[">="]) max = coalesce(r["lte"], r["<="]) if min and max: # SPECIAL CASE (BETWEEN) return db.quote_column(col) + SQL(" BETWEEN ") + db.quote_value(min) + SQL(" AND ") + db.quote_value(max) else: return " AND ".join( db.quote_column(col) + name2sign[sign] + db.quote_value(value) for sign, value in r.items() )
def single(col, r): min = coalesce(r["gte"], r[">="]) max = coalesce(r["lte"], r["<="]) if min != None and max != None: # SPECIAL CASE (BETWEEN) sql = quote_column(col) + SQL(" BETWEEN ") + quote_value(min) + SQL_AND + quote_value(max) else: sql = SQL_AND.join( quote_column(col) + name2sign[sign] + quote_value(value) for sign, value in r.items() ) return sql
def _get_from_elasticsearch(self, revision, locale=None, get_diff=False, get_moves=True):
    rev = revision.changeset.id
    if self.es.cluster.version.startswith("1.7."):
        query = {
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"term": {"changeset.id12": rev[0:12]}},
                    {"term": {"branch.name": revision.branch.name}},
                    {"term": {"branch.locale": coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)}},
                    {"range": {"etl.timestamp": {"gt": MIN_ETL_AGE}}}
                ]}
            }},
            "size": 20
        }
    else:
        query = {
            "query": {"bool": {"must": [
                {"term": {"changeset.id12": rev[0:12]}},
                {"term": {"branch.name": revision.branch.name}},
                {"term": {"branch.locale": coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)}},
                {"range": {"etl.timestamp": {"gt": MIN_ETL_AGE}}}
            ]}},
            "size": 20
        }

    for attempt in range(3):
        try:
            with self.es_locker:
                docs = self.es.search(query).hits.hits

            if len(docs) == 0:
                return None
            best = docs[0]._source
            if len(docs) > 1:
                for d in docs:
                    if d._id.endswith(d._source.branch.locale):
                        best = d._source
                Log.warning("expecting no more than one document")
            return best
        except Exception as e:
            e = Except.wrap(e)
            if "EsRejectedExecutionException[rejected execution (queue capacity" in e:
                (Till(seconds=Random.int(30))).wait()
                continue
            else:
                Log.warning("Bad ES call, waiting for {{num}} seconds", num=WAIT_AFTER_NODE_FAILURE, cause=e)
                Till(seconds=WAIT_AFTER_NODE_FAILURE).wait()
                continue

    Log.warning("ES did not deliver, fall back to HG")
    return None
def send_email(self, from_address=None, to_address=None, subject=None, text_data=None, html_data=None):
    """Sends an email.

    from_addr is an email address; to_addrs is a list of email addresses.
    Addresses can be plain (e.g. "*****@*****.**") or with real names
    (e.g. "John Smith <*****@*****.**>").

    text_data and html_data are both strings.  You can specify one or both.
    If you specify both, the email will be sent as a MIME multipart
    alternative, i.e., the recipient will see the HTML content if his
    viewer supports it; otherwise he'll see the text content.
    """
    settings = self.settings

    from_address = coalesce(from_address, settings["from"], settings.from_address)
    to_address = listwrap(coalesce(to_address, settings.to_address, settings.to_addrs))

    if not from_address or not to_address:
        raise Exception("Both from_addr and to_addrs must be specified")
    if not text_data and not html_data:
        raise Exception("Must specify either text_data or html_data")

    if not html_data:
        msg = MIMEText(text_data)
    elif not text_data:
        msg = MIMEText(html_data, 'html')
    else:
        msg = MIMEMultipart('alternative')
        msg.attach(MIMEText(text_data, 'plain'))
        msg.attach(MIMEText(html_data, 'html'))

    msg['Subject'] = coalesce(subject, settings.subject)
    msg['From'] = from_address
    msg['To'] = ', '.join(to_address)

    if self.server:
        # CALL AS PART OF A SMTP SESSION
        self.server.sendmail(from_address, to_address, msg.as_string())
    else:
        # CALL AS STAND-ALONE
        with self:
            self.server.sendmail(from_address, to_address, msg.as_string())
def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        with SingleInstance(flavor_id=settings.args.filename):
            constants.set(settings.constants)
            settings.run_interval = Duration(settings.run_interval)
            for u in settings.utility:
                u.discount = coalesce(u.discount, 0)
                # MARKUP drives WITH EXPECTED device MAPPING
                num_ephemeral_volumes = ephemeral_storage[u.instance_type]["num"]
                for i, d in enumerate(d for d in u.drives if not d.device):
                    letter = convert.ascii2char(98 + num_ephemeral_volumes + i)
                    d.device = "/dev/xvd" + letter

            settings.utility = UniqueIndex(["instance_type"], data=settings.utility)
            instance_manager = new_instance(settings.instance)
            m = SpotManager(instance_manager, kwargs=settings)

            if ENABLE_SIDE_EFFECTS:
                m.update_spot_requests(instance_manager.required_utility())

            if m.watcher:
                m.watcher.join()
    except Exception as e:
        Log.warning("Problem with spot manager", cause=e)
    finally:
        Log.stop()
        MAIN_THREAD.stop()
def compileDuration2Term(edge):
    if edge.esscript:
        Log.error("edge script not supported yet")

    # IS THERE A LIMIT ON THE DOMAIN?
    numPartitions = len(edge.domain.partitions)
    value = edge.value
    if is_variable_name(value):
        value = "doc[\"" + value + "\"].value"

    ref = coalesce(edge.domain.min, edge.domain.max, durations.ZERO)
    nullTest = compileNullTest(edge)

    ms = edge.domain.interval.milli
    if edge.domain.interval.month > 0:
        ms = durations.YEAR.milli / 12 * edge.domain.interval.month

    partition2int = "Math.floor((" + value + "-" + value2MVEL(ref) + ")/" + ms + ")"
    partition2int = "((" + nullTest + ") ? " + numPartitions + " : " + partition2int + ")"

    def int2Partition(value):
        if Math.round(value) == numPartitions:
            return edge.domain.NULL
        return edge.domain.getPartByKey(ref.add(edge.domain.interval.multiply(value)))

    return Data(toTerm={"head": "", "body": partition2int}, fromTerm=int2Partition)
def assertAlmostEqualValue(test, expected, digits=None, places=None, msg=None, delta=None):
    """
    Snagged from unittest/case.py, then modified (Aug2014)
    """
    if expected is NULL:
        if test == None:  # pandas dataframes reject any comparison with an exception!
            return
        else:
            raise AssertionError(expand_template("{{test}} != {{expected}}", locals()))
    if expected == None:  # None has no expectations
        return
    if test == expected:
        # shortcut
        return

    if not is_number(expected):
        # SOME SPECIAL CASES, EXPECTING EMPTY CONTAINERS IS THE SAME AS EXPECTING NULL
        if is_list(expected) and len(expected) == 0 and test == None:
            return
        if is_data(expected) and not expected.keys() and test == None:
            return
        if test != expected:
            raise AssertionError(expand_template("{{test}} != {{expected}}", locals()))
        return

    num_param = 0
    if digits != None:
        num_param += 1
    if places != None:
        num_param += 1
    if delta != None:
        num_param += 1
    if num_param > 1:
        raise TypeError("specify only one of digits, places or delta")

    if digits is not None:
        with suppress_exception:
            diff = log10(abs(test - expected))
            if diff < digits:
                return

        standardMsg = expand_template("{{test}} != {{expected}} within {{digits}} decimal places", locals())
    elif delta is not None:
        if abs(test - expected) <= delta:
            return

        standardMsg = expand_template("{{test}} != {{expected}} within {{delta}} delta", locals())
    else:
        if places is None:
            places = 15

        with suppress_exception:
            diff = mo_math.log10(abs(test - expected))
            if diff < mo_math.ceiling(mo_math.log10(abs(test))) - places:
                return

        standardMsg = expand_template("{{test|json}} != {{expected|json}} within {{places}} places", locals())

    raise AssertionError(coalesce(msg, "") + ": (" + standardMsg + ")")
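# A minimal usage sketch for the tolerance parameters above; digits, places, and delta are
# mutually exclusive.  The import path is the one used by Stats2ZeroMoment in this collection.
from mo_testing.fuzzytestcase import assertAlmostEqualValue

assertAlmostEqualValue(3.14159, 3.1416, places=4)   # passes: equal to 4 significant places
assertAlmostEqualValue(10.0, 10.4, delta=0.5)       # passes: within an absolute difference of 0.5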
def fix(source_key, rownum, line, source, sample_only_filter, sample_size):
    """
    :param rownum:
    :param line:
    :param source:
    :param sample_only_filter:
    :param sample_size:
    :return: (row, no_more_data) TUPLE WHERE row IS {"value":<data structure>} OR {"json":<text line>}
    """
    value = json2value(line)

    if rownum == 0:
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(source_key, value, source)
        value = _fix(value)
        if sample_only_filter and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            row = {"value": value}
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        _shorten(source_key, value, source)
        value = _fix(value)
    elif '"resource_usage":' in line:
        value = _fix(value)

    row = {"value": value}
    return row, False
def query(self, sql, param=None):
    """
    RETURN LIST OF dicts
    """
    self._execute_backlog()
    try:
        old_cursor = self.cursor
        if not old_cursor:  # ALLOW NON-TRANSACTIONAL READS
            self.cursor = self.db.cursor()
            self.cursor.execute("SET TIME_ZONE='+00:00'")
            self.cursor.close()
            self.cursor = self.db.cursor()

        if param:
            sql = expand_template(sql, self.quote_param(param))
        sql = self.preamble + outdent(sql)
        if self.debug:
            Log.note("Execute SQL:\n{{sql}}", sql=indent(sql))

        self.cursor.execute(sql)

        columns = [utf8_to_unicode(d[0]) for d in coalesce(self.cursor.description, [])]
        fixed = [[utf8_to_unicode(c) for c in row] for row in self.cursor]
        result = convert.table2list(columns, fixed)

        if not old_cursor:  # CLEANUP AFTER NON-TRANSACTIONAL READS
            self.cursor.close()
            self.cursor = None

        return result
    except Exception as e:
        if isinstance(e, InterfaceError) or e.message.find("InterfaceError") >= 0:
            Log.error("Did you close the db connection?", e)
        Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
def convert(self, expr):
    """
    EXPAND INSTANCES OF name TO value
    """
    if expr is True or expr == None or expr is False:
        return expr
    elif Math.is_number(expr):
        return expr
    elif expr == ".":
        return "."
    elif is_variable_name(expr):
        return coalesce(self.dimensions[expr], expr)
    elif isinstance(expr, text_type):
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, Date):
        return expr
    elif isinstance(expr, QueryOp):
        return self._convert_query(expr)
    elif isinstance(expr, Mapping):
        if expr["from"]:
            return self._convert_query(expr)
        elif len(expr) >= 2:
            # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
            return wrap({name: self.convert(value) for name, value in expr.leaves()})
        else:
            # ASSUME SINGLE-CLAUSE EXPRESSION
            k, v = expr.items()[0]
            return converter_map.get(k, self._convert_bop)(self, k, v)
    elif isinstance(expr, (list, set, tuple)):
        return wrap([self.convert(value) for value in expr])
    else:
        return expr
def __init__(self, edge, query, limit):
    AggsDecoder.__init__(self, edge, query, limit)
    edge.allowNulls = False
    self.fields = edge.domain.dimension.fields
    self.domain = self.edge.domain
    self.domain.limit = Math.min(coalesce(self.domain.limit, query.limit, 10), MAX_LIMIT)
    self.parts = list()
def query(self, sql, param=None, stream=False, row_tuples=False): """ RETURN LIST OF dicts """ if not self.cursor: # ALLOW NON-TRANSACTIONAL READS Log.error("must perform all queries inside a transaction") self._execute_backlog() try: if param: sql = expand_template(sql, quote_param(param)) sql = self.preamble + outdent(sql) self.debug and Log.note("Execute SQL:\n{{sql}}", sql=indent(sql)) self.cursor.execute(sql) if row_tuples: if stream: result = self.cursor else: result = wrap(list(self.cursor)) else: columns = [utf8_to_unicode(d[0]) for d in coalesce(self.cursor.description, [])] if stream: result = (wrap({c: utf8_to_unicode(v) for c, v in zip(columns, row)}) for row in self.cursor) else: result = wrap([{c: utf8_to_unicode(v) for c, v in zip(columns, row)} for row in self.cursor]) return result except Exception as e: e = Except.wrap(e) if "InterfaceError" in e: Log.error("Did you close the db connection?", e) Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
def _convert_edge(self, edge):
    if is_text(edge):
        return Data(
            name=edge,
            value=edge,
            domain=self._convert_domain()
        )
    else:
        edge = wrap(edge)
        if not edge.name and not is_text(edge.value):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        if edge.value.__class__ in (Data, dict, list, FlatList) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = self._convert_domain()
            domain.dimension = Data(fields=edge.value)

            return Data(
                name=edge.name,
                allowNulls=False if edge.allowNulls is False else True,
                domain=domain
            )

        domain = self._convert_domain(edge.domain)
        return Data(
            name=coalesce(edge.name, edge.value),
            value=edge.value,
            range=edge.range,
            allowNulls=False if edge.allowNulls is False else True,
            domain=domain
        )
def _expand(template, seq):
    """
    seq IS TUPLE OF OBJECTS IN PATH ORDER INTO THE DATA TREE
    """
    if is_text(template):
        return _simple_expand(template, seq)
    elif is_data(template):
        # EXPAND LISTS OF ITEMS USING THIS FORM
        # {"from":from, "template":template, "separator":separator}
        template = wrap(template)
        assert template["from"], "Expecting template to have 'from' attribute"
        assert template.template, "Expecting template to have 'template' attribute"

        data = seq[-1][template["from"]]
        output = []
        for d in data:
            s = seq + (d,)
            output.append(_expand(template.template, s))
        return coalesce(template.separator, "").join(output)
    elif is_list(template):
        return "".join(_expand(t, seq) for t in template)
    else:
        if not _Log:
            _late_import()
        _Log.error("can not handle")
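# Hypothetical illustration of the {"from": ..., "template": ..., "separator": ...} form
# handled above (field names and data are made up for the example):
#
#   template = {"from": "items", "template": "{{name}}={{value}}", "separator": ", "}
#   seq = ({"items": [{"name": "a", "value": 1}, {"name": "b", "value": 2}]},)
#   _expand(template, seq)  ->  "a=1, b=2"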
def __init__(self, host, index, port=9200, type="log", max_size=1000, batch_size=100, kwargs=None): """ settings ARE FOR THE ELASTICSEARCH INDEX """ self.es = Cluster(kwargs).get_or_create_index( schema=mo_json.json2value(value2json(SCHEMA), leaves=True), limit_replicas=True, tjson=True, kwargs=kwargs ) self.batch_size = batch_size self.es.add_alias(coalesce(kwargs.alias, kwargs.index)) self.queue = Queue("debug logs to es", max=max_size, silent=True) self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3) self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE)) Thread.run("add debug logs to es", self._insert_loop)
def _worker(start):
    output = SchemaTree()
    root = parquet_schema_list[off.set]
    output.element = root
    max = start + coalesce(root.num_children, 0)

    if off.set == 0:
        if root.name not in ['.', 'schema', 'spark_schema', 'm', 'hive_schema', 'root']:  # some known root names
            Log.warning("first SchemaElement is given name {{name|quote}}, name is ignored", name=root.name)
        root.name = '.'
        root.repetition_type = REQUIRED

    while off.set < max:
        off.set += 1
        child = _worker(off.set)
        parent = output
        path = relative_field(child.element.name, root.name)

        # path = split_field(relative_field(child.element.name, root.name))
        # for i, p in enumerate(path[:-1]):
        #     new_parent = parent.more[p] = SchemaTree()
        #     new_parent.element = SchemaElement(
        #         name=concat_field(root.name, join_field(path[:i+1])),
        #         repetition_type=REQUIRED
        #     )
        #     parent = new_parent
        # parent.more[path[-1]] = child
        parent.more[path] = child
    return output
def read_settings(filename=None, defs=None):
    """
    :param filename: Force load a file
    :param defs: arguments you want to accept
    :param default_filename: A config file from an environment variable (a fallback config file, if no other provided)
    :return:
    """
    # READ SETTINGS
    defs = listwrap(defs)
    defs.append({
        "name": ["--config", "--settings", "--settings-file", "--settings_file"],
        "help": "path to JSON file with settings",
        "type": str,
        "dest": "filename",
        "default": None,
        "required": False
    })
    args = argparse(defs)

    args.filename = coalesce(filename, args.filename, "./config.json")
    settings_file = File(args.filename)
    if not settings_file.exists:
        Log.error("Can not read configuration file {{filename}}", {
            "filename": settings_file.abspath
        })
    settings = mo_json_config.get_file(settings_file)
    settings.args = args
    return settings
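# Example invocation (hypothetical script name), using one of the flag aliases defined above;
# when no flag is given, read_settings() falls back to ./config.json in the working directory:
#
#   python app.py --config=./my_settings.json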
def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
    global _elasticsearch
    if hasattr(self, "settings"):
        return

    from pyLibrary.queries.containers.list_usingPythonList import ListContainer
    from pyLibrary.env import elasticsearch as _elasticsearch

    self.settings = kwargs
    self.default_name = coalesce(name, alias, index)
    self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.es_metadata = Null
    self.last_es_metadata = Date.now() - OLD_METADATA

    self.meta = Data()
    table_columns = metadata_tables()
    column_columns = metadata_columns()
    self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
    self.meta.columns = ColumnList()
    self.meta.columns.insert(column_columns)
    self.meta.columns.insert(table_columns)
    # TODO: fix monitor so it does not bring down ES
    if ENABLE_META_SCAN:
        self.worker = Thread.run("refresh metadata", self.monitor)
    else:
        self.worker = Thread.run("refresh metadata", self.not_monitor)
    return
def _convert_edge(self, edge):
    if isinstance(edge, basestring):
        return Data(
            name=edge,
            value=edge,
            domain=self._convert_domain()
        )
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (Mapping, list)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = self._convert_domain()
            domain.dimension = Data(fields=edge.value)

            return Data(
                name=edge.name,
                allowNulls=False if edge.allowNulls is False else True,
                domain=domain
            )

        domain = self._convert_domain(edge.domain)
        return Data(
            name=coalesce(edge.name, edge.value),
            value=edge.value,
            range=edge.range,
            allowNulls=False if edge.allowNulls is False else True,
            domain=domain
        )
def __init__(self, conn=None, tuid_service=None, start_workers=True, new_table=False, kwargs=None):
    try:
        self.config = kwargs

        self.conn = conn if conn else sql.Sql(self.config.database.name)
        self.hg_cache = HgMozillaOrg(kwargs=self.config.hg_cache, use_cache=True) if self.config.hg_cache else Null

        self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService(
            kwargs=self.config.tuid, conn=self.conn, clogger=self
        )
        self.rev_locker = Lock()
        self.working_locker = Lock()

        if new_table:
            with self.conn.transaction() as t:
                t.execute("DROP TABLE IF EXISTS csetLog")

        self.init_db()
        self.next_revnum = coalesce(self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1)
        self.csets_todo_backwards = Queue(name="Clogger.csets_todo_backwards")
        self.deletions_todo = Queue(name="Clogger.deletions_todo")
        self.maintenance_signal = Signal(name="Clogger.maintenance_signal")

        if 'tuid' in self.config:
            self.config = self.config.tuid

        self.disable_backfilling = False
        self.disable_tipfilling = False
        self.disable_deletion = False
        self.disable_maintenance = False

        self.backfill_thread = None
        self.tipfill_thread = None
        self.deletion_thread = None
        self.maintenance_thread = None

        # Make sure we are filled before allowing queries
        numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
        if numrevs < MINIMUM_PERMANENT_CSETS:
            Log.note("Filling in csets to hold {{minim}} csets.", minim=MINIMUM_PERMANENT_CSETS)
            oldest_rev = 'tip'
            with self.conn.transaction() as t:
                tmp = t.query("SELECT min(revnum), revision FROM csetLog").data[0][1]
                if tmp:
                    oldest_rev = tmp
            self._fill_in_range(
                MINIMUM_PERMANENT_CSETS - numrevs,
                oldest_rev,
                timestamp=False
            )

        Log.note(
            "Table is filled with atleast {{minim}} entries.",
            minim=MINIMUM_PERMANENT_CSETS
        )

        if start_workers:
            self.start_workers()
    except Exception as e:
        Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))
def get_index(self, row):
    domain = self.edge.domain
    part = row[self.start]
    if part == None:
        return len(domain.partitions)

    f = coalesce(part.get('from'), part.get('key'))
    t = coalesce(part.get('to'), part.get('key'))
    if f == None or t == None:
        return len(domain.partitions)
    else:
        for p in domain.partitions:
            if p.min <= f < p.max:
                return p.dataIndex

    sample = part.copy
    sample.buckets = None
    Log.error("Expecting to find {{part}}", part=sample)
def read_db(self):
    """
    PULL SCHEMA FROM DATABASE, BUILD THE MODEL
    :return: None
    """
    # FIND ALL TABLES
    result = self.db.query("SELECT * FROM sqlite_master WHERE type='table' ORDER BY name")
    tables = wrap([{k: d[i] for i, k in enumerate(result.header)} for d in result.data])
    tables_found = False
    for table in tables:
        if table.name.startswith("__"):
            continue
        tables_found = True
        nested_path = [
            join_field(split_field(tab.name)[1:])
            for tab in jx.reverse(tables)
            if startswith_field(table.name, tab.name)
        ]
        self.add_table_to_schema(nested_path)

        # LOAD THE COLUMNS
        command = "PRAGMA table_info(" + quote_table(table.name) + ")"
        details = self.db.query(command)

        for cid, name, dtype, notnull, dfft_value, pk in details.data:
            if name.startswith("__"):
                continue
            cname, ctype = untyped_column(name)
            column = Column(
                names={np: relative_field(cname, np) for np in nested_path},
                type=coalesce(
                    ctype,
                    {
                        "TEXT": "string",
                        "REAL": "number",
                        "INTEGER": "integer"
                    }.get(dtype)
                ),
                nested_path=nested_path,
                es_column=name,
                es_index=table.name
            )

            self.add_column_to_schema(column)

    return tables_found
def new_instance(cls, log_type=None, settings=None):
    if settings["class"]:
        if settings["class"].startswith("logging.handlers."):
            from mo_logs.log_usingThread import StructuredLogger_usingThread
            from mo_logs.log_usingHandler import StructuredLogger_usingHandler

            return StructuredLogger_usingThread(StructuredLogger_usingHandler(settings))
        else:
            with suppress_exception:
                from mo_logs.log_usingLogger import make_log_from_settings

                return make_log_from_settings(settings)
            # OH WELL :(

    if log_type == "logger":
        from mo_logs.log_usingLogger import StructuredLogger_usingLogger
        return StructuredLogger_usingLogger(settings)
    if log_type == "file" or settings.file:
        return StructuredLogger_usingFile(settings.file)
    if log_type == "file" or settings.filename:
        return StructuredLogger_usingFile(settings.filename)
    if log_type == "console":
        from mo_logs.log_usingThread import StructuredLogger_usingThread
        return StructuredLogger_usingThread(StructuredLogger_usingStream(STDOUT))
    if log_type == "mozlog":
        from mo_logs.log_usingMozLog import StructuredLogger_usingMozLog
        return StructuredLogger_usingMozLog(STDOUT, coalesce(settings.app_name, settings.appname))
    if log_type == "stream" or settings.stream:
        from mo_logs.log_usingThread import StructuredLogger_usingThread
        return StructuredLogger_usingThread(StructuredLogger_usingStream(settings.stream))
    if log_type == "elasticsearch" or settings.stream:
        from mo_logs.log_usingElasticSearch import StructuredLogger_usingElasticSearch
        return StructuredLogger_usingElasticSearch(settings)
    if log_type == "email":
        from mo_logs.log_usingEmail import StructuredLogger_usingEmail
        return StructuredLogger_usingEmail(settings)
    if log_type == "ses":
        from mo_logs.log_usingSES import StructuredLogger_usingSES
        return StructuredLogger_usingSES(settings)
    if log_type.lower() in ["nothing", "none", "null"]:
        from mo_logs.log_usingNothing import StructuredLogger
        return StructuredLogger()

    Log.error("Log type of {{config|json}} is not recognized", config=settings)
def __init__(self, edge, query, limit):
    AggsDecoder.__init__(self, edge, query, limit)
    self.domain = edge.domain
    self.domain.limit = Math.min(coalesce(self.domain.limit, query.limit, 10), MAX_LIMIT)
    self.parts = list()
    self.key2index = {}
    self.computed_domain = False

    # WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM
    self.sorted = None
    edge_var = edge.value.vars()
    for s in query.sort:
        if not edge_var - s.value.vars():
            self.sorted = {1: "asc", -1: "desc"}[s.sort]
def __init__(self, instance_manager, disable_prices=False, kwargs=None):
    self.settings = kwargs
    self.instance_manager = instance_manager
    aws_args = dict(
        region_name=kwargs.aws.region,
        aws_access_key_id=unwrap(kwargs.aws.aws_access_key_id),
        aws_secret_access_key=unwrap(kwargs.aws.aws_secret_access_key)
    )
    self.ec2_conn = boto.ec2.connect_to_region(**aws_args)
    self.vpc_conn = boto.vpc.connect_to_region(**aws_args)
    self.price_locker = Lock()
    self.prices = None
    self.price_lookup = None
    self.no_capacity = {}
    self.no_capacity_file = File(kwargs.price_file).parent / "no capacity.json"
    self.done_making_new_spot_requests = Signal()
    self.net_new_locker = Lock()
    self.net_new_spot_requests = UniqueIndex(("id",))  # SPOT REQUESTS FOR THIS SESSION
    self.watcher = None
    self.active = None

    self.settings.uptime.bid_percentile = coalesce(self.settings.uptime.bid_percentile, self.settings.bid_percentile)
    self.settings.uptime.history = coalesce(Date(self.settings.uptime.history), DAY)
    self.settings.uptime.duration = coalesce(Duration(self.settings.uptime.duration), Date("5minute"))
    self.settings.max_percent_per_type = coalesce(self.settings.max_percent_per_type, 1)

    if ENABLE_SIDE_EFFECTS and instance_manager and instance_manager.setup_required():
        self._start_life_cycle_watcher()
    if not disable_prices:
        self.pricing()
def _range_composer(self, edge, domain, es_query, to_float, schema):
    # USE RANGES
    _min = coalesce(domain.min, MIN(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    output = Aggs()
    if edge.allowNulls:
        output.add(FilterAggs(
            "_missing",
            NotOp(AndOp([
                edge.value.exists(),
                GteOp([edge.value, Literal(to_float(_min))]),
                LtOp([edge.value, Literal(to_float(_max))])
            ]).partial_eval()),
            self
        ).add(es_query))

    if is_op(edge.value, Variable):
        calc = {"field": first(schema.leaves(edge.value.var)).es_column}
    else:
        calc = {"script": text(Painless[edge.value].to_es_script(schema))}
    calc['ranges'] = [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]

    return output.add(RangeAggs("_match", calc, self).add(es_query))
def __init__(self, host, index, port=9200, type="log", max_size=1000, batch_size=100, kwargs=None): """ settings ARE FOR THE ELASTICSEARCH INDEX """ self.es = Cluster(kwargs).get_or_create_index( schema=mo_json.json2value(value2json(SCHEMA), leaves=True), limit_replicas=True, tjson=True, kwargs=kwargs) self.batch_size = batch_size self.es.add_alias(coalesce(kwargs.alias, kwargs.index)) self.queue = Queue("debug logs to es", max=max_size, silent=True) self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3) self.es.settings.retry.sleep = Duration( coalesce(self.es.settings.retry.sleep, MINUTE)) Thread.run("add debug logs to es", self._insert_loop)
def gzip_wrapper(func, compress_lower_limit=None):
    compress_lower_limit = coalesce(compress_lower_limit, TOO_SMALL_TO_COMPRESS)

    def output(*args, **kwargs):
        response = func(*args, **kwargs)
        accept_encoding = flask.request.headers.get("Accept-Encoding", "")
        if "gzip" not in accept_encoding.lower():
            return response

        response.headers["Content-Encoding"] = "gzip"
        response.response = ibytes2icompressed(response.response)

        return response

    return output
def _normalize_window(window, schema=None):
    v = window.value
    try:
        expr = jx_expression(v)
    except Exception:
        expr = ScriptOp("script", v)

    return Data(
        name=coalesce(window.name, window.value),
        value=expr,
        edges=[_normalize_edge(e, schema) for e in listwrap(window.edges)],
        sort=_normalize_sort(window.sort),
        aggregate=window.aggregate,
        range=_normalize_range(window.range),
        where=_normalize_where(window.where, schema=schema)
    )
def __init__(self, type, expr, frum, schema, miss=None, many=False):
    self.simplified = True
    object.__init__(self)
    if miss not in [None, NULL, FALSE, TRUE, ONE, ZERO]:
        if frum.lang != miss.lang:
            Log.error("logic error")
    self.miss = coalesce(miss, FALSE)  # Expression that will return true/false to indicate missing result
    self.data_type = type
    self.expr = expr
    self.many = many  # True if script returns multi-value
    self.frum = frum  # THE ORIGINAL EXPRESSION THAT MADE expr
    self.schema = schema
def _open(self): """ DO NOT USE THIS UNLESS YOU close() FIRST""" try: self.db = connect(host=self.settings.host, port=self.settings.port, user=coalesce(self.settings.username, self.settings.user), passwd=coalesce(self.settings.password, self.settings.passwd), db=coalesce(self.settings.schema, self.settings.db), charset=u"utf8", use_unicode=True, ssl=coalesce(self.settings.ssl, None), cursorclass=cursors.SSCursor) except Exception, e: if self.settings.host.find("://") == -1: Log.error(u"Failure to connect to {{host}}:{{port}}", host=self.settings.host, port=self.settings.port, cause=e) else: Log.error( u"Failure to connect. PROTOCOL PREFIX IS PROBABLY BAD", e)
def __init__(self, name, max=None, silent=False, unique=False, allow_add_after_close=False):
    """
    max - LIMIT THE NUMBER IN THE QUEUE, IF TOO MANY add() AND extend() WILL BLOCK
    silent - SET True TO SUPPRESS COMPLAINTS WHEN THE READERS ARE TOO SLOW
    unique - SET True IF YOU WANT ONLY ONE INSTANCE IN THE QUEUE AT A TIME
    """
    self.name = name
    self.max = coalesce(max, 2 ** 10)
    self.silent = silent
    self.allow_add_after_close = allow_add_after_close
    self.unique = unique
    self.please_stop = Signal("stop signal for " + name)
    self.lock = Lock("lock for queue " + name)
    self.queue = deque()
    self.next_warning = time()  # FOR DEBUGGING
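# A minimal usage sketch for the constructor above; the import path is assumed (mo_threads),
# the payload strings are made up, and add()/extend()/pop_one() are the calls named in the
# docstring or used elsewhere in this collection.
from mo_threads import Queue

q = Queue("example work queue", max=2 ** 10, silent=True)
q.add("task-1")
q.extend(["task-2", "task-3"])
work = q.pop_one()   # returns a queued item, or None when the queue is empty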
def __init__(self, kwargs=None):
    self.settings = kwargs
    self.settings.exclude = set(self.settings.exclude)
    self.settings.show_foreign_keys = coalesce(self.settings.show_foreign_keys, True)

    self.all_nested_paths = None
    self.nested_path_to_join = None
    self.columns = None

    with Explanation("scan database", debug=DEBUG):
        self.db = MySQL(**kwargs.database)
        with self.db:
            with self.db.transaction():
                self._scan_database()
def fix(rownum, line, source, sample_only_filter, sample_size):
    value = json2value(line)

    if value._id.startswith(("tc.97", "96", "bb.27")):
        # AUG 24, 25 2017 - included full diff with repo; too big to index
        try:
            data = json2value(line)
            repo = data.repo
            repo.etl = None
            repo.branch.last_used = None
            repo.branch.description = None
            repo.branch.etl = None
            repo.branch.parent_name = None
            repo.children = None
            repo.parents = None
            if repo.changeset.diff or data.build.repo.changeset.diff:
                Log.error("no diff allowed")
            else:
                assertAlmostEqual(minimize_repo(repo), repo)
        except Exception as e:
            if CAN_NOT_DECODE_JSON in e:
                raise e
            data.repo = minimize_repo(repo)
            data.build.repo = minimize_repo(data.build.repo)
            line = value2json(data)
    else:
        pass

    if rownum == 0:
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        value = _fix(value)
        if sample_only_filter and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            row = {"value": value}
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        _shorten(value, source)
        value = _fix(value)
    elif line.find('"resource_usage":') != -1:
        value = _fix(value)

    row = {"value": value}
    return row, False
def _get_from_elasticsearch(self, revision, locale=None, get_diff=False):
    rev = revision.changeset.id
    query = {
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"term": {"changeset.id12": rev[0:12]}},
                {"term": {"branch.name": revision.branch.name}},
                {"term": {"branch.locale": coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)}},
                {"range": {"etl.timestamp": {"gt": MIN_ETL_AGE}}}
            ]}
        }},
        "size": 2000
    }

    for attempt in range(3):
        try:
            with self.es_locker:
                docs = self.es.search(query).hits.hits
            break
        except Exception as e:
            e = Except.wrap(e)
            if "NodeNotConnectedException" in e:
                # WE LOST A NODE, THIS MAY TAKE A WHILE
                (Till(seconds=Random.int(5 * 60))).wait()
                continue
            elif "EsRejectedExecutionException[rejected execution (queue capacity" in e:
                (Till(seconds=Random.int(30))).wait()
                continue
            else:
                Log.warning("Bad ES call, fall back to TH", cause=e)
                return None

    best = docs[0]._source
    if len(docs) > 1:
        for d in docs:
            if d._id.endswith(d._source.branch.locale):
                best = d._source
        Log.warning("expecting no more than one document")

    if not GET_DIFF and not get_diff:
        return best
    elif best.changeset.diff:
        return best
    elif not best.changeset.files:
        return best  # NOT EXPECTING A DIFF, RETURN IT ANYWAY
    else:
        return None
def wrap(query, schema=None):
    """
    NORMALIZE QUERY SO IT CAN STILL BE JSON
    """
    if isinstance(query, QueryOp) or query == None:
        return query

    query = wrap(query)

    output = QueryOp("from", None)
    output.format = query.format
    output.frum = wrap_from(query["from"], schema=schema)
    if not schema and isinstance(output.frum, Schema):
        schema = output.frum
    if not schema and hasattr(output.frum, "schema"):
        schema = output.frum.schema

    if query.select or isinstance(query.select, (Mapping, list)):
        output.select = _normalize_selects(query.select, query.frum, schema=schema)
    else:
        if query.edges or query.groupby:
            output.select = Data(name="count", value=jx_expression("."), aggregate="count", default=0)
        else:
            output.select = _normalize_selects(".", query.frum)

    if query.groupby and query.edges:
        Log.error("You can not use both the `groupby` and `edges` clauses in the same query!")
    elif query.edges:
        output.edges = _normalize_edges(query.edges, schema=schema)
        output.groupby = Null
    elif query.groupby:
        output.edges = Null
        output.groupby = _normalize_groupby(query.groupby, schema=schema)
    else:
        output.edges = Null
        output.groupby = Null

    output.where = _normalize_where(query.where, schema=schema)
    output.window = [_normalize_window(w) for w in listwrap(query.window)]
    output.having = None
    output.sort = _normalize_sort(query.sort)
    output.limit = Math.min(MAX_LIMIT, coalesce(query.limit, DEFAULT_LIMIT))
    if not Math.is_integer(output.limit) or output.limit < 0:
        Log.error("Expecting limit >= 0")

    output.isLean = query.isLean

    return output
def compileTime2Term(edge):
    """
    RETURN MVEL CODE THAT MAPS TIME AND DURATION DOMAINS DOWN TO AN INTEGER AND
    THE JAVASCRIPT THAT WILL TURN THAT INTEGER BACK INTO A PARTITION (INCLUDING NULLS)
    """
    if edge.esscript:
        Log.error("edge script not supported yet")

    # IS THERE A LIMIT ON THE DOMAIN?
    numPartitions = len(edge.domain.partitions)
    value = edge.value
    if is_variable_name(value):
        value = "doc[\"" + value + "\"].value"

    nullTest = compileNullTest(edge)
    ref = coalesce(edge.domain.min, edge.domain.max, datetime(2000, 1, 1))

    if edge.domain.interval.month > 0:
        offset = ref.subtract(ref.floorMonth(), durations.DAY).milli
        if offset > durations.DAY.milli * 28:
            offset = ref.subtract(ref.ceilingMonth(), durations.DAY).milli
        partition2int = "milli2Month(" + value + ", " + value2MVEL(offset) + ")"
        partition2int = "((" + nullTest + ") ? 0 : " + partition2int + ")"

        def int2Partition(value):
            if Math.round(value) == 0:
                return edge.domain.NULL

            d = datetime(str(value)[:4:], str(value)[-2:], 1)
            d = d.addMilli(offset)
            return edge.domain.getPartByKey(d)
    else:
        partition2int = "Math.floor((" + value + "-" + value2MVEL(ref) + ")/" + edge.domain.interval.milli + ")"
        partition2int = "((" + nullTest + ") ? " + numPartitions + " : " + partition2int + ")"

        def int2Partition(value):
            if Math.round(value) == numPartitions:
                return edge.domain.NULL
            return edge.domain.getPartByKey(ref.add(edge.domain.interval.multiply(value)))

    return Data(toTerm={"head": "", "body": partition2int}, fromTerm=int2Partition)
def _normalize_select_no_context(select, schema=None):
    """
    SAME NORMALIZE, BUT NO SOURCE OF COLUMNS
    """
    if not _Column:
        _late_import()

    if isinstance(select, basestring):
        select = Data(value=select)
    elif isinstance(select, Mapping) and len(select.keys()) == 0:
        return None
    else:
        select = wrap(select)

    output = select.copy()
    if not select.value:
        output.name = coalesce(select.name, select.aggregate)
        if output.name:
            output.value = jx_expression(".")
        else:
            return output
    elif isinstance(select.value, basestring):
        if select.value.endswith(".*"):
            output.name = coalesce(select.name, select.value[:-2], select.aggregate)
            output.value = LeavesOp("leaves", Variable(select.value[:-2]))
        else:
            if select.value == ".":
                output.name = coalesce(select.name, select.aggregate, ".")
                output.value = jx_expression(select.value)
            elif select.value == "*":
                output.name = coalesce(select.name, select.aggregate, ".")
                output.value = LeavesOp("leaves", Variable("."))
            else:
                output.name = coalesce(select.name, select.value, select.aggregate)
                output.value = jx_expression(select.value)
    elif isinstance(select.value, (int, float)):
        if not output.name:
            output.name = unicode(select.value)
        output.value = jx_expression(select.value)
    else:
        output.value = jx_expression(select.value)

    if not output.name:
        Log.error("expecting select to have a name: {{select}}", select=select)
    if output.name.endswith(".*"):
        Log.error("{{name|quote}} is invalid select", name=output.name)

    output.aggregate = coalesce(canonical_aggregates[select.aggregate].name, select.aggregate, "none")
    output.default = coalesce(select.default, canonical_aggregates[output.aggregate].default)
    return output
def query(self, sql, param=None, stream=False, row_tuples=False): """ RETURN LIST OF dicts """ if not self.cursor: # ALLOW NON-TRANSACTIONAL READS Log.error("must perform all queries inside a transaction") self._execute_backlog() try: if param: sql = expand_template(sql, self.quote_param(param)) sql = self.preamble + outdent(sql) if self.debug: Log.note("Execute SQL:\n{{sql}}", sql=indent(sql)) self.cursor.execute(sql) if row_tuples: if stream: result = self.cursor else: result = wrap(list(self.cursor)) else: columns = [ utf8_to_unicode(d[0]) for d in coalesce(self.cursor.description, []) ] if stream: result = (wrap( {c: utf8_to_unicode(v) for c, v in zip(columns, row)}) for row in self.cursor) else: result = wrap( [{c: utf8_to_unicode(v) for c, v in zip(columns, row)} for row in self.cursor]) return result except Exception as e: if isinstance( e, InterfaceError) or e.message.find("InterfaceError") >= 0: Log.error("Did you close the db connection?", e) Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
def update_local_database():
    # GET EVERYTHING WE HAVE SO FAR
    exists = summary_table.query({
        "select": ["id", "last_updated"],
        "where": {"and": [
            {"in": {"id": candidates.id}},
            {"exists": "num_pushes"}
        ]},
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data

    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.id) - set(exists.id))
    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [e for e in exists if e.last_updated < too_old.unix]
    Log.alert("{{num}} series are candidates for local update", num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(left(needs_update, coalesce(config.analysis.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series", num=len(limited_update))

    with Timer("Updating local database"):
        def loop(please_stop):
            while not please_stop:
                sig_id = limited_update.pop_one()
                if not sig_id:
                    return
                process(sig_id)

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
def to_sql(self, schema, not_null=False, boolean=False):
    defult = self.default.to_sql(schema)
    if len(self.terms) == 0:
        return defult
    defult = coalesce(defult[0].sql, SQL_NULL)
    sep = self.separator.to_sql(schema)[0].sql.s

    acc = []
    for t in self.terms:
        missing = t.missing().partial_eval()

        term = t.to_sql(schema, not_null=True)[0].sql
        if term.s:
            term_sql = term.s
        elif term.n:
            term_sql = "cast(" + term.n + " as text)"
        else:
            term_sql = SQL_CASE + SQL_WHEN + term.b + SQL_THEN + quote_value("true") + SQL_ELSE + quote_value("false") + SQL_END

        if is_op(missing, Variable):
            acc.append(SQL_EMPTY_STRING)
        elif missing:
            acc.append(
                SQL_CASE +
                SQL_WHEN + sql_iso(missing.to_sql(schema, boolean=True)[0].sql.b) +
                SQL_THEN + SQL_EMPTY_STRING +
                SQL_ELSE + sql_iso(sql_concat([sep, term_sql])) +
                SQL_END
            )
        else:
            acc.append(sql_concat([sep, term_sql]))

    expr_ = "substr(" + sql_concat(acc) + ", " + LengthOp(self.separator).to_sql(schema)[0].sql.n + "+1)"

    missing = self.missing()
    if not missing:
        return wrap([{"name": ".", "sql": {"s": expr_}}])
    else:
        return wrap([{
            "name": ".",
            "sql": {"s": (
                SQL_CASE +
                SQL_WHEN + "(" + missing.to_sql(schema, boolean=True)[0].sql.b + ")" +
                SQL_THEN + "(" + defult + ")" +
                SQL_ELSE + "(" + expr_ + ")" +
                SQL_END
            )}
        }])
def query(self, sql, param=None, stream=False, row_tuples=False): """ RETURN A LIST OF dicts :param sql: SQL TEMPLATE TO SEND :param param: PARAMETERS TO INJECT INTO SQL TEMPLATE :param stream: STREAM OUTPUT :param row_tuples: DO NOT RETURN dicts """ if not self.cursor: # ALLOW NON-TRANSACTIONAL READS Log.error("must perform all queries inside a transaction") self._execute_backlog() try: if isinstance(sql, SQL): sql = text(sql) if param: sql = expand_template(sql, quote_param(param)) sql = self.preamble + outdent(sql) self.debug and Log.note("Execute SQL:\n{{sql}}", sql=indent(sql)) self.cursor.execute(sql) if row_tuples: if stream: result = self.cursor else: result = wrap(list(self.cursor)) else: columns = tuple(utf8_to_unicode(d[0]) for d in coalesce(self.cursor.description, [])) def streamer(): for row in self.cursor: output = Data() for c, v in zip(columns, row): output[c] = v yield output if stream: result = streamer() else: result = wrap(streamer()) return result except Exception as e: e = Except.wrap(e) if "InterfaceError" in e: Log.error("Did you close the db connection?", e) Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
def compare_to_expected(query, result, expect, places):
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        assertAlmostEqual(set(result.header), set(expect.header))

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = zip(*zip(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(result.header))
        ))[1])[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            columns = zip(*unwrap(result.data))
            result.data = zip(*[columns[m] for m in mapping])

        if not query.sort:
            sort_table(result)
            sort_table(expect)
    elif result.meta.format == "list":
        if query["from"].startswith("meta."):
            pass
        else:
            query = QueryOp.wrap(query)

        if not query.sort:
            try:
                # result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(set(jx.get_columns(result.data, leaves=True)) | set(jx.get_columns(expect.data, leaves=True)), "name")
            except Exception:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges, query.groupby)) + data_columns

            if isinstance(expect.data, list):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception:
                    pass

            if isinstance(result.data, list):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception:
                    pass
def get_meta(self, key, conforming=True):
    """
    RETURN METADATA ON FILE IN BUCKET
    :param key: KEY, OR PREFIX OF KEY
    :param conforming: TEST IF THE KEY CONFORMS TO REQUIRED PATTERN
    :return: METADATA, IF UNIQUE, ELSE ERROR
    """
    try:
        metas = list(self.bucket.list(prefix=str(key)))
        metas = list_to_data([m for m in metas if text(m.name).find(".json") != -1])

        perfect = Null
        favorite = Null
        too_many = False
        error = None
        for m in metas:
            try:
                simple = strip_extension(m.key)
                if conforming:
                    self._verify_key_format(simple)
                if simple == key:
                    perfect = m
                    too_many = False
                if simple.startswith(key + ".") or simple.startswith(key + ":"):
                    if favorite and not perfect:
                        too_many = True
                    favorite = m
            except Exception as e:
                error = e

        if too_many:
            Log.error(
                "multiple keys in {{bucket}} with prefix={{prefix|quote}}: {{list}}",
                bucket=self.name,
                prefix=key,
                list=[k.name for k in metas],
            )
        if not perfect and error:
            Log.error("Problem with key request", error)
        return coalesce(perfect, favorite)
    except Exception as e:
        Log.error(
            READ_ERROR + " can not read {{key}} from {{bucket}}",
            key=key,
            bucket=self.bucket.name,
            cause=e,
        )
def __init__(
    self,
    alias,  # NAME OF THE ALIAS
    type=None,  # SCHEMA NAME, WILL HUNT FOR ONE IF None
    explore_metadata=True,  # IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
    debug=False,
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    kwargs=None
):
    self.debug = debug
    if self.debug:
        Log.alert("Elasticsearch debugging on {{index|quote}} is on", index=kwargs.index)
    if alias == None:
        Log.error("Alias can not be None")
    self.settings = kwargs
    self.cluster = Cluster(kwargs)

    if type == None:
        if not explore_metadata:
            Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata. Do not know what to do now.")

        if not self.settings.alias or self.settings.alias == self.settings.index:
            alias_list = self.cluster.get("/_alias")
            candidates = (
                [(name, i) for name, i in alias_list.items() if self.settings.index in i.aliases.keys()] +
                [(name, Null) for name, i in alias_list.items() if self.settings.index == name]
            )
            full_name = jx.sort(candidates, 0).last()[0]
            mappings = self.cluster.get("/" + full_name + "/_mapping")[full_name]
        else:
            mappings = self.cluster.get("/" + self.settings.index + "/_mapping")[self.settings.index]

        # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
        max_prop = -1
        for _type, mapping in mappings.mappings.items():
            if _type == "_default_":
                continue
            num_prop = len(mapping.properties.keys())
            if max_prop < num_prop:
                max_prop = num_prop
                self.settings.type = _type
                type = _type

        if type == None:
            Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index))

    self.path = "/" + alias + "/" + type
def to_bq(self, schema, not_null=False, boolean=False):
    default = self.default.to_bq(schema)
    if len(self.terms) == 0:
        return default
    default = coalesce(default[0].sql.s, SQL_NULL)
    sep = BQLang[self.separator].to_bq(schema)[0].sql.s

    acc = []
    for t in self.terms:
        t = BQLang[t]
        missing = t.missing().partial_eval()

        term = t.to_bq(schema, not_null=True)[0].sql
        if term.s:
            term_sql = term.s
        elif term.n:
            term_sql = "cast(" + term.n + " as text)"
        else:
            term_sql = (
                SQL_CASE + SQL_WHEN + term.b + SQL_THEN + quote_value("true") + SQL_ELSE + quote_value("false") + SQL_END
            )

        if isinstance(missing, TrueOp):
            acc.append(SQL_EMPTY_STRING)
        elif missing:
            acc.append(
                SQL_CASE +
                SQL_WHEN + sql_iso(missing.to_bq(schema, boolean=True)[0].sql.b) +
                SQL_THEN + SQL_EMPTY_STRING +
                SQL_ELSE + sql_iso(sql_concat_text([sep, term_sql])) +
                SQL_END
            )
        else:
            acc.append(sql_concat_text([sep, term_sql]))

    expr_ = "SUBSTR" + sql_iso(sql_list([
        sql_concat_text(acc),
        LengthOp(self.separator).to_bq(schema)[0].sql.n + SQL("+1"),
    ]))

    return BQLScript(
        expr=expr_,
        data_type=STRING,
        frum=self,
        miss=self.missing(),
        many=False,
        schema=schema,
    )
def command_loop(local):
    STDOUT.write(b'{"out":"ok"}\n')
    DEBUG and Log.note("python process running")

    file = File
    while not please_stop:
        line = STDIN.readline()
        try:
            command = json2value(line.decode('utf8'))
            DEBUG and Log.note("got {{command}}", command=command)

            if "import" in command:
                dummy = {}
                if is_text(command['import']):
                    exec ("from " + command['import'] + " import *", dummy, context)
                else:
                    exec ("from " + command['import']['from'] + " import " + ",".join(listwrap(command['import']['vars'])), dummy, context)
                STDOUT.write(DONE)
            elif "set" in command:
                for k, v in command.set.items():
                    context[k] = v
                STDOUT.write(DONE)
            elif "get" in command:
                STDOUT.write(value2json({"out": coalesce(local.get(command['get']), context.get(command['get']))}).encode('utf8'))
                STDOUT.write(b'\n')
            elif "stop" in command:
                STDOUT.write(DONE)
                please_stop.go()
            elif "exec" in command:
                if not is_text(command['exec']):
                    Log.error("exec expects only text")
                exec (command['exec'], context, local)
                STDOUT.write(DONE)
            else:
                for k, v in command.items():
                    if is_list(v):
                        exec ("_return = " + k + "(" + ",".join(map(value2json, v)) + ")", context, local)
                    else:
                        exec ("_return = " + k + "(" + ",".join(kk + "=" + value2json(vv) for kk, vv in v.items()) + ")", context, local)
                STDOUT.write(value2json({"out": local['_return']}).encode('utf8'))
                STDOUT.write(b'\n')
        except Exception as e:
            e = Except.wrap(e)
            STDOUT.write(value2json({"err": e}).encode('utf8'))
            STDOUT.write(b'\n')
        finally:
            STDOUT.flush()
def append_query(self, query_path, es_query):
    domain = self.domain
    domain_key = domain.key
    value = self.edge.value
    cnv = pull_functions[value.type]
    include = tuple(cnv(p[domain_key]) for p in domain.partitions)

    schema = self.schema
    exists = InOp([value, Literal(include)]).partial_eval()

    limit = coalesce(self.limit, len(domain.partitions))

    if is_op(value, Variable):
        es_field = first(schema.leaves(value.var)).es_column  # ALREADY CHECKED THERE IS ONLY ONE
        match = TermsAggs(
            "_match",
            {
                "field": es_field,
                "size": limit,
                "order": {"_term": self.sorted} if self.sorted else None
            },
            self
        )
    else:
        match = TermsAggs(
            "_match",
            {
                "script": text(Painless[value].to_es_script(schema)),
                "size": limit
            },
            self
        )
    output = Aggs().add(FilterAggs("_filter", exists, None).add(match.add(es_query)))

    if self.edge.allowNulls:
        # IF ALL NESTED COLUMNS ARE NULL, DOES THE FILTER PASS?
        # MISSING AT THE QUERY DEPTH
        # columns = schema[value.var]
        concat_inner = split_expression(NotOp(exists), self.query)
        for i, term in enumerate(concat_inner.terms):
            acc = es_query
            for nest in term.nests:
                if nest.where is not TRUE:
                    acc = NestedAggs(nest.path.var).add(FilterAggs("_missing" + text(i), nest.where, self).add(acc))
            output.add(acc)
    return output
def _convert_query(self, query):
    # if not isinstance(query["from"], Container):
    #     Log.error('Expecting from clause to be a Container')
    query = wrap(query)

    output = QueryOp(None)
    output["from"] = self._convert_from(query["from"])

    output.format = query.format

    if query.select:
        output.select = convert_list(self._convert_select, query.select)
    else:
        if query.edges or query.groupby:
            output.select = {"name": "count", "value": ".", "aggregate": "count", "default": 0}
        else:
            output.select = {"name": "__all__", "value": "*", "aggregate": "none"}

    if query.groupby and query.edges:
        Log.error("You can not use both the `groupby` and `edges` clauses in the same query!")
    elif query.edges:
        output.edges = convert_list(self._convert_edge, query.edges)
        output.groupby = None
    elif query.groupby:
        output.edges = None
        output.groupby = convert_list(self._convert_group, query.groupby)
    else:
        output.edges = []
        output.groupby = None

    output.where = self.convert(query.where)
    output.window = convert_list(self._convert_window, query.window)
    output.sort = self._convert_sort(query.sort)

    output.limit = coalesce(query.limit, DEFAULT_LIMIT)
    if not mo_math.is_integer(output.limit) or output.limit < 0:
        Log.error("Expecting limit >= 0")

    # DEPTH ANALYSIS - LOOK FOR COLUMN REFERENCES THAT MAY BE DEEPER THAN
    # THE from SOURCE IS.
    vars = get_all_vars(output, exclude_where=True)  # WE WILL EXCLUDE where VARIABLES
    for c in query.columns:
        if c.name in vars and len(c.nested_path) != 1:
            Log.error("This query, with variable {{var_name}} is too deep", var_name=c.name)

    return output
def parse_partition(part):
    for p in part.partitions:
        if part.index:
            p.index = part.index  # COPY INDEX DOWN
        parse_partition(p)
        p.value = coalesce(p.value, p.name)
        p.parent = part

    if not part.where:
        if len(part.partitions) > 100:
            Log.error("Must define an where on {{name}} there are too many partitions ({{num_parts}})", name=part.name, num_parts=len(part.partitions))

        # DEFAULT where IS THE UNION OF ALL CHILD FILTERS
        if part.partitions:
            part.where = {"or": part.partitions.where}
def wrap(query, container, namespace):
    """
    NORMALIZE QUERY SO IT CAN STILL BE JSON
    """
    if is_op(query, QueryOp) or query == None:
        return query

    query = wrap(query)
    table = container.get_table(query['from'])
    schema = table.schema
    output = QueryOp(
        frum=table,
        format=query.format,
        limit=mo_math.min(MAX_LIMIT, coalesce(query.limit, DEFAULT_LIMIT))
    )

    if query.select or isinstance(query.select, (Mapping, list)):
        output.select = _normalize_selects(query.select, query.frum, schema=schema)
    else:
        if query.edges or query.groupby:
            output.select = DEFAULT_SELECT
        else:
            output.select = _normalize_selects(".", query.frum)

    if query.groupby and query.edges:
        Log.error("You can not use both the `groupby` and `edges` clauses in the same query!")
    elif query.edges:
        output.edges = _normalize_edges(query.edges, limit=output.limit, schema=schema)
        output.groupby = Null
    elif query.groupby:
        output.edges = Null
        output.groupby = _normalize_groupby(query.groupby, limit=output.limit, schema=schema)
    else:
        output.edges = Null
        output.groupby = Null

    output.where = _normalize_where({"and": listwrap(query.where)}, schema=schema)
    output.window = [_normalize_window(w) for w in listwrap(query.window)]
    output.having = None
    output.sort = _normalize_sort(query.sort)
    if not mo_math.is_integer(output.limit) or output.limit < 0:
        Log.error("Expecting limit >= 0")

    output.isLean = query.isLean

    return output