def get_raw_json(path):
    active_data_timer = Timer("total duration")
    body = flask.request.get_data()
    try:
        with active_data_timer:
            args = wrap(Data(**flask.request.args))
            limit = args.limit if args.limit else 10
            args.limit = None

            frum = wrap_from(path)
            result = jx.run(
                {
                    "from": path,
                    "where": {"eq": args},
                    "limit": limit,
                    "format": "list"
                },
                frum
            )

            if isinstance(result, Container):  # TODO: REMOVE THIS CHECK, jx SHOULD ALWAYS RETURN Containers
                result = result.format("list")

        result.meta.active_data_response_time = active_data_timer.duration

        response_data = convert.unicode2utf8(convert.value2json(result.data, pretty=True))
        Log.note("Response is {{num}} bytes", num=len(response_data))

        return Response(response_data, status=200)
    except Exception as e:
        e = Except.wrap(e)
        return _send_error(active_data_timer, body, e)
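# A minimal sketch of the request-to-query translation performed by get_raw_json()
# above; the path "unittest" and the build.branch property are hypothetical
# examples. A request such as
#
#     GET /json/unittest?build.branch=mozilla-inbound&limit=2
#
# has its query-string args (minus "limit") folded into an "eq" filter:
example_query = {
    "from": "unittest",
    "where": {"eq": {"build.branch": "mozilla-inbound"}},
    "limit": 2,
    "format": "list"
}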
def _get_queue(self, row):
    row = wrap(row)
    if row.json:
        row.value, row.json = convert.json2value(row.json), None
    timestamp = Date(self.rollover_field(wrap(row).value))
    if timestamp == None or timestamp < Date.today() - self.rollover_max:
        return Null

    rounded_timestamp = timestamp.floor(self.rollover_interval)
    queue = self.known_queues.get(rounded_timestamp.unix)
    if queue == None:
        candidates = jx.run({
            "from": self.cluster.get_aliases(),
            "where": {"regex": {"index": self.settings.index + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d"}},
            "sort": "index"
        })
        best = None
        for c in candidates:
            c = wrap(c)
            c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
            if timestamp > c.date:
                best = c
        if not best or rounded_timestamp > best.date:
            if rounded_timestamp < wrap(candidates[-1]).date:
                es = elasticsearch.Index(read_only=False, alias=best.alias, index=best.index, settings=self.settings)
            else:
                try:
                    es = self.cluster.create_index(create_timestamp=rounded_timestamp, settings=self.settings)
                    es.add_alias(self.settings.index)
                except Exception as e:
                    if "IndexAlreadyExistsException" not in e:
                        Log.error("Problem creating index", cause=e)
                    return self._get_queue(row)  # TRY AGAIN
        else:
            es = elasticsearch.Index(read_only=False, alias=best.alias, index=best.index, settings=self.settings)

        with suppress_exception:
            es.set_refresh_interval(seconds=60 * 10, timeout=5)

        self._delete_old_indexes(candidates)
        queue = self.known_queues[rounded_timestamp.unix] = es.threaded_queue(
            max_size=self.settings.queue_size,
            batch_size=self.settings.batch_size,
            silent=True
        )
    return queue
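# A stdlib-only sketch of the index-name convention _get_queue() depends on: the
# regex above requires a 15-character YYYYMMDD_HHMMSS suffix, and c.index[-15:]
# is parsed back into a date. The index name is hypothetical, and "%Y%m%d_%H%M%S"
# is an assumption about what elasticsearch.INDEX_DATE_FORMAT expands to.
from datetime import datetime

index_name = "unittest20180101_000000"               # hypothetical rollover index
suffix = index_name[-15:]                            # "20180101_000000"
created = datetime.strptime(suffix, "%Y%m%d_%H%M%S")
print(created)                                       # 2018-01-01 00:00:00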
def allowed_query(data):
    data = json2value(data)
    data.edges = None
    data.groupby = None
    data.select = {"value": "result.test", "aggregate": "cardinality"}
    data.format = "list"

    frum = wrap_from(data['from'])
    result = jx.run(data, frum=frum)

    if result.data < 10000:
        return True
    else:
        Log.error("not allowed to groupby result.test")
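# allowed_query() does not run the caller's query as given: it strips edges and
# groupby, then swaps the select for a single cardinality aggregate, so only the
# count of distinct result.test values is measured. A hypothetical incoming query
# and the query that actually runs:
incoming = '{"from": "unittest", "groupby": "result.test"}'   # hypothetical
actually_run = {
    "from": "unittest",
    "select": {"value": "result.test", "aggregate": "cardinality"},
    "format": "list"
}
# the original groupby is permitted only if that cardinality comes back under 10000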
def get(*args, **kwargs):
    body = kwargs.get("data")
    if not body:
        return wrap({"status_code": 400})

    text = convert.utf82unicode(body)
    text = replace_vars(text)
    data = convert.json2value(text)
    result = jx.run(data)
    output_bytes = convert.unicode2utf8(convert.value2json(result))
    return wrap({
        "status_code": 200,
        "all_content": output_bytes,
        "content": output_bytes
    })
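# A sketch of calling the test double above; the query body is a hypothetical
# example, passed as bytes because get() decodes it with utf82unicode(). The
# wrapped return value mimics an HTTP response object.
response = get(data=b'{"from": "unittest", "limit": 1}')
assert response.status_code == 200
print(response.content)   # UTF-8 JSON bytes of the jx.run() result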
def _get_queue(self, row):
    row = wrap(row)
    if row.json:
        row.value, row.json = mo_json.json2value(row.json), None
    timestamp = Date(self.rollover_field(wrap(row).value))
    if timestamp == None:
        return Null
    elif timestamp < Date.today() - self.rollover_max:
        return DATA_TOO_OLD

    rounded_timestamp = timestamp.floor(self.rollover_interval)
    with self.locker:
        queue = self.known_queues.get(rounded_timestamp.unix)
    if queue == None:
        candidates = jx.run({
            "from": self.cluster.get_aliases(),
            "where": {"regex": {"index": self.settings.index + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d"}},
            "sort": "index"
        })
        best = None
        for c in candidates:
            c = wrap(c)
            c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
            if timestamp > c.date:
                best = c
        if not best or rounded_timestamp > best.date:
            if rounded_timestamp < wrap(candidates[-1]).date:
                es = elasticsearch.Index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
            else:
                try:
                    es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                    es.add_alias(self.settings.index)
                except Exception as e:
                    if "IndexAlreadyExistsException" not in e:
                        Log.error("Problem creating index", cause=e)
                    return self._get_queue(row)  # TRY AGAIN
        else:
            es = elasticsearch.Index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

        with suppress_exception:
            es.set_refresh_interval(seconds=60 * 5, timeout=5)

        self._delete_old_indexes(candidates)
        threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
        with self.locker:
            queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
    return queue
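# A plain-stdlib sketch of the rollover bucketing used by both versions of
# _get_queue(): each record's timestamp is floored to the rollover interval, and
# one queue/index is kept per floored value. A weekly interval is assumed here
# purely for illustration.
from datetime import datetime, timedelta, timezone

def floor_to_week(ts):
    # analogous to timestamp.floor(self.rollover_interval) with a WEEK interval
    monday = ts - timedelta(days=ts.weekday())
    return monday.replace(hour=0, minute=0, second=0, microsecond=0)

known_queues = {}
ts = datetime(2018, 1, 4, 15, 30, tzinfo=timezone.utc)   # hypothetical record time
key = floor_to_week(ts).timestamp()                      # analogous to rounded_timestamp.unix
queue = known_queues.get(key)                            # one queue per rollover period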
def query(self, _query):
    try:
        query = QueryOp.wrap(_query, schema=self)

        for n in self.namespaces:
            query = n.convert(query)
        if self.typed:
            query = Typed().convert(query)

        for s in listwrap(query.select):
            if not aggregates1_4.get(s.aggregate):
                Log.error(
                    "ES can not aggregate {{name}} because {{aggregate|quote}} is not a recognized aggregate",
                    name=s.name,
                    aggregate=s.aggregate
                )

        frum = query["from"]
        if isinstance(frum, QueryOp):
            result = self.query(frum)
            q2 = query.copy()
            q2.frum = result
            return jx.run(q2)

        if is_deepop(self._es, query):
            return es_deepop(self._es, query)
        if is_aggsop(self._es, query):
            return es_aggsop(self._es, frum, query)
        if is_setop(self._es, query):
            return es_setop(self._es, query)
        if es09_setop.is_setop(query):
            return es09_setop.es_setop(self._es, None, query)
        if es09_aggop.is_aggop(query):
            return es09_aggop.es_aggop(self._es, None, query)
        Log.error("Can not handle")
    except Exception as e:
        e = Except.wrap(e)
        if "Data too large, data for" in e:
            http.post(self._es.cluster.path + "/_cache/clear")
            Log.error("Problem (Tried to clear Elasticsearch cache)", e)
        Log.error("problem", e)
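# The isinstance(frum, QueryOp) branch above is what makes nested queries work:
# when "from" is itself a query, the inner query is executed first and the outer
# query is re-run over its result with jx.run(). A hypothetical nested query:
nested = {
    "from": {
        "from": "unittest",                        # hypothetical index
        "select": ["build.branch", "result.ok"],
        "format": "list"
    },
    "groupby": "build.branch"
}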
def query(path):
    with CProfiler():
        try:
            with Timer("total duration") as query_timer:
                preamble_timer = Timer("preamble")
                with preamble_timer:
                    if flask.request.headers.get("content-length", "") in ["", "0"]:
                        # ASSUME A BROWSER HIT THIS POINT, SEND text/html RESPONSE BACK
                        return Response(
                            BLANK,
                            status=400,
                            headers={"Content-Type": "text/html"}
                        )
                    elif int(flask.request.headers["content-length"]) > QUERY_SIZE_LIMIT:
                        Log.error("Query is too large")

                    request_body = flask.request.get_data().strip()
                    text = convert.utf82unicode(request_body)
                    text = replace_vars(text, flask.request.args)
                    data = convert.json2value(text)
                    record_request(flask.request, data, None, None)
                    if data.meta.testing:
                        _test_mode_wait(data)

                translate_timer = Timer("translate")
                with translate_timer:
                    if data.sql:
                        data = parse_sql(data.sql)
                    frum = wrap_from(data['from'])
                    result = jx.run(data, frum=frum)

                    if isinstance(result, Container):  # TODO: REMOVE THIS CHECK, jx SHOULD ALWAYS RETURN Containers
                        result = result.format(data.format)

                save_timer = Timer("save")
                with save_timer:
                    if data.meta.save:
                        try:
                            result.meta.saved_as = save_query.query_finder.save(data)
                        except Exception as e:
                            Log.warning("Unexpected save problem", cause=e)

                result.meta.timing.preamble = Math.round(preamble_timer.duration.seconds, digits=4)
                result.meta.timing.translate = Math.round(translate_timer.duration.seconds, digits=4)
                result.meta.timing.save = Math.round(save_timer.duration.seconds, digits=4)
                result.meta.timing.total = "{{TOTAL_TIME}}"  # TIMING PLACEHOLDER

                with Timer("jsonification") as json_timer:
                    response_data = convert.unicode2utf8(convert.value2json(result))

            with Timer("post timer"):
                # IMPORTANT: WE WANT THE TIME OF THE JSON SERIALIZATION, AND HAVE IT IN THE JSON ITSELF.
                # WE CHEAT BY DOING A (HOPEFULLY FAST) STRING REPLACEMENT AT THE VERY END
                timing_replacement = (
                    b'"total": ' + str(Math.round(query_timer.duration.seconds, digits=4)) +
                    b', "jsonification": ' + str(Math.round(json_timer.duration.seconds, digits=4))
                )
                response_data = response_data.replace(b'"total": "{{TOTAL_TIME}}"', timing_replacement)
                Log.note("Response is {{num}} bytes in {{duration}}", num=len(response_data), duration=query_timer.duration)

                return Response(
                    response_data,
                    status=200,
                    headers={"Content-Type": result.meta.content_type}
                )
        except Exception as e:
            e = Except.wrap(e)
            return _send_error(query_timer, request_body, e)
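# A minimal stdlib sketch of the "{{TOTAL_TIME}}" trick used above: serialize with
# a placeholder, then patch the measured total in with a byte-level replace, so
# the cost of serialization itself can be reported inside the serialized JSON.
import json
import time

start = time.time()
result = {"data": [1, 2, 3], "timing": {"total": "{{TOTAL_TIME}}"}}
response_data = json.dumps(result).encode("utf8")
total = round(time.time() - start, 4)
response_data = response_data.replace(
    b'"total": "{{TOTAL_TIME}}"',
    b'"total": ' + str(total).encode("utf8")
)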
def query(self, query):
    query.frum = self.__iter__()
    output = jx.run(query)
    return output
def _get_spot_prices_from_aws(self):
    with Timer("Read pricing file"):
        try:
            content = File(self.settings.price_file).read()
            cache = convert.json2value(content, flexible=False, leaves=False)
        except Exception as e:
            cache = FlatList()

    most_recents = jx.run({
        "from": cache,
        "edges": ["instance_type", "availability_zone"],
        "select": {"value": "timestamp", "aggregate": "max"}
    })

    zones = self._get_valid_availability_zones()
    prices = set(cache)
    with Timer("Get pricing from AWS"):
        for instance_type in self.settings.utility.keys():
            for zone in zones:
                if cache:
                    most_recent = most_recents[{
                        "instance_type": instance_type,
                        "availability_zone": zone
                    }].timestamp
                    start_at = MAX([Date(most_recent), Date.today() - WEEK])
                else:
                    start_at = Date.today() - WEEK

                if DEBUG_PRICING:
                    Log.note(
                        "get pricing for {{instance_type}} starting at {{start_at}}",
                        instance_type=instance_type,
                        start_at=start_at
                    )

                next_token = None
                while True:
                    resultset = self.ec2_conn.get_spot_price_history(
                        product_description=coalesce(self.settings.product, "Linux/UNIX (Amazon VPC)"),
                        instance_type=instance_type,
                        availability_zone=zone,
                        start_time=start_at.format(ISO8601),
                        next_token=next_token
                    )
                    next_token = resultset.next_token

                    for p in resultset:
                        prices.add(wrap({
                            "availability_zone": p.availability_zone,
                            "instance_type": p.instance_type,
                            "price": p.price,
                            "product_description": p.product_description,
                            "region": p.region.name,
                            "timestamp": Date(p.timestamp).unix
                        }))

                    if not next_token:
                        break

    with Timer("Save prices to file"):
        new_prices = jx.filter(prices, {"gte": {"timestamp": {"date": "today-2day"}}})

        def stream():  # IT'S A LOT OF PRICES, STREAM THEM TO FILE
            prefix = "[\n"
            for p in new_prices:
                yield prefix
                yield convert.value2json(p)
                prefix = ",\n"
            yield "]"

        File(self.settings.price_file).write(stream())

    return prices
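# A plain-Python sketch of what the most_recents cube above provides: the maximum
# cached timestamp per (instance_type, availability_zone) pair, so each AWS call
# only requests history since the last fetch (and never more than a week back).
# The rows are hypothetical.
cache = [
    {"instance_type": "m3.xlarge", "availability_zone": "us-west-2a", "timestamp": 1514764800},
    {"instance_type": "m3.xlarge", "availability_zone": "us-west-2a", "timestamp": 1514851200},
]
most_recent = {}
for row in cache:
    key = (row["instance_type"], row["availability_zone"])
    most_recent[key] = max(most_recent.get(key, 0), row["timestamp"])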
def pricing(self):
    with self.price_locker:
        if self.prices:
            return self.prices

        prices = self._get_spot_prices_from_aws()
        now = Date.now()

        expressions.ALLOW_SCRIPTING = True
        with Timer("processing pricing data"):
            hourly_pricing = jx.run({
                "from": {
                    # AWS PRICING ONLY SENDS timestamp OF CHANGES, MATCH WITH NEXT INSTANCE
                    "from": prices,
                    "window": [
                        {
                            "name": "expire",
                            "value": {"coalesce": [{"rows": {"timestamp": 1}}, {"date": "eod"}]},
                            "edges": ["availability_zone", "instance_type"],
                            "sort": "timestamp"
                        },
                        {  # MAKE THIS PRICE EFFECTIVE INTO THE PAST, THIS HELPS SPREAD PRICE SPIKES OVER TIME
                            "name": "effective",
                            "value": {"sub": {"timestamp": self.settings.uptime.duration.seconds}}
                        }
                    ]
                },
                "edges": [
                    "availability_zone",
                    "instance_type",
                    {
                        "name": "time",
                        "range": {"min": "effective", "max": "expire", "mode": "inclusive"},
                        "allowNulls": False,
                        "domain": {
                            "type": "time",
                            "min": now.floor(HOUR) - self.settings.uptime.history,
                            "max": Date.now().floor(HOUR) + HOUR,
                            "interval": "hour"
                        }
                    }
                ],
                "select": [
                    {"value": "price", "aggregate": "max"},
                    {"aggregate": "count"}
                ],
                "where": {"gt": {"expire": now.floor(HOUR) - self.settings.uptime.history}},
                "window": [
                    {
                        "name": "current_price",
                        "value": "rows.last.price",
                        "edges": ["availability_zone", "instance_type"],
                        "sort": "time"
                    }
                ]
            }).data

            bid80 = jx.run({
                "from": hourly_pricing,
                "edges": [
                    {
                        "value": "availability_zone",
                        "allowNulls": False
                    },
                    {
                        "name": "type",
                        "value": "instance_type",
                        "allowNulls": False,
                        "domain": {"type": "set", "key": "instance_type", "partitions": self.settings.utility}
                    }
                ],
                "select": [
                    {"name": "price_80", "value": "price", "aggregate": "percentile", "percentile": self.settings.uptime.bid_percentile},
                    {"name": "max_price", "value": "price", "aggregate": "max"},
                    {"aggregate": "count"},
                    {"value": "current_price", "aggregate": "one"},
                    {"name": "all_price", "value": "price", "aggregate": "list"}
                ],
                "window": [
                    {"name": "estimated_value", "value": {"div": ["type.utility", "price_80"]}},
                    {"name": "higher_price", "value": lambda row, rownum, rows: find_higher(row.all_price, row.price_80)}
                    # TODO: SUPPORT {"from":"all_price", "where":{"gt":[".", "price_80"]}, "select":{"aggregate":"min"}}
                ]
            })

            output = jx.run({
                "from": bid80,
                "sort": {"value": "estimated_value", "sort": -1}
            })

            self.prices = wrap(output.data)
            self.price_lookup = UniqueIndex(("type.instance_type", "availability_zone"), data=self.prices)
        return self.prices
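# A stdlib sketch of the bid_percentile selection above: bid the 80th-percentile
# observed price per (zone, instance type), which outbids most of the recent price
# history without chasing spikes; estimated_value is then utility per dollar. The
# prices are hypothetical, and this nearest-rank percentile is only an
# approximation of whatever jx's "percentile" aggregate computes.
hourly = [0.031, 0.032, 0.030, 0.055, 0.031, 0.033, 0.032, 0.210, 0.031, 0.032]
hourly.sort()
bid_percentile = 0.80                                        # analogous to settings.uptime.bid_percentile
price_80 = hourly[int(bid_percentile * (len(hourly) - 1))]   # 0.033
estimated_value = 1.0 / price_80                             # utility/price, utility assumed 1.0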