def get_file(ref, url):
    from pyLibrary.env.files import File

    if ref.path.startswith("~"):
        home_path = os.path.expanduser("~")
        if os.sep == "\\":
            home_path = "/" + home_path.replace(os.sep, "/")
        if home_path.endswith("/"):
            home_path = home_path[:-1]
        ref.path = home_path + ref.path[1::]
    elif not ref.path.startswith("/"):
        # CONVERT RELATIVE TO ABSOLUTE
        if ref.path[0] == ".":
            num_dot = 1
            while ref.path[num_dot] == ".":
                num_dot += 1
            parent = url.path.rstrip("/").split("/")[:-num_dot]
            ref.path = "/".join(parent) + ref.path[num_dot:]
        else:
            parent = url.path.rstrip("/").split("/")[:-1]
            ref.path = "/".join(parent) + "/" + ref.path

    path = ref.path if os.sep != "\\" else ref.path[1::].replace("/", "\\")
    try:
        if DEBUG:
            _Log.note("reading file {{path}}", path=path)
        content = File(path).read()
    except Exception, e:
        content = None
        _Log.error("Could not read file {{filename}}", filename=path, cause=e)
def safe_size(source):
    """
    READ THE source UP TO SOME LIMIT, THEN COPY TO A FILE IF TOO BIG
    RETURN A str() OR A FileString()
    """
    if source is None:
        return None

    total_bytes = 0
    bytes = []
    b = source.read(MIN_READ_SIZE)
    while b:
        total_bytes += len(b)
        bytes.append(b)
        if total_bytes > MAX_STRING_SIZE:
            try:
                data = FileString(TemporaryFile())
                for bb in bytes:
                    data.write(bb)
                del bytes
                del bb
                b = source.read(MIN_READ_SIZE)
                while b:
                    total_bytes += len(b)
                    data.write(b)
                    b = source.read(MIN_READ_SIZE)
                data.seek(0)
                Log.note("Using file of size {{length}} instead of str()", length=total_bytes)
                return data
            except Exception, e:
                Log.error("Could not write file > {{num}} bytes", num=total_bytes, cause=e)
        b = source.read(MIN_READ_SIZE)
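# A minimal, self-contained sketch of the same spill-to-disk idea using only the
# standard library. This is illustration, not the library's implementation; the
# _MIN_READ/_MAX_STRING constants below are hypothetical stand-ins for
# MIN_READ_SIZE/MAX_STRING_SIZE.
from tempfile import TemporaryFile

_MIN_READ = 8192
_MAX_STRING = 10 * 1024 * 1024

def _safe_size_sketch(source):
    buffered = []
    total = 0
    chunk = source.read(_MIN_READ)
    while chunk:
        total += len(chunk)
        buffered.append(chunk)
        if total > _MAX_STRING:
            spill = TemporaryFile()
            spill.write(b"".join(buffered))
            for chunk in iter(lambda: source.read(_MIN_READ), b""):
                spill.write(chunk)
            spill.seek(0)
            return spill  # file-like object instead of one huge string
        chunk = source.read(_MIN_READ)
    return b"".join(buffered)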
def groupby_Multiset(data, min_size, max_size):
    # GROUP multiset BASED ON POPULATION OF EACH KEY, TRYING TO STAY IN min/max LIMITS
    if min_size == None:
        min_size = 0

    total = 0
    i = 0
    g = list()
    for k, c in data.items():
        if total < min_size or total + c < max_size:
            total += c
            g.append(k)
        elif total < max_size:
            yield (i, g)
            i += 1
            total = c
            g = [k]

        if total >= max_size:
            Log.error(
                "({{min}}, {{max}}) range is too strict given step of {{increment}}",
                min=min_size,
                max=max_size,
                increment=c
            )

    if g:
        yield (i, g)
def _convert_edge(self, edge):
    if isinstance(edge, basestring):
        return Dict(
            name=edge,
            value=edge,
            domain=self._convert_domain()
        )
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (Mapping, list)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = self._convert_domain()
            domain.dimension = Dict(fields=edge.value)

            return Dict(
                name=edge.name,
                allowNulls=False if edge.allowNulls is False else True,
                domain=domain
            )

        domain = self._convert_domain(edge.domain)
        return Dict(
            name=coalesce(edge.name, edge.value),
            value=edge.value,
            range=edge.range,
            allowNulls=False if edge.allowNulls is False else True,
            domain=domain
        )
def execute(self, requests):
    """
    RETURN A GENERATOR THAT HAS len(requests) RESULTS (ANY ORDER)
    EXPECTING requests TO BE A list OF dicts, EACH dict IS USED AS kwargs TO GIVEN functions
    """
    if not isinstance(requests, (list, tuple, GeneratorType, Iterable)):
        Log.error("Expecting requests to be a list or generator", stack_depth=1)
    else:
        requests = list(requests)

    # FILL QUEUE WITH WORK
    self.inbound.extend(requests)

    num = len(requests)

    def output():
        for i in xrange(num):
            result = self.outbound.pop()
            if "exception" in result:
                raise result["exception"]
            else:
                yield result["response"]

    if self.outbound is not None:
        return output()
    else:
        return
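# Hedged usage sketch (not from the original source): each request dict becomes the
# kwargs for one call of the worker function, and results come back in any order.
# The pool name "Multithread" and the worker signature are assumptions for illustration.
#
#     with Multithread(convert_file, threads=4) as pool:
#         for result in pool.execute([{"filename": f} for f in filenames]):
#             print result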
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    RETURN LIST OF (keys, values) PAIRS, WHERE
        keys IS THE SET OF key VALUES FOR THE GROUP
        values IS THE LIST OF ALL data THAT HAS THOSE keys
    contiguous - MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    """
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    if isinstance(data, Container):
        return data.groupby(keys)

    try:
        keys = listwrap(keys)
        get_key = jx_expression_to_function(keys)
        if not contiguous:
            data = sorted(data, key=get_key)

        def _output():
            for g, v in itertools.groupby(data, get_key):
                group = Dict()
                for k, gg in zip(keys, g):
                    group[k] = gg
                yield (group, wrap(v))

        return _output()
    except Exception, e:
        Log.error("Problem grouping", e)
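# A minimal, self-contained sketch of the grouping semantics above, using only the
# standard library (the wrap()/Dict() containers and jx expressions of the real
# function are omitted; _groupby_sketch is a hypothetical name for illustration):
import itertools

def _groupby_sketch(data, keys):
    get_key = lambda row: tuple(row.get(k) for k in keys)
    for g, v in itertools.groupby(sorted(data, key=get_key), get_key):
        yield dict(zip(keys, g)), list(v)

# list(_groupby_sketch([{"a": 1, "b": 2}, {"a": 1, "b": 3}], ["a"]))
# -> [({'a': 1}, [{'a': 1, 'b': 2}, {'a': 1, 'b': 3}])]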
def groupby_size(data, size):
    if hasattr(data, "next"):
        iterator = data
    elif hasattr(data, "__iter__"):
        iterator = data.__iter__()
    else:
        Log.error("do not know how to handle this type")

    done = DictList()

    def more():
        output = DictList()
        for i in range(size):
            try:
                output.append(iterator.next())
            except StopIteration:
                done.append(True)
                break
        return output

    # THIS IS LAZY
    i = 0
    while True:
        output = more()
        yield (i, output)
        if len(done) > 0:
            break
        i += 1
def get_index(self, row):
    if self.computed_domain:
        try:
            part = row[self.start]
            return self.domain.getIndexByKey(part["key"])
        except Exception, e:
            Log.error("problem", cause=e)
def process_test_result(source_key, source, destination, please_stop=None):
    path = key2path(source_key)
    destination.delete({"and": [
        {"term": {"etl.source.id": path[1]}},
        {"term": {"etl.source.source.id": path[0]}}
    ]})

    lines = source.read_lines()

    keys = []
    data = []
    for l in lines:
        record = convert.json2value(l)
        if record._id == None:
            continue
        record.result.crash_result = None  # TODO: Remove me after May 2015
        keys.append(record._id)
        data.append({
            "id": record._id,
            "value": record
        })
        record._id = None
    if data:
        try:
            destination.extend(data)
        except Exception, e:
            if "Can not decide on index by build.date" in e:
                if source.bucket.name == "ekyle-test-result":
                    # KNOWN CORRUPTION
                    # TODO: REMOVE LATER (today = Mar2015)
                    delete_list = source.bucket.keys(prefix=key_prefix(source_key))
                    for d in delete_list:
                        source.bucket.delete_key(d)
            Log.error("Can not add to sink", e)
def _decode(index, parent_path, path, name2index, expected_vars=NO_VARS):
    c, index = skip_whitespace(index)

    if not path:
        if c != b"[":
            # TREAT VALUE AS SINGLE-VALUE ARRAY
            yield _decode_token(index, c, parent_path, path, name2index, None, expected_vars)
        else:
            c, index = skip_whitespace(index)
            if c == b']':
                return  # EMPTY ARRAY

            while True:
                value, index = _decode_token(index, c, parent_path, path, name2index, None, expected_vars)
                c, index = skip_whitespace(index)
                if c == b']':
                    yield value, index
                    return
                elif c == b',':
                    c, index = skip_whitespace(index)
                    yield value, index
    else:
        if c != b'{':
            Log.error("Expecting all objects to at least have {{path}}", path=path[0])

        for j, i in _decode_object(index, parent_path, path, name2index, expected_vars=expected_vars):
            yield j, i
def _decode_token(index, c, full_path, path, name2index, destination, expected_vars):
    if c == b'{':
        if not expected_vars:
            index = jump_to_end(index, c)
            value = None
        elif expected_vars[0] == ".":
            json.mark(index - 1)
            index = jump_to_end(index, c)
            value = json_decoder(json.release(index).decode("utf8"))
        else:
            count = 0
            for v, i in _decode_object(index, full_path, path, name2index, destination, expected_vars=expected_vars):
                index = i
                value = v
                count += 1
            if count != 1:
                Log.error("Expecting object, nothing nested")
    elif c == b'[':
        if not expected_vars:
            index = jump_to_end(index, c)
            value = None
        else:
            json.mark(index - 1)
            index = jump_to_end(index, c)
            value = json_decoder(json.release(index).decode("utf8"))
    else:
        if expected_vars and expected_vars[0] == ".":
            value, index = simple_token(index, c)
        else:
            index = jump_to_end(index, c)
            value = None

    return value, index
def create(self):
    try:
        os.makedirs(self._filename)
    except Exception, e:
        from pyLibrary.debugs.logs import Log
        Log.error("Could not make directory {{dir_name}}", dir_name=self._filename, cause=e)
def simple_token(index, c):
    if c == b'"':
        json.mark(index - 1)
        while True:
            c = json[index]
            index += 1
            if c == b"\\":
                index += 1
            elif c == b'"':
                break
        return json_decoder(json.release(index).decode("utf8")), index
    elif c in b"{[":
        Log.error("Expecting a primitive value")
    elif c == b"t" and json.slice(index, index + 3) == "rue":
        return True, index + 3
    elif c == b"n" and json.slice(index, index + 3) == "ull":
        return None, index + 3
    elif c == b"f" and json.slice(index, index + 4) == "alse":
        return False, index + 4
    else:
        json.mark(index - 1)
        while True:
            c = json[index]
            if c in b',]}':
                break
            index += 1
        return float(json.release(index)), index
def write(self, data):
    if not self.parent.exists:
        self.parent.create()
    with open(self._filename, "wb") as f:
        if isinstance(data, list) and self.key:
            from pyLibrary.debugs.logs import Log
            Log.error("list of data and keys are not supported, encrypt before sending to file")

        if isinstance(data, list):
            pass
        elif isinstance(data, basestring):
            data = [data]
        elif hasattr(data, "__iter__"):
            pass

        for d in data:
            if not isinstance(d, unicode):
                from pyLibrary.debugs.logs import Log
                Log.error("Expecting unicode data only")
            if self.key:
                f.write(crypto.encrypt(d, self.key).encode("utf8"))
            else:
                f.write(d.encode("utf8"))
def __init__(self, filename, buffering=2 ** 14, suffix=None):
    """
    YOU MAY SET filename TO {"path":p, "key":k} FOR CRYPTO FILES
    """
    if filename == None:
        from pyLibrary.debugs.logs import Log
        Log.error("File must be given a filename")
    elif isinstance(filename, basestring):
        self.key = None
        if filename.startswith("~"):
            home_path = os.path.expanduser("~")
            if os.sep == "\\":
                home_path = home_path.replace(os.sep, "/")
            if home_path.endswith("/"):
                home_path = home_path[:-1]
            filename = home_path + filename[1::]
        self._filename = filename.replace(os.sep, "/")  # USE UNIX STANDARD
    else:
        self.key = convert.base642bytearray(filename.key)
        self._filename = "/".join(filename.path.split(os.sep))  # USE UNIX STANDARD

    while self._filename.find(".../") >= 0:
        # LET ... REFER TO GRANDPARENT, .... REFER TO GREAT-GRAND-PARENT, etc...
        self._filename = self._filename.replace(".../", "../../")
    self.buffering = buffering

    if suffix:
        self._filename = File.add_suffix(self._filename, suffix)
def latin12unicode(value):
    if isinstance(value, unicode):
        Log.error("can not convert unicode from latin1")
    try:
        return unicode(value.decode('iso-8859-1'))
    except Exception, e:
        Log.error("Can not convert {{value|quote}} to unicode", value=value)
def json2value(json_string, params={}, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    if isinstance(json_string, str):
        Log.error("only unicode json accepted")

    try:
        if flexible:
            # REMOVE """COMMENTS""", # COMMENTS, //COMMENTS, AND \n \r
            # DERIVED FROM https://github.com/jeads/datasource/blob/master/datasource/bases/BaseHub.py#L58
            json_string = re.sub(r"\"\"\".*?\"\"\"", r"\n", json_string, flags=re.MULTILINE)
            json_string = "\n".join(remove_line_comment(l) for l in json_string.split("\n"))
            # ALLOW DICTIONARY'S NAME:VALUE LIST TO END WITH COMMA
            json_string = re.sub(r",\s*\}", r"}", json_string)
            # ALLOW LISTS TO END WITH COMMA
            json_string = re.sub(r",\s*\]", r"]", json_string)

        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)

        try:
            value = wrap(json_decoder(unicode(json_string)))
        except Exception, e:
            Log.error("can not decode\n{{content}}", content=json_string, cause=e)

        if leaves:
            value = wrap_leaves(value)

        return value
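# Hedged usage sketch (not from the original source): flexible=True tolerates comments
# and trailing commas, and leaves=True expands dot-delimited keys into nested objects.
# A self-contained illustration of the leaves expansion (_wrap_leaves_sketch is a
# hypothetical helper, not the library's wrap_leaves):
def _wrap_leaves_sketch(flat):
    out = {}
    for dotted, value in flat.items():
        node = out
        parts = dotted.split(".")
        for p in parts[:-1]:
            node = node.setdefault(p, {})
        node[parts[-1]] = value
    return out

# _wrap_leaves_sketch({"build.date": 1423499581}) -> {"build": {"date": 1423499581}}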
def datetime2string(value, format="%Y-%m-%d %H:%M:%S"):
    try:
        return value.strftime(format)
    except Exception, e:
        from pyLibrary.debugs.logs import Log
        Log.error("Can not format {{value}} with {{format}}", value=value, format=format, cause=e)
def string2url(value):
    if isinstance(value, unicode):
        return "".join([_map2url[c] for c in unicode2latin1(value)])
    elif isinstance(value, str):
        return "".join([_map2url[c] for c in value])
    else:
        Log.error("Expecting a string")
def __init__(
    self,
    exchange,  # name of the Pulse exchange
    topic,  # message name pattern to subscribe to ('#' is wildcard)
    target=None,  # WILL BE CALLED WITH PULSE PAYLOADS AND ack() IF COMPLETED WITHOUT EXCEPTION
    target_queue=None,  # (aka self.queue) WILL BE FILLED WITH PULSE PAYLOADS
    host='pulse.mozilla.org',  # url to connect
    port=5671,  # tcp port
    user=None,
    password=None,
    vhost="/",
    start=0,  # USED AS STARTING POINT FOR ASSIGNING THE _meta.count ATTRIBUTE
    ssl=True,
    applabel=None,
    heartbeat=False,  # True to also get the Pulse heartbeat message
    durable=False,  # True to keep queue after shutdown
    serializer='json',
    broker_timezone='GMT',
    settings=None
):
    self.target_queue = target_queue
    self.pulse_target = target
    if (target_queue == None and target == None) or (target_queue != None and target != None):
        Log.error("Expecting a queue (for fast digesters) or a target (for slow digesters)")

    Thread.__init__(self, name="Pulse consumer for " + settings.exchange, target=self._worker)
    self.settings = settings
    settings.callback = self._got_result
    settings.user = coalesce(settings.user, settings.username)
    settings.applabel = coalesce(settings.applable, settings.queue, settings.queue_name)
    settings.topic = topic

    self.pulse = ModifiedGenericConsumer(settings, connect=True, **settings)
    self.count = coalesce(start, 0)
    self.start()
def _convert_in(op, term):
    if not term:
        Log.error("Expecting a term")
    if not isinstance(term, Mapping):
        Log.error("Expecting {{op}} to have dict value", op=op)
    var, val = term.items()[0]

    if isinstance(val, list):
        v2 = [vv for vv in val if vv != None]

        if len(v2) == 0:
            if len(val) == 0:
                return False
            else:
                return {"missing": {"field": var}}

        if len(v2) == 1:
            output = {"term": {var: v2[0]}}
        else:
            output = {"terms": {var: v2}}

        if len(v2) != len(val):
            output = {"or": [
                {"missing": {"field": var}},
                output
            ]}
        return output
    else:
        return {"term": term}
def __init__(self, **desc):
    Domain.__init__(self, **desc)
    self.type = "range"
    self.NULL = Null

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")

        parts = listwrap(self.partitions)
        for i, p in enumerate(parts):
            self.min = Math.min(self.min, p.min)
            self.max = Math.max(self.max, p.max)
            if p.dataIndex != None and p.dataIndex != i:
                Log.error("Expecting `dataIndex` to agree with the order of the parts")
            if p[self.key] == None:
                Log.error("Expecting all parts to have {{key}} as a property", key=self.key)
            p.dataIndex = i

        # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
        # (skip comparing a partition with itself, otherwise every part "overlaps")
        for p, q in itertools.product(parts, parts):
            if p is not q and p.min <= q.min and q.min < p.max:
                Log.error("partitions overlap!")

        self.partitions = parts
        return
    elif any([self.min == None, self.max == None, self.interval == None]):
        Log.error("Can not handle missing parameter")

    self.key = "min"
    self.partitions = wrap([
        {"min": v, "max": v + self.interval, "dataIndex": i}
        for i, v in enumerate(frange(self.min, self.max, self.interval))
    ])
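# Hedged illustration (not from the original source): with min=0, max=10, interval=5 and
# no explicit partitions, the constructor above would generate parts along the lines of
# (assuming frange() yields the interval start points):
#
#     [{"min": 0, "max": 5, "dataIndex": 0},
#      {"min": 5, "max": 10, "dataIndex": 1}]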
def execute(
    self,
    command,
    param=None,
    retry=True  # IF command FAILS, JUST THROW ERROR
):
    if param:
        command = expand_template(command, self.quote_param(param))

    output = None
    done = False
    while not done:
        try:
            with self.locker:
                if not self.connection:
                    self._connect()
                with Closer(self.connection.cursor()) as curs:
                    curs.execute(command)
                    if curs.rowcount >= 0:
                        output = curs.fetchall()
                self.connection.commit()
                done = True
        except Exception, e:
            try:
                self.connection.rollback()
                # TODO: FIGURE OUT WHY rollback() DOES NOT HELP
                self.connection.close()
            except Exception, f:
                pass
            self.connection = None
            self._connect()
            if not retry:
                Log.error("Problem with command:\n{{command|indent}}", command=command, cause=e)
def _get_from_elasticsearch(self, revision, locale=None):
    rev = revision.changeset.id
    query = {
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"prefix": {"changeset.id": rev[0:12]}},
                {"term": {"branch.name": revision.branch.name}},
                {"term": {"branch.locale": coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)}}
            ]}
        }},
        "size": 2000,
    }
    try:
        docs = self.es.search(query, timeout=120).hits.hits
        if len(docs) > 1:
            for d in docs:
                if d._id.endswith(d._source.branch.locale):
                    return d._source
            Log.warning("expecting no more than one document")

        return docs[0]._source
    except Exception, e:
        Log.warning("Bad ES call", e)
        return None
def read_settings(filename=None, defs=None):
    # READ SETTINGS
    if filename:
        settings_file = File(filename)
        if not settings_file.exists:
            Log.error("Can not file settings file {{filename}}", {
                "filename": settings_file.abspath
            })
        settings = ref.get("file:///" + settings_file.abspath)
        if defs:
            settings.args = argparse(defs)
        return settings
    else:
        defs = listwrap(defs)
        defs.append({
            "name": ["--settings", "--settings-file", "--settings_file"],
            "help": "path to JSON file with settings",
            "type": str,
            "dest": "filename",
            "default": "./settings.json",
            "required": False
        })
        args = argparse(defs)
        settings = ref.get("file://" + args.filename.replace(os.sep, "/"))
        settings.args = args
        return settings
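# Hedged usage sketch (not from the original source): typical startup code lets the
# --settings command-line flag pick the JSON file and merges any extra argument defs.
#
#     settings = read_settings(defs=[{"name": "--id", "help": "run id", "type": str}])
#     Log.start(settings.debug)   # settings.args carries the parsed command line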
def insert_list(self, table_name, records):
    if not records:
        return

    columns = set()
    for r in records:
        columns |= set(r.keys())
    columns = jx.sort(columns)

    try:
        self.execute(
            "DELETE FROM " + self.quote_column(table_name) + " WHERE _id IN {{ids}}",
            {"ids": self.quote_column([r["_id"] for r in records])}
        )

        command = \
            "INSERT INTO " + self.quote_column(table_name) + "(" + \
            ",".join([self.quote_column(k) for k in columns]) + \
            ") VALUES " + ",\n".join([
                "(" + ",".join([self.quote_value(r.get(k, None)) for k in columns]) + ")"
                for r in records
            ])
        self.execute(command)
    except Exception, e:
        Log.error("problem with insert", e)
def _convert_from(self, frum):
    if isinstance(frum, basestring):
        return Dict(name=frum)
    elif isinstance(frum, (Container, Query)):
        return frum
    else:
        Log.error("Expecting from clause to be a name, or a container")
def disconnect():
    with suppress_exception:
        self.target_queue.close()
        Log.note("stop put into queue")

    self.pulse.disconnect()
    Log.note("pulse listener was given a disconnect()")
def send(self, topic, message):
    """Publishes a pulse message to the proper exchange."""
    if not message:
        Log.error("Expecting a message")

    message._prepare()

    if not self.connection:
        self.connect()

    producer = Producer(
        channel=self.connection,
        exchange=Exchange(self.settings.exchange, type='topic'),
        routing_key=topic
    )

    # The message is actually a simple envelope format with a payload and
    # some metadata.
    final_data = Dict(
        payload=message.data,
        _meta=set_default({
            'exchange': self.settings.exchange,
            'routing_key': message.routing_key,
            'serializer': self.settings.serializer,
            'sent': time_to_string(datetime.datetime.now(timezone(self.settings.broker_timezone))),
            'count': self.count
        }, message.metadata)
    )

    producer.publish(jsons.scrub(final_data), serializer=self.settings.serializer)
    self.count += 1
def quote_value(self, value):
    """
    convert values to mysql code for the same
    mostly delegate directly to the mysql lib, but some exceptions exist
    """
    try:
        if value == None:
            return "NULL"
        elif isinstance(value, SQL):
            if not value.param:
                # value.template CAN BE MORE THAN A TEMPLATE STRING
                return self.quote_sql(value.template)
            param = {k: self.quote_sql(v) for k, v in value.param.items()}
            return expand_template(value.template, param)
        elif isinstance(value, basestring):
            return self.db.literal(value)
        elif isinstance(value, datetime):
            return "str_to_date('" + value.strftime("%Y%m%d%H%M%S") + "', '%Y%m%d%H%i%s')"
        elif hasattr(value, '__iter__'):
            return self.db.literal(json_encode(value))
        elif isinstance(value, Mapping):
            return self.db.literal(json_encode(value))
        elif Math.is_number(value):
            return unicode(value)
        else:
            return self.db.literal(value)
    except Exception, e:
        Log.error("problem quoting SQL", e)
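# Hedged illustration (not from the original source) of what the branches above produce,
# assuming a MySQLdb-style literal(); exact escaping depends on the driver:
#
#     quote_value(None)                      -> "NULL"
#     quote_value(42)                        -> "42"
#     quote_value(datetime(2015, 3, 1))      -> "str_to_date('20150301000000', '%Y%m%d%H%i%s')"
#     quote_value({"a": 1})                  -> the JSON string '{"a": 1}', quoted by literal()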
def __str__(self):
    Log.error("not implemented")
def remove(self, x):
    Log.error("not implemented")
def parse_columns(parent_path, esProperties):
    """
    RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT
    """
    columns = DictList()
    for name, property in esProperties.items():
        if parent_path:
            path = join_field(split_field(parent_path) + [name])
        else:
            path = name

        if property.type == "nested" and property.properties:
            # NESTED TYPE IS A NEW TYPE DEFINITION
            # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
            child_columns = deepcopy(parse_columns(path, property.properties))
            self_columns = deepcopy(child_columns)
            for c in self_columns:
                c.depth += 1
            columns.extend(self_columns)
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": "nested",
                "useSource": False
            })

            if path not in INDEX_CACHE:
                pp = split_field(parent_path)
                for i in qb.reverse(range(len(pp))):
                    c = INDEX_CACHE.get(join_field(pp[:i + 1]), None)
                    if c:
                        INDEX_CACHE[path] = c.copy()
                        break
                else:
                    Log.error("Can not find parent")

                INDEX_CACHE[path].name = path
            INDEX_CACHE[path].columns = child_columns
            continue

        if property.properties:
            child_columns = parse_columns(path, property.properties)
            columns.extend(child_columns)
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": "object",
                "useSource": False
            })

        if property.dynamic:
            continue
        if not property.type:
            continue
        if property.type == "multi_field":
            property.type = property.fields[name].type  # PULL DEFAULT TYPE
            # ITERATE (name, property) PAIRS; THE ORIGINAL `for i, n, p in enumerate(...)` WOULD NOT UNPACK
            for i, (n, p) in enumerate(property.fields.items()):
                if n == name:
                    # DEFAULT
                    columns.append({
                        "name": join_field(split_field(path)[1::]),
                        "type": p.type,
                        "useSource": p.index == "no"
                    })
                else:
                    columns.append({
                        "name": join_field(split_field(path)[1::]) + "\\." + n,
                        "type": p.type,
                        "useSource": p.index == "no"
                    })
            continue

        if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": property.type,
                "useSource": property.index == "no"
            })
            if property.index_name and name != property.index_name:
                columns.append({
                    "name": property.index_name,
                    "type": property.type,
                    "useSource": property.index == "no"
                })
        elif property.enabled == None or property.enabled == False:
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": "object",
                "useSource": True
            })
        else:
            Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=path)

    return columns
def compileEdges2Term(mvel_compiler, edges, constants):
    """
    TERMS ARE ALWAYS ESCAPED SO THEY CAN BE COMPOUNDED WITH PIPE (|)

    GIVE MVEL CODE THAT REDUCES A UNIQUE TUPLE OF PARTITIONS DOWN TO A UNIQUE TERM
    GIVE LAMBDA THAT WILL CONVERT THE TERM BACK INTO THE TUPLE
    RETURNS TUPLE OBJECT WITH "type" and "value" ATTRIBUTES.
    "type" CAN HAVE A VALUE OF "script", "field" OR "count"
    CAN USE THE constants (name, value pairs)
    """

    # IF THE QUERY IS SIMPLE ENOUGH, THEN DO NOT USE TERM PACKING
    edge0 = edges[0]

    if len(edges) == 1 and edge0.domain.type in ["set", "default"]:
        # THE TERM RETURNED WILL BE A MEMBER OF THE GIVEN SET
        def temp(term):
            return DictList([edge0.domain.getPartByKey(term)])

        if edge0.value and isKeyword(edge0.value):
            return Dict(field=edge0.value, term2parts=temp)
        elif COUNT(edge0.domain.dimension.fields) == 1:
            return Dict(field=edge0.domain.dimension.fields[0], term2parts=temp)
        elif not edge0.value and edge0.domain.partitions:
            script = mvel_compiler.Parts2TermScript(edge0.domain)
            return Dict(expression=script, term2parts=temp)
        else:
            return Dict(expression=mvel_compiler.compile_expression(edge0.value, constants), term2parts=temp)

    mvel_terms = []      # FUNCTION TO PACK TERMS
    fromTerm2Part = []   # UNPACK TERMS BACK TO PARTS
    for e in edges:
        domain = e.domain
        fields = domain.dimension.fields

        if not e.value and fields:
            code, decode = mvel_compiler.Parts2Term(e.domain)
            t = Dict(toTerm=code, fromTerm=decode)
        elif fields:
            Log.error("not expected")
        elif e.domain.type == "time":
            t = compileTime2Term(e)
        elif e.domain.type == "duration":
            t = compileDuration2Term(e)
        elif e.domain.type in domains.ALGEBRAIC:
            t = compileNumeric2Term(e)
        elif e.domain.type == "set" and not fields:
            def fromTerm(term):
                return e.domain.getPartByKey(term)

            code, decode = mvel_compiler.Parts2Term(e.domain)
            t = Dict(toTerm=code, fromTerm=decode)
        else:
            t = compileString2Term(e)

        if not t.toTerm.body:
            mvel_compiler.Parts2Term(e.domain)
            Log.unexpected("what?")

        fromTerm2Part.append(t.fromTerm)
        mvel_terms.append(t.toTerm.body)

    # REGISTER THE DECODE FUNCTION
    def temp(term):
        terms = term.split('|')
        output = DictList([t2p(t) for t, t2p in zip(terms, fromTerm2Part)])
        return output

    return Dict(expression=mvel_compiler.compile_expression("+'|'+".join(mvel_terms), constants), term2parts=temp)
def __delslice__(self, i, j):
    _Log.error(
        "Can not perform del on slice: modulo arithmetic was performed on the parameters. You can try using clear()"
    )
def select(self, key):
    if not _Log:
        _late_import()
    _Log.error("Not supported. Use `get()`")
def pop(self):
    Log.error("not implemented")
def extend(self, values):
    Log.error("not implemented")
def sub(self, value):
    if value != self.agg[0]:
        Log.error("Not a sliding window")
    self.agg = self.agg[1:]
def __getslice__(self, i, j):
    Log.error(
        "slicing is broken in Python 2.7: a[i:j] == a[i+len(a), j] sometimes. Use [start:stop:step]"
    )
        else:
            new_value = old_value.__class__(value)  # TRY TO MAKE INSTANCE OF SAME CLASS
    except Exception, e:
        old_value = None
        new_value = value

    try:
        setattr(obj, attr_name, new_value)
        return old_value
    except Exception, e:
        try:
            obj[attr_name] = new_value
            return old_value
        except Exception, f:
            from pyLibrary.debugs.logs import Log
            Log.error(PATH_NOT_FOUND)


def lower_match(value, candidates):
    return [v for v in candidates if v.lower() == value.lower()]


def wrap(v):
    type_ = _get(v, "__class__")

    if type_ is dict:
        m = Dict(v)
        return m
        # m = object.__new__(Dict)
        # object.__setattr__(m, "_dict", v)
        # return m
def append(self, val):
    Log.error("not implemented")
def pe_filter(filter, data, depth):
    """
    PARTIAL EVALUATE THE filter BASED ON data GIVEN
    """
    if filter is TRUE_FILTER:
        return True
    if filter is FALSE_FILTER:
        return False

    filter = wrap(filter)

    if filter["and"]:
        result = True
        output = DictList()
        for a in filter[u"and"]:
            f = pe_filter(a, data, depth)
            if f is False:
                result = False
            elif f is not True:
                output.append(f)
        if result and output:
            return {"and": output}
        else:
            return result
    elif filter["or"]:
        output = DictList()
        for o in filter[u"or"]:
            f = pe_filter(o, data, depth)
            if f is True:
                return True
            elif f is not False:
                output.append(f)
        if output:
            return {"or": output}
        else:
            return False
    elif filter["not"]:
        f = pe_filter(filter["not"], data, depth)
        if f is True:
            return False
        elif f is False:
            return True
        else:
            return {"not": f}
    elif filter.term or filter.eq:
        eq = coalesce(filter.term, filter.eq)
        result = True
        output = {}
        for col, val in eq.items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d != val:
                    result = False
            else:
                output[rest] = val

        if result and output:
            return {"term": output}
        else:
            return result
    elif filter.equal:
        a, b = filter["equal"]
        first_a, rest_a = parse_field(a, data, depth)
        first_b, rest_b = parse_field(b, data, depth)
        val_a = data[first_a]
        val_b = data[first_b]

        if not rest_a:
            if not rest_b:
                if val_a != val_b:
                    return False
                else:
                    return True
            else:
                return {"term": {rest_b: val_a}}
        else:
            if not rest_b:
                return {"term": {rest_a: val_b}}
            else:
                return {"equal": [rest_a, rest_b]}
    elif filter.terms:
        result = True
        output = {}
        for col, vals in filter["terms"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d not in vals:
                    result = False
            else:
                output[rest] = vals
        if result and output:
            return {"terms": output}
        else:
            return result
    elif filter.range:
        result = True
        output = {}
        for col, ranges in filter["range"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                for sign, val in ranges.items():
                    if sign in ("gt", ">") and d <= val:
                        result = False
                    if sign == "gte" and d < val:
                        result = False
                    if sign == "lte" and d > val:
                        result = False
                    if sign == "lt" and d >= val:
                        result = False
            else:
                output[rest] = ranges
        if result and output:
            return {"range": output}
        else:
            return result
    elif filter.missing:
        if isinstance(filter.missing, basestring):
            field = filter["missing"]
        else:
            field = filter["missing"]["field"]

        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d == None:
                return True
            return False
        else:
            return {"missing": rest}
    elif filter.prefix:
        result = True
        output = {}
        for col, val in filter["prefix"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d == None or not d.startswith(val):
                    result = False
            else:
                output[rest] = val
        if result and output:
            return {"prefix": output}
        else:
            return result
    elif filter.exists:
        if isinstance(filter["exists"], basestring):
            field = filter["exists"]
        else:
            field = filter["exists"]["field"]

        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d != None:
                return True
            return False
        else:
            return {"exists": rest}
    else:
        Log.error(u"Can not interpret esfilter: {{esfilter}}", {u"esfilter": filter})
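# Hedged illustration (not from the original source): pe_filter() returns True when the
# given data already satisfies the whole filter, False when it can never satisfy it, and
# otherwise a smaller residual filter covering only the clauses that refer to fields not
# yet available at this depth. For example, an {"and": [...]} where one clause is decided
# True by the data and another is undecided collapses to {"and": [<undecided clause>]}.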
def merge(self, agg):
    Log.error("Do not know how to handle")
            return output
        except Exception, e:
            Log.error("Expecting a dict with lists in codomain", e)
    else:
        try:
            # relation[d] is expected to be a list
            # return set(cod for d in data for cod in relation[d])
            output = set()
            for d in data:
                cod = relation(d)
                if cod == None:
                    continue
                output.add(cod)
            return output
        except Exception, e:
            Log.error("Expecting a dict with lists in codomain", e)
    return Null


def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")

    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if isinstance(field_name, Mapping) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
@app.route('/', defaults={'path': ''}, methods=['GET', 'POST'])
@app.route('/<path:path>', methods=['GET', 'POST'])
def catch_all(path):
    return Response(
        b"",
        status=400,
        headers={
            "access-control-allow-origin": "*",
            "content-type": "text/html"
        }
    )


if __name__ == "__main__":
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        # SETUP TREEHERDER CACHE
        hg = HgMozillaOrg(use_cache=True, settings=config.hg)
        th = TreeherderService(hg, settings=config.treeherder)
        app.add_url_rule('/treeherder', None, th.get_treeherder_job, methods=['GET'])

        HeaderRewriterFix(app, remove_headers=['Date', 'Server'])
        app.run(**config.flask)
    except Exception, e:
        Log.error("Serious problem with service construction! Shutdown!", cause=e)
class PersistentQueue(object):
    """
    THREAD-SAFE, PERSISTENT QUEUE

    CAN HANDLE MANY PRODUCERS, BUT THE pop(), commit() IDIOM CAN HANDLE ONLY ONE CONSUMER.

    IT IS IMPORTANT YOU commit() or close(), OTHERWISE NOTHING COMES OFF THE QUEUE
    """

    def __init__(self, _file):
        """
        file - USES FILE FOR PERSISTENCE
        """
        self.file = File.new_instance(_file)
        self.lock = Lock("lock for persistent queue using file " + self.file.name)
        self.please_stop = Signal()
        self.db = Dict()
        self.pending = []

        if self.file.exists:
            for line in self.file:
                try:
                    delta = convert.json2value(line)
                    apply_delta(self.db, delta)
                except:
                    pass
            if self.db.status.start == None:  # HAPPENS WHEN ONLY ADDED TO QUEUE, THEN CRASH
                self.db.status.start = 0
            self.start = self.db.status.start

            # SCRUB LOST VALUES
            lost = 0
            for k in self.db.keys():
                try:
                    if k != "status" and int(k) < self.start:
                        self.db[k] = None
                        lost += 1
                except Exception:
                    pass  # HAPPENS FOR self.db.status, BUT MAYBE OTHER PROPERTIES TOO
            if lost:
                Log.warning("queue file had {{num}} items lost", num=lost)

            if DEBUG:
                Log.note("Persistent queue {{name}} found with {{num}} items", name=self.file.abspath, num=len(self))
        else:
            self.db.status = Dict(
                start=0,
                end=0
            )
            self.start = self.db.status.start
            if DEBUG:
                Log.note("New persistent queue {{name}}", name=self.file.abspath)

    def _add_pending(self, delta):
        delta = wrap(delta)
        self.pending.append(delta)

    def _apply_pending(self):
        for delta in self.pending:
            apply_delta(self.db, delta)
        self.pending = []

    def __iter__(self):
        """
        BLOCKING ITERATOR
        """
        while not self.please_stop:
            try:
                value = self.pop()
                if value is not Thread.STOP:
                    yield value
            except Exception, e:
                Log.warning("Tell me about what happened here", cause=e)
        if DEBUG:
            Log.note("queue iterator is done")
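# Hedged usage sketch (not from the original source): the single-consumer pop()/commit()
# idiom the class docstring describes; add() and the shape of the work item are assumptions.
#
#     queue = PersistentQueue("./work_queue.json")
#     queue.add({"bucket": "ekyle-test-result", "key": "123.45"})
#     work = queue.pop()      # blocks until an item is available
#     process(work)
#     queue.commit()          # the persisted entry may now be scrubbed
#     queue.close()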
def assign(source, destination):
    try:
        destination[name] = source.get(f)
    except Exception, e:
        Log.error("{{value}} does not have {{field}} property", value=source, field=f, cause=e)
def add(self, record):
    if isinstance(record, list):
        Log.error("add() has changed to only accept one record, no lists")
    self.extend([record])
def transform(self, *args, **kwargs):
    Log.error("Not implemented")
["1.4.", "1.5."])): if item.index.status not in [200, 201]: Log.error( "{{num}} {{error}} while loading line into {{index}}:\n{{line}}", num=item.index.status, error=item.index.error, line=lines[i * 2 + 1], index=self.settings.index) else: Log.error("version not supported {{version}}", version=self.cluster.version) if self.debug: Log.note("{{num}} documents added", num=len(items)) except Exception, e: Log.error("problem sending to ES", e) # RECORDS MUST HAVE id AND json AS A STRING OR # HAVE id AND value AS AN OBJECT def add(self, record): if isinstance(record, list): Log.error("add() has changed to only accept one record, no lists") self.extend([record]) # -1 FOR NO REFRESH def set_refresh_interval(self, seconds): if seconds <= 0: interval = -1 else: interval = unicode(seconds) + "s"
def _post(self, path, **kwargs):
    url = self.settings.host + ":" + unicode(self.settings.port) + path

    try:
        wrap(kwargs).headers["Accept-Encoding"] = "gzip,deflate"

        if "data" in kwargs and not isinstance(kwargs["data"], str):
            Log.error("data must be utf8 encoded string")

        if self.debug:
            sample = kwargs.get("data", "")[:300]
            Log.note("{{url}}:\n{{data|indent}}", url=url, data=sample)

        response = http.post(url, **kwargs)
        if response.status_code not in [200, 201]:
            Log.error(response.reason + ": " + response.all_content)
        if self.debug:
            Log.note("response: {{response}}", response=utf82unicode(response.all_content)[:130])
        details = convert.json2value(utf82unicode(response.all_content))
        if details.error:
            Log.error(convert.quote2string(details.error))
        if details._shards.failed > 0:
            Log.error(
                "Shard failures {{failures|indent}}",
                failures="---\n".join(r.replace(";", ";\n") for r in details._shards.failures.reason)
            )
        return details
    except Exception, e:
        if url[0:4] != "http":
            suggestion = " (did you forget \"http://\" prefix on the host name?)"
        else:
            suggestion = ""

        if kwargs.get("data"):
            Log.error(
                "Problem with call to {{url}}" + suggestion + "\n{{body|left(10000)}}",
                url=url,
                body=kwargs["data"][0:10000] if self.debug else kwargs["data"][0:100],
                cause=e
            )
        else:
            Log.error("Problem with call to {{url}}" + suggestion, {"url": url}, e)
    try:
        py_result = strangman.stats.chisquare(f_obs, f_exp)
    except Exception, e:
        Log.error("problem with call", e)

    if DEBUG_STRANGMAN:
        from pyLibrary.testing.fuzzytestcase import assertAlmostEqualValue
        sp_result = scipy.stats.chisquare(np.array(f_obs), f_exp=np.array(f_exp))
        if not assertAlmostEqualValue(sp_result[0], py_result[0], digits=9) and assertAlmostEqualValue(sp_result[1], py_result[1], delta=1e-8):
            Log.error("problem with stats lib")

    return py_result


def Stats2ZeroMoment(stats):
    # MODIFIED FROM http://statsmodels.sourceforge.net/devel/_modules/statsmodels/stats/moment_helpers.html
    # ADDED count
    mc0, mc1, mc2, skew, kurt = stats.count, coalesce(stats.mean, 0), coalesce(stats.variance, 0), coalesce(stats.skew, 0), coalesce(stats.kurtosis, 0)

    mz0 = mc0
    mz1 = mc1 * mc0
    mz2 = (mc2 + mc1 * mc1) * mc0
    mc3 = coalesce(skew, 0) * (mc2 ** 1.5)  # 3rd central moment
def extend(self, records):
    """
    records - MUST HAVE FORM OF
        [{"value":value}, ... {"value":value}] OR
        [{"json":json}, ... {"json":json}]
        OPTIONAL "id" PROPERTY IS ALSO ACCEPTED
    """
    lines = []
    try:
        for r in records:
            id = r.get("id")
            if id == None:
                id = Random.hex(40)

            if "json" in r:
                # if id != coalesce(wrap(convert.json2value(r["json"])).value._id, id):
                #     Log.error("expecting _id to match")
                json = r["json"]
            elif "value" in r:
                # if id != coalesce(wrap(r).value._id, id):
                #     Log.error("expecting _id to match")
                json = convert.value2json(r["value"])
            else:
                json = None
                Log.error("Expecting every record given to have \"value\" or \"json\" property")

            lines.append('{"index":{"_id": ' + convert.value2json(id) + '}}')
            lines.append(json)
        del records

        if not lines:
            return

        try:
            data_bytes = "\n".join(lines) + "\n"
            data_bytes = data_bytes.encode("utf8")
        except Exception, e:
            Log.error("can not make request body from\n{{lines|indent}}", lines=lines, cause=e)

        response = self.cluster._post(
            self.path + "/_bulk",
            data=data_bytes,
            headers={"Content-Type": "text"},
            timeout=self.settings.timeout
        )
        items = response["items"]

        for i, item in enumerate(items):
            if self.cluster.version.startswith("0.90."):
                if not item.index.ok:
                    Log.error(
                        "{{error}} while loading line:\n{{line}}",
                        error=item.index.error,
                        line=lines[i * 2 + 1]
                    )
            elif any(map(self.cluster.version.startswith, ["1.4.", "1.5."])):
                if item.index.status not in [200, 201]:
                    Log.error(
                        "{{num}} {{error}} while loading line into {{index}}:\n{{line}}",
                        num=item.index.status,
                        error=item.index.error,
                        line=lines[i * 2 + 1],
                        index=self.settings.index
                    )
            else:
                Log.error("version not supported {{version}}", version=self.cluster.version)

        if self.debug:
            Log.note("{{num}} documents added", num=len(items))
    except Exception, e:
        Log.error("problem sending to ES", e)
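# Hedged illustration (not from the original source): the two record shapes extend()
# accepts, per the docstring above; records without an "id" get a random 40-char hex id.
#
#     index.extend([
#         {"id": "doc1", "value": {"name": "first", "count": 2}},    # serialized here
#         {"id": "doc2", "json": u'{"name": "second", "count": 3}'}  # already-encoded JSON
#     ])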
def _dispatch_work(self, source_block):
    """
    source_block POINTS TO THE bucket AND key TO PROCESS
    :return: False IF THERE IS NOTHING LEFT TO DO
    """
    source_keys = listwrap(coalesce(source_block.key, source_block.keys))

    if not isinstance(source_block.bucket, basestring):  # FIX MISTAKE
        source_block.bucket = source_block.bucket.bucket
    bucket = source_block.bucket
    work_actions = [w for w in self.settings.workers if w.source.bucket == bucket]
    if not work_actions:
        Log.note(
            "No worker defined for records from {{bucket}}, {{action}}.\n{{message|indent}}",
            bucket=source_block.bucket,
            message=source_block,
            action="skipping" if self.settings.keep_unknown_on_queue else "deleting"
        )
        return not self.settings.keep_unknown_on_queue

    for action in work_actions:
        try:
            source_key = unicode(source_keys[0])
            if len(source_keys) > 1:
                multi_source = action._source
                source = ConcatSources([multi_source.get_key(k) for k in source_keys])
                source_key = MIN(source_keys)  # USE THE SMALLEST OF THE COMBINED KEYS
            else:
                source = action._source.get_key(source_key)
                source_key = source.key

            Log.note(
                "Execute {{action}} on bucket={{source}} key={{key}}",
                action=action.name,
                source=source_block.bucket,
                key=source_key
            )

            if action.transform_type == "bulk":
                old_keys = set()
            else:
                old_keys = action._destination.keys(prefix=source_block.key)

            new_keys = set(action._transformer(
                source_key,
                source,
                action._destination,
                resources=self.resources,
                please_stop=self.please_stop
            ))

            # VERIFY KEYS
            if len(new_keys) == 1 and list(new_keys)[0] == source_key:
                pass  # ok
            else:
                etls = map(key2etl, new_keys)
                etls = qb.sort(etls, "id")
                for i, e in enumerate(etls):
                    if i != e.id:
                        Log.error("expecting keys to have dense order: {{ids}}", ids=etls.id)

            # VERIFY KEYS EXIST
            if hasattr(action._destination, "get_key"):
                for k in new_keys:
                    action._destination.get_key(k)

            for n in action._notify:
                for k in new_keys:
                    n.add(k)

            if action.transform_type == "bulk":
                continue

            # DUE TO BUGS THIS INVARIANT IS NOW BROKEN
            # TODO: FIGURE OUT HOW TO FIX THIS (CHANGE NAME OF THE SOURCE BLOCK KEY?)
            # for n in new_keys:
            #     if not n.startswith(source_key):
            #         Log.error("Expecting new keys ({{new_key}}) to start with source key ({{source_key}})", new_key=n, source_key=source_key)

            if not new_keys and old_keys:
                Log.alert(
                    "Expecting some new keys after etl of {{source_key}}, especially since there were old ones\n{{old_keys}}",
                    old_keys=old_keys,
                    source_key=source_key
                )
                continue
            elif not new_keys:
                Log.alert(
                    "Expecting some new keys after processing {{source_key}}",
                    old_keys=old_keys,
                    source_key=source_key
                )
                continue

            for k in new_keys:
                if len(k.split(".")) == 3 and action.destination.type != "test_result":
                    Log.error("two dots have not been needed yet, this is a consistency check")

            delete_me = old_keys - new_keys
            if delete_me:
                if action.destination.bucket == "ekyle-test-result":
                    for k in delete_me:
                        action._destination.delete_key(k)
                else:
                    Log.note("delete keys?\n{{list}}", list=sorted(delete_me))
                    # for k in delete_me:

            # WE DO NOT PUT KEYS ON WORK QUEUE IF ALREADY NOTIFYING SOME OTHER
            # AND NOT GOING TO AN S3 BUCKET
            if not action._notify and isinstance(action._destination, (aws.s3.Bucket, S3Bucket)):
                for k in old_keys | new_keys:
                    self.work_queue.add(Dict(
                        bucket=action.destination.bucket,
                        key=k
                    ))
        except Exception, e:
            if "Key {{key}} does not exist" in e:
                err = Log.warning
            elif "multiple keys in {{bucket}}" in e:
                err = Log.warning
                if source_block.bucket == "ekyle-test-result":
                    for k in action._source.list(prefix=key_prefix(source_key)):
                        action._source.delete_key(strip_extension(k.key))
            elif "expecting keys to have dense order" in e:
                err = Log.warning
                if source_block.bucket == "ekyle-test-result":
                    # WE KNOW OF THIS ETL MISTAKE, REPROCESS
                    self.work_queue.add({
                        "key": unicode(key_prefix(source_key)),
                        "bucket": "ekyle-pulse-logger"
                    })
            elif "Expecting a pure key" in e:
                err = Log.warning
            else:
                err = Log.error

            err(
                "Problem transforming {{action}} on bucket={{source}} key={{key}} to destination={{destination}}",
                {
                    "action": action.name,
                    "source": source_block.bucket,
                    "key": source_key,
                    "destination": coalesce(action.destination.name, action.destination.index)
                },
                e
            )
def chisquare(f_obs, f_exp):
    try:
        py_result = strangman.stats.chisquare(f_obs, f_exp)
    except Exception, e:
        Log.error("problem with call", e)
        hg = HgMozillaOrg(settings=settings.hg)
        resources = Dict(hg=dictwrap(hg))

        stopper = Signal()
        for i in range(coalesce(settings.param.threads, 1)):
            ETL(
                name="ETL Loop " + unicode(i),
                work_queue=settings.work_queue,
                resources=resources,
                workers=settings.workers,
                settings=settings.param,
                please_stop=stopper
            )

        Thread.wait_for_shutdown_signal(stopper, allow_exit=True)
    except Exception, e:
        Log.error("Problem with etl", e)
    finally:
        Log.stop()


def etl_one(settings):
    queue = Queue("temp work queue")
    queue.__setattr__(b"commit", Null)
    queue.__setattr__(b"rollback", Null)

    settings.param.wait_forever = False
    already_in_queue = set()
    for w in settings.workers:
        source = get_container(w.source)
        # source.settings.fast_forward = True
        if id(source) in already_in_queue:
            continue
        try:
def value(self):
    if self.num:
        Log.error("can not get value of with dimension")
    return self.cube