def insert_list(self, table_name, records):
    if not records:
        return

    columns = set()
    for r in records:
        columns |= set(r.keys())
    columns = qb.sort(columns)

    try:
        self.execute(
            "DELETE FROM " + self.quote_column(table_name) + " WHERE _id IN {{ids}}",
            {"ids": self.quote_column([r["_id"] for r in records])}
        )
        command = \
            "INSERT INTO " + self.quote_column(table_name) + "(" + \
            ",".join([self.quote_column(k) for k in columns]) + \
            ") VALUES " + ",\n".join([
                "(" + ",".join([self.quote_value(r.get(k, None)) for k in columns]) + ")"
                for r in records
            ])
        self.execute(command)
    except Exception, e:
        Log.error("problem with insert", e)

def main(): """ CLEAR OUT KEYS FROM BUCKET BY RANGE, OR BY FILE """ settings = startup.read_settings(defs=[ { "name": ["--bucket"], "help": "bucket to scan", "type": str, "dest": "bucket", "required": True } ]) Log.start(settings.debug) source = Connection(settings.aws).get_bucket(settings.args.bucket) for k in qb.sort(source.keys()): try: data = source.read_bytes(k) if convert.ascii2unicode(data).find("2e2834fa7ecd8d3bb1ad49ec981fdb89eb4df95e18") >= 0: Log.note("Found at {{key}}", key=k) except Exception, e: Log.warning("Problem with {{key}}", key=k, cause=e) finally:
def query(self, _query):
    if not self.columns:
        self.columns = []
        alias_done = set()
        metadata = self._es.get_metadata()
        for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
            for _, properties in meta.mappings.items():
                columns = _parse_properties(index, properties.properties)
                for c in columns:
                    c.cube = index
                    c.property = c.name
                    c.name = None
                    c.useSource = None

                self.columns.extend(columns)
                for a in meta.aliases:
                    # ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
                    if a in alias_done:
                        continue
                    alias_done.add(a)
                    for c in columns:
                        self.columns.append(set_default({"cube": a}, c))  # ENSURE WE COPY

    return qb.run(set_default(
        {
            "from": self.columns,
            "sort": ["cube", "property"]
        },
        _query.as_dict()
    ))

def get_or_create_index(
    self,
    index,
    alias=None,
    schema=None,
    limit_replicas=None,
    settings=None
):
    from pyLibrary.queries import qb

    settings = deepcopy(settings)
    aliases = self.get_aliases()

    indexes = qb.sort([
        a
        for a in aliases
        if (a.alias == settings.index and settings.alias == None) or
        (re.match(re.escape(settings.index) + "\\d{8}_\\d{6}", a.index) and settings.alias == None) or
        (a.index == settings.index and (a.alias == None or a.alias == settings.alias))
    ], "index")

    if not indexes:
        output = self.create_index(settings=settings, schema=schema, limit_replicas=limit_replicas)
        return output
    elif indexes.last().alias != None:
        settings.alias = indexes.last().alias
        settings.index = indexes.last().index
    elif settings.alias == None:
        settings.alias = settings.index
        settings.index = indexes.last().index
    return Index(settings)

def get_schema(self, retry=True):
    if self.settings.explore_metadata:
        indices = self.cluster.get_metadata().indices
        if not self.settings.alias or self.settings.alias == self.settings.index:
            # PARTIALLY DEFINED settings
            candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
            index = qb.sort(candidates, 0).last()[1]
        else:
            # FULLY DEFINED settings
            index = indices[self.settings.index]

        if index == None and retry:
            # TRY AGAIN, JUST IN CASE
            self.cluster.cluster_state = None
            return self.get_schema(retry=False)

        properties = index.mappings[self.settings.type]

        # TODO: REMOVE THIS BUG CORRECTION
        if not properties and self.settings.type == "test_result":
            properties = index.mappings["test_results"]
        # DONE BUG CORRECTION

        if not properties:
            Log.error(
                "ElasticSearch index ({{index}}) does not have type ({{type}})",
                index=self.settings.index,
                type=self.settings.type
            )
        return properties
    else:
        mapping = self.cluster.get(self.path + "/_mapping")
        if not mapping[self.settings.type]:
            Log.error("{{index}} does not have type {{type}}", self.settings)
        return wrap({"mappings": mapping[self.settings.type]})

def _get_columns(self, table=None):
    # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
    alias_done = set()
    index = split_field(table)[0]
    query_path = split_field(table)[1:]
    metadata = self.default_es.get_metadata(index=index)
    for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
        for _, properties in meta.mappings.items():
            columns = _elasticsearch.parse_properties(index, None, properties.properties)
            columns = columns.filter(lambda r: not r.abs_name.startswith("other.") and not r.abs_name.startswith("previous_values.cf_"))  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            with Timer("upserting {{num}} columns", {"num": len(columns)}, debug=DEBUG):
                with self.columns.locker:
                    for c in columns:
                        # ABSOLUTE
                        c.table = join_field([index] + query_path)
                        self.upsert_column(c)

                        for alias in meta.aliases:
                            # ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
                            if alias in alias_done:
                                continue
                            alias_done.add(alias)

                            c = copy(c)
                            c.table = join_field([alias] + query_path)
                            self.upsert_column(c)

def get_columns(self, table):
    """
    RETURN METADATA COLUMNS
    """
    with self.columns.locker:
        columns = qb.sort(filter(lambda r: r.table == table, self.columns.data), "name")
        if columns:
            return columns

    self._get_columns(table=table)

    with self.columns.locker:
        columns = qb.sort(filter(lambda r: r.table == table, self.columns.data), "name")
        if columns:
            return columns

    # self._get_columns(table=table)
    Log.error("no columns for {{table}}", table=table)

def done_count(self): self.edge.domain = SimpleSetDomain( key="value", partitions=[{ "value": v, "dataIndex": i } for i, v in enumerate( qb.sort(self.edge.domain.partitions, [k for k, v in self.fields]))])
def get_metadata(self, index=None, force=False):
    with self.metadata_locker:
        if self.settings.explore_metadata:
            if not self._metadata or (force and index is None):
                response = self.get("/_cluster/state")
                self._metadata = wrap(response.metadata)
                self.cluster_state = wrap(self.get("/"))
                self.version = self.cluster_state.version.number
            elif index:  # UPDATE THE MAPPING FOR ONE INDEX ONLY
                response = self.get("/" + index + "/_mapping")
                if self.version.startswith("0.90."):
                    best = qb.sort(response.items(), 0).last()
                    self._metadata.indices[index].mappings = best[1]
                else:
                    self._metadata.indices[index].mappings = qb.sort(response.items(), 0).last()[1].mappings
                return Dict(indices={index: self._metadata.indices[index]})
        else:
            Log.error("Metadata exploration has been disabled")
        return self._metadata

def done_count(self):
    columns = map(unicode, range(len(self.fields)))
    parts = wrap([{unicode(i): p for i, p in enumerate(part)} for part in set(self.parts)])
    self.parts = None
    sorted_parts = qb.sort(parts, columns)

    self.edge.domain = self.domain = SimpleSetDomain(
        key="value",
        partitions=[
            {"value": tuple(v[k] for k in columns), "dataIndex": i}
            for i, v in enumerate(sorted_parts)
        ]
    )

def _get_best(self, settings):
    from pyLibrary.queries import qb

    aliases = self.get_aliases()
    indexes = qb.sort([
        a
        for a in aliases
        if (a.alias == settings.index and settings.alias == None) or
        (re.match(re.escape(settings.index) + r'\d{8}_\d{6}', a.index) and settings.alias == None) or
        (a.index == settings.index and (a.alias == None or a.alias == settings.alias))
    ], "index")
    return indexes.last()

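# FOR INTUITION: THE REGEX ABOVE MATCHES THE YYYYMMDD_HHMMSS-SUFFIXED NAMES THIS
# CODEBASE GIVES PHYSICAL INDEXES. A QUICK, SELF-CONTAINED CHECK OF THE PATTERN
# (THE INDEX NAMES BELOW ARE INVENTED FOR ILLUSTRATION):
import re

pattern = re.escape("unittest") + r'\d{8}_\d{6}'
print(bool(re.match(pattern, "unittest20150601_120000")))  # True
print(bool(re.match(pattern, "unittest_backup")))          # False
# qb.sort(..., "index") THEN ORDERS THE SURVIVORS BY NAME, SO .last() IS THE MOST
# RECENTLY CREATED INDEX (ZERO-PADDED TIMESTAMP SUFFIXES SORT LEXICALLY)
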
def _es_terms2(es, mvel, query):
    """
    WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value
    """

    # REQUEST VALUES IN FIRST DIMENSION
    q1 = query.copy()
    q1.edges = query.edges[0:1:]
    values1 = es_terms(es, mvel, q1).edges[0].domain.partitions.value

    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        for i, v in enumerate(values1):
            FromES.facets[s.name + "," + str(i)] = {
                "terms": {
                    "field": query.edges[1].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter": simplify_esfilter({"and": [
                    query.where,
                    {"term": {query.edges[0].value: v}}
                ]})
            }

    data = es09.util.post(es, FromES, query.limit)

    # UNION ALL TERMS FROM SECOND DIMENSION
    values2 = set()
    for k, f in data.facets.items():
        values2.update(f.terms.term)
    values2 = qb.sort(values2)
    term2index = {v: i for i, v in enumerate(values2)}
    query.edges[1].domain.partitions = DictList([{"name": v, "value": v} for v in values2])

    # MAKE CUBE
    output = {}
    dims = [len(values1), len(values2)]
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facetName, facet in data.facets.items():
        coord = facetName.split(",")
        s = [s for s in select if s.name == coord[0]][0]
        i1 = int(coord[1])

        for term in facet.terms:
            i2 = term2index[term.term]
            output[s.name][(i1, i2)] = term[aggregates[s.aggregate]]

    cube = Cube(query.select, query.edges, output)
    cube.query = query
    return cube

def push_to_es(settings, data, dirty):
    global destination

    if not destination:
        index = Index(settings.destination)
        destination = index.threaded_queue(batch_size=100)

    # PREP RECORDS FOR ES
    for bug_id in dirty:
        d = data.descendants[bug_id]
        if not d:
            continue

        p = data.parents[bug_id]
        c = data.children[bug_id]

        destination.add({"id": bug_id, "value": {
            "bug_id": bug_id,
            "parents": qb.sort(p),
            "children": qb.sort(c),
            "descendants": qb.sort(d),
            "etl": {"timestamp": Date.now().unix}
        }})

def pull_from_es(settings, destq, all_parents, all_children, all_descendants, work_queue):
    # LOAD PARENTS FROM ES
    for g, r in qb.groupby(qb.sort(work_queue), size=100):
        result = destq.query({
            "from": settings.destination.index,
            "select": "*",
            "where": {"terms": {"bug_id": r}}
        })
        for r in result.data:
            all_parents.extend(r.bug_id, listwrap(r.parents))
            all_children.extend(r.bug_id, listwrap(r.children))
            all_descendants.extend(r.bug_id, listwrap(r.descendants))

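# A NOTE ON THE BATCHING IDIOM ABOVE: qb.groupby(qb.sort(work_queue), size=100)
# APPEARS TO BE USED PURELY TO CHUNK THE SORTED QUEUE INTO GROUPS OF AT MOST 100
# IDS, SO EACH ES QUERY STAYS SMALL. A MINIMAL SKETCH OF THE SAME PATTERN IN
# PLAIN PYTHON (THE NAME AND THE YIELDED (group, batch) SHAPE ARE ASSUMPTIONS,
# NOT pyLibrary'S API):
def batched(items, size=100):
    # YIELD (group_index, batch) PAIRS OF AT MOST size ITEMS EACH
    items = sorted(items)
    for start in range(0, len(items), size):
        yield start // size, items[start:start + size]

# USAGE SKETCH: QUERY ES IN CHUNKS OF 100 bug_ids
# for g, r in batched(work_queue, size=100):
#     result = destq.query({"from": "...", "where": {"terms": {"bug_id": r}}})
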
def json_schema_to_markdown(schema):
    from pyLibrary.queries import qb

    def _md_code(code):
        return "`" + code + "`"

    def _md_italic(value):
        return "*" + value + "*"

    def _inner(schema, parent_name, indent):
        more_lines = []
        for k, v in schema.items():
            full_name = join_field(split_field(parent_name) + [k])
            details = indent + "* " + _md_code(full_name)
            if v.type:
                details += " - " + _md_italic(v.type)
            else:
                Log.error("{{full_name}} is missing type", full_name=full_name)
            if v.description:
                details += " " + v.description
            more_lines.append(details)

            if v.type in ["object", "array", "nested"]:
                more_lines.extend(_inner(v.properties, full_name, indent + "  "))
        return more_lines

    lines = []
    if schema.title:
        lines.append("#" + schema.title)

    lines.append(schema.description)
    lines.append("")

    for k, v in qb.sort(schema.properties.items(), 0):
        full_name = k
        if v.type in ["object", "array", "nested"]:
            lines.append("##" + _md_code(full_name) + " Property")
            if v.description:
                lines.append(v.description)
            lines.append("")
            if v.type in ["object", "array", "nested"]:
                lines.extend(_inner(v.properties, full_name, "  "))
        else:
            lines.append("##" + _md_code(full_name) + " (" + v.type + ")")
            if v.description:
                lines.append(v.description)

    return "\n".join(lines)

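# TO MAKE THE EXPECTED SHAPE CONCRETE, A SMALL HYPOTHETICAL SCHEMA AND THE
# MARKDOWN THE FUNCTION ABOVE WOULD EMIT FOR IT. THE SCHEMA CONTENT IS INVENTED,
# AND wrap() IS ASSUMED TO PROVIDE THE DOT-ACCESS (v.type, v.description) THE
# FUNCTION RELIES ON:
example_schema = wrap({
    "title": "Bug",
    "description": "A bug record.",
    "properties": {
        "id": {"type": "integer", "description": "unique id"},
        "flags": {
            "type": "nested",
            "description": "review flags",
            "properties": {"requestee": {"type": "string"}}
        }
    }
})

# json_schema_to_markdown(example_schema) SHOULD YIELD ROUGHLY:
#
# #Bug
# A bug record.
#
# ##`flags` Property
# review flags
#
#   * `flags.requestee` - *string*
# ##`id` (integer)
# unique id
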
def __init__(
    self,
    alias,  # NAME OF THE ALIAS
    type=None,  # SCHEMA NAME, WILL HUNT FOR ONE IF None
    explore_metadata=True,  # IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
    debug=False,
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    settings=None
):
    self.debug = debug
    if self.debug:
        Log.alert("Elasticsearch debugging on {{index|quote}} is on", index=settings.index)

    self.settings = settings
    self.cluster = Cluster(settings)

    if type == None:
        if not explore_metadata:
            Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata. Do not know what to do now.")

        indices = self.cluster.get_metadata().indices
        if not self.settings.alias or self.settings.alias == self.settings.index:
            candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
            index = qb.sort(candidates, 0).last()[1]
        else:
            index = indices[self.settings.index]

        # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
        max_prop = -1
        for _type, mapping in index.mappings.items():
            num_prop = len(mapping.properties.keys())
            if max_prop < num_prop:
                max_prop = num_prop
                self.settings.type = _type
                type = _type

        if type == None:
            Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index))

    self.path = "/" + alias + "/" + type

def __init__(
    self,
    alias,  # NAME OF THE ALIAS
    type=None,  # SCHEMA NAME, WILL HUNT FOR ONE IF None
    explore_metadata=True,  # IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
    debug=False,
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    settings=None
):
    self.debug = debug
    if self.debug:
        Log.alert("Elasticsearch debugging on {{index|quote}} is on", index=settings.index)

    self.settings = settings
    self.cluster = Cluster(settings)

    if type == None:
        if not explore_metadata:
            Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata. Do not know what to do now.")

        indices = self.cluster.get_metadata().indices
        if not self.settings.alias or self.settings.alias == self.settings.index:
            alias_list = self.cluster.get("/_alias/" + self.settings.index)
            candidates = [(name, i) for name, i in alias_list.items() if self.settings.index in i.aliases.keys()]
            full_name = qb.sort(candidates, 0).last()[0]
            index = self.cluster.get("/" + full_name + "/_mapping")[full_name]
        else:
            index = self.cluster.get("/" + self.settings.index + "/_mapping")[self.settings.index]

        # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
        max_prop = -1
        for _type, mapping in index.mappings.items():
            if _type == "_default_":
                continue
            num_prop = len(mapping.properties.keys())
            if max_prop < num_prop:
                max_prop = num_prop
                self.settings.type = _type
                type = _type

        if type == None:
            Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index))

    self.path = "/" + alias + "/" + type

def insert_list(self, table_name, records):
    if not records:
        return

    keys = set()
    for r in records:
        keys |= set(r.keys())
    keys = qb.sort(keys)

    try:
        command = \
            "INSERT INTO " + self.quote_column(table_name) + "(" + \
            ",".join([self.quote_column(k) for k in keys]) + \
            ") VALUES " + ",\n".join([
                "(" + ",".join([self.quote_value(r[k]) for k in keys]) + ")"
                for r in records
            ])
        self.execute(command)
    except Exception, e:
        Log.error("problem with record: {{record}}", record=records, cause=e)

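# FOR ILLUSTRATION, THE command STRING BUILT ABOVE FOR TWO RECORDS WOULD LOOK
# ROUGHLY LIKE THE FOLLOWING. EXACT QUOTING IS AN ASSUMPTION: quote_column IS
# TAKEN TO EMIT BACKTICK-QUOTED IDENTIFIERS AND quote_value SQL-ESCAPED LITERALS.
records = [
    {"a": 1, "b": "x"},
    {"a": 2, "b": None}
]
# INSERT INTO `my_table`(`a`,`b`) VALUES
# (1,'x'),
# (2,NULL)
#
# NOTE: r[k] RAISES KeyError IF A RECORD IS MISSING A KEY; THE DELETE-THEN-INSERT
# VARIANT EARLIER IN THIS SECTION USES r.get(k, None) AND SO TOLERATES RAGGED RECORDS
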
def get_schema(self, retry=True):
    if self.settings.explore_metadata:
        indices = self.cluster.get_metadata().indices
        if not self.settings.alias or self.settings.alias == self.settings.index:
            # PARTIALLY DEFINED settings
            candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
            # TODO: MERGE THE mappings OF ALL candidates, DO NOT JUST PICK THE LAST ONE
            index = "dummy value"
            schema = wrap({"properties": {}})
            for _, ind in qb.sort(candidates, {"value": 0, "sort": -1}):
                schema.properties = _merge_mapping(schema.properties, ind.mappings[self.settings.type].properties)
        else:
            # FULLY DEFINED settings
            index = indices[self.settings.index]
            schema = index.mappings[self.settings.type]

        if index == None and retry:
            # TRY AGAIN, JUST IN CASE
            self.cluster.cluster_state = None
            return self.get_schema(retry=False)

        # TODO: REMOVE THIS BUG CORRECTION
        if not schema and self.settings.type == "test_result":
            schema = index.mappings["test_results"]
        # DONE BUG CORRECTION

        if not schema:
            Log.error(
                "ElasticSearch index ({{index}}) does not have type ({{type}})",
                index=self.settings.index,
                type=self.settings.type
            )
        return schema
    else:
        mapping = self.cluster.get(self.path + "/_mapping")
        if not mapping[self.settings.type]:
            Log.error("{{index}} does not have type {{type}}", self.settings)
        return wrap({"mappings": mapping[self.settings.type]})

def get_schema(self, retry=True):
    if self.settings.explore_metadata:
        indices = self.cluster.get_metadata().indices
        if not self.settings.alias or self.settings.alias == self.settings.index:
            # PARTIALLY DEFINED settings
            candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
            # TODO: MERGE THE mappings OF ALL candidates, DO NOT JUST PICK THE LAST ONE
            index = "dummy value"
            schema = wrap({"_routing": {}, "properties": {}})
            for _, ind in qb.sort(candidates, {"value": 0, "sort": -1}):
                mapping = ind.mappings[self.settings.type]
                set_default(schema._routing, mapping._routing)
                schema.properties = _merge_mapping(schema.properties, mapping.properties)
        else:
            # FULLY DEFINED settings
            index = indices[self.settings.index]
            schema = index.mappings[self.settings.type]

        if index == None and retry:
            # TRY AGAIN, JUST IN CASE
            self.cluster.cluster_state = None
            return self.get_schema(retry=False)

        # TODO: REMOVE THIS BUG CORRECTION
        if not schema and self.settings.type == "test_result":
            schema = index.mappings["test_results"]
        # DONE BUG CORRECTION

        if not schema:
            Log.error(
                "ElasticSearch index ({{index}}) does not have type ({{type}})",
                index=self.settings.index,
                type=self.settings.type
            )
        return schema
    else:
        mapping = self.cluster.get(self.path + "/_mapping")
        if not mapping[self.settings.type]:
            Log.error("{{index}} does not have type {{type}}", self.settings)
        return wrap({"mappings": mapping[self.settings.type]})

def main(): """ CLEAR OUT KEYS FROM BUCKET BY RANGE, OR BY FILE """ try: settings = startup.read_settings(defs=[{ "name": ["--bucket"], "help": "bucket to reprocess", "type": str, "dest": "bucket", "required": True }, { "name": ["--begin", "--start"], "help": "lowest key (or prefix) to reprocess", "type": str, "dest": "start", "default": "1", "required": False }, { "name": ["--end", "--stop"], "help": "highest key (or prefix) to reprocess", "type": str, "dest": "end", "default": None, "required": False }, { "name": ["--file"], "help": "path to file with CR-delimited prefix list", "type": str, "dest": "file", "default": None, "required": False }]) Log.start(settings.debug) with aws.Queue(settings.work_queue) as work_queue: source = Connection(settings.aws).get_bucket(settings.args.bucket) if settings.args.file: now = Date.now() for prefix in File(settings.args.file): all_keys = source.keys(prefix=key_prefix(prefix)) for k in all_keys: Log.note("Adding {{key}}", key=k) work_queue.add({ "bucket": settings.args.bucket, "key": k, "timestamp": now.unix, "date/time": now.format() }) return if settings.args.end and settings.args.start: up_to = str(int(settings.args.end) - 1) prefix = strings.common_prefix(settings.args.start, up_to) else: prefix = None start = Version(settings.args.start) end = Version(settings.args.end) all_keys = source.keys(prefix=prefix) with Timer("filtering {{num}} keys", {"num": len(all_keys)}): all_keys = [(k, Version(k)) for k in all_keys if k.find("None") == -1] all_keys = [(k, p) for k, p in all_keys if start <= p < end] with Timer("sorting {{num}} keys", {"num": len(all_keys)}): all_keys = qb.sort(all_keys, 1) for k, p in all_keys: Log.note("Adding {{key}}", key=k) now = Date.now() work_queue.add({ "bucket": settings.args.bucket, "key": k, "timestamp": now.unix, "date/time": now.format() }) except Exception, e: Log.error("Problem with etl", e)
"ordering": -1, "stats": geo_mean(total) } ) new_records.append(new_record) # ADD RECORD FOR GRAPH SERVER SUMMARYh new_record = Dict( machine=r.machine, treeherder=r.treeherder, run=r.run, build=r.build, result={ "test_name": "summary_old", "ordering": -1, "stats": Stats(samples=qb.sort(total.mean)[:len(total)-1:]) } ) new_records.append(new_record) return new_records except Exception, e: Log.error("Transformation failure on id={{uid}}", {"uid": uid}, e) def stats(values): """ RETURN LOTS OF AGGREGATES """ if values == None: return None
    # EVERYTHING FROM S3
    bucket = s3.Bucket(settings.source)
    prefixes = [p.name.rstrip(":") for p in bucket.list(prefix="", delimiter=":")]
    in_s3 = []
    for i, p in enumerate(prefixes):
        if i % 1000 == 0:
            Log.note("Scrubbed {{p|percent(decimal=1)}}", p=i / len(prefixes))
        try:
            if int(p) not in in_es:
                in_s3.append(int(p))
            else:
                pass
        except Exception, _:
            Log.note("delete key {{key}}", key=p)
            bucket.delete_key(strip_extension(p))
    in_s3 = qb.reverse(qb.sort(in_s3))
    return in_s3


def main():
    try:
        settings = startup.read_settings(defs=[
            {
                "name": ["--id"],
                "help": "id (prefix, really) to process",
                "type": str,
                "dest": "id",
                "required": False
            }
        ])
        constants.set(settings.constants)

    source = aws.s3.Bucket(settings=settings.source)
    destination = elasticsearch.Cluster(settings=settings.destination).get_or_create_index(settings=settings.destination)

    keep_trying = True
    while keep_trying:
        try:
            all_keys = source.keys()
            keep_trying = False
        except Exception, e:
            Log.warning("problem", e)

    # all_keys = set()
    # for i in range(20, 97, 1):
    #     all_keys |= source.keys(prefix=unicode(i))

    for k in qb.sort(all_keys):
        try:
            pulse_block_to_es.process(k, source.get_key(k), destination)
        except Exception, e:
            Log.warning("Problem with {{key}}", key=k, cause=e)


def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        backfill(settings)
    except Exception, e:
        Log.error("Problem with backfill", e)
    finally:
        Log.stop()

def done_count(self):
    self.edge.domain = self.domain = SimpleSetDomain(partitions=qb.sort(set(self.parts)))
    self.parts = None

def done_count(self):
    self.edge.domain = SimpleSetDomain(partitions=qb.sort(self.edge.domain.partitions))

def int_list_packer(term, values):
    """
    return singletons, ranges and exclusions
    """
    DENSITY = 10  # a range can have holes, this is inverse of the hole density
    MIN_RANGE = 20  # min members before a range is allowed to be used

    singletons = set()
    ranges = []
    exclude = set()

    sorted = qb.sort(values)

    last = sorted[0]
    curr_start = last
    curr_excl = set()

    for v in sorted[1::]:
        if v <= last + 1:
            pass
        elif v - last > 3:
            # big step, how do we deal with it?
            if last == curr_start:
                # not a range yet, so just add as singleton
                singletons.add(last)
            elif last - curr_start - len(curr_excl) < MIN_RANGE or ((last - curr_start) < len(curr_excl) * DENSITY):
                # small ranges are singletons, sparse ranges are singletons
                singletons |= set(range(curr_start, last + 1))
                singletons -= curr_excl
            else:
                # big enough, and dense enough range
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
            curr_start = v
            curr_excl = set()
        else:
            if 1 + last - curr_start >= len(curr_excl) * DENSITY:
                # high density, keep track of excluded and continue
                add_me = set(range(last + 1, v))
                curr_excl |= add_me
            elif 1 + last - curr_start - len(curr_excl) < MIN_RANGE:
                # not big enough, convert range to singletons
                new_singles = set(range(curr_start, last + 1)) - curr_excl
                singletons = singletons | new_singles
                curr_start = v
                curr_excl = set()
            else:
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
                curr_start = v
                curr_excl = set()
        last = v

    if last == curr_start:
        # not a range yet, so just add as singleton
        singletons.add(last)
    elif last - curr_start - len(curr_excl) < MIN_RANGE or ((last - curr_start) < len(curr_excl) * DENSITY):
        # small ranges are singletons, sparse ranges are singletons
        singletons |= set(range(curr_start, last + 1))
        singletons -= curr_excl
    else:
        # big enough, and dense enough range
        ranges.append({"gte": curr_start, "lte": last})
        exclude |= curr_excl

    if ranges:
        r = {"or": [{"range": {term: r}} for r in ranges]}
        if exclude:
            r = {"and": [r, {"not": {"terms": {term: qb.sort(exclude)}}}]}
        if singletons:
            return {"or": [
                {"terms": {term: qb.sort(singletons)}},
                r
            ]}
        else:
            return r
    else:
        raise Except("no packing possible")

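# A QUICK SKETCH OF WHAT THE PACKER PRODUCES, WITH AN INVENTED FIELD NAME.
# LIST ORDERING INSIDE THE FILTER IS WHATEVER qb.sort RETURNS, AND INPUTS TOO
# SMALL TO FORM A QUALIFYING RANGE RAISE INSTEAD OF RETURNING A FILTER:

# DENSE RUN 1..50 WITH A HOLE AT 25, PLUS ONE FAR-AWAY VALUE
values = [v for v in range(1, 51) if v != 25] + [999]
esfilter = int_list_packer("build.id", values)
# EXPECTED SHAPE (TRACED BY HAND THROUGH THE CODE ABOVE):
# {"or": [
#     {"terms": {"build.id": [999]}},
#     {"and": [
#         {"or": [{"range": {"build.id": {"gte": 1, "lte": 50}}}]},
#         {"not": {"terms": {"build.id": [25]}}}
#     ]}
# ]}
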
def _update_cardinality(self, c):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if c.type in ["object", "nested"]:
        Log.error("not supported")
    try:
        if c.table == "meta.columns":
            with self.columns.locker:
                partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.columns, c.abs_name) if g[c.abs_name] != None])
                self.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.columns),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                })
            return
        if c.table == "meta.tables":
            with self.columns.locker:
                partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.tables, c.abs_name) if g[c.abs_name] != None])
                self.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.tables),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return

        es_index = c.table.split(".")[0]
        result = self.default_es.post("/" + es_index + "/_search", data={
            "aggs": {c.name: _counting_query(c)},
            "size": 0
        })
        r = result.aggregations.values()[0]
        count = result.hits.total
        cardinality = coalesce(r.value, r._nested.value)
        if cardinality == None:
            Log.error("logic error")

        query = Dict(size=0)
        if c.type in ["object", "nested"]:
            Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.columns.locker:
                self.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.columns.locker:
                self.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return
        elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.columns.locker:
                self.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return
        elif c.nested_path:
            query.aggs[literal_field(c.name)] = {
                "nested": {"path": listwrap(c.nested_path)[0]},
                "aggs": {"_nested": {"terms": {"field": c.abs_name, "size": 0}}}
            }
        else:
            query.aggs[literal_field(c.name)] = {"terms": {"field": c.abs_name, "size": 0}}

        result = self.default_es.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations.values()[0]
        if aggs._nested:
            parts = qb.sort(aggs._nested.buckets.key)
        else:
            parts = qb.sort(aggs.buckets.key)

        Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
        with self.columns.locker:
            self.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
            })
    except Exception, e:
        if "IndexMissingException" in e and c.table.startswith("testing"):
            Log.alert("{{col.table}} does not exist", col=c)
        else:
            self.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "partitions",
                ],
                "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
            })
            Log.warning("Could not get {{col.table}}.{{col.abs_name}} info", col=c, cause=e)

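# FOR A PLAIN (NON-NESTED) COLUMN, THE PARTITION QUERY BUILT ABOVE IS JUST A
# FULL terms AGGREGATION. ROUGHLY, FOR AN INVENTED COLUMN NAMED "build.platform"
# ("size": 0 IS THE OLD-ES CONVENTION FOR "ALL BUCKETS"):
#
# {
#     "size": 0,
#     "aggs": {
#         "build\\.platform": {"terms": {"field": "build.platform", "size": 0}}
#     }
# }
#
# literal_field() IS ASSUMED TO ESCAPE THE DOTS SO THE AGG NAME READS AS A
# SINGLE PATH SEGMENT WHEN THE RESPONSE IS NAVIGATED
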
def _dispatch_work(self, source_block):
    """
    source_block POINTS TO THE bucket AND key TO PROCESS
    :return: False IF THERE IS NOTHING LEFT TO DO
    """
    source_keys = listwrap(coalesce(source_block.key, source_block.keys))

    if not isinstance(source_block.bucket, basestring):  # FIX MISTAKE
        source_block.bucket = source_block.bucket.bucket
    bucket = source_block.bucket

    work_actions = [w for w in self.settings.workers if w.source.bucket == bucket]
    if not work_actions:
        Log.note(
            "No worker defined for records from {{bucket}}, {{action}}.\n{{message|indent}}",
            bucket=source_block.bucket,
            message=source_block,
            action="skipping" if self.settings.keep_unknown_on_queue else "deleting"
        )
        return not self.settings.keep_unknown_on_queue

    for action in work_actions:
        try:
            source_key = unicode(source_keys[0])
            if len(source_keys) > 1:
                multi_source = action._source
                source = ConcatSources([multi_source.get_key(k) for k in source_keys])
                source_key = MIN(source_key)
            else:
                source = action._source.get_key(source_key)
                source_key = source.key

            Log.note(
                "Execute {{action}} on bucket={{source}} key={{key}}",
                action=action.name,
                source=source_block.bucket,
                key=source_key
            )

            if action.transform_type == "bulk":
                old_keys = set()
            else:
                old_keys = action._destination.keys(prefix=source_block.key)

            new_keys = set(action._transformer(source_key, source, action._destination, resources=self.resources, please_stop=self.please_stop))

            # VERIFY KEYS
            if len(new_keys) == 1 and list(new_keys)[0] == source_key:
                pass  # ok
            else:
                etls = map(key2etl, new_keys)
                etls = qb.sort(etls, "id")
                for i, e in enumerate(etls):
                    if i != e.id:
                        Log.error("expecting keys to have dense order: {{ids}}", ids=etls.id)

            # VERIFY KEYS EXIST
            if hasattr(action._destination, "get_key"):
                for k in new_keys:
                    action._destination.get_key(k)

            for n in action._notify:
                for k in new_keys:
                    n.add(k)

            if action.transform_type == "bulk":
                continue

            # DUE TO BUGS THIS INVARIANT IS NOW BROKEN
            # TODO: FIGURE OUT HOW TO FIX THIS (CHANGE NAME OF THE SOURCE BLOCK KEY?)
            # for n in new_keys:
            #     if not n.startswith(source_key):
            #         Log.error("Expecting new keys ({{new_key}}) to start with source key ({{source_key}})", new_key=n, source_key=source_key)

            if not new_keys and old_keys:
                Log.alert(
                    "Expecting some new keys after etl of {{source_key}}, especially since there were old ones\n{{old_keys}}",
                    old_keys=old_keys,
                    source_key=source_key
                )
                continue
            elif not new_keys:
                Log.alert(
                    "Expecting some new keys after processing {{source_key}}",
                    old_keys=old_keys,
                    source_key=source_key
                )
                continue

            for k in new_keys:
                if len(k.split(".")) == 3 and action.destination.type != "test_result":
                    Log.error("two dots have not been needed yet, this is a consistency check")

            delete_me = old_keys - new_keys
            if delete_me:
                if action.destination.bucket == "ekyle-test-result":
                    for k in delete_me:
                        action._destination.delete_key(k)
                else:
                    Log.note("delete keys?\n{{list}}", list=sorted(delete_me))
                    # for k in delete_me:

            # WE DO NOT PUT KEYS ON WORK QUEUE IF ALREADY NOTIFYING SOME OTHER
            # AND NOT GOING TO AN S3 BUCKET
            if not action._notify and isinstance(action._destination, (aws.s3.Bucket, S3Bucket)):
                for k in old_keys | new_keys:
                    self.work_queue.add(Dict(
                        bucket=action.destination.bucket,
                        key=k
                    ))
        except Exception, e:
            if "Key {{key}} does not exist" in e:
                err = Log.warning
            elif "multiple keys in {{bucket}}" in e:
                err = Log.warning
                if source_block.bucket == "ekyle-test-result":
                    for k in action._source.list(prefix=key_prefix(source_key)):
                        action._source.delete_key(strip_extension(k.key))
            elif "expecting keys to have dense order" in e:
                err = Log.warning
                if source_block.bucket == "ekyle-test-result":
                    # WE KNOW OF THIS ETL MISTAKE, REPROCESS
                    self.work_queue.add({
                        "key": unicode(key_prefix(source_key)),
                        "bucket": "ekyle-pulse-logger"
                    })
            elif "Expecting a pure key" in e:
                err = Log.warning
            else:
                err = Log.error

            err(
                "Problem transforming {{action}} on bucket={{source}} key={{key}} to destination={{destination}}",
                {
                    "action": action.name,
                    "source": source_block.bucket,
                    "key": source_key,
                    "destination": coalesce(action.destination.name, action.destination.index)
                },
                e
            )

def sort(self, sort):
    return ListContainer("from " + self.name, qb.sort(self.data, sort), self.schema)

def full_etl(settings, sink, bugs):
    with Timer("process block {{start}}", {"start": min(bugs)}):
        es = elasticsearch.Index(settings.source)
        with FromES(es) as esq:
            versions = esq.query({
                "from": "bugs",
                "select": "*",
                "where": {"terms": {"bug_id": bugs}}
            })

        starts = qb.run({
            "select": [
                "bug_id",
                "bug_status",
                {"name": "attach_id", "value": "attachments.attach_id"},
                {"name": "request_time", "value": "modified_ts"},
                {"name": "request_type", "value": "attachments.flags.request_type"},
                {"name": "reviewer", "value": "attachments.flags.requestee"},
                {"name": "created_by", "value": "attachments.created_by"},
                "product",
                "component"
            ],
            "from": versions,
            "where": {"and": [
                {"terms": {"attachments.flags.request_status": ["?"]}},
                {"terms": {"attachments.flags.request_type": TYPES}},
                {"equal": ["attachments.flags.modified_ts", "modified_ts"]},
                {"term": {"attachments.isobsolete": 0}}
            ]},
            "sort": ["bug_id", "attach_id", "created_by"]
        })

        ends = qb.run({
            "select": [
                {"name": "bug_id", "value": "bug_id"},
                "bug_status",
                {"name": "attach_id", "value": "attachments.attach_id"},
                {"name": "modified_ts", "value": lambda r: Math.max(r.modified_ts, r.attachments.modified_ts, r.attachments.flags.modified_ts)},
                {"name": "reviewer", "value": "attachments.flags.requestee"},
                {"name": "request_type", "value": "attachments.flags.request_type"},
                {"name": "modified_by", "value": "attachments.flags.modified_by"},
                {"name": "product", "value": "product"},
                {"name": "component", "value": "component"},
                {"name": "review_end_reason", "value": lambda r: 'done' if r.attachments.flags.request_status != '?' else ('obsolete' if r.attachments.isobsolete == 1 else 'closed')},
                {"name": "review_result", "value": lambda r: '+' if r.attachments.flags.request_status == '+' else ('-' if r.attachments.flags.request_status == '-' else '?')}
            ],
            "from": versions,
            "where": {"and": [
                {"terms": {"attachments.flags.request_type": TYPES}},
                {"or": [
                    {"and": [  # IF THE REQUESTEE SWITCHED THE ? FLAG, THEN IT IS DONE
                        {"term": {"attachments.flags.previous_status": "?"}},
                        {"not": {"term": {"attachments.flags.request_status": "?"}}},
                        {"equal": ["attachments.flags.modified_ts", "modified_ts"]}
                    ]},
                    {"and": [  # IF OBSOLETED THE ATTACHMENT, IT IS DONE
                        {"term": {"attachments.isobsolete": 1}},
                        {"term": {"previous_values.isobsolete_value": 0}}
                    ]},
                    {"and": [  # SOME BUGS ARE CLOSED WITHOUT REMOVING REVIEW
                        {"terms": {"bug_status": ["resolved", "verified", "closed"]}},
                        {"not": {"terms": {"previous_values.bug_status_value": ["resolved", "verified", "closed"]}}}
                    ]}
                ]}
            ]}
        })

        # SOME ATTACHMENTS GO MISSING, CLOSE THEM TOO
        closed_bugs = {b.bug_id: b for b in qb.filter(versions, {"and": [  # SOME BUGS ARE CLOSED WITHOUT REMOVING REVIEW
            {"terms": {"bug_status": ["resolved", "verified", "closed"]}},
            {"range": {"expires_on": {"gte": Date.now().milli}}}
        ]})}

        for s in starts:
            if s.bug_id in closed_bugs:
                e = closed_bugs[s.bug_id]
                ends.append({
                    "bug_id": e.bug_id,
                    "bug_status": e.bug_status,
                    "attach_id": s.attach_id,
                    "modified_ts": e.modified_ts,
                    "reviewer": s.reviewer,
                    "request_type": s.request_type,
                    "modified_by": e.modified_by,
                    "product": e.product,
                    "component": e.component,
                    "review_end_reason": 'closed',
                    "review_result": '?'
                })

        # REVIEWS END WHEN REASSIGNED TO SOMEONE ELSE
        changes = qb.run({
            "select": [
                "bug_id",
                {"name": "attach_id", "value": "changes.attach_id"},
                "modified_ts",
                {"name": "reviewer", "value": lambda r: r.changes.old_value.split("?")[1]},
                {"name": "request_type", "value": lambda r: r.changes.old_value.split("?")[0]},
                {"name": "modified_by", "value": "null"},
                "product",
                "component",
                {"name": "review_end_reason", "value": "'reassigned'"}
            ],
            "from": versions,
            "where": {"and": [  # ONLY LOOK FOR NAME CHANGES IN THE "review?" FIELD
                {"term": {"changes.field_name": "flags"}},
                {"or": [{"prefix": {"changes.old_value": t + "?"}} for t in TYPES]}
            ]}
        })
        ends.extend(changes)

        # PYTHON VERSION NOT CAPABLE OF THIS JOIN, YET
        # reviews = qb.run({
        #     "from": starts,
        #     "select": [
        #         {"name": "bug_status", "value": "bug_status", "aggregate": "one"},
        #         {"name": "review_time", "value": "doneReview.modified_ts", "aggregate": "minimum"},
        #         {"name": "review_result", "value": "doneReview.review_result", "aggregate": "minimum"},
        #         {"name": "product", "value": "coalesce(doneReview.product, product)", "aggregate": "minimum"},
        #         {"name": "component", "value": "coalesce(doneReview.component, component)", "aggregate": "minimum"},
        #         # {"name": "keywords", "value": "(coalesce(keywords, '')+' '+ETL.parseWhiteBoard(whiteboard)).trim()+' '+flags", "aggregate": "one"},
        #         {"name": "requester_review_num", "value": "-1", "aggregate": "one"}
        #     ],
        #     "analytic": [
        #         {"name": "is_first", "value": "rownum==0 ? 1 : 0", "sort": "request_time", "edges": ["bug_id"]}
        #     ],
        #     "edges": [
        #         "bug_id",
        #         "attach_id",
        #         {"name": "reviewer", "value": "requestee"},
        #         {"name": "requester", "value": "created_by"},
        #         {"name": "request_time", "value": "modified_ts"},
        #         {
        #             "name": "doneReview",
        #             "test":
        #                 "bug_id==doneReview.bug_id && " +
        #                 "attach_id==doneReview.attach_id && " +
        #                 "requestee==doneReview.requestee && " +
        #                 "!(bug_status=='closed' && doneReview.review_end_reason=='closed') && " +
        #                 "modified_ts<=doneReview.modified_ts",
        #             "allowNulls": True,
        #             "domain": {"type": "set", "key": ["bug_id", "attach_id", "requestee", "modified_ts"], "partitions": ends}
        #         }
        #     ]
        # })

        with Timer("match starts and ends for block {{start}}", {"start": min(*bugs)}):
            reviews = []
            ends = Index(data=ends, keys=["bug_id", "attach_id", "request_type", "reviewer"])
            for g, s in qb.groupby(starts, ["bug_id", "attach_id", "request_type", "reviewer"]):
                start_candidates = qb.sort(s, {"value": "request_time", "sort": 1})
                end_candidates = qb.sort(ends[g], {"value": "modified_ts", "sort": 1})

                # ZIP, BUT WITH ADDED CONSTRAINT s.modified_ts <= e.modified_ts
                if len(start_candidates) > 1:
                    Log.note("many reviews on one attachment")
                ei = 0
                for i, s in enumerate(start_candidates):
                    while ei < len(end_candidates) and end_candidates[ei].modified_ts < coalesce(s.request_time, convert.datetime2milli(Date.MAX)):
                        ei += 1

                    e = end_candidates[ei]

                    s.review_time = e.modified_ts
                    s.review_duration = e.modified_ts - s.request_time
                    s.review_result = e.review_result
                    s.review_end_reason = e.review_end_reason
                    s.product = coalesce(e.product, s.product)
                    s.component = coalesce(e.component, s.component)
                    s.requester_review_num = -1
                    ei += 1

                    if s.bug_status == 'closed' and e.review_end_reason == 'closed':
                        # reviews on closed bugs are ignored
                        continue
                    reviews.append(s)

            qb.run({
                "from": reviews,
                "window": [{
                    "name": "is_first",
                    "value": "rownum == 0",
                    "edges": ["bug_id"],
                    "sort": ["request_time"],
                    "aggregate": "none"
                }]
            })

        with Timer("add {{num}} reviews to ES for block {{start}}", {"start": min(*bugs), "num": len(reviews)}):
            sink.extend({"json": convert.value2json(r)} for r in reviews)
