def copy(self, keys, source, sample_only_filter=None, sample_size=None, done_copy=None):
    """
    :param keys: THE KEYS TO LOAD FROM source
    :param source: THE SOURCE (USUALLY S3 BUCKET)
    :param sample_only_filter: SOME FILTER, IN CASE YOU DO NOT WANT TO SEND EVERYTHING
    :param sample_size: FOR RANDOM SAMPLE OF THE source DATA
    :param done_copy: CALLBACK, ADDED TO queue, TO FINISH THE TRANSACTION
    :return: LIST OF SUB-keys PUSHED INTO ES
    """
    num_keys = 0
    queue = None
    for key in keys:
        timer = Timer("key")
        try:
            with timer:
                for rownum, line in enumerate(source.read_lines(strip_extension(key))):
                    if not line:
                        continue
                    if rownum > 0 and rownum % 1000 == 0:
                        Log.note("Ingested {{num}} records from {{key}} in bucket {{bucket}}", num=rownum, key=key, bucket=source.name)

                    row, please_stop = fix(rownum, line, source, sample_only_filter, sample_size)
                    num_keys += 1

                    if queue == None:
                        queue = self._get_queue(row)
                    queue.add(row)

                    if please_stop:
                        break
        except Exception as e:
            done_copy = None
            Log.warning("Could not process {{key}} after {{duration|round(places=2)}}seconds", key=key, duration=timer.duration.seconds, cause=e)
def copy(self, keys, source, sample_only_filter=None, sample_size=None, done_copy=None):
    """
    :param keys: THE KEYS TO LOAD FROM source
    :param source: THE SOURCE (USUALLY S3 BUCKET)
    :param sample_only_filter: SOME FILTER, IN CASE YOU DO NOT WANT TO SEND EVERYTHING
    :param sample_size: FOR RANDOM SAMPLE OF THE source DATA
    :param done_copy: CALLBACK, ADDED TO queue, TO FINISH THE TRANSACTION
    :return: LIST OF SUB-keys PUSHED INTO ES
    """
    num_keys = 0
    queue = None
    pending = []  # FOR WHEN WE DO NOT HAVE QUEUE YET
    for key in keys:
        timer = Timer("Process {{key}}", param={"key": key})
        try:
            with timer:
                for rownum, line in enumerate(source.read_lines(strip_extension(key))):
                    if not line:
                        continue
                    if rownum > 0 and rownum % 1000 == 0:
                        Log.note("Ingested {{num}} records from {{key}} in bucket {{bucket}}", num=rownum, key=key, bucket=source.name)

                    row, please_stop = fix(rownum, line, source, sample_only_filter, sample_size)
                    num_keys += 1

                    if queue == None:
                        queue = self._get_queue(row)
                        if queue == None:
                            pending.append(row)
                            if len(pending) > 1000:
                                self._get_queue(row)
                                Log.error("first 1000 (key={{key}}) records have no indication what index to put data", key=tuple(keys)[0])
                            continue
                        elif queue is DATA_TOO_OLD:
                            break

                        if pending:
                            queue.extend(pending)
                            pending = []

                    queue.add(row)

                    if please_stop:
                        break
        except Exception as e:
            done_copy = None
            Log.warning("Could not process {{key}} after {{duration|round(places=2)}}seconds", key=key, duration=timer.duration.seconds, cause=e)

    if done_copy:
        if queue == None:
            done_copy()
        else:
            queue.add(done_copy)

    if pending:
        Log.error("Did not find an index to place the data for key={{key}}", key=tuple(keys)[0])

    Log.note("{{num}} keys from {{key|json}} added", num=num_keys, key=keys)
    return num_keys
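# A MINIMAL USAGE SKETCH, NOT PART OF THE ORIGINAL SOURCE: IT ONLY ILLUSTRATES THE done_copy
# CONTRACT OF copy() ABOVE, WHICH EITHER CALLS THE CALLBACK DIRECTLY (WHEN NO QUEUE WAS EVER
# FOUND) OR ADDS IT TO THE QUEUE SO THE TRANSACTION CLOSES AFTER ALL ROWS ARE PUSHED.
# THE sink, source AND acknowledge NAMES ARE HYPOTHETICAL.
def push_block(sink, source, keys, acknowledge):
    # sink IS ASSUMED TO EXPOSE THE copy() METHOD ABOVE; source IS AN S3-LIKE BUCKET
    return sink.copy(
        keys,
        source,
        sample_only_filter=None,   # SEND EVERYTHING
        sample_size=None,
        done_copy=acknowledge      # CALLED, OR QUEUED, EXACTLY ONCE WHEN THE BLOCK IS DONE
    )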
def get_all_s3(in_es, settings):
    # EVERYTHING FROM S3
    bucket = s3.Bucket(settings.source)
    prefixes = [p.name.rstrip(":") for p in bucket.list(prefix="", delimiter=":")]

    in_s3 = []
    for i, p in enumerate(prefixes):
        if i % 1000 == 0:
            Log.note("Scrubbed {{p|percent(decimal=1)}}", p=i / len(prefixes))
        try:
            if int(p) not in in_es:
                in_s3.append(int(p))
        except Exception:
            # NOT AN INTEGER PREFIX; REMOVE IT FROM THE BUCKET
            Log.note("delete key {{key}}", key=p)
            bucket.delete_key(strip_extension(p))
    return in_s3
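# A HYPOTHETICAL DRIVER, NOT FROM THE ORIGINAL SOURCE, SHOWING HOW get_all_s3() MIGHT BE
# COMBINED WITH THE SET OF ETL IDS ALREADY KNOWN TO ELASTICSEARCH. THE in_es COLLECTION,
# THE settings OBJECT, AND THE "HIGHEST ID FIRST" ORDERING ARE ALL ASSUMPTIONS.
def find_missing(in_es, settings):
    in_s3 = get_all_s3(set(in_es), settings)  # NUMERIC PREFIXES FOUND IN S3 BUT NOT IN ES
    return sorted(in_s3, reverse=True)        # ASSUME HIGHER PREFIX == MORE RECENT ETL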
def copy(self, keys, source, sample_only_filter=None, sample_size=None):
    num_keys = 0
    for key in keys:
        try:
            for rownum, line in enumerate(source.read_lines(strip_extension(key))):
                if rownum == 0:
                    value = convert.json2value(line)
                    if len(line) > 1000000:
                        # Log.warning("Line {{num}} for key {{key}} is too long ({{length|comma}} bytes, {{num_tests}} subtests)", key=key, length=len(line), num=rownum, num_tests=len(value.result.subtests))
                        value.result.subtests = None
                        value.result.missing_subtests = True

                    _id, value = _fix(value)
                    row = {"id": _id, "value": value}
                    if sample_only_filter and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0 and qb.filter([value], sample_only_filter):
                        # INDEX etl.id==0, BUT NO MORE
                        if value.etl.id != 0:
                            Log.error("Expecting etl.id==0")
                        num_keys += 1
                        self.queue.add(row)
                        break
                elif len(line) > 1000000:
                    value = convert.json2value(line)
                    # Log.warning("Line {{num}} for key {{key}} is too long ({{length|comma}} bytes, {{num_tests}} subtests).", key=key, length=len(line), num=rownum, num_tests=len(value.result.subtests))
                    value.result.subtests = None
                    value.result.missing_subtests = True

                    _id, value = _fix(value)
                    row = {"id": _id, "value": value}
                else:
                    # FAST
                    _id = strings.between(line, "_id\": \"", "\"")  # AVOID DECODING JSON
                    row = {"id": _id, "json": line}

                num_keys += 1
                self.queue.add(row)
        except Exception as e:
            Log.warning("Could not get queue for {{key}}", key=key, cause=e)
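# AN ILLUSTRATIVE RE-STATEMENT, NOT FROM THE ORIGINAL SOURCE, OF THE SAMPLING EXPRESSION IN
# copy() ABOVE, USING ONLY THE STANDARD LIBRARY AND ASSUMING Random.int(n) RETURNS A UNIFORM
# INTEGER IN [0, n). WITH THE DEFAULT sample_size OF 0.01 THE DRAW IS OVER 100 BUCKETS, SO
# THE != 0 TEST PASSES ROUGHLY 99% OF THE TIME.
import random

def sample_draw(sample_size=None):
    n = int(1.0 / (sample_size if sample_size is not None else 0.01))  # MIRRORS coalesce(sample_size, 0.01)
    return random.randrange(n) != 0                                    # SAME SHAPE AS Random.int(n) != 0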
def copy(self, keys, source, sample_only_filter=None, sample_size=None, done_copy=None):
    """
    :param keys: THE KEYS TO LOAD FROM source
    :param source: THE SOURCE (USUALLY S3 BUCKET)
    :param sample_only_filter: SOME FILTER, IN CASE YOU DO NOT WANT TO SEND EVERYTHING
    :param sample_size: FOR RANDOM SAMPLE OF THE source DATA
    :param done_copy: CALLBACK, ADDED TO queue, TO FINISH THE TRANSACTION
    :return: LIST OF SUB-keys PUSHED INTO ES
    """
    num_keys = 0
    queue = None
    for key in keys:
        timer = Timer("key")
        try:
            with timer:
                for rownum, line in enumerate(source.read_lines(strip_extension(key))):
                    if not line:
                        continue

                    row, please_stop = fix(rownum, line, source, sample_only_filter, sample_size)
                    num_keys += 1

                    if queue == None:
                        queue = self._get_queue(row)
                    queue.add(row)

                    if please_stop:
                        break
        except Exception as e:
            done_copy = None
            Log.warning("Could not process {{key}} after {{duration|round(places=2)}}seconds", key=key, duration=timer.duration.seconds, cause=e)
def copy(self, keys, source, sample_only_filter=None, sample_size=None, done_copy=None):
    """
    :param keys: THE KEYS TO LOAD FROM source
    :param source: THE SOURCE (USUALLY S3 BUCKET)
    :param sample_only_filter: SOME FILTER, IN CASE YOU DO NOT WANT TO SEND EVERYTHING
    :param sample_size: FOR RANDOM SAMPLE OF THE source DATA
    :param done_copy: CALLBACK, ADDED TO queue, TO FINISH THE TRANSACTION
    :return: LIST OF SUB-keys PUSHED INTO ES
    """
    num_keys = 0
    queue = None
    pending = []  # FOR WHEN WE DO NOT HAVE QUEUE YET
    for key in keys:
        timer = Timer("Process {{key}}", param={"key": key}, silent=not DEBUG)
        try:
            with timer:
                for rownum, line in enumerate(source.read_lines(strip_extension(key))):
                    if not line:
                        continue
                    if rownum > 0 and rownum % 1000 == 0:
                        Log.note("Ingested {{num}} records from {{key}} in bucket {{bucket}}", num=rownum, key=key, bucket=source.name)

                    insert_me, please_stop = fix(key, rownum, line, source, sample_only_filter, sample_size)
                    if insert_me == None:
                        continue
                    value = insert_me['value']

                    if '_id' not in value:
                        Log.warning("expecting an _id in all S3 records. If missing, there can be duplicates")

                    if queue == None:
                        queue = self._get_queue(insert_me)
                        if queue == None:
                            pending.append(insert_me)
                            if len(pending) > 1000:
                                if done_copy:
                                    done_copy()
                                Log.error("first 1000 (key={{key}}) records for {{alias}} have no indication what index to put data", key=tuple(keys)[0], alias=self.settings.index)
                            continue
                        elif queue is DATA_TOO_OLD:
                            break

                        if pending:
                            queue.extend(pending)
                            pending = []

                    num_keys += 1
                    queue.add(insert_me)

                    if please_stop:
                        break
        except Exception as e:
            if KEY_IS_WRONG_FORMAT in e:
                Log.warning("Could not process {{key}} because bad format. Never trying again.", key=key, cause=e)
            elif CAN_NOT_DECODE_JSON in e:
                Log.warning("Could not process {{key}} because of bad JSON. Never trying again.", key=key, cause=e)
            else:
                Log.warning("Could not process {{key}} after {{duration|round(places=2)}}seconds", key=key, duration=timer.duration.seconds, cause=e)
                done_copy = None

    if done_copy:
        if queue == None:
            done_copy()
        elif queue is DATA_TOO_OLD:
            done_copy()
        else:
            queue.add(done_copy)

    if [p for p in pending if wrap(p).value.task.state not in ('failed', 'exception')]:
        Log.error("Did not find an index for {{alias}} to place the data for key={{key}}", key=tuple(keys)[0], alias=self.settings.index)

    Log.note("{{num}} keys from {{key|json}} added", num=num_keys, key=keys)
    return num_keys
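# A HYPOTHETICAL STUB, NOT THE REAL IMPLEMENTATION, SHOWING THE _get_queue() CONTRACT THAT
# copy() ABOVE RELIES ON: RETURN None WHEN THE ROW GIVES NO HINT WHICH INDEX TO USE (copy()
# BUFFERS IT IN pending), RETURN THE DATA_TOO_OLD SENTINEL WHEN THE ROW IS PAST THE ROLLOVER
# HORIZON (copy() STOPS READING THE KEY), OTHERWISE RETURN A QUEUE SUPPORTING add() AND
# extend(). THE FIELD NAMES, CUTOFF ATTRIBUTE, AND LOOKUP BELOW ARE ASSUMPTIONS.
DATA_TOO_OLD = object()  # STAND-IN SENTINEL; THE REAL MODULE DEFINES ITS OWN

def _get_queue_sketch(self, row):
    timestamp = row.get("value", {}).get("etl", {}).get("timestamp")  # HYPOTHETICAL HINT FIELD
    if timestamp is None:
        return None                        # NO HINT YET
    if timestamp < self.rollover_cutoff:   # HYPOTHETICAL CUTOFF ATTRIBUTE
        return DATA_TOO_OLD
    return self.queues[self.index_for(timestamp)]  # HYPOTHETICAL INDEX LOOKUP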
def _dispatch_work(self, source_block):
    """
    source_block POINTS TO THE bucket AND key TO PROCESS
    :return: False IF THERE IS NOTHING LEFT TO DO
    """
    source_keys = listwrap(coalesce(source_block.key, source_block.keys))

    if not isinstance(source_block.bucket, basestring):  # FIX MISTAKE
        source_block.bucket = source_block.bucket.bucket
    bucket = source_block.bucket

    work_actions = [w for w in self.settings.workers if w.source.bucket == bucket]
    if not work_actions:
        Log.note(
            "No worker defined for records from {{bucket}}, {{action}}.\n{{message|indent}}",
            bucket=source_block.bucket,
            message=source_block,
            action="skipping" if self.settings.keep_unknown_on_queue else "deleting"
        )
        return not self.settings.keep_unknown_on_queue

    for action in work_actions:
        try:
            source_key = unicode(source_keys[0])
            if len(source_keys) > 1:
                multi_source = action._source
                source = ConcatSources([multi_source.get_key(k) for k in source_keys])
                source_key = MIN(source_key)
            else:
                source = action._source.get_key(source_key)
                source_key = source.key

            Log.note("Execute {{action}} on bucket={{source}} key={{key}}", action=action.name, source=source_block.bucket, key=source_key)

            if action.transform_type == "bulk":
                old_keys = set()
            else:
                old_keys = action._destination.keys(prefix=source_block.key)

            new_keys = set(action._transformer(source_key, source, action._destination, resources=self.resources, please_stop=self.please_stop))

            # VERIFY KEYS
            if len(new_keys) == 1 and list(new_keys)[0] == source_key:
                pass  # ok
            else:
                etls = map(key2etl, new_keys)
                etls = qb.sort(etls, "id")
                for i, e in enumerate(etls):
                    if i != e.id:
                        Log.error("expecting keys to have dense order: {{ids}}", ids=etls.id)

            # VERIFY KEYS EXIST
            if hasattr(action._destination, "get_key"):
                for k in new_keys:
                    action._destination.get_key(k)

            for n in action._notify:
                for k in new_keys:
                    n.add(k)

            if action.transform_type == "bulk":
                continue

            # DUE TO BUGS THIS INVARIANT IS NOW BROKEN
            # TODO: FIGURE OUT HOW TO FIX THIS (CHANGE NAME OF THE SOURCE BLOCK KEY?)
            # for n in new_keys:
            #     if not n.startswith(source_key):
            #         Log.error("Expecting new keys ({{new_key}}) to start with source key ({{source_key}})", new_key=n, source_key=source_key)

            if not new_keys and old_keys:
                Log.alert("Expecting some new keys after etl of {{source_key}}, especially since there were old ones\n{{old_keys}}", old_keys=old_keys, source_key=source_key)
                continue
            elif not new_keys:
                Log.alert("Expecting some new keys after processing {{source_key}}", old_keys=old_keys, source_key=source_key)
                continue

            for k in new_keys:
                if len(k.split(".")) == 3 and action.destination.type != "test_result":
                    Log.error("two dots have not been needed yet, this is a consistency check")

            delete_me = old_keys - new_keys
            if delete_me:
                if action.destination.bucket == "ekyle-test-result":
                    for k in delete_me:
                        action._destination.delete_key(k)
                else:
                    Log.note("delete keys?\n{{list}}", list=sorted(delete_me))
                    # for k in delete_me:

            # WE DO NOT PUT KEYS ON WORK QUEUE IF ALREADY NOTIFYING SOME OTHER
            # AND NOT GOING TO AN S3 BUCKET
            if not action._notify and isinstance(action._destination, (aws.s3.Bucket, S3Bucket)):
                for k in old_keys | new_keys:
                    self.work_queue.add(Dict(
                        bucket=action.destination.bucket,
                        key=k
                    ))
        except Exception as e:
            if "Key {{key}} does not exist" in e:
                err = Log.warning
            elif "multiple keys in {{bucket}}" in e:
                err = Log.warning
                if source_block.bucket == "ekyle-test-result":
                    for k in action._source.list(prefix=key_prefix(source_key)):
                        action._source.delete_key(strip_extension(k.key))
            elif "expecting keys to have dense order" in e:
                err = Log.warning
                if source_block.bucket == "ekyle-test-result":
                    # WE KNOW OF THIS ETL MISTAKE, REPROCESS
                    self.work_queue.add({
                        "key": unicode(key_prefix(source_key)),
                        "bucket": "ekyle-pulse-logger"
                    })
            elif "Expecting a pure key" in e:
                err = Log.warning
            else:
                err = Log.error

            err(
                "Problem transforming {{action}} on bucket={{source}} key={{key}} to destination={{destination}}",
                {
                    "action": action.name,
                    "source": source_block.bucket,
                    "key": source_key,
                    "destination": coalesce(action.destination.name, action.destination.index)
                },
                e
            )
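# AN ILLUSTRATIVE RE-STATEMENT, NOT FROM THE ORIGINAL SOURCE, OF THE "DENSE ORDER" CHECK IN
# _dispatch_work() ABOVE: AFTER SORTING THE etl RECORDS BY id, THE ids ARE EXPECTED TO BE
# EXACTLY 0..n-1. HERE THE CHECK IS SHOWN ON BARE INTEGER ids, WHICH IS AN ASSUMPTION ABOUT
# WHAT key2etl PRODUCES.
def verify_dense_order(etl_ids):
    for expected, actual in enumerate(sorted(etl_ids)):
        if expected != actual:
            raise ValueError("expecting keys to have dense order: %s" % sorted(etl_ids))

# EXAMPLE: verify_dense_order([2, 0, 1]) PASSES, WHILE verify_dense_order([0, 2, 3]) RAISES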