def queue_consumer(pull_queue, please_stop=None):
    queue = aws.Queue(pull_queue)
    time_offset = None
    request_count = 0

    while not please_stop:
        request = queue.pop(till=please_stop)
        if please_stop:
            break
        if not request:
            Log.note("Nothing in queue, pausing for 5 seconds...")
            (please_stop | Till(seconds=5)).wait()
            continue

        if SKIP_TRY_REQUESTS and 'try' in request.where['and'].eq.branch:
            Log.note("Skipping try revision.")
            queue.commit()
            continue

        now = Date.now().unix
        if time_offset is None:
            time_offset = now - request.meta.request_time

        # REPLAY THE REQUESTS AT THE SAME PACE THEY WERE ORIGINALLY MADE
        next_request = request.meta.request_time + time_offset
        if next_request > now:
            Log.note("Next request in {{wait_time}}", wait_time=Duration(seconds=next_request - now))
            Till(till=next_request).wait()

        Thread.run("request " + text_type(request_count), one_request, request)
        request_count += 1
        queue.commit()
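# A minimal sketch of how queue_consumer might be launched, following the
# Thread.run/Signal conventions used elsewhere in this module; the queue name
# is hypothetical.
if __name__ == "__main__":
    please_stop = Signal()
    Thread.run("consumer", queue_consumer, "tuid-queue-name", please_stop=please_stop)
    Thread.wait_for_shutdown_signal(please_stop=please_stop, allow_exit=True)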
def main():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--id"],
            "help": "id (prefix, really) to process",
            "type": str,
            "dest": "id",
            "required": False
        }])
        constants.set(settings.constants)
        Log.start(settings.debug)

        queries.config.default = {
            "type": "elasticsearch",
            "settings": settings.elasticsearch.copy()
        }

        if settings.args.id:
            work_queue = Queue("local work queue")
            work_queue.extend(parse_id_argument(settings.args.id))
        else:
            work_queue = aws.Queue(settings=settings.work_queue)

        Log.note(
            "Listen to queue {{queue}}, and read off of {{s3}}",
            queue=settings.work_queue.name,
            s3=settings.source.bucket
        )

        es = MultiDayIndex(settings.elasticsearch, queue_size=100000)

        threads = []
        please_stop = Signal()
        for _ in range(settings.threads):
            p = Thread.run("copy to es", copy2es, es, settings, work_queue, please_stop=please_stop)
            threads.append(p)

        def monitor_progress(please_stop):
            while not please_stop:
                Log.note("Remaining: {{num}}", num=len(work_queue))
                Thread.sleep(seconds=10)

        Thread.run(name="monitor progress", target=monitor_progress, please_stop=please_stop)

        aws.capture_termination_signal(please_stop)
        Thread.wait_for_shutdown_signal(please_stop=please_stop, allow_exit=True)
        please_stop.go()
        Log.note("Shutdown started")
    except Exception as e:
        Log.error("Problem with etl", e)
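# A hedged sketch of the settings shape main() reads above. The key names
# mirror the attributes accessed (settings.work_queue, settings.source.bucket,
# settings.elasticsearch, settings.threads); the values are illustrative only.
EXAMPLE_SETTINGS = {
    "work_queue": {"name": "active-data-etl"},   # hypothetical SQS queue name
    "source": {"bucket": "example-etl-bucket"},  # hypothetical S3 bucket
    "elasticsearch": {"host": "http://localhost", "port": 9200, "index": "unittest"},
    "threads": 4,
    "constants": {},
    "debug": {"trace": True}
}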
def required_utility(self):
    queue = aws.Queue(self.settings.work_queue)
    pending = len(queue)

    # DEMAND EXTRA MACHINES OUTSIDE THE QUIET 04:00-11:00 WINDOW
    # (USE 0, NOT None, SO max() IS WELL DEFINED)
    tod_minimum = 0
    if Date.now().hour not in [4, 5, 6, 7, 8, 9, 10, 11]:
        tod_minimum = 100
    return max(self.settings.minimum_utility, tod_minimum, Math.ceiling(pending / 30))
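# A worked example of the formula above, assuming Math.ceiling rounds up like
# math.ceil: 4000 pending messages outside the 04:00-11:00 window demand
# max(minimum_utility, 100, ceiling(4000 / 30)) = 134 units, when
# minimum_utility is no larger than that.
import math
assert max(100, int(math.ceil(4000 / 30.0))) == 134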
def main():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        source = get_container(settings.source)
        destination = get_container(settings.destination)
        work_queue = aws.Queue(settings.work_queue)
        backfill(source, destination, work_queue, settings)
    except Exception as e:
        Log.error("Problem with backfill", e)
class ETL(Thread):
    @use_settings
    def __init__(self, name, work_queue, workers, resources, please_stop, wait_forever=False, settings=None):
        # FIND THE WORKERS METHODS
        settings.workers = []
        for w in workers:
            w = deepcopy(w)

            for existing_worker in settings.workers:
                try:
                    fuzzytestcase.assertAlmostEqual(existing_worker.source, w.source)
                    fuzzytestcase.assertAlmostEqual(existing_worker.transformer, w.transformer)
                    # SAME SOURCE AND TRANSFORMER, MERGE THE destinations
                except Exception:
                    continue
                destination = get_container(w.destination)
                existing_worker._destination = Split(existing_worker._destination, destination)
                break
            else:
                t_name = w.transformer
                w._transformer = dot.get_attr(sys.modules, t_name)
                if not w._transformer:
                    Log.error("Cannot find {{path}} to transformer (are you sure you are pointing to a function?)", path=t_name)
                w._source = get_container(w.source)
                w._destination = get_container(w.destination)
                settings.workers.append(w)

                w._notify = []
                for notify in listwrap(w.notify):
                    w._notify.append(aws.Queue(notify))

        self.resources = resources
        self.settings = settings
        if isinstance(work_queue, Mapping):
            self.work_queue = aws.Queue(work_queue)
        else:
            self.work_queue = work_queue

        Thread.__init__(self, name, self.loop, please_stop=please_stop)
        self.start()
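# A hedged sketch of one entry in the `workers` list consumed above. The keys
# mirror the attributes read (w.source, w.transformer, w.destination, w.notify);
# the values are illustrative. Two workers sharing the same source and
# transformer have their destinations merged into a Split, per the loop above.
EXAMPLE_WORKER = {
    "source": {"bucket": "example-source-bucket"},             # hypothetical
    "transformer": "example_etl.transforms.keys_to_records",   # hypothetical dotted path, resolved via dot.get_attr
    "destination": {"bucket": "example-destination-bucket"},   # hypothetical
    "notify": [{"name": "example-notify-queue"}]                # zero or more queues (listwrap)
}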
def __init__(self, endpoint, push_queue=None, timeout=30, db=None, kwargs=None):
    self.enabled = True
    self.num_bad_requests = 0
    self.endpoint = endpoint
    self.timeout = timeout
    self.push_queue = aws.Queue(push_queue) if push_queue else None
    self.config = kwargs

    self.db = Sqlite(filename=coalesce(db.filename, "tuid_client.sqlite"), kwargs=db)

    if not self.db.query("SELECT name FROM sqlite_master WHERE type='table';").data:
        with self.db.transaction() as transaction:
            self._setup(transaction)
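# A minimal usage sketch, assuming this is the TuidClient constructor and that
# plain dicts are wrapped into dot-accessible settings as elsewhere in this
# codebase; the endpoint matches the local service URL used by the consumer below.
client = TuidClient(
    endpoint="http://localhost:5000/tuid",
    timeout=30,
    db={"filename": "tuid_client.sqlite"}
)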
def diff(settings, please_stop=None):
    # EVERYTHING FROM ELASTICSEARCH
    es = MultiDayIndex(settings.elasticsearch, queue_size=100000)

    in_es = get_all_in_es(es)
    in_s3 = get_all_s3(in_es, settings)

    # IGNORE THE 500 MOST RECENT BLOCKS, BECAUSE THEY ARE PROBABLY NOT DONE
    in_s3 = in_s3[500:500 + settings.limit]

    Log.note(
        "Queueing {{num}} keys (from {{min}} to {{max}}) for insertion to ES",
        num=len(in_s3),
        min=Math.MIN(in_s3),
        max=Math.MAX(in_s3)
    )
    work_queue = aws.Queue(settings=settings.work_queue)
    work_queue.extend(in_s3)
def queue_consumer(client, pull_queue, please_stop=None, kwargs=None):
    queue = aws.Queue(pull_queue)
    client = TuidClient(client)
    try_revs = {}
    test_try_revs = True

    # while len(queue) > 0:
    #     request = queue.pop(till=please_stop)
    #     if request:
    #         Log.note("Popping request from {{time}}", time=request.meta.request_time)
    #         queue.commit()

    while not please_stop:
        request = queue.pop(till=please_stop)
        if please_stop:
            break
        if not request:
            Log.note("Nothing in queue, pausing for 5 seconds...")
            (please_stop | Till(seconds=5)).wait()
            continue
        Log.note("Found something in queue")

        repo = 'mozilla-central'
        and_op = request.where['and']

        revision = None
        files = None
        for a in and_op:
            if a.eq.revision:
                revision = a.eq.revision
            elif a['in'].path:
                files = a['in'].path
            elif a.eq.path:
                files = [a.eq.path]

        if not files:  # GUARDS AGAINST BOTH None AND AN EMPTY LIST
            Log.warning("No files in the given request: {{request}}", request=request)
            continue

        if revision[:12] in try_revs and not test_try_revs:
            Log.warning(
                "Revision {{cset}} does not exist in the {{branch}} branch",
                cset=revision[:12],
                branch='mozilla-central'
            )
            queue.commit()
            continue

        clog_url = HG_URL / 'mozilla-central' / 'json-log' / revision[:12]
        clog_obj = http.get_json(clog_url, retry=RETRY)
        if isinstance(clog_obj, (text_type, str)):
            Log.warning(
                "Revision {{cset}} does not exist in the {{branch}} branch",
                cset=revision[:12],
                branch='mozilla-central'
            )
            try_revs[revision[:12]] = True
            if not test_try_revs:
                queue.commit()
                continue
            else:
                json_rev_url = 'https://hg.mozilla.org/try/json-rev/' + revision[:12]
                clog_obj = http.get_json(json_rev_url, retry=RETRY)
                if 'phase' not in clog_obj:
                    Log.warning(
                        "Revision {{cset}} does not exist in the try branch",
                        cset=revision[:12]
                    )
                    queue.commit()
                    continue
                if clog_obj['phase'] == 'draft':
                    repo = 'try'
        else:
            Log.note("Revision {{cset}} exists on mozilla-central.", cset=revision[:12])

        request.branch = repo
        with Timer("Make TUID request from {{timestamp|date}}", {"timestamp": request.meta.request_time}):
            client.enabled = True  # ENSURE THE REQUEST IS MADE
            result = http.post_json("http://localhost:5000/tuid", json=request, timeout=10000)
            if not client.enabled:
                Log.note("pausing consumer for {{num}}sec", num=PAUSE_ON_FAILURE)
                Till(seconds=PAUSE_ON_FAILURE).wait()
            if result is None or len(result.data) != len(files):
                Log.warning("expecting response for every file requested")

        queue.commit()
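# A hedged sketch of the request shape the loop above parses: the `where.and`
# clause is scanned for an `eq.revision`, plus either `in.path` or `eq.path`;
# the revision and file names here are illustrative only.
EXAMPLE_REQUEST = {
    "where": {"and": [
        {"eq": {"revision": "deadbeef0123"}},  # hypothetical changeset
        {"in": {"path": ["dom/base/moz.build", "dom/base/nsDocument.cpp"]}}
    ]},
    "meta": {"request_time": 1500000000}
}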
def __init__(self, kwargs=None):
    self.settings = kwargs
    self.schema = SnowflakeSchema(self.settings.snowflake)
    self._extract = extract = kwargs.extract

    # SOME PREP
    get_git_revision()

    # VERIFY WE DO NOT HAVE TOO MANY OTHER PROCESSES WORKING ON STUFF
    with MySQL(**kwargs.snowflake.database) as db:
        processes = None
        try:
            processes = jx.filter(
                db.query("show processlist"),
                {"and": [
                    {"neq": {"Command": "Sleep"}},
                    {"neq": {"Info": "show processlist"}}
                ]}
            )
        except Exception as e:
            Log.warning("no database", cause=e)

        if processes:
            if DEBUG:
                Log.warning("Processes are running\n{{list|json}}", list=processes)
            else:
                Log.error("Processes are running\n{{list|json}}", list=processes)

    extract.type = listwrap(extract.type)
    extract.start = listwrap(extract.start)
    extract.batch = listwrap(extract.batch)
    extract.field = listwrap(extract.field)
    if any(len(extract.type) != len(other) for other in [extract.start, extract.batch, extract.field]):
        Log.error("Expecting same number of dimensions for `type`, `start`, `batch`, and `field` in the `extract` inner object")
    for i, t in enumerate(extract.type):
        if t == "time":
            extract.start[i] = Date(extract.start[i])
            extract.batch[i] = Duration(extract.batch[i])
        elif t == "number":
            pass
        else:
            Log.error('Expecting `extract.type` to be "number" or "time"')

    extract.threads = coalesce(extract.threads, 1)
    self.done_pulling = Signal()
    self.queue = Queue("all batches", max=2 * coalesce(extract.threads, 1), silent=True)

    self.bucket = s3.Bucket(self.settings.destination)
    self.notify = aws.Queue(self.settings.notify)
    Thread.run("get records", self.pull_all_remaining)
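# A hedged sketch of the `extract` inner object validated above: `type`,
# `start`, `batch`, and `field` must have the same number of dimensions, and
# "time" dimensions are parsed into Date/Duration. All values are illustrative.
EXAMPLE_EXTRACT = {
    "type": ["time", "number"],
    "start": ["1jan2015", 0],          # becomes Date for the "time" dimension
    "batch": ["day", 1000],            # becomes Duration for the "time" dimension
    "field": ["last_modified", "id"],  # hypothetical column names
    "threads": 2
}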
def main(): """ CLEAR OUT KEYS FROM BUCKET BY RANGE, OR BY FILE """ try: settings = startup.read_settings(defs=[{ "name": ["--bucket"], "help": "bucket to reprocess", "type": str, "dest": "bucket", "required": True }, { "name": ["--begin", "--start"], "help": "lowest key (or prefix) to reprocess", "type": str, "dest": "start", "default": "1", "required": False }, { "name": ["--end", "--stop"], "help": "highest key (or prefix) to reprocess", "type": str, "dest": "end", "default": None, "required": False }, { "name": ["--file"], "help": "path to file with CR-delimited prefix list", "type": str, "dest": "file", "default": None, "required": False }]) Log.start(settings.debug) with aws.Queue(settings.work_queue) as work_queue: source = Connection(settings.aws).get_bucket(settings.args.bucket) if settings.args.file: now = Date.now() for prefix in File(settings.args.file): all_keys = source.keys(prefix=key_prefix(prefix)) for k in all_keys: Log.note("Adding {{key}}", key=k) work_queue.add({ "bucket": settings.args.bucket, "key": k, "timestamp": now.unix, "date/time": now.format() }) return if settings.args.end and settings.args.start: up_to = str(int(settings.args.end) - 1) prefix = strings.common_prefix(settings.args.start, up_to) else: prefix = None start = Version(settings.args.start) end = Version(settings.args.end) all_keys = source.keys(prefix=prefix) with Timer("filtering {{num}} keys", {"num": len(all_keys)}): all_keys = [(k, Version(k)) for k in all_keys if k.find("None") == -1] all_keys = [(k, p) for k, p in all_keys if start <= p < end] with Timer("sorting {{num}} keys", {"num": len(all_keys)}): all_keys = qb.sort(all_keys, 1) for k, p in all_keys: Log.note("Adding {{key}}", key=k) now = Date.now() work_queue.add({ "bucket": settings.args.bucket, "key": k, "timestamp": now.unix, "date/time": now.format() }) except Exception, e: Log.error("Problem with etl", e)
def list_queue(settings, num=10):
    queue = aws.Queue(settings)
    for i in range(num):
        content = queue.pop()
        Log.note("{{content}}", content=content)
    # RETURN THE POPPED MESSAGES TO THE QUEUE, SO THIS IS ONLY A PEEK
    queue.rollback()
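# A minimal usage sketch, assuming the same settings layout as the other entry
# points in this module; peeks at the first 10 messages, then rollback() above
# returns them to the queue.
settings = startup.read_settings()
list_queue(settings.work_queue, num=10)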