def main():
    """
    Entry point: read settings, start logging, and run the extraction workers.

    Starts `settings.extract.threads` threads. Each one opens its own MySQL
    connection and consumes jobs from `extractor.queue` inside a single
    transaction; a job that raises is logged and added back to the queue.
    The main thread then hands control to `Thread.wait_for_shutdown_signal`.
    Any exception is reported as a warning, and `Log.stop()` always runs.
    """
    try:
        settings = startup.read_settings()
        with startup.SingleInstance(settings.args.filename):
            constants.set(settings.constants)
            Log.start(settings.debug)

            extractor = Extract(settings)

            def extract(please_stop):
                # ONE CONNECTION AND ONE TRANSACTION PER WORKER THREAD
                with MySQL(**settings.snowflake.database) as db:
                    with db.transaction():
                        for kwargs in extractor.queue:
                            if please_stop:
                                break
                            try:
                                extractor.extract(db=db, please_stop=please_stop, **kwargs)
                            except Exception as e:
                                Log.warning("Could not extract", cause=e)
                                # PUT THE JOB BACK SO IT CAN BE RETRIED
                                extractor.queue.add(kwargs)

            for i in range(settings.extract.threads):
                Thread.run("extract #" + text_type(i), extract)

            please_stop = Signal()
            Thread.wait_for_shutdown_signal(please_stop=please_stop, allow_exit=True, wait_forever=False)
    except Exception as e:
        # PASS THE EXCEPTION AS cause, SAME AS THE WORKER'S WARNING ABOVE
        Log.warning("Problem with data extraction", cause=e)
    finally:
        Log.stop()
def __exit__(self, exc_type, exc_val, exc_tb):
    """
    On leaving the context, hand this file to `delete_daemon` on a new thread.

    The stack trace captured here is passed along as `caller_stack`.
    The exception arguments are not inspected.
    """
    # IMPORTED AT CALL TIME, NOT AT MODULE LEVEL
    from mo_threads import Thread

    thread_name = "delete file " + self.name
    stack = get_stacktrace(1)
    Thread.run(thread_name, delete_daemon, file=self, caller_stack=stack)
def queue_consumer(pull_queue, please_stop=None):
    """
    Replay requests from an AWS queue, preserving their original spacing.

    Each popped request carries `meta.request_time`. The offset between that
    time and "now" is fixed by the first request seen; every later request is
    held back until its own `request_time + offset`, then dispatched to
    `one_request` on a new thread and committed on the queue.

    :param pull_queue: settings passed to `aws.Queue`
    :param please_stop: signal that ends the loop; may be None
    """
    queue = aws.Queue(pull_queue)
    time_offset = None
    request_count = 0

    while not please_stop:
        request = queue.pop(till=please_stop)
        if please_stop:
            break
        if not request:
            Log.note("Nothing in queue, pausing for 5 seconds...")
            (please_stop | Till(seconds=5)).wait()
            continue

        if SKIP_TRY_REQUESTS and 'try' in request.where['and'].eq.branch:
            Log.note("Skipping try revision.")
            queue.commit()
            continue

        now = Date.now().unix
        if time_offset is None:
            # FIRST REQUEST FIXES THE OFFSET BETWEEN RECORDED TIME AND NOW
            time_offset = now - request.meta.request_time

        next_request = request.meta.request_time + time_offset
        if next_request > now:
            Log.note("Next request in {{wait_time}}", wait_time=Duration(seconds=next_request - now))
            # WAKE EARLY ON SHUTDOWN, INSTEAD OF SLEEPING OUT THE FULL DELAY
            (please_stop | Till(till=next_request)).wait()
            if please_stop:
                # LEAVE THE REQUEST UNCOMMITTED, SAME AS THE EXIT AFTER pop()
                break

        Thread.run("request "+text_type(request_count), one_request, request)
        request_count += 1
        queue.commit()
def test_lock_and_till(self):
    """
    Two threads poll on a shared lock until `got_lock` is signalled, then
    block in `locker.wait()`. The main thread takes and releases the lock
    twice, waits one second, and asserts that both threads have stopped.
    """
    locker = Lock("prime lock")
    got_lock = Signal()
    a_is_ready = Signal("a lock")
    b_is_ready = Signal("b lock")

    def loop(is_ready, please_stop):
        with locker:
            # POLL (ZERO-SECOND TIMEOUT) UNTIL THE MAIN THREAD SIGNALS got_lock
            while not got_lock:
                # Log.note("{{thread}} is waiting", thread=Thread.current().name)
                locker.wait(till=Till(seconds=0))
                is_ready.go()
            # WAIT WITHOUT A TIMEOUT
            locker.wait()
            Log.note("thread is expected to get here")
    thread_a = Thread.run("a", loop, a_is_ready)
    thread_b = Thread.run("b", loop, b_is_ready)

    # BOTH THREADS HAVE BEEN THROUGH THE POLLING LOOP AT LEAST ONCE
    a_is_ready.wait()
    b_is_ready.wait()
    with locker:
        got_lock.go()
        Till(seconds=0.1).wait()  # MUST WAIT FOR a AND b TO PERFORM locker.wait()
        Log.note("leaving")
        pass
    with locker:
        Log.note("leaving again")
        pass
    Till(seconds=1).wait()

    self.assertTrue(bool(thread_a.stopped), "Thread should be done by now")
    self.assertTrue(bool(thread_b.stopped), "Thread should be done by now")
def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None):
    """
    Build the queues, the sqlite `cache` table (when the database has no
    tables yet), the rate loggers, and start the worker, limiter and
    cleaner threads.

    :param rate: defaults to HG_REQUEST_PER_SECOND
    :param amortization_period: defaults to AMORTIZATION_PERIOD
    :param source: object whose `url` attribute is wrapped in `URL`
    :param database: passed to `Sqlite`
    :param kwargs: not read by this constructor
    """
    self.rate = coalesce(rate, HG_REQUEST_PER_SECOND)
    self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD)

    self.cache_locker = Lock()
    # MAP FROM url TO (ready, headers, response, timestamp) PAIR
    self.cache = {}
    # VERY SHORT TERM CACHE
    self.no_cache = {}
    self.workers = []

    self.todo = Queue(APP_NAME+" todo")
    # QUEUE SIZE IS rate * amortization_period, TRUNCATED TO AN INTEGER
    request_limit = int(self.rate * self.amortization_period.seconds)
    self.requests = Queue(APP_NAME + " requests", max=request_limit)

    self.url = URL(source.url)
    self.db = Sqlite(database)
    self.inbound_rate = RateLogger("Inbound")
    self.outbound_rate = RateLogger("hg.mo")

    existing_tables = self.db.query("SELECT name FROM sqlite_master WHERE type='table'").data
    if not existing_tables:
        with self.db.transaction() as t:
            t.execute(
                "CREATE TABLE cache ("
                " path TEXT PRIMARY KEY, "
                " headers TEXT, "
                " response TEXT, "
                " timestamp REAL "
                ")"
            )

    # ASSIGN self.threads ONLY AFTER ALL WORKERS HAVE BEEN STARTED
    started = []
    for i in range(CONCURRENCY):
        started.append(Thread.run(APP_NAME+" worker" + text_type(i), self._worker))
    self.threads = started

    self.limiter = Thread.run(APP_NAME+" limiter", self._rate_limiter)
    self.cleaner = Thread.run(APP_NAME+" cleaner", self._cache_cleaner)
def capture_termination_signal(please_stop):
    """
    WILL SIGNAL please_stop WHEN THIS AWS INSTANCE IS DUE FOR SHUTDOWN

    Starts a thread that polls the spot termination-time metadata URL.
    Any status other than 400/404 is treated as a shutdown notice.

    :param please_stop: the signal to trigger; it is also handed to the
        polling thread as its own stop signal
    """
    def worker(please_stop):
        seen_problem = False
        while not please_stop:
            request_time = (time.time() - timer.START)/60  # MINUTES
            try:
                response = requests.get("http://169.254.169.254/latest/meta-data/spot/termination-time")
                seen_problem = False
                if response.status_code not in [400, 404]:
                    Log.alert("Shutdown AWS Spot Node {{name}} {{type}}", name=machine_metadata.name, type=machine_metadata.aws_instance_type)
                    please_stop.go()
            except Exception as e:
                e = Except.wrap(e)
                if "Failed to establish a new connection: [Errno 10060]" in e or "A socket operation was attempted to an unreachable network" in e:
                    Log.note("AWS Spot Detection has shutdown, probably not a spot node, (http://169.254.169.254 is unreachable)")
                    return
                elif seen_problem:
                    # IGNORE THE FIRST PROBLEM
                    Log.warning("AWS shutdown detection has more than one consecutive problem: (last request {{time|round(1)}} minutes since startup)", time=request_time, cause=e)
                seen_problem = True

                # EXTRA PAUSE AFTER A FAILED REQUEST
                (Till(seconds=61) | please_stop).wait()
            (Till(seconds=11) | please_stop).wait()

    # HAND OVER THE CALLER'S SIGNAL; WITHOUT IT THE WORKER WOULD ONLY
    # TRIGGER THE THREAD'S OWN STOP SIGNAL, NOT THE ONE PASSED IN
    Thread.run("listen for termination", worker, please_stop=please_stop)
def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
    """
    Set up the metadata containers and start the "refresh metadata" thread.

    Returns immediately when the instance already has a `settings`
    attribute. `host` and `port` are not read directly; the cluster is
    built from `kwargs`.
    """
    global _elasticsearch
    if hasattr(self, "settings"):
        # ALREADY INITIALIZED
        return

    from pyLibrary.queries.containers.list_usingPythonList import ListContainer
    from pyLibrary.env import elasticsearch as _elasticsearch

    self.settings = kwargs
    self.default_name = coalesce(name, alias, index)
    self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.es_metadata = Null
    self.last_es_metadata = Date.now()-OLD_METADATA

    meta = self.meta = Data()
    table_columns = metadata_tables()
    column_columns = metadata_columns()
    tables_schema = wrap({c.names["."]: c for c in table_columns})
    meta.tables = ListContainer("meta.tables", [], tables_schema)
    meta.columns = ColumnList()
    for columns in (column_columns, table_columns):
        meta.columns.insert(columns)

    # TODO: fix monitor so it does not bring down ES
    target = self.monitor if ENABLE_META_SCAN else self.not_monitor
    self.worker = Thread.run("refresh metadata", target)
def __init__(self, host, index, port=9200, type="log", queue_size=1000, batch_size=100, kwargs=None):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX

    Fills in timeout/retry defaults on `kwargs`, gets or creates the index,
    adds its alias, and starts the thread that runs `self._insert_loop`.
    """
    # DEFAULTS ARE WRITTEN BACK INTO kwargs, AS PLAIN SECONDS
    timeout = Duration(coalesce(kwargs.timeout, "30second"))
    kwargs.timeout = timeout.seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    retry_sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE))
    kwargs.retry.sleep = retry_sleep.seconds

    cluster = Cluster(kwargs)
    schema = mo_json.json2value(value2json(SCHEMA), leaves=True)
    self.es = cluster.get_or_create_index(
        schema=schema,
        limit_replicas=True,
        tjson=True,
        kwargs=kwargs
    )
    self.batch_size = batch_size

    alias = coalesce(kwargs.alias, kwargs.index)
    self.es.add_alias(alias)
    self.queue = Queue("debug logs to es", max=queue_size, silent=True)

    Thread.run("add debug logs to es", self._insert_loop)
def __init__(self, name):
    """
    Record the name, reset the rate state, and start the `_daemon` thread.

    :param name: stored on the instance as `self.name`
    """
    self.name = name
    self.lock = Lock("rate locker")
    # RATE STARTS AT ZERO, LAST REQUEST IS "NOW"
    self.request_rate, self.last_request = 0.0, Date.now()
    Thread.run("rate logger", self._daemon)
def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None):
    """
    Prepare caches, queues, the sqlite store and the background threads.

    The `cache` table is created only when the sqlite database reports no
    tables at all. One worker thread is started per CONCURRENCY slot, plus
    one limiter thread and one cleaner thread.

    :param rate: falls back to HG_REQUEST_PER_SECOND
    :param amortization_period: falls back to AMORTIZATION_PERIOD
    :param source: must expose `url`
    :param database: handed to `Sqlite`
    :param kwargs: unused in this constructor
    """
    self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD)
    self.rate = coalesce(rate, HG_REQUEST_PER_SECOND)
    self.cache_locker = Lock()
    # cache: MAP FROM url TO (ready, headers, response, timestamp) PAIR
    # no_cache: VERY SHORT TERM CACHE
    self.cache, self.no_cache = {}, {}
    self.workers = []
    self.todo = Queue(APP_NAME+" todo")
    max_pending = int(self.rate * self.amortization_period.seconds)
    self.requests = Queue(APP_NAME + " requests", max=max_pending)
    self.url = URL(source.url)
    self.db = Sqlite(database)
    self.inbound_rate = RateLogger("Inbound")
    self.outbound_rate = RateLogger("hg.mo")

    create_cache_table = (
        "CREATE TABLE cache ("
        " path TEXT PRIMARY KEY, "
        " headers TEXT, "
        " response TEXT, "
        " timestamp REAL "
        ")"
    )
    tables = self.db.query("SELECT name FROM sqlite_master WHERE type='table'")
    if not tables.data:
        with self.db.transaction() as t:
            t.execute(create_cache_table)

    worker_prefix = APP_NAME+" worker"
    self.threads = [
        Thread.run(worker_prefix + text_type(slot), self._worker)
        for slot in range(CONCURRENCY)
    ]
    self.limiter = Thread.run(APP_NAME+" limiter", self._rate_limiter)
    self.cleaner = Thread.run(APP_NAME+" cleaner", self._cache_cleaner)
def capture_termination_signal(please_stop):
    """
    WILL SIGNAL please_stop WHEN THIS AWS INSTANCE IS DUE FOR SHUTDOWN

    A background thread polls the spot termination-time metadata URL; any
    response status other than 400/404 triggers `please_stop`.

    :param please_stop: signal to trigger; also given to the polling thread
        as its stop signal
    """
    def worker(please_stop):
        while not please_stop:
            try:
                response = requests.get(
                    "http://169.254.169.254/latest/meta-data/spot/termination-time"
                )
                if response.status_code not in [400, 404]:
                    Log.alert("Shutdown AWS Spot Node {{name}} {{type}}", name=machine_metadata.name, type=machine_metadata.aws_instance_type)
                    please_stop.go()
            except Exception as e:
                e = Except.wrap(e)
                if "Failed to establish a new connection: [Errno 10060]" in e or "A socket operation was attempted to an unreachable network" in e:
                    Log.note(
                        "AWS Spot Detection has shutdown, probably not a spot node, (http://169.254.169.254 is unreachable)"
                    )
                    return
                else:
                    Log.warning("AWS shutdown detection has problems", cause=e)
                # EXTRA PAUSE AFTER A FAILED REQUEST
                (Till(seconds=61) | please_stop).wait()
            (Till(seconds=11) | please_stop).wait()

    # PASS THE CALLER'S SIGNAL TO THE THREAD; OTHERWISE worker() RECEIVES A
    # DIFFERENT SIGNAL AND THE CALLER IS NEVER NOTIFIED
    Thread.run("listen for termination", worker, please_stop=please_stop)
def queue_consumer(pull_queue, please_stop=None):
    """
    Pop requests from an AWS queue and re-issue them with their recorded timing.

    The first request establishes `time_offset` (now minus its recorded
    `meta.request_time`). Each request is delayed until its recorded time
    plus that offset, then run by `one_request` on its own thread and
    committed. "try" branch requests are committed without running when
    SKIP_TRY_REQUESTS is set.

    :param pull_queue: settings for `aws.Queue`
    :param please_stop: optional signal that ends the loop
    """
    queue = aws.Queue(pull_queue)
    time_offset = None
    request_count = 0

    while not please_stop:
        request = queue.pop(till=please_stop)
        if please_stop:
            break

        if not request:
            Log.note("Nothing in queue, pausing for 5 seconds...")
            (please_stop | Till(seconds=5)).wait()
            continue

        if SKIP_TRY_REQUESTS and 'try' in request.where['and'].eq.branch:
            Log.note("Skipping try revision.")
            queue.commit()
            continue

        now = Date.now().unix
        recorded = request.meta.request_time
        if time_offset is None:
            time_offset = now - recorded

        next_request = recorded + time_offset
        if next_request > now:
            Log.note("Next request in {{wait_time}}", wait_time=Duration(seconds=next_request - now))
            # THE DELAY MUST ALSO END WHEN SHUTDOWN IS REQUESTED
            (please_stop | Till(till=next_request)).wait()
            if please_stop:
                # DO NOT RUN OR COMMIT; THE REQUEST STAYS ON THE QUEUE
                break

        Thread.run("request " + text_type(request_count), one_request, request)
        request_count += 1
        queue.commit()
def __init__(self, name):
    """
    Initialise the rate state and launch the "rate logger" thread.

    :param name: kept as `self.name`
    """
    lock = Lock("rate locker")
    now = Date.now()

    self.name = name
    self.lock = lock
    self.request_rate = 0.0
    self.last_request = now

    daemon = self._daemon
    Thread.run("rate logger", daemon)
def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
    """
    Create the `meta.tables` / `meta.columns` containers and start the
    metadata refresh thread.

    Does nothing when `self.settings` already exists. The thread runs
    `self.monitor` when ENABLE_META_SCAN is truthy, `self.not_monitor`
    otherwise.
    """
    global _elasticsearch
    if hasattr(self, "settings"):
        return

    from pyLibrary.queries.containers.list_usingPythonList import ListContainer
    from pyLibrary.env import elasticsearch as _elasticsearch

    self.settings = kwargs
    self.default_name = coalesce(name, alias, index)
    self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.es_metadata = Null
    self.last_es_metadata = Date.now()-OLD_METADATA

    self.meta=Data()
    table_columns = metadata_tables()
    column_columns = metadata_columns()

    # INDEX THE TABLE COLUMNS BY THEIR "." NAME
    by_name = {}
    for c in table_columns:
        by_name[c.names["."]] = c
    self.meta.tables = ListContainer("meta.tables", [], wrap(by_name))

    self.meta.columns = ColumnList()
    self.meta.columns.insert(column_columns)
    self.meta.columns.insert(table_columns)

    # TODO: fix monitor so it does not bring down ES
    if ENABLE_META_SCAN:
        target = self.monitor
    else:
        target = self.not_monitor
    self.worker = Thread.run("refresh metadata", target)
def __init__(self, name, config):
    """
    Start `mo_threads/python_worker.py` as a subprocess and wait for its
    `{"out":"ok"}` handshake, then start the `_daemon` and `_stderr` threads.

    :param name: name given to the `Process` and used in the lock name
    :param config: configuration sent to the worker on stdin; must not
        configure `debug.logs`
    """
    config = to_data(config)
    if config.debug.logs:
        Log.error("not allowed to configure logging on other process")

    Log.note("begin process")
    # WINDOWS REQUIRED shell, WHILE LINUX NOT
    on_windows = "windows" in platform.system().lower()
    command = [PYTHON, "-u", os.sep.join(["mo_threads", "python_worker.py"])]
    self.process = Process(
        name,
        command,
        debug=False,
        cwd=os.getcwd(),
        shell=on_windows
    )

    worker_config = set_default({}, config, {"debug": {"trace": True}})
    self.process.stdin.add(value2json(worker_config))

    status = self.process.stdout.pop()
    if status != '{"out":"ok"}':
        collected = self.process.stderr.pop_all() + [status] + self.process.stdin.pop_all()
        Log.error("could not start python\n{{error|indent}}", error=collected)

    self.lock = Lock("wait for response from " + name)
    self.current_task = DONE
    self.current_response = None
    self.current_error = None

    self.daemon = Thread.run("", self._daemon)
    self.errors = Thread.run("", self._stderr)
def test_queue_speed(self):
    """
    Push SCALE items plus THREAD_STOP through a ThreadedQueue while a
    consumer thread drains it, and require the timed section to finish in
    under 1.5 seconds.
    """
    SCALE = 1000*10

    done = Signal("done")
    slow = Queue()
    q = ThreadedQueue("test queue", queue=slow)

    def empty(please_stop):
        # DRAIN UNTIL THE STOP MARKER (OR A STOP REQUEST), THEN REPORT
        while not please_stop:
            if q.pop() is THREAD_STOP:
                break
        done.go()

    Thread.run("empty", empty)

    timer = Timer("add {{num}} to queue", param={"num": SCALE})
    with timer:
        for value in range(SCALE):
            q.add(value)
        q.add(THREAD_STOP)
        Log.note("Done insert")
        done.wait()

    elapsed = timer.duration.seconds
    self.assertLess(elapsed, 1.5, "Expecting queue to be fast")
def es_bulksetop(esq, frum, query):
    """
    Build the ES query for a bulk set operation, start a "Download" thread
    running `extractor`, and return where the result and status will be.

    :param esq: passed through to `extractor`
    :param frum: provides `schema`
    :param query: provides limit, chunk_size, sort and format
    :return: data with `url`, `status` and `meta` entries
    """
    abs_limit = MIN([query.limit, MAX_DOCUMENTS])
    guid = randoms.base64(32, extra="-_")
    schema = frum.schema

    all_paths, _decoders, var_to_columns = pre_process(query)
    new_select, split_select, _flatten = get_selects(query)
    op, split_wheres = setop_to_es_queries(query, all_paths, split_select, var_to_columns)

    es_query = es_query_proto(split_select, op, split_wheres, schema)
    es_query.size = MIN([query.chunk_size, MAX_CHUNK_SIZE])
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)
    if not es_query.sort:
        # FALL BACK TO "_doc" WHEN NO SORT WAS PRODUCED
        es_query.sort = ["_doc"]

    formatter = formatters[query.format](abs_limit, new_select, query)

    download = Thread.run(
        "Download " + guid,
        extractor,
        guid,
        abs_limit,
        esq,
        es_query,
        formatter,
        parent_thread=Null,
    )
    download.release()

    meta = {"format": query.format, "es_query": es_query, "limit": abs_limit}
    return to_data(
        {
            "url": URL_PREFIX / (guid + ".json"),
            "status": URL_PREFIX / (guid + ".status.json"),
            "meta": meta,
        }
    )
def _get_queue(self, row):
    """
    Return the queue for the index that covers this row's rollover timestamp.

    :param row: record with either `json` (decoded here into `value`) or `value`
    :return: Null when the row has no timestamp, DATA_TOO_OLD when the
        timestamp is older than today minus `self.rollover_max`, otherwise
        the queue stored in `self.known_queues` for the rounded timestamp
        (created on first use)
    """
    row = wrap(row)
    if row.json:
        # DECODE ONCE, THEN DROP THE JSON TEXT
        row.value, row.json = json2value(row.json), None
    timestamp = Date(self.rollover_field(row.value))
    if timestamp == None:
        return Null
    elif timestamp < Date.today() - self.rollover_max:
        return DATA_TOO_OLD

    rounded_timestamp = timestamp.floor(self.rollover_interval)
    with self.locker:
        queue = self.known_queues.get(rounded_timestamp.unix)
    if queue == None:
        # ALIASES WHOSE INDEX NAME IS settings.index PLUS AN
        # 8-DIGIT/6-DIGIT DATE SUFFIX, SORTED BY INDEX NAME
        candidates = sort_using_key(
            filter(
                lambda r: re.match(
                    re.escape(self.settings.index) + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d$",
                    r['index']
                ),
                self.cluster.get_aliases()
            ),
            key=lambda r: r['index']
        )
        # best IS THE LAST CANDIDATE DATED BEFORE timestamp
        best = None
        for c in candidates:
            c = wrap(c)
            c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
            if timestamp > c.date:
                best = c
        if not best or rounded_timestamp > best.date:
            # NOTE(review): candidates[-1] assumes at least one candidate, and
            # best may still be None in the branch below - confirm these
            # cases cannot occur
            if rounded_timestamp < wrap(candidates[-1]).date:
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
            else:
                try:
                    es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                    es.add_alias(self.settings.index)
                except Exception as e:
                    e = Except.wrap(e)
                    if "IndexAlreadyExistsException" not in e:
                        Log.error("Problem creating index", cause=e)
                    return self._get_queue(row)  # TRY AGAIN
        else:
            es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

        def refresh(please_stop):
            # BEST EFFORT: A FAILURE IS ONLY NOTED
            try:
                es.set_refresh_interval(seconds=60 * 10, timeout=5)
            except Exception:
                Log.note("Could not set refresh interval for {{index}}", index=es.settings.index)

        Thread.run("refresh", refresh)

        self._delete_old_indexes(candidates)
        threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
        with self.locker:
            queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
    return queue
def __init__(self):
    """
    Reset all counters and locks, then start the four statistics daemons
    (pc, threads, memory, requests).
    """
    self.out_of_memory_restart = False

    # TOTALS
    self.total_locker = Lock()
    self.total_files_requested = 0
    self.total_tuids_mapped = 0

    # THREADS
    self.threads_locker = Lock()
    self.waiting = 0
    self.threads_waiting = 0

    # REQUESTS
    self.requests_locker = Lock()
    self.requests_total = 0
    self.requests_complete = 0
    self.requests_incomplete = 0
    self.requests_passed = 0
    self.requests_failed = 0

    # MEMORY
    self.prev_mem = 0
    self.curr_mem = 0
    self.initial_growth = {}

    daemons = (
        ("pc-daemon", self.run_pc_daemon),
        ("threads-daemon", self.run_threads_daemon),
        ("memory-daemon", self.run_memory_daemon),
        ("requests-daemon", self.run_requests_daemon),
    )
    for daemon_name, target in daemons:
        Thread.run(daemon_name, target)
def __init__(self):
    """
    Zero every statistic, create the three locks, and launch the pc,
    threads, memory and requests daemon threads in that order.
    """
    self.out_of_memory_restart = False

    self.total_locker = Lock()
    self.total_files_requested = self.total_tuids_mapped = 0

    self.threads_locker = Lock()
    self.waiting = self.threads_waiting = 0

    self.requests_locker = Lock()
    self.requests_total = 0
    self.requests_complete = self.requests_incomplete = 0
    self.requests_passed = self.requests_failed = 0

    self.prev_mem = self.curr_mem = 0
    self.initial_growth = {}

    Thread.run("pc-daemon", self.run_pc_daemon)
    Thread.run("threads-daemon", self.run_threads_daemon)
    Thread.run("memory-daemon", self.run_memory_daemon)
    Thread.run("requests-daemon", self.run_requests_daemon)
def test_and_signals(self):
    """
    Three threads each append "worker" under a lock; waiting on the AND of
    their `stopped` signals must return only after all three have appended.
    """
    acc = []
    locker = Lock()

    def worker(please_stop):
        with locker:
            acc.append("worker")

    a, b, c = [Thread.run(name, worker) for name in ("a", "b", "c")]

    all_stopped = a.stopped & b.stopped & c.stopped
    all_stopped.wait()

    acc.append("done")
    expected = ["worker", "worker", "worker", "done"]
    self.assertEqual(acc, expected)
def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
    """
    Initialise cluster access and the `meta.tables` / `meta.columns`
    containers, then start the "refresh metadata" thread.

    Returns immediately when `self.settings` already exists. `host`,
    `sql_file` and `port` are not read directly by this constructor.
    """
    if hasattr(self, "settings"):
        return

    self.too_old = TOO_OLD
    self.settings = kwargs
    self.default_name = coalesce(name, alias, index)
    self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
    self.index_does_not_exist = set()
    self.todo = Queue("refresh metadata", max=100000, unique=True)
    self.index_to_alias = Relation_usingList()

    self.es_metadata = Null
    self.metadata_last_updated = Date.now() - OLD_METADATA

    meta = self.meta = Data()
    meta.columns = ColumnList()

    root_path = [['.']]
    self.alias_to_query_paths = {
        "meta.columns": root_path,
        "meta.tables": [['.']]
    }
    self.alias_last_updated = {
        "meta.columns": Date.now(),
        "meta.tables": Date.now()
    }

    table_columns = metadata_tables()
    tables_schema = jx_base.Schema(".", table_columns)
    # THE TABLE LIST STARTS EMPTY
    meta.tables = ListContainer("meta.tables", [], tables_schema)
    meta.columns.extend(table_columns)

    # TODO: fix monitor so it does not bring down ES
    target = self.monitor if ENABLE_META_SCAN else self.not_monitor
    self.worker = Thread.run("refresh metadata", target)
def __init__(self, conn=None, tuid_service=None, kwargs=None):
    """
    Set up the changeset log: database connection, hg cache, TUID service,
    work queues, and the four clogger worker threads.

    Before the workers start, csetLog is filled (via `_fill_in_range`) when
    it holds fewer than MINIMUM_PERMANENT_CSETS revisions. Any exception
    during setup is reported as a warning, not raised.

    :param conn: existing connection; otherwise `sql.Sql` is opened on
        `config.database.name`
    :param tuid_service: existing service; otherwise a `TUIDService` is
        created that shares this connection
    :param kwargs: configuration; `self.config` is narrowed to
        `kwargs.tuid` part-way through
    """
    try:
        self.config = kwargs

        self.conn = conn if conn else sql.Sql(self.config.database.name)
        self.hg_cache = HgMozillaOrg(
            kwargs=self.config.hg_cache, use_cache=True) if self.config.hg_cache else Null

        self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService(
            database=None, hg=None, kwargs=self.config, conn=self.conn, clogger=self)
        self.rev_locker = Lock()
        self.working_locker = Lock()

        self.init_db()
        # NEXT REVISION NUMBER: max(revnum)+1, OR 1 WHEN THE QUERY YIELDS NOTHING
        self.next_revnum = coalesce(
            self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1)
        self.csets_todo_backwards = Queue(
            name="Clogger.csets_todo_backwards")
        self.deletions_todo = Queue(name="Clogger.deletions_todo")
        self.maintenance_signal = Signal(name="Clogger.maintenance_signal")
        # FROM HERE ON self.config IS ONLY THE tuid SECTION
        self.config = self.config.tuid

        self.disable_backfilling = False
        self.disable_tipfilling = False
        self.disable_deletion = False
        self.disable_maintenance = False

        # Make sure we are filled before allowing queries
        numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
        if numrevs < MINIMUM_PERMANENT_CSETS:
            Log.note("Filling in csets to hold {{minim}} csets.",
                     minim=MINIMUM_PERMANENT_CSETS)
            # START FROM 'tip' UNLESS THE TABLE ALREADY NAMES A REVISION
            oldest_rev = 'tip'
            with self.conn.transaction() as t:
                tmp = t.query(
                    "SELECT min(revnum), revision FROM csetLog").data[0][1]
                if tmp:
                    oldest_rev = tmp
            self._fill_in_range(MINIMUM_PERMANENT_CSETS - numrevs,
                                oldest_rev,
                                timestamp=False)

        Log.note(
            "Table is filled with atleast {{minim}} entries. Starting workers...",
            minim=MINIMUM_PERMANENT_CSETS)

        Thread.run('clogger-tip', self.fill_forward_continuous)
        Thread.run('clogger-backfill', self.fill_backward_with_list)
        Thread.run('clogger-maintenance', self.csetLog_maintenance)
        Thread.run('clogger-deleter', self.csetLog_deleter)

        Log.note("Started clogger workers.")
    except Exception as e:
        # NOTE(review): other call sites pass the exception itself as cause;
        # here it is converted to text first - confirm this is intended
        Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))
def __init__(self, name, config):
    """
    Launch `mo_threads/python_worker.py` through the shell, send it the
    configuration, and start the `_daemon` and `_stderr` threads.

    :param name: name for the `Process` and for the response lock
    :param config: worker configuration; must not configure `debug.logs`
    """
    config = wrap(config)
    if config.debug.logs:
        Log.error("not allowed to configure logging on other process")

    script = os.sep.join(["mo_threads", "python_worker.py"])
    self.process = Process(name, [PYTHON, script], shell=True)

    worker_config = set_default({"debug": {"trace": True}}, config)
    self.process.stdin.add(value2json(worker_config))

    self.lock = Lock("wait for response from "+name)
    self.current_task = self.current_response = self.current_error = None

    self.daemon = Thread.run("", self._daemon)
    self.errors = Thread.run("", self._stderr)
def test_till_in_loop(self):
    """
    A loop that repeatedly waits on `Till | please_stop` must not pile up
    pending jobs on `please_stop`: after one second at most one job may be
    queued on the signal.
    """
    def loop(please_stop):
        counter = 0
        while not please_stop:
            (Till(seconds=0.001) | please_stop).wait()
            counter += 1
            Log.note("{{count}}", count=counter)

    please_stop = Signal("please_stop")
    Thread.run("loop", loop, please_stop=please_stop)
    try:
        Till(seconds=1).wait()
        with please_stop.lock:
            self.assertLessEqual(len(please_stop.job_queue), 1, "Expecting only one pending job on go")
    finally:
        # ALWAYS STOP THE LOOP THREAD, EVEN WHEN THE ASSERTION FAILS;
        # OTHERWISE IT KEEPS RUNNING AND LOGGING AFTER THE TEST
        please_stop.go()
def __init__(self, name):
    """
    Open (or create) `metadata.<name>.sqlite`, load columns from it, and
    start the thread that runs `self._db_worker`.

    :param name: used in the database file name and the thread name
    """
    Table.__init__(self, "meta.columns")
    self.db_file = File("metadata." + name + ".sqlite")
    # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
    self.data = {}
    self.locker = Lock()
    self._schema = None

    connect_args = dict(
        database=self.db_file.abspath,
        check_same_thread=False,
        isolation_level=None,
    )
    self.db = sqlite3.connect(**connect_args)
    self.last_load = Null
    # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
    self.todo = Queue("update columns to db")

    self._db_load()
    worker_name = "update " + name
    Thread.run(worker_name, self._db_worker)
def __init__(self, message="ping", every="second", start=None, until=None):
    """
    Configure a repeating action and, when `start` is given, launch the
    thread that runs `_repeat`.

    :param message: text (wrapped by `show_message`) or any other object,
        which is stored as-is
    :param every: interval, converted with `Duration`
    :param start: when truthy, converted with `Date` and the thread is started
    :param until: a Signal to use as the stop signal, None for a fresh
        Signal, or a duration after which to stop
    """
    if is_text(message):
        self.message = show_message(message)
    else:
        self.message = message

    self.every = Duration(every)

    if isinstance(until, Signal):
        self.please_stop = until
    elif until == None:
        self.please_stop = Signal()
    else:
        # A DURATION IS A RELATIVE DELAY, SO IT MUST GO IN AS seconds=;
        # PASSED POSITIONALLY IT WOULD NOT BE TREATED AS A DELAY FROM NOW
        self.please_stop = Till(seconds=Duration(until).seconds)

    self.thread = None
    if start:
        self.thread = Thread.run(
            "repeat",
            _repeat,
            self.message,
            self.every,
            Date(start),
            parent_thread=MAIN_THREAD,
            please_stop=self.please_stop,
        ).release()
def setup(
    self,
    instance,   # THE boto INSTANCE OBJECT FOR THE MACHINE TO SETUP
    utility     # THE utility OBJECT FOUND IN CONFIG
):
    """
    Provision one machine for ETL on a worker thread, bounded by
    `self.settings.setup_timeout`.

    The steps run in order: fabric config, package update, ETL code, grcov,
    private config file, supervisor. A failed package update is logged as a
    warning and ends the setup early. If the worker thread has not stopped
    when the timeout expires, `Log.error` is called.
    """
    with self.locker:
        if not self.settings.setup_timeout:
            Log.error("expecting instance.setup_timeout to prevent setup from locking")

        def worker(please_stop):
            cpu_count = int(round(utility.cpu))

            with hide('output'):
                Log.note("setup {{instance}}", instance=instance.id)
                self._config_fabric(instance)
                Log.note("update packages on {{instance}} ip={{ip}}", instance=instance.id, ip=instance.ip_address)
                try:
                    self._update_ubuntu_packages()
                except Exception as e:
                    # GIVE UP ON THIS INSTANCE; THE REMAINING STEPS ARE SKIPPED
                    Log.warning("Can not setup {{instance}}, type={{type}}", instance=instance.id, type=instance.instance_type, cause=e)
                    return
                Log.note("setup etl on {{instance}}", instance=instance.id)
                self._setup_etl_code()
                Log.note("setup grcov on {{instance}}", instance=instance.id)
                self._setup_grcov()
                Log.note("add config file on {{instance}}", instance=instance.id)
                self._add_private_file()
                Log.note("setup supervisor on {{instance}}", instance=instance.id)
                self._setup_etl_supervisor(cpu_count)
                Log.note("setup done {{instance}}", instance=instance.id)

        worker_thread = Thread.run("etl setup started at "+unicode(Date.now().format()), worker)
        # WAIT FOR WHICHEVER COMES FIRST: THE TIMEOUT OR THE WORKER FINISHING
        (Till(timeout=Duration(self.settings.setup_timeout).seconds) | worker_thread.stopped).wait()
        if not worker_thread.stopped:
            Log.error("critical failure in thread {{name|quote}}", name=worker_thread.name)
        worker_thread.join()
def __init__(self, host, index, port=9200, type="log", max_size=1000, batch_size=100, kwargs=None):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX

    Gets or creates the index, adds its alias, applies retry defaults to
    the index settings, and starts the thread running `self._insert_loop`.
    """
    cluster = Cluster(kwargs)
    schema = mo_json.json2value(value2json(SCHEMA), leaves=True)
    self.es = cluster.get_or_create_index(
        schema=schema,
        limit_replicas=True,
        tjson=True,
        kwargs=kwargs
    )
    self.batch_size = batch_size

    alias = coalesce(kwargs.alias, kwargs.index)
    self.es.add_alias(alias)
    self.queue = Queue("debug logs to es", max=max_size, silent=True)

    # RETRY DEFAULTS: 3 TIMES, ONE MINUTE APART
    retry_times = coalesce(self.es.settings.retry.times, 3)
    self.es.settings.retry.times = retry_times
    retry_sleep = coalesce(self.es.settings.retry.sleep, MINUTE)
    self.es.settings.retry.sleep = Duration(retry_sleep)

    Thread.run("add debug logs to es", self._insert_loop)
def test_thread_wait(self):
    """
    Start 100 threads that each record a value, wait on the lock, then
    record the value again; every value must appear in both phases.
    """
    count = 100
    locker = Lock("test")
    phase1 = []
    phase2 = []

    def work(value, please_stop):
        with locker:
            phase1.append(value)
            locker.wait()
            phase2.append(value)

    # THREADS ARE CREATED WHILE THE MAIN THREAD HOLDS THE LOCK
    threads = []
    with locker:
        for i in range(count):
            threads.append(Thread.run(unicode(i), work, i))

    # CONTINUE TO USE THE locker SO WAITS GET TRIGGERED
    while len(phase2) < count:
        with locker:
            pass

    for t in threads:
        t.join()

    size_message = "expecting "+unicode(count)+" items"
    self.assertEqual(len(phase1), count, size_message)
    self.assertEqual(len(phase2), count, size_message)

    for i in range(count):
        value_message = "expecting "+unicode(i)
        self.assertTrue(i in phase1, value_message)
        self.assertTrue(i in phase2, value_message)
    Log.note("done")
def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    kwargs=None,
):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX

    Applies timeout/retry defaults to `kwargs`, picks one host from `host`
    at random, marks "~N~" as nested in the schema, gets or creates the
    index, and starts the `_insert_loop` worker thread.
    """
    timeout = Duration(coalesce(kwargs.timeout, "30second"))
    kwargs.timeout = timeout.seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    retry_sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE))
    kwargs.retry.sleep = retry_sleep.seconds

    # host MAY BE ONE HOST OR A LIST OF HOSTS; USE ONE OF THEM
    hosts = listwrap(host)
    kwargs.host = Random.sample(hosts, 1)[0]

    schema = json2value(value2json(SCHEMA), leaves=True)
    schema.mappings[type].properties["~N~"].type = "nested"

    cluster = Cluster(kwargs)
    self.es = cluster.get_or_create_index(
        schema=schema,
        limit_replicas=True,
        typed=True,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
    self.queue = Queue("debug logs to es", max=queue_size, silent=True)

    self.worker = Thread.run("add debug logs to es", self._insert_loop)
def test_loop(self):
    """
    A worker thread appends a timestamp every 0.1 seconds until stopped;
    the test waits for ten entries, stops the worker, and checks the count.
    """
    acc = []
    started = Signal()

    def work(please_stop):
        started.go()
        while not please_stop:
            acc.append(Date.now().unix)
            Till(seconds=0.1).wait()

    worker = Thread.run("loop", work)
    started.wait()

    # POLL UNTIL THE WORKER HAS RECORDED AT LEAST TEN ENTRIES
    while len(acc) < 10:
        Till(seconds=0.1).wait()

    worker.stop()
    worker.join()

    num = len(acc)
    message = "Expecting some reasonable number of entries to prove there was looping, not " + text(num)
    self.assertGreater(num, 9, message)
def __init__(self, es_cluster):
    """
    Initialise the column store, load it via `_db_load`, and start the
    thread that runs `self._update_from_es` under MAIN_THREAD.

    :param es_cluster: kept as `self.es_cluster`
    """
    Table.__init__(self, META_COLUMNS_NAME)

    # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
    self.data = {}
    self.locker = Lock()
    self._schema = None
    self.dirty = False

    self.es_cluster = es_cluster
    self.es_index = None
    self.last_load = Null

    # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
    self.todo = Queue("update columns to es")

    self._db_load()
    thread_name = "update " + META_COLUMNS_NAME
    Thread.run(thread_name, self._update_from_es, parent_thread=MAIN_THREAD)
def _find_revision(self, revision):
    """
    Look for `revision` on the default-locale try, mozilla-inbound and
    autoland branches, using three threads that share one queue of branches.

    :param revision: changeset id to look for
    :return: list of the results of `self.get_revision` for every branch
        where the lookup succeeded
    """
    locker = Lock()
    output = []
    queue = Queue("branches", max=2000)
    queue.extend(b for b in self.branches if b.locale == DEFAULT_LOCALE and b.name in ["try", "mozilla-inbound", "autoland"])
    queue.add(THREAD_STOP)

    problems = []  # FAILED LOOKUPS ARE COLLECTED HERE, NOT RAISED

    def _find(please_stop):
        for b in queue:
            if please_stop:
                return
            try:
                url = b.url + "json-info?node=" + revision
                rev = self.get_revision(Revision(branch=b, changeset={"id": revision}))
                with locker:
                    output.append(rev)
                Log.note("Revision found at {{url}}", url=url)
            except Exception as f:
                problems.append(f)

    # DO NOT PASS please_stop=False: A PLAIN bool CAN NEVER BE TRIGGERED AND
    # IS NOT A Signal. Thread.run SUPPLIES A STOP SIGNAL WHEN NONE IS GIVEN
    threads = []
    for i in range(3):
        threads.append(Thread.run("find changeset " + text_type(i), _find))

    for t in threads:
        with assert_no_exception:
            t.join()

    return output
def _find_revision(self, revision):
    """
    Search a fixed set of branches (try, mozilla-inbound, autoland, default
    locale only) for `revision`.

    Three worker threads consume the branches from a shared queue. Each
    successful `self.get_revision` result is appended to the output; each
    failure is recorded in a local list and otherwise ignored.

    :param revision: changeset id to look for
    :return: list of found revisions (possibly empty)
    """
    wanted = ["try", "mozilla-inbound", "autoland"]
    locker = Lock()
    output = []
    problems = []

    queue = Queue("branches", max=2000)
    queue.extend(
        b
        for b in self.branches
        if b.locale == DEFAULT_LOCALE and b.name in wanted
    )
    queue.add(THREAD_STOP)

    def _find(please_stop):
        for b in queue:
            if please_stop:
                return
            try:
                url = b.url + "json-info?node=" + revision
                rev = self.get_revision(Revision(branch=b, changeset={"id": revision}))
                with locker:
                    output.append(rev)
                Log.note("Revision found at {{url}}", url=url)
            except Exception as f:
                problems.append(f)

    # EACH THREAD GETS ITS OWN STOP SIGNAL FROM Thread.run; HANDING IN THE
    # CONSTANT False WOULD LEAVE THE THREAD WITH A bool WHERE A Signal IS
    # EXPECTED, AND THE please_stop CHECK IN _find COULD NEVER FIRE
    threads = [
        Thread.run("find changeset " + text_type(i), _find)
        for i in range(3)
    ]

    for t in threads:
        with assert_no_exception:
            t.join()

    return output
def setup_flask_ssl():
    """
    Start a second Flask server on port 443 using the configured SSL
    context, then clear `config.flask.ssl_context`.

    NOTE(review): the first statement sets `config.flask.ssl_context` to
    None and the guard right after it returns when it is falsy, so as
    written nothing below the guard runs. This looks like a deliberate
    switch-off - confirm before removing it.
    """
    config.flask.ssl_context = None

    if not config.flask.ssl_context:
        return

    # THE SSL SERVER USES A COPY OF THE FLASK SETTINGS, ON PORT 443
    ssl_flask = config.flask.copy()
    ssl_flask.debug = False
    ssl_flask.port = 443

    if is_data(config.flask.ssl_context):
        # EXPECTED PEM ENCODED FILE NAMES
        # `load_cert_chain` REQUIRES CONCATENATED LIST OF CERTS
        with TempFile() as tempfile:
            try:
                tempfile.write(
                    File(ssl_flask.ssl_context.certificate_file).read_bytes())
                if ssl_flask.ssl_context.certificate_chain_file:
                    tempfile.write(
                        File(ssl_flask.ssl_context.certificate_chain_file).
                        read_bytes())
                tempfile.flush()
                tempfile.close()

                context = SSLContext(PROTOCOL_SSLv23)
                context.load_cert_chain(
                    tempfile.name,
                    keyfile=File(
                        ssl_flask.ssl_context.privatekey_file).abspath)

                # REPLACE THE FILE-NAME SETTINGS WITH THE BUILT CONTEXT
                ssl_flask.ssl_context = context
            except Exception as e:
                Log.error("Could not handle ssl context construction", cause=e)

    def runner(please_stop):
        Log.warning("ActiveData listening on encrypted port {{port}}",
                    port=ssl_flask.port)
        flask_app.run(**ssl_flask)

    Thread.run("SSL Server", runner)

    if config.flask.ssl_context and config.flask.port != 80:
        Log.warning(
            "ActiveData has SSL context, but is still listening on non-encrypted http port {{port}}",
            port=config.flask.port)

    config.flask.ssl_context = None
def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    refresh_interval="1second",
    kwargs=None,
):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX

    Applies timeout/retry defaults to `kwargs`, picks one host at random,
    builds the schema, creates a `RolloverIndex` keyed on the record's
    timestamp, and starts the `_insert_loop` worker thread.
    `refresh_interval` is not read by this constructor.
    """
    timeout = Duration(coalesce(kwargs.timeout, "30second"))
    kwargs.timeout = timeout.seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    retry_sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE))
    kwargs.retry.sleep = retry_sleep.seconds
    kwargs.host = randoms.sample(listwrap(host), 1)[0]

    # interval AND max DEFAULT TO EACH OTHER, THEN TO "year"
    rollover = kwargs.rollover
    rollover_interval = coalesce(rollover.interval, rollover.max, "year")
    rollover_max = coalesce(rollover.max, rollover.interval, "year")

    nested_mapping = {"mappings": {kwargs.type: {"properties": {"~N~": {"type": "nested"}}}}}
    schema = set_default(
        kwargs.schema,
        nested_mapping,
        json2value(value2json(SCHEMA), leaves=True),
    )

    timestamp_field = {"get": [{"first": "."}, {"literal": "timestamp"}]}
    self.es = RolloverIndex(
        rollover_field=timestamp_field,
        rollover_interval=rollover_interval,
        rollover_max=rollover_max,
        schema=schema,
        limit_replicas=True,
        typed=True,
        read_only=False,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.queue = Queue("debug logs to es", max=queue_size, silent=True)

    self.worker = Thread.run("add debug logs to es", self._insert_loop)
def update_local_database(config, deviant_summary, candidates, since):
    """
    Refresh the local summary table for the candidate signatures.

    Signatures missing from the table come first, followed by those whose
    `last_updated` is older than LOCAL_RETENTION; the list is capped at
    `config.display.download_limit` (default 100) and processed by three
    threads. Does nothing when `deviant_summary` is a bigquery table.

    :param config: provides `display.download_limit` and `database`
    :param deviant_summary: table to query and to pass on to `process`
    :param candidates: provides `signature_hash`
    :param since: passed through to `process`
    """
    if isinstance(deviant_summary, bigquery.Table):
        Log.note("Only the ETL process should fill the bigquery table")
        return

    # GET EVERYTHING WE HAVE SO FAR
    where = {
        "and": [
            {"in": {"signature_hash": candidates.signature_hash}},
            {"exists": "num_pushes"},
        ]
    }
    result = deviant_summary.query({
        "select": ["signature_hash", "last_updated"],
        "where": where,
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    })
    exists = result.data

    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.signature_hash) - set(exists.signature_hash))

    too_old = Date.today() - parse(LOCAL_RETENTION)
    stale = []
    for e in exists:
        if e.last_updated < too_old.unix:
            stale.append(e.signature_hash)
    needs_update = missing + stale
    Log.alert("{{num}} series are candidates for local update", num=len(needs_update))

    download_limit = coalesce(config.display.download_limit, 100)
    limited_update = Queue("sigs")
    limited_update.extend(left(needs_update, download_limit))
    Log.alert("Updating local database with {{num}} series", num=len(limited_update))

    def loop(please_stop):
        # EACH THREAD TAKES SIGNATURES UNTIL THE QUEUE HAS NONE LEFT
        while not please_stop:
            signature_hash = limited_update.pop_one()
            if not signature_hash:
                return
            process(
                signature_hash,
                since,
                source=config.database,
                deviant_summary=deviant_summary,
            )

    with Timer("Updating local database"):
        threads = []
        for i in range(3):
            threads.append(Thread.run(text(i), loop))
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
    """
    Prepare the metadata namespace for an ES cluster and start the
    "refresh metadata" thread.

    A second call on an instance that already has `settings` is a no-op.
    The thread runs `self.monitor` when ENABLE_META_SCAN is truthy and
    `self.not_monitor` otherwise.
    """
    if hasattr(self, "settings"):
        return

    self.too_old = TOO_OLD
    self.settings = kwargs
    self.default_name = coalesce(name, alias, index)
    self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
    self.index_does_not_exist = set()
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.index_to_alias = Relation_usingList()

    self.es_metadata = Null
    self.metadata_last_updated = Date.now() - OLD_METADATA

    self.meta = Data()
    self.meta.columns = ColumnList()

    # BOTH META TABLES HAVE ONLY THE ROOT QUERY PATH
    self.alias_to_query_paths = {
        "meta.columns": [['.']],
        "meta.tables": [['.']]
    }
    self.alias_last_updated = {
        "meta.columns": Date.now(),
        "meta.tables": Date.now()
    }

    table_columns = metadata_tables()
    no_tables = []
    self.meta.tables = ListContainer(
        "meta.tables",
        no_tables,
        jx_base.Schema(".", table_columns)
    )
    self.meta.columns.extend(table_columns)

    # TODO: fix monitor so it does not bring down ES
    if ENABLE_META_SCAN:
        target = self.monitor
    else:
        target = self.not_monitor
    self.worker = Thread.run("refresh metadata", target)
def __init__(
    self,
    hg=None,  # CONNECT TO hg
    repo=None,  # CONNECTION INFO FOR ES CACHE
    branches=None,  # CONNECTION INFO FOR ES CACHE
    use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
    timeout=30 * SECOND,
    kwargs=None
):
    """
    Connect to hg and, when branch-cache settings are given, to the ES cache.

    Without `branches` the instance runs cache-less: self.es is None and no
    background threads are started. Otherwise the ES index is opened (or
    created), and the "setup_es" and "hg daemon" threads are started.

    :param hg: connection info for hg (read here through kwargs as self.settings.hg)
    :param repo: connection info for the ES cache; revision_schema is set as its default schema
    :param branches: connection info for the branch cache; None means no ES cache
    :param use_cache: not read directly here
    :param timeout: stored on self.timeout as a Duration
    :param kwargs: all settings; stored as self.settings
    """
    if not _hg_branches:
        _late_imports()

    self.es_locker = Lock()
    self.todo = mo_threads.Queue("todo for hg daemon", max=DAEMON_QUEUE_SIZE)

    self.settings = kwargs
    # ALWAYS A Duration, NO MATTER WHICH PATH BELOW IS TAKEN
    self.timeout = Duration(timeout)

    # VERIFY CONNECTIVITY
    with Explanation("Test connect with hg"):
        http.head(self.settings.hg.url)

    # `== None` IS KEPT ON PURPOSE, IT IS NOT THE SAME TEST AS `is None`
    # NOTE(review): presumably this lets a Null-like branches value count as missing - confirm
    if branches == None:
        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        self.es = None
        return

    self.last_cache_miss = Date.now()

    set_default(repo, {"schema": revision_schema})
    self.es = elasticsearch.Cluster(kwargs=repo).get_or_create_index(kwargs=repo)

    def setup_es(please_stop):
        # BEST EFFORT: FAILURES ARE DELIBERATELY IGNORED
        with suppress_exception:
            self.es.add_alias()

        with suppress_exception:
            self.es.set_refresh_interval(seconds=1)

    Thread.run("setup_es", setup_es)
    self.branches = _hg_branches.get_branches(kwargs=kwargs)

    Thread.run("hg daemon", self._daemon)
def _setup():
    """
    Build the shared fixture: a Sqlite db holding an empty `my_table`, plus
    two threads ("a" and "b") running _work, each given its own list of four
    begin/done Signal pairs.

    :return: (db, threads, signals); threads and signals are keyed by thread name
    """
    threads, signals = Data(), Data()
    db = Sqlite()
    db.query("CREATE TABLE my_table (value TEXT)")

    for name in ("a", "b"):
        steps = []
        for _ in range(4):
            steps.append({"begin": Signal(), "done": Signal()})
        signals[name] = steps
        # HAND OVER THE VALUE AS READ BACK FROM signals, NOT THE RAW LIST
        threads[name] = Thread.run(name, _work, name, db, signals[name])

    return db, threads, signals
def capture_termination_signal(please_stop):
    """
    START A THREAD THAT POLLS THE SPOT termination-time METADATA URL AND
    TRIGGERS please_stop WHEN THE RESPONSE STATUS IS NEITHER 400 NOR 404

    :param please_stop: signal to trigger; also ends the polling thread
    """
    termination_url = "http://169.254.169.254/latest/meta-data/spot/termination-time"
    unreachable_markers = (
        "Failed to establish a new connection: [Errno 10060]",
        "A socket operation was attempted to an unreachable network",
    )

    def worker(please_stop):
        while not please_stop:
            try:
                response = requests.get(termination_url)
                if response.status_code not in (400, 404):
                    Log.alert(
                        "Shutdown AWS Spot Node {{name}} {{type}}",
                        name=machine_metadata.name,
                        type=machine_metadata.aws_instance_type
                    )
                    please_stop.go()
            except Exception as cause:
                error = Except.wrap(cause)
                if any(marker in error for marker in unreachable_markers):
                    # METADATA ADDRESS IS UNREACHABLE: STOP POLLING FOR GOOD
                    Log.note("AWS Spot Detection has shutdown, probably not a spot node, (http://169.254.169.254 is unreachable)")
                    return
                Log.warning("AWS shutdown detection has problems", cause=error)
                # EXTRA PAUSE AFTER AN UNEXPECTED FAILURE
                (Till(seconds=61) | please_stop).wait()
            (Till(seconds=11) | please_stop).wait()

    Thread.run("listen for termination", worker, please_stop=please_stop)
def __init__(self, filename=None, db=None, upgrade=True):
    """
    :param filename: stored on self.filename
    :param db: Optional, wrap a sqlite db in a thread
    :param upgrade: when True, run the module-level _upgrade() if it has not run yet
    :return: Multithread-safe database
    """
    if upgrade and not _upgraded:
        _upgrade()

    self.filename = filename
    self.db = db
    self.get_trace = DEBUG
    self.upgrade = upgrade
    self.queue = Queue("sql commands")  # HOLDS THE COMMAND TUPLES WAITING FOR THE WORKER
    # START THE WORKER LAST, SO IT NEVER SEES A PARTLY-INITIALIZED INSTANCE
    self.worker = Thread.run("sqlite db thread", self._worker)
def query(self, command):
    """
    QUEUE command FOR THE DB THREAD AND BLOCK UNTIL IT HAS BEEN HANDLED
    :param command: COMMAND FOR SQLITE
    :return: THE result OBJECT GIVEN TO THE WORKER (list OF RESULTS, PER THE ORIGINAL DOCS)
    """
    # START THE WORKER ON DEMAND
    if not self.worker:
        self.worker = Thread.run("sqlite db thread", self._worker)

    outcome = Data()
    done = Signal()
    request = (command, outcome, done, None)
    self.queue.add(request)
    done.wait()

    failure = outcome.exception
    if failure:
        Log.error("Problem with Sqlite call", cause=outcome.exception)
    return outcome
def _start_life_cycle_watcher(self):
    """
    Start the "lifecycle watcher" thread and keep it on self.watcher.

    The thread repeatedly: sets up running instances that have no "Name"
    tag, cancels spot requests in give-up states, and stops once nothing is
    pending. Its please_stop is a Till that fires 60 seconds before
    settings.run_interval has elapsed.
    """
    def life_cycle_watcher(please_stop):
        # SETUP ERRORS COLLECTED PER SPOT REQUEST ID, KEPT ACROSS LOOP PASSES
        failed_attempts=Data()

        while not please_stop:
            spot_requests = self._get_managed_spot_requests()
            last_get = Date.now()
            instances = wrap({i.id: i for r in self.ec2_conn.get_all_instances() for i in r.instances})

            # INSTANCES THAT REQUIRE SETUP
            # NOTE(review): this dict is re-created on every pass of the while
            # loop, so a deadline recorded below does not survive to the next
            # pass - confirm that is intended
            time_to_stop_trying = {}
            please_setup = [
                (i, r) for i, r in [(instances[r.instance_id], r) for r in spot_requests]
                if i.id and not i.tags.get("Name") and i._state.name == "running" and Date.now() > Date(i.launch_time) + DELAY_BEFORE_SETUP
            ]
            for i, r in please_setup:
                try:
                    p = self.settings.utility[i.instance_type]
                    if p == None:
                        # UNKNOWN INSTANCE TYPE: TERMINATE IT; Log.error IS
                        # CALLED EVEN WHEN THE TERMINATION ITSELF FAILS
                        try:
                            self.ec2_conn.terminate_instances(instance_ids=[i.id])
                            with self.net_new_locker:
                                self.net_new_spot_requests.remove(r.id)
                        finally:
                            Log.error("Can not setup unknown {{instance_id}} of type {{type}}", instance_id=i.id, type=i.instance_type)
                    i.markup = p
                    try:
                        self.instance_manager.setup(i, coalesce(p, 0))
                    except Exception as e:
                        e = Except.wrap(e)
                        failed_attempts[r.id] += [e]
                        Log.error(ERROR_ON_CALL_TO_SETUP, e)
                    # SETUP SUCCEEDED: TAG THE INSTANCE AND STOP TRACKING THE REQUEST
                    i.add_tag("Name", self.settings.ec2.instance.name + " (running)")
                    with self.net_new_locker:
                        self.net_new_spot_requests.remove(r.id)
                except Exception as e:
                    if not time_to_stop_trying.get(i.id):
                        time_to_stop_trying[i.id] = Date.now() + TIME_FROM_RUNNING_TO_LOGIN
                    if Date.now() > time_to_stop_trying[i.id]:
                        # FAIL TO SETUP AFTER x MINUTES, THEN TERMINATE INSTANCE
                        self.ec2_conn.terminate_instances(instance_ids=[i.id])
                        with self.net_new_locker:
                            self.net_new_spot_requests.remove(r.id)
                        Log.warning("Problem with setup of {{instance_id}}. Time is up. Instance TERMINATED!", instance_id=i.id, cause=e)
                    elif "Can not setup unknown " in e:
                        Log.warning("Unexpected failure on startup", instance_id=i.id, cause=e)
                    elif ERROR_ON_CALL_TO_SETUP in e:
                        # ONLY WARN ONCE MORE THAN TWO FAILURES ARE RECORDED FOR THIS REQUEST
                        if len(failed_attempts[r.id]) > 2:
                            Log.warning("Problem with setup() of {{instance_id}}", instance_id=i.id, cause=failed_attempts[r.id])
                    else:
                        Log.warning("Unexpected failure on startup", instance_id=i.id, cause=e)

            if Date.now() - last_get > 5 * SECOND:
                # REFRESH STALE
                spot_requests = self._get_managed_spot_requests()
                last_get = Date.now()

            # SORT THE REQUESTS BY STATUS CODE
            pending = wrap([r for r in spot_requests if r.status.code in PENDING_STATUS_CODES])
            give_up = wrap([r for r in spot_requests if r.status.code in PROBABLY_NOT_FOR_A_WHILE | TERMINATED_STATUS_CODES])
            ignore = wrap([r for r in spot_requests if r.status.code in MIGHT_HAPPEN])  # MIGHT HAPPEN, BUT NO NEED TO WAIT FOR IT

            if self.done_spot_requests:
                with self.net_new_locker:
                    expired = Date.now() - self.settings.run_interval + 2 * MINUTE
                    for ii in list(self.net_new_spot_requests):
                        if Date(ii.create_time) < expired:
                            ## SOMETIMES REQUESTS NEVER GET INTO THE MAIN LIST OF REQUESTS
                            self.net_new_spot_requests.remove(ii)

                    for g in give_up:
                        self.net_new_spot_requests.remove(g.id)

                    for g in ignore:
                        self.net_new_spot_requests.remove(g.id)

                    # ALSO WAIT ON THE REQUESTS STILL BEING TRACKED AS NEW
                    pending = UniqueIndex(("id",), data=pending)
                    pending = pending | self.net_new_spot_requests

                if give_up:
                    self.ec2_conn.cancel_spot_instance_requests(request_ids=give_up.id)
                    Log.note("Cancelled spot requests {{spots}}, {{reasons}}", spots=give_up.id, reasons=give_up.status.code)

            if not pending and not time_to_stop_trying and self.done_spot_requests:
                Log.note("No more pending spot requests")
                please_stop.go()
                break
            elif pending:
                Log.note("waiting for spot requests: {{pending}}", pending=[p.id for p in pending])

            # PAUSE BEFORE THE NEXT PASS, UNLESS ASKED TO STOP
            (Till(seconds=10) | please_stop).wait()

        Log.note("life cycle watcher has stopped")

    # Log.warning("lifecycle watcher is disabled")
    timeout = Till(seconds=self.settings.run_interval.seconds - 60)
    self.watcher = Thread.run("lifecycle watcher", life_cycle_watcher, please_stop=timeout)
def __exit__(self, exc_type, exc_val, exc_tb):
    """
    Leave the context by starting a thread that runs delete_daemon on this
    file. The three exception arguments are not inspected, and nothing is
    returned.
    """
    thread_name = "delete file " + self.name
    Thread.run(
        thread_name,
        delete_daemon,
        file=self,
        caller_stack=extract_stack(1),
    )
Log.note("Skipping try revision.") queue.commit() continue now = Date.now().unix if time_offset is None: time_offset = now - request.meta.request_time next_request = request.meta.request_time + time_offset if next_request > now: Log.note("Next request in {{wait_time}}", wait_time=Duration(seconds=next_request - now)) Till(till=next_request).wait() Thread.run("request "+text_type(request_count), one_request, request) request_count += 1 queue.commit() if __name__ == '__main__': try: tmp_signal = Signal() config = startup.read_settings() constants.set(config.constants) Log.start(config.debug) queue_consumer(kwargs=config, please_stop=tmp_signal) worker = Thread.run("sqs consumer", queue_consumer, kwargs=config) MAIN_THREAD.wait_for_shutdown_signal(allow_exit=True, please_stop=worker.stopped) except BaseException as e: Log.error("Serious problem with consumer construction! Shutdown!", cause=e)
def start_backfilling(self):
    """Start the 'clogger-backfill' thread, unless self.backfill_thread is already set."""
    if self.backfill_thread:
        return
    self.backfill_thread = Thread.run(
        'clogger-backfill',
        self.fill_backward_with_list,
    )
def start_tipfillling(self):
    """Start the 'clogger-tip' thread, unless self.tipfill_thread is already set."""
    if self.tipfill_thread:
        return
    self.tipfill_thread = Thread.run(
        'clogger-tip',
        self.fill_forward_continuous,
    )
def start_maintenance(self):
    """Start the 'clogger-maintenance' thread, unless self.maintenance_thread is already set."""
    if self.maintenance_thread:
        return
    self.maintenance_thread = Thread.run(
        'clogger-maintenance',
        self.csetLog_maintenance,
    )
def es_deepop(es, query):
    """
    Run `query` against a nested path in Elasticsearch and format the result.

    Builds the ES query from the where clause (split by depth), the sort and
    the select clause, posts it, then flattens every inner hit into a row.
    When the where clause has no nested conditions, a second request is made
    on a separate thread to fetch documents that have no nested records.

    :param es: passed to es_post() for both requests
    :param query: the query; frum.schema, where, select, sort, limit and format are read
    :return: output of the formatter chosen by query.format, with meta filled in
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for i, f in enumerate(es_filters):
        script = AndOp("and", wheres[i]).partial_eval().to_esfilter(schema)
        set_default(f, script)

    if not wheres[1]:
        # WITHOUT NESTED CONDITIONS, WE MUST ALSO RETURN DOCS WITH NO NESTED RECORDS
        more_filter = {
            "and": [
                es_filters[0],
                {"missing": {"field": untype_path(query_path) + "." + EXISTS_TYPE}}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    # es_query.sort = jx_sort_to_es_sort(query.sort)
    map_to_es_columns = schema.map_to_es()
    # {c.names["."]: c.es_column for c in schema.leaves(".")}
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = FlatList()

    # i IS THE OUTPUT COLUMN INDEX RECORDED IN EACH "put"
    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp) and isinstance(s.value.term, Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(s.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.jx_type == NESTED:
                        continue
                    es_query.fields += [c.es_column]
                c_name = untype_path(c.names[query_path])
                col_names.add(c_name)
                new_select.append({
                    "name": concat_field(s.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {"name": concat_field(s.name, literal_field(c_name)), "index": i, "child": "."},
                    "pull": get_pull_function(c)
                })
                i += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif isinstance(s.value, Variable):
            net_columns = schema.leaves(s.value.var)
            if not net_columns:
                # NO MATCHING COLUMNS: STILL EMIT THE COLUMN, PULLING NULL
                new_select.append({
                    "name": s.name,
                    "nested_path": ".",
                    "put": {"name": s.name, "index": i, "child": "."},
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    pull = get_pull_function(n)
                    if n.nested_path[0] == ".":
                        if n.jx_type == NESTED:
                            continue
                        es_query.fields += [n.es_column]

                    # WE MUST FIGURE OUT WHICH NAMESPACE s.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(n.names[np])
                        if startswith_field(c_name, s.value.var):
                            child = relative_field(c_name, s.value.var)
                            break
                    else:
                        child = relative_field(untype_path(n.names[n.nested_path[0]]), s.value.var)

                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": s.name,
                            "index": i,
                            "child": child
                        }
                    })
            # ALL COLUMNS OF ONE VARIABLE SHARE THE SAME INDEX
            i += 1
        else:
            expr = s.value
            for v in expr.vars():
                for c in schema[v.var]:
                    if c.nested_path[0] == ".":
                        es_query.fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            # THE EXPRESSION IS COMPILED HERE AND EVALUATED PER HIT IN inners()
            pull_name = EXPRESSION_PREFIX + s.name
            map_to_local = MapToLocal(schema)
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = compile_expression(expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(es_post(
            es,
            Data(
                query={"filtered": {"filter": more_filter}},
                fields=es_query.fields
            ),
            query.limit
        ))
    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            # WAIT FOR THE SECOND REQUEST BEFORE READING ITS RESULT
            # NOTE(review): more[0] assumes get_more() appended a result - confirm what happens when it fails
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    #</COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def start_deleter(self):
    """Start the 'clogger-deleter' thread, unless self.deletion_thread is already set."""
    if self.deletion_thread:
        return
    self.deletion_thread = Thread.run(
        'clogger-deleter',
        self.csetLog_deleter,
    )
def es_deepop(es, query):
    """
    Run `query` against a nested path in Elasticsearch and format the result.

    Builds the ES query from the where clause (split by depth), the sort and
    the select clause, posts it, then flattens every inner hit into a row.
    When the where clause has no nested conditions, a second request is made
    on a separate thread for documents that do not match the nested path.

    :param es: passed to es09.util.post() for both requests
    :param query: the query; frum, where, select, sort, limit and format are read
    :return: output of the formatter chosen by query.format, with meta filled in
    """
    schema = query.frum.schema
    columns = schema.columns
    query_path = schema.query_path

    # VARIABLE NAME -> PULL FOR THE FIRST COLUMN OF THAT NAME; USED TO REWRITE EXPRESSIONS
    map_to_local = {k: get_pull(c[0]) for k, c in schema.lookup.items()}

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es14.util.es_query_template(query.frum.name)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for i, f in enumerate(es_filters):
        # PROBLEM IS {"match_all": {}} DOES NOT SURVIVE set_default()
        for k, v in unwrap(simplify_esfilter(AndOp("and", wheres[i]).to_esfilter())).items():
            f[k] = v

    if not wheres[1]:
        # NO NESTED CONDITIONS: PREPARE A FILTER FOR DOCS THAT DO NOT MATCH THE NESTED PATH
        more_filter = {
            "and": [
                simplify_esfilter(AndOp("and", wheres[0]).to_esfilter()),
                {"not": {
                    "nested": {
                        "path": query_path,
                        "filter": {
                            "match_all": {}
                        }
                    }
                }}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort)
    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = FlatList()

    # i IS THE OUTPUT COLUMN INDEX RECORDED IN EACH "put"
    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp):
            if isinstance(s.value.term, Variable):
                if s.value.term.var == ".":
                    # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
                    for c in columns:
                        if c.type not in STRUCT and c.es_column != "_id":
                            if c.nested_path[0] == ".":
                                es_query.fields += [c.es_column]
                            new_select.append({
                                "name": c.names[query_path],
                                "pull": get_pull(c),
                                "nested_path": c.nested_path[0],
                                "put": {"name": literal_field(c.names[query_path]), "index": i, "child": "."}
                            })
                            i += 1

                    # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
                    col_names = set(c.names[query_path] for c in columns)
                    for n in new_select:
                        if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                            n.name = n.name.lstrip(".")
                            n.put.name = literal_field(n.name)
                            col_names.add(n.name)
                else:
                    # LEAVES OF A NAMED VARIABLE: EVERY NON-STRUCT COLUMN UNDER ITS PREFIX
                    prefix = schema[s.value.term.var][0].names["."] + "."
                    prefix_length = len(prefix)
                    for c in columns:
                        cname = c.names["."]
                        if cname.startswith(prefix) and c.type not in STRUCT:
                            pull = get_pull(c)
                            if c.nested_path[0] == ".":
                                es_query.fields += [c.es_column]

                            new_select.append({
                                "name": s.name + "." + cname[prefix_length:],
                                "pull": pull,
                                "nested_path": c.nested_path[0],
                                "put": {
                                    "name": s.name + "." + literal_field(cname[prefix_length:]),
                                    "index": i,
                                    "child": "."
                                }
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                # WHOLE DOCUMENT: ALL COLUMNS GO INTO ONE OUTPUT COLUMN, AS CHILDREN
                for c in columns:
                    if c.type not in STRUCT and c.es_column != "_id":
                        if len(c.nested_path) == 1:
                            es_query.fields += [c.es_column]
                        new_select.append({
                            "name": c.name,
                            "pull": get_pull(c),
                            "nested_path": c.nested_path[0],
                            "put": {"name": ".", "index": i, "child": c.es_column}
                        })
                i += 1
            elif s.value.var == "_id":
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                prefix = schema[s.value.var][0]
                if not prefix:
                    net_columns = []
                else:
                    parent = prefix.es_column+"."
                    prefix_length = len(parent)
                    net_columns = [c for c in columns if c.es_column.startswith(parent) and c.type not in STRUCT]

                if not net_columns:
                    # NO CHILD COLUMNS: SELECT THE COLUMN ITSELF
                    # NOTE(review): this branch is also reached when prefix is falsy - confirm get_pull() copes with that
                    pull = get_pull(prefix)
                    if len(prefix.nested_path) == 1:
                        es_query.fields += [prefix.es_column]
                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": prefix.nested_path[0],
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    done = set()
                    for n in net_columns:
                        # THE COLUMNS CAN HAVE DUPLICATE REFERENCES TO THE SAME ES_COLUMN
                        if n.es_column in done:
                            continue
                        done.add(n.es_column)

                        pull = get_pull(n)
                        if len(n.nested_path) == 1:
                            es_query.fields += [n.es_column]
                        new_select.append({
                            "name": s.name,
                            "pull": pull,
                            "nested_path": n.nested_path[0],
                            "put": {"name": s.name, "index": i, "child": n.es_column[prefix_length:]}
                        })
                i += 1
        else:
            expr = s.value
            for v in expr.vars():
                for c in schema[v]:
                    if c.nested_path[0] == ".":
                        es_query.fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            # THE EXPRESSION IS COMPILED HERE AND EVALUATED PER HIT IN inners()
            pull = EXPRESSION_PREFIX + s.name
            post_expressions[pull] = compile_expression(expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(es09.util.post(
            es,
            Data(
                filter=more_filter,
                fields=es_query.fields
            ),
            query.limit
        ))
    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            # WAIT FOR THE SECOND REQUEST BEFORE READING ITS RESULT
            # NOTE(review): more[0] assumes get_more() appended a result - confirm what happens when it fails
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    #</COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
output = wrap({k.replace("-", "_"): v for k, v in boto_utils.get_instance_metadata(timeout=coalesce(timeout, 5), num_retries=2).items()}) return output def aws_retry(func): def output(*args, **kwargs): while True: try: return func(*args, **kwargs) except Exception as e: e = Except.wrap(e) if "Request limit exceeded" in e: Log.warning("AWS Problem", cause=e) continue else: Log.error("Problem with call to AWS", cause=e) return output # GET FROM AWS, IF WE CAN def _get_metadata_from_from_aws(please_stop): with suppress_exception: ec2 = get_instance_metadata() if ec2: machine_metadata.aws_instance_type = ec2.instance_type machine_metadata.name = ec2.instance_id Thread.run("get aws machine metadata", _get_metadata_from_from_aws) from . import s3