def extract_alert_settings(env_setup):
    """
    Read the alert-extraction configuration and activate it for this process.

    The database `ssl` setting is cleared, the configured constants are
    applied, and logging is started before the settings are returned.

    :param env_setup: not used in the body
        (NOTE(review): presumably a fixture that must run first - confirm)
    :return: the settings read from `extract_alerts.CONFIG_FILE`
    """
    config = startup.read_settings(
        filename=extract_alerts.CONFIG_FILE,
        complain=False,
    )
    # NOT REQUIRED FOR TEST DATABASE
    config.source.database.ssl = None
    constants.set(config.constants)
    Log.start(config.debug)
    return config
def main():
    """
    Run the extraction workers until a shutdown signal arrives.

    Starts `settings.extract.threads` worker threads, each of which drains
    `extractor.queue` inside a single database transaction.  A failed item
    is logged and put back on the queue.
    """
    try:
        settings = startup.read_settings()
        with startup.SingleInstance(settings.args.filename):
            constants.set(settings.constants)
            Log.start(settings.debug)

            extractor = Extract(settings)

            def extract(please_stop):
                # ONE CONNECTION, AND ONE TRANSACTION, PER WORKER
                with MySQL(**settings.snowflake.database) as db, db.transaction():
                    for job in extractor.queue:
                        if please_stop:
                            break
                        try:
                            extractor.extract(db=db, please_stop=please_stop, **job)
                        except Exception as cause:
                            Log.warning("Could not extract", cause=cause)
                            # RETRY LATER
                            extractor.queue.add(job)

            for worker_num in range(settings.extract.threads):
                thread_name = "extract #" + text_type(worker_num)
                Thread.run(thread_name, extract)

            please_stop = Signal()
            Thread.wait_for_shutdown_signal(
                please_stop=please_stop,
                allow_exit=True,
                wait_forever=False,
            )
    except Exception as problem:
        Log.warning("Problem with data extraction", problem)
    finally:
        Log.stop()
def main():
    """
    Configure and run the spot manager as a single instance.

    Each utility gets a default discount of 0, and every drive without a
    device is assigned "/dev/xvd<letter>", with letters starting after the
    instance type's ephemeral volumes.  Logging and the main thread are
    always stopped on the way out.
    """
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        with SingleInstance(flavor_id=settings.args.filename):
            settings.run_interval = Duration(settings.run_interval)

            for utility in settings.utility:
                utility.discount = coalesce(utility.discount, 0)

                # MARKUP drives WITH EXPECTED device MAPPING
                num_ephemeral_volumes = ephemeral_storage[utility.instance_type]["num"]
                unmapped = (drive for drive in utility.drives if not drive.device)
                # 98 IS ASCII "b"; LETTERS CONTINUE AFTER THE EPHEMERAL VOLUMES
                for code, drive in enumerate(unmapped, start=98 + num_ephemeral_volumes):
                    drive.device = "/dev/xvd" + convert.ascii2char(code)

            settings.utility = UniqueIndex(["instance_type"], data=settings.utility)
            instance_manager = new_instance(settings.instance)
            manager = SpotManager(instance_manager, kwargs=settings)

            if ENABLE_SIDE_EFFECTS:
                manager.update_spot_requests()

            if manager.watcher:
                manager.watcher.join()
    except Exception as cause:
        Log.warning("Problem with spot manager", cause=cause)
    finally:
        Log.stop()
        MAIN_THREAD.stop()
def setUpClass(cls):
    """
    Load "tests/config.json" onto `cls.config`, apply its constants and
    start logging.  Any failure is reported through `Log.error`.
    """
    try:
        cls.config = startup.read_settings(filename="tests/config.json")
        test_config = cls.config
        constants.set(test_config.constants)
        Log.start(test_config.debug)
    except Exception as problem:
        Log.error("Problem with etl", problem)
def main():
    """
    Command-line entry point for deployment.

    Adds two command-line options to the settings reader:
      --all / -a               process all mo-* subdirectories
      --dir / --directory / -d directory to deploy

    With `--all`, `deploy_all` is called on the directory; otherwise a
    single `Deploy` is run.  Any failure is logged as a warning.
    """
    try:
        settings = startup.read_settings(
            defs=[
                {
                    "name": ["--all", "-a"],
                    "action": 'store_true',
                    "help": 'process all mo-* subdirectories',
                    "dest": "all",
                    "required": False
                },
                {
                    # NOTE(review): "required" together with a "default" looks
                    # contradictory - confirm which one is intended
                    "name": ["--dir", "--directory", "-d"],
                    "help": 'directory to deploy',
                    "type": str,
                    "dest": "directory",
                    "required": True,
                    "default": "."
                }
            ]
        )
        constants.set(settings.constants)
        Log.start(settings.debug)

        if settings.args.all:
            deploy_all(File(settings.args.directory), settings.prefix, settings)
        else:
            Deploy(File(settings.args.directory), kwargs=settings).deploy()
    except Exception as e:  # `except Exception, e` IS A SyntaxError ON PYTHON 3
        Log.warning("Problem with etl", cause=e)
def main():
    """
    Configure and run the spot manager as a single instance.

    Logging starts before the single-instance lock is taken; constants are
    applied inside it.  Each utility gets a default discount of 0, and every
    drive without a device is assigned "/dev/xvd<letter>", with letters
    starting after the instance type's ephemeral volumes.  Spot requests are
    updated with the utility the instance manager says is required.
    """
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)

        with SingleInstance(flavor_id=settings.args.filename):
            constants.set(settings.constants)
            settings.run_interval = Duration(settings.run_interval)

            for utility in settings.utility:
                utility.discount = coalesce(utility.discount, 0)

                # MARKUP drives WITH EXPECTED device MAPPING
                num_ephemeral_volumes = ephemeral_storage[utility.instance_type]["num"]
                unmapped = (drive for drive in utility.drives if not drive.device)
                # 98 IS ASCII "b"; LETTERS CONTINUE AFTER THE EPHEMERAL VOLUMES
                for code, drive in enumerate(unmapped, start=98 + num_ephemeral_volumes):
                    drive.device = "/dev/xvd" + convert.ascii2char(code)

            settings.utility = UniqueIndex(["instance_type"], data=settings.utility)
            instance_manager = new_instance(settings.instance)
            manager = SpotManager(instance_manager, kwargs=settings)

            if ENABLE_SIDE_EFFECTS:
                required = instance_manager.required_utility()
                manager.update_spot_requests(required)

            if manager.watcher:
                manager.watcher.join()
    except Exception as cause:
        Log.warning("Problem with spot manager", cause=cause)
    finally:
        Log.stop()
        MAIN_THREAD.stop()
def main():
    """
    Read settings, start logging, then block until a shutdown signal.

    Any failure is re-raised through `Log.error` with the original
    exception attached as the cause.
    """
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        please_stop = Signal("main stop signal")
        Thread.wait_for_shutdown_signal(please_stop)
    except Exception as e:  # `except Exception, e` IS A SyntaxError ON PYTHON 3
        Log.error("Problem with etl", cause=e)
def setup(settings=None):
    """
    Build and configure the ActiveData Flask application.

    Reads the configuration into the module-level `config`, applies the
    optional `--process_num` port offset, wires request logging to
    Elasticsearch, installs the default query container, optionally
    exposes `/exit`, and sets up SSL when an ssl context is configured.

    :param settings: filename of the configuration to read (passed to
        `startup.read_settings` as `filename`)
    :return: the configured `app`
    :raises: via `Log.error` when any step of construction fails
    """
    global config

    try:
        config = startup.read_settings(
            defs={
                "name": ["--process_num", "--process"],
                "help": "Additional port offset (for multiple Flask processes",
                "type": int,
                "dest": "process_num",
                "default": 0,
                "required": False
            },
            filename=settings
        )

        constants.set(config.constants)
        Log.start(config.debug)

        # EACH EXTRA PROCESS LISTENS ON ITS OWN PORT
        if config.args.process_num and config.flask.port:
            config.flask.port += config.args.process_num

        # PIPE REQUEST LOGS TO ES DEBUG
        if config.request_logs:
            request_logger = elasticsearch.Cluster(
                config.request_logs).get_or_create_index(config.request_logs)
            active_data.request_log_queue = request_logger.threaded_queue(
                max_size=2000)

        # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
        containers.config.default = {
            "type": "elasticsearch",
            "settings": config.elasticsearch.copy()
        }

        # TURN ON /exit FOR WINDOWS DEBUGGING
        if config.flask.debug or config.flask.allow_exit:
            config.flask.allow_exit = None
            Log.warning("ActiveData is in debug mode")
            app.add_url_rule('/exit', 'exit', _exit)

        # TRIGGER FIRST INSTANCE
        FromESMetadata(config.elasticsearch)
        if config.saved_queries:
            setattr(save_query, "query_finder", SaveQueries(config.saved_queries))
        HeaderRewriterFix(app, remove_headers=['Date', 'Server'])

        if config.flask.ssl_context:
            if config.args.process_num:
                Log.error(
                    "can not serve ssl and multiple Flask instances at once")
            setup_ssl()

        return app
    except Exception as e:  # `except Exception, e` IS A SyntaxError ON PYTHON 3
        Log.error(
            "Serious problem with ActiveData service construction! Shutdown!",
            cause=e)
def main():
    """
    Read settings, start logging and run `ETL(...).setup` with the
    configured instance and utility.  Failures are logged as warnings;
    logging is always stopped.
    """
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        etl = ETL(config)
        etl.setup(config.instance, config.utility)
    except Exception as cause:
        Log.warning("Problem with setup of ETL", cause=cause)
    finally:
        Log.stop()
def start():
    """
    Worker entry point: read one line of JSON configuration from STDIN,
    start logging (falling back to a single "raw" log when `config.debug`
    does not specify one) and enter the command loop.
    """
    try:
        first_line = STDIN.readline()
        config = json2value(first_line.decode('utf8'))
        constants.set(config.constants)

        log_settings = set_default(config.debug, {"logs": [{"type": "raw"}]})
        Log.start(log_settings)

        command_loop({"config": config})
    except Exception as cause:
        Log.error("problem staring worker", cause=cause)
    finally:
        Log.stop()
def main():
    """
    Read settings, start logging and run `ETL(...).setup` with the
    configured instance and utility.  Failures are logged as warnings;
    logging is always stopped.
    """
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)

        etl = ETL(config)
        etl.setup(config.instance, config.utility)
    except Exception as cause:
        Log.warning("Problem with setup of ETL", cause=cause)
    finally:
        Log.stop()
def setUpClass(cls):
    """
    Prepare the module-level `config` and `broker` used by the tests.

    Reads "tests/config/file.json", starts logging, deletes the broker's
    backing directory and then constructs a new `Broker`.
    """
    global config, broker

    try:
        config = startup.read_settings(filename="tests/config/file.json")
        constants.set(config.constants)
        Log.start(config.debug)

        # REMOVE THE BACKING DIRECTORY BEFORE CONSTRUCTING THE BROKER
        backing = File(config.broker.backing.directory)
        backing.delete()
        broker = Broker(kwargs=config.broker)
    except Exception as cause:
        Log.error("could not setup for testing", cause=cause)
def main():
    """
    Initialise the module-level `config` and `hg`, then start logging.

    Note that `HgMozillaOrg` is constructed before `Log.start` is called.
    """
    global config, hg

    try:
        config = startup.read_settings()
        constants.set(config.constants)
        hg = HgMozillaOrg(config)
        Log.start(config.debug)
    except Exception as problem:
        Log.error("Problem with etl", problem)
def main():
    """
    Read settings, start logging and run `_synch`.

    Reading the settings and starting the log happen outside the `try`,
    so failures there are not routed through `Log.error`.
    """
    config = startup.read_settings()
    Log.start(config.debug)
    constants.set(config.constants)

    try:
        _synch(config)
    except Exception as problem:
        Log.error("Problem with synch", problem)
    finally:
        Log.stop()
def run(self, force=False, restart=False, start=None, merge=False):
    """
    Read the configuration from CONFIG_FILE, start logging and delegate
    to `self.extract`.

    :param force: forwarded unchanged to `self.extract`
    :param restart: forwarded unchanged to `self.extract`
    :param start: forwarded unchanged to `self.extract`
    :param merge: forwarded unchanged to `self.extract`
    :raises: via `Log.error` when extraction fails; logging is always stopped
    """
    try:
        # SETUP LOGGING
        config = startup.read_settings(filename=CONFIG_FILE)
        constants.set(config.constants)
        Log.start(config.debug)

        self.extract(config, force, restart, start, merge)
    except Exception as cause:
        Log.error("could not extract jobs", cause=cause)
    finally:
        Log.stop()
def start():
    """
    Worker entry point: read one line of JSON configuration from STDIN,
    start logging, install a `RawLogger` and enter the command loop.
    """
    try:
        config = json2value(STDIN.readline().decode("utf8"))
        constants.set(config.constants)

        Log.start(config.debug)
        raw_logger = RawLogger()
        Log.set_logger(raw_logger)

        command_loop({"config": config})
    except Exception as cause:
        Log.error("problem staring worker", cause=cause)
    finally:
        Log.stop()
def main():
    """
    Initialise the module-level `config` and `hg`, then run `_parse_diff`
    on one hard-coded mozilla-central changeset.  The result is not used.
    """
    global config, hg

    try:
        config = startup.read_settings()
        constants.set(config.constants)
        Log.start(config.debug)
        hg = HgMozillaOrg(config)

        sample = Data(
            changeset={"id": "2d9d0bebb5c6"},
            branch={"url": "https://hg.mozilla.org/mozilla-central"},
        )
        _parse_diff(sample)
    except Exception as problem:
        Log.error("Problem with etl", problem)
def setUpClass(cls):
    """
    Start logging and rebuild the `testing` database.

    The database is dropped (a "Can't drop database " failure is ignored,
    any other failure is logged as a warning) and then recreated from
    "tests/resources/database.sql".
    """
    Log.start(settings.debug)

    with Timer("setup database"):
        try:
            with MySQL(schema=None, kwargs=settings.database) as db:
                db.query("drop database testing")
        except Exception as cause:
            # A FAILED DROP IS IGNORED; ANYTHING ELSE IS REPORTED
            if "Can't drop database " not in cause:
                Log.warning("problem removing db", cause=cause)

        MySQL.execute_file(
            "tests/resources/database.sql",
            schema=None,
            kwargs=settings.database,
        )
def main():
    """
    Run `move_shards` against a hard-coded Elasticsearch cluster.

    The settings are built inline rather than read from a file; the same
    object is passed to `Log.start` and to `move_shards`.

    :raises: via `Log.error`, with the original exception as the cause
    """
    try:
        settings = wrap({
            "elasticsearch": {
                "host": "http://activedata.allizom.org",
                "port": 9200,
                "debug": True
            }
        })
        # NOTE(review): the whole settings object (not a `debug` section) is
        # given to Log.start - confirm this is intended
        Log.start(settings)
        move_shards(settings)
    except Exception as e:  # `except Exception, e` IS A SyntaxError ON PYTHON 3
        Log.error("Problem with assign of shards", cause=e)
def main():
    """
    Fetch the branch list from hg and write it to the Elasticsearch index
    described by `settings.hg.branches`.

    Each branch is stored under the id "<name> <locale>".  Logging is
    always stopped on the way out.
    """
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = _get_branches_from_hg(settings.hg)

        cluster = elasticsearch.Cluster(kwargs=settings.hg.branches)
        index = cluster.get_or_create_index(kwargs=settings.hg.branches)
        index.add_alias()
        index.extend(
            {"id": branch.name + " " + branch.locale, "value": branch}
            for branch in branches
        )
        Log.alert("DONE!")
    except Exception as problem:
        Log.error("Problem with etl", problem)
    finally:
        Log.stop()
def main():
    """
    Configure ADR and logging, then process the schedulers.

    Steps, in order: read settings, inject secrets, patch an `update()`
    method onto ADR's `Configuration` class, apply `config.adr` through
    it, start logging, redirect loguru output to `_logging`, and run
    `Schedulers(config).process()`.
    """
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        inject_secrets(config)

        with Timer("PATCH ADR: dd update() method to Configuration class"):

            def update(self, config):
                """
                Update the configuration object with new parameters
                :param config: dict of configuration
                """
                for key, value in config.items():
                    # `!= None` (NOT `is not None`) IS KEPT AS WRITTEN
                    if value != None:  # noqa: E711
                        self._config[key] = value

                unique_sources = set(self._config["sources"])
                self._config["sources"] = sorted(
                    os.path.expanduser(source) for source in unique_sources
                )

                # Use the NullStore by default. This allows us to control whether
                # caching is enabled or not at runtime.
                cache_config = self._config["cache"]
                cache_config.setdefault("stores", {"null": {"driver": "null"}})
                object.__setattr__(self, "cache", CacheManager(self._config["cache"]))
                self.cache.extend("null", lambda driver: NullStore())

            Configuration.update = update

        # UPDATE ADR CONFIGURATION
        adr.config.update(config.adr)

        Log.start(config.debug)

        # SHUNT ADR LOGGING TO MAIN LOGGING
        # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.add
        loguru.logger.remove()
        loguru.logger.add(
            _logging,
            level="DEBUG",
            format="{message}",
            filter=lambda r: True,
        )

        Schedulers(config).process()
    except Exception as cause:
        Log.warning("Problem with etl! Shutting down.", cause=cause)
    finally:
        Log.stop()
def main():
    """
    Fetch the branch list from hg and write it to the Elasticsearch index
    described by `settings.hg.branches`.

    Each branch is stored under the id "<name> <locale>".  Logging is
    always stopped on the way out.
    """
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = _get_branches_from_hg(settings.hg)

        cluster = elasticsearch.Cluster(kwargs=settings.hg.branches)
        index = cluster.get_or_create_index(kwargs=settings.hg.branches)
        index.add_alias()
        index.extend(
            {"id": branch.name + " " + branch.locale, "value": branch}
            for branch in branches
        )
        Log.alert("DONE!")
    except Exception as problem:
        Log.error("Problem with etl", problem)
    finally:
        Log.stop()
def extract_job_settings():
    """
    Prepare the environment for job extraction in tests and return the
    settings read from `extract_jobs.CONFIG_FILE`.

    Sets placeholder environment variables, rewrites DATABASE_URL to use
    the test database name, clears the database `ssl` setting, applies
    constants and starts logging.
    """
    # These values not directly accessed during testing, but the code requires that they be present.
    os.environ.update({
        "NEW_RELIC_APP_NAME": "testing",
        "BIGQUERY_PRIVATE_KEY_ID": "1",
        "BIGQUERY_PRIVATE_KEY": "1",
    })

    # USE THE TEST SCHEMA
    original_url = os.environ["DATABASE_URL"]
    current_name = strings.between(original_url, "/", None)
    test_name = DATABASES["default"]["TEST"]["NAME"]
    os.environ["DATABASE_URL"] = original_url.replace(current_name, test_name)

    config = startup.read_settings(
        filename=extract_jobs.CONFIG_FILE,
        complain=False,
    )
    # NOT REQUIRED FOR TEST DATABASE
    config.source.database.ssl = None
    constants.set(config.constants)
    Log.start(config.debug)
    return config
def setup():
    """
    Configure the ActiveData Flask application from the file named by the
    ACTIVEDATA_CONFIG environment variable.

    Populates the module-level `config`, starts logging, optionally wires
    request logging and dockerflow health checks, installs the default
    query container and the saved-query finder, and wraps `flask_app`
    with `HeaderRewriterFix`.
    """
    global config

    process_num_option = {
        "name": ["--process_num", "--process"],
        "help": "Additional port offset (for multiple Flask processes",
        "type": int,
        "dest": "process_num",
        "default": 0,
        "required": False
    }
    config = startup.read_settings(
        filename=os.environ.get('ACTIVEDATA_CONFIG'),
        defs=[process_num_option]
    )

    constants.set(config.constants)
    Log.start(config.debug)

    # PIPE REQUEST LOGS TO ES DEBUG
    if config.request_logs:
        cluster = elasticsearch.Cluster(config.request_logs)
        log_index = cluster.get_or_create_index(config.request_logs)
        active_data.request_log_queue = log_index.threaded_queue(max_size=2000)

    if config.dockerflow:
        def backend_check():
            es = config.elasticsearch
            http.get_json(es.host + ":" + text_type(es.port))

        dockerflow(flask_app, backend_check)

    # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
    container.config.default = {
        "type": "elasticsearch",
        "settings": config.elasticsearch.copy()
    }

    # TRIGGER FIRST INSTANCE
    if config.saved_queries:
        save_query.query_finder = SaveQueries(config.saved_queries)

    HeaderRewriterFix(flask_app, remove_headers=['Date', 'Server'])
def main(num):
    """
    Run `test_json` once per JSON encoder and log a tab-separated summary.

    :param num: forwarded to every `test_json` call
    """
    try:
        Log.start()

        timings = []
        test_json(timings, "mo-json encoder", json_encoder, num)
        test_json(timings, "mo-json encoder (again)", json_encoder, num)
        test_json(timings, "scrub before json.dumps", cPythonJSONEncoder().encode, num)
        test_json(timings, "override JSONEncoder.default()", EnhancedJSONEncoder().encode, num)
        # WILL CRASH, CAN NOT HANDLE DIVERSITY OF TYPES
        test_json(timings, "default json.dumps", json.dumps, num)
        test_json(timings, "typed json", typed_encoder.encode, num)
        # ujson IS NOT INCLUDED: THE ORIGINAL AUTHOR NOTED IT PLAIN CRASHES

        summary_table = convert.list2tab(timings)
        Log.note(u"\n{{summary}}", summary=summary_table)
    finally:
        Log.stop()
def main():
    """
    Back up the saved-queries index to a file.

    Adds a required `--file` command-line option naming the backup file,
    queries up to 200000 documents from the index described by
    `config.saved_queries`, and writes them to that file.

    :raises: via `Log.error`, with the original exception as the cause
    """
    try:
        config = startup.read_settings(defs=[{
            "name": ["--file"],
            "help": "file to save backup",
            "type": str,
            "dest": "file",
            "required": True
        }])
        constants.set(config.constants)
        Log.start(config.debug)

        sq = elasticsearch.Index(kwargs=config.saved_queries)
        result = sq.search({"query": {"match_all": {}}, "size": 200000})

        # NOTE(review): json2value parses JSON text, yet its results are joined
        # as strings here; a serializer may have been intended - confirm
        File(config.args.file).write("".join(
            map(convert.json2value, result.hits.hits)))
    except Exception as e:  # `except Exception, e` IS A SyntaxError ON PYTHON 3
        Log.error("Problem with etl", cause=e)
def main():
    """
    Walk revision history from a fixed changeset, looking for the revision
    with the lowest changes-per-file score.

    Only revisions touching more than MIN_FILES files are scored; every
    new minimum is logged.  Parents of each visited revision are queued.
    """
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        hg = HgMozillaOrg(settings)
        todo = Queue()
        todo.add("97160a734959")
        best_score = 100000

        while todo:
            revision_id = todo.pop()
            lookup = wrap({
                "changeset": {"id": revision_id},
                "branch": {"name": BRANCH},
            })
            curr = hg.get_revision(lookup)

            if len(curr.changeset.files) > MIN_FILES:
                diff = hg._get_json_diff_from_hg(curr)
                num_changes = sum(len(d.changes) for d in diff)
                num_files = len(diff)
                score = num_changes / num_files
                if score < best_score:
                    best_score = score
                    Log.note(
                        "smallest = {{rev}}, num_lines={{num}}, num_files={{files}}",
                        rev=curr.changeset.id,
                        num=num_changes,
                        files=len(diff),
                    )

            todo.extend(listwrap(curr.parents))
    except Exception as problem:
        Log.error("Problem with scna", problem)
    finally:
        Log.stop()
now = Date.now().unix if time_offset is None: time_offset = now - request.meta.request_time next_request = request.meta.request_time + time_offset if next_request > now: Log.note("Next request in {{wait_time}}", wait_time=Duration(seconds=next_request - now)) Till(till=next_request).wait() Thread.run("request " + text_type(request_count), one_request, request) request_count += 1 queue.commit() if __name__ == '__main__': try: tmp_signal = Signal() config = startup.read_settings() constants.set(config.constants) Log.start(config.debug) queue_consumer(kwargs=config, please_stop=tmp_signal) worker = Thread.run("sqs consumer", queue_consumer, kwargs=config) MAIN_THREAD.wait_for_shutdown_signal(allow_exit=True, please_stop=worker.stopped) except BaseException as e: Log.error("Serious problem with consumer construction! Shutdown!", cause=e)
STDOUT.write(value2json({"out": local['_return']})) STDOUT.write('\n') except Exception as e: STDOUT.write(value2json({"err": e})) STDOUT.write('\n') finally: STDOUT.flush() num_temps = 0 def temp_var(): global num_temps try: return "temp_var" + text_type(num_temps) finally: num_temps += 1 if __name__ == "__main__": try: config = json2value(sys.stdin.readline().decode('utf8')) constants.set(config.constants) Log.start(set_default(config.debug, {"logs": [{"type": "raw"}]})) command_loop({"config": config}) except Exception as e: Log.error("problem staring worker", cause=e) finally: Log.stop()
def setUpClass(cls):
    """Start logging with `trace` turned off."""
    log_settings = {"trace": False}
    Log.start(log_settings)
from mo_dots import to_data, from_data, listwrap, is_many
from mo_files import mimetype
from mo_future import first
from mo_http import http
from mo_logs import Log
from mo_math import is_nan
from mo_times import Date, YEAR, WEEK, MONTH
from pandas import DataFrame

from utils import nice_ceiling

# PROVINCE TO REPORT ON; ALTERNATIVES THE AUTHOR LEFT FOR REFERENCE:
#   7  = Ontario
#   10 = Alberta
PROVINCE = 11  # British Columbia

# VERBOSE LOGGING AND HTTP DEBUGGING ARE SWITCHED ON AT IMPORT TIME
Log.start(trace=True)
http.DEBUG = True
http.default_headers = to_data(
    {
        "From": "*****@*****.**",
        "Referer": "https://github.com/klahnakoski/mo-statcan",
        "User-Agent": "mo-statscan",
        "Accept": mimetype.ANY,
    }
)

# LESS DETAILED CAUSES
# https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310039401
CAUSE_OF_DEATH = 13_10_0394

# DETAILED CAUSES
Log.note("Skipping try revision.") queue.commit() continue now = Date.now().unix if time_offset is None: time_offset = now - request.meta.request_time next_request = request.meta.request_time + time_offset if next_request > now: Log.note("Next request in {{wait_time}}", wait_time=Duration(seconds=next_request - now)) Till(till=next_request).wait() Thread.run("request "+text_type(request_count), one_request, request) request_count += 1 queue.commit() if __name__ == '__main__': try: tmp_signal = Signal() config = startup.read_settings() constants.set(config.constants) Log.start(config.debug) queue_consumer(kwargs=config, please_stop=tmp_signal) worker = Thread.run("sqs consumer", queue_consumer, kwargs=config) MAIN_THREAD.wait_for_shutdown_signal(allow_exit=True, please_stop=worker.stopped) except BaseException as e: Log.error("Serious problem with consumer construction! Shutdown!", cause=e)
def setUp(self):
    """Start logging with `trace` turned on before each test."""
    log_settings = {"trace": True}
    Log.start(log_settings)
def setUpClass(cls):
    """Start logging with `trace` on and `cprofile` off."""
    log_settings = {"trace": True, "cprofile": False}
    Log.start(log_settings)
def run(self, force=False, restart=False, merge=False):
    """
    Extract jobs from the source database into the BigQuery destination,
    in chunks, resuming from the position recorded in Redis.

    :param force: when the extraction SQL differs from the previous run,
        only warn instead of raising
    :param restart: ignore the position stored in Redis and begin at (0, 0)
    :param merge: merge destination shards before extracting
    """
    # SETUP LOGGING
    settings = startup.read_settings(filename=CONFIG_FILE)
    constants.set(settings.constants)
    Log.start(settings.debug)

    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    dataset = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    )
    destination = dataset.get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis()
        state = redis.get(settings.extractor.key)

        if state and not restart:
            state = json2value(state.decode("utf8"))
        else:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))

        last_modified, job_id = state

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            report = Log.warning if force else Log.error
            report("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                last_modified=last_modified,
                job_id=job_id,
            )

            # SELECT THE NEXT CHUNK OF JOB IDS, STRICTLY AFTER THE SAVED
            # (last_modified, id) POSITION, IN (last_modified, id) ORDER
            since = parse(last_modified).datetime
            newer = Q(last_modified__gt=since)
            same_time_later_id = Q(last_modified=since) & Q(id__gt=job_id)
            next_jobs = (
                Job.objects.filter(newer | same_time_later_id)
                .annotate()
                .values("id")
                .order_by("last_modified", "id")[: settings.extractor.chunk_size]
            )
            get_ids = SQL(str(next_jobs.query))

            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            docs = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, docs.append, False)
            if not docs:
                break
            destination.extend(docs)

            # RECORD THE STATE
            newest = docs[-1]
            last_modified, job_id = newest.last_modified, newest.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, job_id)).encode("utf8"),
            )

            # A SHORT CHUNK MEANS THERE IS NOTHING LEFT TO EXTRACT
            if len(docs) < settings.extractor.chunk_size:
                break

    except Exception as cause:
        Log.warning("problem with extraction", cause=cause)

    Log.note("done job extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as cause:
        Log.warning("problem with merge", cause=cause)

    Log.note("done job merge")