def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    kwargs=None,
):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX
    """
    kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
    kwargs.host = Random.sample(listwrap(host), 1)[0]

    schema = json2value(value2json(SCHEMA), leaves=True)
    schema.mappings[type].properties["~N~"].type = "nested"
    self.es = Cluster(kwargs).get_or_create_index(
        schema=schema,
        limit_replicas=True,
        typed=True,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
    self.queue = Queue("debug logs to es", max=queue_size, silent=True)

    self.worker = Thread.run("add debug logs to es", self._insert_loop)
def fix(source_key, rownum, line, source, sample_only_filter, sample_size):
    """
    :param source_key:
    :param rownum:
    :param line:
    :param source:
    :param sample_only_filter:
    :param sample_size:
    :return: (row, no_more_data) TUPLE WHERE row IS {"value":<data structure>} OR {"json":<text line>}
    """
    value = json2value(line)

    if rownum == 0:
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(source_key, value, source)
        value = _fix(value)
        if sample_only_filter and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            row = {"value": value}
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        _shorten(source_key, value, source)
        value = _fix(value)
    elif '"resource_usage":' in line:
        value = _fix(value)

    row = {"value": value}
    return row, False
def output(*args):
    with cache_store.locker:
        if using_self:
            self = args[0]
            args = args[1:]
        else:
            self = cache_store

        now = Date.now()
        try:
            _cache = getattr(self, attr_name)
        except Exception:
            _cache = {}
            setattr(self, attr_name, _cache)

        if Random.int(100) == 0:
            # REMOVE OLD CACHE
            _cache = {k: v for k, v in _cache.items() if v[0] == None or v[0] > now}
            setattr(self, attr_name, _cache)

        timeout, key, value, exception = _cache.get(args, (Null, Null, Null, Null))
def _send_email(self):
    try:
        if not self.accumulation:
            return
        with Emailer(self.settings) as emailer:
            # WHO ARE WE SENDING TO
            emails = Data()
            for template, params in self.accumulation:
                content = expand_template(template, params)
                emails[literal_field(self.settings.to_address)] += [content]
                for c in self.cc:
                    if any(d in params.params.error for d in c.contains):
                        emails[literal_field(c.to_address)] += [content]

            # SEND TO EACH
            for to_address, content in emails.items():
                emailer.send_email(
                    from_address=self.settings.from_address,
                    to_address=listwrap(to_address),
                    subject=self.settings.subject,
                    text_data="\n\n".join(content),
                )

        self.accumulation = []
    except Exception as e:
        Log.warning("Could not send", e)
    finally:
        self.next_send = Date.now() + self.settings.average_interval * (2 * Random.float())
def commit(self):
    with self.lock:
        if self.closed:
            Log.error("Queue is closed, commit not allowed")

        try:
            self._add_pending({"add": {"status.start": self.start}})
            for i in range(self.db.status.start, self.start):
                self._add_pending({"remove": str(i)})

            if self.db.status.end - self.start < 10 or Random.range(0, 1000) == 0:  # FORCE RE-WRITE TO LIMIT FILE SIZE
                # SIMPLY RE-WRITE FILE
                if DEBUG:
                    Log.note("Re-write {{num_keys}} keys to persistent queue", num_keys=self.db.status.end - self.start)
                    for k in self.db.keys():
                        if k == "status" or int(k) >= self.db.status.start:
                            continue
                        Log.error("Not expecting {{key}}", key=k)
                self._commit()
                self.file.write(mo_json.value2json({"add": self.db}) + "\n")
            else:
                self._commit()
        except Exception as e:
            raise e
def _create_new_shard(self):
    primary_shard = self.container.create_table(
        table=self.short_name + "_" + "".join(Random.sample(ALLOWED, 20)),
        sharded=False,
        schema=self._flake.schema,
        kwargs=self.config,
    )
    self.shard = primary_shard.shard
def put(self, local, remote, use_sudo=False):
    if use_sudo:
        filename = "/tmp/" + Random.string(20, string.digits + 'ABCDEF')
        self.conn.put(File(local).abspath, filename)
        self.sudo("cp " + filename + " " + remote)
        self.sudo("rm " + filename)
    else:
        self.conn.put(File(local).abspath, remote)
def test_wrap_2():
    Log.alert("Random types")
    switch = [
        lambda: Random.int(20),
        lambda: Random.string(20),
        lambda: {"i": Random.int(2000)},
        lambda: Data(i=Random.int(2000)),
        lambda: FlatList([{"i": Random.int(2000)}]),
        lambda: [{"i": Random.int(2000)}]
    ]

    inputs = [switch[min(len(switch) - 1, int(floor(-log(Random.float(), 2))))]() for i in range(NUM_INPUT)]

    for i in range(NUM_REPEAT):
        results = []
        gc.collect()
        with Timer("more string: to_data"):
            for v in inputs:
                results.append(to_data(v))

        results = []
        gc.collect()
        with Timer("more string: baseline"):
            for v in inputs:
                results.append(baseline(v))

        Log.note("Done {{i}} of {{num}}", i=i, num=NUM_REPEAT)
def output(*args, **kwargs):
    if kwargs:
        Log.error("Sorry, caching only works with ordered parameter, not keyword arguments")

    with cache_store.locker:
        if using_self:
            self = args[0]
            args = args[1:]
        else:
            self = cache_store

        now = Date.now()
        try:
            _cache = getattr(self, attr_name)
        except Exception:
            _cache = {}
            setattr(self, attr_name, _cache)

        if Random.int(100) == 0:
            # REMOVE OLD CACHE
            _cache = {k: v for k, v in _cache.items() if v.timeout == None or v.timeout > now}
            setattr(self, attr_name, _cache)

        timeout, key, value, exception = _cache.get(args, (Null, Null, Null, Null))

    if now >= timeout:
        value = func(self, *args)
        with cache_store.locker:
            _cache[args] = CacheElement(now + cache_store.timeout, args, value, None)
        return value

    if value == None:
        if exception == None:
            try:
                value = func(self, *args)
                with cache_store.locker:
                    _cache[args] = CacheElement(now + cache_store.timeout, args, value, None)
                return value
            except Exception as e:
                e = Except.wrap(e)
                with cache_store.locker:
                    _cache[args] = CacheElement(now + cache_store.timeout, args, None, e)
                raise e
        else:
            raise exception
    else:
        return value
def __enter__(self):
    if self.sample and Random.int(100) == 0:
        _Log.warning("acquire lock {{name|quote}}", name=self.name)

    self.debug and _Log.note("acquire lock {{name|quote}}", name=self.name)
    self.lock.acquire()
    self.debug and _Log.note("acquired lock {{name|quote}}", name=self.name)
    return self
def _daemon(self, please_stop):
    while not please_stop:
        with Explanation("looking for work"):
            try:
                branch, revisions, after = self.todo.pop(till=please_stop)
            except Exception as e:
                if please_stop:
                    break
                else:
                    raise e
            if branch.name in DAEMON_DO_NO_SCAN:
                continue
            revisions = set(revisions)

            # FIND THE REVISIONS ON THIS BRANCH
            for r in list(revisions):
                try:
                    rev = self.get_revision(
                        Revision(branch=branch, changeset={"id": r}),
                        None,   # local
                        False,  # get_diff
                        True,   # get_moves
                    )
                    if after and after > rev.etl.timestamp:
                        rev = self._get_from_hg(revision=rev)

                    if DAEMON_DEBUG:
                        Log.note(
                            "found revision with push date {{date|datetime}}",
                            date=rev.push.date,
                        )
                    revisions.discard(r)

                    if rev.etl.timestamp > Date.now() - (DAEMON_RECENT_HG_PULL * SECOND):
                        # SOME PUSHES ARE BIG, RUNNING THE RISK OTHER MACHINES ARE
                        # ALSO INTERESTED AND PERFORMING THE SAME SCAN. THIS DELAY
                        # WILL HAVE SMALL EFFECT ON THE MAJORITY OF SMALL PUSHES
                        # https://bugzilla.mozilla.org/show_bug.cgi?id=1417720
                        Till(seconds=Random.float(DAEMON_HG_INTERVAL * 2)).wait()
                except Exception as e:
                    Log.warning(
                        "Scanning {{branch}} {{revision|left(12)}}",
                        branch=branch.name,
                        revision=r,
                        cause=e,
                    )
                    if "Read timed out" in e:
                        Till(seconds=DAEMON_WAIT_AFTER_TIMEOUT).wait()

            # FIND ANY BRANCH THAT MAY HAVE THIS REVISION
            for r in list(revisions):
                self._find_revision(r)
def put(self, local, remote, use_sudo=False):
    if self.conn.command_cwds and not remote.startswith(("/", "~")):
        remote = self.conn.command_cwds[-1].rstrip("/'") + "/" + remote

    if use_sudo:
        filename = "/tmp/" + Random.hex(20)
        self.conn.put(File(local).abspath, filename)
        self.sudo("cp " + filename + " " + remote)
        self.sudo("rm " + filename)
    else:
        self.conn.put(File(local).abspath, remote)
def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    kwargs=None,
):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX
    """
    kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
    kwargs.host = Random.sample(listwrap(host), 1)[0]

    rollover_interval = coalesce(kwargs.rollover.interval, kwargs.rollover.max, "year")
    rollover_max = coalesce(kwargs.rollover.max, kwargs.rollover.interval, "year")

    schema = set_default(
        kwargs.schema,
        {"mappings": {kwargs.type: {"properties": {"~N~": {"type": "nested"}}}}},
        json2value(value2json(SCHEMA), leaves=True),
    )

    self.es = RolloverIndex(
        rollover_field={"get": [{"first": "."}, {"literal": "timestamp"}]},
        rollover_interval=rollover_interval,
        rollover_max=rollover_max,
        schema=schema,
        limit_replicas=True,
        typed=True,
        read_only=False,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.queue = Queue("debug logs to es", max=queue_size, silent=True)

    self.worker = Thread.run("add debug logs to es", self._insert_loop)
def fix(rownum, line, source, sample_only_filter, sample_size):
    # ES SCHEMA IS STRICTLY TYPED, USE "code" FOR TEXT IDS
    line = line.replace('{"id": "bb"}', '{"code": "bb"}').replace('{"id": "tc"}', '{"code": "tc"}')

    # ES SCHEMA IS STRICTLY TYPED, THE SUITE OBJECT CAN NOT BE HANDLED
    if source.name.startswith("active-data-test-result"):
        # "suite": {"flavor": "plain-chunked", "name": "mochitest"}
        found = strings.between(line, '"suite": {', '}')
        if found:
            suite_json = '{' + found + "}"
            if suite_json:
                suite = mo_json.json2value(suite_json)
                suite = convert.value2json(coalesce(suite.fullname, suite.name))
                line = line.replace(suite_json, suite)

    if source.name.startswith("active-data-codecoverage"):
        d = convert.json2value(line)
        if d.source.file.total_covered > 0:
            return {"id": d._id, "json": line}, False
        else:
            return None, False

    if rownum == 0:
        value = mo_json.json2value(line)
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
        if sample_only_filter and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        value = mo_json.json2value(line)
        _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    elif line.find('"resource_usage":') != -1:
        value = mo_json.json2value(line)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    else:
        # FAST
        _id = strings.between(line, "\"_id\": \"", "\"")  # AVOID DECODING JSON
        row = {"id": _id, "json": line}

    return row, False
def get(self, remote, local, use_sudo=False):
    if self.conn.command_cwds and not remote.startswith(("/", "~")):
        remote = self.conn.command_cwds[-1].rstrip("/'") + "/" + remote

    if use_sudo:
        filename = "/tmp/" + Random.filename()
        self.sudo("cp " + remote + " " + filename)
        self.sudo("chmod a+r " + filename)
        self.conn.get(filename, File(local).abspath)
        self.sudo("rm " + filename)
    else:
        self.conn.get(remote, File(local).abspath)
def _get_from_elasticsearch(self, revision, locale=None, get_diff=False, get_moves=True):
    rev = revision.changeset.id
    if self.es.cluster.version.startswith("1.7."):
        query = {
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"term": {"changeset.id12": rev[0:12]}},
                    {"term": {"branch.name": revision.branch.name}},
                    {"term": {"branch.locale": coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)}},
                    {"range": {"etl.timestamp": {"gt": MIN_ETL_AGE}}}
                ]}
            }},
            "size": 20
        }
    else:
        query = {
            "query": {"bool": {"must": [
                {"term": {"changeset.id12": rev[0:12]}},
                {"term": {"branch.name": revision.branch.name}},
                {"term": {"branch.locale": coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)}},
                {"range": {"etl.timestamp": {"gt": MIN_ETL_AGE}}}
            ]}},
            "size": 20
        }

    for attempt in range(3):
        try:
            with self.es_locker:
                docs = self.es.search(query).hits.hits
            if len(docs) == 0:
                return None
            best = docs[0]._source
            if len(docs) > 1:
                for d in docs:
                    if d._id.endswith(d._source.branch.locale):
                        best = d._source
                Log.warning("expecting no more than one document")
            return best
        except Exception as e:
            e = Except.wrap(e)
            if "EsRejectedExecutionException[rejected execution (queue capacity" in e:
                (Till(seconds=Random.int(30))).wait()
                continue
            else:
                Log.warning("Bad ES call, waiting for {{num}} seconds", num=WAIT_AFTER_NODE_FAILURE, cause=e)
                Till(seconds=WAIT_AFTER_NODE_FAILURE).wait()
                continue

    Log.warning("ES did not deliver, fall back to HG")
    return None
def output(*args):
    with cache_store.locker:
        if using_self:
            self = args[0]
            args = args[1:]
        else:
            self = cache_store

        now = Date.now()
        try:
            _cache = getattr(self, attr_name)
        except Exception:
            _cache = {}
            setattr(self, attr_name, _cache)

        if Random.int(100) == 0:
            # REMOVE OLD CACHE
            _cache = {k: v for k, v in _cache.items() if v[0] == None or v[0] > now}
            setattr(self, attr_name, _cache)

        timeout, key, value, exception = _cache.get(args, (Null, Null, Null, Null))

    if now >= timeout:
        value = func(self, *args)
        with cache_store.locker:
            _cache[args] = (now + cache_store.timeout, args, value, None)
        return value

    if value == None:
        if exception == None:
            try:
                value = func(self, *args)
                with cache_store.locker:
                    _cache[args] = (now + cache_store.timeout, args, value, None)
                return value
            except Exception as e:
                e = Except.wrap(e)
                with cache_store.locker:
                    _cache[args] = (now + cache_store.timeout, args, None, e)
                raise e
        else:
            raise exception
    else:
        return value
def test_wrap_2():
    switch = [
        lambda: {"i": Random.int(2000)},
        lambda: Data(i=Random.int(2000)),
        lambda: FlatList([{"i": Random.int(2000)}]),
        lambda: [{"i": Random.int(2000)}]
    ]

    inputs = [
        switch[min(len(switch) - 1, int(floor(-log(Random.float(), 2))))]()
        for i in range(NUM_INPUT)
    ]

    for i in range(NUM_REPEAT):
        results = []
        gc.collect()
        with Profiler("more dict: slow_wrap"):
            for v in inputs:
                results.append(slow_wrap(v))

        results = []
        gc.collect()
        with Profiler("more dict: wrap"):
            for v in inputs:
                results.append(wrap(v))

        results = []
        gc.collect()
        with Profiler("more dict: baseline"):
            for v in inputs:
                results.append(baseline(v))

        Log.note("Done {{i}} of {{num}}", {"i": i, "num": NUM_REPEAT})
def fix(rownum, line, source, sample_only_filter, sample_size):
    value = json2value(line)

    if value._id.startswith(("tc.97", "96", "bb.27")):
        # AUG 24, 25 2017 - included full diff with repo; too big to index
        try:
            data = json2value(line)
            repo = data.repo
            repo.etl = None
            repo.branch.last_used = None
            repo.branch.description = None
            repo.branch.etl = None
            repo.branch.parent_name = None
            repo.children = None
            repo.parents = None
            if repo.changeset.diff or data.build.repo.changeset.diff:
                Log.error("no diff allowed")
            else:
                assertAlmostEqual(minimize_repo(repo), repo)
        except Exception as e:
            if CAN_NOT_DECODE_JSON in e:
                raise e
            data.repo = minimize_repo(repo)
            data.build.repo = minimize_repo(data.build.repo)
            line = value2json(data)
    else:
        pass

    if rownum == 0:
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        value = _fix(value)
        if sample_only_filter and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            row = {"value": value}
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        _shorten(value, source)
        value = _fix(value)
    elif line.find('"resource_usage":') != -1:
        value = _fix(value)

    row = {"value": value}
    return row, False
def test_id_vs_id(self):
    ops = [Op() for _ in range(200)]

    lang1 = {id(o): o for o in ops}
    sample = Random.sample(ops, 1000 * 1000)
    with Timer("using id()"):
        result1 = [lang1[id(o)] for o in sample]

    lang2 = [None] * (max(o.id for o in ops) + 1)
    for o in ops:
        lang2[o.id] = o
    # lang2 = tuple(lang2)
    with Timer("using o.id"):
        result2 = [lang2[o.id] for o in sample]
def encrypt(text, _key, salt=None):
    """
    RETURN {"salt":s, "length":l, "data":d} -> JSON -> UTF8
    """
    if is_text(text):
        encoding = 'utf8'
        data = bytearray(text.encode("utf8"))
    elif is_binary(text):
        encoding = None
        if PY2:
            data = bytearray(text)
        else:
            data = text

    if _key is None:
        Log.error("Expecting a key")
    if is_binary(_key):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Data()
    output.type = "AES256"
    output.salt = bytes2base64(salt)
    output.length = len(data)
    output.encoding = encoding

    encrypted = bytearray()
    for _, d in _groupby16(data):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = bytes2base64(encrypted)
    json = get_module("mo_json").value2json(output, pretty=True).encode('utf8')

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
def encrypt(text, _key, salt=None):
    """
    RETURN {"salt":s, "length":l, "data":d} -> JSON -> UTF8
    """
    if isinstance(text, text_type):
        encoding = 'utf8'
        data = bytearray(text.encode("utf8"))
    elif isinstance(text, binary_type):
        encoding = None
        if PY2:
            data = bytearray(text)
        else:
            data = text

    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, binary_type):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Data()
    output.type = "AES256"
    output.salt = bytes2base64(salt)
    output.length = len(data)
    output.encoding = encoding

    encrypted = bytearray()
    for _, d in _groupby16(data):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = bytes2base64(encrypted)
    json = get_module("mo_json").value2json(output, pretty=True).encode('utf8')

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
def _daemon(self, please_stop):
    while not please_stop:
        with Explanation("looking for work"):
            try:
                branch, revisions = self.todo.pop(till=please_stop)
            except Exception as e:
                if please_stop:
                    break
                else:
                    raise e
            if branch.name in DAEMON_DO_NO_SCAN:
                continue
            revisions = set(revisions)

            # FIND THE REVISIONS ON THIS BRANCH
            for r in list(revisions):
                try:
                    rev = self.get_revision(Revision(branch=branch, changeset={"id": r}))
                    if DAEMON_DEBUG:
                        Log.note("found revision with push date {{date|datetime}}", date=rev.push.date)
                    revisions.discard(r)

                    if rev.etl.timestamp > Date.now() - (DAEMON_RECENT_HG_PULL * SECOND):
                        # SOME PUSHES ARE BIG, RUNNING THE RISK OTHER MACHINES ARE
                        # ALSO INTERESTED AND PERFORMING THE SAME SCAN. THIS DELAY
                        # WILL HAVE SMALL EFFECT ON THE MAJORITY OF SMALL PUSHES
                        # https://bugzilla.mozilla.org/show_bug.cgi?id=1417720
                        Till(seconds=Random.float(DAEMON_HG_INTERVAL * 2)).wait()
                except Exception as e:
                    Log.warning(
                        "Scanning {{branch}} {{revision|left(12)}}",
                        branch=branch.name,
                        revision=r,
                        cause=e
                    )
                    if "Read timed out" in e:
                        Till(seconds=DAEMON_WAIT_AFTER_TIMEOUT).wait()

            # FIND ANY BRANCH THAT MAY HAVE THIS REVISION
            for r in list(revisions):
                self._find_revision(r)
def fix(rownum, line, source, sample_only_filter, sample_size):
    # ES SCHEMA IS STRICTLY TYPED, USE "code" FOR TEXT IDS
    line = line.replace('{"id": "bb"}', '{"code": "bb"}').replace('{"id": "tc"}', '{"code": "tc"}')

    # ES SCHEMA IS STRICTLY TYPED, THE SUITE OBJECT CAN NOT BE HANDLED
    if source.name.startswith("active-data-test-result"):
        # "suite": {"flavor": "plain-chunked", "name": "mochitest"}
        found = strings.between(line, '"suite": {', '}')
        if found:
            suite_json = '{' + found + "}"
            if suite_json:
                suite = mo_json.json2value(suite_json)
                suite = convert.value2json(coalesce(suite.fullname, suite.name))
                line = line.replace(suite_json, suite)

    if rownum == 0:
        value = mo_json.json2value(line)
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
        if sample_only_filter and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        value = mo_json.json2value(line)
        _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    elif line.find('"resource_usage":') != -1:
        value = mo_json.json2value(line)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    else:
        # FAST
        _id = strings.between(line, "\"_id\": \"", "\"")  # AVOID DECODING JSON
        row = {"id": _id, "json": line}

    return row, False
def device_register(self, path=None):
    """
    EXPECTING A SIGNED REGISTRATION REQUEST
    RETURN JSON WITH url FOR LOGIN
    """
    now = Date.now().unix
    request_body = request.get_data().strip()
    signed = json2value(request_body.decode("utf8"))
    command = json2value(base642bytes(signed.data).decode("utf8"))
    session.public_key = command.public_key
    rsa_crypto.verify(signed, session.public_key)

    self.session_manager.setup_session(session)
    session.expires = now + parse("10minute").seconds
    session.state = bytes2base64URL(Random.bytes(32))

    with self.device.db.transaction() as t:
        t.execute(sql_insert(
            self.device.table,
            {"state": session.state, "session_id": session.session_id},
        ))

    response = value2json(Data(
        session_id=session.session_id,
        interval="5second",
        expiry=session.expires,
        url=URL(
            self.device.home,
            path=self.device.endpoints.login,
            query={"state": session.state},
        ),
    ))

    return Response(response, headers={"Content-Type": "application/json"}, status=200)
def encrypt(text, _key, salt=None):
    """
    RETURN JSON OF ENCRYPTED DATA {"salt":s, "length":l, "data":d}
    """
    if not isinstance(text, unicode):
        Log.error("only unicode is encrypted")
    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, str):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    data = bytearray(text.encode("utf8"))

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Data()
    output.type = "AES256"
    output.salt = bytes2base64(salt)
    output.length = len(data)

    encrypted = bytearray()
    for _, d in _groupby16(data):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = bytes2base64(encrypted)
    json = get_module("mo_json").value2json(output)

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
def encrypt(text, _key, salt=None):
    """
    RETURN JSON OF ENCRYPTED DATA {"salt":s, "length":l, "data":d}
    """
    if not isinstance(text, text_type):
        Log.error("only unicode is encrypted")
    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, str):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    data = bytearray(text.encode("utf8"))

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Data()
    output.type = "AES256"
    output.salt = bytes2base64(salt)
    output.length = len(data)

    encrypted = bytearray()
    for _, d in _groupby16(data):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = bytes2base64(encrypted)
    json = get_module("mo_json").value2json(output)

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
def es_bulksetop(esq, frum, query):
    abs_limit = MIN([query.limit, MAX_DOCUMENTS])
    guid = Random.base64(32, extra="-_")

    schema = query.frum.schema
    query_path = schema.query_path[0]
    new_select, split_select = get_selects(query)
    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = MIN([query.chunk_size, MAX_CHUNK_SIZE])
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)
    if not es_query.sort:
        es_query.sort = ["_doc"]

    formatter = formatters[query.format](abs_limit, new_select, query)

    Thread.run(
        "Download " + guid,
        extractor,
        guid,
        abs_limit,
        esq,
        es_query,
        formatter,
        parent_thread=Null,
    ).release()

    output = wrap({
        "url": URL_PREFIX / (guid + ".json"),
        "status": URL_PREFIX / (guid + ".status.json"),
        "meta": {"format": query.format, "es_query": es_query, "limit": abs_limit},
    })
    return output
def test_compare_isinstance_to_class_checks(self):
    num = 1 * 1000 * 1000
    options = {
        0: lambda: {},
        1: lambda: Data(),
        2: lambda: Null,
        3: lambda: 6,
        4: lambda: "string",
    }
    data = [options[Random.int(len(options))]() for _ in range(num)]

    with Timer("isinstance check") as i_time:
        i_result = [isinstance(d, Mapping) for d in data]

    with Timer("set check") as s_time:
        s_result = [d.__class__ in MAPPING_TYPES for d in data]

    with Timer("eq check") as e_time:
        e_result = [d.__class__ is Data or d.__class__ is dict for d in data]

    with Timer("name check") as n_time:
        n_result = [is_instance(d, Data) or is_instance(d, dict) for d in data]

    with Timer("check w method") as m_time:
        m_result = [is_mapping(d) for d in data]

    self.assertEqual(s_result, i_result)
    self.assertEqual(m_result, i_result)
    self.assertEqual(e_result, i_result)
    self.assertEqual(n_result, i_result)

    self.assertGreater(i_time.duration, s_time.duration)
    self.assertGreater(m_time.duration, s_time.duration)
def complex_job(
    transactional_db, generic_reference_data, test_repository, extract_job_settings, now
):
    fc = FailureClassification.objects.create(id=1, name="not classified")
    repository_group = RepositoryGroup.objects.create(name="common")
    repo = Repository.objects.create(name="autoland", repository_group=repository_group)

    push = Push.objects.create(
        **{
            "author": "*****@*****.**",
            "repository": repo,
            "revision": "ae6bb3a1066959a8c43d003a3caab0af769455bf",
            "time": unix2datetime(1578427105).replace(tzinfo=None),
        }
    )

    Commit.objects.create(
        push=push,
        revision="ae6bb3a1066959a8c43d003a3caab0af769455bf",
        author="*****@*****.**",
        comments="no comment",
    )
    Commit.objects.create(
        push=push,
        revision="0123456789012345678901234567890123456789",
        author="*****@*****.**",
        comments="no comment2",
    )

    debug = Option.objects.create(name="debug")
    oc = OptionCollection.objects.create(option_collection_hash=Random.base64(5), option=debug)

    job = Job.objects.create(
        autoclassify_status=1,
        guid=Random.base64(20),
        repository=test_repository,
        push_id=push.id,
        signature=generic_reference_data.signature,
        build_platform=generic_reference_data.build_platform,
        machine_platform=generic_reference_data.machine_platform,
        machine=generic_reference_data.machine,
        option_collection_hash=oc.option_collection_hash,
        job_type=generic_reference_data.job_type,
        job_group=generic_reference_data.job_group,
        product=generic_reference_data.product,
        failure_classification_id=fc.id,
        who="*****@*****.**",
        reason="scheduled",
        result="success",
        state="completed",
        submit_time=unix2datetime(1578427253).replace(tzinfo=None),
        start_time=unix2datetime(1578430841).replace(tzinfo=None),
        last_modified=unix2datetime(1578432686.364459).replace(tzinfo=None),
        end_time=unix2datetime(1578432680).replace(tzinfo=None),
        tier=1,
    )

    text_log_step = TextLogStep.objects.create(
        job=job,
        **{
            "finished_line_number": 88739,
            "name": "Unnamed step",
            "result": 7,
            "started_line_number": 0,
        },
    )

    TextLogError.objects.create(
        step=text_log_step, line="line contents here", line_number=619845839
    )
    TextLogError.objects.create(step=text_log_step, line="ERROR! more line contents", line_number=6)

    TaskclusterMetadata.objects.create(job=job, retry_id=0, task_id="WWb9ExAvQUa78ku0DIxdSQ")

    JobLog.objects.create(
        **{
            "job_id": job.id,
            "name": "builds-4h",
            "status": 1,
            "url": "https://example.com/api/queue/v1/task/WWb9ExAvQUa78ku0DIxdSQ/runs/0/artifacts/public/logs/live_backing.log",
        }
    )
    job_logs1 = JobLog.objects.create(
        **{
            "job_id": job.id,
            "name": "errorsummary_json",
            "status": 1,
            "url": "https://example.com/api/queue/v1/task/WWb9ExAvQUa78ku0DIxdSQ/runs/0/artifacts/public/test_info/wpt_errorsummary.log",
        }
    )

    bcf = ClassifiedFailure.objects.create(**{"bug_number": 1234567})
    bcf.created = Date("2020-01-17 12:00:00").datetime
    bcf.save()

    FailureLine.objects.create(
        job_log=job_logs1,
        **{
            "action": "test_groups",
            "best_classification": bcf,
            "best_is_verified": True,
            "repository": repo,
            "job_guid": job.guid,
            "line": 15,
            "modified": 0,
            "stackwalk_stderr": 1578432686,
            "stackwalk_stdout": 1578432686,
        },
    )
    FailureLine.objects.create(
        job_log=job_logs1,
        **{
            "action": "crash",
            "best_classification": bcf,
            "best_is_verified": False,
            "repository": repo,
            "job_guid": job.guid,
            "line": 24031,
            "modified": 0,
            "signature": "@ mozilla::dom::CustomElementData::SetCustomElementDefinition(mozilla::dom::CustomElementDefinition*)",
            "stackwalk_stderr": 1578432686,
            "stackwalk_stdout": 1578432686,
            "test": "/custom-elements/upgrading.html",
        },
    )

    return job
def unique_name():
    return Random.string(20)
def _scan_database(self): # GET ALL RELATIONS raw_relations = self.db.query( """ SELECT table_schema, table_name, referenced_table_schema, referenced_table_name, referenced_column_name, constraint_name, column_name, ordinal_position FROM information_schema.key_column_usage WHERE referenced_column_name IS NOT NULL """, param=self.settings.database, ) if not raw_relations: Log.error("No relations in the database") for r in self.settings.add_relations: try: lhs, rhs = map(strings.trim, r.split("->")) lhs = lhs.split(".") if len(lhs) == 2: lhs = [self.settings.database.schema] + lhs rhs = rhs.split(".") if len(rhs) == 2: rhs = [self.settings.database.schema] + rhs to_add = Data( ordinal_position=1, # CAN ONLY HANDLE 1-COLUMN RELATIONS table_schema=lhs[0], table_name=lhs[1], column_name=lhs[2], referenced_table_schema=rhs[0], referenced_table_name=rhs[1], referenced_column_name=rhs[2], ) # CHECK IF EXISTING if jx.filter(raw_relations, {"eq": to_add}): Log.note("Relation {{relation}} already exists", relation=r) continue to_add.constraint_name = Random.hex(20) raw_relations.append(to_add) except Exception as e: Log.error("Could not parse {{line|quote}}", line=r, cause=e) relations = jx.select( raw_relations, [ { "name": "constraint.name", "value": "constraint_name" }, { "name": "table.schema", "value": "table_schema" }, { "name": "table.name", "value": "table_name" }, { "name": "column.name", "value": "column_name" }, { "name": "referenced.table.schema", "value": "referenced_table_schema" }, { "name": "referenced.table.name", "value": "referenced_table_name" }, { "name": "referenced.column.name", "value": "referenced_column_name" }, { "name": "ordinal_position", "value": "ordinal_position" }, ], ) # GET ALL TABLES raw_tables = self.db.query(""" SELECT t.table_schema, t.table_name, c.constraint_name, c.constraint_type, k.column_name, k.ordinal_position FROM information_schema.tables t LEFT JOIN information_schema.table_constraints c on c.table_name=t.table_name AND c.table_schema=t.table_schema and (constraint_type='UNIQUE' or constraint_type='PRIMARY KEY') LEFT JOIN information_schema.key_column_usage k on k.constraint_name=c.constraint_name AND k.table_name=t.table_name and k.table_schema=t.table_schema ORDER BY t.table_schema, t.table_name, c.constraint_name, k.ordinal_position, k.column_name """) # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING tables = UniqueIndex(keys=["name", "schema"]) for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]): c = wrap(list(c)) best_index = Null is_referenced = False is_primary = False for g, w in jx.groupby(c, "constraint_name"): if not g.constraint_name: continue w = list(w) ref = False for r in relations: if (r.table.name == t.table_name and r.table.schema == t.table_schema and r.constraint.name == g.constraint_name): ref = True is_prime = w[0].constraint_type == "PRIMARY" reasons_this_one_is_better = [ best_index == None, # WE DO NOT HAVE A CANDIDATE YET is_prime and not is_primary, # PRIMARY KEYS ARE GOOD TO HAVE is_primary == is_prime and ref and not is_referenced, # REFERENCED UNIQUE TUPLES ARE GOOD TOO is_primary == is_prime and ref == is_referenced and len(w) < len(best_index), # THE SHORTER THE TUPLE, THE BETTER ] if any(reasons_this_one_is_better): is_primary = is_prime is_referenced = ref best_index = w tables.add({ "name": t.table_name, "schema": t.table_schema, "id": [b.column_name for b in best_index], }) fact_table = tables[self.settings.fact_table, self.settings.database.schema] ids_table = { "alias": "t0", "name": 
"__ids__", "schema": fact_table.schema, "id": fact_table.id, } relations.extend( wrap({ "constraint": { "name": "__link_ids_to_fact_table__" }, "table": ids_table, "column": { "name": c }, "referenced": { "table": fact_table, "column": { "name": c } }, "ordinal_position": i, }) for i, c in enumerate(fact_table.id)) tables.add(ids_table) # GET ALL COLUMNS raw_columns = self.db.query(""" SELECT column_name, table_schema, table_name, ordinal_position, data_type FROM information_schema.columns """) reference_only_tables = [ r.split(".")[0] for r in self.settings.reference_only if len(r.split(".")) == 2 ] reference_all_tables = [ r.split(".")[0] for r in self.settings.reference_only if len(r.split(".")) == 1 ] foreign_column_table_schema_triples = {(r.column.name, r.table.name, r.table.schema) for r in relations} referenced_column_table_schema_triples = {( r.referenced.column.name, r.referenced.table.name, r.referenced.table.schema, ) for r in relations} related_column_table_schema_triples = ( foreign_column_table_schema_triples | referenced_column_table_schema_triples) columns = UniqueIndex(["column.name", "table.name", "table.schema"]) for c in raw_columns: if c.table_name in reference_only_tables: if c.table_name + "." + c.column_name in self.settings.reference_only: include = True reference = True foreign = False elif c.column_name in tables[(c.table_name, c.table_schema)].id: include = self.settings.show_foreign_keys reference = False foreign = False else: include = False reference = False foreign = False elif c.table_name in reference_all_tables: # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED if c.column_name in tables[(c.table_name, c.table_schema)].id: include = self.settings.show_foreign_keys reference = True foreign = False elif ( c.column_name, c.table_name, c.table_schema, ) in foreign_column_table_schema_triples: include = False reference = False foreign = True else: include = True reference = False foreign = False elif c.column_name in tables[(c.table_name, c.table_schema)].id: include = self.settings.show_foreign_keys reference = False foreign = False elif ( c.column_name, c.table_name, c.table_schema, ) in foreign_column_table_schema_triples: include = False reference = False foreign = True elif ( c.column_name, c.table_name, c.table_schema, ) in referenced_column_table_schema_triples: include = self.settings.show_foreign_keys reference = False foreign = False else: include = True reference = False foreign = False rel = { "column": { "name": c.column_name, "type": c.data_type }, "table": { "name": c.table_name, "schema": c.table_schema }, "ordinal_position": c.ordinal_position, "is_id": c.column_name in tables[(c.table_name, c.table_schema)].id, "include": include, # TRUE IF THIS COLUMN IS OUTPUTTED "reference": reference, # TRUE IF THIS COLUMN REPRESENTS THE ROW "foreign": foreign, # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW } columns.add(rel) # ITERATE OVER ALL PATHS todo = FlatList() output_columns = FlatList() nested_path_to_join = {} all_nested_paths = [["."]] def follow_paths(position, path, nested_path, done_relations, no_nested_docs): if position.name in self.settings.exclude: return if self.path_not_allowed(path): return if DEBUG: Log.note("Trace {{path}}", path=path) if position.name != "__ids__": # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR WHEN IF IT FAILS) self.db.query( ConcatSQL( SQL_SELECT, SQL_STAR, SQL_FROM, quote_column(position.schema, position.name), SQL_LIMIT, SQL_ONE, )) if position.name in reference_all_tables: no_nested_docs = 
True if position.name in reference_only_tables: return curr_join_list = copy(nested_path_to_join[nested_path[0]]) ############################################################################### # INNER OBJECTS ############################################################################### referenced_tables = list( sort_using_key( jx.groupby( jx.filter( relations, { "eq": { "table.name": position.name, "table.schema": position.schema, } }, ), "constraint.name", ), key=lambda p: first(p[1]).column.name, )) for g, constraint_columns in referenced_tables: g = unwrap(g) constraint_columns = deepcopy(constraint_columns) if g["constraint.name"] in done_relations: continue if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude): continue done_relations.add(g["constraint.name"]) many_to_one_joins = nested_path_to_join[nested_path[0]] index = len(many_to_one_joins) alias = "t" + text(index) for c in constraint_columns: c.referenced.table.alias = alias c.table = position many_to_one_joins.append({ "join_columns": constraint_columns, "path": path, "nested_path": nested_path, }) # HANDLE THE COMMON *id SUFFIX name = [] for cname, tname in zip( constraint_columns.column.name, constraint_columns.referenced.table.name, ): if cname.startswith(tname): name.append(tname) elif cname.endswith("_id"): name.append(cname[:-3]) else: name.append(cname) relation_string = many_to_one_string(constraint_columns[0]) step = "/".join(name) if len(constraint_columns) == 1: step = self.name_relations.get(relation_string, step) referenced_column_path = concat_field(path, step) if self.path_not_allowed(referenced_column_path): continue if referenced_column_path in reference_only_tables: continue col_pointer_name = relative_field(referenced_column_path, nested_path[0]) for col in columns: if (col.table.name == constraint_columns[0].referenced.table.name and col.table.schema == constraint_columns[0].referenced.table.schema): col_full_name = concat_field( col_pointer_name, literal_field(col.column.name)) if (col.is_id and len(nested_path) == 1 and col.table.name == fact_table.name and col.table.schema == fact_table.schema): # ALWAYS SHOW THE ID OF THE FACT c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": True, "path": referenced_column_path, "nested_path": nested_path, "put": col_full_name, }) elif col.column.name == constraint_columns[ 0].column.name: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": False, "path": referenced_column_path, "nested_path": nested_path, "put": col_full_name if self.settings.show_foreign_keys else None, }) elif col.is_id: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": False, "path": referenced_column_path, "nested_path": nested_path, "put": col_full_name if self.settings.show_foreign_keys else None, }) elif col.reference: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": False, "path": referenced_column_path, "nested_path": nested_path, "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name, # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED }) elif col.include: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), 
"column": col, "sort": False, "path": referenced_column_path, "nested_path": nested_path, "put": col_full_name, }) if position.name in reference_only_tables: continue todo.append( Data( position=copy(constraint_columns[0].referenced.table), path=referenced_column_path, nested_path=nested_path, done_relations=copy(done_relations), no_nested_docs=no_nested_docs, )) ############################################################################### # NESTED OBJECTS ############################################################################### if not no_nested_docs: nesting_tables = list( sort_using_key( jx.groupby( jx.filter( relations, { "eq": { "referenced.table.name": position.name, "referenced.table.schema": position.schema, } }, ), "constraint.name", ), key=lambda p: [(r.table.name, r.column.name) for r in [first(p[1])]][0], )) for g, constraint_columns in nesting_tables: g = unwrap(g) constraint_columns = deepcopy(constraint_columns) if g["constraint.name"] in done_relations: continue done_relations.add(g["constraint.name"]) many_table = set(constraint_columns.table.name) if not (many_table - self.settings.exclude): continue relation_string = one_to_many_string(constraint_columns[0]) step = "/".join(many_table) if len(constraint_columns) == 1: step = self.name_relations.get(relation_string, step) referenced_column_path = concat_field(path, step) if self.path_not_allowed(referenced_column_path): continue new_nested_path = [referenced_column_path] + nested_path all_nested_paths.append(new_nested_path) if referenced_column_path in nested_path_to_join: Log.error( "{{path}} already exists, try adding entry to name_relations", path=referenced_column_path, ) one_to_many_joins = nested_path_to_join[ referenced_column_path] = copy(curr_join_list) index = len(one_to_many_joins) alias = "t" + text(index) for c in constraint_columns: c.table.alias = alias c.referenced.table = position one_to_many_joins.append( set_default( {}, g, { "children": True, "join_columns": constraint_columns, "path": path, "nested_path": nested_path, }, )) for col in columns: if (col.table.name == constraint_columns[0].table.name and col.table.schema == constraint_columns[0].table.schema): col_full_name = join_field( split_field(referenced_column_path) [len(split_field(new_nested_path[0])):] + [literal_field(col.column.name)]) if col.column.name == constraint_columns[ 0].column.name: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": col.is_id, "path": referenced_column_path, "nested_path": new_nested_path, "put": col_full_name if self.settings.show_foreign_keys else None, }) elif col.is_id: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": col.is_id, "path": referenced_column_path, "nested_path": new_nested_path, "put": col_full_name if self.settings.show_foreign_keys else None, }) else: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": col.is_id, "path": referenced_column_path, "nested_path": new_nested_path, "put": col_full_name if col.include else None, }) todo.append( Data( position=constraint_columns[0].table, path=referenced_column_path, nested_path=new_nested_path, done_relations=copy(done_relations), no_nested_docs=no_nested_docs, )) path = "." 
nested_path = [path] nested_path_to_join["."] = [{ "path": path, "join_columns": [{ "referenced": { "table": ids_table } }], "nested_path": nested_path, }] todo.append( Data( position=ids_table, path=path, nested_path=nested_path, done_relations=set(), no_nested_docs=False, )) while todo: item = todo.pop(0) follow_paths(**item) self.all_nested_paths = all_nested_paths self.nested_path_to_join = nested_path_to_join self.columns = output_columns
def _get_from_elasticsearch(
    self,
    revision,
    locale=None,
    get_diff=False,
    get_moves=True,
    after=None,  # RETURN RECORDS ETLed AFTER GIVEN TIME
):
    """
    MAKE CALL TO ES
    """
    rev = revision.changeset.id
    if self.repo.cluster.version.startswith("1.7."):
        query = {
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"term": {"changeset.id12": rev[0:12]}},
                    {"term": {"branch.name": revision.branch.name}},
                    {"term": {"branch.locale": coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)}},
                    {"range": {"etl.timestamp": {"gt": Date.max(after, MIN_ETL_AGE)}}},
                ]},
            }},
            "size": 20,
        }
    else:
        query = {
            "query": {"bool": {"must": [
                {"term": {"changeset.id12": rev[0:12]}},
                {"term": {"branch.name": revision.branch.name}},
                {"term": {"branch.locale": coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)}},
                {"range": {"etl.timestamp": {"gt": Date.max(after, MIN_ETL_AGE)}}},
            ]}},
            "size": 20,
        }

    for attempt in range(3):
        try:
            if get_moves:
                with self.moves_locker:
                    docs = self.moves.search(query).hits.hits
            else:
                with self.repo_locker:
                    docs = self.repo.search(query).hits.hits
            if len(docs) == 0:
                return None
            best = docs[0]._source
            if len(docs) > 1:
                for d in docs:
                    if d._id.endswith(d._source.branch.locale):
                        best = d._source
                Log.warning("expecting no more than one document")
            return best
        except Exception as e:
            e = Except.wrap(e)
            if "EsRejectedExecutionException[rejected execution (queue capacity" in e:
                (Till(seconds=Random.int(30))).wait()
                continue
            else:
                Log.warning(
                    "Bad ES call, waiting for {{num}} seconds",
                    num=WAIT_AFTER_NODE_FAILURE,
                    cause=e,
                )
                Till(seconds=WAIT_AFTER_NODE_FAILURE).wait()
                continue

    Log.warning("ES did not deliver, fall back to HG")
    return None
def es_bulkaggsop(esq, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    chunk_size = min(coalesce(query.chunk_size, MAX_CHUNK_SIZE), MAX_CHUNK_SIZE)
    schema = frum.schema
    query_path = first(schema.query_path)
    selects = listwrap(query.select)

    variable = first(query.groupby).value
    # FIND CARDINALITY
    cardinality_check = Timer(
        "Get cardinality for {{column}}", param={"column": variable.var}
    )

    with cardinality_check:
        columns = schema.leaves(variable.var)
        if len(columns) != 1:
            Log.error(
                "too many columns to bulk groupby:\n{{columns|json}}", columns=columns
            )
        column = first(columns)

        if query.where is TRUE:
            cardinality = column.cardinality
            if cardinality == None:
                esq.namespace._update_cardinality(column)
                cardinality = column.cardinality
        else:
            cardinality = esq.query({
                "select": {
                    "name": "card",
                    "value": variable,
                    "aggregate": "cardinality",
                },
                "from": frum.name,
                "where": query.where,
                "format": "cube",
            }).card

        num_partitions = (cardinality + chunk_size - 1) // chunk_size
        if num_partitions > MAX_PARTITIONS:
            Log.error("Requesting more than {{num}} partitions", num=num_partitions)

    acc, decoders, es_query = build_es_query(selects, query_path, schema, query)
    guid = Random.base64(32, extra="-_")
    abs_limit = mo_math.MIN((query.limit, first(query.groupby).domain.limit))
    formatter = formatters[query.format](abs_limit)

    Thread.run(
        "extract to " + guid + ".json",
        extractor,
        guid,
        num_partitions,
        esq,
        query,
        selects,
        query_path,
        schema,
        chunk_size,
        cardinality,
        abs_limit,
        formatter,
        parent_thread=Null,
    )

    output = wrap(
        {
            "url": URL_PREFIX / (guid + ".json"),
            "status": URL_PREFIX / (guid + ".status.json"),
            "meta": {
                "format": query.format,
                "timing": {"cardinality_check": cardinality_check.duration},
                "es_query": es_query,
                "num_partitions": num_partitions,
                "cardinality": cardinality,
            },
        }
    )
    return output
def get_revision(self, revision, locale=None, get_diff=False, get_moves=True):
    """
    EXPECTING INCOMPLETE revision OBJECT
    RETURNS revision
    """
    rev = revision.changeset.id
    if not rev:
        return Null
    elif rev == "None":
        return Null
    elif revision.branch.name == None:
        return Null
    locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)
    output = self._get_from_elasticsearch(revision, locale=locale, get_diff=get_diff)
    if output:
        if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
            output.changeset.diff = None
        if not get_moves:
            output.changeset.moves = None
        DEBUG and Log.note(
            "Got hg ({{branch}}, {{locale}}, {{revision}}) from ES",
            branch=output.branch.name,
            locale=locale,
            revision=output.changeset.id
        )
        if output.push.date >= Date.now() - MAX_TODO_AGE:
            self.todo.add((output.branch, listwrap(output.parents)))
            self.todo.add((output.branch, listwrap(output.children)))
        if output.push.date:
            return output

    # RATE LIMIT CALLS TO HG (CACHE MISSES)
    next_cache_miss = self.last_cache_miss + (Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
    self.last_cache_miss = Date.now()
    if next_cache_miss > self.last_cache_miss:
        Log.note("delaying next hg call for {{seconds|round(decimal=1)}}", seconds=next_cache_miss - self.last_cache_miss)
        Till(till=next_cache_miss.unix).wait()

    found_revision = copy(revision)
    if isinstance(found_revision.branch, (text_type, binary_type)):
        lower_name = found_revision.branch.lower()
    else:
        lower_name = found_revision.branch.name.lower()

    if not lower_name:
        Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

    b = found_revision.branch = self.branches[(lower_name, locale)]
    if not b:
        b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
        if not b:
            Log.warning("can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale)
            return Null

    if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH:
        self.branches = _hg_branches.get_branches(kwargs=self.settings)

    push = self._get_push(found_revision.branch, found_revision.changeset.id)

    url1 = found_revision.branch.url.rstrip("/") + "/json-info?node=" + found_revision.changeset.id[0:12]
    url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + found_revision.changeset.id[0:12]
    with Explanation("get revision from {{url}}", url=url1, debug=DEBUG):
        raw_rev2 = Null
        try:
            raw_rev1 = self._get_raw_json_info(url1, found_revision.branch)
            raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch)
        except Exception as e:
            if "Hg denies it exists" in e:
                raw_rev1 = Data(node=revision.changeset.id)
            else:
                raise e

        output = self._normalize_revision(set_default(raw_rev1, raw_rev2), found_revision, push, get_diff, get_moves)
        if output.push.date >= Date.now() - MAX_TODO_AGE:
            self.todo.add((output.branch, listwrap(output.parents)))
            self.todo.add((output.branch, listwrap(output.children)))

        if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
            output.changeset.diff = None
        if not get_moves:
            output.changeset.moves = None

    return output
def _install_es(self, gigabytes):
    volumes = self.instance.markup.drives

    if not fabric_files.exists("/usr/local/elasticsearch"):
        with cd("/home/ec2-user/"):
            run("mkdir -p temp")

        if not File(LOCAL_JRE).exists:
            Log.error("Expecting {{file}} on manager to spread to ES instances", file=LOCAL_JRE)
        with cd("/home/ec2-user/temp"):
            run('rm -f ' + JRE)
            put("resources/" + JRE, JRE)
            sudo("rpm -i " + JRE)
            sudo("alternatives --install /usr/bin/java java /usr/java/default/bin/java 20000")
            run("export JAVA_HOME=/usr/java/default")

        with cd("/home/ec2-user/"):
            run('wget https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-1.7.1.tar.gz')
            run('tar zxfv elasticsearch-1.7.1.tar.gz')
            sudo('mkdir /usr/local/elasticsearch')
            sudo('cp -R elasticsearch-1.7.1/* /usr/local/elasticsearch/')

        with cd('/usr/local/elasticsearch/'):
            # BE SURE TO MATCH THE PLUGIN WITH ES VERSION
            # https://github.com/elasticsearch/elasticsearch-cloud-aws
            sudo('bin/plugin -install elasticsearch/elasticsearch-cloud-aws/2.7.1')

        # REMOVE THESE FILES, WE WILL REPLACE THEM WITH THE CORRECT VERSIONS AT THE END
        sudo("rm -f /usr/local/elasticsearch/config/elasticsearch.yml")
        sudo("rm -f /usr/local/elasticsearch/bin/elasticsearch.in.sh")

    self.conn = self.instance.connection

    # MOUNT AND FORMAT THE EBS VOLUMES (list with `lsblk`)
    for i, k in enumerate(volumes):
        if not fabric_files.exists(k.path):
            with fabric_settings(warn_only=True):
                sudo('sudo umount ' + k.device)
            sudo('yes | sudo mkfs -t ext4 ' + k.device)
            sudo('mkdir ' + k.path)
            sudo('sudo mount ' + k.device + ' ' + k.path)

            # ADD TO /etc/fstab SO IT IS STILL AROUND AFTER REBOOT
            sudo("sed -i '$ a\\" + k.device + " " + k.path + " ext4 defaults,nofail 0 2' /etc/fstab")

    # TEST IT IS WORKING
    sudo('mount -a')

    # INCREASE THE FILE HANDLE LIMITS
    with cd("/home/ec2-user/"):
        File("./results/temp/sysctl.conf").delete()
        get("/etc/sysctl.conf", "./results/temp/sysctl.conf", use_sudo=True)
        lines = File("./results/temp/sysctl.conf").read()
        if lines.find("fs.file-max = 100000") == -1:
            lines += "\nfs.file-max = 100000"
        lines = lines.replace("net.bridge.bridge-nf-call-ip6tables = 0", "")
        lines = lines.replace("net.bridge.bridge-nf-call-iptables = 0", "")
        lines = lines.replace("net.bridge.bridge-nf-call-arptables = 0", "")
        File("./results/temp/sysctl.conf").write(lines)
        put("./results/temp/sysctl.conf", "/etc/sysctl.conf", use_sudo=True)
        sudo("sysctl -p")

    # INCREASE FILE HANDLE PERMISSIONS
    sudo("sed -i '$ a\\root soft nofile 50000' /etc/security/limits.conf")
    sudo("sed -i '$ a\\root hard nofile 100000' /etc/security/limits.conf")
    sudo("sed -i '$ a\\root memlock unlimited' /etc/security/limits.conf")
    sudo("sed -i '$ a\\ec2-user soft nofile 50000' /etc/security/limits.conf")
    sudo("sed -i '$ a\\ec2-user hard nofile 100000' /etc/security/limits.conf")
    sudo("sed -i '$ a\\ec2-user memlock unlimited' /etc/security/limits.conf")

    # EFFECTIVE LOGIN TO LOAD CHANGES TO FILE HANDLES
    # sudo("sudo -i -u ec2-user")

    if not fabric_files.exists("/data1/logs"):
        sudo('mkdir /data1/logs')
        sudo('mkdir /data1/heapdump')

    # INCREASE NUMBER OF FILE HANDLES
    # sudo("sysctl -w fs.file-max=64000")

    # COPY CONFIG FILE TO ES DIR
    if not fabric_files.exists("/usr/local/elasticsearch/config/elasticsearch.yml"):
        yml = File("./examples/config/es_config.yml").read().replace("\r", "")
        yml = expand_template(yml, {
            "id": Random.hex(length=8),
            "data_paths": ",".join("/data" + unicode(i + 1) for i, _ in enumerate(volumes))
        })
        File("./results/temp/elasticsearch.yml").write(yml)
        put("./results/temp/elasticsearch.yml", '/usr/local/elasticsearch/config/elasticsearch.yml', use_sudo=True)

    # FOR SOME REASON THE export COMMAND DOES NOT SEEM TO WORK
    # THIS SCRIPT SETS THE ES_MIN_MEM/ES_MAX_MEM EXPLICITLY
    if not fabric_files.exists("/usr/local/elasticsearch/bin/elasticsearch.in.sh"):
        sh = File("./examples/config/es_run.sh").read().replace("\r", "")
        sh = expand_template(sh, {"memory": unicode(int(gigabytes / 2))})
        File("./results/temp/elasticsearch.in.sh").write(sh)
        with cd("/home/ec2-user"):
            put("./results/temp/elasticsearch.in.sh", './temp/elasticsearch.in.sh', use_sudo=True)
            sudo("cp -f ./temp/elasticsearch.in.sh /usr/local/elasticsearch/bin/elasticsearch.in.sh")
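# The config step above fills examples/config/es_config.yml with a random 8-character
# node id and a comma-separated list of data paths. A rough stdlib-only sketch of that
# substitution; string.Template's $-markup is used here only for illustration, while the
# real file uses expand_template's {{...}} markup:
import random
import string


def render_es_config(template_text, volume_count):
    node_id = "%08x" % random.getrandbits(32)  # 8 hex characters, like Random.hex(length=8)
    data_paths = ",".join("/data%d" % (i + 1) for i in range(volume_count))
    return string.Template(template_text).safe_substitute(id=node_id, data_paths=data_paths)


# Example: render_es_config("path.data: $data_paths\nnode.name: $id\n", volume_count=2)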
def _scan_database(self):
    # GET ALL RELATIONS
    raw_relations = self.db.query("""
        SELECT
            table_schema,
            table_name,
            referenced_table_schema,
            referenced_table_name,
            referenced_column_name,
            constraint_name,
            column_name,
            ordinal_position
        FROM
            information_schema.key_column_usage
        WHERE
            referenced_column_name IS NOT NULL
    """, param=self.settings.database)

    if not raw_relations:
        Log.error("No relations in the database")

    for r in self.settings.add_relations:
        try:
            a, b = map(strings.trim, r.split("->"))
            a = a.split(".")
            b = b.split(".")
            raw_relations.append(Data(
                table_schema=a[0],
                table_name=a[1],
                referenced_table_schema=b[0],
                referenced_table_name=b[1],
                referenced_column_name=b[2],
                constraint_name=Random.hex(20),
                column_name=a[2],
                ordinal_position=1
            ))
        except Exception as e:
            Log.error("Could not parse {{line|quote}}", line=r, cause=e)

    relations = jx.select(raw_relations, [
        {"name": "constraint.name", "value": "constraint_name"},
        {"name": "table.schema", "value": "table_schema"},
        {"name": "table.name", "value": "table_name"},
        {"name": "column.name", "value": "column_name"},
        {"name": "referenced.table.schema", "value": "referenced_table_schema"},
        {"name": "referenced.table.name", "value": "referenced_table_name"},
        {"name": "referenced.column.name", "value": "referenced_column_name"},
        {"name": "ordinal_position", "value": "ordinal_position"}
    ])

    # GET ALL TABLES
    raw_tables = self.db.query("""
        SELECT
            t.table_schema,
            t.table_name,
            c.constraint_name,
            c.constraint_type,
            k.column_name,
            k.ordinal_position
        FROM
            information_schema.tables t
        LEFT JOIN
            information_schema.table_constraints c on c.table_name=t.table_name AND c.table_schema=t.table_schema and (constraint_type='UNIQUE' or constraint_type='PRIMARY KEY')
        LEFT JOIN
            information_schema.key_column_usage k on k.constraint_name=c.constraint_name AND k.table_name=t.table_name and k.table_schema=t.table_schema
        ORDER BY
            t.table_schema,
            t.table_name,
            c.constraint_name,
            k.ordinal_position,
            k.column_name
    """, param=self.settings.database)

    # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING
    tables = UniqueIndex(keys=["name", "schema"])
    for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]):
        c = wrap(list(c))
        best_index = Null
        is_referenced = False
        is_primary = False
        for g, w in jx.groupby(c, "constraint_name"):
            if not g.constraint_name:
                continue
            w = list(w)
            ref = False
            for r in relations:
                if r.table.name == t.table_name and r.table.schema == t.table_schema and r.constraint.name == g.constraint_name:
                    ref = True
            is_prime = w[0].constraint_type == "PRIMARY"

            reasons_this_one_is_better = [
                best_index == None,  # WE DO NOT HAVE A CANDIDATE YET
                is_prime and not is_primary,  # PRIMARY KEYS ARE GOOD TO HAVE
                is_primary == is_prime and ref and not is_referenced,  # REFERENCED UNIQUE TUPLES ARE GOOD TOO
                is_primary == is_prime and ref == is_referenced and len(w) < len(best_index)  # THE SHORTER THE TUPLE, THE BETTER
            ]
            if any(reasons_this_one_is_better):
                is_primary = is_prime
                is_referenced = ref
                best_index = w

        tables.add({
            "name": t.table_name,
            "schema": t.table_schema,
            "id": [b.column_name for b in best_index]
        })

    fact_table = tables[self.settings.fact_table, self.settings.database.schema]
    ids_table = {
        "alias": "t0",
        "name": "__ids__",
        "schema": fact_table.schema,
        "id": fact_table.id
    }
    relations.extend(
        wrap({
            "constraint": {"name": "__link_ids_to_fact_table__"},
            "table": ids_table,
            "column": {"name": c},
            "referenced": {"table": fact_table, "column": {"name": c}},
            "ordinal_position": i
        })
        for i, c in enumerate(fact_table.id)
    )
    tables.add(ids_table)

    # GET ALL COLUMNS
    raw_columns = self.db.query("""
        SELECT
            column_name,
            table_schema,
            table_name,
            ordinal_position,
            data_type
        FROM
            information_schema.columns
    """, param=self.settings.database)

    reference_only_tables = [
        r.split(".")[0]
        for r in self.settings.reference_only
        if len(r.split(".")) == 2
    ]
    reference_all_tables = [
        r.split(".")[0]
        for r in self.settings.reference_only
        if len(r.split(".")) == 1
    ]
    foreign_column_table_schema_triples = {
        (r.column.name, r.table.name, r.table.schema)
        for r in relations
    }
    referenced_column_table_schema_triples = {
        (r.referenced.column.name, r.referenced.table.name, r.referenced.table.schema)
        for r in relations
    }
    related_column_table_schema_triples = foreign_column_table_schema_triples | referenced_column_table_schema_triples

    columns = UniqueIndex(["column.name", "table.name", "table.schema"])
    for c in raw_columns:
        if c.table_name in reference_only_tables:
            if c.table_name + "." + c.column_name in self.settings.reference_only:
                include = True
                reference = True
                foreign = False
            elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            else:
                include = False
                reference = False
                foreign = False
        elif c.table_name in reference_all_tables:
            # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED
            if c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = True
                foreign = False
            elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
                include = False
                reference = False
                foreign = True
            else:
                include = True
                reference = False
                foreign = False
        elif c.column_name in tables[(c.table_name, c.table_schema)].id:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
            include = False
            reference = False
            foreign = True
        elif (c.column_name, c.table_name, c.table_schema) in referenced_column_table_schema_triples:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        else:
            include = True
            reference = False
            foreign = False

        rel = {
            "column": {"name": c.column_name, "type": c.data_type},
            "table": {"name": c.table_name, "schema": c.table_schema},
            "ordinal_position": c.ordinal_position,
            "is_id": c.column_name in tables[(c.table_name, c.table_schema)].id,
            "include": include,  # TRUE IF THIS COLUMN IS OUTPUTTED
            "reference": reference,  # TRUE IF THIS COLUMN REPRESENTS THE ROW
            "foreign": foreign  # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW
        }
        columns.add(rel)

    # ITERATE OVER ALL PATHS
    todo = FlatList()
    output_columns = FlatList()
    nested_path_to_join = {}
    all_nested_paths = [["."]]

    def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
        if position.name in self.settings.exclude:
            return
        if DEBUG:
            Log.note("Trace {{path}}", path=path)
        if position.name != "__ids__":
            # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR IF IT FAILS)
            self.db.query("SELECT * FROM " + quote_column(position.name, position.schema) + " LIMIT 1")

        if position.name in reference_all_tables:
            no_nested_docs = True
        if position.name in reference_only_tables:
            return

        curr_join_list = copy(nested_path_to_join[nested_path[0]])

        # INNER OBJECTS
        referenced_tables = list(jx.groupby(
            jx.filter(
                relations,
                {"eq": {"table.name": position.name, "table.schema": position.schema}}
            ),
            "constraint.name"
        ))
        for g, constraint_columns in referenced_tables:
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude):
                continue

            done_relations.add(g["constraint.name"])

            many_to_one_joins = nested_path_to_join[nested_path[0]]
            index = len(many_to_one_joins)
            alias = "t" + text_type(index)
            for c in constraint_columns:
                c.referenced.table.alias = alias
                c.table = position
            many_to_one_joins.append({
                "join_columns": constraint_columns,
                "path": path,
                "nested_path": nested_path
            })

            # referenced_table_path = join_field(split_field(path) + ["/".join(constraint_columns.referenced.table.name)])
            # HANDLE THE COMMON *id SUFFIX
            name = []
            for a, b in zip(constraint_columns.column.name, constraint_columns.referenced.table.name):
                if a.startswith(b):
                    name.append(b)
                elif a.endswith("_id"):
                    name.append(a[:-3])
                else:
                    name.append(a)
            referenced_column_path = join_field(split_field(path) + ["/".join(name)])
            col_pointer_name = relative_field(referenced_column_path, nested_path[0])

            # insert into nested1 VALUES (100, 10, 'aaa', -1);
            # id.about.time.nested1 .ref=10
            # id.about.time.nested1 .ref.name
            for col in columns:
                if col.table.name == constraint_columns[0].referenced.table.name and col.table.schema == constraint_columns[0].referenced.table.schema:
                    col_full_name = concat_field(col_pointer_name, literal_field(col.column.name))

                    if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                        # ALWAYS SHOW THE ID OF THE FACT
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": True,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name
                        })
                    elif col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    elif col.reference:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                        })
                    elif col.include:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name
                        })

            if position.name in reference_only_tables:
                continue

            todo.append(Data(
                position=copy(constraint_columns[0].referenced.table),
                path=referenced_column_path,
                nested_path=nested_path,
                done_relations=copy(done_relations),
                no_nested_docs=no_nested_docs
            ))

        # NESTED OBJECTS
        if not no_nested_docs:
            for g, constraint_columns in jx.groupby(
                jx.filter(
                    relations,
                    {"eq": {"referenced.table.name": position.name, "referenced.table.schema": position.schema}}
                ),
                "constraint.name"
            ):
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                done_relations.add(g["constraint.name"])

                many_table = set(constraint_columns.table.name)
                if not (many_table - self.settings.exclude):
                    continue

                referenced_column_path = join_field(split_field(path) + ["/".join(many_table)])
                new_nested_path = [referenced_column_path] + nested_path
                all_nested_paths.append(new_nested_path)

                # if new_path not in self.settings.include:
                #     Log.note("Exclude nested path {{path}}", path=new_path)
                #     continue
                one_to_many_joins = nested_path_to_join[referenced_column_path] = copy(curr_join_list)
                index = len(one_to_many_joins)
                alias = "t" + text_type(index)
                for c in constraint_columns:
                    c.table.alias = alias
                    c.referenced.table = position
                one_to_many_joins.append(set_default({}, g, {
                    "children": True,
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path
                }))

                # insert into nested1 VALUES (100, 10, 'aaa', -1);
                # id.about.time.nested1 .ref=10
                # id.about.time.nested1 .ref.name
                for col in columns:
                    if col.table.name == constraint_columns[0].table.name and col.table.schema == constraint_columns[0].table.schema:
                        col_full_name = join_field(
                            split_field(referenced_column_path)[len(split_field(new_nested_path[0])):] + [literal_field(col.column.name)]
                        )

                        if col.column.name == constraint_columns[0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None
                            })
                        else:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if col.include else None
                            })

                todo.append(Data(
                    position=constraint_columns[0].table,
                    path=referenced_column_path,
                    nested_path=new_nested_path,
                    done_relations=copy(done_relations),
                    no_nested_docs=no_nested_docs
                ))

    path = "."
    nested_path = [path]
    nested_path_to_join["."] = [{
        "path": path,
        "join_columns": [{"referenced": {"table": ids_table}}],
        "nested_path": nested_path
    }]

    todo.append(Data(
        position=ids_table,
        path=path,
        nested_path=nested_path,
        done_relations=set(),
        no_nested_docs=False
    ))

    while todo:
        item = todo.pop(0)
        follow_paths(**item)

    self.all_nested_paths = all_nested_paths
    self.nested_path_to_join = nested_path_to_join
    self.columns = output_columns
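# _scan_database() keeps exactly one unique constraint per table to act as its id,
# preferring (1) a primary key, (2) a unique key that other tables reference, and
# (3) the key with the fewest columns. A condensed sketch of that preference order;
# the (is_primary, is_referenced, columns) candidate format is assumed for illustration:
def pick_best_key(candidates):
    best = None  # (is_primary, is_referenced, columns)
    for is_primary, is_referenced, columns in candidates:
        if best is None:
            best = (is_primary, is_referenced, columns)
            continue
        b_primary, b_referenced, b_columns = best
        better = (
            (is_primary and not b_primary)
            or (is_primary == b_primary and is_referenced and not b_referenced)
            or (is_primary == b_primary and is_referenced == b_referenced and len(columns) < len(b_columns))
        )
        if better:
            best = (is_primary, is_referenced, columns)
    return best[2] if best else None


# Example: pick_best_key([(False, True, ["guid"]), (True, False, ["id"])]) returns ["id"]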
def random_id():
    return Random.hex(40)
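# random_id() above returns 40 hex characters (160 random bits), enough to treat the
# result as a collision-resistant key. A stdlib equivalent, shown only for comparison:
import binascii
import os


def random_id_stdlib():
    return binascii.hexlify(os.urandom(20)).decode("ascii")  # also 40 hex characters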