def exists(self, path):
    """
    Report whether `path` can be retrieved through `self.conn`.

    `self.conn.get` is asked to place `path` at the location of a scratch
    `TempFile`; the answer is whatever that scratch file's `exists`
    attribute says afterwards.

    :param path: location handed, unchanged, to `self.conn.get`
    :return: the scratch file's `exists` value, or False when an IOError is
        raised while fetching or checking
    """
    with TempFile() as scratch:
        try:
            # ONLY THE SIDE EFFECT ON THE SCRATCH FILE MATTERS;
            # THE VALUE RETURNED BY get() IS NOT USED
            self.conn.get(path, scratch.abspath)
            return scratch.exists
        except IOError:
            return False
def test_expression_ff(self) -> None:
    """Run `ff` on a scratch file and check its content against "expression".

    `ff` must return a truthy value, the scratch file's content must be
    format-equal to the expected text from `read_data("expression")`, and
    that content must pass the equivalence and stability checks against the
    source text.
    """
    source, expected = read_data("expression")

    with TempFile() as scratch:
        changed = ff(scratch)
        self.assertTrue(changed)
        formatted = scratch.read()

    self.assertFormatEqual(expected, formatted)

    # NOTE(review): the patch target carries no module path; if `patch` is
    # unittest.mock.patch it needs a dotted "module.attr" target -- confirm.
    with patch("dump_to_file", dump_to_stderr):
        assert_equivalent(source, formatted)
        assert_stable(source, formatted, FileMode())
def setup_flask_ssl(flask_app, flask_config):
    """
    Start an additional, SSL-enabled listener for `flask_app` on its own thread.

    Nothing happens when `flask_config.ssl_context` is falsy.  Otherwise a
    copy of `flask_config` is made with `debug` off and `port` set to 443,
    and `flask_app.run` is called with that copy (expanded to keyword
    arguments) from a thread named "SSL Server".
    `flask_config.ssl_context` is set to None before returning.

    :param flask_app: application object; only its `run` method is used here
    :param flask_config: settings of the plain listener; read for
        `ssl_context` and `port`, and its `ssl_context` is cleared
    :return: None
    """
    if not flask_config.ssl_context:
        return

    ssl_flask = flask_config.copy()
    ssl_flask.debug = False
    ssl_flask.port = 443

    if is_data(flask_config.ssl_context):
        # THE CONTEXT IS A SET OF PEM ENCODED FILE NAMES.
        # `load_cert_chain` WANTS ONE FILE WITH ALL CERTS CONCATENATED,
        # SO THE CERTIFICATE AND (OPTIONAL) CHAIN ARE GLUED TOGETHER FIRST
        with TempFile() as pem_bundle:
            try:
                ssl_settings = ssl_flask.ssl_context

                certificate = File(ssl_settings.certificate_file).read_bytes()
                pem_bundle.write(certificate)
                if ssl_settings.certificate_chain_file:
                    chain = File(ssl_settings.certificate_chain_file).read_bytes()
                    pem_bundle.write(chain)
                pem_bundle.flush()
                pem_bundle.close()

                context = SSLContext(PROTOCOL_SSLv23)
                context.load_cert_chain(
                    pem_bundle.name,
                    keyfile=File(ssl_settings.privatekey_file).abspath,
                )

                # FROM HERE ON THE COPY CARRIES A REAL SSLContext
                ssl_flask.ssl_context = context
            except Exception as cause:
                Log.error("Could not handle ssl context construction", cause=cause)

    def serve_ssl(please_stop):
        # THREAD TARGET: ANNOUNCE THE PORT, THEN RUN THE APP WITH THE SSL SETTINGS
        Log.warning(
            "ActiveData listening on encrypted port {{port}}", port=ssl_flask.port
        )
        flask_app.run(**ssl_flask)

    Thread.run("SSL Server", serve_ssl)

    if flask_config.ssl_context and flask_config.port != 80:
        Log.warning(
            "ActiveData has SSL context, but is still listening on non-encrypted http port {{port}}",
            port=flask_config.port,
        )

    # THE PLAIN LISTENER MUST NOT SEE THE SSL SETTINGS
    flask_config.ssl_context = None
def _restart_etl_supervisor(conn, please_stop, cpu_count):
    """
    Push a supervisor configuration sized for this machine, then load it.

    The local template ./examples/config/etl_supervisor.conf has its
    `numprocs=` value replaced by `cpu_count`; the result is uploaded to
    /etc/supervisord.conf, after which supervisord is started and
    supervisorctl is told to reread and update.

    :param conn: remote connection; its `sudo`, `put` and `run` are used
    :param please_stop: not used by this function
    :param cpu_count: value written as the `numprocs=` setting
    """
    template = File("./examples/config/etl_supervisor.conf").read_bytes()

    # SWAP WHATEVER numprocs VALUE THE TEMPLATE CARRIES FOR THIS MACHINE'S COUNT
    old_value = between(template, "numprocs=", "\n")
    old_line = "numprocs=" + old_value + "\n"
    new_line = "numprocs=" + str(cpu_count) + "\n"
    adjusted = template.replace(old_line, new_line)

    with TempFile() as staged:
        staged.write(adjusted)
        conn.sudo("rm -f /etc/supervisor/conf.d/etl_supervisor.conf")
        conn.put(staged.abspath, "/etc/supervisord.conf", use_sudo=True)

    conn.run("mkdir -p /home/ec2-user/logs")

    # START THE DAEMON; warn=True PRESUMABLY TOLERATES THE FAILURE SEEN WHEN
    # IT IS ALREADY RUNNING -- CONFIRM AGAINST THE CONNECTION CLASS
    conn.sudo("supervisord -c /etc/supervisord.conf", warn=True)
    conn.sudo("supervisorctl reread")
    conn.sudo("supervisorctl update")
def extractor( guid, num_partitions, esq, query, selects, query_path, schema, chunk_size, cardinality, abs_limit, formatter, please_stop, ): total = 0 # WE MESS WITH THE QUERY LIMITS FOR CHUNKING query.limit = first(query.groupby).domain.limit = chunk_size * 2 start_time = Date.now() try: write_status( guid, { "status": "starting", "chunks": num_partitions, "rows": min(abs_limit, cardinality), "start_time": start_time, "timestamp": Date.now(), }, ) with TempFile() as temp_file: with open(temp_file.abspath, "wb") as output: for i in range(0, num_partitions): if please_stop: Log.error("request to shutdown!") is_last = i == num_partitions - 1 first(query.groupby).allowNulls = is_last acc, decoders, es_query = aggop_to_es_queries( selects, query_path, schema, query) # REACH INTO THE QUERY TO SET THE partitions terms = es_query.aggs._filter.aggs._match.terms terms.include.partition = i terms.include.num_partitions = num_partitions result = esq.es.search(deepcopy(es_query), query.limit) aggs = unwrap(result.aggregations) formatter.add(aggs, acc, query, decoders, selects) for b in formatter.bytes(): if b is DONE: break output.write(b) else: write_status( guid, { "status": "working", "chunk": i, "chunks": num_partitions, "row": total, "rows": min(abs_limit, cardinality), "start_time": start_time, "timestamp": Date.now(), }, ) continue break for b in formatter.footer(): output.write(b) upload(guid + ".json", temp_file) write_status( guid, { "ok": True, "status": "done", "chunks": num_partitions, "rows": min(abs_limit, cardinality), "start_time": start_time, "end_time": Date.now(), "timestamp": Date.now(), }, ) except Exception as e: e = Except.wrap(e) write_status( guid, { "ok": False, "status": "error", "error": e, "start_time": start_time, "end_time": Date.now(), "timestamp": Date.now(), }, ) Log.warning("Could not extract", cause=e)
def extractor(guid, abs_limit, esq, es_query, formatter, please_stop):
    """
    Download up to `abs_limit` hits of a scrolling search into a temporary
    file and upload that file as `<guid>.json`.

    Progress is reported through `write_status(guid, ...)`: "starting",
    "working" after every fully written page, "uploading to s3", and "done".
    Any exception raised inside the `try` is caught, reported as an "error"
    status and logged as a warning; it is not re-raised.

    :param guid: identifies the job; used for status records and the upload name
    :param abs_limit: maximum number of hits to write
    :param esq: provides `es.search(...)` and `es.scroll(...)`
    :param es_query: the query handed to `esq.es.search` with scroll="5m"
    :param formatter: provides `add(hits)`, `bytes()` and `footer()`
    :param please_stop: truthy when a shutdown has been requested
    """
    start_time = Date.now()
    total = 0  # NUMBER OF HITS WRITTEN SO FAR
    write_status(
        guid,
        {
            "status": "starting",
            "limit": abs_limit,
            "start_time": start_time,
            "timestamp": Date.now(),
        },
    )

    try:
        with TempFile() as temp_file:
            with open(temp_file.abspath, "wb") as output:
                result = esq.es.search(es_query, scroll="5m")

                while not please_stop:
                    # REMEMBER THE SCROLL ID OF THIS PAGE; IT IS USED TO ASK
                    # FOR THE NEXT PAGE ONCE THIS ONE IS WRITTEN
                    scroll_id = result._scroll_id
                    hits = result.hits.hits
                    # TRIM THE PAGE SO THE RUNNING TOTAL NEVER EXCEEDS abs_limit
                    chunk_limit = abs_limit - total
                    hits = hits[:chunk_limit]
                    if len(hits) == 0:
                        # NO MORE HITS, OR THE LIMIT HAS BEEN REACHED
                        break
                    formatter.add(hits)
                    for b in formatter.bytes():
                        if b is DONE:
                            # FORMATTER SIGNALS THE END: LEAVE THE INNER LOOP
                            # WITHOUT RUNNING ITS else CLAUSE
                            break
                        output.write(b)
                    else:
                        # PAGE FULLY WRITTEN: COUNT IT, REPORT, FETCH THE NEXT
                        total += len(hits)
                        DEBUG and Log.note(
                            "{{num}} of {{total}} downloaded",
                            num=total,
                            total=result.hits.total,
                        )
                        write_status(
                            guid,
                            {
                                "status": "working",
                                "row": total,
                                "rows": result.hits.total,
                                "start_time": start_time,
                                "timestamp": Date.now(),
                            },
                        )
                        with Timer("get more", verbose=DEBUG):
                            result = esq.es.scroll(scroll_id)
                        continue
                    # REACHED ONLY WHEN DONE WAS SEEN: STOP SCROLLING
                    break
                if please_stop:
                    # PRESUMABLY Log.error RAISES, WHICH LANDS IN THE
                    # except BLOCK BELOW -- CONFIRM
                    Log.error("Bulk download stopped for shutdown")

                for b in formatter.footer():
                    output.write(b)

            # THE OUTPUT HANDLE IS CLOSED; THE TEMP FILE STILL EXISTS HERE
            write_status(
                guid,
                {
                    "status": "uploading to s3",
                    "rows": total,
                    "start_time": start_time,
                    "timestamp": Date.now(),
                },
            )
            upload(guid + ".json", temp_file)
        if please_stop:
            Log.error("shutdown requested, did not complete download")

        DEBUG and Log.note("Done. {{total}} uploaded", total=total)
        write_status(
            guid,
            {
                "ok": True,
                "status": "done",
                "rows": total,
                "start_time": start_time,
                "end_time": Date.now(),
                "timestamp": Date.now(),
            },
        )
    except Exception as e:
        # FAILURES ARE RECORDED IN THE STATUS AND LOGGED, NOT PROPAGATED
        e = Except.wrap(e)
        write_status(
            guid,
            {
                "ok": False,
                "status": "error",
                "error": e,
                "start_time": start_time,
                "end_time": Date.now(),
                "timestamp": Date.now(),
            },
        )
        Log.warning("Could not extract", cause=e)
def extract(self, db, start_point, first_value, data, please_stop):
    """
    Pull the fact-table rows whose id is in `data`, turn them into JSON
    documents in a temporary file, and write that file to the destination.

    When `self.settings.destination` is text, the documents are written as
    one pretty-printed JSON list to that local file and the method returns
    False right away (no notification, no progress marker).  Otherwise the
    lines go to a key in `self.bucket`, a notification is added to
    `self.notify`, and `[start_point, first_value]` is recorded in the file
    named by `self.settings.extract.last`.

    :param db: provides `quote_value` and `query(sql, stream=..., row_tuples=...)`
    :param start_point: sequence of batch ids; nests the etl "source" chain
        and, joined with ".", names the destination key
    :param first_value: logged, and recorded in the progress marker
    :param data: the id values to select
    :param please_stop: handed through to `self.construct_docs`
    :return: False for a text (local file) destination, otherwise None
    """
    fact_table = self.settings.snowflake.fact_table
    Log.note(
        "Starting scan of {{table}} at {{id}} and sending to batch {{start_point}}",
        table=fact_table,
        id=first_value,
        start_point=start_point,
    )

    # SELECT <id> FROM <fact table> WHERE <id> in (<quoted values>)
    id_column = quote_column(self._extract.field.last())
    id_query = SQL_SELECT + id_column + SQL_FROM + fact_table + SQL_WHERE + id_column + " in "
    id_query = id_query + sql_iso(sql_list(map(db.quote_value, data)))
    sql = self.schema.get_sql(id_query)

    with Timer("Sending SQL"):
        cursor = db.query(sql, stream=True, row_tuples=True)

    extract_settings = self.settings.extract

    with TempFile() as temp_file:
        # EACH ELEMENT OF start_point WRAPS THE PREVIOUS ONE AS ITS "source"
        parent_etl = None
        for batch_id in start_point:
            parent_etl = {"id": batch_id, "source": parent_etl}
        parent_etl["revision"] = get_git_revision()
        parent_etl["machine"] = machine_metadata

        def append_doc(value, i):
            """
            Serialize one document, with its etl header, into the temp file.

            :param value: the document to add
            :param i: id recorded in the document's etl header
            """
            record = {
                fact_table: elasticsearch.scrub(value),
                "etl": {
                    "id": i,
                    "source": parent_etl,
                    "timestamp": Date.now(),
                },
            }
            temp_file.append(convert.value2json(record))

        with Timer("assemble data"):
            self.construct_docs(cursor, append_doc, please_stop)

        # WRITE TO DESTINATION
        s3_file_name = ".".join(map(text_type, start_point))
        with Timer("write to destination {{filename}}", param={"filename": s3_file_name}):
            if isinstance(self.settings.destination, text_type):
                # LOCAL FILE: ONE PRETTY JSON LIST, THEN STOP HERE
                local_file = File(self.settings.destination)
                docs = [convert.json2value(line) for line in temp_file]
                local_file.write(convert.value2json(docs, pretty=True))
                return False

            bucket_key = self.bucket.get_key(s3_file_name, must_exist=False)
            bucket_key.write_lines(temp_file)

    # NOTIFY
    now = Date.now()
    self.notify.add({
        "bucket": self.settings.destination.bucket,
        "key": s3_file_name,
        "timestamp": now.unix,
        "date/time": now.format(),
    })

    # SUCCESS!! REMEMBER HOW FAR WE GOT
    progress = convert.value2json([start_point, first_value])
    File(extract_settings.last).write(progress)
def _install_es(gigabytes, es_version="6.5.4", instance=None, conn=None):
    """
    Install and configure Elasticsearch on a remote machine through `conn`.

    In order: install the JRE and the Elasticsearch tarball (skipped when
    elasticsearch.yml is already present on the remote), partition/format/
    mount every drive whose path does not exist yet, raise the kernel and
    per-user limits, and finally upload the log4j2, jvm.options and
    elasticsearch.yml configuration (again only when elasticsearch.yml is
    absent).

    :param gigabytes: half of this value, truncated to int, is given to the
        jvm.options template as `memory`
    :param es_version: version string used to build the tarball and
        directory names
    :param instance: read for `markup.drives` (each drive has `device` and
        `path`) and `ip_address`; it is dereferenced immediately, so the
        None default is not usable
    :param conn: remote connection; `exists`, `cd`, `run`, `sudo`, `put` and
        `get` are used; it is used unconditionally, so the None default is
        not usable
    """
    es_file = 'elasticsearch-' + es_version + '.tar.gz'
    volumes = instance.markup.drives

    # A PRESENT elasticsearch.yml IS TAKEN TO MEAN "ALREADY INSTALLED"
    if not conn.exists("/usr/local/elasticsearch/config/elasticsearch.yml"):
        with conn.cd("/home/ec2-user/"):
            conn.run("mkdir -p temp")

        # THE JRE PACKAGE MUST BE AVAILABLE LOCALLY SO IT CAN BE UPLOADED
        if not (RESOURCES / JRE).exists:
            Log.error("Expecting {{file}} on manager to spread to ES instances", file=(RESOURCES / JRE))
        response = conn.run("java -version", warn=True)
        if "Java(TM) SE Runtime Environment" not in response:
            with conn.cd("/home/ec2-user/temp"):
                conn.run('rm -f '+JRE)
                conn.put((RESOURCES / JRE), JRE)
                conn.sudo("rpm -i "+JRE)
                conn.sudo("alternatives --install /usr/bin/java java /usr/java/default/bin/java 20000")
                # NOTE(review): WHETHER THIS export OUTLIVES THIS SINGLE
                # COMMAND DEPENDS ON HOW conn.run SPAWNS SHELLS -- CONFIRM
                conn.run("export JAVA_HOME=/usr/java/default")

        with conn.cd("/home/ec2-user/"):
            conn.put(RESOURCES / es_file, es_file)
            conn.run('tar zxfv ' + es_file)
            conn.sudo("rm -fr /usr/local/elasticsearch", warn=True)
            conn.sudo('mkdir /usr/local/elasticsearch')
            conn.sudo('cp -R elasticsearch-'+es_version+'/* /usr/local/elasticsearch/')

        with conn.cd('/usr/local/elasticsearch/'):
            # BE SURE TO MATCH THE PLUGIN WITH ES VERSION
            # https://github.com/elasticsearch/elasticsearch-cloud-aws
            conn.sudo('sudo bin/elasticsearch-plugin install -b discovery-ec2')

        # REMOVE THESE FILES, WE WILL REPLACE THEM WITH THE CORRECT VERSIONS AT THE END
        conn.sudo("rm -f /usr/local/elasticsearch/config/elasticsearch.yml")
        conn.sudo("rm -f /usr/local/elasticsearch/config/jvm.options")
        conn.sudo("rm -f /usr/local/elasticsearch/config/log4j2.properties")

    # MOUNT AND FORMAT THE VOLUMES (list with `lsblk`)
    # (THE INDEX `i` IS NOT USED INSIDE THIS LOOP)
    for i, k in enumerate(volumes):
        # AN EXISTING MOUNT PATH IS TAKEN TO MEAN THE DRIVE IS ALREADY SET UP
        if not conn.exists(k.path):
            # ENSURE DEVICE IS NOT MOUNTED
            conn.sudo('sudo umount '+k.device, warn=True)

            # (RE)PARTITION THE LOCAL DEVICE, AND FORMAT
            conn.sudo("parted " + k.device + " --script \"mklabel gpt mkpart primary ext4 2048s 100%\"")
            conn.sudo('yes | sudo mkfs -t ext4 '+k.device)

            # ES AND JOURNALLING DO NOT MIX
            conn.sudo('tune2fs -o journal_data_writeback '+k.device)
            conn.sudo('tune2fs -O ^has_journal '+k.device)

            # MOUNT IT
            conn.sudo('mkdir '+k.path)
            conn.sudo('sudo mount '+k.device+' '+k.path)
            conn.sudo('chown -R ec2-user:ec2-user '+k.path)

            # ADD TO /etc/fstab SO AROUND AFTER REBOOT
            conn.sudo("sed -i '$ a\\"+k.device+" "+k.path+" ext4 defaults,nofail 0 2' /etc/fstab")

    # TEST IT IS WORKING
    conn.sudo('mount -a')

    # INCREASE THE FILE HANDLE LIMITS
    with conn.cd("/home/ec2-user/"):
        with TempFile() as temp:
            # DOWNLOAD, EDIT LOCALLY, UPLOAD AGAIN
            conn.get("/etc/sysctl.conf", temp, use_sudo=True)
            lines = temp.read()
            if lines.find("fs.file-max = 100000") == -1:
                lines += "\nfs.file-max = 100000"
            # DROP THE net.bridge SETTINGS
            lines = lines.replace("net.bridge.bridge-nf-call-ip6tables = 0", "")
            lines = lines.replace("net.bridge.bridge-nf-call-iptables = 0", "")
            lines = lines.replace("net.bridge.bridge-nf-call-arptables = 0", "")
            temp.write(lines)
            conn.put(temp, "/etc/sysctl.conf", use_sudo=True)

    # NOTE(review): UNLIKE fs.file-max ABOVE, THIS LINE IS APPENDED WITHOUT
    # CHECKING WHETHER IT IS ALREADY PRESENT
    conn.sudo("sudo sed -i '$ a\\vm.max_map_count = 262144' /etc/sysctl.conf")

    # APPLY THE sysctl SETTINGS
    conn.sudo("sysctl -p")

    # INCREASE FILE HANDLE PERMISSIONS
    # NOTE(review): THESE LINES ARE APPENDED ON EVERY CALL, WITHOUT A
    # PRESENCE CHECK
    conn.sudo("sed -i '$ a\\root soft nofile 100000' /etc/security/limits.conf")
    conn.sudo("sed -i '$ a\\root hard nofile 100000' /etc/security/limits.conf")
    conn.sudo("sed -i '$ a\\root soft memlock unlimited' /etc/security/limits.conf")
    conn.sudo("sed -i '$ a\\root hard memlock unlimited' /etc/security/limits.conf")

    conn.sudo("sed -i '$ a\\ec2-user soft nofile 100000' /etc/security/limits.conf")
    conn.sudo("sed -i '$ a\\ec2-user hard nofile 100000' /etc/security/limits.conf")
    conn.sudo("sed -i '$ a\\ec2-user soft memlock unlimited' /etc/security/limits.conf")
    conn.sudo("sed -i '$ a\\ec2-user hard memlock unlimited' /etc/security/limits.conf")

    # LOG AND HEAP DUMP DIRECTORIES LIVE ON /data1
    if not conn.exists("/data1/logs"):
        conn.run('mkdir /data1/logs')
        conn.run('mkdir /data1/heapdump')

    # COPY CONFIG FILES TO ES DIR
    if not conn.exists("/usr/local/elasticsearch/config/elasticsearch.yml"):
        conn.put("./examples/config/es6_log4j2.properties", '/usr/local/elasticsearch/config/log4j2.properties', use_sudo=True)

        # CARRIAGE RETURNS ARE STRIPPED FROM THE LOCAL TEMPLATES BEFORE UPLOAD
        jvm = File("./examples/config/es6_jvm.options").read().replace('\r', '')
        jvm = expand_template(jvm, {"memory": int(gigabytes/2)})
        with TempFile() as temp:
            temp.write(jvm)
            conn.put(temp, '/usr/local/elasticsearch/config/jvm.options', use_sudo=True)

        yml = File("./examples/config/es6_config.yml").read().replace("\r", "")
        # data_paths IS "/data1,/data2,..." -- ONE ENTRY PER VOLUME, BY POSITION
        # NOTE(review): ASSUMES THE DRIVES' `path` VALUES FOLLOW THE SAME
        # /dataN NAMING -- CONFIRM
        yml = expand_template(yml, {
            "id": instance.ip_address,
            "data_paths": ",".join("/data" + text(i + 1) for i, _ in enumerate(volumes))
        })
        with TempFile() as temp:
            temp.write(yml)
            conn.put(temp, '/usr/local/elasticsearch/config/elasticsearch.yml', use_sudo=True)

    # THE FILES ABOVE WERE PLACED WITH SUDO; HAND THE TREE BACK TO ec2-user
    conn.sudo("chown -R ec2-user:ec2-user /usr/local/elasticsearch")
def test_empty_ff(self) -> None:
    """Run `ff` on a fresh scratch file and expect no change.

    `ff` must return a falsy value, and the scratch file's content must be
    format-equal to the empty string afterwards.
    """
    with TempFile() as scratch:
        changed = ff(scratch)
        self.assertFalse(changed)
        content = scratch.read()

    self.assertFormatEqual("", content)