def exists(self, path):
     """
     Report whether `path` can be fetched through `self.conn`.

     The file is requested with `self.conn.get` into a throw-away TempFile;
     the answer is whatever that TempFile reports through its `exists`
     attribute once the call has returned.

     :param path: location handed to `self.conn.get`
     :return: `exists` of the temp file after the download attempt, or False
              when the attempt raises IOError
     """
     with TempFile() as t:
         try:
             # Only the effect on the temp file (or the IOError) matters here;
             # the value returned by get() is not needed.
             self.conn.get(path, t.abspath)
             return t.exists
         except IOError:
             return False
 def test_expression_ff(self) -> None:
     """
     Run `ff` on a TempFile and compare what the file holds afterwards with
     the expected text of the "expression" data set, then check the result
     against the source with `assert_equivalent` and `assert_stable`.
     """
     # NOTE(review): `source` is only used by the equivalence/stability checks
     # below; nothing visible here writes it into the temp file before `ff`
     # runs -- confirm that is intended.
     source, expected = read_data("expression")
     with TempFile() as scratch:
         changed = ff(scratch)
         self.assertTrue(changed)
         formatted = scratch.read()
         self.assertFormatEqual(expected, formatted)
         with patch("dump_to_file", dump_to_stderr):
             assert_equivalent(source, formatted)
             assert_stable(source, formatted, FileMode())
Beispiel #3
0
def setup_flask_ssl(flask_app, flask_config):
    """
    RUN A SECOND COPY OF THE FLASK APP ON PORT 443, IN ITS OWN THREAD,
    WHEN flask_config CARRIES AN ssl_context

    DOES NOTHING WHEN flask_config.ssl_context IS FALSY.  OTHERWISE
    flask_config.ssl_context IS SET TO None BEFORE RETURNING

    :param flask_app: APPLICATION WHOSE run() IS CALLED WITH THE SSL SETTINGS
    :param flask_config: SETTINGS; A COPY (debug=False, port=443) IS USED FOR SSL
    :return: None
    """
    if not flask_config.ssl_context:
        return

    # WORK ON A COPY, THE CALLER'S SETTINGS KEEP THEIR OWN PORT AND DEBUG FLAG
    secure_config = flask_config.copy()
    secure_config.debug = False
    secure_config.port = 443

    if is_data(flask_config.ssl_context):
        # ssl_context HOLDS PEM ENCODED FILE NAMES
        # `load_cert_chain` WANTS THE CERTIFICATE AND ITS CHAIN IN ONE FILE
        with TempFile() as pem_bundle:
            try:
                file_names = secure_config.ssl_context
                pem_bundle.write(File(file_names.certificate_file).read_bytes())
                chain_file = file_names.certificate_chain_file
                if chain_file:
                    pem_bundle.write(File(chain_file).read_bytes())
                pem_bundle.flush()
                pem_bundle.close()

                context = SSLContext(PROTOCOL_SSLv23)
                bundle_path = pem_bundle.name
                key_path = File(file_names.privatekey_file).abspath
                context.load_cert_chain(bundle_path, keyfile=key_path)

                # FROM HERE ON THE COPY CARRIES THE REAL SSLContext
                secure_config.ssl_context = context
            except Exception as cause:
                Log.error("Could not handle ssl context construction", cause=cause)

    def runner(please_stop):
        Log.warning(
            "ActiveData listening on encrypted port {{port}}",
            port=secure_config.port,
        )
        flask_app.run(**secure_config)

    Thread.run("SSL Server", runner)

    warn_about_port = flask_config.ssl_context and flask_config.port != 80
    if warn_about_port:
        Log.warning(
            "ActiveData has SSL context, but is still listening on non-encrypted http port {{port}}",
            port=flask_config.port,
        )

    flask_config.ssl_context = None
def _restart_etl_supervisor(conn, please_stop, cpu_count):
    """
    PUSH THE ETL SUPERVISOR CONFIG, WITH numprocs SET TO cpu_count, TO THE
    MACHINE BEHIND conn, THEN START supervisord AND HAVE IT RELOAD

    :param conn: CONNECTION USED FOR THE run/sudo/put CALLS
    :param please_stop: NOT USED IN THIS FUNCTION
    :param cpu_count: VALUE WRITTEN AFTER "numprocs=" IN THE PUSHED CONFIG
    """
    # READ LOCAL CONFIG FILE AND SWAP THE numprocs LINE FOR THIS MACHINE
    template = File("./examples/config/etl_supervisor.conf").read_bytes()
    old_setting = "numprocs=" + between(template, "numprocs=", "\n") + "\n"
    new_setting = "numprocs=" + str(cpu_count) + "\n"
    content = template.replace(old_setting, new_setting)

    # STAGE LOCALLY, THEN PUSH TO REMOTE
    with TempFile() as staged:
        staged.write(content)
        conn.sudo("rm -f /etc/supervisor/conf.d/etl_supervisor.conf")
        conn.put(staged.abspath, "/etc/supervisord.conf", use_sudo=True)
    conn.run("mkdir -p /home/ec2-user/logs")

    # START DAEMON (warn=True: A FAILURE HERE IS TOLERATED, e.g. ALREADY RUNNING)
    conn.sudo("supervisord -c /etc/supervisord.conf", warn=True)
    for command in ("supervisorctl reread", "supervisorctl update"):
        conn.sudo(command)
def extractor(
    guid,
    num_partitions,
    esq,
    query,
    selects,
    query_path,
    schema,
    chunk_size,
    cardinality,
    abs_limit,
    formatter,
    please_stop,
):
    """
    RUN A GROUPBY QUERY IN num_partitions CHUNKS, WRITE THE FORMATTED OUTPUT
    TO A TEMP FILE, AND upload() IT AS guid + ".json"

    STATUS IS REPORTED THROUGH write_status(guid, ...): "starting", THEN
    "working" AFTER EACH CHUNK, THEN "done" OR "error".  EXCEPTIONS ARE
    CAUGHT, REPORTED AND LOGGED AS A WARNING; THEY ARE NOT RE-RAISED

    :param guid: IDENTIFIES THIS EXTRACTION IN write_status() AND NAMES THE UPLOAD
    :param num_partitions: NUMBER OF CHUNKS (ONE es.search CALL EACH)
    :param esq: ITS es.search IS CALLED FOR EVERY CHUNK
    :param query: THE QUERY; ITS LIMITS AND allowNulls ARE MODIFIED IN PLACE
    :param selects: PASSED TO aggop_to_es_queries() AND formatter.add()
    :param query_path: PASSED TO aggop_to_es_queries()
    :param schema: PASSED TO aggop_to_es_queries()
    :param chunk_size: BOTH LIMITS ARE SET TO TWICE THIS VALUE
    :param cardinality: min(abs_limit, cardinality) IS THE "rows" VALUE IN STATUS REPORTS
    :param abs_limit: SEE cardinality
    :param formatter: RECEIVES EACH CHUNK (add) AND EMITS THE OUTPUT (bytes, footer)
    :param please_stop: WHEN TRUTHY AT THE TOP OF A CHUNK, Log.error() IS CALLED
    :return: None
    """
    # NOTE(review): total IS NEVER UPDATED IN THIS FUNCTION, SO THE "row"
    # VALUE IN THE "working" STATUS IS ALWAYS 0 - CONFIRM THAT IS INTENDED
    total = 0
    # WE MESS WITH THE QUERY LIMITS FOR CHUNKING
    query.limit = first(query.groupby).domain.limit = chunk_size * 2
    start_time = Date.now()

    try:
        write_status(
            guid,
            {
                "status": "starting",
                "chunks": num_partitions,
                "rows": min(abs_limit, cardinality),
                "start_time": start_time,
                "timestamp": Date.now(),
            },
        )

        with TempFile() as temp_file:
            with open(temp_file.abspath, "wb") as output:
                for i in range(0, num_partitions):
                    if please_stop:
                        # PRESUMABLY Log.error RAISES, WHICH LANDS IN THE
                        # except BLOCK BELOW - TODO CONFIRM
                        Log.error("request to shutdown!")
                    # allowNulls IS SWITCHED ON FOR THE LAST PARTITION ONLY
                    is_last = i == num_partitions - 1
                    first(query.groupby).allowNulls = is_last
                    acc, decoders, es_query = aggop_to_es_queries(
                        selects, query_path, schema, query)
                    # REACH INTO THE QUERY TO SET THE partitions
                    terms = es_query.aggs._filter.aggs._match.terms
                    terms.include.partition = i
                    terms.include.num_partitions = num_partitions

                    # A COPY OF es_query IS SENT, NOT es_query ITSELF
                    result = esq.es.search(deepcopy(es_query), query.limit)
                    aggs = unwrap(result.aggregations)

                    formatter.add(aggs, acc, query, decoders, selects)
                    for b in formatter.bytes():
                        # DONE IS A SENTINEL: STOP WRITING AND SKIP THE else
                        if b is DONE:
                            break
                        output.write(b)
                    else:
                        # NO DONE SEEN FOR THIS CHUNK: REPORT PROGRESS AND
                        # MOVE ON TO THE NEXT PARTITION
                        write_status(
                            guid,
                            {
                                "status": "working",
                                "chunk": i,
                                "chunks": num_partitions,
                                "row": total,
                                "rows": min(abs_limit, cardinality),
                                "start_time": start_time,
                                "timestamp": Date.now(),
                            },
                        )
                        continue
                    # ONLY REACHED WHEN THE FORMATTER EMITTED DONE:
                    # NO FURTHER PARTITIONS ARE REQUESTED
                    break
                # THE FOOTER IS WRITTEN IN BOTH CASES (ALL CHUNKS READ, OR DONE)
                for b in formatter.footer():
                    output.write(b)

            # output IS CLOSED HERE; THE TEMP FILE STILL EXISTS FOR THE UPLOAD
            upload(guid + ".json", temp_file)
        write_status(
            guid,
            {
                "ok": True,
                "status": "done",
                "chunks": num_partitions,
                "rows": min(abs_limit, cardinality),
                "start_time": start_time,
                "end_time": Date.now(),
                "timestamp": Date.now(),
            },
        )
    except Exception as e:
        # ANY FAILURE ABOVE (INCLUDING THE "starting" STATUS WRITE) IS
        # REPORTED AS AN "error" STATUS AND LOGGED; NOTHING IS RE-RAISED HERE
        e = Except.wrap(e)
        write_status(
            guid,
            {
                "ok": False,
                "status": "error",
                "error": e,
                "start_time": start_time,
                "end_time": Date.now(),
                "timestamp": Date.now(),
            },
        )
        Log.warning("Could not extract", cause=e)
Beispiel #6
0
def extractor(guid, abs_limit, esq, es_query, formatter, please_stop):
    """
    DOWNLOAD UP TO abs_limit HITS OF es_query USING THE SCROLL API, WRITE THE
    FORMATTED OUTPUT TO A TEMP FILE, AND upload() IT AS guid + ".json"

    STATUS IS REPORTED THROUGH write_status(guid, ...): "starting", "working"
    AFTER EACH PAGE, "uploading to s3", THEN "done" OR "error".  EXCEPTIONS
    RAISED INSIDE THE try ARE REPORTED AND LOGGED AS A WARNING, NOT RE-RAISED

    :param guid: IDENTIFIES THIS DOWNLOAD IN write_status() AND NAMES THE UPLOAD
    :param abs_limit: MAXIMUM NUMBER OF HITS TO WRITE
    :param esq: ITS es.search AND es.scroll ARE USED TO FETCH PAGES
    :param es_query: SENT AS-IS TO es.search WITH scroll="5m"
    :param formatter: RECEIVES EACH PAGE OF HITS (add) AND EMITS OUTPUT (bytes, footer)
    :param please_stop: WHEN TRUTHY, THE PAGE LOOP ENDS AND Log.error() IS CALLED
    :return: None
    """
    start_time = Date.now()
    total = 0  # NUMBER OF HITS HANDED TO THE FORMATTER SO FAR
    # THIS FIRST STATUS WRITE IS OUTSIDE THE try: A FAILURE HERE PROPAGATES
    write_status(
        guid,
        {
            "status": "starting",
            "limit": abs_limit,
            "start_time": start_time,
            "timestamp": Date.now(),
        },
    )

    try:
        with TempFile() as temp_file:
            with open(temp_file.abspath, "wb") as output:
                result = esq.es.search(es_query, scroll="5m")

                while not please_stop:
                    scroll_id = result._scroll_id
                    hits = result.hits.hits
                    # NEVER TAKE MORE THAN WHAT IS LEFT OF abs_limit;
                    # ONCE total REACHES abs_limit THE SLICE IS EMPTY AND WE STOP
                    chunk_limit = abs_limit - total
                    hits = hits[:chunk_limit]
                    if len(hits) == 0:
                        break
                    formatter.add(hits)
                    for b in formatter.bytes():
                        # DONE IS A SENTINEL: STOP WRITING AND SKIP THE else
                        if b is DONE:
                            break
                        output.write(b)
                    else:
                        # NO DONE SEEN FOR THIS PAGE: COUNT IT, REPORT
                        # PROGRESS, AND FETCH THE NEXT PAGE
                        total += len(hits)
                        DEBUG and Log.note(
                            "{{num}} of {{total}} downloaded",
                            num=total,
                            total=result.hits.total,
                        )
                        write_status(
                            guid,
                            {
                                "status": "working",
                                "row": total,
                                "rows": result.hits.total,
                                "start_time": start_time,
                                "timestamp": Date.now(),
                            },
                        )
                        with Timer("get more", verbose=DEBUG):
                            result = esq.es.scroll(scroll_id)
                        continue
                    # ONLY REACHED WHEN THE FORMATTER EMITTED DONE:
                    # NO FURTHER PAGES ARE REQUESTED, total IS NOT INCREASED
                    break
                if please_stop:
                    # PRESUMABLY Log.error RAISES, SO THE FOOTER AND UPLOAD
                    # ARE SKIPPED ON SHUTDOWN - TODO CONFIRM
                    Log.error("Bulk download stopped for shutdown")
                for b in formatter.footer():
                    output.write(b)

            # output IS CLOSED HERE; THE TEMP FILE STILL EXISTS FOR THE UPLOAD
            write_status(
                guid,
                {
                    "status": "uploading to s3",
                    "rows": total,
                    "start_time": start_time,
                    "timestamp": Date.now(),
                },
            )
            upload(guid + ".json", temp_file)
        # NOTE(review): THIS CHECK RUNS AFTER upload() HAS RETURNED, SO A STOP
        # REQUEST ARRIVING DURING THE UPLOAD IS REPORTED AS AN ERROR - CONFIRM
        if please_stop:
            Log.error("shutdown requested, did not complete download")
        DEBUG and Log.note("Done. {{total}} uploaded", total=total)
        write_status(
            guid,
            {
                "ok": True,
                "status": "done",
                "rows": total,
                "start_time": start_time,
                "end_time": Date.now(),
                "timestamp": Date.now(),
            },
        )
    except Exception as e:
        # REPORT THE FAILURE AS AN "error" STATUS AND LOG IT;
        # NOTHING IS RE-RAISED HERE
        e = Except.wrap(e)
        write_status(
            guid,
            {
                "ok": False,
                "status": "error",
                "error": e,
                "start_time": start_time,
                "end_time": Date.now(),
                "timestamp": Date.now(),
            },
        )
        Log.warning("Could not extract", cause=e)
Beispiel #7
0
    def extract(self, db, start_point, first_value, data, please_stop):
        """
        PULL THE FACT TABLE ROWS WHOSE ID IS IN data, TURN THEM INTO JSON
        DOCUMENTS, AND WRITE THEM TO THE CONFIGURED DESTINATION

        :param db: USED TO QUOTE THE VALUES AND TO RUN THE (STREAMING) QUERY
        :param start_point: SEQUENCE OF BATCH IDS; NAMES THE OUTPUT AND IS
               RECORDED IN EACH DOCUMENT'S etl BLOCK
        :param first_value: LOGGED, AND RECORDED WITH start_point ON SUCCESS
        :param data: THE ID VALUES TO SELECT
        :param please_stop: PASSED THROUGH TO self.construct_docs()
        :return: False WHEN settings.destination IS TEXT (WRITTEN TO A LOCAL
                 File, NO NOTIFICATION SENT); OTHERWISE None
        """
        Log.note(
            "Starting scan of {{table}} at {{id}} and sending to batch {{start_point}}",
            table=self.settings.snowflake.fact_table,
            id=first_value,
            start_point=start_point,
        )

        # SELECT THE IDS OF THIS BATCH, THEN LET THE SCHEMA BUILD THE FULL QUERY
        id_column = quote_column(self._extract.field.last())
        select_prefix = (
            SQL_SELECT + id_column + SQL_FROM
            + self.settings.snowflake.fact_table
            + SQL_WHERE + id_column + " in "
        )
        id_query = select_prefix + sql_iso(sql_list(map(db.quote_value, data)))
        sql = self.schema.get_sql(id_query)

        with Timer("Sending SQL"):
            cursor = db.query(sql, stream=True, row_tuples=True)

        extract = self.settings.extract
        fact_table = self.settings.snowflake.fact_table

        with TempFile() as temp_file:
            # NEST THE BATCH IDS: EACH ID WRAPS THE ONE BEFORE IT AS ITS source
            parent_etl = None
            for batch_id in start_point:
                parent_etl = {"id": batch_id, "source": parent_etl}
            parent_etl["revision"] = get_git_revision()
            parent_etl["machine"] = machine_metadata

            def append(value, i):
                """
                ADD ONE DOCUMENT, AS A LINE OF JSON, TO THE TEMP FILE

                :param value: THE DOCUMENT TO ADD
                :param i: ID RECORDED IN THE DOCUMENT'S etl BLOCK
                """
                scrubbed = elasticsearch.scrub(value)
                etl = {"id": i, "source": parent_etl, "timestamp": Date.now()}
                document = {fact_table: scrubbed, "etl": etl}
                temp_file.append(convert.value2json(document))

            with Timer("assemble data"):
                self.construct_docs(cursor, append, please_stop)

            # WRITE TO S3 (OR TO A LOCAL FILE WHEN destination IS JUST A NAME)
            s3_file_name = ".".join(map(text_type, start_point))
            with Timer(
                "write to destination {{filename}}",
                param={"filename": s3_file_name},
            ):
                if isinstance(self.settings.destination, text_type):
                    local_file = File(self.settings.destination)
                    documents = [convert.json2value(line) for line in temp_file]
                    local_file.write(convert.value2json(documents, pretty=True))
                    return False

                key = self.bucket.get_key(s3_file_name, must_exist=False)
                key.write_lines(temp_file)

        # NOTIFY SQS
        now = Date.now()
        notice = {
            "bucket": self.settings.destination.bucket,
            "key": s3_file_name,
            "timestamp": now.unix,
            "date/time": now.format(),
        }
        self.notify.add(notice)

        # SUCCESS!!  REMEMBER WHERE WE GOT TO
        progress = convert.value2json([start_point, first_value])
        File(extract.last).write(progress)
def _install_es(gigabytes, es_version="6.5.4", instance=None, conn=None):
    """
    INSTALL AND CONFIGURE ELASTICSEARCH ON THE MACHINE BEHIND conn:
    JAVA + ES BINARIES (ONLY WHEN elasticsearch.yml IS MISSING), DATA
    VOLUMES, KERNEL/ULIMIT SETTINGS, AND FINALLY THE ES CONFIG FILES

    :param gigabytes: int(gigabytes/2) IS GIVEN TO THE jvm.options TEMPLATE AS "memory"
    :param es_version: NAMES THE TARBALL AND THE DIRECTORY IT UNPACKS TO
    :param instance: ITS markup.drives AND ip_address ARE READ
    :param conn: CONNECTION USED FOR ALL exists/cd/run/sudo/get/put CALLS
    :return: None

    NOTE(review): instance AND conn DEFAULT TO None BUT ARE DEREFERENCED
    UNCONDITIONALLY, SO BOTH MUST ALWAYS BE PROVIDED
    """
    es_file = 'elasticsearch-' + es_version + '.tar.gz'
    volumes = instance.markup.drives

    # A MISSING elasticsearch.yml IS TAKEN TO MEAN "ES NOT INSTALLED YET"
    if not conn.exists("/usr/local/elasticsearch/config/elasticsearch.yml"):
        with conn.cd("/home/ec2-user/"):
            conn.run("mkdir -p temp")

        if not (RESOURCES / JRE).exists:
            Log.error("Expecting {{file}} on manager to spread to ES instances", file=(RESOURCES / JRE))
        # warn=True: A FAILING `java -version` (NO JAVA YET) IS TOLERATED
        response = conn.run("java -version", warn=True)
        if "Java(TM) SE Runtime Environment" not in response:
            with conn.cd("/home/ec2-user/temp"):
                conn.run('rm -f '+JRE)
                conn.put((RESOURCES / JRE), JRE)
                conn.sudo("rpm -i "+JRE)
                conn.sudo("alternatives --install /usr/bin/java java /usr/java/default/bin/java 20000")
                # NOTE(review): PRESUMABLY EACH conn.run IS ITS OWN SHELL, SO
                # THIS export WOULD NOT OUTLIVE THE CALL - TODO CONFIRM
                conn.run("export JAVA_HOME=/usr/java/default")

        with conn.cd("/home/ec2-user/"):
            # UPLOAD AND UNPACK THE TARBALL, THEN REPLACE ANY PREVIOUS INSTALL
            conn.put(RESOURCES / es_file, es_file)
            conn.run('tar zxfv ' + es_file)
            conn.sudo("rm -fr /usr/local/elasticsearch", warn=True)
            conn.sudo('mkdir /usr/local/elasticsearch')
            conn.sudo('cp -R elasticsearch-'+es_version+'/* /usr/local/elasticsearch/')

        with conn.cd('/usr/local/elasticsearch/'):
            # BE SURE TO MATCH THE PLUGIN WITH ES VERSION
            # https://github.com/elasticsearch/elasticsearch-cloud-aws
            conn.sudo('sudo bin/elasticsearch-plugin install -b discovery-ec2')

        # REMOVE THESE FILES, WE WILL REPLACE THEM WITH THE CORRECT VERSIONS AT THE END
        conn.sudo("rm -f /usr/local/elasticsearch/config/elasticsearch.yml")
        conn.sudo("rm -f /usr/local/elasticsearch/config/jvm.options")
        conn.sudo("rm -f /usr/local/elasticsearch/config/log4j2.properties")

    # MOUNT AND FORMAT THE VOLUMES (list with `lsblk`)
    # ONLY VOLUMES WHOSE MOUNT PATH DOES NOT EXIST YET ARE TOUCHED
    for i, k in enumerate(volumes):
        if not conn.exists(k.path):
            # ENSURE DEVICE IS NOT MOUNTED
            conn.sudo('sudo umount '+k.device, warn=True)

            # (RE)PARTITION THE LOCAL DEVICE, AND FORMAT
            conn.sudo("parted " + k.device + " --script \"mklabel gpt mkpart primary ext4 2048s 100%\"")
            conn.sudo('yes | sudo mkfs -t ext4 '+k.device)

            # ES AND JOURNALLING DO NOT MIX
            conn.sudo('tune2fs -o journal_data_writeback '+k.device)
            conn.sudo('tune2fs -O ^has_journal '+k.device)

            # MOUNT IT
            conn.sudo('mkdir '+k.path)
            conn.sudo('sudo mount '+k.device+' '+k.path)
            conn.sudo('chown -R ec2-user:ec2-user '+k.path)

            # ADD TO /etc/fstab SO IT IS STILL AROUND AFTER REBOOT
            conn.sudo("sed -i '$ a\\"+k.device+"   "+k.path+"       ext4    defaults,nofail  0   2' /etc/fstab")

    # TEST IT IS WORKING
    conn.sudo('mount -a')

    # INCREASE THE FILE HANDLE LIMITS
    # FETCH sysctl.conf, ENSURE fs.file-max IS PRESENT, DROP THE THREE
    # net.bridge LINES, AND PUSH THE RESULT BACK
    with conn.cd("/home/ec2-user/"):
        with TempFile() as temp:
            conn.get("/etc/sysctl.conf", temp, use_sudo=True)
            lines = temp.read()
            if lines.find("fs.file-max = 100000") == -1:
                lines += "\nfs.file-max = 100000"
            lines = lines.replace("net.bridge.bridge-nf-call-ip6tables = 0", "")
            lines = lines.replace("net.bridge.bridge-nf-call-iptables = 0", "")
            lines = lines.replace("net.bridge.bridge-nf-call-arptables = 0", "")
            temp.write(lines)
            conn.put(temp, "/etc/sysctl.conf", use_sudo=True)

    # NOTE(review): UNLIKE fs.file-max ABOVE, THIS LINE AND THE limits.conf
    # LINES BELOW ARE APPENDED ON EVERY CALL, WITHOUT CHECKING FOR AN
    # EXISTING ENTRY
    conn.sudo("sudo sed -i '$ a\\vm.max_map_count = 262144' /etc/sysctl.conf")

    # APPLY THE sysctl.conf SETTINGS
    conn.sudo("sysctl -p")

    # INCREASE FILE HANDLE PERMISSIONS
    conn.sudo("sed -i '$ a\\root soft nofile 100000' /etc/security/limits.conf")
    conn.sudo("sed -i '$ a\\root hard nofile 100000' /etc/security/limits.conf")
    conn.sudo("sed -i '$ a\\root soft memlock unlimited' /etc/security/limits.conf")
    conn.sudo("sed -i '$ a\\root hard memlock unlimited' /etc/security/limits.conf")

    conn.sudo("sed -i '$ a\\ec2-user soft nofile 100000' /etc/security/limits.conf")
    conn.sudo("sed -i '$ a\\ec2-user hard nofile 100000' /etc/security/limits.conf")
    conn.sudo("sed -i '$ a\\ec2-user soft memlock unlimited' /etc/security/limits.conf")
    conn.sudo("sed -i '$ a\\ec2-user hard memlock unlimited' /etc/security/limits.conf")

    # NOTE(review): /data1 IS HARD-CODED HERE; PRESUMABLY THE FIRST VOLUME'S
    # path IS /data1 - TODO CONFIRM
    if not conn.exists("/data1/logs"):
        conn.run('mkdir /data1/logs')
        conn.run('mkdir /data1/heapdump')

    # COPY CONFIG FILES TO ES DIR
    # elasticsearch.yml IS WRITTEN LAST, SO ITS PRESENCE MARKS A COMPLETED SETUP
    if not conn.exists("/usr/local/elasticsearch/config/elasticsearch.yml"):
        conn.put("./examples/config/es6_log4j2.properties", '/usr/local/elasticsearch/config/log4j2.properties', use_sudo=True)

        # CARRIAGE RETURNS ARE STRIPPED FROM THE LOCAL TEMPLATES BEFORE UPLOAD
        jvm = File("./examples/config/es6_jvm.options").read().replace('\r', '')
        jvm = expand_template(jvm, {"memory": int(gigabytes/2)})
        with TempFile() as temp:
            temp.write(jvm)
            conn.put(temp, '/usr/local/elasticsearch/config/jvm.options', use_sudo=True)

        yml = File("./examples/config/es6_config.yml").read().replace("\r", "")
        # NOTE(review): data_paths ARE BUILT AS /data1, /data2, ... FROM THE
        # VOLUME COUNT, NOT FROM EACH VOLUME'S path - CONFIRM THEY AGREE
        yml = expand_template(yml, {
            "id": instance.ip_address,
            "data_paths": ",".join("/data" + text(i + 1) for i, _ in enumerate(volumes))
        })
        with TempFile() as temp:
            temp.write(yml)
            conn.put(temp, '/usr/local/elasticsearch/config/elasticsearch.yml', use_sudo=True)

    conn.sudo("chown -R ec2-user:ec2-user /usr/local/elasticsearch")
 def test_empty_ff(self) -> None:
     """
     `ff` on a freshly created TempFile must return a falsy value, and the
     file content afterwards must compare equal to the empty string.
     """
     with TempFile() as scratch:
         changed = ff(scratch)
         self.assertFalse(changed)
         self.assertFormatEqual("", scratch.read())