Example 1
def run(self):
    # run `adam-submit flatten` to convert the source edition into this edition
    adam_cmd = ("{adam_home}/bin/adam-submit --master {spark_master} "
                "flatten {source} {target}").format(
        adam_home=eggo_config.get("worker_env", "adam_home"),
        spark_master=eggo_config.get("worker_env", "spark_master"),
        source=ToastConfig().edition_url(edition=self.source_edition),
        target=ToastConfig().edition_url(edition=self.edition),
    )
    check_call(adam_cmd, shell=True)
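To make the interpolation concrete, here is a self-contained sketch of the same format call with invented values (the ADAM home, Spark master URL, bucket, and dataset paths are hypothetical, not taken from a real eggo config):

adam_cmd = ("{adam_home}/bin/adam-submit --master {spark_master} "
            "flatten {source} {target}").format(
    adam_home="/opt/adam",                        # hypothetical install path
    spark_master="spark://ec2-master:7077",       # hypothetical master URL
    source="s3n://my-bucket/eggo/1kg/bdg/basic",  # hypothetical source edition
    target="s3n://my-bucket/eggo/1kg/bdg/flat",   # hypothetical target edition
)
# -> /opt/adam/bin/adam-submit --master spark://ec2-master:7077 flatten \
#    s3n://my-bucket/eggo/1kg/bdg/basic s3n://my-bucket/eggo/1kg/bdg/flat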
Example 2
def teardown():
    teardown_cmd = ("{spark_home}/ec2/spark-ec2 -k {ec2_key_pair} "
                    "-i {ec2_private_key_file} destroy {stack_name}")
    interp_cmd = teardown_cmd.format(
        spark_home=eggo_config.get("client_env", "spark_home"),
        ec2_key_pair=eggo_config.get("aws", "ec2_key_pair"),
        ec2_private_key_file=eggo_config.get("aws", "ec2_private_key_file"),
        stack_name=eggo_config.get("spark_ec2", "stack_name"),
    )
    local(interp_cmd)
Example 3
def get_master_host():
    getmaster_cmd = ("{spark_home}/ec2/spark-ec2 -k {ec2_key_pair} "
                     "-i {ec2_private_key_file} get-master {stack_name}")
    interp_cmd = getmaster_cmd.format(
        spark_home=eggo_config.get("client_env", "spark_home"),
        ec2_key_pair=eggo_config.get("aws", "ec2_key_pair"),
        ec2_private_key_file=eggo_config.get("aws", "ec2_private_key_file"),
        stack_name=eggo_config.get("spark_ec2", "stack_name"),
    )
    result = local(interp_cmd, capture=True)
    # the master hostname appears on the third line of the spark-ec2 output
    return result.split("\n")[2].strip()
Example 4
def create_SUCCESS_file(path):
    if path.startswith("s3:") or path.startswith("s3n:") or path.startswith("s3a:"):
        s3_client = S3Client(
            eggo_config.get("aws", "aws_access_key_id"), eggo_config.get("aws", "aws_secret_access_key")
        )
        s3_client.put_string("", os.path.join(path, "_SUCCESS"))
    elif path.startswith("hdfs:"):
        hdfs_client = HdfsClient()
        hdfs_client.put("/dev/null", os.path.join(path, "_SUCCESS"))
    elif path.startswith("file:"):
        open(os.path.join(path, "_SUCCESS"), "a").close()
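A brief usage sketch of the helper above (the dataset URL is invented); for an HDFS path it uploads an empty file as the marker:

# Hypothetical call; assumes eggo_config is initialized and luigi's HdfsClient is usable.
create_SUCCESS_file("hdfs:///user/eggo/datasets/1kg")
# -> writes an empty marker at hdfs:///user/eggo/datasets/1kg/_SUCCESS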
Example 5
    def mapper(self, line):
        source = json.loads("\t".join(line.split("\t")[1:]))
        dest_name = build_dest_filename(source["url"], decompress=source["compression"])
        dest_url = os.path.join(self.destination, dest_name)
        if dest_url.startswith("s3:") or dest_url.startswith("s3n:"):
            client = S3Client(
                eggo_config.get("aws", "aws_access_key_id"), eggo_config.get("aws", "aws_secret_access_key")
            )
        else:
            client = HdfsClient()
        if not client.exists(dest_url):
            _dnload_to_local_upload_to_dfs(source["url"], dest_url, source["compression"])

        yield (source["url"], 1)  # dummy output
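The mapper expects each record to be a tab-separated line whose first field is the key added by NLineInputFormat (a byte offset) and whose remainder is a JSON source spec. A sketch of such a line, with an invented URL:

# Hypothetical input record as seen by the mapper: "<offset>\t<JSON source spec>"
line = '0\t{"url": "http://example.org/pub/variants.vcf.gz", "compression": true}'
# line.split("\t")[1:] drops the offset; json.loads recovers the source dict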
Example 6
def do():
    with open(config, 'r') as ip:
        config_data = json.load(ip)
    dag_class = config_data['dag']
    # push the toast config to the remote machine
    toast_config_worker_path = os.path.join(
        eggo_config.get('worker_env', 'work_path'),
        build_dest_filename(config))
    put(local_path=config,
        remote_path=toast_config_worker_path)
    # TODO: run on central scheduler instead
    toast_cmd = ('toaster.py --local-scheduler {clazz} '
                 '--ToastConfig-config {toast_config}'.format(
                     clazz=dag_class,
                     toast_config=toast_config_worker_path))

    hadoop_bin = os.path.join(eggo_config.get('worker_env', 'hadoop_home'), 'bin')
    # toaster.py imports eggo_config, so EGGO_HOME and EGGO_CONFIG must be set
    # on the worker; the AWS credentials are needed because the dataset
    # download pushes data to S3 (TODO: only set them when the DFS is S3)
    toast_env = {'EGGO_HOME': eggo_config.get('worker_env', 'eggo_home'),
                 'EGGO_CONFIG': eggo_config.get('worker_env', 'eggo_config_path'),
                 'LUIGI_CONFIG_PATH': eggo_config.get('worker_env', 'luigi_config_path'),
                 'AWS_ACCESS_KEY_ID': eggo_config.get('aws', 'aws_access_key_id'),
                 'AWS_SECRET_ACCESS_KEY': eggo_config.get('aws', 'aws_secret_access_key'),
                 'SPARK_HOME': eggo_config.get('worker_env', 'spark_home')}
    if exec_ctx == 'local':
        # copy the current environment so that venv-related vars are preserved
        env_copy = os.environ.copy()
        env_copy.update(toast_env)
        toast_env = env_copy
    with path(hadoop_bin):
        with shell_env(**toast_env):
            wrun(toast_cmd)
Example 7
def update_eggo():
    work_path = eggo_config.get('worker_env', 'work_path')
    venv_path = eggo_config.get('worker_env', 'venv_path')
    eggo_fork = eggo_config.get('versions', 'eggo_fork')
    eggo_branch = eggo_config.get('versions', 'eggo_branch')
    eggo_home = eggo_config.get('worker_env', 'eggo_home')

    def do():
        env.parallel = True
        if exec_ctx in ['director', 'spark_ec2']:
            wrun('rm -rf {0}'.format(eggo_home))
        install_eggo(work_path, eggo_home, eggo_fork, eggo_branch)

    execute(do, hosts=get_worker_hosts())
Example 8
def run(self):
    # remove both the raw data and the toasted dataset from the DFS
    delete_raw_cmd = "{hadoop_home}/bin/hadoop fs -rm -r {raw} {target}".format(
        hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
        raw=ToastConfig().raw_data_url(),
        target=ToastConfig().dataset_url(),
    )
    check_call(delete_raw_cmd, shell=True)
Example 9
def job_runner(self):
    addl_conf = {"mapred.map.tasks.speculative.execution": "false", "mapred.task.timeout": 12000000}
    # TODO: can we delete the AWS vars with Director? does it set AWS cred in core-site.xml?
    streaming_args = [
        "-cmdenv",
        "EGGO_HOME=" + eggo_config.get("worker_env", "eggo_home"),
        "-cmdenv",
        "EGGO_CONFIG=" + eggo_config.get("worker_env", "eggo_config_path"),
        "-cmdenv",
        "AWS_ACCESS_KEY_ID=" + eggo_config.get("aws", "aws_access_key_id"),
        "-cmdenv",
        "AWS_SECRET_ACCESS_KEY=" + eggo_config.get("aws", "aws_secret_access_key"),
    ]
    return HadoopJobRunner(
        streaming_jar=eggo_config.get("worker_env", "streaming_jar"),
        streaming_args=streaming_args,
        jobconfs=addl_conf,
        input_format="org.apache.hadoop.mapred.lib.NLineInputFormat",
        output_format="org.apache.hadoop.mapred.lib.NullOutputFormat",
        end_job_with_atomic_move_dir=False,
    )
Example 10
def provision():
    if exec_ctx == 'spark_ec2':
        eggo.spark_ec2.provision()
    elif exec_ctx == 'director':
        eggo.director.provision()
    # at this point, get_master() should be valid

    # if the DFS is on the local fs, the directories may need to be created
    url = urlparse(eggo_config.get('dfs', 'dfs_root_url'))
    if url.scheme == 'file':
        local('mkdir -p {0}'.format(url.path))
        url = urlparse(eggo_config.get('dfs', 'dfs_raw_data_url'))
        local('mkdir -p {0}'.format(url.path))
        url = urlparse(eggo_config.get('dfs', 'dfs_tmp_data_url'))
        local('mkdir -p {0}'.format(url.path))

    # tag all the provisioned instances
    if exec_ctx in ['spark_ec2', 'director']:
        conn = connect_to_region(eggo_config.get(exec_ctx, 'region'))
        instances = conn.get_only_instances(
            filters={'key-name': [eggo_config.get('aws', 'ec2_key_pair')]})
        for instance in instances:
            instance.add_tag('owner', getuser())
            instance.add_tag('stack_name',
                             eggo_config.get(exec_ctx, 'stack_name'))
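Since the local-filesystem branch relies on urlparse to separate the scheme from the path, a small self-contained check (the URL is invented) shows what gets passed to mkdir -p:

from urlparse import urlparse  # Python 2, matching the snippets above

url = urlparse('file:///data/eggo/dfs')   # hypothetical dfs_root_url
assert url.scheme == 'file'
assert url.path == '/data/eggo/dfs'       # the argument handed to `mkdir -p`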
Example 11
    def run(self):
        format = ToastConfig().config["sources"][0]["format"].lower()
        if format not in self.allowed_file_formats:
            raise ValueError("Format '{0}' not in allowed formats {1}.".format(format, self.allowed_file_formats))

        # 1. Copy the data from source (e.g. S3) to Hadoop's default filesystem
        tmp_hadoop_path = "/tmp/{rand_id}.{format}".format(rand_id=random_id(), format=format)
        distcp_cmd = "{hadoop_home}/bin/hadoop distcp {source} {target}".format(
            hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
            source=ToastConfig().raw_data_url(),
            target=tmp_hadoop_path,
        )
        check_call(distcp_cmd, shell=True)

        # 2. Run the adam-submit job
        adam_cmd = ("{adam_home}/bin/adam-submit --master {spark_master} {adam_command} " "{source} {target}").format(
            adam_home=eggo_config.get("worker_env", "adam_home"),
            spark_master=eggo_config.get("worker_env", "spark_master"),
            adam_command=self.adam_command,
            source=tmp_hadoop_path,
            target=ToastConfig().edition_url(edition=self.edition),
        )
        check_call(adam_cmd, shell=True)
Example 12
    def run(self):
        tmp_dir = mkdtemp(prefix="tmp_eggo_", dir=eggo_config.get("worker_env", "work_path"))
        try:
            # build the remote command for each source
            tmp_command_file = "{0}/command_file".format(tmp_dir)
            with open(tmp_command_file, "w") as command_file:
                for source in ToastConfig().config["sources"]:
                    command_file.write("{0}\n".format(json.dumps(source)))

            # copy the command file to the Hadoop filesystem
            hdfs_client = HdfsClient()
            hdfs_client.mkdir(os.path.dirname(self.hdfs_path), True)
            hdfs_client.put(tmp_command_file, self.hdfs_path)
        finally:
            rmtree(tmp_dir)
Example 13
def delete_toasted(config):
    with open(config, 'r') as ip:
        config_data = json.load(ip)
    url = os.path.join(eggo_config.get('dfs', 'dfs_root_url'),
                       config_data['name'])
    url = urlparse(url)
    if url.scheme == 's3n':
        conn = S3Connection()
        bucket = conn.get_bucket(url.netloc)
        keys = bucket.list(url.path.lstrip('/'))
        bucket.delete_keys(keys)
    elif url.scheme == 'file':
        rmtree(url.path, ignore_errors=True)
    else:
        raise NotImplementedError(
            "{0} dfs scheme not supported".format(url.scheme))
Example 14
def provision():
    provision_cmd = (
        "{spark_home}/ec2/spark-ec2 -k {ec2_key_pair} "
        "-i {ec2_private_key_file} -s {slaves} -t {type_} "
        "-r {region} {zone_arg} {spot_price_arg} "
        "--copy-aws-credentials launch {stack_name}"
    )
    az = eggo_config.get("spark_ec2", "availability_zone")
    zone_arg = "--zone {0}".format(az) if az != "" else ""
    spot_price = eggo_config.get("spark_ec2", "spot_price")
    spot_price_arg = "--spot-price {0}".format(spot_price) if spot_price != "" else ""
    interp_cmd = provision_cmd.format(
        spark_home=eggo_config.get("client_env", "spark_home"),
        ec2_key_pair=eggo_config.get("aws", "ec2_key_pair"),
        ec2_private_key_file=eggo_config.get("aws", "ec2_private_key_file"),
        slaves=eggo_config.get("spark_ec2", "num_slaves"),
        type_=eggo_config.get("spark_ec2", "instance_type"),
        region=eggo_config.get("spark_ec2", "region"),
        zone_arg=zone_arg,
        spot_price_arg=spot_price_arg,
        stack_name=eggo_config.get("spark_ec2", "stack_name"),
    )
    local(interp_cmd)
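The optional-flag handling is easy to miss; this standalone sketch (values invented, standing in for the eggo_config lookups) shows how an empty setting drops the flag entirely:

az = ""              # e.g. "us-east-1a"; "" lets spark-ec2 choose a zone
spot_price = "0.25"  # "" would mean on-demand instances

zone_arg = "--zone {0}".format(az) if az != "" else ""
spot_price_arg = "--spot-price {0}".format(spot_price) if spot_price != "" else ""
# -> zone_arg == "" and spot_price_arg == "--spot-price 0.25"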
Example 15
def raw_data_url(self):
    return os.path.join(eggo_config.get("dfs", "dfs_raw_data_url"), self.config["name"])
Example 16
from cStringIO import StringIO

from fabric.api import (
    task, env, execute, local, open_shell, put, cd, run, prefix, shell_env,
    require, hosts, path, sudo, lcd)
from fabric.contrib.files import append, exists
from boto.ec2 import connect_to_region
from boto.s3.connection import S3Connection

import eggo.director
import eggo.spark_ec2
from eggo.util import build_dest_filename
from eggo.config import eggo_config, generate_luigi_cfg


exec_ctx = eggo_config.get('execution', 'context')
work_path = eggo_config.get('worker_env', 'work_path')
eggo_config_path = eggo_config.get('worker_env', 'eggo_config_path')
luigi_config_path = eggo_config.get('worker_env', 'luigi_config_path')
adam_fork = eggo_config.get('versions', 'adam_fork')
adam_branch = eggo_config.get('versions', 'adam_branch')
adam_home = eggo_config.get('worker_env', 'adam_home')
eggo_fork = eggo_config.get('versions', 'eggo_fork')
eggo_branch = eggo_config.get('versions', 'eggo_branch')
eggo_home = eggo_config.get('worker_env', 'eggo_home')
maven_version = eggo_config.get('versions', 'maven')


# the different execution contexts have different permissions
if exec_ctx == 'local':
    wrun = local
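The snippets on this page only ever call eggo_config.get(section, option). A minimal stand-in, assuming eggo_config behaves like a standard ConfigParser loaded from a path named in the environment (this reconstruction is an assumption, not the actual eggo.config module):

import os
from ConfigParser import SafeConfigParser  # Python 2, matching the code above

# hypothetical stand-in for eggo.config.eggo_config
eggo_config = SafeConfigParser()
eggo_config.read(os.environ.get('EGGO_CONFIG', 'eggo.cfg'))

spark_home = eggo_config.get('client_env', 'spark_home')
stack_name = eggo_config.get('spark_ec2', 'stack_name')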
Example 17
def _dnload_to_local_upload_to_dfs(source, destination, compression):
    # source: (string) URL suitable for curl
    # destination: (string) full URL of destination file name
    # compression: (bool) whether file needs to be decompressed
    tmp_local_dir = mkdtemp(prefix="tmp_eggo_", dir=eggo_config.get("worker_env", "work_path"))
    try:
        # 1. dnload file
        dnload_cmd = "pushd {tmp_local_dir} && curl -L -O {source} && popd"
        check_call(dnload_cmd.format(tmp_local_dir=tmp_local_dir, source=source), shell=True)

        # 2. decompress if necessary
        if compression:
            compression_type = os.path.splitext(source)[-1]
            if compression_type == ".gz":
                decompr_cmd = "pushd {tmp_local_dir} && gunzip *.gz && popd"
            else:
                raise ValueError("Unknown compression type: {0}".format(compression_type))
            check_call(decompr_cmd.format(tmp_local_dir=tmp_local_dir), shell=True)

        try:
            # 3. upload to tmp distributed filesystem location (e.g. S3)
            tmp_staged_dir = os.path.join(eggo_config.get("dfs", "dfs_tmp_data_url"), "staged", random_id())
            # get the name of the local file that we're uploading
            local_files = os.listdir(tmp_local_dir)
            if len(local_files) != 1:
                # TODO: generate warning/error here
                pass
            filename = local_files[0]
            # ensure the dfs directory exists; this cmd may fail if the dir
            # already exists, but that's ok (though it shouldn't already exist)
            create_dir_cmd = "{hadoop_home}/bin/hadoop fs -mkdir -p {tmp_dfs_dir}"
            call(
                create_dir_cmd.format(
                    hadoop_home=eggo_config.get("worker_env", "hadoop_home"), tmp_dfs_dir=tmp_staged_dir
                ),
                shell=True,
            )
            upload_cmd = "{hadoop_home}/bin/hadoop fs -put {tmp_local_file} {tmp_dfs_file}"
            check_call(
                upload_cmd.format(
                    hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
                    tmp_local_file=os.path.join(tmp_local_dir, filename),
                    tmp_dfs_file=os.path.join(tmp_staged_dir, filename),
                ),
                shell=True,
            )

            # 4. rename to final target location
            rename_cmd = "{hadoop_home}/bin/hadoop fs -mv {tmp_path} {final_path}"
            check_call(
                rename_cmd.format(
                    hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
                    tmp_path=os.path.join(tmp_staged_dir, filename),
                    final_path=destination,
                ),
                shell=True,
            )
        finally:
            pass  # TODO: clean up dfs tmp dir
    finally:
        rmtree(tmp_local_dir)
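A hypothetical invocation of the helper (both URLs are invented) that downloads a gzipped file, decompresses it, and stages it into the DFS:

# Illustrative only; assumes eggo_config is initialized and the hadoop CLI is on the worker.
_dnload_to_local_upload_to_dfs(
    source="http://example.org/pub/variants.vcf.gz",
    destination="s3n://my-bucket/eggo/raw/1kg/variants.vcf",
    compression=True,
)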
Example 18
def dfs_tmp_data_url(self):
    return os.path.join(
        eggo_config.get("dfs", "dfs_tmp_data_url"), self.config["name"], eggo_config.get("execution", "random_id")
    )
Example 19
def edition_url(self, format="bdg", edition="basic"):
    return os.path.join(eggo_config.get("dfs", "dfs_root_url"), self.config["name"], format, edition)
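For illustration, with invented config values the method simply nests format and edition under the DFS root and dataset name:

import os

dfs_root_url = "s3n://my-bucket/eggo"   # hypothetical dfs_root_url
name = "1kg"                            # hypothetical dataset name
print os.path.join(dfs_root_url, name, "bdg", "basic")
# -> s3n://my-bucket/eggo/1kg/bdg/basic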
Example 20

import sys

from boto.ec2 import connect_to_region

from eggo.config import eggo_config


exec_ctx = eggo_config.get('execution', 'context')
# check that we're running on EC2
if exec_ctx not in ['spark_ec2', 'director']:
    sys.exit()
conn = connect_to_region(eggo_config.get(exec_ctx, 'region'))
instances = conn.get_only_instances(
    filters={'tag:stack_name': [eggo_config.get(exec_ctx, 'stack_name')]})
for instance in instances:
    print instance
    instance.terminate()
Example 21
def dataset_url(self):
    return os.path.join(eggo_config.get("dfs", "dfs_root_url"), self.config["name"])
Example 22
import os
import sys
import time
from tempfile import mkdtemp
from datetime import datetime

import boto.ec2
import boto.cloudformation
from boto.ec2.networkinterface import (
    NetworkInterfaceCollection, NetworkInterfaceSpecification)
from fabric.api import local, env, run, execute, prefix, put, open_shell

from eggo.config import eggo_config


AWS_ACCESS_KEY_ID = eggo_config.get('aws', 'aws_access_key_id')
AWS_SECRET_ACCESS_KEY = eggo_config.get('aws', 'aws_secret_access_key')
EC2_KEY_PAIR = eggo_config.get('aws', 'ec2_key_pair')
EC2_PRIVATE_KEY_FILE = eggo_config.get('aws', 'ec2_private_key_file')

REGION = eggo_config.get('director', 'region')
LAUNCHER_INSTANCE_TYPE = eggo_config.get('director', 'launcher_instance_type')
LAUNCHER_AMI = eggo_config.get('director', 'launcher_ami')
CLUSTER_AMI = eggo_config.get('director', 'cluster_ami')
NUM_WORKERS = eggo_config.get('director', 'num_workers')
STACK_NAME = eggo_config.get('director', 'stack_name')
CLOUDFORMATION_TEMPLATE = eggo_config.get('director', 'cloudformation_template')
DIRECTOR_CONF_TEMPLATE = eggo_config.get('director', 'director_conf_template')

def provision():
    start_time = datetime.now()