def do():
    with open(config, 'r') as ip:
        config_data = json.load(ip)
    dag_class = config_data['dag']
    # push the toast config to the remote machine
    toast_config_worker_path = os.path.join(
        eggo_config.get('worker_env', 'work_path'),
        build_dest_filename(config))
    put(local_path=config, remote_path=toast_config_worker_path)
    # TODO: run on central scheduler instead
    toast_cmd = ('toaster.py --local-scheduler {clazz} '
                 '--ToastConfig-config {toast_config}'.format(
                     clazz=dag_class,
                     toast_config=toast_config_worker_path))
    hadoop_bin = os.path.join(eggo_config.get('worker_env', 'hadoop_home'),
                              'bin')
    toast_env = {
        # toaster.py imports eggo_config, which needs EGGO_HOME on the worker
        'EGGO_HOME': eggo_config.get('worker_env', 'eggo_home'),
        # because toaster.py imports eggo_config, which must be initialized
        # on the worker
        'EGGO_CONFIG': eggo_config.get('worker_env', 'eggo_config_path'),
        'LUIGI_CONFIG_PATH': eggo_config.get('worker_env', 'luigi_config_path'),
        # because the dataset download pushes data to S3
        # TODO: only add the AWS credentials if the DFS is S3
        'AWS_ACCESS_KEY_ID': eggo_config.get('aws', 'aws_access_key_id'),
        'AWS_SECRET_ACCESS_KEY': eggo_config.get('aws', 'aws_secret_access_key'),
        'SPARK_HOME': eggo_config.get('worker_env', 'spark_home')}
    if exec_ctx == 'local':
        # copy the current environment so virtualenv-related variables are
        # preserved
        env_copy = os.environ.copy()
        env_copy.update(toast_env)
        toast_env = env_copy
    with path(hadoop_bin):
        with shell_env(**toast_env):
            wrun(toast_cmd)

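# do() relies on a few Fabric helpers (put, path, shell_env) and on a wrun
# wrapper defined elsewhere in the module. A minimal sketch of the assumed
# wiring, using Fabric 1.x's fabric.api: wrun presumably dispatches between
# local and remote execution based on exec_ctx. The actual helper in the
# repo may differ.
from fabric.api import local, path, put, run, shell_env


def wrun(cmd):
    # run the toast command on the local machine when exec_ctx is 'local',
    # otherwise over SSH on the remote worker
    if exec_ctx == 'local':
        return local(cmd)
    return run(cmd)
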
def requires(self):
    for source in ToastConfig().config["sources"]:
        dest_name = build_dest_filename(source["url"],
                                        decompress=source["compression"])
        yield DownloadFileToDFSTask(
            source=source["url"],
            target=os.path.join(self.destination, dest_name),
            compression=source["compression"])

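# build_dest_filename appears in both requires() and the mapper below; a
# hedged sketch of the assumed behavior: take the basename of the source URL
# and, when the file will be decompressed on ingest, strip the compression
# extension. The real implementation may differ.
import os


def build_dest_filename(source_url, decompress=False):
    dest_name = os.path.basename(source_url)
    if decompress and dest_name.endswith(".gz"):
        # assume gzip compression for this sketch
        dest_name = dest_name[:-len(".gz")]
    return dest_name
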
def mapper(self, line):
    # input lines look like "<key>\t<json source spec>": drop the leading
    # key field and parse the remaining JSON
    source = json.loads("\t".join(line.split("\t")[1:]))
    dest_name = build_dest_filename(source["url"],
                                    decompress=source["compression"])
    dest_url = os.path.join(self.destination, dest_name)
    if dest_url.startswith("s3:") or dest_url.startswith("s3n:"):
        client = S3Client(eggo_config.get("aws", "aws_access_key_id"),
                          eggo_config.get("aws", "aws_secret_access_key"))
    else:
        client = HdfsClient()
    if not client.exists(dest_url):
        _dnload_to_local_upload_to_dfs(source["url"], dest_url,
                                       source["compression"])
    yield (source["url"], 1)  # dummy output

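# _dnload_to_local_upload_to_dfs is assumed to fetch the source URL to a
# local temporary file (decompressing it if requested) and then copy it to
# the DFS destination. A hedged sketch using curl plus the same luigi
# clients referenced above; the real helper in the repo may differ, and
# eggo_config is assumed to be imported as in the surrounding module.
import os
import tempfile
from subprocess import check_call

from luigi.hdfs import HdfsClient
from luigi.s3 import S3Client


def _dnload_to_local_upload_to_dfs(source_url, dest_url, compression):
    tmp_dir = tempfile.mkdtemp()
    local_path = os.path.join(tmp_dir, os.path.basename(source_url))
    # download the source file to the local temp dir
    check_call(["curl", "-L", "-o", local_path, source_url])
    if compression and local_path.endswith(".gz"):
        # assume gzip-compressed sources for this sketch
        check_call(["gunzip", local_path])
        local_path = local_path[:-len(".gz")]
    # upload to S3 or HDFS depending on the destination scheme
    if dest_url.startswith("s3:") or dest_url.startswith("s3n:"):
        client = S3Client(eggo_config.get("aws", "aws_access_key_id"),
                          eggo_config.get("aws", "aws_secret_access_key"))
    else:
        client = HdfsClient()
    client.put(local_path, dest_url)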