def run(self):
    format = ToastConfig().config["sources"][0]["format"].lower()
    if format not in self.allowed_file_formats:
        raise ValueError("Format '{0}' not in allowed formats {1}.".format(
            format, self.allowed_file_formats))

    # 1. Copy the data from source (e.g. S3) to Hadoop's default filesystem
    tmp_hadoop_path = "/tmp/{rand_id}.{format}".format(rand_id=random_id(),
                                                       format=format)
    distcp_cmd = "{hadoop_home}/bin/hadoop distcp {source} {target}".format(
        hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
        source=ToastConfig().raw_data_url(),
        target=tmp_hadoop_path)
    check_call(distcp_cmd, shell=True)

    # 2. Run the adam-submit job
    adam_cmd = ("{adam_home}/bin/adam-submit --master {spark_master} "
                "{adam_command} {source} {target}").format(
        adam_home=eggo_config.get("worker_env", "adam_home"),
        spark_master=eggo_config.get("worker_env", "spark_master"),
        adam_command=self.adam_command,
        source=tmp_hadoop_path,
        target=ToastConfig().edition_url(edition=self.edition))
    check_call(adam_cmd, shell=True)
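
# A minimal illustration of the two shell commands that run() assembles above.
# _render_toast_commands is a hypothetical helper added only for this sketch;
# the paths, master URL, and ADAM subcommand in the usage comment are invented
# example values, not eggo defaults or config contents.
def _render_toast_commands(hadoop_home, adam_home, spark_master, adam_command,
                           raw_url, tmp_hadoop_path, target_url):
    # same templates as run(): first distcp the raw data onto the Hadoop
    # filesystem, then hand the staged path to adam-submit
    distcp_cmd = "{0}/bin/hadoop distcp {1} {2}".format(
        hadoop_home, raw_url, tmp_hadoop_path)
    adam_cmd = "{0}/bin/adam-submit --master {1} {2} {3} {4}".format(
        adam_home, spark_master, adam_command, tmp_hadoop_path, target_url)
    return distcp_cmd, adam_cmd

# Example (hypothetical values):
#   _render_toast_commands("/opt/hadoop", "/opt/adam", "yarn", "vcf2adam",
#                          "s3n://bucket/raw/genotypes.vcf",
#                          "/tmp/abc123.vcf",
#                          "s3n://bucket/datasets/genotypes/basic")
# -> ("/opt/hadoop/bin/hadoop distcp s3n://bucket/raw/genotypes.vcf /tmp/abc123.vcf",
#     "/opt/adam/bin/adam-submit --master yarn vcf2adam /tmp/abc123.vcf "
#     "s3n://bucket/datasets/genotypes/basic")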
def _dnload_to_local_upload_to_dfs(source, destination, compression):
    # source: (string) URL suitable for curl
    # destination: (string) full URL of destination file name
    # compression: (bool) whether file needs to be decompressed
    tmp_local_dir = mkdtemp(prefix="tmp_eggo_",
                            dir=eggo_config.get("worker_env", "work_path"))
    try:
        # 1. dnload file
        dnload_cmd = "pushd {tmp_local_dir} && curl -L -O {source} && popd"
        check_call(dnload_cmd.format(tmp_local_dir=tmp_local_dir,
                                     source=source), shell=True)

        # 2. decompress if necessary
        if compression:
            compression_type = os.path.splitext(source)[-1]
            if compression_type == ".gz":
                decompr_cmd = "pushd {tmp_local_dir} && gunzip *.gz && popd"
            else:
                raise ValueError("Unknown compression type: {0}".format(
                    compression_type))
            check_call(decompr_cmd.format(tmp_local_dir=tmp_local_dir),
                       shell=True)

        try:
            # 3. upload to tmp distributed filesystem location (e.g. S3)
            tmp_staged_dir = os.path.join(
                eggo_config.get("dfs", "dfs_tmp_data_url"), "staged",
                random_id())
            # get the name of the local file that we're uploading
            local_files = os.listdir(tmp_local_dir)
            if len(local_files) != 1:
                # TODO: generate warning/error here
                pass
            filename = local_files[0]
            # ensure the dfs directory exists; this cmd may fail if the dir
            # already exists, but that's ok (though it shouldn't already exist)
            create_dir_cmd = "{hadoop_home}/bin/hadoop fs -mkdir -p {tmp_dfs_dir}"
            call(create_dir_cmd.format(
                     hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
                     tmp_dfs_dir=tmp_staged_dir),
                 shell=True)
            upload_cmd = "{hadoop_home}/bin/hadoop fs -put {tmp_local_file} {tmp_dfs_file}"
            check_call(upload_cmd.format(
                           hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
                           tmp_local_file=os.path.join(tmp_local_dir, filename),
                           tmp_dfs_file=os.path.join(tmp_staged_dir, filename)),
                       shell=True)

            # 4. rename to final target location
            rename_cmd = "{hadoop_home}/bin/hadoop fs -mv {tmp_path} {final_path}"
            check_call(rename_cmd.format(
                           hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
                           tmp_path=os.path.join(tmp_staged_dir, filename),
                           final_path=destination),
                       shell=True)
        finally:
            pass  # TODO: clean up dfs tmp dir
    finally:
        rmtree(tmp_local_dir)
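
# A minimal usage sketch for the helper above. The URL and DFS destination
# below are hypothetical placeholders for illustration only; in eggo these
# values would normally come from the toast/dataset configuration rather than
# being hard-coded. _example_stage_raw_source is not part of the project.
def _example_stage_raw_source():
    _dnload_to_local_upload_to_dfs(
        source="ftp://example.org/pub/sample.vcf.gz",        # hypothetical curl-able URL
        destination="hdfs:///tmp/eggo_example/sample.vcf",   # hypothetical full DFS target path
        compression=True)  # source ends in .gz, so it is gunzipped locally before upload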