Example #1
    def run(self):
        # Validate the declared file format of the first configured source
        # against the formats this task supports. (Renamed from "format" to
        # avoid shadowing the builtin.)
        file_format = ToastConfig().config["sources"][0]["format"].lower()
        if file_format not in self.allowed_file_formats:
            raise ValueError("Format '{0}' not in allowed formats {1}.".format(file_format, self.allowed_file_formats))

        # 1. Copy the data from the source (e.g. S3) to Hadoop's default filesystem
        tmp_hadoop_path = "/tmp/{rand_id}.{format}".format(rand_id=random_id(), format=file_format)
        distcp_cmd = "{hadoop_home}/bin/hadoop distcp {source} {target}".format(
            hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
            source=ToastConfig().raw_data_url(),
            target=tmp_hadoop_path,
        )
        check_call(distcp_cmd, shell=True)
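        # check_call raises CalledProcessError on a nonzero exit status, so a
        # failed distcp aborts the task here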

        # 2. Run the adam-submit job
        adam_cmd = ("{adam_home}/bin/adam-submit --master {spark_master} {adam_command} " "{source} {target}").format(
            adam_home=eggo_config.get("worker_env", "adam_home"),
            spark_master=eggo_config.get("worker_env", "spark_master"),
            adam_command=self.adam_command,
            source=tmp_hadoop_path,
            target=ToastConfig().edition_url(edition=self.edition),
        )
        check_call(adam_cmd, shell=True)
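
For context, here is a minimal sketch of the kind of task class this run method could belong to, assuming a Luigi-style task. The class name, base class, and attribute values are illustrative assumptions, not definitions taken from the project.

import luigi

class AdamConvertTask(luigi.Task):  # hypothetical name, assuming a Luigi task
    # Formats the task accepts; real values come from the pipeline (assumed here)
    allowed_file_formats = ["vcf", "sam", "bam"]
    # Subcommand handed to adam-submit, e.g. "vcf2adam" (assumed)
    adam_command = "vcf2adam"
    # Dataset edition written by the job (assumed)
    edition = "basic"

    def run(self):
        ...  # body as in Example #1 above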
Example #2
import os
from shutil import rmtree
from subprocess import call, check_call
from tempfile import mkdtemp

# eggo_config and random_id are project-level helpers assumed to be in scope


def _dnload_to_local_upload_to_dfs(source, destination, compression):
    """Download a file locally with curl, then stage it into the distributed FS.

    source: (string) URL suitable for curl
    destination: (string) full URL of the destination file name
    compression: (bool) whether the file needs to be decompressed
    """
    tmp_local_dir = mkdtemp(prefix="tmp_eggo_", dir=eggo_config.get("worker_env", "work_path"))
    try:
        # 1. Download the file into the temp directory (curl -L follows
        # redirects; -O keeps the remote file name). Use cd rather than
        # pushd/popd: pushd is a bash builtin that plain /bin/sh may lack,
        # and the directory change dies with the one-shot shell anyway.
        dnload_cmd = "cd {tmp_local_dir} && curl -L -O {source}"
        check_call(dnload_cmd.format(tmp_local_dir=tmp_local_dir, source=source), shell=True)

        # 2. decompress if necessary
        if compression:
            compression_type = os.path.splitext(source)[-1]
            if compression_type == ".gz":
                decompr_cmd = "pushd {tmp_local_dir} && gunzip *.gz && popd"
            else:
                raise ValueError("Unknown compression type: {0}".format(compression_type))
            check_call(decompr_cmd.format(tmp_local_dir=tmp_local_dir), shell=True)

        try:
            # 3. upload to tmp distributed filesystem location (e.g. S3)
            tmp_staged_dir = os.path.join(eggo_config.get("dfs", "dfs_tmp_data_url"), "staged", random_id())
            # get the name of the single local file that we're uploading;
            # fail loudly rather than index into an empty or ambiguous listing
            local_files = os.listdir(tmp_local_dir)
            if len(local_files) != 1:
                raise ValueError("Expected exactly one file in {0} but found: {1}".format(tmp_local_dir, local_files))
            filename = local_files[0]
            # ensure the dfs directory exists; use call (not check_call) so a
            # nonzero exit here is tolerated (with -p an existing dir is not an
            # error, though it shouldn't already exist)
            create_dir_cmd = "{hadoop_home}/bin/hadoop fs -mkdir -p {tmp_dfs_dir}"
            call(
                create_dir_cmd.format(
                    hadoop_home=eggo_config.get("worker_env", "hadoop_home"), tmp_dfs_dir=tmp_staged_dir
                ),
                shell=True,
            )
            upload_cmd = "{hadoop_home}/bin/hadoop fs -put {tmp_local_file} {tmp_dfs_file}"
            check_call(
                upload_cmd.format(
                    hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
                    tmp_local_file=os.path.join(tmp_local_dir, filename),
                    tmp_dfs_file=os.path.join(tmp_staged_dir, filename),
                ),
                shell=True,
            )

            # 4. rename to final target location
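            # (on object stores like S3, "fs -mv" is implemented as a
            # copy-then-delete rather than an atomic rename)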
            rename_cmd = "{hadoop_home}/bin/hadoop fs -mv {tmp_path} {final_path}"
            check_call(
                rename_cmd.format(
                    hadoop_home=eggo_config.get("worker_env", "hadoop_home"),
                    tmp_path=os.path.join(tmp_staged_dir, filename),
                    final_path=destination,
                ),
                shell=True,
            )
        finally:
            pass  # TODO: clean up dfs tmp dir
    finally:
        rmtree(tmp_local_dir)
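
To show how the helper fits together, here is a hypothetical invocation; the URLs below are placeholders, not values taken from the project.

# Assumed example: stage a gzipped VCF from an HTTP mirror into S3
_dnload_to_local_upload_to_dfs(
    source="http://example.com/data/sample.vcf.gz",  # hypothetical source URL
    destination="s3a://my-bucket/raw/sample.vcf",    # hypothetical DFS target
    compression=True,  # the .gz suffix selects the gunzip branch
)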