def create_SUCCESS_file(path):
    """Create an empty _SUCCESS marker under path on S3, HDFS, or local disk."""
    if path.startswith("s3:") or path.startswith("s3n:") or path.startswith("s3a:"):
        s3_client = S3Client(
            eggo_config.get("aws", "aws_access_key_id"),
            eggo_config.get("aws", "aws_secret_access_key"))
        s3_client.put_string("", os.path.join(path, "_SUCCESS"))
    elif path.startswith("hdfs:"):
        hdfs_client = HdfsClient()
        hdfs_client.put("/dev/null", os.path.join(path, "_SUCCESS"))
    elif path.startswith("file:"):
        # strip the "file:" scheme so open() receives a plain local path
        local_path = os.path.join(path[len("file:"):], "_SUCCESS")
        open(local_path, "a").close()
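# A minimal sketch of the imports this helper assumes: S3Client and
# HdfsClient are Luigi's filesystem clients (the import paths below match
# the Luigi releases of this era; newer Luigi moved them under
# luigi.contrib), and eggo_config is a ConfigParser-style object defined
# elsewhere in the project.
import os

from luigi.s3 import S3Client
from luigi.hdfs import HdfsClient

# Hypothetical usage: drop a marker into a finished HDFS output directory.
# create_SUCCESS_file("hdfs:///user/eggo/output/mydataset")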
def mapper(self, line):
    # Hadoop streaming prepends a key and a tab to each input line; drop
    # the key and re-join the rest in case the JSON value contains tabs.
    source = json.loads("\t".join(line.split("\t")[1:]))
    dest_name = build_dest_filename(source["url"],
                                    decompress=source["compression"])
    dest_url = os.path.join(self.destination, dest_name)
    if dest_url.startswith("s3:") or dest_url.startswith("s3n:"):
        client = S3Client(
            eggo_config.get("aws", "aws_access_key_id"),
            eggo_config.get("aws", "aws_secret_access_key"))
    else:
        client = HdfsClient()
    # only download the source if it is not already at the destination
    if not client.exists(dest_url):
        _dnload_to_local_upload_to_dfs(
            source["url"], dest_url, source["compression"])
    yield (source["url"], 1)  # dummy output to satisfy the MapReduce contract
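# A sketch of the input the mapper expects, with a hypothetical line:
# Hadoop streaming hands each mapper "key<TAB>value", so splitting on the
# tab and dropping the first field recovers the JSON-serialized source.
# Field names beyond "url" and "compression" are omitted here because the
# mapper only relies on those two.
import json

line = '0\t{"url": "ftp://example.org/data.vcf.gz", "compression": true}'
source = json.loads("\t".join(line.split("\t")[1:]))
assert source["url"] == "ftp://example.org/data.vcf.gz"
assert source["compression"] is True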
def run(self):
    tmp_dir = mkdtemp(prefix="tmp_eggo_",
                      dir=eggo_config.get("worker_env", "work_path"))
    try:
        # write one JSON-serialized source per line to a local command file
        tmp_command_file = "{0}/command_file".format(tmp_dir)
        with open(tmp_command_file, "w") as command_file:
            for source in ToastConfig().config["sources"]:
                command_file.write("{0}\n".format(json.dumps(source)))
        # copy the command file up to the Hadoop filesystem
        hdfs_client = HdfsClient()
        hdfs_client.mkdir(os.path.dirname(self.hdfs_path), True)
        hdfs_client.put(tmp_command_file, self.hdfs_path)
    finally:
        rmtree(tmp_dir)
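# A sketch of the round trip between run() and the mapper above, using a
# hypothetical "sources" list standing in for ToastConfig's; the exact
# field set is an assumption beyond "url" and "compression", the only
# keys the mapper reads.
import json

sources = [
    {"url": "ftp://example.org/a.vcf.gz", "compression": True},
    {"url": "http://example.org/b.vcf", "compression": False},
]
# run() writes one json.dumps(source) per line; each mapper then parses a
# single line back with json.loads after stripping the streaming key.
command_file_text = "".join("{0}\n".format(json.dumps(s)) for s in sources)
assert json.loads(command_file_text.splitlines()[0])["url"] == sources[0]["url"]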