def fetch_remotes(self, remotes):
    # TODO: fetch remotes inside Mappers, and process each one as it
    #       becomes available.
    remote_names = [r["name"] for r in remotes if r["type"] == "remote"]

    # TODO: check cache first.
    result = 0
    if len(remote_names) == 0:
        return result

    fetch_cwd = os.path.join(self._work_dir, "cache")
    if not os.path.isdir(fetch_cwd):
        os.makedirs(fetch_cwd)
    loader = s3util.Loader(fetch_cwd, self._bucket_name,
                           aws_key=self._aws_key,
                           aws_secret_key=self._aws_secret_key)
    start = datetime.now()
    downloaded_bytes = 0
    for local, remote, err in loader.get_list(remote_names):
        if err is None:
            print "Downloaded", remote
            downloaded_bytes += os.path.getsize(local)
        else:
            print "Failed to download", remote
            result += 1
    duration_sec = timer.delta_sec(start)
    downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
    print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (
        downloaded_mb, duration_sec, downloaded_mb / duration_sec)
    return result
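
# Not from the original source: a hypothetical "remotes" list illustrating the
# input shape fetch_remotes() expects -- dicts carrying "name" and "type" keys,
# where only entries of type "remote" are downloaded. The names below are
# placeholders, not real S3 keys.
example_remotes = [
    {"name": "example/20140101/file1.lzma", "type": "remote"},
    {"name": "example/20140101/file2.lzma", "type": "local"},
]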
def __init__(self, config, data_dir, pattern, keep_backups=False):
    self.bucket = config["incoming_bucket"]
    self.queue = config.get("incoming_queue", None)
    self.aws_key = config.get("aws_key", None)
    self.aws_secret_key = config.get("aws_secret_key", None)
    self.aws_region = config.get("aws_region", None)
    self.data_dir = data_dir
    self.pattern = pattern
    self.keep_backups = keep_backups

    if self.queue is not None:
        # Get a connection to the Queue
        conn = boto.sqs.connect_to_region(
            self.aws_region,
            aws_access_key_id=self.aws_key,
            aws_secret_access_key=self.aws_secret_key)
        # This gets the queue if it already exists, otherwise returns None.
        self.q_incoming = conn.get_queue(self.queue)
        if self.q_incoming is None:
            raise ValueError("Failed to get queue " + self.queue)

    self.s3loader = s3util.Loader(self.data_dir, self.bucket,
                                  self.aws_key, self.aws_secret_key)

    # Make sure the target S3 bucket exists.
    s3conn = S3Connection(self.aws_key, self.aws_secret_key)
    try:
        print "Verifying that we can write to", self.bucket
        b = s3conn.get_bucket(self.bucket)
        print "Looks good!"
    except S3ResponseError:
        print "Bucket", self.bucket, "not found. Attempting to create it."
        b = s3conn.create_bucket(self.bucket)
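
# Not from the original source: a hypothetical config dict showing the keys the
# constructor above reads. The bucket, queue, and region names are placeholders.
example_config = {
    "incoming_bucket": "example-incoming-bucket",
    "incoming_queue": "example-incoming-queue",   # optional; omit to skip SQS
    "aws_key": None,                              # None -> let boto find credentials
    "aws_secret_key": None,
    "aws_region": "us-west-2",
}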
def run_mapper(self, mapper_id, inputs, work_dir, module, partition_count,
               delete_files, aws_key, aws_secret_key, s3_bucket):
    self.work_dir = work_dir
    print "I am mapper", mapper_id, ", and I'm mapping", len(inputs), "inputs. 0% complete."

    bytes_total = sum([f.size for f in inputs])
    bytes_completed = 0
    next_notice_pct = 10
    start = datetime.now()
    loader = None
    output_file = os.path.join(work_dir, "mapper_" + str(mapper_id))
    mapfunc = getattr(module, 'map', None)
    context = Context(output_file, partition_count)
    if not callable(mapfunc):
        print "No map function!!!"
        sys.exit(1)

    # TODO: Stream/decompress the files directly.
    for input_file in inputs:
        if input_file.remote:
            # TODO: check if the file already exists locally.
            # Lazy load the loader (so we don't do it on "local only" jobs).
            if loader is None:
                loader = s3util.Loader(os.path.join(self.work_dir, "cache"),
                                       s3_bucket, aws_key=aws_key,
                                       aws_secret_key=aws_secret_key,
                                       poolsize=1)
            for local, remote, err in loader.get_list([input_file.name]):
                if err is not None:
                    print "Failed to download", remote, ":", err
        try:
            handle = self.open_input_file(input_file)
        except:
            print "Error opening", input_file.name, "(skipping)"
            traceback.print_exc(file=sys.stderr)
            continue

        line_num = 0
        for line in handle:
            line_num += 1
            try:
                # Remove the trailing EOL character(s) before passing to
                # the map function.
                key, value = line.rstrip('\r\n').split("\t", 1)
                mapfunc(key, input_file.dimensions, value, context)
            except ValueError, e:
                # TODO: increment "bad line" metrics.
                print "Bad line:", input_file.name, ":", line_num, e

        handle.close()
        if delete_files:
            print "Removing", input_file.name
            os.remove(handle.filename)

        bytes_completed += input_file.size
        completed_pct = (float(bytes_completed) / bytes_total) * 100
        if completed_pct >= next_notice_pct:
            next_notice_pct += 10
            duration_sec = timer.delta_sec(start)
            completed_mb = float(bytes_completed) / 1024.0 / 1024.0
            print "Mapper %d: %.2f%% complete. Processed %.2fMB in %.2fs (%.2fMB/s)" % (
                mapper_id, completed_pct, completed_mb, duration_sec,
                completed_mb / duration_sec)
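
# A minimal sketch (assumption, not the original job code) of a job module the
# mapper above could load: it must expose a callable named "map" taking
# (key, dimensions, value, context). The context.write(key, value) call assumes
# the Context class used above provides a write() method for emitting pairs.
def map(key, dimensions, value, context):
    # Emit one count per key; a real job would parse "value" (the raw payload).
    context.write(key, 1)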
def main():
    try:
        assert not os.path.exists(test_dir)
        os.makedirs(test_dir)
        num_procs = 15
        print "Running with", num_procs, "processes."
        d = s3util.Loader(test_dir, "telemetry-published-v2",
                          poolsize=num_procs)
        test_list(d)
        test_schema(d)
    finally:
        shutil.rmtree(test_dir)
    return 0
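
# Assumed entry point for this test script (not shown in the listing above);
# it presumes "sys" is imported at module level alongside os/shutil/s3util.
if __name__ == "__main__":
    sys.exit(main())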
def run_mapper(self, mapper_id, inputs, work_dir, module, partition_count,
               delete_files, aws_key, aws_secret_key, s3_bucket):
    self.work_dir = work_dir
    print "I am mapper", mapper_id, ", and I'm mapping", len(inputs), "inputs. 0% complete."

    bytes_total = sum([f.size for f in inputs])
    bytes_completed = 0
    next_notice_pct = 5
    start = datetime.now()
    loader = None
    output_file = os.path.join(work_dir, "mapper_" + str(mapper_id))
    mapfunc = getattr(module, 'map', None)
    context = Context(output_file, partition_count)
    if not callable(mapfunc):
        print "No map function!!!"
        sys.exit(1)

    for input_file in inputs:
        if input_file.remote:
            # Lazy load the loader (so we don't do it on "local only" jobs).
            if loader is None:
                loader = s3util.Loader(os.path.join(self.work_dir, "cache"),
                                       s3_bucket, aws_key=aws_key,
                                       aws_secret_key=aws_secret_key,
                                       poolsize=1)
            for local, remote, err in loader.get_list([input_file.name]):
                if err is not None:
                    print "Failed to download", remote, ":", err

        line_num = 0
        full_filename = os.path.join(self.work_dir, "cache", input_file.name)
        for r, _ in heka_message.unpack_file(full_filename):
            msg = heka_message_parser.parse_heka_record(r)
            line_num += 1
            try:
                mapfunc(msg["meta"]["documentId"], msg, context)
            except ValueError, e:
                # TODO: increment "bad line" metrics.
                print "Bad record:", input_file.name, ":", line_num, e

        if delete_files:
            os.remove(full_filename)

        bytes_completed += input_file.size
        completed_pct = (float(bytes_completed) / bytes_total) * 100
        if completed_pct >= next_notice_pct:
            next_notice_pct += 5
            duration_sec = timer.delta_sec(start)
            completed_mb = float(bytes_completed) / 1024.0 / 1024.0
            print "Mapper %d: %.2f%% complete. Processed %.2fMB in %.2fs (%.2fMB/s)" % (
                mapper_id, completed_pct, completed_mb, duration_sec,
                completed_mb / duration_sec)
logger.log("Error: could not get queue " + config["incoming_queue"]) return -2 logger.log("Verifying that we can write to " + config["publish_bucket"]) try: publish_bucket = conn.get_bucket(config["publish_bucket"]) logger.log("Looks good!") except S3ResponseError: logger.log("Bucket {0} not found. Attempting to create it.".format( config["publish_bucket"])) publish_bucket = conn.create_bucket(config["publish_bucket"]) s3downloader = s3util.Loader(args.work_dir, config["incoming_bucket"], poolsize=num_cpus, aws_key=config.get("aws_key", None), aws_secret_key=config.get( "aws_secret_key", None)) while not done: if args.dry_run: done = True try: start = now() incoming_filenames = [] incoming_queue_messages = [] logger.log("Fetching file list from queue " + config["incoming_queue"]) if args.dry_run: logger.log("Dry run mode... can't read from the queue " \ "without messing things up...")