Example #1
    def fetch_remotes(self, remotes):
        # TODO: fetch remotes inside Mappers, and process each one as it becomes available.
        remote_names = [r["name"] for r in remotes if r["type"] == "remote"]

        # TODO: check cache first.
        result = 0
        if len(remote_names) == 0:
            return result

        fetch_cwd = os.path.join(self._work_dir, "cache")
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        loader = s3util.Loader(fetch_cwd,
                               self._bucket_name,
                               aws_key=self._aws_key,
                               aws_secret_key=self._aws_secret_key)
        start = datetime.now()
        downloaded_bytes = 0
        for local, remote, err in loader.get_list(remote_names):
            if err is None:
                print "Downloaded", remote
                downloaded_bytes += os.path.getsize(local)
            else:
                print "Failed to download", remote
                result += 1
        duration_sec = timer.delta_sec(start)
        downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
        print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (
            downloaded_mb, duration_sec, downloaded_mb / duration_sec)
        return result
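
Taken together, the examples on this page suggest the following s3util.Loader usage pattern: construct it with a local download directory and an S3 bucket name (plus optional aws_key, aws_secret_key and poolsize arguments), then iterate over get_list(), which yields a (local_path, remote_name, error) tuple for each requested key as it finishes downloading. The sketch below is inferred from these examples rather than from s3util's documentation; the cache directory, bucket name and key names are placeholders.

# Minimal s3util.Loader sketch, inferred from the examples on this page.
# The cache directory, "example-bucket" and the key names are placeholders.
import os
import s3util

cache_dir = "/tmp/s3util-cache"
if not os.path.isdir(cache_dir):
    os.makedirs(cache_dir)

loader = s3util.Loader(cache_dir, "example-bucket", poolsize=4)
failures = 0
for local, remote, err in loader.get_list(["some/key.txt", "other/key.txt"]):
    if err is None:
        print "Downloaded", remote, "to", local
    else:
        print "Failed to download", remote, ":", err
        failures += 1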
Example #2
    def __init__(self, config, data_dir, pattern, keep_backups=False):
        self.bucket = config["incoming_bucket"]
        self.queue = config.get("incoming_queue", None)
        self.aws_key = config.get("aws_key", None)
        self.aws_secret_key = config.get("aws_secret_key", None)
        self.aws_region = config.get("aws_region", None)
        self.data_dir = data_dir
        self.pattern = pattern
        self.keep_backups = keep_backups
        if self.queue is not None:
            # Get a connection to the Queue
            conn = boto.sqs.connect_to_region(
                self.aws_region,
                aws_access_key_id=self.aws_key,
                aws_secret_access_key=self.aws_secret_key)

            # This gets the queue if it already exists, otherwise returns None.
            self.q_incoming = conn.get_queue(self.queue)
            if self.q_incoming is None:
                raise ValueError("Failed to get queue " + self.queue)
        self.s3loader = s3util.Loader(self.data_dir, self.bucket, self.aws_key,
                                      self.aws_secret_key)

        # Make sure the target S3 bucket exists.
        s3conn = S3Connection(self.aws_key, self.aws_secret_key)
        try:
            print "Verifying that we can write to", self.bucket
            b = s3conn.get_bucket(self.bucket)
            print "Looks good!"
        except S3ResponseError:
            print "Bucket", self.bucket, "not found.  Attempting to create it."
            b = s3conn.create_bucket(self.bucket)
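
Example #2 combines three setup steps: connecting to an optional SQS queue, constructing the s3util.Loader, and verifying that the target S3 bucket exists (creating it if it does not). The bucket check on its own reduces to the following boto sketch, condensed from the lines above; the bucket name and credentials are placeholders.

# Get-or-create bucket check, condensed from Example #2.
# The bucket name and credentials are placeholders.
from boto.s3.connection import S3Connection
from boto.exception import S3ResponseError

s3conn = S3Connection("AWS_KEY", "AWS_SECRET_KEY")
try:
    bucket = s3conn.get_bucket("example-bucket")
except S3ResponseError:
    print "Bucket not found. Attempting to create it."
    bucket = s3conn.create_bucket("example-bucket")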
Example #3
    def run_mapper(self, mapper_id, inputs, work_dir, module, partition_count, delete_files, aws_key, aws_secret_key, s3_bucket):
        self.work_dir = work_dir

        print "I am mapper", mapper_id, ", and I'm mapping", len(inputs), "inputs. 0% complete."

        bytes_total = sum([f.size for f in inputs])
        bytes_completed = 0
        next_notice_pct = 10

        start = datetime.now()

        loader = None
        output_file = os.path.join(work_dir, "mapper_" + str(mapper_id))
        mapfunc = getattr(module, 'map', None)
        context = Context(output_file, partition_count)
        if not callable(mapfunc):
            print "No map function!!!"
            sys.exit(1)

        # TODO: Stream/decompress the files directly.
        for input_file in inputs:
            if input_file.remote:
                # TODO: check if the file already exists locally.
                # Lazy load the loader (so we don't do it on "local only" jobs).
                if loader is None:
                    loader = s3util.Loader(os.path.join(self.work_dir, "cache"), s3_bucket, aws_key=aws_key, aws_secret_key=aws_secret_key, poolsize=1)
                for local, remote, err in loader.get_list([input_file.name]):
                    if err is not None:
                        print "Failed to download", remote, ":", err

            try:
                handle = self.open_input_file(input_file)
            except Exception:
                print "Error opening", input_file.name, "(skipping)"
                traceback.print_exc(file=sys.stderr)
                continue
            line_num = 0
            for line in handle:
                line_num += 1
                try:
                    # Remove the trailing EOL character(s) before passing to
                    # the map function.
                    key, value = line.rstrip('\r\n').split("\t", 1)
                    mapfunc(key, input_file.dimensions, value, context)
                except ValueError, e:
                    # TODO: increment "bad line" metrics.
                    print "Bad line:", input_file.name, ":", line_num, e
            handle.close()
            if delete_files:
                print "Removing", input_file.name
                os.remove(handle.filename)
            bytes_completed += input_file.size
            completed_pct = (float(bytes_completed) / bytes_total) * 100
            if completed_pct >= next_notice_pct:
                next_notice_pct += 10
                duration_sec = timer.delta_sec(start)
                completed_mb = float(bytes_completed) / 1024.0 / 1024.0
                print "Mapper %d: %.2f%% complete. Processed %.2fMB in %.2fs (%.2fMB/s)" % (mapper_id, completed_pct, completed_mb, duration_sec, completed_mb / duration_sec)
Example #4
def main():
    try:
        assert not os.path.exists(test_dir)
        os.makedirs(test_dir)
        num_procs = 15
        print "Running with", num_procs, "processes."
        d = s3util.Loader(test_dir,
                          "telemetry-published-v2",
                          poolsize=num_procs)
        test_list(d)
        test_schema(d)
    finally:
        shutil.rmtree(test_dir)
    return 0
Example #5
    def run_mapper(self, mapper_id, inputs, work_dir, module, partition_count, delete_files, aws_key, aws_secret_key, s3_bucket):
        self.work_dir = work_dir

        print "I am mapper", mapper_id, ", and I'm mapping", len(inputs), "inputs. 0% complete."

        bytes_total = sum([f.size for f in inputs])
        bytes_completed = 0
        next_notice_pct = 5
        start = datetime.now()

        loader = None
        output_file = os.path.join(work_dir, "mapper_" + str(mapper_id))
        mapfunc = getattr(module, 'map', None)
        context = Context(output_file, partition_count)
        if not callable(mapfunc):
            print "No map function!!!"
            sys.exit(1)

        for input_file in inputs:
            if input_file.remote:
                # Lazy load the loader (so we don't do it on "local only" jobs).
                if loader is None:
                    loader = s3util.Loader(os.path.join(self.work_dir, "cache"), s3_bucket, aws_key=aws_key, aws_secret_key=aws_secret_key, poolsize=1)

                for local, remote, err in loader.get_list([input_file.name]):
                    if err is not None:
                        print "Failed to download", remote, ":", err
            line_num = 0
            full_filename = os.path.join(self.work_dir, "cache", input_file.name)

            for r, _ in heka_message.unpack_file(full_filename):
                msg = heka_message_parser.parse_heka_record(r)
                line_num += 1
                try:
                    mapfunc(msg["meta"]["documentId"], msg, context)
                except ValueError, e:
                    # TODO: increment "bad line" metrics.
                    print "Bad record:", input_file.name, ":", line_num, e
            if delete_files:
                os.remove(full_filename)

            bytes_completed += input_file.size
            completed_pct = (float(bytes_completed) / bytes_total) * 100
            if completed_pct >= next_notice_pct:
                next_notice_pct += 5
                duration_sec = timer.delta_sec(start)
                completed_mb = float(bytes_completed) / 1024.0 / 1024.0
                print "Mapper %d: %.2f%% complete. Processed %.2fMB in %.2fs (%.2fMB/s)" % (mapper_id, completed_pct, completed_mb, duration_sec, completed_mb / duration_sec)
Example #6
            logger.log("Error: could not get queue " +
                       config["incoming_queue"])
            return -2

        logger.log("Verifying that we can write to " +
                   config["publish_bucket"])
        try:
            publish_bucket = conn.get_bucket(config["publish_bucket"])
            logger.log("Looks good!")
        except S3ResponseError:
            logger.log("Bucket {0} not found. Attempting to create it.".format(
                config["publish_bucket"]))
            publish_bucket = conn.create_bucket(config["publish_bucket"])
        s3downloader = s3util.Loader(args.work_dir,
                                     config["incoming_bucket"],
                                     poolsize=num_cpus,
                                     aws_key=config.get("aws_key", None),
                                     aws_secret_key=config.get(
                                         "aws_secret_key", None))

    while not done:
        if args.dry_run:
            done = True
        try:
            start = now()
            incoming_filenames = []
            incoming_queue_messages = []
            logger.log("Fetching file list from queue " +
                       config["incoming_queue"])
            if args.dry_run:
                logger.log("Dry run mode... can't read from the queue " \
                           "without messing things up...")