Example #1
class Job:
    """A class for orchestrating a Telemetry MapReduce job"""
    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        # Sanity check args.
        if config.num_mappers <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.num_reducers <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.data_dir):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.work_dir):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.job_script):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.input_filter):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.data_dir
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.work_dir
        self._input_filter = TelemetrySchema(json.load(open(config.input_filter)))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.output
        self._num_mappers = config.num_mappers
        self._num_reducers = config.num_reducers
        self._local_only = config.local_only
        self._bucket_name = config.bucket
        self._aws_key = config.aws_key
        self._aws_secret_key = config.aws_secret_key
        modulefd = open(config.job_script)
        # let the job script import additional modules under its path
        sys.path.append(os.path.dirname(config.job_script))
        ## Lifted from FileDriver.py in jydoop.
        self._job_module = imp.load_module("telemetry_job", modulefd, config.job_script, ('.py', 'U', 1))

    def dump_stats(self, partitions):
        total = sum(partitions)
        avg = total / len(partitions)
        for i in range(len(partitions)):
            print "Partition %d contained %d (%+d)" % (i, partitions[i], float(partitions[i]) - avg)

    def fetch_remotes(self, remotes):
        # TODO: fetch remotes inside Mappers, and process each one as it becomes available.
        remote_names = [ r["name"] for r in remotes if r["type"] == "remote" ]

        # TODO: check cache first.
        result = 0
        if len(remote_names) == 0:
            return result

        fetch_cwd = os.path.join(self._work_dir, "cache")
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        loader = s3util.Loader(fetch_cwd, self._bucket_name, aws_key=self._aws_key, aws_secret_key=self._aws_secret_key)
        start = datetime.now()
        downloaded_bytes = 0
        for local, remote, err in loader.get_list(remote_names):
            if err is None:
                print "Downloaded", remote
                downloaded_bytes += os.path.getsize(local)
            else:
                print "Failed to download", remote
                result += 1
        duration_sec = timer.delta_sec(start)
        downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
        print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (downloaded_mb, duration_sec, downloaded_mb / duration_sec)
        return result

    def dedupe_remotes(self, remote_files, local_files):
        return [ r for r in remote_files if os.path.join(self._input_dir, r.name) not in local_files ]

    def mapreduce(self):
        # Find files matching specified input filter
        files = self.get_filtered_files(self._input_dir)
        remote_files = self.get_filtered_files_s3()

        # If we're using the cache dir as the data dir, we will end up reading
        # each already-downloaded file twice, so we should skip any remote files
        # that exist in the data dir.
        remote_files = self.dedupe_remotes(remote_files, files)

        file_count = len(files) + len(remote_files)

        if file_count == 0:
            print "Filter didn't match any files... nothing to do"
            return

        # Not useful to have more mappers than input files.
        if file_count < self._num_mappers:
            print "Filter matched only %s input files (%s local in %s and %s " \
                  "remote from %s). Reducing number of mappers accordingly." \
                  % (file_count, len(files), self._input_dir, len(remote_files),
                      self._bucket_name)
            self._num_mappers = file_count

        # Partition files into reasonably equal groups for use by mappers
        print "Partitioning input data..."
        partitions = self.partition(files, remote_files)
        print "Done"

        def checkExitCode(proc):
            # If process was terminated by a signal, exitcode is the negative signal value
            if proc.exitcode == -signal.SIGKILL:
                # SIGKILL is most likely an OOM kill
                raise MemoryError("%s ran out of memory" % proc.name)
            elif proc.exitcode:
                raise OSError("%s exited with code %d" % (proc.name, proc.exitcode))

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                # Fetch the files we need for each mapper
                print "Fetching remotes for partition", i
                fetch_result = self.fetch_remotes(partitions[i])
                if fetch_result == 0:
                    print "Remote files fetched successfully"
                else:
                    print "ERROR: Failed to fetch", fetch_result, "files."
                    # TODO: Bail, since results will be unreliable?
                p = Process(
                        target=Mapper,
                        name=("Mapper-%d" % i),
                        args=(i, partitions[i], self._work_dir, self._job_module, self._num_reducers))
                mappers.append(p)
                p.start()
            else:
                print "Skipping mapper", i, "- no input files to process"
        for m in mappers:
            m.join()
            checkExitCode(m)

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(
                    target=Reducer,
                    name=("Reducer-%d" % i),
                    args=(i, self._work_dir, self._job_module, self._num_mappers))
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()
            checkExitCode(r)

        # Reducers are done.  Output results.
        to_combine = 1
        try:
            os.rename(os.path.join(self._work_dir, "reducer_0"), self._output_file)
        except OSError, e:
            if e.errno != errno.EXDEV:
                raise
            else:
                # OSError: [Errno 18] Invalid cross-device link (EXDEV == 18)
                # We can't rename across devices :( Copy / delete instead.
                to_combine = 0

        # TODO: If _output_file ends with a compressed suffix (.gz, .xz, .bz2, etc),
        #       try to compress it after writing.
        if self._num_reducers > to_combine:
            out = open(self._output_file, "a")
            for i in range(to_combine, self._num_reducers):
                # FIXME: this reads the entire reducer output into memory
                reducer_filename = os.path.join(self._work_dir, "reducer_" + str(i))
                reducer_output = open(reducer_filename, "r")
                out.write(reducer_output.read())
                reducer_output.close()
                os.remove(reducer_filename)

        # TODO: clean up downloaded files?

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print "Warning: Could not find", mfile
Example #2
class Job:
    """A class for orchestrating a Telemetry MapReduce job"""

    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        # Sanity check args.
        if config.get("num_mappers") <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.get("num_reducers") <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.get("data_dir")):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.get("work_dir")):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.get("job_script", "")):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.get("input_filter")):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.get("data_dir")
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.get("work_dir")
        self._input_filter = TelemetrySchema(
            json.load(open(config.get("input_filter"))))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.get("output")
        self._num_mappers = config.get("num_mappers")
        self._num_reducers = config.get("num_reducers")
        self._local_only = config.get("local_only")
        self._bucket_name = config.get("bucket")
        self._aws_key = config.get("aws_key")
        self._aws_secret_key = config.get("aws_secret_key")
        self._profile = config.get("profile")
        modulefd = open(config.get("job_script"))
        # let the job script import additional modules under its path
        sys.path.append(os.path.dirname(config.get("job_script")))
        ## Lifted from FileDriver.py in jydoop.
        self._job_module = imp.load_module("telemetry_job", modulefd,
                                           config.get("job_script"),
                                           ('.py', 'U', 1))

    def dump_stats(self, partitions):
        total = sum(partitions)
        avg = total / len(partitions)
        for i in range(len(partitions)):
            print "Partition %d contained %d (%+d)" % (
                i, partitions[i], float(partitions[i]) - avg)

    def fetch_remotes(self, remotes):
        # TODO: fetch remotes inside Mappers, and process each one as it becomes available.
        remote_names = [r["name"] for r in remotes if r["type"] == "remote"]

        # TODO: check cache first.
        result = 0
        if len(remote_names) == 0:
            return result

        fetch_cwd = os.path.join(self._work_dir, "cache")
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        loader = s3util.Loader(fetch_cwd,
                               self._bucket_name,
                               aws_key=self._aws_key,
                               aws_secret_key=self._aws_secret_key)
        start = datetime.now()
        downloaded_bytes = 0
        for local, remote, err in loader.get_list(remote_names):
            if err is None:
                print "Downloaded", remote
                downloaded_bytes += os.path.getsize(local)
            else:
                print "Failed to download", remote
                result += 1
        duration_sec = timer.delta_sec(start)
        downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
        print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (
            downloaded_mb, duration_sec, downloaded_mb / duration_sec)
        return result

    def dedupe_remotes(self, remote_files, local_files):
        return [
            r for r in remote_files
            if os.path.join(self._input_dir, r.name) not in local_files
        ]

    def mapreduce(self):
        # Find files matching specified input filter
        files = self.get_filtered_files(self._input_dir)
        remote_files = self.get_filtered_files_s3()

        # If we're using the cache dir as the data dir, we will end up reading
        # each already-downloaded file twice, so we should skip any remote files
        # that exist in the data dir.
        remote_files = self.dedupe_remotes(remote_files, files)

        file_count = len(files) + len(remote_files)

        if file_count == 0:
            print "Filter didn't match any files... nothing to do"
            return

        # Not useful to have more mappers than input files.
        if file_count < self._num_mappers:
            print "Filter matched only %s input files (%s local in %s and %s " \
                  "remote from %s). Reducing number of mappers accordingly." \
                  % (file_count, len(files), self._input_dir, len(remote_files),
                      self._bucket_name)
            self._num_mappers = file_count

        # Partition files into reasonably equal groups for use by mappers
        print "Partitioning input data..."
        partitions = self.partition(files, remote_files)
        print "Done"

        def checkExitCode(proc):
            # If process was terminated by a signal, exitcode is the negative signal value
            if proc.exitcode == -signal.SIGKILL:
                # SIGKILL is most likely an OOM kill
                raise MemoryError("%s ran out of memory" % proc.name)
            elif proc.exitcode:
                raise OSError("%s exited with code %d" %
                              (proc.name, proc.exitcode))

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                # Fetch the files we need for each mapper
                print "Fetching remotes for partition", i
                fetch_result = self.fetch_remotes(partitions[i])
                if fetch_result == 0:
                    print "Remote files fetched successfully"
                else:
                    print "ERROR: Failed to fetch", fetch_result, "files."
                    # TODO: Bail, since results will be unreliable?
                p = Process(target=Mapper,
                            name=("Mapper-%d" % i),
                            args=(i, self._profile, partitions[i],
                                  self._work_dir, self._job_module,
                                  self._num_reducers))
                mappers.append(p)
                p.start()
            else:
                print "Skipping mapper", i, "- no input files to process"
        for m in mappers:
            m.join()
            checkExitCode(m)

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(target=Reducer,
                        name=("Reducer-%d" % i),
                        args=(i, self._profile, self._work_dir,
                              self._job_module, self._num_mappers))
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()
            checkExitCode(r)

        # Reducers are done.  Output results.
        to_combine = 1
        try:
            os.rename(os.path.join(self._work_dir, "reducer_0"),
                      self._output_file)
        except OSError, e:
            if e.errno != errno.EXDEV:
                raise
            else:
                # OSError: [Errno 18] Invalid cross-device link (EXDEV == 18)
                # We can't rename across devices :( Copy / delete instead.
                to_combine = 0

        # TODO: If _output_file ends with a compressed suffix (.gz, .xz, .bz2, etc),
        #       try to compress it after writing.
        if self._num_reducers > to_combine:
            out = open(self._output_file, "a")
            for i in range(to_combine, self._num_reducers):
                # FIXME: this reads the entire reducer output into memory
                reducer_filename = os.path.join(self._work_dir,
                                                "reducer_" + str(i))
                reducer_output = open(reducer_filename, "r")
                out.write(reducer_output.read())
                reducer_output.close()
                os.remove(reducer_filename)

        # TODO: clean up downloaded files?

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print "Warning: Could not find", mfile
Example #3
class AnalysisJob:
    def __init__(self, cfg):
        self.job_bundle = cfg.job_bundle
        if cfg.input_filter:
            self.input_filter = TelemetrySchema(
                json.load(open(cfg.input_filter)))
        else:
            self.input_filter = None

        if cfg.input_list_file:
            self.input_list = cfg.input_list_file
        else:
            self.input_list = None

        self.job_id = str(uuid4())
        self.target_queue = cfg.target_queue
        self.aws_key = cfg.aws_key
        self.aws_secret_key = cfg.aws_secret_key
        self.input_bucket = "telemetry-published-v1"
        self.job_name = cfg.name
        self.job_owner = cfg.owner
        self.date_limit = cfg.date_limit

        # Bucket with intermediate data for this analysis job
        self.analysis_bucket = "jonasfj-telemetry-analysis"

        self.s3_code_path = "batch-jobs/" + self.job_id + ".tar.gz"

        # S3 region of operation
        self.aws_region = "us-west-2"
        self.task_size_limit = 400 * 1024 * 1024
        self.sqs_input_name = cfg.sqs_queue

    def get_filtered_files(self):
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        # date_limit is a hack that makes it easy to launch everything before
        # a given date... say, to back-process everything already in the bucket.
        if self.date_limit is not None:
            print "Limiting launch to keys dated before " + self.date_limit
            for k, s in self.list_partitions(bucket):
                if k.split('/')[-1].split('.')[1] < self.date_limit:
                    yield (k, s)
        else:
            for k, s in self.list_partitions(bucket):
                yield (k, s)

    def get_filtered_files_old(self):
        """ Get tuples of name and size for all input files """
        # Set up an auxiliary filter function
        allowed_values = self.input_filter.sanitize_allowed_values()
        nb_dims = len(allowed_values)

        def filter_includes(level, value):
            return self.input_filter.is_allowed(value, allowed_values[level])

        # Iterate over all files in the bucket. This is very slow and we should
        # be able to do something much smarter using prefix listing and ordering
        # to break out of the listing early.
        count = 0
        selected = 0
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        for f in bucket.list():
            count += 1
            dims = self.input_filter.get_dimensions(".", f.key)
            include = True
            for i in xrange(nb_dims):
                if not filter_includes(i, dims[i]):
                    include = False
                    break
            if include:
                selected += 1
                yield (f.key, f.size)
            if count % 5000 == 0:
                print "%i files listed with %i selected" % (count, selected)
        conn.close()

    def list_partitions(self, bucket, prefix='', level=0):
        if self.input_filter:
            #print "Listing...", prefix, level
            allowed_values = self.input_filter.sanitize_allowed_values()
            delimiter = '/'
            if level > 3:
                delimiter = '.'
            for k in bucket.list(prefix=prefix, delimiter=delimiter):
                partitions = k.name.split("/")
                if level > 3:
                    # split the last couple of partition components by "." instead of "/"
                    partitions.extend(partitions.pop().split(".", 2))
                if self.input_filter.is_allowed(partitions[level],
                                                allowed_values[level]):
                    if level >= 5:
                        for f in bucket.list(prefix=k.name):
                            yield (f.key, f.size)
                    else:
                        for k, s in self.list_partitions(
                                bucket, k.name, level + 1):
                            yield (k, s)
        elif self.input_list:
            print "Using input list..."
            for line in self.input_list:
                key_name = line.strip()
                k = bucket.get_key(key_name)
                yield (k.key, k.size)
        else:
            print "Don't know how to list partitions without a filter or list :("
            raise ValueError("Missing both input_filter and input_list")

    def generate_tasks(self):
        """ Generates SQS tasks, we batch small files into a single task """
        taskid = 1
        taskfiles = []
        tasksize = 0
        total_size_of_all = 0
        for key, size in self.get_filtered_files():
            # If the task has reached the desired size, we yield it.
            # Note: as SQS messages are limited to 65 KiB, we limit tasks to
            # 100 filenames for simplicity (boto only uses signature version 4;
            # hence we're limited to 65 KiB).
            if 0 < len(taskfiles) and (tasksize + size > self.task_size_limit
                                       or len(taskfiles) > 200):
                # Reduce to only filenames, sorted by size... smallest first, since
                # they are faster to download when handling the job.
                taskfiles = [
                    f for f, s in sorted(taskfiles, key=lambda (f, s): s)
                ]
                yield {
                    'id': self.job_id + "/" + str(taskid),
                    'name': self.job_name,
                    'owner': self.job_owner,
                    'code': self.s3_code_path,
                    'target-queue': self.target_queue,
                    'files': taskfiles,
                    'size': tasksize
                }
                total_size_of_all += tasksize
                print "%i tasks created acc. size: %s" % (taskid,
                                                          total_size_of_all)
                taskid += 1
                taskfiles = []
                tasksize = 0
            tasksize += size
            taskfiles.append((key, size))
        if len(taskfiles) > 0:
            taskfiles = [f for f, s in sorted(taskfiles, key=lambda (f, s): s)]
            yield {
                'id': self.job_id + "/" + str(taskid),
                'name': self.job_name,
                'owner': self.job_owner,
                'code': self.s3_code_path,
                'target-queue': self.target_queue,
                'files': taskfiles,
                'size': tasksize
            }
        print "Finished:"
        print "%i tasks created total size: %s" % (taskid, total_size_of_all +
                                                   tasksize)

    def put_sqs_tasks(self):
        """ Create an SQS tasks for this analysis job """
        print "Populate SQS input queue with tasks"
        # Connect to SQS in the desired region
        conn = sqs.connect_to_region(self.aws_region,
                                     aws_access_key_id=self.aws_key,
                                     aws_secret_access_key=self.aws_secret_key)
        # Look up the input queue
        queue = conn.get_queue(self.sqs_input_name)
        queue.set_message_class(JSONMessage)
        # Populate queue with tasks
        for task in self.generate_tasks():
            #print "enqueueing", task["id"], "size:", task["size"]
            msg = queue.new_message(body=task)
            queue.write(msg)
        conn.close()

    def setup(self):
        self.upload_job_bundle()
        self.put_sqs_tasks()
        print "Uploaded with job_id: %s" % self.job_id

    def upload_job_bundle(self):
        """ Upload job bundle to S3 """
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.analysis_bucket)
        k = Key(bucket)
        k.key = self.s3_code_path
        k.set_contents_from_filename(self.job_bundle)
        conn.close()
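
The date_limit check in get_filtered_files assumes the date sits in the second dot-separated field of a key's final path component. A small illustration of that parsing; the key name below is made up, not taken from the real bucket layout.

# Illustrative only -- the key name is invented.
key = "saved_session/Firefox/nightly/28.0a1/part.20131114.v2.lz4"

date_part = key.split('/')[-1].split('.')[1]
print(date_part)                 # "20131114"
# Plain string comparison works because the dates are zero-padded YYYYMMDD.
print(date_part < "20131201")    # True -> the key passes the date_limit filter
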
Example #4
class AnalysisJob:
    def __init__(self, cfg):
        self.job_bundle = cfg.job_bundle
        if cfg.input_filter:
            self.input_filter = TelemetrySchema(json.load(open(cfg.input_filter)))
        else:
            self.input_filter = None

        if cfg.input_list_file:
            self.input_list = cfg.input_list_file
        else:
            self.input_list = None

        self.job_id = str(uuid4())
        self.target_queue = cfg.target_queue
        self.aws_key = cfg.aws_key
        self.aws_secret_key = cfg.aws_secret_key
        self.input_bucket = "telemetry-published-v1"
        self.job_name = cfg.name
        self.job_owner = cfg.owner
        self.date_limit = cfg.date_limit

        # Bucket with intermediate data for this analysis job
        self.analysis_bucket = "jonasfj-telemetry-analysis"

        self.s3_code_path = "batch-jobs/" + self.job_id + ".tar.gz"

        # S3 region of operation
        self.aws_region = "us-west-2"
        self.task_size_limit = 400 * 1024 * 1024
        self.sqs_input_name = cfg.sqs_queue


    def get_filtered_files(self):
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        # date_limit is a hack that makes it easy to launch everything before
        # a given date... say, to back-process everything already in the bucket.
        if self.date_limit is not None:
            print "Limiting launch to keys dated before " + self.date_limit
            for k,s in self.list_partitions(bucket):
                if k.split('/')[-1].split('.')[1] < self.date_limit:
                    yield (k, s)
        else:
            for k,s in self.list_partitions(bucket):
                yield (k, s)

    def get_filtered_files_old(self):
        """ Get tuples of name and size for all input files """
        # Set up an auxiliary filter function
        allowed_values = self.input_filter.sanitize_allowed_values()
        nb_dims = len(allowed_values)
        def filter_includes(level, value):
            return self.input_filter.is_allowed(value, allowed_values[level])

        # Iterate over all files in the bucket. This is very slow and we should
        # be able to do something much smarter using prefix listing and ordering
        # to break out of the listing early.
        count = 0
        selected = 0
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        for f in bucket.list():
            count += 1
            dims = self.input_filter.get_dimensions(".", f.key)
            include = True
            for i in xrange(nb_dims):
                if not filter_includes(i, dims[i]):
                    include = False
                    break
            if include:
                selected += 1
                yield (f.key, f.size)
            if count % 5000 == 0:
                print "%i files listed with %i selected" % (count, selected)
        conn.close()

    def list_partitions(self, bucket, prefix='', level=0):
        if self.input_filter:
            #print "Listing...", prefix, level
            allowed_values = self.input_filter.sanitize_allowed_values()
            delimiter = '/'
            if level > 3:
                delimiter = '.'
            for k in bucket.list(prefix=prefix, delimiter=delimiter):
                partitions = k.name.split("/")
                if level > 3:
                    # split the last couple of partition components by "." instead of "/"
                    partitions.extend(partitions.pop().split(".", 2))
                if self.input_filter.is_allowed(partitions[level], allowed_values[level]):
                    if level >= 5:
                        for f in bucket.list(prefix=k.name):
                            yield (f.key, f.size)
                    else:
                        for k, s in self.list_partitions(bucket, k.name, level + 1):
                            yield (k, s)
        elif self.input_list:
            print "Using input list..."
            for line in self.input_list:
                key_name = line.strip()
                k = bucket.get_key(key_name)
                yield (k.key, k.size)
        else:
            print "Don't know how to list partitions without a filter or list :("
            raise ValueError("Missing both input_filter and input_list")

    def generate_tasks(self):
        """ Generates SQS tasks, we batch small files into a single task """
        taskid = 1
        taskfiles = []
        tasksize = 0
        total_size_of_all = 0
        for key, size in self.get_filtered_files():
            # If the task has reached the desired size, we yield it.
            # Note: as SQS messages are limited to 65 KiB, we limit tasks to
            # 100 filenames for simplicity (boto only uses signature version 4;
            # hence we're limited to 65 KiB).
            if 0 < len(taskfiles) and (tasksize + size > self.task_size_limit or
                                       len(taskfiles) > 200):
                # Reduce to only filenames, sorted by size... smallest first, since
                # they are faster to download when handling the job.
                taskfiles =  [f for f,s in sorted(taskfiles, key=lambda (f,s): s)]
                yield {
                    'id':               self.job_id + "/" +  str(taskid),
                    'name':             self.job_name,
                    'owner':            self.job_owner,
                    'code':             self.s3_code_path,
                    'target-queue':     self.target_queue,
                    'files':            taskfiles,
                    'size':             tasksize
                }
                total_size_of_all += tasksize
                print "%i tasks created acc. size: %s" % (taskid, total_size_of_all)
                taskid += 1
                taskfiles = []
                tasksize = 0
            tasksize += size
            taskfiles.append((key, size))
        if len(taskfiles) > 0:
            taskfiles =  [f for f,s in sorted(taskfiles, key=lambda (f,s): s)]
            yield {
                'id':               self.job_id + "/" + str(taskid),
                'name':             self.job_name,
                'owner':            self.job_owner,
                'code':             self.s3_code_path,
                'target-queue':     self.target_queue,
                'files':            taskfiles,
                'size':             tasksize
            }
        print "Finished:"
        print "%i tasks created total size: %s" % (taskid, total_size_of_all + tasksize)

    def put_sqs_tasks(self):
        """ Create an SQS tasks for this analysis job """
        print "Populate SQS input queue with tasks"
        # Connect to SQS in the desired region
        conn = sqs.connect_to_region(
           self.aws_region,
           aws_access_key_id = self.aws_key,
           aws_secret_access_key = self.aws_secret_key
        )
        # Look up the input queue
        queue = conn.get_queue(self.sqs_input_name)
        queue.set_message_class(JSONMessage)
        # Populate queue with tasks
        for task in self.generate_tasks():
            #print "enqueueing", task["id"], "size:", task["size"]
            msg = queue.new_message(body = task)
            queue.write(msg)
        conn.close()

    def setup(self):
        self.upload_job_bundle()
        self.put_sqs_tasks()
        print "Uploaded with job_id: %s" % self.job_id

    def upload_job_bundle(self):
        """ Upload job bundle to S3 """
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.analysis_bucket)
        k = Key(bucket)
        k.key = self.s3_code_path
        k.set_contents_from_filename(self.job_bundle)
        conn.close()
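
generate_tasks above flushes a batch once adding the next file would exceed the size limit, or once the batch already holds more than 200 entries, and sorts each batch smallest-first before yielding. A stripped-down sketch of just that batching pattern; the helper name and its interface are illustrative, not from the original code.

def batch_by_size(items, size_limit, count_limit=200):
    """Group (name, size) pairs into batches bounded by total size and count."""
    batch, batch_size = [], 0
    for name, size in items:
        # Flush before appending, mirroring generate_tasks above.
        if batch and (batch_size + size > size_limit or len(batch) > count_limit):
            yield [n for n, s in sorted(batch, key=lambda pair: pair[1])], batch_size
            batch, batch_size = [], 0
        batch.append((name, size))
        batch_size += size
    if batch:
        yield [n for n, s in sorted(batch, key=lambda pair: pair[1])], batch_size


# e.g. for names, total in batch_by_size(files, 400 * 1024 * 1024): ...
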
Example #5
class Job:
    """A class for orchestrating a Heka MapReduce job"""

    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        # Sanity check args.
        if config.get("num_mappers") <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.get("num_reducers") <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.get("data_dir")):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.get("work_dir")):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.get("job_script", "")):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.get("input_filter")):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.get("data_dir")
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.get("work_dir")
        with open(config.get("input_filter")) as filter_file:
            self._input_filter = TelemetrySchema(json.load(filter_file))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.get("output")
        self._num_mappers = config.get("num_mappers")
        self._num_reducers = config.get("num_reducers")
        self._local_only = config.get("local_only")
        self._bucket_name = config.get("bucket")
        self._aws_key = config.get("aws_key")
        self._aws_secret_key = config.get("aws_secret_key")
        self._profile = config.get("profile")
        self._delete_data = config.get("delete_data")
        with open(config.get("job_script")) as modulefd:
            # let the job script import additional modules under its path
            sys.path.append(os.path.dirname(config.get("job_script")))
            ## Lifted from FileDriver.py in jydoop.
            self._job_module = imp.load_module("telemetry_job", modulefd, config.get("job_script"), (".py", "U", 1))

    def dump_stats(self, partitions):
        total = sum(partitions)
        avg = total / len(partitions)
        for i in range(len(partitions)):
            print "Partition %d contained %d (%+d)" % (i, partitions[i], float(partitions[i]) - avg)

    def dedupe_remotes(self, remote_files, local_files):
        return (r for r in remote_files if os.path.join(self._input_dir, r.name) not in local_files)

    def mapreduce(self):
        # Find files matching specified input filter
        files = set(self.get_filtered_files(self._input_dir))
        remote_files = self.get_filtered_files_s3()

        # If we're using the cache dir as the data dir, we will end up reading
        # each already-downloaded file twice, so we should skip any remote files
        # that exist in the data dir.
        remote_files = self.dedupe_remotes(remote_files, files)

        # Partition files into reasonably equal groups for use by mappers
        print "Partitioning input data..."
        partitions = self.partition(files, remote_files)
        print "Done"

        if not any(part for part in partitions):
            print "Filter didn't match any files... nothing to do"
            return

        partitions = [part for part in partitions if part]

        # Not useful to have more mappers than partitions.
        if len(partitions) < self._num_mappers:
            print "Filter matched only %d input files. Reducing number of mappers accordingly." % (len(partitions),)
            self._num_mappers = len(partitions)

        # Free up our set of names. We want to minimize
        # our memory usage prior to forking map jobs.
        # files = None
        gc.collect()

        def checkExitCode(proc):
            # If process was terminated by a signal, exitcode is the negative signal value
            if proc.exitcode == -signal.SIGKILL:
                # SIGKILL is most likely an OOM kill
                raise MemoryError("%s ran out of memory" % proc.name)
            elif proc.exitcode:
                raise OSError("%s exited with code %d" % (proc.name, proc.exitcode))

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                p = Process(
                    target=Mapper,
                    name=("Mapper-%d" % i),
                    args=(
                        i,
                        self._profile,
                        partitions[i],
                        self._work_dir,
                        self._job_module,
                        self._num_reducers,
                        self._delete_data,
                        self._aws_key,
                        self._aws_secret_key,
                        self._bucket_name,
                    ),
                )
                mappers.append(p)
                p.start()
            else:
                print "Skipping mapper", i, "- no input files to process"
        for m in mappers:
            m.join()
            checkExitCode(m)

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(
                target=Reducer,
                name=("Reducer-%d" % i),
                args=(i, self._profile, self._work_dir, self._job_module, self._num_mappers),
            )
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()
            checkExitCode(r)

        # Reducers are done.  Output results.
        to_combine = 1
        try:
            os.rename(os.path.join(self._work_dir, "reducer_0"), self._output_file)
        except OSError, e:
            if e.errno != errno.EXDEV:
                raise
            else:
                # OSError: [Errno 18] Invalid cross-device link (EXDEV == 18)
                # We can't rename across devices :( Copy / delete instead.
                to_combine = 0

        # TODO: If _output_file ends with a compressed suffix (.gz, .xz, .bz2, etc),
        #       try to compress it after writing.
        if self._num_reducers > to_combine:
            with open(self._output_file, "a") as out:
                for i in range(to_combine, self._num_reducers):
                    # FIXME: this reads the entire reducer output into memory
                    reducer_filename = os.path.join(self._work_dir, "reducer_" + str(i))
                    with open(reducer_filename, "r") as reducer_output:
                        out.write(reducer_output.read())
                    os.remove(reducer_filename)

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print "Warning: Could not find", mfile
Example #6
class Job:
    """A class for orchestrating a Heka MapReduce job"""
    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        # Sanity check args.
        if config.get("num_mappers") <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.get("num_reducers") <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.get("data_dir")):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.get("work_dir")):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.get("job_script", "")):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.get("input_filter")):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.get("data_dir")
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.get("work_dir")
        with open(config.get("input_filter")) as filter_file:
            self._input_filter = TelemetrySchema(json.load(filter_file))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.get("output")
        self._num_mappers = config.get("num_mappers")
        self._num_reducers = config.get("num_reducers")
        self._local_only = config.get("local_only")
        self._bucket_name = config.get("bucket")
        self._aws_key = config.get("aws_key")
        self._aws_secret_key = config.get("aws_secret_key")
        self._profile = config.get("profile")
        self._delete_data = config.get("delete_data")
        with open(config.get("job_script")) as modulefd:
            # let the job script import additional modules under its path
            sys.path.append(os.path.dirname(config.get("job_script")))
            ## Lifted from FileDriver.py in jydoop.
            self._job_module = imp.load_module(
                "telemetry_job", modulefd, config.get("job_script"), ('.py', 'U', 1))

    def dump_stats(self, partitions):
        total = sum(partitions)
        avg = total / len(partitions)
        for i in range(len(partitions)):
            print "Partition %d contained %d (%+d)" % (i, partitions[i], float(partitions[i]) - avg)

    def dedupe_remotes(self, remote_files, local_files):
        return ( r for r in remote_files
                   if os.path.join(self._input_dir, r.name) not in local_files )

    def mapreduce(self):
        # Find files matching specified input filter
        files = set(self.get_filtered_files(self._input_dir))
        remote_files = self.get_filtered_files_s3()

        # If we're using the cache dir as the data dir, we will end up reading
        # each already-downloaded file twice, so we should skip any remote files
        # that exist in the data dir.
        remote_files = self.dedupe_remotes(remote_files, files)

        # Partition files into reasonably equal groups for use by mappers
        print "Partitioning input data..."
        partitions = self.partition(files, remote_files)
        print "Done"

        if not any(part for part in partitions):
             print "Filter didn't match any files... nothing to do"
             return

        partitions = [part for part in partitions if part]

        # Not useful to have more mappers than partitions.
        if len(partitions) < self._num_mappers:
            print "Filter matched only %d input files. Reducing number of mappers accordingly." % (
                  len(partitions),)
            self._num_mappers = len(partitions)

        # Free up our set of names. We want to minimize
        # our memory usage prior to forking map jobs.
        #files = None
        gc.collect()

        def checkExitCode(proc):
            # If process was terminated by a signal, exitcode is the negative signal value
            if proc.exitcode == -signal.SIGKILL:
                # SIGKILL is most likely an OOM kill
                raise MemoryError("%s ran out of memory" % proc.name)
            elif proc.exitcode:
                raise OSError("%s exited with code %d" % (proc.name, proc.exitcode))

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                p = Process(
                        target=Mapper,
                        name=("Mapper-%d" % i),
                        args=(i, self._profile, partitions[i], self._work_dir, self._job_module, self._num_reducers, self._delete_data, self._aws_key, self._aws_secret_key, self._bucket_name))
                mappers.append(p)
                p.start()
            else:
                print "Skipping mapper", i, "- no input files to process"
        for m in mappers:
            m.join()
            checkExitCode(m)

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(
                    target=Reducer,
                    name=("Reducer-%d" % i),
                    args=(i, self._profile, self._work_dir, self._job_module, self._num_mappers))
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()
            checkExitCode(r)

        # Reducers are done.  Output results.
        to_combine = 1
        try:
            os.rename(os.path.join(self._work_dir, "reducer_0"), self._output_file)
        except OSError, e:
            if e.errno != errno.EXDEV:
                raise
            else:
                # OSError: [Errno 18] Invalid cross-device link (EXDEV == 18)
                # We can't rename across devices :( Copy / delete instead.
                to_combine = 0

        # TODO: If _output_file ends with a compressed suffix (.gz, .xz, .bz2, etc),
        #       try to compress it after writing.
        if self._num_reducers > to_combine:
            with open(self._output_file, "a") as out:
                for i in range(to_combine, self._num_reducers):
                    # FIXME: this reads the entire reducer output into memory
                    reducer_filename = os.path.join(self._work_dir, "reducer_" + str(i))
                    with open(reducer_filename, "r") as reducer_output:
                        out.write(reducer_output.read())
                    os.remove(reducer_filename)

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print "Warning: Could not find", mfile