def test_more_allowed(self):
        """Check is_allowed() for a min/max range and a single literal string.

        submission_date is restricted by a {"min", "max"} mapping and
        appBuildID by one plain string; both rules are read back from
        sanitize_allowed_values() and probed with matching and non-matching
        candidates.
        """
        def dimension(field_name, allowed_values):
            # One entry of the spec's "dimensions" list.
            return {"field_name": field_name, "allowed_values": allowed_values}

        schema = TelemetrySchema({
            "version": 1,
            "dimensions": [
                dimension("reason", ["saved-session"]),
                dimension("appName", "*"),
                dimension("appUpdateChannel", ["nightly"]),
                dimension("appVersion", "*"),
                dimension("appBuildID", "one_specific_build"),
                dimension("submission_date", {"min": "20130908", "max": "20140401"}),
            ]
        })
        allowed = schema.sanitize_allowed_values()

        # submission_date: both end points and values in between must pass,
        # anything before "min" or after "max" must not.
        date_rule = allowed[5]
        for accepted in ("20130908", "20140401", "20130909", "20140101"):
            self.assertTrue(schema.is_allowed(accepted, date_rule))
        for rejected in ("20130907", "20000000", "20140402", "99999999"):
            self.assertFalse(schema.is_allowed(rejected, date_rule))

        # appBuildID: only the exact string is expected to match.
        build_rule = allowed[4]
        self.assertTrue(schema.is_allowed("one_specific_build", build_rule))
        for rejected in ("two_specific_build", "*", "one_specific_build ", "one-specific-build"):
            self.assertFalse(schema.is_allowed(rejected, build_rule))
Ejemplo n.º 2
0
    def test_more_allowed(self):
        """Exercise is_allowed() with a date range and a single-string rule.

        The spec limits submission_date with a min/max mapping and appBuildID
        with one plain string; every other dimension uses a list or "*".
        """
        field_names = ["reason", "appName", "appUpdateChannel",
                       "appVersion", "appBuildID", "submission_date"]
        rules = [
            ["saved-session"],
            "*",
            ["nightly"],
            "*",
            "one_specific_build",
            {"min": "20130908", "max": "20140401"},
        ]
        dimensions = []
        for field_name, rule in zip(field_names, rules):
            dimensions.append({"field_name": field_name, "allowed_values": rule})

        schema = TelemetrySchema({"version": 1, "dimensions": dimensions})
        allowed = schema.sanitize_allowed_values()

        # (rule index, candidate, expected verdict), in the order they are checked.
        checks = [
            (5, "20130908", True),
            (5, "20140401", True),
            (5, "20130909", True),
            (5, "20140101", True),
            (5, "20130907", False),
            (5, "20000000", False),
            (5, "20140402", False),
            (5, "99999999", False),
            (4, "one_specific_build", True),
            (4, "two_specific_build", False),
            (4, "*", False),
            (4, "one_specific_build ", False),
            (4, "one-specific-build", False),
        ]
        for index, candidate, expected in checks:
            verdict = schema.is_allowed(candidate, allowed[index])
            if expected:
                self.assertTrue(verdict)
            else:
                self.assertFalse(verdict)
Ejemplo n.º 3
0
class Job:
    """A class for orchestrating a Telemetry MapReduce job"""
    DOWNLOAD_BATCH_SIZE = 100
    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        """Validate the job configuration, then load the filter and job script.

        config must expose the attributes read below: num_mappers,
        num_reducers, data_dir, work_dir, job_script, input_filter, output,
        local_only, bucket, aws_key and aws_secret_key.

        Raises ValueError when a mapper/reducer count is not positive, when
        data_dir or work_dir is not an existing directory, or when job_script
        or input_filter is not an existing file.
        """
        # Sanity check args.
        if config.num_mappers <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.num_reducers <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.data_dir):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.work_dir):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.job_script):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.input_filter):
            raise ValueError("Input filter must be a valid json file")

        # Drop a single trailing path separator from the input directory.
        self._input_dir = config.data_dir
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.work_dir
        # Close the filter file as soon as it has been parsed.
        with open(config.input_filter) as filter_file:
            self._input_filter = TelemetrySchema(json.load(filter_file))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.output
        self._num_mappers = config.num_mappers
        self._num_reducers = config.num_reducers
        self._local_only = config.local_only
        self._bucket_name = config.bucket
        self._aws_key = config.aws_key
        self._aws_secret_key = config.aws_secret_key
        modulefd = open(config.job_script)
        try:
            ## Lifted from FileDriver.py in jydoop.
            self._job_module = imp.load_module("telemetry_job", modulefd, config.job_script, ('.py', 'U', 1))
        finally:
            # imp.load_module() leaves closing the file to the caller, even
            # when loading the module fails.
            modulefd.close()

    def dump_stats(self, partitions):
        """Print each partition's total and its signed difference from the mean.

        partitions is a sequence of per-partition totals; partition() passes
        its list of accumulated file sizes.  Nothing is printed when the
        sequence is empty.
        """
        # mapreduce() lowers the mapper count to the number of input files, so
        # a run with no input hands over an empty list.  Return instead of
        # dividing by zero.
        if not partitions:
            return
        # NOTE(review): with integer totals under Python 2 this division
        # truncates, so the printed deltas are relative to the floored mean.
        avg = sum(partitions) / len(partitions)
        for i, size in enumerate(partitions):
            print("Partition %d contained %d (%+d)" % (i, size, float(size) - avg))

    def fetch_remotes(self, remotes):
        """Download the remote entries of remotes into <work_dir>/cache.

        remotes is a list of dicts with at least "type", "name" and "size"
        keys (the shape partition() produces).  Entries whose "type" is
        "remote" are fetched by a single s3funnel invocation run inside the
        cache directory, which is created on demand.

        Returns the s3funnel exit status, or 0 when there was nothing to fetch.
        """
        # TODO: download remotes in groups of size DOWNLOAD_BATCH_SIZE
        remote_entries = [r for r in remotes if r["type"] == "remote"]

        # TODO: check cache first.
        if not remote_entries:
            return 0

        fetch_cwd = os.path.join(self._work_dir, "cache")
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)

        # NOTE(review): the AWS key and secret are passed as command-line
        # arguments, which may expose them to other users through the process
        # list - confirm this is acceptable on the hosts running the job.
        fetch_cmd = [
            "/usr/local/bin/s3funnel",
            self._bucket_name,
            "get",
            "-a", self._aws_key,
            "-s", self._aws_secret_key,
            "-t", "8",
        ]
        fetch_cmd.extend(r["name"] for r in remote_entries)

        start = datetime.now()
        result = subprocess.call(fetch_cmd, cwd=fetch_cwd)
        delta = datetime.now() - start
        # Include whole days so very long downloads are not under-reported.
        duration_sec = delta.days * 86400.0 + float(delta.seconds) + float(delta.microseconds) / 1000000
        downloaded_bytes = sum(r["size"] for r in remote_entries)
        downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
        if duration_sec > 0:
            rate = downloaded_mb / duration_sec
        else:
            # The clock did not advance, so no meaningful rate can be computed.
            rate = 0.0
        print("Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (downloaded_mb, duration_sec, rate))
        return result

    def mapreduce(self):
        """Run the whole job: select input, map, reduce, merge and clean up.

        Steps, in order:
          1. Collect matching local files and (unless local-only) S3 keys.
          2. Lower the mapper count to the number of input files if needed.
          3. Partition the files and start one Mapper process per non-empty
             partition, fetching that partition's remote files first.
          4. Start one Reducer process per reducer and wait for all of them.
          5. Rename reducer_0 to the output file, append the remaining
             reducer outputs to it and delete them.
          6. Delete the mapper_<m>_<r> intermediate files from the work dir.
        """
        # Find files matching specified input filter
        files = self.local_files()
        remote_files = self.get_filtered_files_s3()

        file_count = len(files) + len(remote_files)
        # Not useful to have more mappers than input files.
        if file_count < self._num_mappers:
            print("There are only %d input files. Reducing number of mappers accordingly." % file_count)
            self._num_mappers = file_count

        # Partition files into reasonably equal groups for use by mappers
        print("Partitioning input data...")
        partitions = self.partition(files, remote_files)
        print("Done")

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                # Fetch the files we need for each mapper
                print("Fetching remotes for partition %d" % i)
                fetch_status = self.fetch_remotes(partitions[i])
                if fetch_status != 0:
                    # Keep going (best effort), but do not hide the failure.
                    print("Warning: fetching remotes for partition %d exited with status %s" % (i, fetch_status))
                print("Done")
                p = Process(
                        target=Mapper,
                        args=(i, partitions[i], self._work_dir, self._job_module, self._num_reducers))
                mappers.append(p)
                p.start()
            else:
                print("Skipping mapper %d - no input files to process" % i)
        for m in mappers:
            m.join()

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(
                    target=Reducer,
                    args=(i, self._work_dir, self._job_module, self._num_mappers))
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()

        # Reducers are done.  Output results.
        os.rename(os.path.join(self._work_dir, "reducer_0"), self._output_file)
        if self._num_reducers > 1:
            chunk_size = 1024 * 1024
            # The with-blocks close (and flush) both files even if a copy fails.
            with open(self._output_file, "a") as out:
                for i in range(1, self._num_reducers):
                    reducer_filename = os.path.join(self._work_dir, "reducer_" + str(i))
                    with open(reducer_filename, "r") as reducer_output:
                        # Copy in bounded chunks rather than loading the whole
                        # reducer output into memory.
                        chunk = reducer_output.read(chunk_size)
                        while chunk:
                            out.write(chunk)
                            chunk = reducer_output.read(chunk_size)
                    os.remove(reducer_filename)

        # TODO: clean up downloaded files?

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print("Warning: Could not find %s" % mfile)

    def local_files(self):
        """Return the local input files selected by the input filter.

        Files under the schema's invalid-data directory are appended only
        when the filter's _include_invalid flag is set.
        """
        selected = self.get_filtered_files(self._input_dir)
        if not self._input_filter._include_invalid:
            return selected
        invalid_dir = os.path.join(self._input_dir, TelemetrySchema.INVALID_DIR)
        selected += self.get_filtered_files(invalid_dir)
        return selected

    # Split up the input files into groups of approximately-equal on-disk size.
    def partition(self, files, remote_files):
        """Split the input files into self._num_mappers groups of similar size.

        files holds local paths and remote_files holds S3 key names.  Every
        file is described by a dict with the keys "type" ("local" or
        "remote"), "name", "size" and "dimensions".  Local sizes come from
        os.stat(), remote sizes from the S3 key.  Remote names that can no
        longer be looked up are reported and skipped.

        Returns a list of self._num_mappers lists of those dicts.  The
        per-partition size totals are printed through dump_stats().
        """
        entries = []
        for name in files:
            entries.append({
                "type": "local",
                "name": name,
                "size": os.stat(name).st_size,
                "dimensions": self._input_filter.get_dimensions(self._input_dir, name),
            })

        # TODO: if this is too slow, just distribute remote files round-robin.
        if len(remote_files) > 0:
            conn = S3Connection(self._aws_key, self._aws_secret_key)
            try:
                bucket = conn.get_bucket(self._bucket_name)
                for r in remote_files:
                    key = bucket.lookup(r)
                    if key is None:
                        # The key vanished between listing and lookup; skip it
                        # instead of failing on key.size.
                        print("Warning: Could not find remote file %s" % r)
                        continue
                    entries.append({
                        "type": "remote",
                        "name": r,
                        "size": key.size,
                        "dimensions": self._input_filter.get_dimensions(".", r),
                    })
            finally:
                # Release the connection, as get_filtered_files_s3() does.
                conn.close()

        partitions = [[] for _ in range(self._num_mappers)]
        sums = [0] * self._num_mappers

        # Greedily assign the largest remaining file to the currently smallest
        # partition.  Sorting first is what makes "largest" true.
        min_idx = 0
        for entry in sorted(entries, key=lambda e: e["size"], reverse=True):
            partitions[min_idx].append(entry)
            sums[min_idx] += entry["size"]
            min_idx = find_min_idx(sums)

        # Print out some info to see how balanced the partitions were:
        self.dump_stats(sums)

        return partitions

    def get_filtered_files(self, searchdir):
        """Walk searchdir and return the paths of files the filter accepts.

        A directory's depth below searchdir is used as the dimension index
        when deciding whether to descend into its subdirectories.  For each
        file, the dimensions from that depth up to the last filtered
        dimension must all pass filter_includes().
        """
        base_depth = searchdir.count(os.path.sep)
        dimension_count = len(self._allowed_values)
        matches = []
        for root, dirs, files in os.walk(searchdir):
            level = root.count(os.path.sep) - base_depth
            # Prune in place so os.walk() does not descend into excluded dirs.
            dirs[:] = [d for d in dirs if self.filter_includes(level, d)]
            for f in files:
                full_filename = os.path.join(root, f)
                dims = self._input_filter.get_dimensions(searchdir, full_filename)
                # all() stops at the first dimension that is filtered out.
                if all(self.filter_includes(l, dims[l]) for l in range(level, dimension_count)):
                    matches.append(full_filename)
        return matches

    def get_filtered_files_s3(self):
        """Return the names of the S3 keys accepted by the input filter.

        Returns an empty list without contacting S3 when the job is
        local-only.  Otherwise every key in the configured bucket is listed
        and kept when each of its dimensions passes filter_includes().
        """
        out_files = []
        if self._local_only:
            return out_files

        print("Fetching file list from S3...")
        # Plain boto should be fast enough to list bucket contents.
        conn = S3Connection(self._aws_key, self._aws_secret_key)
        try:
            bucket = conn.get_bucket(self._bucket_name)

            # TODO: potential optimization - if our input filter is reasonably
            #       restrictive and/or our list of keys is very long, it may be
            #       a win to use the "prefix" and "delimiter" params.
            for f in bucket.list():
                dims = self._input_filter.get_dimensions(".", f.name)
                # all() stops at the first dimension that is filtered out.
                if all(self.filter_includes(i, dims[i]) for i in range(len(self._allowed_values))):
                    out_files.append(f.name)
        finally:
            # Close the connection even when listing or filtering raises.
            conn.close()
        print("Done!")
        return out_files

    def filter_includes(self, level, value):
        """Tell whether value passes the input filter at dimension index level.

        The schema's invalid-data directory name is always rejected at level
        0, whatever the filter allows there.
        """
        if level == 0:
            # Filter out 'invalid' data.  It is included explicitly if needed.
            if value == TelemetrySchema.INVALID_DIR:
                return False
        return self._input_filter.is_allowed(value, self._allowed_values[level])
class TelemetrySchemaTest(unittest.TestCase):
    def setUp(self):
        """Build the schema under test and keep its sanitized allowed values."""
        spec = self.get_schema_spec()
        self.schema = TelemetrySchema(spec)
        self.allowed_values = self.schema.sanitize_allowed_values()

    def get_schema_spec(self):
        """Return the version-1 schema spec used by setUp().

        Six dimensions are declared in order; each is either limited to a
        list of values or given the "*" marker.
        """
        wildcard = "*"
        dimension_rules = [
            ("reason", ["saved-session"]),
            ("appName", wildcard),
            ("appUpdateChannel", ["nightly"]),
            ("appVersion", wildcard),
            ("appBuildID", wildcard),
            ("submission_date", ["20130908"]),
        ]
        dimensions = [
            {"field_name": name, "allowed_values": values}
            for name, values in dimension_rules
        ]
        return {"version": 1, "dimensions": dimensions}
    def get_file_list(self):
        """Return the fixed list of sample paths used by test_filtering().

        The first entry is an unrelated absolute path.  Every other entry is
        a .lzma log path under "processed/" or "garbage/".
        """
        nightly_dir = "/Firefox/nightly/26.0a1/"
        idle = "processed/idle_daily" + nightly_dir
        saved = "processed/saved_session" + nightly_dir
        garbage = "garbage/idle_daily" + nightly_dir
        # (directory prefix, file name without the ".lzma" suffix), in order.
        listing = [
            (idle, "20130826074752.20130903.log.e0c7ff434e474c8aa745763eed408b9c"),
            (garbage, "20130826074752.20130903.log.e0c7ff434e474c8aa745763eed408b9c"),
            (idle, "20130901030218.20130903.log.17ff07fda8994e23baf983550246a94b"),
            (idle, "20130819030205.20130907.log.ec6f22acd37349b3b5ef03da1cc150da"),
            (idle, "20130807161117.20130903.log.64478ac84a734677bc14cbcf6cc114b7"),
            (idle, "20130814030204.20130906.log.a382f1337d1f47ef8aad08f8fb14a79a"),
            (idle, "20130830030205.20130903.log.c86e2c3f31c043ac8fc311d5dd1abc28"),
            (idle, "20130823151250.20130907.log.939bec39c3d24c89a09834463b220d9a"),
            (idle, "20130830030205.20130906.log.0bf2c1edf2634ca5bdc865a54957a690"),
            (idle, "20130826074752.20130903.log.8e33cc0f130849dfbb8afe7331123be3"),
            (idle, "20130826074752.20130902.log.2349f0434be64c6684f91eccabf9b3e6"),
            (idle, "20130826030203.20130902.log.57a017a3378b420cbbfb666532606b16"),
            (saved, "20130902030220.20130908.log.7e9556a5e32b4990a9d378eea65f57a9"),
            (saved, "20130829030201.20130909.log.c227775e57e24854b1aac7c21c59f85c"),
            (saved, "20130826030203.20130906.log.88620da62e77482285f28d5ea69beb1e"),
            (saved, "20130826030203.20130908.log.cc1b0c52365947c38ac2636f3384503c"),
            (saved, "20130830030205.20130906.log.77905bb7503a4a98aa7231b10073f47e"),
            (saved, "20130823030204.20130908.log.55f4ab6ada3c4e1d939f24b5da7f8dc2"),
            (saved, "20130902030220.20130908.log.f213918c08804d449d30e1aaec70089a"),
            (idle, "20130813030205.20130907.log.24c445d3d2c241bcb5001a63a78e98fa"),
            (idle, "20130831030224.20130902.log.ebe3cd20fa264cd19aab02b8ffe8cbf1"),
            (idle, "20130821030213.20130906.log.778737ad596d43e4a5e9e59c38428b61"),
            (idle, "20130809030203.20130903.log.43ae292120ca475589b20be24fa70171"),
            (idle, "20130814141812.20130907.log.7c6c5d65b702443cac2768eb6f0e3c91"),
            (idle, "20130823030204.20130903.log.f73682ebc57a4661a6f48a2a5cf2629c"),
            (idle, "20130821050136.20130904.log.2d423ec779e04113996914ce81e27bfe"),
        ]
        return ["/foo/bar/baz/bla.txt"] + [prefix + stem + ".lzma" for prefix, stem in listing]

    def test_filtering(self):
        """Classify the sample paths and check the resulting counts.

        A path is included when each of its dimensions (taken relative to
        "processed") is allowed.  A path whose processing raises ValueError
        is counted as an error and excluded.  The test expects 4 included
        paths and 2 errors.
        """
        candidates = self.get_file_list()
        included_files = []
        excluded_files = []
        error_files = []
        dimension_count = len(self.allowed_values)
        for path in candidates:
            try:
                dims = self.schema.get_dimensions("processed", path)
                # all() stops at the first dimension that is not allowed.
                include = all(
                    self.schema.is_allowed(dims[idx], self.allowed_values[idx])
                    for idx in range(dimension_count))
            except ValueError:
                include = False
                error_files.append(path)
            bucket = included_files if include else excluded_files
            bucket.append(path)

        self.assertEqual(len(included_files), 4)
        self.assertEqual(len(error_files), 2)
        # Every path lands in exactly one of the two buckets.
        self.assertEqual(len(candidates), (len(excluded_files) + len(included_files)))

    def test_safe_filename(self):
        """Check safe_filename() against a few raw/expected sample pairs."""
        samples = (
            ("Hello World!", "Hello_World_"),
            ("what\nam\ni", "what_am_i"),
            ("saved-session", "saved_session"),
        )
        for raw, expected in samples:
            self.assertEqual(self.schema.safe_filename(raw), expected)

    def test_sanitize_allowed_values(self):
        """The first allowed "reason" value is expected with "-" turned into "_"."""
        reason_values = self.allowed_values[0]
        self.assertEqual(reason_values[0], "saved_session")

    def test_allowed_values(self):
        """get_allowed_value() keeps an allowed value and replaces any other.

        The replacement is expected to be TelemetrySchema.DISALLOWED_VALUE.
        """
        reason_rule = self.allowed_values[0]
        # (candidate, expected result), checked in this order.
        expectations = [
            ("saved_session", "saved_session"),
            ("anything_else", TelemetrySchema.DISALLOWED_VALUE),
        ]
        for candidate, expected in expectations:
            self.assertEqual(self.schema.get_allowed_value(candidate, reason_rule), expected)

    def test_apply_schema(self):
        """apply_schema() output is compared with hand-written expectations.

        Disallowed values are expected to come back as
        TelemetrySchema.DISALLOWED_VALUE, and dimensions beyond the six in
        the schema are expected to be dropped.
        """
        other = TelemetrySchema.DISALLOWED_VALUE
        # fields:  reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date
        # allowed: saved-session, *, nightly, *, *, 20130908
        cases = [
            # Everything allowed: returned unchanged.
            (["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"],
             ["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"]),
            (["saved-session", "another", "nightly", "anything is ok", "wooooo", "20130908"],
             ["saved-session", "another", "nightly", "anything is ok", "wooooo", "20130908"]),
            # A disallowed reason is replaced.
            (["bogus", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"],
             [other, "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"]),
            # Several disallowed values are each replaced.
            (["bogus", "someAppName", "aurora", "someAppVersion", "someBuildID", "20140428"],
             [other, "someAppName", other, "someAppVersion", "someBuildID", other]),
            # Extra trailing dimensions are cut off.
            (["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908", "more", "bonus", "dimensions!"],
             ["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"]),
        ]
        for given, expected in cases:
            self.assertEqual(self.schema.apply_schema(given), expected)

    def test_get_current_file(self):
        """get_current_file() is checked against one expected path.

        The call receives a base directory, the dimensions without the
        submission date, the date itself and a version number.
        """
        # everything but "submission_date":
        partial_dims = ["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID"]
        expected = "foo/saved_session/someAppName/nightly/someAppVersion/someBuildID.20130908.v1.log"
        actual = self.schema.get_current_file("foo", partial_dims, "20130908", 1)
        self.assertEqual(actual, expected)

    def test_get_filename(self):
        """get_filename() is checked against one expected path.

        The call receives a base directory, all six dimensions and a version
        number.
        """
        all_dims = ["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"]
        expected = "foo/saved_session/someAppName/nightly/someAppVersion/someBuildID.20130908.v99.log"
        actual = self.schema.get_filename("foo", all_dims, 99)
        self.assertEqual(actual, expected)

    def test_dimensions_from(self):
        """dimensions_from() output is compared for full, partial and empty info.

        Fields missing from the info dict are expected to come back as
        "UNKNOWN"; the submission date passed in is expected as the last
        dimension.
        """
        submission_date = "20130908"
        cases = [
            ({"reason": "saved-session", "appName": "Firefox", "appUpdateChannel": "release", "appVersion": "28.0", "appBuildID": "20140401001122"},
             ["saved-session", "Firefox", "release", "28.0", "20140401001122", "20130908"]),
            ({"reason": "idle-daily", "appUpdateChannel": "release", "appVersion": "28.0", "appBuildID": "20140401001122"},
             ["idle-daily", "UNKNOWN", "release", "28.0", "20140401001122", "20130908"]),
            ({},
             ["UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN", "20130908"]),
        ]
        for info, expected in cases:
            self.assertEqual(self.schema.dimensions_from(info, submission_date), expected)

    def test_get_field(self):
        """Check get_field() for every field and limit/sanitize combination.

        Without keyword arguments the raw value is expected.  The four
        combinations of limit_to_allowed and sanitize are then checked
        against one table, followed by the two ValueError cases: an unknown
        field name, and a known field whose value is missing from dims.
        """
        dims = ["saved-session", "Firefox", "release", "28.0", "20130908010101", "20130908"]
        field_names = ["reason", "appName", "appUpdateChannel",
                       "appVersion", "appBuildID", "submission_date"]

        # Basic functionality
        raw_values = ["saved-session", "Firefox", "release", "28.0", "20130908010101", "20130908"]
        for name, expected in zip(field_names, raw_values):
            self.assertEqual(self.schema.get_field(dims, name), expected)

        other = TelemetrySchema.DISALLOWED_VALUE
        # (limit_to_allowed, sanitize, expected value per field)
        combinations = [
            # T, T
            (True, True, ["saved_session", "Firefox", other, "28.0", "20130908010101", "20130908"]),
            # T, F
            (True, False, ["saved-session", "Firefox", other, "28.0", "20130908010101", "20130908"]),
            # F, T
            (False, True, ["saved_session", "Firefox", "release", "28.0", "20130908010101", "20130908"]),
            # F, F
            (False, False, ["saved-session", "Firefox", "release", "28.0", "20130908010101", "20130908"]),
        ]
        for allowed, sanitize, expected_values in combinations:
            for name, expected in zip(field_names, expected_values):
                actual = self.schema.get_field(dims, name, limit_to_allowed=allowed, sanitize=sanitize)
                self.assertEqual(actual, expected)

        with self.assertRaises(ValueError):
            self.schema.get_field(dims, "oranges")

        # Remove the last dimension:
        dims.pop()
        with self.assertRaises(ValueError):
            self.schema.get_field(dims, "submission_date")

    def test_more_allowed(self):
        """Check is_allowed() for a min/max range and a single literal string.

        A separate schema is built here: submission_date uses a
        {"min", "max"} mapping and appBuildID a plain string instead of the
        list/"*" rules of the shared fixture.
        """
        dimension_rules = [
            ("reason", ["saved-session"]),
            ("appName", "*"),
            ("appUpdateChannel", ["nightly"]),
            ("appVersion", "*"),
            ("appBuildID", "one_specific_build"),
            ("submission_date", {"min": "20130908", "max": "20140401"}),
        ]
        spec = {
            "version": 1,
            "dimensions": [
                {"field_name": name, "allowed_values": rule}
                for name, rule in dimension_rules
            ]
        }
        schema = TelemetrySchema(spec)
        allowed = schema.sanitize_allowed_values()

        # submission_date: end points and inner values pass, outside values do not.
        date_rule = allowed[5]
        inside_range = ["20130908", "20140401", "20130909", "20140101"]
        outside_range = ["20130907", "20000000", "20140402", "99999999"]
        for candidate in inside_range:
            self.assertTrue(schema.is_allowed(candidate, date_rule))
        for candidate in outside_range:
            self.assertFalse(schema.is_allowed(candidate, date_rule))

        # appBuildID: only the exact string is expected to match.
        build_rule = allowed[4]
        self.assertTrue(schema.is_allowed("one_specific_build", build_rule))
        near_misses = ["two_specific_build", "*", "one_specific_build ", "one-specific-build"]
        for candidate in near_misses:
            self.assertFalse(schema.is_allowed(candidate, build_rule))
Ejemplo n.º 5
0
class TelemetrySchemaTest(unittest.TestCase):
    """Unit tests for TelemetrySchema.

    Every test runs against the schema returned by get_schema_spec(), except
    test_more_allowed, which builds its own schema to exercise range and
    single-string "allowed_values" specifications.
    """

    def setUp(self):
        """Build a fresh schema and its sanitized allowed values per test."""
        self.schema = TelemetrySchema(self.get_schema_spec())
        self.allowed_values = self.schema.sanitize_allowed_values()

    def get_schema_spec(self):
        """Return the six-dimension schema spec shared by most tests."""
        return {
            "version": 1,
            "dimensions": [
                {
                    "field_name": "reason",
                    "allowed_values": ["saved-session"]
                },
                {
                    "field_name": "appName",
                    "allowed_values": "*"
                },
                {
                    "field_name": "appUpdateChannel",
                    "allowed_values": ["nightly"]
                },
                {
                    "field_name": "appVersion",
                    "allowed_values": "*"
                },
                {
                    "field_name": "appBuildID",
                    "allowed_values": "*"
                },
                {
                    "field_name": "submission_date",
                    "allowed_values": ["20130908"]
                },
            ]
        }

    def get_file_list(self):
        """Return sample file paths used as input by test_filtering."""
        return [
            "/foo/bar/baz/bla.txt",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130826074752.20130903.log.e0c7ff434e474c8aa745763eed408b9c.lzma",
            "garbage/idle_daily/Firefox/nightly/26.0a1/20130826074752.20130903.log.e0c7ff434e474c8aa745763eed408b9c.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130901030218.20130903.log.17ff07fda8994e23baf983550246a94b.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130819030205.20130907.log.ec6f22acd37349b3b5ef03da1cc150da.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130807161117.20130903.log.64478ac84a734677bc14cbcf6cc114b7.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130814030204.20130906.log.a382f1337d1f47ef8aad08f8fb14a79a.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130830030205.20130903.log.c86e2c3f31c043ac8fc311d5dd1abc28.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130823151250.20130907.log.939bec39c3d24c89a09834463b220d9a.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130830030205.20130906.log.0bf2c1edf2634ca5bdc865a54957a690.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130826074752.20130903.log.8e33cc0f130849dfbb8afe7331123be3.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130826074752.20130902.log.2349f0434be64c6684f91eccabf9b3e6.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130826030203.20130902.log.57a017a3378b420cbbfb666532606b16.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130902030220.20130908.log.7e9556a5e32b4990a9d378eea65f57a9.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130829030201.20130909.log.c227775e57e24854b1aac7c21c59f85c.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130826030203.20130906.log.88620da62e77482285f28d5ea69beb1e.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130826030203.20130908.log.cc1b0c52365947c38ac2636f3384503c.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130830030205.20130906.log.77905bb7503a4a98aa7231b10073f47e.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130823030204.20130908.log.55f4ab6ada3c4e1d939f24b5da7f8dc2.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130902030220.20130908.log.f213918c08804d449d30e1aaec70089a.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130813030205.20130907.log.24c445d3d2c241bcb5001a63a78e98fa.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130831030224.20130902.log.ebe3cd20fa264cd19aab02b8ffe8cbf1.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130821030213.20130906.log.778737ad596d43e4a5e9e59c38428b61.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130809030203.20130903.log.43ae292120ca475589b20be24fa70171.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130814141812.20130907.log.7c6c5d65b702443cac2768eb6f0e3c91.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130823030204.20130903.log.f73682ebc57a4661a6f48a2a5cf2629c.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130821050136.20130904.log.2d423ec779e04113996914ce81e27bfe.lzma"
        ]

    def test_filtering(self):
        """Classify each sample path as included, excluded or invalid.

        A path is included only when every one of its dimensions passes
        is_allowed() for the matching schema dimension.  A ValueError raised
        while classifying a path marks it as both invalid and excluded.
        """
        all_files = self.get_file_list()
        error_files = []
        included_files = []
        excluded_files = []
        for f in all_files:
            try:
                dims = self.schema.get_dimensions("processed", f)
                # all() stops at the first disallowed dimension.
                include = all(
                    self.schema.is_allowed(dims[i], allowed)
                    for i, allowed in enumerate(self.allowed_values))
            except ValueError:
                include = False
                error_files.append(f)
            if include:
                included_files.append(f)
            else:
                excluded_files.append(f)

        # The failure messages list the offending files, so no debug prints
        # are needed to diagnose a mismatch.
        self.assertEqual(
            len(included_files), 4,
            "expected 4 included files, got: %r" % (included_files,))
        self.assertEqual(
            len(error_files), 2,
            "expected 2 invalid files, got: %r" % (error_files,))
        self.assertEqual(len(all_files),
                         (len(excluded_files) + len(included_files)))

    def test_safe_filename(self):
        """safe_filename() maps each sample input to its expected output."""
        tests = {
            "Hello World!": "Hello_World_",
            "what\nam\ni": "what_am_i",
            "saved-session": "saved_session"
        }
        # .items() (rather than .iteritems()) works on Python 2 and 3 alike.
        for key, value in tests.items():
            self.assertEqual(self.schema.safe_filename(key), value)

    def test_sanitize_allowed_values(self):
        """The first allowed "reason" value is stored in sanitized form."""
        self.assertEqual(self.allowed_values[0][0], "saved_session")

    def test_allowed_values(self):
        """get_allowed_value() passes allowed values, replaces the rest."""
        allowed = "saved_session"
        not_allowed = "anything_else"
        self.assertEqual(
            self.schema.get_allowed_value(allowed, self.allowed_values[0]),
            allowed)
        self.assertEqual(
            self.schema.get_allowed_value(not_allowed, self.allowed_values[0]),
            TelemetrySchema.DISALLOWED_VALUE)

    def test_apply_schema(self):
        """apply_schema() replaces disallowed values and drops extra dims."""
        other = TelemetrySchema.DISALLOWED_VALUE
        # fields:   reason         appName  channel  appVersion  appBuildID  submission_date
        # allowed:  saved-session     *     nightly      *           *       20130908
        #
        # Each case is an (input dimensions, expected output) pair.
        cases = [
            # Everything allowed: passes through unchanged.
            ([
                "saved-session", "someAppName", "nightly", "someAppVersion",
                "someBuildID", "20130908"
            ], [
                "saved-session", "someAppName", "nightly", "someAppVersion",
                "someBuildID", "20130908"
            ]),
            # Wildcard dimensions accept arbitrary values.
            ([
                "saved-session", "another", "nightly", "anything is ok",
                "wooooo", "20130908"
            ], [
                "saved-session", "another", "nightly", "anything is ok",
                "wooooo", "20130908"
            ]),
            # One disallowed value.
            ([
                "bogus", "someAppName", "nightly", "someAppVersion",
                "someBuildID", "20130908"
            ], [
                other, "someAppName", "nightly", "someAppVersion",
                "someBuildID", "20130908"
            ]),
            # Several disallowed values.
            ([
                "bogus", "someAppName", "aurora", "someAppVersion",
                "someBuildID", "20140428"
            ], [
                other, "someAppName", other, "someAppVersion", "someBuildID",
                other
            ]),
            # Dimensions beyond those in the schema are dropped.
            ([
                "saved-session", "someAppName", "nightly", "someAppVersion",
                "someBuildID", "20130908", "more", "bonus", "dimensions!"
            ], [
                "saved-session", "someAppName", "nightly", "someAppVersion",
                "someBuildID", "20130908"
            ]),
        ]

        for test_input, expected in cases:
            actual = self.schema.apply_schema(test_input)
            self.assertEqual(actual, expected)

    def test_get_current_file(self):
        """get_current_file() builds the path of the current log file."""
        # everything but "submission_date":
        dims = [
            "saved-session", "someAppName", "nightly", "someAppVersion",
            "someBuildID"
        ]
        filename = self.schema.get_current_file("foo", dims, "20130908", 1)
        self.assertEqual(
            filename,
            "foo/saved_session/someAppName/nightly/someAppVersion/someBuildID.20130908.v1.log"
        )

    def test_get_filename(self):
        """get_filename() builds a versioned log path from all dimensions."""
        dims = [
            "saved-session", "someAppName", "nightly", "someAppVersion",
            "someBuildID", "20130908"
        ]
        filename = self.schema.get_filename("foo", dims, 99)
        self.assertEqual(
            filename,
            "foo/saved_session/someAppName/nightly/someAppVersion/someBuildID.20130908.v99.log"
        )

    def test_dimensions_from(self):
        """dimensions_from() fills missing info fields with "UNKNOWN"."""
        # Each case is an (info dict, expected dimensions) pair.
        cases = [
            # All fields present.
            ({
                "reason": "saved-session",
                "appName": "Firefox",
                "appUpdateChannel": "release",
                "appVersion": "28.0",
                "appBuildID": "20140401001122"
            }, [
                "saved-session", "Firefox", "release", "28.0",
                "20140401001122", "20130908"
            ]),
            # "appName" missing.
            ({
                "reason": "idle-daily",
                "appUpdateChannel": "release",
                "appVersion": "28.0",
                "appBuildID": "20140401001122"
            }, [
                "idle-daily", "UNKNOWN", "release", "28.0", "20140401001122",
                "20130908"
            ]),
            # Nothing present: only the submission date is known.
            ({}, [
                "UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN",
                "20130908"
            ]),
        ]
        for info, expected in cases:
            actual = self.schema.dimensions_from(info, "20130908")
            self.assertEqual(actual, expected)

    def test_get_field(self):
        """get_field() honours limit_to_allowed and sanitize; bad input raises."""
        dims = [
            "saved-session", "Firefox", "release", "28.0", "20130908010101",
            "20130908"
        ]
        fields = [
            "reason", "appName", "appUpdateChannel", "appVersion",
            "appBuildID", "submission_date"
        ]

        # Basic functionality: with default flags the raw value comes back.
        raw_values = list(dims)
        for field, expected in zip(fields, raw_values):
            self.assertEqual(self.schema.get_field(dims, field), expected)

        other = TelemetrySchema.DISALLOWED_VALUE
        # (limit_to_allowed, sanitize, expected value for each field).
        # "release" is not an allowed channel, so it is replaced whenever
        # limit_to_allowed is set; sanitizing only alters "saved-session".
        flag_cases = [
            (True, True, [
                "saved_session", "Firefox", other, "28.0", "20130908010101",
                "20130908"
            ]),
            (True, False, [
                "saved-session", "Firefox", other, "28.0", "20130908010101",
                "20130908"
            ]),
            (False, True, [
                "saved_session", "Firefox", "release", "28.0",
                "20130908010101", "20130908"
            ]),
            (False, False, [
                "saved-session", "Firefox", "release", "28.0",
                "20130908010101", "20130908"
            ]),
        ]
        for allowed, sanitize, expected_values in flag_cases:
            for field, expected in zip(fields, expected_values):
                self.assertEqual(
                    self.schema.get_field(dims,
                                          field,
                                          limit_to_allowed=allowed,
                                          sanitize=sanitize), expected)

        # An unknown field name is rejected.
        with self.assertRaises(ValueError):
            self.schema.get_field(dims, "oranges")

        # Remove the last dimension: the field can no longer be looked up.
        dims.pop()
        with self.assertRaises(ValueError):
            self.schema.get_field(dims, "submission_date")

    def test_more_allowed(self):
        """is_allowed() supports min/max ranges and a single exact string."""
        spec = {
            "version": 1,
            "dimensions": [
                {
                    "field_name": "reason",
                    "allowed_values": ["saved-session"]
                },
                {
                    "field_name": "appName",
                    "allowed_values": "*"
                },
                {
                    "field_name": "appUpdateChannel",
                    "allowed_values": ["nightly"]
                },
                {
                    "field_name": "appVersion",
                    "allowed_values": "*"
                },
                {
                    "field_name": "appBuildID",
                    "allowed_values": "one_specific_build"
                },
                {
                    "field_name": "submission_date",
                    "allowed_values": {
                        "min": "20130908",
                        "max": "20140401"
                    }
                },
            ]
        }
        schema = TelemetrySchema(spec)
        allowed = schema.sanitize_allowed_values()

        # Range: both endpoints are inclusive.
        for date in ("20130908", "20140401", "20130909", "20140101"):
            self.assertTrue(schema.is_allowed(date, allowed[5]))
        for date in ("20130907", "20000000", "20140402", "99999999"):
            self.assertFalse(schema.is_allowed(date, allowed[5]))

        # Single string: only an exact match passes, "*" is not a wildcard.
        self.assertTrue(schema.is_allowed("one_specific_build", allowed[4]))
        for build in ("two_specific_build", "*", "one_specific_build ",
                      "one-specific-build"):
            self.assertFalse(schema.is_allowed(build, allowed[4]))
Ejemplo n.º 6
0
class Job:
    """A class for orchestrating a Telemetry MapReduce job"""
    DOWNLOAD_BATCH_SIZE = 100

    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        """Validate the configuration and load the input filter and job.

        Reads these attributes of config: num_mappers, num_reducers,
        data_dir, work_dir, job_script, input_filter, output, local_only,
        bucket, aws_key and aws_secret_key.

        Raises:
            ValueError: if a mapper/reducer count is not positive, or if a
                configured directory or file does not exist.
        """
        # Sanity check args.
        if config.num_mappers <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.num_reducers <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.data_dir):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.work_dir):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.job_script):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.input_filter):
            raise ValueError("Input filter must be a valid json file")

        # Strip one trailing separator from the input dir.
        self._input_dir = config.data_dir
        if self._input_dir.endswith(os.path.sep):
            self._input_dir = self._input_dir[:-1]
        self._work_dir = config.work_dir
        # Close the filter file as soon as it has been parsed.
        with open(config.input_filter) as filter_file:
            self._input_filter = TelemetrySchema(json.load(filter_file))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.output
        self._num_mappers = config.num_mappers
        self._num_reducers = config.num_reducers
        self._local_only = config.local_only
        self._bucket_name = config.bucket
        self._aws_key = config.aws_key
        self._aws_secret_key = config.aws_secret_key
        ## Lifted from FileDriver.py in jydoop.
        # imp.load_module() leaves closing the file to the caller.
        with open(config.job_script) as modulefd:
            self._job_module = imp.load_module("telemetry_job", modulefd,
                                               config.job_script,
                                               ('.py', 'U', 1))

    def dump_stats(self, partitions):
        """Print each partition's total and its offset from the average.

        partitions is a list of per-partition totals.  Nothing is printed
        for an empty list (which happens when there are no input files).
        """
        if not partitions:
            # Avoid dividing by zero below.
            return
        total = sum(partitions)
        avg = total / len(partitions)
        for i, size in enumerate(partitions):
            print("Partition %d contained %d (%+d)" % (i, size,
                                                       float(size) - avg))

    def fetch_remotes(self, remotes):
        """Download the "remote" entries of remotes into <work_dir>/cache.

        Entries are dicts as produced by partition(); those whose "type" is
        not "remote" are ignored.

        Returns the exit status of the s3funnel command, or 0 when there was
        nothing to download.
        """
        # TODO: download remotes in groups of size DOWNLOAD_BATCH_SIZE
        to_fetch = [r for r in remotes if r["type"] == "remote"]
        remote_names = [r["name"] for r in to_fetch]

        # TODO: check cache first.
        result = 0

        fetch_cwd = os.path.join(self._work_dir, "cache")
        if remote_names:
            if not os.path.isdir(fetch_cwd):
                os.makedirs(fetch_cwd)
            # NOTE(review): the AWS credentials are passed as command-line
            # arguments, which other local users may be able to see in the
            # process list - confirm whether s3funnel offers an alternative.
            fetch_cmd = [
                "/usr/local/bin/s3funnel", self._bucket_name, "get",
                "-a", self._aws_key,
                "-s", self._aws_secret_key,
                "-t", "8"
            ]
            start = datetime.now()
            result = subprocess.call(fetch_cmd + remote_names, cwd=fetch_cwd)
            # total_seconds() also accounts for the days component.
            duration_sec = (datetime.now() - start).total_seconds()
            downloaded_bytes = sum(r["size"] for r in to_fetch)
            downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
            # Guard against a zero duration on coarse clocks.
            rate = downloaded_mb / duration_sec if duration_sec > 0 else 0.0
            print("Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (
                downloaded_mb, duration_sec, rate))
        return result

    def mapreduce(self):
        """Run the whole job: select inputs, map, reduce, write the output.

        The output of reducer 0 is renamed to the configured output file and
        the outputs of the remaining reducers are appended to it.  The
        intermediate mapper files are removed afterwards.
        """
        # Imported locally so this method does not depend on a module-level
        # import of shutil.
        import shutil

        # Find files matching specified input filter
        files = self.local_files()
        remote_files = self.get_filtered_files_s3()

        file_count = len(files) + len(remote_files)
        # Not useful to have more mappers than input files.
        if file_count < self._num_mappers:
            print("There are only %s input files. Reducing number of mappers accordingly." % file_count)
            self._num_mappers = file_count

        # Partition files into reasonably equal groups for use by mappers
        print("Partitioning input data...")
        partitions = self.partition(files, remote_files)
        print("Done")

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if partitions[i]:
                # Fetch the files we need for each mapper
                print("Fetching remotes for partition %s" % i)
                status = self.fetch_remotes(partitions[i])
                if status != 0:
                    # Keep going (best effort), but do not hide the failure.
                    print("Warning: fetching remotes for partition %s exited with status %s" % (i, status))
                print("Done")
                p = Process(target=Mapper,
                            args=(i, partitions[i], self._work_dir,
                                  self._job_module, self._num_reducers))
                mappers.append(p)
                p.start()
            else:
                print("Skipping mapper %s - no input files to process" % i)
        for m in mappers:
            m.join()

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(target=Reducer,
                        args=(i, self._work_dir, self._job_module,
                              self._num_mappers))
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()

        # Reducers are done.  Output results.
        os.rename(os.path.join(self._work_dir, "reducer_0"), self._output_file)
        if self._num_reducers > 1:
            with open(self._output_file, "a") as out:
                for i in range(1, self._num_reducers):
                    reducer_filename = os.path.join(self._work_dir,
                                                    "reducer_" + str(i))
                    # Copy in chunks instead of reading the whole reducer
                    # output into memory.
                    with open(reducer_filename, "r") as reducer_output:
                        shutil.copyfileobj(reducer_output, out)
                    os.remove(reducer_filename)

        # TODO: clean up downloaded files?

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print("Warning: Could not find %s" % (mfile,))

    def local_files(self):
        """Return the local input files that pass the input filter.

        Files under the invalid-data directory are added when the input
        filter's _include_invalid flag is set.
        """
        out_files = self.get_filtered_files(self._input_dir)
        if self._input_filter._include_invalid:
            invalid_dir = os.path.join(self._input_dir,
                                       TelemetrySchema.INVALID_DIR)
            out_files += self.get_filtered_files(invalid_dir)
        return out_files

    # Split up the input files into groups of approximately-equal on-disk size.
    def partition(self, files, remote_files):
        """Distribute local and remote files over self._num_mappers groups.

        Local files are assigned first, then remote ones.  Within each group
        of files, the largest remaining file goes to the partition with the
        smallest running total.

        Returns a list of partitions; each partition is a list of dicts with
        the keys "type" ("local" or "remote"), "name", "size" and
        "dimensions".
        """
        namesize = [{
            "type": "local",
            "name": name,
            "size": os.stat(name).st_size,
            "dimensions":
            self._input_filter.get_dimensions(self._input_dir, name)
        } for name in files]
        partitions = [[] for _ in range(self._num_mappers)]
        sums = [0] * self._num_mappers
        min_idx = 0

        # Greedily assign the largest file to the smallest partition.
        # Sorting ascending makes pop() hand out the largest file first.
        namesize.sort(key=lambda entry: entry["size"])
        while namesize:
            current = namesize.pop()
            partitions[min_idx].append(current)
            sums[min_idx] += current["size"]
            min_idx = find_min_idx(sums)

        # And now do the same with the remote files.
        # TODO: if this is too slow, just distribute remote files round-robin.
        if remote_files:
            remotes = []
            conn = S3Connection(self._aws_key, self._aws_secret_key)
            try:
                bucket = conn.get_bucket(self._bucket_name)
                for r in remote_files:
                    key = bucket.lookup(r)
                    if key is None:
                        # The key no longer exists, so there is nothing to
                        # download or measure.
                        print("Warning: Could not find remote file %s" % (r,))
                        continue
                    remotes.append({
                        "type": "remote",
                        "name": r,
                        "size": key.size,
                        "dimensions":
                        self._input_filter.get_dimensions(".", r)
                    })
            finally:
                conn.close()
            remotes.sort(key=lambda entry: entry["size"], reverse=True)
            for remote in remotes:
                partitions[min_idx].append(remote)
                sums[min_idx] += remote["size"]
                min_idx = find_min_idx(sums)

        # Print out some info to see how balanced the partitions were:
        self.dump_stats(sums)

        return partitions

    def get_filtered_files(self, searchdir):
        """Return the files below searchdir that pass the input filter.

        Directories that fail the filter for their level are pruned from the
        walk, so for each file only the dimensions from the current directory
        level onwards need checking.
        """
        level_offset = searchdir.count(os.path.sep)
        dimension_count = len(self._allowed_values)
        out_files = []
        for root, dirs, files in os.walk(searchdir):
            level = root.count(os.path.sep) - level_offset
            # Assigning to dirs[:] prunes the walk in place.
            dirs[:] = [d for d in dirs if self.filter_includes(level, d)]
            for f in files:
                full_filename = os.path.join(root, f)
                dims = self._input_filter.get_dimensions(
                    searchdir, full_filename)
                if all(self.filter_includes(depth, dims[depth])
                       for depth in range(level, dimension_count)):
                    out_files.append(full_filename)
        return out_files

    def get_filtered_files_s3(self):
        """Return the names of the S3 keys that pass the input filter.

        Returns an empty list without contacting S3 when the job is
        configured as local-only.
        """
        out_files = []
        if not self._local_only:
            print("Fetching file list from S3...")
            # Plain boto should be fast enough to list bucket contents.
            conn = S3Connection(self._aws_key, self._aws_secret_key)
            try:
                bucket = conn.get_bucket(self._bucket_name)

                # TODO: potential optimization - if our input filter is
                #       reasonably restrictive and/or our list of keys is
                #       very long, it may be a win to use the "prefix" and
                #       "delimiter" params.
                for f in bucket.list():
                    dims = self._input_filter.get_dimensions(".", f.name)
                    if all(self.filter_includes(i, dims[i])
                           for i in range(len(self._allowed_values))):
                        out_files.append(f.name)
            finally:
                # Release the connection even if listing or filtering fails.
                conn.close()
            print("Done!")
        return out_files

    def filter_includes(self, level, value):
        """Return whether value is allowed for the dimension at level."""
        # Filter out 'invalid' data.  It is included explicitly if needed.
        if level == 0 and value == TelemetrySchema.INVALID_DIR:
            return False
        allowed_values = self._allowed_values[level]
        return self._input_filter.is_allowed(value, allowed_values)