Ejemplo n.º 1
0
def _decode_payload(raw):
    """Return *raw* as unicode text, gunzipping it first when possible.

    Data that cannot be decompressed is treated as plain, uncompressed
    input. Undecodable bytes are replaced instead of raising.
    """
    try:
        # Note: from brief testing, cStringIO doesn't appear to be any
        #       faster. In fact, it seems slightly slower than StringIO.
        reader = StringIO.StringIO(raw)
        uncompressor = gzip.GzipFile(fileobj=reader, mode="r")
        unzipped = uncompressor.read()
        uncompressor.close()
        reader.close()
    except Exception:
        # Deliberately broad (best effort): any failure here means the
        # payload is not usable gzip data, so use the bytes as-is.
        return unicode(raw, errors="replace")
    return unicode(unzipped, errors="replace")


def main():
    """Split a raw log file into files partitioned by schema dimensions.

    Command-line options supply the input file, the output base directory,
    the telemetry schema file and the output rotation size.

    The input is read as a sequence of binary records. Each record starts
    with a little-endian header of two 4-byte unsigned lengths (path, data)
    and one 8-byte unsigned timestamp in milliseconds, followed by the path
    bytes and the payload bytes. Payloads are gunzipped when possible.

    A record whose path does not split into the expected number of
    components is reported and skipped. Reading stops at end of file, or
    at the first truncated record (which is reported, not processed).

    This code targets Python 2 (it relies on ``unicode`` and the
    ``StringIO`` module).
    """
    parser = argparse.ArgumentParser(description='Split raw logs into partitioned files.', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000)
    parser.add_argument("-i", "--input-file", help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    # Close the schema file even if parsing it fails.
    with open(args.telemetry_schema) as schema_data:
        schema = TelemetrySchema(json.load(schema_data))

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    # Two 4-byte values and one 8-byte value, little-endian.
    header_format = "<IIQ"
    header_size = struct.calcsize(header_format)
    # Order of the path components that follow the leading key.
    path_fields = ("reason", "appName", "appVersion", "appUpdateChannel",
                   "appBuildID")

    record_count = 0
    # NOTE(review): bytes_read and start are not consumed later in this
    # function; presumably meant for throughput reporting -- confirm.
    bytes_read = 0
    start = datetime.now()
    with open(args.input_file, "rb") as fin:
        while True:
            header = fin.read(header_size)
            if not header:
                break
            # Count only records that actually exist (first record is 1).
            record_count += 1
            if len(header) < header_size:
                print("Truncated header in record %d" % record_count)
                break
            len_path, len_data, timestamp = struct.unpack(header_format,
                                                          header)

            # Incoming timestamps are in milliseconds, so convert to POSIX
            # first (ie. seconds)
            submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")

            raw_path = fin.read(len_path)
            raw_data = fin.read(len_data)
            if len(raw_path) < len_path or len(raw_data) < len_data:
                print("Truncated body in record %d" % record_count)
                break
            # Count the full header, not just half of it.
            bytes_read += header_size + len_path + len_data

            path = unicode(raw_path, errors="replace")
            # Detect and handle gzipped data.
            data = _decode_payload(raw_data)

            path_components = path.split("/")
            if len(path_components) != expected_dim_count:
                # We're going to pop the ID off, but we'll also add the
                # submission, so it evens out.
                print("Found an invalid path in record %s %s" % (record_count, path))
                continue

            key = path_components.pop(0)
            info = {}
            for field in path_fields:
                info[field] = path_components.pop(0)
            dimensions = schema.dimensions_from(info, submission_date)
            storage.write(key, data, dimensions)
Ejemplo n.º 2
0
class TelemetrySchemaTest(unittest.TestCase):
    def setUp(self):
        self.schema = TelemetrySchema(self.get_schema_spec())
        self.allowed_values = self.schema.sanitize_allowed_values()

    def get_schema_spec(self):
        return {
            "version":
            1,
            "dimensions": [{
                "field_name": "reason",
                "allowed_values": ["saved-session"]
            }, {
                "field_name": "appName",
                "allowed_values": "*"
            }, {
                "field_name": "appUpdateChannel",
                "allowed_values": ["nightly"]
            }, {
                "field_name": "appVersion",
                "allowed_values": "*"
            }, {
                "field_name": "appBuildID",
                "allowed_values": "*"
            }, {
                "field_name": "submission_date",
                "allowed_values": ["20130908"]
            }]
        }

    def get_file_list(self):
        return [
            "/foo/bar/baz/bla.txt",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130826074752.20130903.log.e0c7ff434e474c8aa745763eed408b9c.lzma",
            "garbage/idle_daily/Firefox/nightly/26.0a1/20130826074752.20130903.log.e0c7ff434e474c8aa745763eed408b9c.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130901030218.20130903.log.17ff07fda8994e23baf983550246a94b.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130819030205.20130907.log.ec6f22acd37349b3b5ef03da1cc150da.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130807161117.20130903.log.64478ac84a734677bc14cbcf6cc114b7.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130814030204.20130906.log.a382f1337d1f47ef8aad08f8fb14a79a.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130830030205.20130903.log.c86e2c3f31c043ac8fc311d5dd1abc28.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130823151250.20130907.log.939bec39c3d24c89a09834463b220d9a.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130830030205.20130906.log.0bf2c1edf2634ca5bdc865a54957a690.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130826074752.20130903.log.8e33cc0f130849dfbb8afe7331123be3.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130826074752.20130902.log.2349f0434be64c6684f91eccabf9b3e6.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130826030203.20130902.log.57a017a3378b420cbbfb666532606b16.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130902030220.20130908.log.7e9556a5e32b4990a9d378eea65f57a9.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130829030201.20130909.log.c227775e57e24854b1aac7c21c59f85c.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130826030203.20130906.log.88620da62e77482285f28d5ea69beb1e.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130826030203.20130908.log.cc1b0c52365947c38ac2636f3384503c.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130830030205.20130906.log.77905bb7503a4a98aa7231b10073f47e.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130823030204.20130908.log.55f4ab6ada3c4e1d939f24b5da7f8dc2.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130902030220.20130908.log.f213918c08804d449d30e1aaec70089a.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130813030205.20130907.log.24c445d3d2c241bcb5001a63a78e98fa.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130831030224.20130902.log.ebe3cd20fa264cd19aab02b8ffe8cbf1.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130821030213.20130906.log.778737ad596d43e4a5e9e59c38428b61.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130809030203.20130903.log.43ae292120ca475589b20be24fa70171.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130814141812.20130907.log.7c6c5d65b702443cac2768eb6f0e3c91.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130823030204.20130903.log.f73682ebc57a4661a6f48a2a5cf2629c.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130821050136.20130904.log.2d423ec779e04113996914ce81e27bfe.lzma"
        ]

    def test_filtering(self):
        all_files = self.get_file_list()
        error_files = []
        included_files = []
        excluded_files = []
        for f in all_files:
            include = True
            try:
                dims = self.schema.get_dimensions("processed", f)
                for i in range(len(self.allowed_values)):
                    if not self.schema.is_allowed(dims[i],
                                                  self.allowed_values[i]):
                        include = False
                        break
            except ValueError:
                include = False
                error_files.append(f)
            if include:
                included_files.append(f)
            else:
                excluded_files.append(f)

        #print "Found", len(excluded_files), "excluded files:"
        #for f in excluded_files:
        #    print " - ", f
        #print "Found", len(included_files), "included files:"
        #for f in included_files:
        #    print " + ", f
        #print "Found", len(error_files), "invalid files"
        #for f in error_files:
        #    print " x ", f

        self.assertEqual(len(included_files), 4)
        self.assertEqual(len(error_files), 2)
        self.assertEqual(len(all_files),
                         (len(excluded_files) + len(included_files)))

    def test_safe_filename(self):
        tests = {
            "Hello World!": "Hello_World_",
            "what\nam\ni": "what_am_i",
            "saved-session": "saved_session"
        }
        for key, value in tests.iteritems():
            self.assertEqual(self.schema.safe_filename(key), value)

    def test_sanitize_allowed_values(self):
        self.assertEqual(self.allowed_values[0][0], "saved_session")

    def test_allowed_values(self):
        allowed = "saved_session"
        not_allowed = "anything_else"
        self.assertEqual(
            self.schema.get_allowed_value(allowed, self.allowed_values[0]),
            allowed)
        self.assertEqual(
            self.schema.get_allowed_value(not_allowed, self.allowed_values[0]),
            TelemetrySchema.DISALLOWED_VALUE)

    def test_apply_schema(self):
        test_inputs = []
        expected_ot = []  # <-- bad name, convenient indenting.
        other = TelemetrySchema.DISALLOWED_VALUE
        # fields:            reason           appName        channel      appVersion      appBuildID     submission_date
        #                    -------------    -----------    -------    --------------    -----------    ---------------
        # allowed:           saved-session        *          nightly          *               *          20130908
        test_inputs.append([
            "saved-session", "someAppName", "nightly", "someAppVersion",
            "someBuildID", "20130908"
        ])
        expected_ot.append([
            "saved-session", "someAppName", "nightly", "someAppVersion",
            "someBuildID", "20130908"
        ])

        test_inputs.append([
            "saved-session", "another", "nightly", "anything is ok", "wooooo",
            "20130908"
        ])
        expected_ot.append([
            "saved-session", "another", "nightly", "anything is ok", "wooooo",
            "20130908"
        ])

        test_inputs.append([
            "bogus", "someAppName", "nightly", "someAppVersion", "someBuildID",
            "20130908"
        ])
        expected_ot.append([
            other, "someAppName", "nightly", "someAppVersion", "someBuildID",
            "20130908"
        ])

        test_inputs.append([
            "bogus", "someAppName", "aurora", "someAppVersion", "someBuildID",
            "20140428"
        ])
        expected_ot.append([
            other, "someAppName", other, "someAppVersion", "someBuildID", other
        ])

        test_inputs.append([
            "saved-session", "someAppName", "nightly", "someAppVersion",
            "someBuildID", "20130908", "more", "bonus", "dimensions!"
        ])
        expected_ot.append([
            "saved-session", "someAppName", "nightly", "someAppVersion",
            "someBuildID", "20130908"
        ])

        for i in range(len(test_inputs)):
            actual = self.schema.apply_schema(test_inputs[i])
            self.assertEqual(actual, expected_ot[i])

    def test_get_current_file(self):
        # everything but "submission_date":
        dims = [
            "saved-session", "someAppName", "nightly", "someAppVersion",
            "someBuildID"
        ]
        filename = self.schema.get_current_file("foo", dims, "20130908", 1)
        self.assertEqual(
            filename,
            "foo/saved_session/someAppName/nightly/someAppVersion/someBuildID.20130908.v1.log"
        )

    def test_get_filename(self):
        dims = [
            "saved-session", "someAppName", "nightly", "someAppVersion",
            "someBuildID", "20130908"
        ]
        filename = self.schema.get_filename("foo", dims, 99)
        self.assertEqual(
            filename,
            "foo/saved_session/someAppName/nightly/someAppVersion/someBuildID.20130908.v99.log"
        )

    def test_dimensions_from(self):
        test_inputs = []
        expected_ot = []

        test_inputs.append({
            "reason": "saved-session",
            "appName": "Firefox",
            "appUpdateChannel": "release",
            "appVersion": "28.0",
            "appBuildID": "20140401001122"
        })
        expected_ot.append([
            "saved-session", "Firefox", "release", "28.0", "20140401001122",
            "20130908"
        ])

        test_inputs.append({
            "reason": "idle-daily",
            "appUpdateChannel": "release",
            "appVersion": "28.0",
            "appBuildID": "20140401001122"
        })
        expected_ot.append([
            "idle-daily", "UNKNOWN", "release", "28.0", "20140401001122",
            "20130908"
        ])

        test_inputs.append({})
        expected_ot.append([
            "UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN", "20130908"
        ])
        for i in range(len(test_inputs)):
            actual = self.schema.dimensions_from(test_inputs[i], "20130908")
            self.assertEqual(actual, expected_ot[i])

    def test_get_field(self):
        dims = [
            "saved-session", "Firefox", "release", "28.0", "20130908010101",
            "20130908"
        ]
        # Basic functionality
        self.assertEqual(self.schema.get_field(dims, "reason"),
                         "saved-session")
        self.assertEqual(self.schema.get_field(dims, "appName"), "Firefox")
        self.assertEqual(self.schema.get_field(dims, "appUpdateChannel"),
                         "release")
        self.assertEqual(self.schema.get_field(dims, "appVersion"), "28.0")
        self.assertEqual(self.schema.get_field(dims, "appBuildID"),
                         "20130908010101")
        self.assertEqual(self.schema.get_field(dims, "submission_date"),
                         "20130908")

        other = TelemetrySchema.DISALLOWED_VALUE
        allowed = True
        sanitize = True
        # T, T
        self.assertEqual(
            self.schema.get_field(dims,
                                  "reason",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "saved_session")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appName",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "Firefox")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appUpdateChannel",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), other)
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appVersion",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "28.0")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appBuildID",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "20130908010101")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "submission_date",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "20130908")
        sanitize = False
        # T, F
        self.assertEqual(
            self.schema.get_field(dims,
                                  "reason",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "saved-session")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appName",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "Firefox")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appUpdateChannel",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), other)
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appVersion",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "28.0")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appBuildID",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "20130908010101")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "submission_date",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "20130908")
        allowed = False
        sanitize = True
        # F, T
        self.assertEqual(
            self.schema.get_field(dims,
                                  "reason",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "saved_session")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appName",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "Firefox")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appUpdateChannel",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "release")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appVersion",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "28.0")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appBuildID",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "20130908010101")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "submission_date",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "20130908")
        sanitize = False
        # F, F
        self.assertEqual(
            self.schema.get_field(dims,
                                  "reason",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "saved-session")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appName",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "Firefox")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appUpdateChannel",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "release")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appVersion",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "28.0")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "appBuildID",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "20130908010101")
        self.assertEqual(
            self.schema.get_field(dims,
                                  "submission_date",
                                  limit_to_allowed=allowed,
                                  sanitize=sanitize), "20130908")

        with self.assertRaises(ValueError):
            v = self.schema.get_field(dims, "oranges")

        # Remove the last dimension:
        dims.pop()
        with self.assertRaises(ValueError):
            v = self.schema.get_field(dims, "submission_date")

    def test_more_allowed(self):
        spec = {
            "version":
            1,
            "dimensions": [{
                "field_name": "reason",
                "allowed_values": ["saved-session"]
            }, {
                "field_name": "appName",
                "allowed_values": "*"
            }, {
                "field_name": "appUpdateChannel",
                "allowed_values": ["nightly"]
            }, {
                "field_name": "appVersion",
                "allowed_values": "*"
            }, {
                "field_name": "appBuildID",
                "allowed_values": "one_specific_build"
            }, {
                "field_name": "submission_date",
                "allowed_values": {
                    "min": "20130908",
                    "max": "20140401"
                }
            }]
        }
        schema = TelemetrySchema(spec)
        allowed = schema.sanitize_allowed_values()
        self.assertTrue(schema.is_allowed("20130908", allowed[5]))
        self.assertTrue(schema.is_allowed("20140401", allowed[5]))
        self.assertTrue(schema.is_allowed("20130909", allowed[5]))
        self.assertTrue(schema.is_allowed("20140101", allowed[5]))
        self.assertFalse(schema.is_allowed("20130907", allowed[5]))
        self.assertFalse(schema.is_allowed("20000000", allowed[5]))
        self.assertFalse(schema.is_allowed("20140402", allowed[5]))
        self.assertFalse(schema.is_allowed("99999999", allowed[5]))

        self.assertTrue(schema.is_allowed("one_specific_build", allowed[4]))
        self.assertFalse(schema.is_allowed("two_specific_build", allowed[4]))
        self.assertFalse(schema.is_allowed("*", allowed[4]))
        self.assertFalse(schema.is_allowed("one_specific_build ", allowed[4]))
        self.assertFalse(schema.is_allowed("one-specific-build", allowed[4]))
class TelemetrySchemaTest(unittest.TestCase):
    def setUp(self):
        """Create the schema under test and cache its sanitized allowed values."""
        spec = self.get_schema_spec()
        self.schema = TelemetrySchema(spec)
        sanitized = self.schema.sanitize_allowed_values()
        self.allowed_values = sanitized

    def get_schema_spec(self):
        """Return the schema spec (version 1, six dimensions) used by the tests."""
        # (field name, allowed values) in schema order.
        layout = (
            ("reason", ["saved-session"]),
            ("appName", "*"),
            ("appUpdateChannel", ["nightly"]),
            ("appVersion", "*"),
            ("appBuildID", "*"),
            ("submission_date", ["20130908"]),
        )
        spec = {"version": 1}
        spec["dimensions"] = [
            {"field_name": name, "allowed_values": allowed}
            for name, allowed in layout
        ]
        return spec
    def get_file_list(self):
        """Return the sample file paths fed to the filtering test.

        The list holds one unrelated path, one path under "garbage" and
        the rest under "processed", all sharing the same
        Firefox/nightly/26.0a1 directory layout.
        """
        template = "%s/%s/Firefox/nightly/26.0a1/%s.lzma"
        idle = "idle_daily"
        saved = "saved_session"
        # (top-level directory, reason, file name without the .lzma suffix)
        entries = [
            ("processed", idle, "20130826074752.20130903.log.e0c7ff434e474c8aa745763eed408b9c"),
            ("garbage", idle, "20130826074752.20130903.log.e0c7ff434e474c8aa745763eed408b9c"),
            ("processed", idle, "20130901030218.20130903.log.17ff07fda8994e23baf983550246a94b"),
            ("processed", idle, "20130819030205.20130907.log.ec6f22acd37349b3b5ef03da1cc150da"),
            ("processed", idle, "20130807161117.20130903.log.64478ac84a734677bc14cbcf6cc114b7"),
            ("processed", idle, "20130814030204.20130906.log.a382f1337d1f47ef8aad08f8fb14a79a"),
            ("processed", idle, "20130830030205.20130903.log.c86e2c3f31c043ac8fc311d5dd1abc28"),
            ("processed", idle, "20130823151250.20130907.log.939bec39c3d24c89a09834463b220d9a"),
            ("processed", idle, "20130830030205.20130906.log.0bf2c1edf2634ca5bdc865a54957a690"),
            ("processed", idle, "20130826074752.20130903.log.8e33cc0f130849dfbb8afe7331123be3"),
            ("processed", idle, "20130826074752.20130902.log.2349f0434be64c6684f91eccabf9b3e6"),
            ("processed", idle, "20130826030203.20130902.log.57a017a3378b420cbbfb666532606b16"),
            ("processed", saved, "20130902030220.20130908.log.7e9556a5e32b4990a9d378eea65f57a9"),
            ("processed", saved, "20130829030201.20130909.log.c227775e57e24854b1aac7c21c59f85c"),
            ("processed", saved, "20130826030203.20130906.log.88620da62e77482285f28d5ea69beb1e"),
            ("processed", saved, "20130826030203.20130908.log.cc1b0c52365947c38ac2636f3384503c"),
            ("processed", saved, "20130830030205.20130906.log.77905bb7503a4a98aa7231b10073f47e"),
            ("processed", saved, "20130823030204.20130908.log.55f4ab6ada3c4e1d939f24b5da7f8dc2"),
            ("processed", saved, "20130902030220.20130908.log.f213918c08804d449d30e1aaec70089a"),
            ("processed", idle, "20130813030205.20130907.log.24c445d3d2c241bcb5001a63a78e98fa"),
            ("processed", idle, "20130831030224.20130902.log.ebe3cd20fa264cd19aab02b8ffe8cbf1"),
            ("processed", idle, "20130821030213.20130906.log.778737ad596d43e4a5e9e59c38428b61"),
            ("processed", idle, "20130809030203.20130903.log.43ae292120ca475589b20be24fa70171"),
            ("processed", idle, "20130814141812.20130907.log.7c6c5d65b702443cac2768eb6f0e3c91"),
            ("processed", idle, "20130823030204.20130903.log.f73682ebc57a4661a6f48a2a5cf2629c"),
            ("processed", idle, "20130821050136.20130904.log.2d423ec779e04113996914ce81e27bfe"),
        ]
        return ["/foo/bar/baz/bla.txt"] + [template % entry for entry in entries]

    def test_filtering(self):
        """Classify every sample path as included, excluded or invalid."""
        all_files = self.get_file_list()
        included_files, excluded_files, error_files = [], [], []
        for path in all_files:
            try:
                dims = self.schema.get_dimensions("processed", path)
                # all() stops at the first disallowed dimension.
                keep = all(
                    self.schema.is_allowed(dims[idx], allowed)
                    for idx, allowed in enumerate(self.allowed_values))
            except ValueError:
                # Invalid paths are both recorded as errors and excluded.
                keep = False
                error_files.append(path)
            target = included_files if keep else excluded_files
            target.append(path)

        self.assertEqual(len(included_files), 4)
        self.assertEqual(len(error_files), 2)
        total_classified = len(excluded_files) + len(included_files)
        self.assertEqual(len(all_files), total_classified)

    def test_safe_filename(self):
        """safe_filename maps each sample input to its expected output."""
        expectations = {
            "Hello World!": "Hello_World_",
            "what\nam\ni": "what_am_i",
            "saved-session": "saved_session"
        }
        for raw in expectations:
            cleaned = self.schema.safe_filename(raw)
            self.assertEqual(cleaned, expectations[raw])

    def test_sanitize_allowed_values(self):
        """The first allowed value of the first dimension is sanitized."""
        reason_values = self.allowed_values[0]
        self.assertEqual(reason_values[0], "saved_session")

    def test_allowed_values(self):
        """Allowed values pass through; others become DISALLOWED_VALUE."""
        accepted = self.schema.get_allowed_value("saved_session",
                                                 self.allowed_values[0])
        self.assertEqual(accepted, "saved_session")

        rejected = self.schema.get_allowed_value("anything_else",
                                                 self.allowed_values[0])
        self.assertEqual(rejected, TelemetrySchema.DISALLOWED_VALUE)

    def test_apply_schema(self):
        """apply_schema replaces disallowed values and drops extra dimensions."""
        other = TelemetrySchema.DISALLOWED_VALUE
        # fields:  reason, appName, channel, appVersion, appBuildID, submission_date
        # allowed: saved-session, *, nightly, *, *, 20130908
        cases = (
            # Everything allowed: returned unchanged.
            (["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"],
             ["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"]),
            (["saved-session", "another", "nightly", "anything is ok", "wooooo", "20130908"],
             ["saved-session", "another", "nightly", "anything is ok", "wooooo", "20130908"]),
            # Disallowed values are replaced.
            (["bogus", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"],
             [other, "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"]),
            (["bogus", "someAppName", "aurora", "someAppVersion", "someBuildID", "20140428"],
             [other, "someAppName", other, "someAppVersion", "someBuildID", other]),
            # Extra trailing dimensions are dropped.
            (["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908", "more", "bonus", "dimensions!"],
             ["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"]),
        )
        for given, expected in cases:
            actual = self.schema.apply_schema(given)
            self.assertEqual(actual, expected)

    def test_get_current_file(self):
        """get_current_file builds the path from dims, date and version."""
        # The submission date is passed separately from the other dimensions.
        partial_dims = ["saved-session", "someAppName", "nightly",
                        "someAppVersion", "someBuildID"]
        expected = "foo/saved_session/someAppName/nightly/someAppVersion/someBuildID.20130908.v1.log"
        self.assertEqual(
            self.schema.get_current_file("foo", partial_dims, "20130908", 1),
            expected)

    def test_get_filename(self):
        """get_filename builds the path from the full dimension list."""
        full_dims = ["saved-session", "someAppName", "nightly",
                     "someAppVersion", "someBuildID", "20130908"]
        expected = "foo/saved_session/someAppName/nightly/someAppVersion/someBuildID.20130908.v99.log"
        self.assertEqual(self.schema.get_filename("foo", full_dims, 99),
                         expected)

    def test_dimensions_from(self):
        # Each case pairs an info dict with the dimension list expected back
        # from dimensions_from() when "20130908" is given as the submission
        # date. Fields absent from the dict are expected to be reported as
        # "UNKNOWN".
        cases = [
            # Every field supplied.
            (
                {"reason": "saved-session", "appName": "Firefox", "appUpdateChannel": "release", "appVersion": "28.0", "appBuildID": "20140401001122"},
                ["saved-session", "Firefox", "release", "28.0", "20140401001122", "20130908"],
            ),
            # "appName" left out.
            (
                {"reason": "idle-daily", "appUpdateChannel": "release", "appVersion": "28.0", "appBuildID": "20140401001122"},
                ["idle-daily", "UNKNOWN", "release", "28.0", "20140401001122", "20130908"],
            ),
            # Nothing supplied at all: only the submission date is known.
            (
                {},
                ["UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN", "20130908"],
            ),
        ]
        for info, expected_dims in cases:
            self.assertEqual(self.schema.dimensions_from(info, "20130908"), expected_dims)

    def test_get_field(self):
        dims = ["saved-session", "Firefox", "release", "28.0", "20130908010101", "20130908"]
        # Field names, in the same order as the values in `dims`.
        field_names = ["reason", "appName", "appUpdateChannel", "appVersion", "appBuildID", "submission_date"]

        # Basic functionality: without any options every field is expected
        # to come back exactly as stored.
        plain_values = ["saved-session", "Firefox", "release", "28.0", "20130908010101", "20130908"]
        for name, expected in zip(field_names, plain_values):
            self.assertEqual(self.schema.get_field(dims, name), expected)

        other = TelemetrySchema.DISALLOWED_VALUE
        # Exercise every (limit_to_allowed, sanitize) combination, in the
        # order T/T, T/F, F/T, F/F. Only two fields react to the flags here:
        #  - "reason" is expected as "saved_session" when sanitizing;
        #  - "appUpdateChannel" ("release") is expected to be replaced by
        #    DISALLOWED_VALUE when limiting to allowed values.
        for allowed in (True, False):
            for sanitize in (True, False):
                expected_values = [
                    "saved_session" if sanitize else "saved-session",
                    "Firefox",
                    other if allowed else "release",
                    "28.0",
                    "20130908010101",
                    "20130908",
                ]
                for name, expected in zip(field_names, expected_values):
                    actual = self.schema.get_field(dims, name, limit_to_allowed=allowed, sanitize=sanitize)
                    self.assertEqual(actual, expected)

        # A field name the test does not list among the dimensions must be
        # rejected with ValueError.
        self.assertRaises(ValueError, self.schema.get_field, dims, "oranges")

        # Remove the last dimension: asking for "submission_date" on the
        # shortened list must then raise ValueError as well.
        dims.pop()
        self.assertRaises(ValueError, self.schema.get_field, dims, "submission_date")

    def test_more_allowed(self):
        # A spec that mixes the different "allowed_values" shapes: explicit
        # lists, the "*" wildcard, a single plain string (appBuildID) and a
        # min/max range (submission_date).
        dimension_specs = [
            {"field_name": "reason", "allowed_values": ["saved-session"]},
            {"field_name": "appName", "allowed_values": "*"},
            {"field_name": "appUpdateChannel", "allowed_values": ["nightly"]},
            {"field_name": "appVersion", "allowed_values": "*"},
            {"field_name": "appBuildID", "allowed_values": "one_specific_build"},
            {
                "field_name": "submission_date",
                "allowed_values": {"min": "20130908", "max": "20140401"},
            },
        ]
        spec = {"version": 1, "dimensions": dimension_specs}
        schema = TelemetrySchema(spec)
        allowed = schema.sanitize_allowed_values()

        # submission_date (index 5): both endpoints of the range and values
        # between them are accepted ...
        for inside in ("20130908", "20140401", "20130909", "20140101"):
            self.assertTrue(schema.is_allowed(inside, allowed[5]))
        # ... while anything before the minimum or after the maximum is not.
        for outside in ("20130907", "20000000", "20140402", "99999999"):
            self.assertFalse(schema.is_allowed(outside, allowed[5]))

        # appBuildID (index 4): only the exact string matches; near misses
        # (different word, literal "*", trailing space, hyphens instead of
        # underscores) are all rejected.
        self.assertTrue(schema.is_allowed("one_specific_build", allowed[4]))
        for near_miss in ("two_specific_build", "*", "one_specific_build ", "one-specific-build"):
            self.assertFalse(schema.is_allowed(near_miss, allowed[4]))
Ejemplo n.º 4
0
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-i",
                        "--input-file",
                        help="Filename to read from",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Filename of telemetry schema spec",
                        required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    fin = open(args.input_file, "rb")

    bytes_read = 0
    start = datetime.now()
    while True:
        record_count += 1
        # Read two 4-byte values and one 8-byte value
        lengths = fin.read(16)
        if lengths == '':
            break
        len_path, len_data, timestamp = struct.unpack("<IIQ", lengths)

        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(timestamp /
                                             1000).strftime("%Y%m%d")
        path = unicode(fin.read(len_path), errors="replace")
        #print "Path for record", record_count, path, "length of data:", len_data

        # Detect and handle gzipped data.
        data = fin.read(len_data)
        try:
            # Note: from brief testing, cStringIO doesn't appear to be any
            #       faster. In fact, it seems slightly slower than StringIO.
            data_reader = StringIO.StringIO(data)
            uncompressor = gzip.GzipFile(fileobj=data_reader, mode="r")
            data = unicode(uncompressor.read(), errors="replace")
            uncompressor.close()
            data_reader.close()
        except Exception, e:
            #print e
            # Use the string as-is
            data = unicode(data, errors="replace")

        bytes_read += 8 + len_path + len_data
        #print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission,
            # so it evens out.
            print "Found an invalid path in record", record_count, path
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)