import argparse
import json

# TelemetrySchema, revision_cache, Converter and process come from the
# telemetry-server codebase; their imports are omitted in this listing.
def main(argv=None):
    parser = argparse.ArgumentParser(description="Convert Telemetry data")
    parser.add_argument("-c",
                        "--config-file",
                        help="Read configuration from this file",
                        default="./telemetry_server_config.json")
    parser.add_argument("-d",
                        "--date",
                        help="Use specified date for dimensions")
    args = parser.parse_args()

    try:
        server_config = open(args.config_file, "r")
        config = json.load(server_config)
        server_config.close()
    except IOError:
        config = {}

    cache_dir = config.get("revision_cache_path", "./histogram_cache")
    server = config.get("revision_cache_server", "hg.mozilla.org")
    schema_filename = config.get("schema_filename", "./telemetry_schema.json")
    schema_data = open(schema_filename)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    cache = revision_cache.RevisionCache(cache_dir, server)
    converter = Converter(cache, schema)
    process(converter, args.date)
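For reference, a minimal sketch of the JSON config file this main() reads;
the three keys below are the only ones it looks up, and the values shown are
the same defaults the code falls back to.

import json

example_config = {
    "revision_cache_path": "./histogram_cache",
    "revision_cache_server": "hg.mozilla.org",
    "schema_filename": "./telemetry_schema.json",
}
with open("telemetry_server_config.json", "w") as f:
    json.dump(example_config, f, indent=2)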
Example #2
    def __init__(self, config):
        # Sanity check args.
        if config.num_mappers <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.num_reducers <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.data_dir):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.work_dir):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.job_script):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.input_filter):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.data_dir
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.work_dir
        with open(config.input_filter) as filter_file:
            self._input_filter = TelemetrySchema(json.load(filter_file))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.output
        self._num_mappers = config.num_mappers
        self._num_reducers = config.num_reducers
        self._local_only = config.local_only
        self._bucket_name = config.bucket
        self._aws_key = config.aws_key
        self._aws_secret_key = config.aws_secret_key
        modulefd = open(config.job_script)
        # Lifted from FileDriver.py in jydoop.
        self._job_module = imp.load_module("telemetry_job", modulefd,
                                           config.job_script, ('.py', 'U', 1))
        # imp.load_module does not close the file for us.
        modulefd.close()
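A hypothetical way to drive the constructor above. The enclosing class name
is not shown in this snippet, so TelemetryJob below is a placeholder, and all
paths are illustrative.

from argparse import Namespace

config = Namespace(
    num_mappers=4,
    num_reducers=1,
    data_dir="./data",              # must be an existing directory
    work_dir="./work",              # must be an existing directory
    job_script="./my_job.py",       # must be an existing .py file
    input_filter="./filter.json",   # a TelemetrySchema spec file
    output="./results.out",
    local_only=True,
    bucket=None,
    aws_key=None,
    aws_secret_key=None,
)
job = TelemetryJob(config)  # placeholder name for the class defining __init__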
Example #3
    @classmethod
    def setUpClass(cls):
        cls.cache_dir = "/tmp/histogram_revision_cache"
        cls.schema_filename = "./telemetry/telemetry_schema.json"
        assert not os.path.exists(cls.cache_dir)

        schema_file = open(cls.schema_filename, "r")
        cls.schema = TelemetrySchema(json.load(schema_file))
        schema_file.close()
        cls.cache = revision_cache.RevisionCache(cls.cache_dir,
                                                 'hg.mozilla.org')
        cls.converter = Converter(cls.cache, cls.schema)
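setUpClass asserts that cache_dir does not already exist, which implies the
suite removes it afterwards. The matching tearDownClass is not part of this
listing; a minimal sketch (assuming os and shutil are imported) would be:

    @classmethod
    def tearDownClass(cls):
        # Remove the on-disk cache so the setUpClass assertion
        # (cache_dir must not pre-exist) holds on the next run.
        if os.path.exists(cls.cache_dir):
            shutil.rmtree(cls.cache_dir)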
Example #4
    def test_more_allowed(self):
        spec = {
            "version":
            1,
            "dimensions": [{
                "field_name": "reason",
                "allowed_values": ["saved-session"]
            }, {
                "field_name": "appName",
                "allowed_values": "*"
            }, {
                "field_name": "appUpdateChannel",
                "allowed_values": ["nightly"]
            }, {
                "field_name": "appVersion",
                "allowed_values": "*"
            }, {
                "field_name": "appBuildID",
                "allowed_values": "one_specific_build"
            }, {
                "field_name": "submission_date",
                "allowed_values": {
                    "min": "20130908",
                    "max": "20140401"
                }
            }]
        }
        schema = TelemetrySchema(spec)
        allowed = schema.sanitize_allowed_values()
        self.assertTrue(schema.is_allowed("20130908", allowed[5]))
        self.assertTrue(schema.is_allowed("20140401", allowed[5]))
        self.assertTrue(schema.is_allowed("20130909", allowed[5]))
        self.assertTrue(schema.is_allowed("20140101", allowed[5]))
        self.assertFalse(schema.is_allowed("20130907", allowed[5]))
        self.assertFalse(schema.is_allowed("20000000", allowed[5]))
        self.assertFalse(schema.is_allowed("20140402", allowed[5]))
        self.assertFalse(schema.is_allowed("99999999", allowed[5]))

        self.assertTrue(schema.is_allowed("one_specific_build", allowed[4]))
        self.assertFalse(schema.is_allowed("two_specific_build", allowed[4]))
        self.assertFalse(schema.is_allowed("*", allowed[4]))
        self.assertFalse(schema.is_allowed("one_specific_build ", allowed[4]))
        self.assertFalse(schema.is_allowed("one-specific-build", allowed[4]))
Example #5
    def setUp(self):
        self.schema = TelemetrySchema(self.get_schema_spec())
        self.allowed_values = self.schema.sanitize_allowed_values()

import argparse
import gzip
import json
import os
import struct
import StringIO
from datetime import date, datetime

# TelemetrySchema and StorageLayout come from the telemetry-server codebase.
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-i",
                        "--input-file",
                        help="Filename to read from",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Filename of telemetry schema spec",
                        required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    fin = open(args.input_file, "rb")

    bytes_read = 0
    start = datetime.now()
    while True:
        # Read two 4-byte lengths and one 8-byte timestamp (16 bytes total).
        lengths = fin.read(16)
        if len(lengths) < 16:
            break
        record_count += 1
        len_path, len_data, timestamp = struct.unpack("<IIQ", lengths)

        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(timestamp /
                                             1000).strftime("%Y%m%d")
        path = unicode(fin.read(len_path), errors="replace")
        #print "Path for record", record_count, path, "length of data:", len_data

        # Detect and handle gzipped data.
        data = fin.read(len_data)
        try:
            # Note: from brief testing, cStringIO doesn't appear to be any
            #       faster. In fact, it seems slightly slower than StringIO.
            data_reader = StringIO.StringIO(data)
            uncompressor = gzip.GzipFile(fileobj=data_reader, mode="r")
            data = unicode(uncompressor.read(), errors="replace")
            uncompressor.close()
            data_reader.close()
        except Exception, e:
            #print e
            # Use the string as-is
            data = unicode(data, errors="replace")

        bytes_read += 16 + len_path + len_data
        #print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # The path should have exactly as many components as the schema
            # has dimensions: we pop the document ID off the front, but we
            # also add the submission date, so the count evens out.
            print "Found an invalid path in record", record_count, path
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
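The producer of the input file is not part of this listing. A hypothetical
writer for the record framing that the loop above reads (a 16-byte <IIQ>
header, then the path bytes, then an optionally gzipped payload) might look
like:

import gzip
import struct
import StringIO

def write_record(fout, path, data, timestamp_ms, compress=True):
    # Header layout mirrors the reader: two 4-byte lengths and an
    # 8-byte millisecond timestamp, all little-endian.
    if compress:
        buf = StringIO.StringIO()
        gz = gzip.GzipFile(fileobj=buf, mode="w")
        gz.write(data)
        gz.close()
        data = buf.getvalue()
    fout.write(struct.pack("<IIQ", len(path), len(data), timestamp_ms))
    fout.write(path)
    fout.write(data)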