Example #1
    def __init__(self, cfg):
        self.job_bundle = cfg.job_bundle
        if cfg.input_filter:
            self.input_filter = TelemetrySchema(
                json.load(open(cfg.input_filter)))
        else:
            self.input_filter = None

        if cfg.input_list_file:
            self.input_list = cfg.input_list_file
        else:
            self.input_list = None

        self.job_id = str(uuid4())
        self.target_queue = cfg.target_queue
        self.aws_key = cfg.aws_key
        self.aws_secret_key = cfg.aws_secret_key
        self.input_bucket = "telemetry-published-v1"
        self.job_name = cfg.name
        self.job_owner = cfg.owner
        self.date_limit = cfg.date_limit

        # Bucket with intermediate data for this analysis job
        self.analysis_bucket = "jonasfj-telemetry-analysis"

        self.s3_code_path = "batch-jobs/" + self.job_id + ".tar.gz"

        # S3 region of operation
        self.aws_region = "us-west-2"
        self.task_size_limit = 400 * 1024 * 1024
        self.sqs_input_name = cfg.sqs_queue
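
For reference, here is a minimal sketch of the cfg object this constructor expects, assuming its values come from argparse; the attribute names mirror what the constructor reads above, and every value shown is hypothetical:

from argparse import Namespace

cfg = Namespace(
    job_bundle="analysis-job.tar.gz",       # hypothetical bundle path
    input_filter="filter.json",             # path to a TelemetrySchema JSON filter, or None
    input_list_file=None,                   # optional pre-computed list of input files
    target_queue="analysis-output-queue",
    aws_key=None,                           # None: fall back to ambient AWS credentials
    aws_secret_key=None,
    name="example-analysis",
    owner="someone@example.com",
    date_limit=None,
    sqs_queue="analysis-input-queue",
)
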
Example #2
def update_published_v2_files(sdb, from_submission_date=None, to_submission_date=None, limit=None):
    s3 = S3Connection()
    bucket_name = "telemetry-published-v2"
    bucket = s3.get_bucket(bucket_name)
    schema_key = bucket.get_key("telemetry_schema.json")
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))

    termination_requested = [False]
    def keyboard_interrupt_handler(signal, frame):
        termination_requested[0] = True
    signal.signal(signal.SIGINT, keyboard_interrupt_handler)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)

    while not done:
        try:
            for key in bucket.list(marker=last_key):
                last_key = key.name

                if total_count % 1e5 == 0:
                    print("Looked at {} total records in {} seconds, added {}".
                          format(total_count, delta_sec(start_time), added_count))

                dims = schema.get_dimension_map(schema.get_dimensions(".", key.name))

                if (from_submission_date is None or dims["submission_date"] >= from_submission_date) and \
                   (to_submission_date is None or dims["submission_date"] <= to_submission_date) and \
                   dims["submission_date"][:-2] in sdb and \
                   dims["reason"] != "idle_daily":
                    attributes = {"reason": dims.get("reason"),
                                  "appName": dims.get("appName"),
                                  "appUpdateChannel": dims.get("appUpdateChannel"),
                                  "appVersion": dims.get("appVersion"),
                                  "appBuildID": dims.get("appBuildID"),
                                  "submissionDate": dims.get("submission_date")}
                    batch.put(dims["submission_date"][:-2], key.name, attributes)
                    added_count += 1

                total_count += 1
                if total_count == limit or termination_requested[0]:
                    done = True
                    break

        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue

        break

    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(added_count, total_count, delta_sec(start_time)))
Example #3
def main():
    parser = ArgumentParser(
        description='Convert local Telemetry pings to server storage structure'
    )
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--schema",
                        type=file,
                        default='./telemetry/telemetry_schema.json')
    parser.add_argument("--histogram-cache-dir",
                        default='/tmp/telemetry_histogram_cache')
    args = parser.parse_args()

    print "Getting pings from", args.input_dir, "converting them and storing them in", args.output_dir
    schema = TelemetrySchema(json.load(args.schema))
    cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org')
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, 500000000)
    ping_dir = args.input_dir
    ping_files = get_pings(ping_dir)
    if len(ping_files) == 0:
        # Try the usual ping dir (if the user just gave the Profile Dir)
        ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings")
        ping_files = get_pings(ping_dir)

    print "found", len(ping_files), "pings"
    for ping_file in ping_files:
        with open(os.path.join(ping_dir, ping_file), "r") as f:
            ping = json.load(f)
            reason = ping['reason']
            key = ping['slug']
            payload = ping['payload']
            submission_date = date.today().strftime("%Y%m%d")
            dims = schema.dimensions_from(payload, submission_date)
            try:
                parsed_data, dims = converter.convert_obj(payload, dims[-1])
                serialized_data = converter.serialize(parsed_data)
                data_version = Converter.VERSION_CONVERTED
                try:
                    # Write to persistent storage
                    n = storage.write(key, serialized_data, dims, data_version)
                    print "Successfully saved ping", key, "to", n
                except Exception, e:
                    traceback.print_exc()
            except BadPayloadError, e:
                print "Bad Payload:", e.msg
            except Exception, e:
                traceback.print_exc()
Example #4
def update_published_v4_files(sdb, bucket, bucket_prefix, submission_date, limit=None):
    s3 = S3Connection()
    metadata = s3.get_bucket(METADATA_BUCKET)
    schema_key = metadata.get_key("{}/schema.json".format(bucket_prefix))
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))
    bucket = s3.get_bucket(bucket)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)
    prefix = "{}/{}".format(bucket_prefix, submission_date) if submission_date else bucket_prefix

    print "Bucket: {} - Prefix: {} - Date: {}".format(bucket.name, bucket_prefix, submission_date)

    while not done:
        try:
            for key in bucket.list(marker=last_key, prefix=prefix):
                last_key = key.name

                if total_count % 1e5 == 0:
                    print("Looked at {} total records in {} seconds, added {}".
                            format(total_count, delta_sec(start_time), added_count))

                dims = schema.get_dimension_map(schema.get_dimensions(".", key.name[len(bucket_prefix) + 1:], dirs_only=True))

                if (dims["submissionDate"] == submission_date) and dims["submissionDate"][:-2] in sdb:
                    batch.put(dims["submissionDate"][:-2], key.name, dims)
                    added_count += 1

                total_count += 1
                if total_count == limit:
                    done = True
                    break

        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue

        break

    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(added_count, total_count, delta_sec(start_time)))
Example #5
    def __init__(self, config):
        # Sanity check args.
        if config.num_mappers <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.num_reducers <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.data_dir):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.work_dir):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.job_script):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.input_filter):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.data_dir
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.work_dir
        self._input_filter = TelemetrySchema(json.load(open(config.input_filter)))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.output
        self._num_mappers = config.num_mappers
        self._num_reducers = config.num_reducers
        self._local_only = config.local_only
        self._bucket_name = config.bucket
        self._aws_key = config.aws_key
        self._aws_secret_key = config.aws_secret_key
        modulefd = open(config.job_script)
        # let the job script import additional modules under its path
        sys.path.append(os.path.dirname(config.job_script))
        ## Lifted from FileDriver.py in jydoop.
        self._job_module = imp.load_module("telemetry_job", modulefd, config.job_script, ('.py', 'U', 1))
Example #6
    def __init__(self, cfg):
        self.job_bundle = cfg.job_bundle
        if cfg.input_filter:
            self.input_filter = TelemetrySchema(json.load(open(cfg.input_filter)))
        else:
            self.input_filter = None

        if cfg.input_list_file:
            self.input_list = cfg.input_list_file
        else:
            self.input_list = None

        self.job_id = str(uuid4())
        self.target_queue = cfg.target_queue
        self.aws_key = cfg.aws_key
        self.aws_secret_key = cfg.aws_secret_key
        self.input_bucket = "telemetry-published-v1"
        self.job_name = cfg.name
        self.job_owner = cfg.owner
        self.date_limit = cfg.date_limit

        # Bucket with intermediate data for this analysis job
        self.analysis_bucket = "jonasfj-telemetry-analysis"

        self.s3_code_path = "batch-jobs/" + self.job_id + ".tar.gz"

        # S3 region of operation
        self.aws_region = "us-west-2"
        self.task_size_limit = 400 * 1024 * 1024
        self.sqs_input_name = cfg.sqs_queue
Example #7
def main():
    parser = ArgumentParser(description='Convert local Telemetry pings to server storage structure')
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--schema", type=file, default='./telemetry/telemetry_schema.json')
    parser.add_argument("--histogram-cache-dir", default='/tmp/telemetry_histogram_cache')
    args = parser.parse_args()

    print "Getting pings from", args.input_dir, "converting them and storing them in", args.output_dir
    schema = TelemetrySchema(json.load(args.schema))
    cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org')
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, 500000000)
    ping_dir = args.input_dir
    ping_files = get_pings(ping_dir)
    if len(ping_files) == 0:
        # Try the usual ping dir (if the user just gave the Profile Dir)
        ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings")
        ping_files = get_pings(ping_dir)

    print "found", len(ping_files), "pings"
    for ping_file in ping_files:
        with open(os.path.join(ping_dir, ping_file), "r") as f:
            ping = json.load(f)
            reason = ping['reason']
            key = ping['slug']
            payload = ping['payload']
            submission_date = date.today().strftime("%Y%m%d")
            dims = schema.dimensions_from(payload, submission_date)
            try:
                parsed_data, dims = converter.convert_obj(payload, dims[-1])
                serialized_data = converter.serialize(parsed_data)
                data_version = Converter.VERSION_CONVERTED
                try:
                    # Write to persistent storage
                    n = storage.write(key, serialized_data, dims, data_version)
                    print "Successfully saved ping", key, "to", n
                except Exception, e:
                    traceback.print_exc()
            except BadPayloadError, e:
                print "Bad Payload:", e.msg
            except Exception, e:
                traceback.print_exc()
Example #8
def _filter_to_schema(schema, filter_args):
    new_schema = {"version": 1, "dimensions": []}
    for i, dim in enumerate(schema["dimensions"]):
        new_filter = {
            "field_name": schema["dimensions"][i].get("field_name", "field{}".format(i)),
            "allowed_values": "*"
        }
        if dim["field_name"] in filter_args:
            new_filter["allowed_values"] = filter_args[dim["field_name"]]
        new_schema["dimensions"].append(new_filter)
    return TelemetrySchema(new_schema)
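
A brief usage sketch for the helper above; the base schema and filter values are purely illustrative. Dimensions named in filter_args get their allowed_values narrowed, while everything else stays at the wildcard "*":

base_schema = {
    "version": 1,
    "dimensions": [
        {"field_name": "reason", "allowed_values": "*"},
        {"field_name": "appName", "allowed_values": "*"},
        {"field_name": "appUpdateChannel", "allowed_values": "*"},
    ],
}

# Only match Firefox release submissions; "reason" remains unconstrained.
filter_schema = _filter_to_schema(base_schema, {"appName": ["Firefox"],
                                                "appUpdateChannel": ["release"]})
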
Example #9
def test_v4execschema():
    schema_spec = {
        "version": 2,
        "dimensions": [{
            "field_name": "submissionDate",
            "allowed_values": {
                "max": "20150901"
            }
        }]
    }
    schema = TelemetrySchema(schema_spec)

    found = set()
    for f in s3util.list_heka_partitions(v4execbucket, schema=schema):
        found.add(f.name)

    assert (len(found) == 3)
    assert ("20150901/20150901221519.541_ip-172-31-16-184" in found)
    assert ("20150901/20150901223019.579_ip-172-31-16-184" in found)
    assert ("20150901/20150901224519.623_ip-172-31-16-184" in found)

    # Test with a prefix:
    found = set()
    for f in s3util.list_heka_partitions(
            v4prefixbucket, prefix="telemetry-executive-summary-2",
            schema=schema):
        found.add(f.name)

    assert (len(found) == 3)
    assert (
        "telemetry-executive-summary-2/20150901/20150901221519.541_ip-172-31-16-184"
        in found)
    assert (
        "telemetry-executive-summary-2/20150901/20150901223019.579_ip-172-31-16-184"
        in found)
    assert (
        "telemetry-executive-summary-2/20150901/20150901224519.623_ip-172-31-16-184"
        in found)

    # Test with a bunch of prefixes:
    found = set()
    for f in s3util.list_heka_partitions(multiprefixbucket,
                                         prefix="a/b/c/d",
                                         schema=schema):
        found.add(f.name)

    assert (len(found) == 3)
    assert ("a/b/c/d/20150901/20150901221519.541_ip-172-31-16-184" in found)
    assert ("a/b/c/d/20150901/20150901223019.579_ip-172-31-16-184" in found)
    assert ("a/b/c/d/20150901/20150901224519.623_ip-172-31-16-184" in found)
Example #10
    def __init__(self, config):
        # Sanity check args.
        if config.get("num_mappers") <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.get("num_reducers") <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.get("data_dir")):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.get("work_dir")):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.get("job_script", "")):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.get("input_filter")):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.get("data_dir")
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.get("work_dir")
        with open(config.get("input_filter")) as filter_file:
            self._input_filter = TelemetrySchema(json.load(filter_file))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.get("output")
        self._num_mappers = config.get("num_mappers")
        self._num_reducers = config.get("num_reducers")
        self._local_only = config.get("local_only")
        self._bucket_name = config.get("bucket")
        self._aws_key = config.get("aws_key")
        self._aws_secret_key = config.get("aws_secret_key")
        self._profile = config.get("profile")
        self._delete_data = config.get("delete_data")
        with open(config.get("job_script")) as modulefd:
            # let the job script import additional modules under its path
            sys.path.append(os.path.dirname(config.get("job_script")))
            ## Lifted from FileDriver.py in jydoop.
            self._job_module = imp.load_module("telemetry_job", modulefd,
                                               config.get("job_script"),
                                               ('.py', 'U', 1))
Example #11
def test_v4schema():
    schema_spec = {
        "version": 2,
        "dimensions": [{
            "field_name": "submissionDate",
            "allowed_values": "20150903"
        }, {
            "field_name": "sourceName",
            "allowed_values": "*"
        }, {
            "field_name": "sourceVersion",
            "allowed_values": "4"
        }, {
            "field_name": "docType",
            "allowed_values": ["saved-session"]
        }, {
            "field_name": "appName",
            "allowed_values": ["Firefox"]
        }, {
            "field_name": "appUpdateChannel",
            "allowed_values": ["release"]
        }, {
            "field_name": "appVersion",
            "allowed_values": "24.0"
        }, {
            "field_name": "appBuildId",
            "allowed_values": "20130910160258"
        }]
    }
    schema = TelemetrySchema(schema_spec)

    found = set()
    for f in s3util.list_heka_partitions(v4bucket, schema=schema):
        found.add(f.name)

    assert (len(found) == 3)
    assert (
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051633.482_ip-172-31-16-184"
        in found)
    assert (
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051644.482_ip-172-31-16-184"
        in found)
    assert (
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051655.482_ip-172-31-16-184"
        in found)
Example #12
def test_v2schema():
    schema_spec = {
        "version": 1,
        "dimensions": [{
            "field_name": "reason",
            "allowed_values": ["saved-session"]
        }, {
            "field_name": "appName",
            "allowed_values": ["Firefox"]
        }, {
            "field_name": "appUpdateChannel",
            "allowed_values": ["release"]
        }, {
            "field_name": "appVersion",
            "allowed_values": ["24.0"]
        }, {
            "field_name": "appBuildID",
            "allowed_values": ["20130910160258"]
        }, {
            "field_name": "submission_date",
            "allowed_values": ["20131003", "20131004"]
        }]
    }

    schema = TelemetrySchema(schema_spec)

    found = set()
    for f in s3util.list_partitions(v2bucket, schema=schema,
                                    include_keys=True):
        found.add(f.name)

    assert (len(found) == 2)
    assert (
        "saved_session/Firefox/release/24.0/20130910160258.20131003.v2.log.25b53e7042c74188b08d71ce32e87237.lzma"
        in found)
    assert (
        "saved_session/Firefox/release/24.0/20130910160258.20131004.v2.log.29afd7a250154729bd53c20253f8af78.lzma"
        in found)
Example #13
def test_schema(d):
    schema_spec = {
        "version": 1,
        "dimensions": [{
            "field_name": "reason",
            "allowed_values": ["saved-session"]
        }, {
            "field_name": "appName",
            "allowed_values": ["Firefox"]
        }, {
            "field_name": "appUpdateChannel",
            "allowed_values": ["nightly"]
        }, {
            "field_name": "appVersion",
            "allowed_values": ["27.0a1"]
        }, {
            "field_name": "appBuildID",
            "allowed_values": ["20130918030202"]
        }, {
            "field_name": "submission_date",
            "allowed_values": ["20131001"]
        }]
    }

    schema = TelemetrySchema(schema_spec)

    successfully_downloaded = []
    failfully_downloaded = []
    for f, r, err in d.get_schema(schema):
        if err is not None:
            print err
            failfully_downloaded.append(f)
        else:
            print "Downloaded", f
            successfully_downloaded.append(f)
    assert len(failfully_downloaded) == 0
    print "Successfully downloaded", len(successfully_downloaded)
    assert len(successfully_downloaded) == 20
Example #14
    def __init__(self, config):
        # Sanity check args.
        if config.get("num_mappers") <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.get("num_reducers") <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.get("data_dir")):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.get("work_dir")):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.get("job_script", "")):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.get("input_filter")):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.get("data_dir")
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.get("work_dir")
        with open(config.get("input_filter")) as filter_file:
            self._input_filter = TelemetrySchema(json.load(filter_file))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.get("output")
        self._num_mappers = config.get("num_mappers")
        self._num_reducers = config.get("num_reducers")
        self._local_only = config.get("local_only")
        self._bucket_name = config.get("bucket")
        self._aws_key = config.get("aws_key")
        self._aws_secret_key = config.get("aws_secret_key")
        self._profile = config.get("profile")
        self._delete_data = config.get("delete_data")
        with open(config.get("job_script")) as modulefd:
            # let the job script import additional modules under its path
            sys.path.append(os.path.dirname(config.get("job_script")))
            ## Lifted from FileDriver.py in jydoop.
            self._job_module = imp.load_module(
                "telemetry_job", modulefd, config.get("job_script"), ('.py', 'U', 1))
Example #15
class TestPersist(unittest.TestCase):
    def setUp(self):
        test_dir = self.get_test_dir()
        self.schema = TelemetrySchema(self.get_schema_spec())
        self.storage = StorageLayout(self.schema, test_dir, 10000)
        assert not os.path.exists(test_dir)
        os.makedirs(test_dir)

    def tearDown(self):
        shutil.rmtree(self.get_test_dir())

    def get_test_dir(self):
        return "/tmp/test_telemetry_persist"

    def get_schema_spec(self):
        return {
            "version": 1,
            "dimensions": [{
                "field_name": "reason",
                "allowed_values": ["r1", "r2"]
            }, {
                "field_name": "appName",
                "allowed_values": ["a1"]
            }, {
                "field_name": "appUpdateChannel",
                "allowed_values": ["c1", "c2", "c3"]
            }, {
                "field_name": "appVersion",
                "allowed_values": "*"
            }, {
                "field_name": "appBuildID",
                "allowed_values": "*"
            }, {
                "field_name": "submission_date",
                "allowed_values": {
                    "min": "20130101",
                    "max": "20131231"
                }
            }]
        }

    def test_write_filename(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        self.storage.write_filename("foo", '{"bar":"baz"}', test_file)
        test_file_md5, test_file_size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

        test_file = os.path.join(self.get_test_dir(), "test2.log")
        # Now test writing an object
        self.storage.write_filename("foo", {"bar": "baz"}, test_file)
        test_file_md5, test_file_size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_write(self):
        dims = ["r1", "a1", "c1", "v1", "b1", "20130102"]
        test_dir = self.get_test_dir()
        test_file = self.schema.get_filename(test_dir, dims)
        self.assertEquals(test_file,
                          test_dir + "/r1/a1/c1/v1/b1.20130102.v1.log")

        self.storage.write("foo", '{"bar":"baz"}', dims)
        md5, size = fileutil.md5file(test_file)
        self.assertEqual(md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_clean_newlines(self):
        self.assertEqual(self.storage.clean_newlines("ab\n\ncd\r\n"),
                         "ab  cd  ")

    def test_rotate(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        key = "01234567890123456789012345678901234567890123456789"
        value = '{"some filler stuff here":"fffffffffffffffffff"}'
        # each iteration should be 100 bytes.
        for i in range(99):
            result = self.storage.write_filename(key, value, test_file)
            self.assertEquals(result, test_file)

        # The 100th iteration should cause the file to rotate
        rolled = self.storage.write_filename(key, value, test_file)
        # rolled should be <test_dir>/test.log.<pid>.<timestamp><suffix>
        self.assertNotEqual(rolled, test_file)
        self.assertTrue(rolled.startswith(test_file))
        self.assertTrue(
            rolled.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX))
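
Taken together, the schema specs in these examples use allowed_values in four shapes: the wildcard "*", a single value given as a plain string, an explicit list of values, and a min/max range object. A compact, purely illustrative summary (the field names are taken from the examples, the dates are invented):

dimensions = [
    {"field_name": "sourceName", "allowed_values": "*"},               # wildcard: any value matches
    {"field_name": "sourceVersion", "allowed_values": "4"},            # single value as a plain string
    {"field_name": "docType", "allowed_values": ["saved-session"]},    # list of accepted values
    {"field_name": "submissionDate",
     "allowed_values": {"min": "20150101", "max": "20150901"}},        # bounded date range
]
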
Example #16
    def setUp(self):
        test_dir = self.get_test_dir()
        self.schema = TelemetrySchema(self.get_schema_spec())
        self.storage = StorageLayout(self.schema, test_dir, 10000)
        assert not os.path.exists(test_dir)
        os.makedirs(test_dir)
Example #17
class Job:
    """A class for orchestrating a Heka MapReduce job"""
    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        # Sanity check args.
        if config.get("num_mappers") <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.get("num_reducers") <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.get("data_dir")):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.get("work_dir")):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.get("job_script", "")):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.get("input_filter")):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.get("data_dir")
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.get("work_dir")
        with open(config.get("input_filter")) as filter_file:
            self._input_filter = TelemetrySchema(json.load(filter_file))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.get("output")
        self._num_mappers = config.get("num_mappers")
        self._num_reducers = config.get("num_reducers")
        self._local_only = config.get("local_only")
        self._bucket_name = config.get("bucket")
        self._aws_key = config.get("aws_key")
        self._aws_secret_key = config.get("aws_secret_key")
        self._profile = config.get("profile")
        self._delete_data = config.get("delete_data")
        with open(config.get("job_script")) as modulefd:
            # let the job script import additional modules under its path
            sys.path.append(os.path.dirname(config.get("job_script")))
            ## Lifted from FileDriver.py in jydoop.
            self._job_module = imp.load_module(
                "telemetry_job", modulefd, config.get("job_script"), ('.py', 'U', 1))

    def dump_stats(self, partitions):
        total = sum(partitions)
        avg = total / len(partitions)
        for i in range(len(partitions)):
            print "Partition %d contained %d (%+d)" % (i, partitions[i], float(partitions[i]) - avg)

    def dedupe_remotes(self, remote_files, local_files):
        return ( r for r in remote_files
                   if os.path.join(self._input_dir, r.name) not in local_files )

    def mapreduce(self):
        # Find files matching specified input filter
        files = set(self.get_filtered_files(self._input_dir))
        remote_files = self.get_filtered_files_s3()

        # If we're using the cache dir as the data dir, we will end up reading
        # each already-downloaded file twice, so we should skip any remote files
        # that exist in the data dir.
        remote_files = self.dedupe_remotes(remote_files, files)

        # Partition files into reasonably equal groups for use by mappers
        print "Partitioning input data..."
        partitions = self.partition(files, remote_files)
        print "Done"

        if not any(part for part in partitions):
            print "Filter didn't match any files... nothing to do"
            return

        partitions = [part for part in partitions if part]

        # Not useful to have more mappers than partitions.
        if len(partitions) < self._num_mappers:
            print "Filter matched only %d input files. Reducing number of mappers accordingly." % (
                  len(partitions),)
            self._num_mappers = len(partitions)

        # Free up our set of names. We want to minimize
        # our memory usage prior to forking map jobs.
        #files = None
        gc.collect()

        def checkExitCode(proc):
            # If process was terminated by a signal, exitcode is the negative signal value
            if proc.exitcode == -signal.SIGKILL:
                # SIGKILL is most likely an OOM kill
                raise MemoryError("%s ran out of memory" % proc.name)
            elif proc.exitcode:
                raise OSError("%s exited with code %d" % (proc.name, proc.exitcode))

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                p = Process(
                        target=Mapper,
                        name=("Mapper-%d" % i),
                        args=(i, self._profile, partitions[i], self._work_dir, self._job_module, self._num_reducers, self._delete_data, self._aws_key, self._aws_secret_key, self._bucket_name))
                mappers.append(p)
                p.start()
            else:
                print "Skipping mapper", i, "- no input files to process"
        for m in mappers:
            m.join()
            checkExitCode(m)

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(
                    target=Reducer,
                    name=("Reducer-%d" % i),
                    args=(i, self._profile, self._work_dir, self._job_module, self._num_mappers))
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()
            checkExitCode(r)

        # Reducers are done.  Output results.
        to_combine = 1
        try:
            os.rename(os.path.join(self._work_dir, "reducer_0"), self._output_file)
        except OSError, e:
            if e.errno != errno.EXDEV:
                raise
            else:
                # OSError: [Errno 18] Invalid cross-device link (EXDEV == 18)
                # We can't rename across devices :( Copy / delete instead.
                to_combine = 0

        # TODO: If _output_file ends with a compressed suffix (.gz, .xz, .bz2, etc),
        #       try to compress it after writing.
        if self._num_reducers > to_combine:
            with open(self._output_file, "a") as out:
                for i in range(to_combine, self._num_reducers):
                    # FIXME: this reads the entire reducer output into memory
                    reducer_filename = os.path.join(self._work_dir, "reducer_" + str(i))
                    with open(reducer_filename, "r") as reducer_output:
                        out.write(reducer_output.read())
                    os.remove(reducer_filename)

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print "Warning: Could not find", mfile
Example #18
class Job:
    """A class for orchestrating a Telemetry MapReduce job"""
    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        # Sanity check args.
        if config.num_mappers <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.num_reducers <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.data_dir):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.work_dir):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.job_script):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.input_filter):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.data_dir
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.work_dir
        self._input_filter = TelemetrySchema(json.load(open(config.input_filter)))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.output
        self._num_mappers = config.num_mappers
        self._num_reducers = config.num_reducers
        self._local_only = config.local_only
        self._bucket_name = config.bucket
        self._aws_key = config.aws_key
        self._aws_secret_key = config.aws_secret_key
        modulefd = open(config.job_script)
        # let the job script import additional modules under its path
        sys.path.append(os.path.dirname(config.job_script))
        ## Lifted from FileDriver.py in jydoop.
        self._job_module = imp.load_module("telemetry_job", modulefd, config.job_script, ('.py', 'U', 1))

    def dump_stats(self, partitions):
        total = sum(partitions)
        avg = total / len(partitions)
        for i in range(len(partitions)):
            print "Partition %d contained %d (%+d)" % (i, partitions[i], float(partitions[i]) - avg)

    def fetch_remotes(self, remotes):
        # TODO: fetch remotes inside Mappers, and process each one as it becomes available.
        remote_names = [ r["name"] for r in remotes if r["type"] == "remote" ]

        # TODO: check cache first.
        result = 0
        if len(remote_names) == 0:
            return result

        fetch_cwd = os.path.join(self._work_dir, "cache")
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        loader = s3util.Loader(fetch_cwd, self._bucket_name, aws_key=self._aws_key, aws_secret_key=self._aws_secret_key)
        start = datetime.now()
        downloaded_bytes = 0
        for local, remote, err in loader.get_list(remote_names):
            if err is None:
                print "Downloaded", remote
                downloaded_bytes += os.path.getsize(local)
            else:
                print "Failed to download", remote
                result += 1
        duration_sec = timer.delta_sec(start)
        downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
        print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (downloaded_mb, duration_sec, downloaded_mb / duration_sec)
        return result

    def dedupe_remotes(self, remote_files, local_files):
        return [ r for r in remote_files if os.path.join(self._input_dir, r.name) not in local_files ]

    def mapreduce(self):
        # Find files matching specified input filter
        files = self.get_filtered_files(self._input_dir)
        remote_files = self.get_filtered_files_s3()

        # If we're using the cache dir as the data dir, we will end up reading
        # each already-downloaded file twice, so we should skip any remote files
        # that exist in the data dir.
        remote_files = self.dedupe_remotes(remote_files, files)

        file_count = len(files) + len(remote_files)

        if file_count == 0:
            print "Filter didn't match any files... nothing to do"
            return

        # Not useful to have more mappers than input files.
        if file_count < self._num_mappers:
            print "Filter matched only %s input files (%s local in %s and %s " \
                  "remote from %s). Reducing number of mappers accordingly." \
                  % (file_count, len(files), self._input_dir, len(remote_files),
                      self._bucket_name)
            self._num_mappers = file_count

        # Partition files into reasonably equal groups for use by mappers
        print "Partitioning input data..."
        partitions = self.partition(files, remote_files)
        print "Done"

        def checkExitCode(proc):
            # If process was terminated by a signal, exitcode is the negative signal value
            if proc.exitcode == -signal.SIGKILL:
                # SIGKILL is most likely an OOM kill
                raise MemoryError("%s ran out of memory" % proc.name)
            elif proc.exitcode:
                raise OSError("%s exited with code %d" % (proc.name, proc.exitcode))

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                # Fetch the files we need for each mapper
                print "Fetching remotes for partition", i
                fetch_result = self.fetch_remotes(partitions[i])
                if fetch_result == 0:
                    print "Remote files fetched successfully"
                else:
                    print "ERROR: Failed to fetch", fetch_result, "files."
                    # TODO: Bail, since results will be unreliable?
                p = Process(
                        target=Mapper,
                        name=("Mapper-%d" % i),
                        args=(i, partitions[i], self._work_dir, self._job_module, self._num_reducers))
                mappers.append(p)
                p.start()
            else:
                print "Skipping mapper", i, "- no input files to process"
        for m in mappers:
            m.join()
            checkExitCode(m)

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(
                    target=Reducer,
                    name=("Reducer-%d" % i),
                    args=(i, self._work_dir, self._job_module, self._num_mappers))
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()
            checkExitCode(r)

        # Reducers are done.  Output results.
        to_combine = 1
        try:
            os.rename(os.path.join(self._work_dir, "reducer_0"), self._output_file)
        except OSError, e:
            if e.errno != errno.EXDEV:
                raise
            else:
                # OSError: [Errno 18] Invalid cross-device link (EXDEV == 18)
                # We can't rename across devices :( Copy / delete instead.
                to_combine = 0

        # TODO: If _output_file ends with a compressed suffix (.gz, .xz, .bz2, etc),
        #       try to compress it after writing.
        if self._num_reducers > to_combine:
            out = open(self._output_file, "a")
            for i in range(to_combine, self._num_reducers):
                # FIXME: this reads the entire reducer output into memory
                reducer_filename = os.path.join(self._work_dir, "reducer_" + str(i))
                reducer_output = open(reducer_filename, "r")
                out.write(reducer_output.read())
                reducer_output.close()
                os.remove(reducer_filename)

        # TODO: clean up downloaded files?

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print "Warning: Could not find", mfile
Example #19
class Job:
    """A class for orchestrating a Heka MapReduce job"""

    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        # Sanity check args.
        if config.get("num_mappers") <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.get("num_reducers") <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.get("data_dir")):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.get("work_dir")):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.get("job_script", "")):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.get("input_filter")):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.get("data_dir")
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.get("work_dir")
        with open(config.get("input_filter")) as filter_file:
            self._input_filter = TelemetrySchema(json.load(filter_file))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.get("output")
        self._num_mappers = config.get("num_mappers")
        self._num_reducers = config.get("num_reducers")
        self._local_only = config.get("local_only")
        self._bucket_name = config.get("bucket")
        self._aws_key = config.get("aws_key")
        self._aws_secret_key = config.get("aws_secret_key")
        self._profile = config.get("profile")
        self._delete_data = config.get("delete_data")
        with open(config.get("job_script")) as modulefd:
            # let the job script import additional modules under its path
            sys.path.append(os.path.dirname(config.get("job_script")))
            ## Lifted from FileDriver.py in jydoop.
            self._job_module = imp.load_module("telemetry_job", modulefd, config.get("job_script"), (".py", "U", 1))

    def dump_stats(self, partitions):
        total = sum(partitions)
        avg = total / len(partitions)
        for i in range(len(partitions)):
            print "Partition %d contained %d (%+d)" % (i, partitions[i], float(partitions[i]) - avg)

    def dedupe_remotes(self, remote_files, local_files):
        return (r for r in remote_files if os.path.join(self._input_dir, r.name) not in local_files)

    def mapreduce(self):
        # Find files matching specified input filter
        files = set(self.get_filtered_files(self._input_dir))
        remote_files = self.get_filtered_files_s3()

        # If we're using the cache dir as the data dir, we will end up reading
        # each already-downloaded file twice, so we should skip any remote files
        # that exist in the data dir.
        remote_files = self.dedupe_remotes(remote_files, files)

        # Partition files into reasonably equal groups for use by mappers
        print "Partitioning input data..."
        partitions = self.partition(files, remote_files)
        print "Done"

        if not any(part for part in partitions):
            print "Filter didn't match any files... nothing to do"
            return

        partitions = [part for part in partitions if part]

        # Not useful to have more mappers than partitions.
        if len(partitions) < self._num_mappers:
            print "Filter matched only %d input files. Reducing number of mappers accordingly." % (len(partitions),)
            self._num_mappers = len(partitions)

        # Free up our set of names. We want to minimize
        # our memory usage prior to forking map jobs.
        # files = None
        gc.collect()

        def checkExitCode(proc):
            # If process was terminated by a signal, exitcode is the negative signal value
            if proc.exitcode == -signal.SIGKILL:
                # SIGKILL is most likely an OOM kill
                raise MemoryError("%s ran out of memory" % proc.name)
            elif proc.exitcode:
                raise OSError("%s exited with code %d" % (proc.name, proc.exitcode))

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                p = Process(
                    target=Mapper,
                    name=("Mapper-%d" % i),
                    args=(
                        i,
                        self._profile,
                        partitions[i],
                        self._work_dir,
                        self._job_module,
                        self._num_reducers,
                        self._delete_data,
                        self._aws_key,
                        self._aws_secret_key,
                        self._bucket_name,
                    ),
                )
                mappers.append(p)
                p.start()
            else:
                print "Skipping mapper", i, "- no input files to process"
        for m in mappers:
            m.join()
            checkExitCode(m)

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(
                target=Reducer,
                name=("Reducer-%d" % i),
                args=(i, self._profile, self._work_dir, self._job_module, self._num_mappers),
            )
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()
            checkExitCode(r)

        # Reducers are done.  Output results.
        to_combine = 1
        try:
            os.rename(os.path.join(self._work_dir, "reducer_0"), self._output_file)
        except OSError, e:
            if e.errno != errno.EXDEV:
                raise
            else:
                # OSError: [Errno 18] Invalid cross-device link (EXDEV == 18)
                # We can't rename across devices :( Copy / delete instead.
                to_combine = 0

        # TODO: If _output_file ends with a compressed suffix (.gz, .xz, .bz2, etc),
        #       try to compress it after writing.
        if self._num_reducers > to_combine:
            with open(self._output_file, "a") as out:
                for i in range(to_combine, self._num_reducers):
                    # FIXME: this reads the entire reducer output into memory
                    reducer_filename = os.path.join(self._work_dir, "reducer_" + str(i))
                    with open(reducer_filename, "r") as reducer_output:
                        out.write(reducer_output.read())
                    os.remove(reducer_filename)

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print "Warning: Could not find", mfile
Example #20
class Job:
    """A class for orchestrating a Telemetry MapReduce job"""

    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        # Sanity check args.
        if config.get("num_mappers") <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.get("num_reducers") <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.get("data_dir")):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.get("work_dir")):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.get("job_script", "")):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.get("input_filter")):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.get("data_dir")
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.get("work_dir")
        self._input_filter = TelemetrySchema(
            json.load(open(config.get("input_filter"))))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.get("output")
        self._num_mappers = config.get("num_mappers")
        self._num_reducers = config.get("num_reducers")
        self._local_only = config.get("local_only")
        self._bucket_name = config.get("bucket")
        self._aws_key = config.get("aws_key")
        self._aws_secret_key = config.get("aws_secret_key")
        self._profile = config.get("profile")
        modulefd = open(config.get("job_script"))
        # let the job script import additional modules under its path
        sys.path.append(os.path.dirname(config.get("job_script")))
        ## Lifted from FileDriver.py in jydoop.
        self._job_module = imp.load_module("telemetry_job", modulefd,
                                           config.get("job_script"),
                                           ('.py', 'U', 1))

    def dump_stats(self, partitions):
        total = sum(partitions)
        avg = total / len(partitions)
        for i in range(len(partitions)):
            print "Partition %d contained %d (%+d)" % (
                i, partitions[i], float(partitions[i]) - avg)

    def fetch_remotes(self, remotes):
        # TODO: fetch remotes inside Mappers, and process each one as it becomes available.
        remote_names = [r["name"] for r in remotes if r["type"] == "remote"]

        # TODO: check cache first.
        result = 0
        if len(remote_names) == 0:
            return result

        fetch_cwd = os.path.join(self._work_dir, "cache")
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        loader = s3util.Loader(fetch_cwd,
                               self._bucket_name,
                               aws_key=self._aws_key,
                               aws_secret_key=self._aws_secret_key)
        start = datetime.now()
        downloaded_bytes = 0
        for local, remote, err in loader.get_list(remote_names):
            if err is None:
                print "Downloaded", remote
                downloaded_bytes += os.path.getsize(local)
            else:
                print "Failed to download", remote
                result += 1
        duration_sec = timer.delta_sec(start)
        downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
        print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (
            downloaded_mb, duration_sec, downloaded_mb / duration_sec)
        return result

    def dedupe_remotes(self, remote_files, local_files):
        return [
            r for r in remote_files
            if os.path.join(self._input_dir, r.name) not in local_files
        ]

    def mapreduce(self):
        # Find files matching specified input filter
        files = self.get_filtered_files(self._input_dir)
        remote_files = self.get_filtered_files_s3()

        # If we're using the cache dir as the data dir, we will end up reading
        # each already-downloaded file twice, so we should skip any remote files
        # that exist in the data dir.
        remote_files = self.dedupe_remotes(remote_files, files)

        file_count = len(files) + len(remote_files)

        if file_count == 0:
            print "Filter didn't match any files... nothing to do"
            return

        # Not useful to have more mappers than input files.
        if file_count < self._num_mappers:
            print "Filter matched only %s input files (%s local in %s and %s " \
                  "remote from %s). Reducing number of mappers accordingly." \
                  % (file_count, len(files), self._input_dir, len(remote_files),
                      self._bucket_name)
            self._num_mappers = file_count

        # Partition files into reasonably equal groups for use by mappers
        print "Partitioning input data..."
        partitions = self.partition(files, remote_files)
        print "Done"

        def checkExitCode(proc):
            # If process was terminated by a signal, exitcode is the negative signal value
            if proc.exitcode == -signal.SIGKILL:
                # SIGKILL is most likely an OOM kill
                raise MemoryError("%s ran out of memory" % proc.name)
            elif proc.exitcode:
                raise OSError("%s exited with code %d" %
                              (proc.name, proc.exitcode))

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                # Fetch the files we need for each mapper
                print "Fetching remotes for partition", i
                fetch_result = self.fetch_remotes(partitions[i])
                if fetch_result == 0:
                    print "Remote files fetched successfully"
                else:
                    print "ERROR: Failed to fetch", fetch_result, "files."
                    # TODO: Bail, since results will be unreliable?
                p = Process(target=Mapper,
                            name=("Mapper-%d" % i),
                            args=(i, self._profile, partitions[i],
                                  self._work_dir, self._job_module,
                                  self._num_reducers))
                mappers.append(p)
                p.start()
            else:
                print "Skipping mapper", i, "- no input files to process"
        for m in mappers:
            m.join()
            checkExitCode(m)

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(target=Reducer,
                        name=("Reducer-%d" % i),
                        args=(i, self._profile, self._work_dir,
                              self._job_module, self._num_mappers))
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()
            checkExitCode(r)

        # Reducers are done.  Output results.
        to_combine = 1
        try:
            os.rename(os.path.join(self._work_dir, "reducer_0"),
                      self._output_file)
        except OSError, e:
            if e.errno != errno.EXDEV:
                raise
            else:
                # OSError: [Errno 18] Invalid cross-device link (EXDEV == 18)
                # We can't rename across devices :( Copy / delete instead.
                to_combine = 0

        # TODO: If _output_file ends with a compressed suffix (.gz, .xz, .bz2, etc),
        #       try to compress it after writing.
        if self._num_reducers > to_combine:
            out = open(self._output_file, "a")
            for i in range(to_combine, self._num_reducers):
                # FIXME: this reads the entire reducer output into memory
                reducer_filename = os.path.join(self._work_dir,
                                                "reducer_" + str(i))
                reducer_output = open(reducer_filename, "r")
                out.write(reducer_output.read())
                reducer_output.close()
                os.remove(reducer_filename)
            out.close()

        # TODO: clean up downloaded files?

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print "Warning: Could not find", mfile
Example no. 21
class TestPersist(unittest.TestCase):
    def setUp(self):
        test_dir = self.get_test_dir()
        self.schema = TelemetrySchema(self.get_schema_spec())
        self.storage = StorageLayout(self.schema, test_dir, 10000)
        assert not os.path.exists(test_dir)
        os.makedirs(test_dir)

    def tearDown(self):
        shutil.rmtree(self.get_test_dir())


    def get_test_dir(self):
        return "/tmp/test_telemetry_persist"

    def get_schema_spec(self):
        return {
            "version": 1,
            "dimensions": [
                {
                    "field_name": "reason",
                    "allowed_values": ["r1","r2"]
                },
                {
                    "field_name": "appName",
                    "allowed_values": ["a1"]
                },
                {
                    "field_name": "appUpdateChannel",
                    "allowed_values": ["c1", "c2", "c3"]
                },
                {
                    "field_name": "appVersion",
                    "allowed_values": "*"
                },
                {
                    "field_name": "appBuildID",
                   "allowed_values": "*"
                },
                {
                    "field_name": "submission_date",
                    "allowed_values": {
                          "min": "20130101",
                          "max": "20131231"
                    }
                }
            ]
        }

    def test_write_filename(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        self.storage.write_filename("foo", '{"bar":"baz"}', test_file)
        test_file_md5, test_file_size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

        test_file = os.path.join(self.get_test_dir(), "test2.log")
        # Now test writing an object
        self.storage.write_filename("foo", {"bar":"baz"}, test_file)
        test_file_md5, test_file_size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_write(self):
        dims = ["r1", "a1", "c1", "v1", "b1", "20130102"]
        test_dir = self.get_test_dir()
        test_file = self.schema.get_filename(test_dir, dims)
        self.assertEquals(test_file, test_dir + "/r1/a1/c1/v1/b1.20130102.v1.log")

        self.storage.write("foo", '{"bar":"baz"}', dims)
        md5, size = fileutil.md5file(test_file)
        self.assertEqual(md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_clean_newlines(self):
        self.assertEqual(self.storage.clean_newlines("ab\n\ncd\r\n"), "ab  cd  ")

    def test_rotate(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        key = "01234567890123456789012345678901234567890123456789"
        value = '{"some filler stuff here":"fffffffffffffffffff"}'
        # each iteration should be 100 bytes.
        for i in range(99):
            result = self.storage.write_filename(key, value, test_file)
            self.assertEquals(result, test_file)

        # The 100th iteration should cause the file to rotate
        rolled = self.storage.write_filename(key, value, test_file)
        # rolled should be <test_dir>/test.log.<pid>.<timestamp><suffix>
        self.assertNotEqual(rolled, test_file)
        self.assertTrue(rolled.startswith(test_file))
        self.assertTrue(rolled.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX))
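For reference, the 100-byte figure in test_rotate works out as follows, assuming StorageLayout writes each record as the key, a tab separator, the value, and a trailing newline (an assumption about the on-disk format, not something shown in this listing):

key = "01234567890123456789012345678901234567890123456789"    # 50 bytes
value = '{"some filler stuff here":"fffffffffffffffffff"}'    # 48 bytes
record_size = len(key) + 1 + len(value) + 1                    # + tab + newline
print(record_size)        # 100
print(99 * record_size)   # 9900  -> still under the 10000-byte limit from setUp
print(100 * record_size)  # 10000 -> reaches the limit, so the 100th write rotates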
Example no. 22
def main():
    parser = argparse.ArgumentParser(
        description="Split raw logs into partitioned files.", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000
    )
    parser.add_argument("-i", "--input-file", help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    for len_path, len_data, timestamp, path, data, err in fileutil.unpack(args.input_file):
        record_count += 1
        if err:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX time
        # (i.e. seconds) first.
        submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(path, errors="replace")
        data = unicode(data, errors="replace")

        bytes_read += len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH
        # print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        # print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read,
        duration,
        mb_read / duration,
        bad_record_count,
        record_count,
    )
    return 0
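To exercise this splitter end to end without a shell wrapper, one can patch sys.argv before calling main(). The script name and file paths below are placeholders, not values from the original source:

import sys

sys.argv = [
    "split_raw_logs.py",                       # hypothetical script name
    "-i", "/data/incoming/telemetry.log",      # hypothetical input file
    "-o", "/data/partitioned",                 # hypothetical output dir
    "-t", "/etc/telemetry/telemetry_schema.json",
    "-m", "1000000000",                        # rotate after ~1 GB
]
exit_code = main()
print("main() returned %d" % exit_code)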
Example no. 23
class AnalysisJob:
    def __init__(self, cfg):
        self.job_bundle = cfg.job_bundle
        if cfg.input_filter:
            self.input_filter = TelemetrySchema(
                json.load(open(cfg.input_filter)))
        else:
            self.input_filter = None

        if cfg.input_list_file:
            self.input_list = cfg.input_list_file
        else:
            self.input_list = None

        self.job_id = str(uuid4())
        self.target_queue = cfg.target_queue
        self.aws_key = cfg.aws_key
        self.aws_secret_key = cfg.aws_secret_key
        self.input_bucket = "telemetry-published-v1"
        self.job_name = cfg.name
        self.job_owner = cfg.owner
        self.date_limit = cfg.date_limit

        # Bucket with intermediate data for this analysis job
        self.analysis_bucket = "jonasfj-telemetry-analysis"

        self.s3_code_path = "batch-jobs/" + self.job_id + ".tar.gz"

        # S3 region of operation
        self.aws_region = "us-west-2"
        self.task_size_limit = 400 * 1024 * 1024
        self.sqs_input_name = cfg.sqs_queue

    def get_filtered_files(self):
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        # date_limit is a hack that makes it easy to process everything
        # before a given date, e.g. to back-process all we have in the bucket.
        if self.date_limit != None:
            print "Limiting to files dated before " + self.date_limit
            for k, s in self.list_partitions(bucket):
                if k.split('/')[-1].split('.')[1] < self.date_limit:
                    yield (k, s)
        else:
            for k, s in self.list_partitions(bucket):
                yield (k, s)

    def get_filtered_files_old(self):
        """ Get tuples of name and size for all input files """
        # Setup some auxiliary functions
        allowed_values = self.input_filter.sanitize_allowed_values()
        nb_dims = len(allowed_values)

        def filter_includes(level, value):
            return self.input_filter.is_allowed(value, allowed_values[level])

        # Iterate over all files in the bucket. This is very slow; we should
        # be able to do something much smarter using prefix listing and
        # ordering to cut the listing short.
        count = 0
        selected = 0
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        for f in bucket.list():
            count += 1
            dims = self.input_filter.get_dimensions(".", f.key)
            include = True
            for i in xrange(nb_dims):
                if not filter_includes(i, dims[i]):
                    include = False
                    break
            if include:
                selected += 1
                yield (f.key, f.size)
            if count % 5000 == 0:
                print "%i files listed with %i selected" % (count, selected)
        conn.close()

    def list_partitions(self, bucket, prefix='', level=0):
        if self.input_filter:
            #print "Listing...", prefix, level
            allowed_values = self.input_filter.sanitize_allowed_values()
            delimiter = '/'
            if level > 3:
                delimiter = '.'
            for k in bucket.list(prefix=prefix, delimiter=delimiter):
                partitions = k.name.split("/")
                if level > 3:
                    # split the last couple of partition components by "." instead of "/"
                    partitions.extend(partitions.pop().split(".", 2))
                if self.input_filter.is_allowed(partitions[level],
                                                allowed_values[level]):
                    if level >= 5:
                        for f in bucket.list(prefix=k.name):
                            yield (f.key, f.size)
                    else:
                        for k, s in self.list_partitions(
                                bucket, k.name, level + 1):
                            yield (k, s)
        elif self.input_list:
            print "Using input list..."
            for line in self.input_list:
                key_name = line.strip()
                k = bucket.get_key(key_name)
                yield (k.key, k.size)
        else:
            print "Don't know how to list partitions without a filter or list :("
            raise ValueError("Missing both input_filter and input_list")

    def generate_tasks(self):
        """ Generates SQS tasks, we batch small files into a single task """
        taskid = 1
        taskfiles = []
        tasksize = 0
        total_size_of_all = 0
        for key, size in self.get_filtered_files():
            # If the task has reached the desired size, yield it.
            # Note: SQS messages are limited to 65 KiB (boto only uses
            # signature version 4, hence the 65 KiB limit), so for simplicity
            # we also cap each task at 200 filenames.
            if 0 < len(taskfiles) and (tasksize + size > self.task_size_limit
                                       or len(taskfiles) > 200):
                # Reduce to filenames only, sorted by size, smallest first:
                # they are faster to download when handling the job.
                taskfiles = [
                    f for f, s in sorted(taskfiles, key=lambda (f, s): s)
                ]
                yield {
                    'id': self.job_id + "/" + str(taskid),
                    'name': self.job_name,
                    'owner': self.job_owner,
                    'code': self.s3_code_path,
                    'target-queue': self.target_queue,
                    'files': taskfiles,
                    'size': tasksize
                }
                total_size_of_all += tasksize
                print "%i tasks created acc. size: %s" % (taskid,
                                                          total_size_of_all)
                taskid += 1
                taskfiles = []
                tasksize = 0
            tasksize += size
            taskfiles.append((key, size))
        if len(taskfiles) > 0:
            taskfiles = [f for f, s in sorted(taskfiles, key=lambda (f, s): s)]
            yield {
                'id': self.job_id + "/" + str(taskid),
                'name': self.job_name,
                'owner': self.job_owner,
                'code': self.s3_code_path,
                'target-queue': self.target_queue,
                'files': taskfiles,
                'size': tasksize
            }
        print "Finished:"
        print "%i tasks created total size: %s" % (taskid, total_size_of_all +
                                                   tasksize)

    def put_sqs_tasks(self):
        """ Create an SQS tasks for this analysis job """
        print "Populate SQS input queue with tasks"
        # Connect to SQS is desired region
        conn = sqs.connect_to_region(self.aws_region,
                                     aws_access_key_id=self.aws_key,
                                     aws_secret_access_key=self.aws_secret_key)
        # Get the input queue
        queue = conn.get_queue(self.sqs_input_name)
        queue.set_message_class(JSONMessage)
        # Populate queue with tasks
        for task in self.generate_tasks():
            #print "enqueueing", task["id"], "size:", task["size"]
            msg = queue.new_message(body=task)
            queue.write(msg)
        conn.close()

    def setup(self):
        self.upload_job_bundle()
        self.put_sqs_tasks()
        print "Uploaded with job_id: %s" % self.job_id

    def upload_job_bundle(self):
        """ Upload job bundle to S3 """
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.analysis_bucket)
        k = Key(bucket)
        k.key = self.s3_code_path
        k.set_contents_from_filename(self.job_bundle)
        conn.close()
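A minimal sketch of driving AnalysisJob directly, building the cfg object by hand rather than through whatever argument parser the surrounding tool uses. Every value below is a placeholder, and the field names simply mirror what __init__ reads from cfg:

import argparse

cfg = argparse.Namespace(
    job_bundle="/tmp/my-job.tar.gz",          # placeholder bundle path
    input_filter="/tmp/filter.json",          # TelemetrySchema-style filter spec
    input_list_file=None,
    target_queue="my-output-queue",           # placeholder queue name
    aws_key="<aws-key>",                      # placeholder credentials
    aws_secret_key="<aws-secret-key>",
    name="example-job",
    owner="someone@example.com",
    date_limit=None,
    sqs_queue="telemetry-analysis-input",     # placeholder SQS input queue
)

job = AnalysisJob(cfg)
job.setup()   # uploads the bundle to S3 and enqueues SQS tasks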
Example no. 24
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-i",
                        "--input-file",
                        help="Filename to read from",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Filename of telemetry schema spec",
                        required=True)
    parser.add_argument("-f",
                        "--file-version",
                        help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)
    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX time
        # (i.e. seconds) first.
        submission_date = date.fromtimestamp(r.timestamp /
                                             1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(r.path, errors="replace")
        data = unicode(r.data, errors="replace")

        bytes_read += r.len_ip + r.len_path + r.len_data + fileutil.RECORD_PREAMBLE_LENGTH[
            file_version]
        #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
Example no. 25
    def setUp(self):
        test_dir = self.get_test_dir()
        self.schema = TelemetrySchema(self.get_schema_spec())
        self.storage = StorageLayout(self.schema, test_dir, 10000)
        assert not os.path.exists(test_dir)
        os.makedirs(test_dir)
Example no. 26
def update_published_v4_files(sdb,
                              bucket,
                              bucket_prefix,
                              submission_date,
                              limit=None):
    conn = boto.connect_s3(host=S3_DEFAULT_ENDPOINT)
    metadata = conn.get_bucket(METADATA_BUCKET, validate=False)
    schema_key = metadata.get_key("{}/schema.json".format(bucket_prefix))
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))
    bucket = conn.get_bucket(bucket, validate=False)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)
    prefix = "{}/{}".format(
        bucket_prefix, submission_date) if submission_date else bucket_prefix

    print "Bucket: {} - Prefix: {} - Date: {}".format(bucket.name,
                                                      bucket_prefix,
                                                      submission_date)

    while not done:
        try:
            for key in bucket.list(marker=last_key, prefix=prefix):
                last_key = key.name

                if total_count % 1e5 == 0:
                    print("Looked at {} total records in {} seconds, added {}".
                          format(total_count, delta_sec(start_time),
                                 added_count))

                dims = schema.get_dimension_map(
                    schema.get_dimensions(".",
                                          key.name[len(bucket_prefix) + 1:],
                                          dirs_only=True))

                if (dims["submissionDate"] == submission_date
                    ) and dims["submissionDate"][:-2] in sdb:
                    batch.put(dims["submissionDate"][:-2], key.name, dims)
                    added_count += 1

                total_count += 1
                if total_count == limit:
                    done = True
                    break

        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue

        break

    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(
        added_count, total_count, delta_sec(start_time)))
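BatchPut itself is not shown in this listing. Judging only from how it is used here (constructed around sdb, put(domain_name, item_name, attributes), flush(), and the `... in sdb` membership checks), a minimal sketch might buffer items per SimpleDB domain and write them with boto's batch_put_attributes. This is an assumption about the interface, not the original class:

class BatchPut(object):
    # Buffers items per SimpleDB domain and writes them 25 at a time,
    # the per-request limit for BatchPutAttributes.
    def __init__(self, domains, batch_size=25):
        # 'domains' is assumed to map a domain name (e.g. "201402") to a
        # boto SimpleDB Domain object, matching the `... in sdb` checks above.
        self.domains = domains
        self.batch_size = batch_size
        self.pending = {}  # domain name -> {item_name: {attribute: value}}

    def put(self, domain_name, item_name, attributes):
        items = self.pending.setdefault(domain_name, {})
        items[item_name] = attributes
        if len(items) >= self.batch_size:
            self._write(domain_name, items)
            self.pending[domain_name] = {}

    def flush(self):
        for domain_name, items in self.pending.items():
            if items:
                self._write(domain_name, items)
        self.pending = {}

    def _write(self, domain_name, items):
        self.domains[domain_name].batch_put_attributes(items, replace=True)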
Example no. 27
def main():
    signal.signal(signal.SIGINT, handle_sigint)
    parser = argparse.ArgumentParser(
        description='Process incoming Telemetry data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-c",
                        "--config",
                        required=True,
                        type=file,
                        help="AWS Configuration file (json)")
    parser.add_argument("-w",
                        "--work-dir",
                        required=True,
                        help="Location to cache downloaded files")
    parser.add_argument("-o",
                        "--output-dir",
                        required=True,
                        help="Base dir to store processed data")
    parser.add_argument("-i",
                        "--input-files",
                        type=file,
                        help="File containing a list of keys to process")
    parser.add_argument("-b",
                        "--bad-data-log",
                        help="Save bad records to this file")
    parser.add_argument("-l", "--log-file", help="Log output to this file")
    parser.add_argument("-s",
                        "--stats-file",
                        help="Log statistics to this file")
    parser.add_argument("--histogram-cache-path",
                        default="./histogram_cache",
                        help="Path to store a local cache of histograms")
    parser.add_argument("-t",
                        "--telemetry-schema",
                        required=True,
                        help="Location of the desired telemetry schema")
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        type=int,
                        default=500000000,
                        help="Rotate output files after N bytes")
    parser.add_argument("-D",
                        "--dry-run",
                        action="store_true",
                        help="Don't modify remote files")
    parser.add_argument("-n",
                        "--no-clean",
                        action="store_true",
                        help="Don't clean out the output-dir before beginning")
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        help="Print more detailed output")
    args = parser.parse_args()

    if args.verbose:
        # Turn on mp logging
        multiprocessing.log_to_stderr(logging.DEBUG)

    config = json.load(args.config)
    # TODO: allow commandline args to override config values.

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()
    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    logger = Log(args.log_file, "Master")
    num_cpus = multiprocessing.cpu_count()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    s3downloader = None
    raw_readers = None
    compressors = None
    exporters = None
    done = False

    if args.no_clean:
        logger.log("Not removing log files in {}".format(args.output_dir))
    else:
        # Remove existing log files from output_dir (to clean up after an
        # incomplete previous run, for example).
        logger.log("Removing log files in {}".format(args.output_dir))
        for root, dirs, files in os.walk(args.output_dir):
            for f in files:
                if f.endswith(".log"):
                    full = os.path.join(root, f)
                    if args.dry_run:
                        logger.log("Would be deleting {}, except it's a " \
                                   "dry run".format(full))
                    else:
                        try:
                            logger.log("Removing existing file: " + full)
                            os.remove(full)
                        except Exception, e:
                            logger.log("Error removing existing " \
                                       " file {}: {}".format(full, e))
Example no. 28
def main():
    parser = argparse.ArgumentParser(
        description='Process incoming Telemetry data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("incoming_bucket",
                        help="The S3 bucket containing incoming files")
    parser.add_argument("publish_bucket",
                        help="The S3 bucket to save processed files")
    parser.add_argument("-k", "--aws-key", help="AWS Key", required=True)
    parser.add_argument("-s",
                        "--aws-secret-key",
                        help="AWS Secret Key",
                        required=True)
    parser.add_argument("-r",
                        "--aws-region",
                        help="AWS Region",
                        default="us-west-2")
    parser.add_argument("-w",
                        "--work-dir",
                        help="Location to cache downloaded files",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base dir to store processed data",
                        required=True)
    parser.add_argument("-i",
                        "--input-files",
                        help="File containing a list of keys to process",
                        type=file)
    parser.add_argument("-b",
                        "--bad-data-log",
                        help="Save bad records to this file")
    parser.add_argument("-q",
                        "--queue",
                        help="SQS Queue name to poll for incoming data")
    parser.add_argument("-c",
                        "--histogram-cache-path",
                        help="Path to store a local cache of histograms",
                        default="./histogram_cache")
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Location of the desired telemetry schema",
                        required=True)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-D",
                        "--dry-run",
                        help="Don't modify remote files",
                        action="store_true")
    parser.add_argument("-C",
                        "--skip-conversion",
                        help="Skip validation/conversion of payloads",
                        action="store_true")
    args = parser.parse_args()

    if not os.path.isfile(S3FUNNEL_PATH):
        print "ERROR: s3funnel not found at", S3FUNNEL_PATH
        print "You can get it from github: https://github.com/sstoiana/s3funnel"
        return -1

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()
    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    if args.skip_conversion:
        converter = None
    else:
        converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    num_cpus = multiprocessing.cpu_count()

    start = datetime.now()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    incoming_queue_messages = []

    if not args.dry_run:
        conn = S3Connection(args.aws_key, args.aws_secret_key)
        incoming_bucket = conn.get_bucket(args.incoming_bucket)

    incoming_filenames = []
    if args.queue is not None:
        print "Fetching file list from queue", args.queue
        if args.dry_run:
            print "Dry run mode... can't read from the queue without messing things up..."
        else:
            q_conn = boto.sqs.connect_to_region(
                args.aws_region,
                aws_access_key_id=args.aws_key,
                aws_secret_access_key=args.aws_secret_key)
            incoming_queue = q_conn.get_queue(args.queue)
            if incoming_queue is None:
                print "Error: could not get queue", args.queue
                return -2
            # Sometimes we don't get all the messages, even if more are
            # available, so keep trying until we have enough (or there aren't
            # any left)
            for i in range(num_cpus):
                messages = incoming_queue.get_messages(num_cpus -
                                                       len(incoming_filenames))
                for m in messages:
                    # TODO: Make sure this file exists in S3 first?
                    possible_filename = m.get_body()
                    key = incoming_bucket.get_key(possible_filename)
                    if key is None:
                        print "Could not find queued filename in bucket", args.incoming_bucket, ":", possible_filename
                        # try to delete it:
                        incoming_queue.delete_message(m)
                    else:
                        incoming_filenames.append(possible_filename)
                        incoming_queue_messages.append(m)
                if len(messages) == 0 or len(incoming_filenames) >= num_cpus:
                    break
    elif args.input_files:
        print "Fetching file list from file", args.input_files
        incoming_filenames = [l.strip() for l in args.input_files.readlines()]
    else:
        print "Fetching file list from S3..."
        for f in incoming_bucket.list():
            incoming_filenames.append(f.name)
    print "Done"

    if len(incoming_filenames) == 0:
        print "Nothing to do!"
        return 0

    for f in incoming_filenames:
        print "  ", f

    print "Verifying that we can write to", args.publish_bucket
    if args.dry_run:
        print "Dry run mode: don't care!"
    else:
        try:
            publish_bucket = conn.get_bucket(args.publish_bucket)
            print "Looks good!"
        except S3ResponseError:
            print "Bucket", args.publish_bucket, "not found.  Attempting to create it."
            publish_bucket = conn.create_bucket(args.publish_bucket)

    result = 0
    print "Downloading", len(incoming_filenames), "files..."
    if args.dry_run:
        print "Dry run mode: skipping download from S3"
    else:
        result = fetch_s3_files(incoming_filenames, args.work_dir,
                                incoming_bucket, args.aws_key,
                                args.aws_secret_key)

    if result != 0:
        print "Error downloading files. Return code of s3funnel was", result
        return result
    print "Done"

    after_download = datetime.now()

    local_filenames = [
        os.path.join(args.work_dir, f) for f in incoming_filenames
    ]

    # TODO: try a SimpleQueue
    raw_files = Queue()
    for l in local_filenames:
        raw_files.put(l)

    completed_files = Queue()
    compressed_files = Queue()

    # Begin reading raw input
    raw_readers = start_workers(
        num_cpus, "Reader", ReadRawStep, raw_files,
        (completed_files, schema, converter, storage, args.bad_data_log))

    # Tell readers when to stop:
    for i in range(num_cpus):
        raw_files.put(PipeStep.SENTINEL)

    # Compress completed files.
    compressors = start_workers(num_cpus, "Compressor", CompressCompletedStep,
                                completed_files, (compressed_files, ))

    # Export compressed files to S3.
    exporters = start_workers(
        num_cpus, "Exporter", ExportCompressedStep, compressed_files,
        (args.output_dir, args.aws_key, args.aws_secret_key,
         args.publish_bucket, args.dry_run))

    wait_for(raw_readers, "Raw Readers")

    # Find any leftover .log files under output_dir and add them to
    # completed_files so they get compressed and exported too.
    for root, dirs, files in os.walk(args.output_dir):
        for f in files:
            if f.endswith(".log"):
                completed_files.put(os.path.join(root, f))

    for i in range(num_cpus):
        completed_files.put(PipeStep.SENTINEL)

    wait_for(compressors, "Compressors")
    for i in range(num_cpus):
        compressed_files.put(PipeStep.SENTINEL)

    wait_for(exporters, "Exporters")

    print "Removing processed logs from S3..."
    for f in incoming_filenames:
        if args.dry_run:
            print "  Dry run, so not really deleting", f
        else:
            print "  Deleting", f
            incoming_bucket.delete_key(f)
            # Delete file locally too.
            os.remove(os.path.join(args.work_dir, f))
    print "Done"

    if len(incoming_queue_messages) > 0:
        print "Removing processed messages from SQS..."
        for m in incoming_queue_messages:
            if args.dry_run:
                print "  Dry run, so not really deleting", m.get_body()
            else:
                print "  Deleting", m.get_body()
                if incoming_queue.delete_message(m):
                    print "  Message deleted successfully"
                else:
                    print "  Failed to delete message :("
        print "Done"

    duration = timer.delta_sec(start)
    print "All done in %.2fs (%.2fs excluding download time)" % (
        duration, timer.delta_sec(after_download))
    return 0
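The read/compress/export stages above shut down by pushing one PipeStep.SENTINEL per worker onto each queue. Stripped of the telemetry specifics, the coordination pattern looks roughly like this (a generic sketch, not the original PipeStep/start_workers implementation):

from multiprocessing import Process, Queue

SENTINEL = None  # stand-in for PipeStep.SENTINEL

def worker(work_queue, result_queue):
    # Pull items until the sentinel arrives, then exit cleanly.
    while True:
        item = work_queue.get()
        if item is SENTINEL:
            break
        result_queue.put(item * 2)  # placeholder for real work

if __name__ == "__main__":
    work_queue, result_queue = Queue(), Queue()
    workers = [Process(target=worker, args=(work_queue, result_queue))
               for _ in range(4)]
    for w in workers:
        w.start()
    for item in range(10):
        work_queue.put(item)
    for _ in workers:                 # one sentinel per worker
        work_queue.put(SENTINEL)
    results = [result_queue.get() for _ in range(10)]  # drain before joining
    for w in workers:
        w.join()
    print(sorted(results))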
Example no. 29
class AnalysisJob:
    def __init__(self, cfg):
        self.job_bundle = cfg.job_bundle
        if cfg.input_filter:
            self.input_filter = TelemetrySchema(json.load(open(cfg.input_filter)))
        else:
            self.input_filter = None

        if cfg.input_list_file:
            self.input_list = cfg.input_list_file
        else:
            self.input_list = None

        self.job_id = str(uuid4())
        self.target_queue = cfg.target_queue
        self.aws_key = cfg.aws_key
        self.aws_secret_key = cfg.aws_secret_key
        self.input_bucket = "telemetry-published-v1"
        self.job_name = cfg.name
        self.job_owner = cfg.owner
        self.date_limit = cfg.date_limit

        # Bucket with intermediate data for this analysis job
        self.analysis_bucket = "jonasfj-telemetry-analysis"

        self.s3_code_path = "batch-jobs/" + self.job_id + ".tar.gz"

        # S3 region of operation
        self.aws_region = "us-west-2"
        self.task_size_limit = 400 * 1024 * 1024
        self.sqs_input_name = cfg.sqs_queue


    def get_filtered_files(self):
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        # date_limit is a hack that makes it easy to process everything
        # before a given date, e.g. to back-process all we have in the bucket.
        if self.date_limit != None:
            print "Limiting to files dated before " + self.date_limit
            for k,s in self.list_partitions(bucket):
                if k.split('/')[-1].split('.')[1] < self.date_limit:
                    yield (k, s)
        else:
            for k,s in self.list_partitions(bucket):
                yield (k, s)

    def get_filtered_files_old(self):
        """ Get tuples of name and size for all input files """
        # Setup some auxiliary functions
        allowed_values = self.input_filter.sanitize_allowed_values()
        nb_dims = len(allowed_values)
        def filter_includes(level, value):
            return self.input_filter.is_allowed(value, allowed_values[level])

        # Iterate over all files in the bucket. This is very slow; we should
        # be able to do something much smarter using prefix listing and
        # ordering to cut the listing short.
        count = 0
        selected = 0
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        for f in bucket.list():
            count += 1
            dims = self.input_filter.get_dimensions(".", f.key)
            include = True
            for i in xrange(nb_dims):
                if not filter_includes(i, dims[i]):
                    include = False
                    break
            if include:
                selected += 1
                yield (f.key, f.size)
            if count % 5000 == 0:
                print "%i files listed with %i selected" % (count, selected)
        conn.close()

    def list_partitions(self, bucket, prefix='', level=0):
        if self.input_filter:
            #print "Listing...", prefix, level
            allowed_values = self.input_filter.sanitize_allowed_values()
            delimiter = '/'
            if level > 3:
                delimiter = '.'
            for k in bucket.list(prefix=prefix, delimiter=delimiter):
                partitions = k.name.split("/")
                if level > 3:
                    # split the last couple of partition components by "." instead of "/"
                    partitions.extend(partitions.pop().split(".", 2))
                if self.input_filter.is_allowed(partitions[level], allowed_values[level]):
                    if level >= 5:
                        for f in bucket.list(prefix=k.name):
                            yield (f.key, f.size)
                    else:
                        for k, s in self.list_partitions(bucket, k.name, level + 1):
                            yield (k, s)
        elif self.input_list:
            print "Using input list..."
            for line in self.input_list:
                key_name = line.strip()
                k = bucket.get_key(key_name)
                yield (k.key, k.size)
        else:
            print "Don't know how to list partitions without a filter or list :("
            raise ValueError("Missing both input_filter and input_list")

    def generate_tasks(self):
        """ Generates SQS tasks, we batch small files into a single task """
        taskid = 1
        taskfiles = []
        tasksize = 0
        total_size_of_all = 0
        for key, size in self.get_filtered_files():
            # If the task has reached the desired size, yield it.
            # Note: SQS messages are limited to 65 KiB (boto only uses
            # signature version 4, hence the 65 KiB limit), so for simplicity
            # we also cap each task at 200 filenames.
            if 0 < len(taskfiles) and (tasksize + size > self.task_size_limit or
                                       len(taskfiles) > 200):
                # Reduce to filenames only, sorted by size, smallest first:
                # they are faster to download when handling the job.
                taskfiles = [f for f, s in sorted(taskfiles, key=lambda (f, s): s)]
                yield {
                    'id':               self.job_id + "/" +  str(taskid),
                    'name':             self.job_name,
                    'owner':            self.job_owner,
                    'code':             self.s3_code_path,
                    'target-queue':     self.target_queue,
                    'files':            taskfiles,
                    'size':             tasksize
                }
                total_size_of_all += tasksize
                print "%i tasks created acc. size: %s" % (taskid, total_size_of_all)
                taskid += 1
                taskfiles = []
                tasksize = 0
            tasksize += size
            taskfiles.append((key, size))
        if len(taskfiles) > 0:
            taskfiles =  [f for f,s in sorted(taskfiles, key=lambda (f,s): s)]
            yield {
                'id':               self.job_id + "/" + str(taskid),
                'name':             self.job_name,
                'owner':            self.job_owner,
                'code':             self.s3_code_path,
                'target-queue':     self.target_queue,
                'files':            taskfiles,
                'size':             tasksize
            }
        print "Finished:"
        print "%i tasks created total size: %s" % (taskid, total_size_of_all + tasksize)

    def put_sqs_tasks(self):
        """ Create an SQS tasks for this analysis job """
        print "Populate SQS input queue with tasks"
        # Connect to SQS is desired region
        conn = sqs.connect_to_region(
           self.aws_region,
           aws_access_key_id = self.aws_key,
           aws_secret_access_key = self.aws_secret_key
        )
        # Get the input queue
        queue = conn.get_queue(self.sqs_input_name)
        queue.set_message_class(JSONMessage)
        # Populate queue with tasks
        for task in self.generate_tasks():
            #print "enqueueing", task["id"], "size:", task["size"]
            msg = queue.new_message(body = task)
            queue.write(msg)
        conn.close()

    def setup(self):
        self.upload_job_bundle()
        self.put_sqs_tasks()
        print "Uploaded with job_id: %s" % self.job_id

    def upload_job_bundle(self):
        """ Upload job bundle to S3 """
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.analysis_bucket)
        k = Key(bucket)
        k.key = self.s3_code_path
        k.set_contents_from_filename(self.job_bundle)
        conn.close()