Ejemplo n.º 1
0
def test_v4execschema():
    """Check list_heka_partitions against a schema that caps submissionDate.

    The schema holds one "submissionDate" dimension whose allowed values
    have a "max" of "20150901". Three listings are verified: one without a
    prefix, one with a single-segment prefix, and one with a multi-segment
    prefix. Each must produce exactly the three expected key names.
    """
    date_dimension = {
        "field_name": "submissionDate",
        "allowed_values": {"max": "20150901"},
    }
    schema = TelemetrySchema({"version": 2, "dimensions": [date_dimension]})

    # Listing without a prefix argument.
    plain_names = {
        key.name
        for key in s3util.list_heka_partitions(v4execbucket, schema=schema)
    }
    assert plain_names == {
        "20150901/20150901221519.541_ip-172-31-16-184",
        "20150901/20150901223019.579_ip-172-31-16-184",
        "20150901/20150901224519.623_ip-172-31-16-184",
    }

    # Test with a prefix:
    prefixed_names = {
        key.name
        for key in s3util.list_heka_partitions(
            v4prefixbucket,
            prefix="telemetry-executive-summary-2",
            schema=schema,
        )
    }
    assert prefixed_names == {
        "telemetry-executive-summary-2/20150901/20150901221519.541_ip-172-31-16-184",
        "telemetry-executive-summary-2/20150901/20150901223019.579_ip-172-31-16-184",
        "telemetry-executive-summary-2/20150901/20150901224519.623_ip-172-31-16-184",
    }

    # Test with a bunch of prefixes:
    nested_names = {
        key.name
        for key in s3util.list_heka_partitions(
            multiprefixbucket, prefix="a/b/c/d", schema=schema
        )
    }
    assert nested_names == {
        "a/b/c/d/20150901/20150901221519.541_ip-172-31-16-184",
        "a/b/c/d/20150901/20150901223019.579_ip-172-31-16-184",
        "a/b/c/d/20150901/20150901224519.623_ip-172-31-16-184",
    }
Ejemplo n.º 2
0
def test_v4execschema():
    """Exercise list_heka_partitions with a "max" bound on submissionDate.

    The same schema is applied to three buckets: one listed without a
    prefix, one with a single prefix, and one with a nested prefix. Every
    listing must contain three distinct names, including each expected one.
    """
    schema = TelemetrySchema({
        "version": 2,
        "dimensions": [
            {
                "field_name": "submissionDate",
                "allowed_values": {"max": "20150901"},
            },
        ],
    })

    def check_listing(bucket, expected_names, **list_args):
        # Collect the distinct names for one listing and verify them.
        seen = set()
        for entry in s3util.list_heka_partitions(
                bucket, schema=schema, **list_args):
            seen.add(entry.name)
        assert len(seen) == 3
        for name in expected_names:
            assert name in seen

    check_listing(
        v4execbucket,
        (
            "20150901/20150901221519.541_ip-172-31-16-184",
            "20150901/20150901223019.579_ip-172-31-16-184",
            "20150901/20150901224519.623_ip-172-31-16-184",
        ),
    )

    # Test with a prefix:
    check_listing(
        v4prefixbucket,
        (
            "telemetry-executive-summary-2/20150901/20150901221519.541_ip-172-31-16-184",
            "telemetry-executive-summary-2/20150901/20150901223019.579_ip-172-31-16-184",
            "telemetry-executive-summary-2/20150901/20150901224519.623_ip-172-31-16-184",
        ),
        prefix="telemetry-executive-summary-2",
    )

    # Test with a bunch of prefixes:
    check_listing(
        multiprefixbucket,
        (
            "a/b/c/d/20150901/20150901221519.541_ip-172-31-16-184",
            "a/b/c/d/20150901/20150901223019.579_ip-172-31-16-184",
            "a/b/c/d/20150901/20150901224519.623_ip-172-31-16-184",
        ),
        prefix="a/b/c/d",
    )
Ejemplo n.º 3
0
def test_v4schema():
    """Check list_heka_partitions against a fully specified v4 schema.

    Eight dimensions are declared, from submissionDate through appBuildId.
    The listing of ``v4bucket`` must produce exactly the three expected key
    names.
    """
    # (field_name, allowed_values) pairs, in schema order.
    dimension_table = [
        ("submissionDate", "20150903"),
        ("sourceName", "*"),
        ("sourceVersion", "4"),
        ("docType", ["saved-session"]),
        ("appName", ["Firefox"]),
        ("appUpdateChannel", ["release"]),
        ("appVersion", "24.0"),
        ("appBuildId", "20130910160258"),
    ]
    schema = TelemetrySchema({
        "version": 2,
        "dimensions": [
            {"field_name": field, "allowed_values": allowed}
            for field, allowed in dimension_table
        ],
    })

    names = {
        key.name for key in s3util.list_heka_partitions(v4bucket, schema=schema)
    }

    # NOTE(review): the schema allows docType "saved-session" while the
    # expected keys contain "saved_session" -- presumably the lister
    # normalizes the value; confirm against s3util.
    assert names == {
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051633.482_ip-172-31-16-184",
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051644.482_ip-172-31-16-184",
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051655.482_ip-172-31-16-184",
    }
Ejemplo n.º 4
0
def test_v4schema():
    """Verify the partition listing for a schema naming all eight dimensions.

    After listing ``v4bucket`` with the schema, the set of distinct key
    names must have size three and include each of the expected names.
    """

    def dimension(field_name, allowed_values):
        # Build one schema dimension entry.
        return {"field_name": field_name, "allowed_values": allowed_values}

    schema = TelemetrySchema({
        "version": 2,
        "dimensions": [
            dimension("submissionDate", "20150903"),
            dimension("sourceName", "*"),
            dimension("sourceVersion", "4"),
            dimension("docType", ["saved-session"]),
            dimension("appName", ["Firefox"]),
            dimension("appUpdateChannel", ["release"]),
            dimension("appVersion", "24.0"),
            dimension("appBuildId", "20130910160258"),
        ],
    })

    listed = set()
    listed.update(
        entry.name
        for entry in s3util.list_heka_partitions(v4bucket, schema=schema))

    wanted = {
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051633.482_ip-172-31-16-184",
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051644.482_ip-172-31-16-184",
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051655.482_ip-172-31-16-184",
    }
    assert len(listed) == 3
    assert wanted.issubset(listed)
Ejemplo n.º 5
0
 def get_filtered_files_s3(self):
     """Yield the S3 items selected by the input filter, printing progress.

     Yields nothing when ``self._local_only`` is true. Otherwise connects
     to S3, using ``self._aws_key`` / ``self._aws_secret_key`` when a key
     is set and ``S3Connection()`` with no arguments when it is not. It
     then yields every item that ``s3util.list_heka_partitions`` produces
     for the bucket named ``self._bucket_name``, filtered by
     ``self._input_filter``.

     The connection is closed in a ``finally`` clause, so it is released
     even if the caller stops iterating early or the listing raises. The
     timing summary is printed only after a complete listing.
     """
     if self._local_only:
         return

     # Single-argument print(...) produces the same text on Python 2 and 3.
     print("Fetching file list from S3...")
     # Plain boto should be fast enough to list bucket contents.
     if self._aws_key is not None:
         conn = S3Connection(self._aws_key, self._aws_secret_key)
     else:
         conn = S3Connection()
     try:
         bucket = conn.get_bucket(self._bucket_name)
         start = datetime.now()
         count = 0
         # Filter input files by partition. If the filter is reasonably
         # selective, this can be much faster than listing all files in the
         # bucket.
         for f in s3util.list_heka_partitions(bucket, schema=self._input_filter):
             count += 1
             # Report the first item, then every thousandth.
             if count == 1 or count % 1000 == 0:
                 print("Listed %s so far" % count)
             yield f
     finally:
         conn.close()
     duration = timer.delta_sec(start)
     print("Listed %s files in %s seconds" % (count, duration))
Ejemplo n.º 6
0
 def get_filtered_files_s3(self):
     """Generate the partition-filtered file listing from S3.

     When ``self._local_only`` is true the generator finishes without
     yielding anything. Otherwise it opens an ``S3Connection``, passing
     ``self._aws_key`` and ``self._aws_secret_key`` if a key is set and no
     arguments if it is not. It looks up ``self._bucket_name`` and yields
     each item returned by ``s3util.list_heka_partitions`` with
     ``self._input_filter`` as the schema.

     Progress is printed for the first item and for every thousandth one.
     The connection is always closed, including when iteration is
     abandoned early or an error occurs. The final count and duration are
     printed only when the listing runs to completion.
     """
     if self._local_only:
         return

     # Parenthesized single-string prints are valid on Python 2 and 3.
     print("Fetching file list from S3...")
     # Plain boto should be fast enough to list bucket contents.
     if self._aws_key is not None:
         conn = S3Connection(self._aws_key, self._aws_secret_key)
     else:
         conn = S3Connection()
     try:
         bucket = conn.get_bucket(self._bucket_name)
         start = datetime.now()
         count = 0
         # Filter input files by partition. If the filter is reasonably
         # selective, this can be much faster than listing all files in the
         # bucket.
         for f in s3util.list_heka_partitions(bucket, schema=self._input_filter):
             count += 1
             if count == 1 or count % 1000 == 0:
                 print("Listed %s so far" % count)
             yield f
     finally:
         # Runs on normal completion, on error, and on generator close.
         conn.close()
     duration = timer.delta_sec(start)
     print("Listed %s files in %s seconds" % (count, duration))
Ejemplo n.º 7
0
def _list_s3_filenames(bucket, prefix, schema):
    """Return the ``name`` of each item listed by ``s3u.list_heka_partitions``.

    ``bucket`` and ``prefix`` are forwarded positionally and ``schema`` by
    keyword. The result is a list in the order the items are produced.
    """
    names = []
    for key in s3u.list_heka_partitions(bucket, prefix, schema=schema):
        names.append(key.name)
    return names
Ejemplo n.º 8
0
def _list_s3_filenames(bucket, prefix, schema):
    """Collect the names of the partitions listed for ``bucket``.

    Calls ``s3u.list_heka_partitions(bucket, prefix, schema=schema)`` and
    returns a list holding the ``name`` attribute of every item, in
    listing order.
    """
    partitions = s3u.list_heka_partitions(bucket, prefix, schema=schema)
    return [entry.name for entry in partitions]