Example #1
    def __init__(self, cfg):
        self.job_bundle = cfg.job_bundle
        if cfg.input_filter:
            self.input_filter = TelemetrySchema(
                json.load(open(cfg.input_filter)))
        else:
            self.input_filter = None

        if cfg.input_list_file:
            self.input_list = cfg.input_list_file
        else:
            self.input_list = None

        self.job_id = str(uuid4())
        self.target_queue = cfg.target_queue
        self.aws_key = cfg.aws_key
        self.aws_secret_key = cfg.aws_secret_key
        self.input_bucket = "telemetry-published-v1"
        self.job_name = cfg.name
        self.job_owner = cfg.owner
        self.date_limit = cfg.date_limit

        # Bucket with intermediate data for this analysis job
        self.analysis_bucket = "jonasfj-telemetry-analysis"

        self.s3_code_path = "batch-jobs/" + self.job_id + ".tar.gz"

        # S3 region of operation
        self.aws_region = "us-west-2"
        self.task_size_limit = 400 * 1024 * 1024
        self.sqs_input_name = cfg.sqs_queue
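
A note on usage: the constructor above only reads plain attributes off cfg, so any simple attribute container will do. Below is a minimal sketch of such an object; the attribute names come from the code above, but every value is a made-up placeholder rather than anything from the original project.

# Hypothetical cfg object for the constructor above; all values are placeholders.
from argparse import Namespace

cfg = Namespace(
    job_bundle="my-analysis.tar.gz",  # code bundle to upload
    input_filter=None,                # or a path to a TelemetrySchema JSON file
    input_list_file=None,             # or a path to a file listing input keys
    target_queue="target-queue-name",
    aws_key="...",                    # placeholder credentials
    aws_secret_key="...",
    name="example-job",
    owner="someone@example.com",
    date_limit=None,
    sqs_queue="input-queue-name",
)
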
Example #2
def _filter_to_schema(schema, filter_args):
    new_schema = {"version": 1, "dimensions": []}
    for i, dim in enumerate(schema["dimensions"]):
        new_filter = {
            "field_name": dim.get("field_name", "field{}".format(i)),
            "allowed_values": "*"
        }
        if dim["field_name"] in filter_args:
            new_filter["allowed_values"] = filter_args[dim["field_name"]]
        new_schema["dimensions"].append(new_filter)
    return TelemetrySchema(new_schema)
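
The helper above copies every dimension of an existing schema and narrows allowed_values only for the fields named in filter_args, leaving everything else as "*". A hypothetical call might look like this (the schema dict and filter values are invented for illustration):

# Narrow appName to Firefox while keeping every other dimension wide open.
base_schema = {
    "version": 1,
    "dimensions": [
        {"field_name": "submissionDate", "allowed_values": "*"},
        {"field_name": "appName", "allowed_values": "*"},
    ]
}
filtered = _filter_to_schema(base_schema, {"appName": ["Firefox"]})
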
Example #3
def test_v4execschema():
    schema_spec = {
        "version":
        2,
        "dimensions": [{
            "field_name": "submissionDate",
            "allowed_values": {
                "max": "20150901"
            }
        }]
    }
    schema = TelemetrySchema(schema_spec)

    found = set()
    for f in s3util.list_heka_partitions(v4execbucket, schema=schema):
        found.add(f.name)

    assert (len(found) == 3)
    assert ("20150901/20150901221519.541_ip-172-31-16-184" in found)
    assert ("20150901/20150901223019.579_ip-172-31-16-184" in found)
    assert ("20150901/20150901224519.623_ip-172-31-16-184" in found)

    # Test with a prefix:
    found = set()
    for f in s3util.list_heka_partitions(
            v4prefixbucket, prefix="telemetry-executive-summary-2",
            schema=schema):
        found.add(f.name)

    assert (len(found) == 3)
    assert (
        "telemetry-executive-summary-2/20150901/20150901221519.541_ip-172-31-16-184"
        in found)
    assert (
        "telemetry-executive-summary-2/20150901/20150901223019.579_ip-172-31-16-184"
        in found)
    assert (
        "telemetry-executive-summary-2/20150901/20150901224519.623_ip-172-31-16-184"
        in found)

    # Test with a bunch of prefixes:
    found = set()
    for f in s3util.list_heka_partitions(multiprefixbucket,
                                         prefix="a/b/c/d",
                                         schema=schema):
        found.add(f.name)

    assert (len(found) == 3)
    assert ("a/b/c/d/20150901/20150901221519.541_ip-172-31-16-184" in found)
    assert ("a/b/c/d/20150901/20150901223019.579_ip-172-31-16-184" in found)
    assert ("a/b/c/d/20150901/20150901224519.623_ip-172-31-16-184" in found)
Example #4
def main():
    parser = ArgumentParser(
        description='Convert local Telemetry pings to server storage structure'
    )
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--schema",
                        type=file,
                        default='./telemetry/telemetry_schema.json')
    parser.add_argument("--histogram-cache-dir",
                        default='/tmp/telemetry_histogram_cache')
    args = parser.parse_args()

    print "Getting pings from", args.input_dir, "converting them and storing them in", args.output_dir
    schema = TelemetrySchema(json.load(args.schema))
    cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org')
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, 500000000)
    ping_dir = args.input_dir
    ping_files = get_pings(ping_dir)
    if len(ping_files) == 0:
        # Try the usual ping dir (if the user just gave the Profile Dir)
        ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings")
        ping_files = get_pings(ping_dir)

    print "found", len(ping_files), "pings"
    for ping_file in ping_files:
        with open(os.path.join(ping_dir, ping_file), "r") as f:
            ping = json.load(f)
            reason = ping['reason']
            key = ping['slug']
            payload = ping['payload']
            submission_date = date.today().strftime("%Y%m%d")
            dims = schema.dimensions_from(payload, submission_date)
            try:
                parsed_data, dims = converter.convert_obj(payload, dims[-1])
                serialized_data = converter.serialize(parsed_data)
                data_version = Converter.VERSION_CONVERTED
                try:
                    # Write to persistent storage
                    n = storage.write(key, serialized_data, dims, data_version)
                    print "Successfully saved ping", key, "to", n
                except Exception, e:
                    traceback.print_exc()
            except BadPayloadError, e:
                print "Bad Payload:", e.msg
            except Exception, e:
                traceback.print_exc()
Example #5
def test_v4schema():
    schema_spec = {
        "version":
        2,
        "dimensions": [{
            "field_name": "submissionDate",
            "allowed_values": "20150903"
        }, {
            "field_name": "sourceName",
            "allowed_values": "*"
        }, {
            "field_name": "sourceVersion",
            "allowed_values": "4"
        }, {
            "field_name": "docType",
            "allowed_values": ["saved-session"]
        }, {
            "field_name": "appName",
            "allowed_values": ["Firefox"]
        }, {
            "field_name": "appUpdateChannel",
            "allowed_values": ["release"]
        }, {
            "field_name": "appVersion",
            "allowed_values": "24.0"
        }, {
            "field_name": "appBuildId",
            "allowed_values": "20130910160258"
        }]
    }
    schema = TelemetrySchema(schema_spec)

    found = set()
    for f in s3util.list_heka_partitions(v4bucket, schema=schema):
        found.add(f.name)

    assert (len(found) == 3)
    assert (
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051633.482_ip-172-31-16-184"
        in found)
    assert (
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051644.482_ip-172-31-16-184"
        in found)
    assert (
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051655.482_ip-172-31-16-184"
        in found)
Example #6
def test_schema(d):
    schema_spec = {
        "version":
        1,
        "dimensions": [{
            "field_name": "reason",
            "allowed_values": ["saved-session"]
        }, {
            "field_name": "appName",
            "allowed_values": ["Firefox"]
        }, {
            "field_name": "appUpdateChannel",
            "allowed_values": ["nightly"]
        }, {
            "field_name": "appVersion",
            "allowed_values": ["27.0a1"]
        }, {
            "field_name": "appBuildID",
            "allowed_values": ["20130918030202"]
        }, {
            "field_name": "submission_date",
            "allowed_values": ["20131001"]
        }]
    }

    schema = TelemetrySchema(schema_spec)

    successfully_downloaded = []
    failfully_downloaded = []
    for f, r, err in d.get_schema(schema):
        if err is not None:
            print err
            failfully_downloaded.append(f)
        else:
            print "Downloaded", f
            successfully_downloaded.append(f)
    assert len(failfully_downloaded) == 0
    print "Successfully downloaded", len(successfully_downloaded)
    assert len(successfully_downloaded) == 20
Example #7
def test_v2schema():
    schema_spec = {
        "version":
        1,
        "dimensions": [{
            "field_name": "reason",
            "allowed_values": ["saved-session"]
        }, {
            "field_name": "appName",
            "allowed_values": ["Firefox"]
        }, {
            "field_name": "appUpdateChannel",
            "allowed_values": ["release"]
        }, {
            "field_name": "appVersion",
            "allowed_values": ["24.0"]
        }, {
            "field_name": "appBuildID",
            "allowed_values": ["20130910160258"]
        }, {
            "field_name": "submission_date",
            "allowed_values": ["20131003", "20131004"]
        }]
    }

    schema = TelemetrySchema(schema_spec)

    found = set()
    for f in s3util.list_partitions(v2bucket, schema=schema,
                                    include_keys=True):
        found.add(f.name)

    assert (len(found) == 2)
    assert (
        "saved_session/Firefox/release/24.0/20130910160258.20131003.v2.log.25b53e7042c74188b08d71ce32e87237.lzma"
        in found)
    assert (
        "saved_session/Firefox/release/24.0/20130910160258.20131004.v2.log.29afd7a250154729bd53c20253f8af78.lzma"
        in found)
Example #8
    def __init__(self, config):
        # Sanity check args.
        if config.get("num_mappers") <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.get("num_reducers") <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.get("data_dir")):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.get("work_dir")):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.get("job_script", "")):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.get("input_filter")):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.get("data_dir")
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.get("work_dir")
        with open(config.get("input_filter")) as filter_file:
            self._input_filter = TelemetrySchema(json.load(filter_file))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.get("output")
        self._num_mappers = config.get("num_mappers")
        self._num_reducers = config.get("num_reducers")
        self._local_only = config.get("local_only")
        self._bucket_name = config.get("bucket")
        self._aws_key = config.get("aws_key")
        self._aws_secret_key = config.get("aws_secret_key")
        self._profile = config.get("profile")
        self._delete_data = config.get("delete_data")
        with open(config.get("job_script")) as modulefd:
            # let the job script import additional modules under its path
            sys.path.append(os.path.dirname(config.get("job_script")))
            ## Lifted from FileDriver.py in jydoop.
            self._job_module = imp.load_module("telemetry_job", modulefd,
                                               config.get("job_script"),
                                               ('.py', 'U', 1))
Example #9
def update_published_v4_files(sdb,
                              bucket,
                              bucket_prefix,
                              submission_date,
                              limit=None):
    conn = boto.connect_s3(host=S3_DEFAULT_ENDPOINT)
    metadata = conn.get_bucket(METADATA_BUCKET, validate=False)
    schema_key = metadata.get_key("{}/schema.json".format(bucket_prefix))
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))
    bucket = conn.get_bucket(bucket, validate=False)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)
    prefix = "{}/{}".format(
        bucket_prefix, submission_date) if submission_date else bucket_prefix

    print "Bucket: {} - Prefix: {} - Date: {}".format(bucket.name,
                                                      bucket_prefix,
                                                      submission_date)

    while not done:
        try:
            for key in bucket.list(marker=last_key, prefix=prefix):
                last_key = key.name

                if total_count % 1e5 == 0:
                    print("Looked at {} total records in {} seconds, added {}".
                          format(total_count, delta_sec(start_time),
                                 added_count))

                dims = schema.get_dimension_map(
                    schema.get_dimensions(".",
                                          key.name[len(bucket_prefix) + 1:],
                                          dirs_only=True))

                if (dims["submissionDate"] == submission_date
                    ) and dims["submissionDate"][:-2] in sdb:
                    batch.put(dims["submissionDate"][:-2], key.name, dims)
                    added_count += 1

                total_count += 1
                if total_count == limit:
                    done = True
                    break

        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue

        break

    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(
        added_count, total_count, delta_sec(start_time)))
Example #10
def main():
    signal.signal(signal.SIGINT, handle_sigint)
    parser = argparse.ArgumentParser(
        description='Process incoming Telemetry data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-c",
                        "--config",
                        required=True,
                        type=file,
                        help="AWS Configuration file (json)")
    parser.add_argument("-w",
                        "--work-dir",
                        required=True,
                        help="Location to cache downloaded files")
    parser.add_argument("-o",
                        "--output-dir",
                        required=True,
                        help="Base dir to store processed data")
    parser.add_argument("-i",
                        "--input-files",
                        type=file,
                        help="File containing a list of keys to process")
    parser.add_argument("-b",
                        "--bad-data-log",
                        help="Save bad records to this file")
    parser.add_argument("-l", "--log-file", help="Log output to this file")
    parser.add_argument("-s",
                        "--stats-file",
                        help="Log statistics to this file")
    parser.add_argument("--histogram-cache-path",
                        default="./histogram_cache",
                        help="Path to store a local cache of histograms")
    parser.add_argument("-t",
                        "--telemetry-schema",
                        required=True,
                        help="Location of the desired telemetry schema")
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        type=int,
                        default=500000000,
                        help="Rotate output files after N bytes")
    parser.add_argument("-D",
                        "--dry-run",
                        action="store_true",
                        help="Don't modify remote files")
    parser.add_argument("-n",
                        "--no-clean",
                        action="store_true",
                        help="Don't clean out the output-dir before beginning")
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        help="Print more detailed output")
    args = parser.parse_args()

    if args.verbose:
        # Turn on mp logging
        multiprocessing.log_to_stderr(logging.DEBUG)

    config = json.load(args.config)
    # TODO: allow commandline args to override config values.

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()
    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    logger = Log(args.log_file, "Master")
    num_cpus = multiprocessing.cpu_count()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    s3downloader = None
    raw_readers = None
    compressors = None
    exporters = None
    done = False

    if args.no_clean:
        logger.log("Not removing log files in {}".format(args.output_dir))
    else:
        # Remove existing log files from output_dir (to clean up after an
        # incomplete previous run, for example).
        logger.log("Removing log files in {}".format(args.output_dir))
        for root, dirs, files in os.walk(args.output_dir):
            for f in files:
                if f.endswith(".log"):
                    full = os.path.join(root, f)
                    if args.dry_run:
                        logger.log("Would be deleting {}, except it's a " \
                                   "dry run".format(full))
                    else:
                        try:
                            logger.log("Removing existing file: " + full)
                            os.remove(full)
                        except Exception, e:
                            logger.log("Error removing existing " \
                                       " file {}: {}".format(full, e))
Example #11
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-i",
                        "--input-file",
                        help="Filename to read from",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Filename of telemetry schema spec",
                        required=True)
    parser.add_argument("-f",
                        "--file-version",
                        help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)
    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX time
        # (i.e. seconds) first.
        submission_date = date.fromtimestamp(r.timestamp /
                                             1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(r.path, errors="replace")
        data = unicode(r.data, errors="replace")

        bytes_read += (r.len_ip + r.len_path + r.len_data +
                       fileutil.RECORD_PREAMBLE_LENGTH[file_version])
        #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
Example #12
def main():
    parser = argparse.ArgumentParser(
        description='Process incoming Telemetry data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("incoming_bucket",
                        help="The S3 bucket containing incoming files")
    parser.add_argument("publish_bucket",
                        help="The S3 bucket to save processed files")
    parser.add_argument("-k", "--aws-key", help="AWS Key", required=True)
    parser.add_argument("-s",
                        "--aws-secret-key",
                        help="AWS Secret Key",
                        required=True)
    parser.add_argument("-r",
                        "--aws-region",
                        help="AWS Region",
                        default="us-west-2")
    parser.add_argument("-w",
                        "--work-dir",
                        help="Location to cache downloaded files",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base dir to store processed data",
                        required=True)
    parser.add_argument("-i",
                        "--input-files",
                        help="File containing a list of keys to process",
                        type=file)
    parser.add_argument("-b",
                        "--bad-data-log",
                        help="Save bad records to this file")
    parser.add_argument("-q",
                        "--queue",
                        help="SQS Queue name to poll for incoming data")
    parser.add_argument("-c",
                        "--histogram-cache-path",
                        help="Path to store a local cache of histograms",
                        default="./histogram_cache")
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Location of the desired telemetry schema",
                        required=True)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-D",
                        "--dry-run",
                        help="Don't modify remote files",
                        action="store_true")
    parser.add_argument("-C",
                        "--skip-conversion",
                        help="Skip validation/conversion of payloads",
                        action="store_true")
    args = parser.parse_args()

    if not os.path.isfile(S3FUNNEL_PATH):
        print "ERROR: s3funnel not found at", S3FUNNEL_PATH
        print "You can get it from github: https://github.com/sstoiana/s3funnel"
        return -1

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()
    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    if args.skip_conversion:
        converter = None
    else:
        converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    num_cpus = multiprocessing.cpu_count()

    start = datetime.now()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    incoming_queue_messages = []

    if not args.dry_run:
        conn = S3Connection(args.aws_key, args.aws_secret_key)
        incoming_bucket = conn.get_bucket(args.incoming_bucket)

    incoming_filenames = []
    if args.queue is not None:
        print "Fetching file list from queue", args.queue
        if args.dry_run:
            print "Dry run mode... can't read from the queue without messing things up..."
        else:
            q_conn = boto.sqs.connect_to_region(
                args.aws_region,
                aws_access_key_id=args.aws_key,
                aws_secret_access_key=args.aws_secret_key)
            incoming_queue = q_conn.get_queue(args.queue)
            if incoming_queue is None:
                print "Error: could not get queue", args.queue
                return -2
            # Sometimes we don't get all the messages, even if more are
            # available, so keep trying until we have enough (or there aren't
            # any left)
            for i in range(num_cpus):
                messages = incoming_queue.get_messages(num_cpus -
                                                       len(incoming_filenames))
                for m in messages:
                    # TODO: Make sure this file exists in S3 first?
                    possible_filename = m.get_body()
                    key = incoming_bucket.get_key(possible_filename)
                    if key is None:
                        print "Could not find queued filename in bucket", args.incoming_bucket, ":", possible_filename
                        # try to delete it:
                        incoming_queue.delete_message(m)
                    else:
                        incoming_filenames.append(possible_filename)
                        incoming_queue_messages.append(m)
                if len(messages) == 0 or len(incoming_filenames) >= num_cpus:
                    break
    elif args.input_files:
        print "Fetching file list from file", args.input_files
        incoming_filenames = [l.strip() for l in args.input_files.readlines()]
    else:
        print "Fetching file list from S3..."
        for f in incoming_bucket.list():
            incoming_filenames.append(f.name)
    print "Done"

    if len(incoming_filenames) == 0:
        print "Nothing to do!"
        return 0

    for f in incoming_filenames:
        print "  ", f

    print "Verifying that we can write to", args.publish_bucket
    if args.dry_run:
        print "Dry run mode: don't care!"
    else:
        try:
            publish_bucket = conn.get_bucket(args.publish_bucket)
            print "Looks good!"
        except S3ResponseError:
            print "Bucket", args.publish_bucket, "not found.  Attempting to create it."
            publish_bucket = conn.create_bucket(args.publish_bucket)

    result = 0
    print "Downloading", len(incoming_filenames), "files..."
    if args.dry_run:
        print "Dry run mode: skipping download from S3"
    else:
        result = fetch_s3_files(incoming_filenames, args.work_dir,
                                incoming_bucket, args.aws_key,
                                args.aws_secret_key)

    if result != 0:
        print "Error downloading files. Return code of s3funnel was", result
        return result
    print "Done"

    after_download = datetime.now()

    local_filenames = [
        os.path.join(args.work_dir, f) for f in incoming_filenames
    ]

    # TODO: try a SimpleQueue
    raw_files = Queue()
    for l in local_filenames:
        raw_files.put(l)

    completed_files = Queue()
    compressed_files = Queue()

    # Begin reading raw input
    raw_readers = start_workers(
        num_cpus, "Reader", ReadRawStep, raw_files,
        (completed_files, schema, converter, storage, args.bad_data_log))

    # Tell readers when to stop:
    for i in range(num_cpus):
        raw_files.put(PipeStep.SENTINEL)

    # Compress completed files.
    compressors = start_workers(num_cpus, "Compressor", CompressCompletedStep,
                                completed_files, (compressed_files, ))

    # Export compressed files to S3.
    exporters = start_workers(
        num_cpus, "Exporter", ExportCompressedStep, compressed_files,
        (args.output_dir, args.aws_key, args.aws_secret_key,
         args.publish_bucket, args.dry_run))

    wait_for(raw_readers, "Raw Readers")

    # `find <out_dir> -type f -not -name ".compressme"`
    # Add them to completed_files
    for root, dirs, files in os.walk(args.output_dir):
        for f in files:
            if f.endswith(".log"):
                completed_files.put(os.path.join(root, f))

    for i in range(num_cpus):
        completed_files.put(PipeStep.SENTINEL)

    wait_for(compressors, "Compressors")
    for i in range(num_cpus):
        compressed_files.put(PipeStep.SENTINEL)

    wait_for(exporters, "Exporters")

    print "Removing processed logs from S3..."
    for f in incoming_filenames:
        if args.dry_run:
            print "  Dry run, so not really deleting", f
        else:
            print "  Deleting", f
            incoming_bucket.delete_key(f)
            # Delete file locally too.
            os.remove(os.path.join(args.work_dir, f))
    print "Done"

    if len(incoming_queue_messages) > 0:
        print "Removing processed messages from SQS..."
        for m in incoming_queue_messages:
            if args.dry_run:
                print "  Dry run, so not really deleting", m.get_body()
            else:
                print "  Deleting", m.get_body()
                if incoming_queue.delete_message(m):
                    print "  Message deleted successfully"
                else:
                    print "  Failed to delete message :("
        print "Done"

    duration = timer.delta_sec(start)
    print "All done in %.2fs (%.2fs excluding download time)" % (
        duration, timer.delta_sec(after_download))
    return 0
Example #13
    def setUp(self):
        test_dir = self.get_test_dir()
        self.schema = TelemetrySchema(self.get_schema_spec())
        self.storage = StorageLayout(self.schema, test_dir, 10000)
        assert not os.path.exists(test_dir)
        os.makedirs(test_dir)