def update_published_v2_files(sdb, from_submission_date=None, to_submission_date=None, limit=None):
    s3 = S3Connection()
    bucket_name = "telemetry-published-v2"
    bucket = s3.get_bucket(bucket_name)
    schema_key = bucket.get_key("telemetry_schema.json")
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))

    termination_requested = [False]
    def keyboard_interrupt_handler(signum, frame):
        termination_requested[0] = True
    signal.signal(signal.SIGINT, keyboard_interrupt_handler)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)

    while not done:
        try:
            for key in bucket.list(marker=last_key):
                last_key = key.name

                if total_count % 100000 == 0:
                    print("Looked at {} total records in {} seconds, added {}".
                          format(total_count, delta_sec(start_time), added_count))

                dims = schema.get_dimension_map(schema.get_dimensions(".", key.name))

                if (from_submission_date is None or dims["submission_date"] >= from_submission_date) and \
                   (to_submission_date is None or dims["submission_date"] <= to_submission_date) and \
                   dims["submission_date"][:-2] in sdb and \
                   dims["reason"] != "idle_daily":
                    attributes = {"reason": dims.get("reason"),
                                  "appName": dims.get("appName"),
                                  "appUpdateChannel": dims.get("appUpdateChannel"),
                                  "appVersion": dims.get("appVersion"),
                                  "appBuildID": dims.get("appBuildID"),
                                  "submissionDate": dims.get("submission_date")}
                    batch.put(dims["submission_date"][:-2], key.name, attributes)
                    added_count += 1

                total_count += 1
                if total_count == limit or termination_requested[0]:
                    done = True
                    break

        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue

        break

    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(added_count, total_count, delta_sec(start_time)))
Example 2
def update_published_v4_files(sdb, bucket, bucket_prefix, submission_date, limit=None):
    s3 = S3Connection()
    metadata = s3.get_bucket(METADATA_BUCKET)
    schema_key = metadata.get_key("{}/schema.json".format(bucket_prefix))
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))
    bucket = s3.get_bucket(bucket)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)
    prefix = "{}/{}".format(bucket_prefix, submission_date) if submission_date else bucket_prefix

    print "Bucket: {} - Prefix: {} - Date: {}".format(bucket.name, bucket_prefix, submission_date)

    while not done:
        try:
            for key in bucket.list(marker=last_key, prefix=prefix):
                last_key = key.name

                if total_count % 100000 == 0:
                    print("Looked at {} total records in {} seconds, added {}".
                            format(total_count, delta_sec(start_time), added_count))

                dims = schema.get_dimension_map(schema.get_dimensions(".", key.name[len(bucket_prefix) + 1:], dirs_only=True))

                if (dims["submissionDate"] == submission_date) and dims["submissionDate"][:-2] in sdb:
                    batch.put(dims["submissionDate"][:-2], key.name, dims)
                    added_count += 1

                total_count += 1
                if total_count == limit:
                    done = True
                    break

        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue

        break

    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(added_count, total_count, delta_sec(start_time)))
Example 3
def update_published_v4_files(sdb,
                              bucket,
                              bucket_prefix,
                              submission_date,
                              limit=None):
    conn = boto.connect_s3(host=S3_DEFAULT_ENDPOINT)
    metadata = conn.get_bucket(METADATA_BUCKET, validate=False)
    schema_key = metadata.get_key("{}/schema.json".format(bucket_prefix))
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))
    bucket = conn.get_bucket(bucket, validate=False)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)
    prefix = "{}/{}".format(
        bucket_prefix, submission_date) if submission_date else bucket_prefix

    print "Bucket: {} - Prefix: {} - Date: {}".format(bucket.name,
                                                      bucket_prefix,
                                                      submission_date)

    while not done:
        try:
            for key in bucket.list(marker=last_key, prefix=prefix):
                last_key = key.name

                if total_count % 100000 == 0:
                    print("Looked at {} total records in {} seconds, added {}".
                          format(total_count, delta_sec(start_time),
                                 added_count))

                dims = schema.get_dimension_map(
                    schema.get_dimensions(".",
                                          key.name[len(bucket_prefix) + 1:],
                                          dirs_only=True))

                if (dims["submissionDate"] == submission_date and
                        dims["submissionDate"][:-2] in sdb):
                    batch.put(dims["submissionDate"][:-2], key.name, dims)
                    added_count += 1

                total_count += 1
                if total_count == limit:
                    done = True
                    break

        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue

        break

    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(
        added_count, total_count, delta_sec(start_time)))
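A minimal driver sketch for the function above, assuming it is already in scope. The SimpleDB wiring, the keying of sdb by YYYYMM, and every bucket/domain name are assumptions inferred from the [:-2] in sdb checks, not taken from the original tooling.

import boto.sdb

# Hypothetical setup: the snippets test a YYYYMM string for membership in `sdb`,
# so a mapping from month key to SimpleDB domain is one plausible shape.
sdb_conn = boto.sdb.connect_to_region("us-west-2")
sdb = {"201506": sdb_conn.get_domain("telemetry_v4_201506")}  # names are placeholders

# update_published_v4_files as defined above is assumed to be importable/in scope.
update_published_v4_files(sdb,
                          bucket="example-telemetry-published-v4",  # placeholder
                          bucket_prefix="telemetry-release",        # placeholder
                          submission_date="20150618",
                          limit=1000)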
Example 4
class AnalysisJob:
    def __init__(self, cfg):
        self.job_bundle = cfg.job_bundle
        if cfg.input_filter:
            self.input_filter = TelemetrySchema(
                json.load(open(cfg.input_filter)))
        else:
            self.input_filter = None

        if cfg.input_list_file:
            self.input_list = cfg.input_list_file
        else:
            self.input_list = None

        self.job_id = str(uuid4())
        self.target_queue = cfg.target_queue
        self.aws_key = cfg.aws_key
        self.aws_secret_key = cfg.aws_secret_key
        self.input_bucket = "telemetry-published-v1"
        self.job_name = cfg.name
        self.job_owner = cfg.owner
        self.date_limit = cfg.date_limit

        # Bucket with intermediate data for this analysis job
        self.analysis_bucket = "jonasfj-telemetry-analysis"

        self.s3_code_path = "batch-jobs/" + self.job_id + ".tar.gz"

        # S3 region of operation
        self.aws_region = "us-west-2"
        self.task_size_limit = 400 * 1024 * 1024
        self.sqs_input_name = cfg.sqs_queue

    def get_filtered_files(self):
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        # date_limit is a hack that makes it easy to launch everything before
        # a given date, e.g. to backprocess everything already in the bucket.
        if self.date_limit is not None:
            print "Limiting launch to keys before " + self.date_limit
            for k, s in self.list_partitions(bucket):
                if k.split('/')[-1].split('.')[1] < self.date_limit:
                    yield (k, s)
        else:
            for k, s in self.list_partitions(bucket):
                yield (k, s)

    def get_filtered_files_old(self):
        """ Get tuples of name and size for all input files """
        # Setup some auxiliary functions
        allowed_values = self.input_filter.sanitize_allowed_values()
        nb_dims = len(allowed_values)

        def filter_includes(level, value):
            return self.input_filter.is_allowed(value, allowed_values[level])

        # Iterate over all files in the bucket; this is very slow and we should
        # be able to do something much smarter using prefix listing and ordering
        # to stop listing early.
        count = 0
        selected = 0
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        for f in bucket.list():
            count += 1
            dims = self.input_filter.get_dimensions(".", f.key)
            include = True
            for i in xrange(nb_dims):
                if not filter_includes(i, dims[i]):
                    include = False
                    break
            if include:
                selected += 1
                yield (f.key, f.size)
            if count % 5000 == 0:
                print "%i files listed with %i selected" % (count, selected)
        conn.close()

    def list_partitions(self, bucket, prefix='', level=0):
        if self.input_filter:
            #print "Listing...", prefix, level
            allowed_values = self.input_filter.sanitize_allowed_values()
            delimiter = '/'
            if level > 3:
                delimiter = '.'
            for k in bucket.list(prefix=prefix, delimiter=delimiter):
                partitions = k.name.split("/")
                if level > 3:
                    # split the last couple of partition components by "." instead of "/"
                    partitions.extend(partitions.pop().split(".", 2))
                if self.input_filter.is_allowed(partitions[level],
                                                allowed_values[level]):
                    if level >= 5:
                        for f in bucket.list(prefix=k.name):
                            yield (f.key, f.size)
                    else:
                        for sub_key, size in self.list_partitions(
                                bucket, k.name, level + 1):
                            yield (sub_key, size)
        elif self.input_list:
            print "Using input list..."
            for line in self.input_list:
                key_name = line.strip()
                k = bucket.get_key(key_name)
                yield (k.key, k.size)
        else:
            print "Don't know how to list partitions without a filter or list :("
            raise ValueError("Missing both input_filter and input_list")

    def generate_tasks(self):
        """ Generates SQS tasks, we batch small files into a single task """
        taskid = 1
        taskfiles = []
        tasksize = 0
        total_size_of_all = 0
        for key, size in self.get_filtered_files():
            # If the task has reached the desired size, yield it.
            # Note: SQS messages are limited to 65 KiB (boto only uses
            # signature version 4), so for simplicity we also cap each task
            # at 200 filenames.
            if 0 < len(taskfiles) and (tasksize + size > self.task_size_limit
                                       or len(taskfiles) > 200):
                # Reduce to filenames only, sorted by size; smallest first, as
                # they are faster to download when handling the job.
                taskfiles = [
                    f for f, s in sorted(taskfiles, key=lambda fs: fs[1])
                ]
                yield {
                    'id': self.job_id + "/" + str(taskid),
                    'name': self.job_name,
                    'owner': self.job_owner,
                    'code': self.s3_code_path,
                    'target-queue': self.target_queue,
                    'files': taskfiles,
                    'size': tasksize
                }
                total_size_of_all += tasksize
                print "%i tasks created acc. size: %s" % (taskid,
                                                          total_size_of_all)
                taskid += 1
                taskfiles = []
                tasksize = 0
            tasksize += size
            taskfiles.append((key, size))
        if len(taskfiles) > 0:
            taskfiles = [f for f, s in sorted(taskfiles, key=lambda fs: fs[1])]
            yield {
                'id': self.job_id + "/" + str(taskid),
                'name': self.job_name,
                'owner': self.job_owner,
                'code': self.s3_code_path,
                'target-queue': self.target_queue,
                'files': taskfiles,
                'size': tasksize
            }
        print "Finished:"
        print "%i tasks created total size: %s" % (taskid, total_size_of_all +
                                                   tasksize)

    def put_sqs_tasks(self):
        """ Create an SQS tasks for this analysis job """
        print "Populate SQS input queue with tasks"
        # Connect to SQS is desired region
        conn = sqs.connect_to_region(self.aws_region,
                                     aws_access_key_id=self.aws_key,
                                     aws_secret_access_key=self.aws_secret_key)
        # Get the input queue
        queue = conn.get_queue(self.sqs_input_name)
        queue.set_message_class(JSONMessage)
        # Populate queue with tasks
        for task in self.generate_tasks():
            #print "enqueueing", task["id"], "size:", task["size"]
            msg = queue.new_message(body=task)
            queue.write(msg)
        conn.close()

    def setup(self):
        self.upload_job_bundle()
        self.put_sqs_tasks()
        print "Uploaded with job_id: %s" % self.job_id

    def upload_job_bundle(self):
        """ Upload job bundle to S3 """
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.analysis_bucket)
        k = Key(bucket)
        k.key = self.s3_code_path
        k.set_contents_from_filename(self.job_bundle)
        conn.close()
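A hedged sketch of driving the class above: the attribute names on cfg mirror exactly what __init__ reads, but every value below is a placeholder, not taken from the original tooling.

from argparse import Namespace

# Placeholder config; AnalysisJob only needs these attributes to exist on cfg.
cfg = Namespace(
    job_bundle="my-analysis.tar.gz",
    input_filter="filter.json",          # path to a TelemetrySchema filter, or None
    input_list_file=None,                # alternatively, an open file of key names
    target_queue="example-output-queue",
    aws_key="AKIA...",                   # placeholder credentials
    aws_secret_key="...",
    name="example-analysis",
    owner="someone@example.com",
    date_limit=None,
    sqs_queue="example-analysis-input",
)

job = AnalysisJob(cfg)
job.setup()  # uploads the job bundle to S3 and enqueues SQS tasks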
Example 5
class AnalysisJob:
    def __init__(self, cfg):
        self.job_bundle = cfg.job_bundle
        if cfg.input_filter:
            self.input_filter = TelemetrySchema(json.load(open(cfg.input_filter)))
        else:
            self.input_filter = None

        if cfg.input_list_file:
            self.input_list = cfg.input_list_file
        else:
            self.input_list = None

        self.job_id = str(uuid4())
        self.target_queue = cfg.target_queue
        self.aws_key = cfg.aws_key
        self.aws_secret_key = cfg.aws_secret_key
        self.input_bucket = "telemetry-published-v1"
        self.job_name = cfg.name
        self.job_owner = cfg.owner
        self.date_limit = cfg.date_limit

        # Bucket with intermediate data for this analysis job
        self.analysis_bucket = "jonasfj-telemetry-analysis"

        self.s3_code_path = "batch-jobs/" + self.job_id + ".tar.gz"

        # S3 region of operation
        self.aws_region = "us-west-2"
        self.task_size_limit = 400 * 1024 * 1024
        self.sqs_input_name = cfg.sqs_queue


    def get_filtered_files(self):
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        # date_limit is a hack that makes it easy to launch everything before
        # a given date, e.g. to backprocess everything already in the bucket.
        if self.date_limit is not None:
            print "Limiting launch to keys before " + self.date_limit
            for k,s in self.list_partitions(bucket):
                if k.split('/')[-1].split('.')[1] < self.date_limit:
                    yield (k, s)
        else:
            for k,s in self.list_partitions(bucket):
                yield (k, s)

    def get_filtered_files_old(self):
        """ Get tuples of name and size for all input files """
        # Setup some auxiliary functions
        allowed_values = self.input_filter.sanitize_allowed_values()
        nb_dims = len(allowed_values)
        def filter_includes(level, value):
            return self.input_filter.is_allowed(value, allowed_values[level])

        # Iterate over all files in the bucket; this is very slow and we should
        # be able to do something much smarter using prefix listing and ordering
        # to stop listing early.
        count = 0
        selected = 0
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        for f in bucket.list():
            count += 1
            dims = self.input_filter.get_dimensions(".", f.key)
            include = True
            for i in xrange(nb_dims):
                if not filter_includes(i, dims[i]):
                    include = False
                    break
            if include:
                selected += 1
                yield (f.key, f.size)
            if count % 5000 == 0:
                print "%i files listed with %i selected" % (count, selected)
        conn.close()

    def list_partitions(self, bucket, prefix='', level=0):
        if self.input_filter:
            #print "Listing...", prefix, level
            allowed_values = self.input_filter.sanitize_allowed_values()
            delimiter = '/'
            if level > 3:
                delimiter = '.'
            for k in bucket.list(prefix=prefix, delimiter=delimiter):
                partitions = k.name.split("/")
                if level > 3:
                    # split the last couple of partition components by "." instead of "/"
                    partitions.extend(partitions.pop().split(".", 2))
                if self.input_filter.is_allowed(partitions[level], allowed_values[level]):
                    if level >= 5:
                        for f in bucket.list(prefix=k.name):
                            yield (f.key, f.size)
                    else:
                        for sub_key, size in self.list_partitions(bucket, k.name, level + 1):
                            yield (sub_key, size)
        elif self.input_list:
            print "Using input list..."
            for line in self.input_list:
                key_name = line.strip()
                k = bucket.get_key(key_name)
                yield (k.key, k.size)
        else:
            print "Don't know how to list partitions without a filter or list :("
            raise ValueError("Missing both input_filter and input_list")

    def generate_tasks(self):
        """ Generates SQS tasks, we batch small files into a single task """
        taskid = 1
        taskfiles = []
        tasksize = 0
        total_size_of_all = 0
        for key, size in self.get_filtered_files():
            # If the task has reached the desired size, yield it.
            # Note: SQS messages are limited to 65 KiB (boto only uses
            # signature version 4), so for simplicity we also cap each task
            # at 200 filenames.
            if 0 < len(taskfiles) and (tasksize + size > self.task_size_limit or
                                       len(taskfiles) > 200):
                # Reduce to filenames only, sorted by size; smallest first, as
                # they are faster to download when handling the job.
                taskfiles = [f for f, s in sorted(taskfiles, key=lambda fs: fs[1])]
                yield {
                    'id':               self.job_id + "/" +  str(taskid),
                    'name':             self.job_name,
                    'owner':            self.job_owner,
                    'code':             self.s3_code_path,
                    'target-queue':     self.target_queue,
                    'files':            taskfiles,
                    'size':             tasksize
                }
                total_size_of_all += tasksize
                print "%i tasks created acc. size: %s" % (taskid, total_size_of_all)
                taskid += 1
                taskfiles = []
                tasksize = 0
            tasksize += size
            taskfiles.append((key, size))
        if len(taskfiles) > 0:
            taskfiles = [f for f, s in sorted(taskfiles, key=lambda fs: fs[1])]
            yield {
                'id':               self.job_id + "/" + str(taskid),
                'name':             self.job_name,
                'owner':            self.job_owner,
                'code':             self.s3_code_path,
                'target-queue':     self.target_queue,
                'files':            taskfiles,
                'size':             tasksize
            }
        print "Finished:"
        print "%i tasks created total size: %s" % (taskid, total_size_of_all + tasksize)

    def put_sqs_tasks(self):
        """ Create an SQS tasks for this analysis job """
        print "Populate SQS input queue with tasks"
        # Connect to SQS is desired region
        conn = sqs.connect_to_region(
            self.aws_region,
            aws_access_key_id=self.aws_key,
            aws_secret_access_key=self.aws_secret_key
        )
        # Get the input queue
        queue = conn.get_queue(self.sqs_input_name)
        queue.set_message_class(JSONMessage)
        # Populate queue with tasks
        for task in self.generate_tasks():
            #print "enqueueing", task["id"], "size:", task["size"]
            msg = queue.new_message(body=task)
            queue.write(msg)
        conn.close()

    def setup(self):
        self.upload_job_bundle()
        self.put_sqs_tasks()
        print "Uploaded with job_id: %s" % self.job_id

    def upload_job_bundle(self):
        """ Upload job bundle to S3 """
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.analysis_bucket)
        k = Key(bucket)
        k.key = self.s3_code_path
        k.set_contents_from_filename(self.job_bundle)
        conn.close()