def update_published_v2_files(sdb, from_submission_date=None, to_submission_date=None, limit=None):
    """Scan the "telemetry-published-v2" S3 bucket and queue matching keys for ``sdb``.

    Every key of the bucket is listed and its name is parsed into dimensions
    using the schema stored under "telemetry_schema.json" in the same bucket.
    A key is handed to a ``BatchPut`` when all of the following hold:

    * its ``submission_date`` dimension is within the requested range,
    * the submission date minus its last two characters is ``in sdb``,
    * its ``reason`` dimension is not "idle_daily".

    SIGINT is intercepted while the scan runs so that an interrupt stops the
    scan cleanly and the pending batch is still flushed. The handler that was
    installed before the call is put back before returning.

    Args:
        sdb: Object tested with ``in`` for the target domain name and passed
            to ``BatchPut``. NOTE(review): presumably a mapping of SimpleDB
            domains keyed by name -- confirm against the caller.
        from_submission_date: Inclusive lower bound compared with ``>=``
            against the key's submission date, or None for no lower bound.
        to_submission_date: Inclusive upper bound compared with ``<=``
            against the key's submission date, or None for no upper bound.
        limit: Stop once exactly this many keys have been looked at; None
            means no limit.
    """
    s3 = S3Connection()
    bucket_name = "telemetry-published-v2"
    bucket = s3.get_bucket(bucket_name)
    schema_key = bucket.get_key("telemetry_schema.json")
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))

    # One-element list so the nested handler can flip the flag without
    # needing `nonlocal` (keeps the function usable on Python 2).
    termination_requested = [False]

    def keyboard_interrupt_handler(signum, frame):
        termination_requested[0] = True

    previous_handler = signal.signal(signal.SIGINT, keyboard_interrupt_handler)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    last_key = ''
    batch = BatchPut(sdb)

    try:
        while True:
            try:
                for key in bucket.list(marker=last_key):
                    # Remember where we are so a failed listing can resume.
                    last_key = key.name

                    if total_count % 100000 == 0:
                        print("Looked at {} total records in {} seconds, added {}".
                              format(total_count, delta_sec(start_time), added_count))

                    dims = schema.get_dimension_map(schema.get_dimensions(".", key.name))
                    submission_date = dims["submission_date"]
                    domain_name = submission_date[:-2]

                    if (from_submission_date is None or submission_date >= from_submission_date) and \
                       (to_submission_date is None or submission_date <= to_submission_date) and \
                       domain_name in sdb and \
                       dims["reason"] != "idle_daily":
                        attributes = {"reason": dims.get("reason"),
                                      "appName": dims.get("appName"),
                                      "appUpdateChannel": dims.get("appUpdateChannel"),
                                      "appVersion": dims.get("appVersion"),
                                      "appBuildID": dims.get("appBuildID"),
                                      "submissionDate": dims.get("submission_date")}
                        batch.put(domain_name, key.name, attributes)
                        added_count += 1

                    total_count += 1
                    if total_count == limit or termination_requested[0]:
                        break

            except Exception as e:
                print("Error listing keys: {}".format(e))
                traceback.print_exc()
                # SIGINT only sets a flag, so it must also be honoured here;
                # otherwise a listing that keeps failing could never be
                # interrupted by the user.
                if termination_requested[0]:
                    break
                print("Continuing from last seen key: {}".format(last_key))
                continue

            break

        # Flushed while SIGINT is still intercepted so the final write is
        # not cut short by a second interrupt.
        batch.flush()
    finally:
        # signal.signal() returns None when the previous handler was not
        # installed from Python; that value cannot be re-installed.
        if previous_handler is not None:
            signal.signal(signal.SIGINT, previous_handler)

    print("Overall, added {} of {} in {} seconds".format(added_count, total_count, delta_sec(start_time)))
# Example no. 2
# 0
def update_published_v4_files(sdb, bucket, bucket_prefix, submission_date, limit=None):
    """Scan a published-v4 S3 bucket and queue matching keys for ``sdb``.

    The schema is read from "<bucket_prefix>/schema.json" in the bucket named
    by ``METADATA_BUCKET``. Keys of ``bucket`` under the prefix are listed and
    each key name (with "<bucket_prefix>/" stripped) is parsed into
    dimensions. A key is handed to a ``BatchPut``, with the full dimension
    map as its attributes, when its ``submissionDate`` matches
    ``submission_date`` and that date minus its last two characters is
    ``in sdb``.

    If listing or processing raises, the error is printed and the listing is
    restarted with the last seen key as marker.

    Args:
        sdb: Object tested with ``in`` for the target domain name and passed
            to ``BatchPut``. NOTE(review): presumably a mapping of SimpleDB
            domains keyed by name -- confirm against the caller.
        bucket: Name of the S3 bucket to scan.
        bucket_prefix: Key prefix inside ``bucket``; also selects the schema.
        submission_date: Only keys with exactly this submission date are
            added. A falsy value scans the whole prefix and applies no date
            filter.
        limit: Stop once exactly this many keys have been looked at; None
            means no limit.
    """
    s3 = S3Connection()
    metadata = s3.get_bucket(METADATA_BUCKET)
    schema_key = metadata.get_key("{}/schema.json".format(bucket_prefix))
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))
    bucket = s3.get_bucket(bucket)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    last_key = ''
    batch = BatchPut(sdb)
    prefix = "{}/{}".format(bucket_prefix, submission_date) if submission_date else bucket_prefix
    # Number of leading characters ("<bucket_prefix>/") to drop from each key
    # name before parsing its dimensions.
    prefix_length = len(bucket_prefix) + 1

    # Single-argument print() call: same output on Python 2 and Python 3.
    print("Bucket: {} - Prefix: {} - Date: {}".format(bucket.name, bucket_prefix, submission_date))

    while True:
        try:
            for key in bucket.list(marker=last_key, prefix=prefix):
                # Remember where we are so a failed listing can resume.
                last_key = key.name

                if total_count % 100000 == 0:
                    print("Looked at {} total records in {} seconds, added {}".
                          format(total_count, delta_sec(start_time), added_count))

                dims = schema.get_dimension_map(
                    schema.get_dimensions(".", key.name[prefix_length:], dirs_only=True))
                key_date = dims["submissionDate"]

                # Without a requested date the prefix covers every date, so
                # only the domain membership check applies; comparing against
                # a falsy submission_date would reject every key.
                if (not submission_date or key_date == submission_date) and key_date[:-2] in sdb:
                    batch.put(key_date[:-2], key.name, dims)
                    added_count += 1

                total_count += 1
                if total_count == limit:
                    break

        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue

        break

    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(added_count, total_count, delta_sec(start_time)))
# Example no. 3
# 0
def update_published_v4_files(sdb,
                              bucket,
                              bucket_prefix,
                              submission_date,
                              limit=None):
    """Scan a published-v4 S3 bucket and queue matching keys for ``sdb``.

    Connects to S3 at ``S3_DEFAULT_ENDPOINT`` and reads the schema from
    "<bucket_prefix>/schema.json" in the bucket named by ``METADATA_BUCKET``.
    Both buckets are opened with ``validate=False``. Keys of ``bucket`` under
    the prefix are listed and each key name (with "<bucket_prefix>/"
    stripped) is parsed into dimensions. A key is handed to a ``BatchPut``,
    with the full dimension map as its attributes, when its
    ``submissionDate`` matches ``submission_date`` and that date minus its
    last two characters is ``in sdb``.

    If listing or processing raises, the error is printed and the listing is
    restarted with the last seen key as marker.

    Args:
        sdb: Object tested with ``in`` for the target domain name and passed
            to ``BatchPut``. NOTE(review): presumably a mapping of SimpleDB
            domains keyed by name -- confirm against the caller.
        bucket: Name of the S3 bucket to scan.
        bucket_prefix: Key prefix inside ``bucket``; also selects the schema.
        submission_date: Only keys with exactly this submission date are
            added. A falsy value scans the whole prefix and applies no date
            filter.
        limit: Stop once exactly this many keys have been looked at; None
            means no limit.
    """
    conn = boto.connect_s3(host=S3_DEFAULT_ENDPOINT)
    metadata = conn.get_bucket(METADATA_BUCKET, validate=False)
    schema_key = metadata.get_key("{}/schema.json".format(bucket_prefix))
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))
    bucket = conn.get_bucket(bucket, validate=False)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    last_key = ''
    batch = BatchPut(sdb)
    if submission_date:
        prefix = "{}/{}".format(bucket_prefix, submission_date)
    else:
        prefix = bucket_prefix
    # Number of leading characters ("<bucket_prefix>/") to drop from each key
    # name before parsing its dimensions.
    prefix_length = len(bucket_prefix) + 1

    # Single-argument print() call: same output on Python 2 and Python 3.
    print("Bucket: {} - Prefix: {} - Date: {}".format(bucket.name,
                                                      bucket_prefix,
                                                      submission_date))

    while True:
        try:
            for key in bucket.list(marker=last_key, prefix=prefix):
                # Remember where we are so a failed listing can resume.
                last_key = key.name

                if total_count % 100000 == 0:
                    print("Looked at {} total records in {} seconds, added {}".
                          format(total_count, delta_sec(start_time),
                                 added_count))

                dims = schema.get_dimension_map(
                    schema.get_dimensions(".",
                                          key.name[prefix_length:],
                                          dirs_only=True))
                key_date = dims["submissionDate"]
                date_matches = (not submission_date
                                or key_date == submission_date)

                # Without a requested date the prefix covers every date, so
                # only the domain membership check applies; comparing against
                # a falsy submission_date would reject every key.
                if date_matches and key_date[:-2] in sdb:
                    batch.put(key_date[:-2], key.name, dims)
                    added_count += 1

                total_count += 1
                if total_count == limit:
                    break

        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue

        break

    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(
        added_count, total_count, delta_sec(start_time)))