Example #1
def delete_remaining_zeros(fouled):
    """Delete fouled originals and any zero-length derivatives, then flag the
    etag so derivatives get rebuilt."""
    conn = s3connection()
    for bucket, etag, mime in fouled:
        # Remove the fouled original object.
        k = conn.Object('idigbio-{0}-prod'.format(bucket), etag)
        k.delete()
        # Remove derivatives, but only if they are zero-length.
        for typ in ['thumbnail', 'webview', 'fullsize']:
            k = conn.Object('idigbio-{0}-prod-{1}'.format(bucket, typ), etag + '.jpg')
            if k.content_length == 0:
                k.delete()
        # Flag the etag so derivatives are regenerated.
        apidbpool.execute("UPDATE objects SET derivatives=false WHERE etag LIKE %s", (etag,))
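These snippets all rely on an s3connection() helper and an apidbpool database pool that are defined elsewhere in the codebase. A minimal sketch of the assumed shapes, using boto3 against a Ceph S3-compatible endpoint and a psycopg2 connection pool (the endpoint, credentials, and class name here are placeholders, not the real implementation):

import boto3
from psycopg2.pool import ThreadedConnectionPool

def s3connection():
    # Hypothetical helper: a boto3 S3 resource pointed at the storage endpoint.
    return boto3.resource(
        's3',
        endpoint_url='https://ceph.example.org',
        aws_access_key_id='...',
        aws_secret_access_key='...')

class ApiDbPool(object):
    """Hypothetical stand-in for apidbpool: execute() runs one statement and
    returns the number of affected rows."""
    def __init__(self, dsn, minconn=1, maxconn=5):
        self._pool = ThreadedConnectionPool(minconn, maxconn, dsn)

    def execute(self, sql, params=None):
        conn = self._pool.getconn()
        try:
            with conn, conn.cursor() as cur:
                cur.execute(sql, params)
                return cur.rowcount
        finally:
            self._pool.putconn(conn)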
Example #2
def migrate():
    """Migrate objects from the old media API

    Specifically, copy `idb_object_keys` into the new `media`, `objects`, and
    `media_objects` tables.
    """
    t1 = datetime.now()
    logger.info("Checking for objects in the old media api")
    try:
        sql = """INSERT INTO objects (bucket, etag)
              (SELECT DISTINCT
                type,
                etag
              FROM idb_object_keys
              LEFT JOIN objects USING (etag)
              WHERE objects.etag IS NULL
                AND idb_object_keys.user_uuid <> %s);
        """
        rc = apidbpool.execute(sql, (config.IDB_UUID, ))
        logger.info("Objects Migrated: %s", rc)
        sql = """INSERT INTO media (url, type, owner, last_status, last_check)
              (SELECT
                idb_object_keys.lookup_key,
                idb_object_keys.type,
                idb_object_keys.user_uuid::uuid,
                200,
                now()
              FROM idb_object_keys
              LEFT JOIN media ON lookup_key = url
              WHERE media.url IS NULL
                AND idb_object_keys.user_uuid <> %s);
        """
        rc = apidbpool.execute(sql, (config.IDB_UUID, ))
        logger.info("Media Migrated: %s", rc)
        sql = """
            INSERT INTO media_objects (url, etag, modified)
              (SELECT
                idb_object_keys.lookup_key,
                idb_object_keys.etag,
                idb_object_keys.date_modified
              FROM idb_object_keys
              JOIN media ON idb_object_keys.lookup_key = media.url
              JOIN objects ON idb_object_keys.etag = objects.etag
              LEFT JOIN media_objects ON lookup_key = media.url
                    AND media_objects.etag = idb_object_keys.etag
              WHERE media_objects.url IS NULL
                AND idb_object_keys.user_uuid <> %s)
        """
        rc = apidbpool.execute(sql, (config.IDB_UUID, ))
        logger.info("Media Objects Migrated: %s in %ss", rc,
                    (datetime.now() - t1))
    except Exception:
        logger.exception("Failed migrating from old media api")
def update_db(ceph_bucket, ceph_name, status):
    global test

    if test:
        logger.debug("Skipping database udpate for {0}/{1}".format(
            ceph_bucket, ceph_name))
        return True
    else:
        logger.debug("Updating database for {0}/{1}".format(
            ceph_bucket, ceph_name))
        cols = []
        vals = {"ceph_name": ceph_name, "ceph_bucket": ceph_bucket}

        cols.append("rest_status=%(status)s")
        vals["status"] = status

        if status == "reconstructed":
            cols.append("rest_last_success=%(timestamp)s")
        else:
            cols.append("rest_last_failure=%(timestamp)s")
        vals["timestamp"] = '{:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now())

        r = apidbpool.execute(
            """UPDATE ceph_objects
            SET {0}
            WHERE
            ceph_name=%(ceph_name)s
            AND ceph_bucket=%(ceph_bucket)s
        """.format(",".join(cols)), vals)
        return r > 0
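For status == "reconstructed", the statement assembled above comes out roughly as (the two columns are joined with commas):

UPDATE ceph_objects
SET rest_status=%(status)s,rest_last_success=%(timestamp)s
WHERE
ceph_name=%(ceph_name)s
AND ceph_bucket=%(ceph_bucket)s

with vals supplying status, timestamp, ceph_name, and ceph_bucket.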
Example #4
def update_db(row_obj, key_obj, status):
    """Some db records are incomplete due to the original db being used
    for backups and not a full accounting of object properties, backfill
    any missing information into the database. Verify status is the result of this check.
    """

    if TEST:
        logger.debug("Skipping metadata update for {0}:{1}".format(
            key_obj.bucket.name, key_obj.name))
        return True
    else:
        logger.debug("Updating database record for {0}:{1}".format(
            key_obj.bucket.name, key_obj.name))
        cols = []
        vals = {
            "ceph_name": row_obj["ceph_name"],
            "ceph_bucket": row_obj["ceph_bucket"]
        }

        cols.append("ver_status=%(status)s")
        vals["status"] = status

        if status == "verified" or status == "stashed":
            cols.append("ver_last_success=%(timestamp)s")
        else:
            cols.append("ver_last_failure=%(timestamp)s")

        # To save refactoring larger portions of the script and status returns, see if *this* name
        # is in the list of things successfully deleted. Performance won't be great on large
        # numbers of items.
        if key_obj.name in DELETED:
            cols.append("ceph_deleted=%(was_it_deleted)s")
            cols.append("ceph_deleted_date=%(timestamp)s")

        vals["timestamp"] = '{:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now())
        vals["was_it_deleted"] = True

        # Even if the obj does not verify, update the db with what's in ceph
        if not row_obj["ceph_date"]:
            cols.append("ceph_date=%(last_modified)s")
            vals["last_modified"] = key_obj.last_modified

        if not row_obj["ceph_bytes"]:
            cols.append("ceph_bytes=%(size)s")
            vals["size"] = key_obj.size

        # Seems like if object does not transfer fully, etag is not set?
        if (status == "verified") and not row_obj["ceph_etag"]:
            cols.append("ceph_etag=%(etag)s")
            vals["etag"] = key_obj.etag[1:-1]

        return apidbpool.execute(
            """UPDATE ceph_objects
            SET {0}
            WHERE
            ceph_name=%(ceph_name)s
            AND ceph_bucket=%(ceph_bucket)s""".format(
                ",".join(cols)), vals)
Example #5
def keyfn(obj):
    """Try each known url for an etag until one downloads and uploads cleanly."""
    bucket, etag, mime = obj
    urls = geturls(etag)
    for url in urls:
        fi = FetchItem(url, bucket, mime)
        if not fi.prefix:
            continue
        fi.get_media()
        fi.upload_to_storage(store)
        fi.cleanup()
        if not fi.ok:
            continue
        # Got a usable copy; flag the etag so derivatives get rebuilt.
        apidbpool.execute("UPDATE objects SET derivatives=false WHERE etag LIKE %s", (etag,))
        logger.info("Downloaded %s from %s", etag, url)
        return "downloaded"
    else:
        # No url produced a usable copy.
        return "novalidurl"
def create_temp_table(prefix):
    """Create temporary table for each prefix to hold list from Ceph
    Probably naming by prefix is ok because we loop over buckets in the outer
    loop so the same prefix should not be getting worked on in two buckets.
    """
    table = make_temp_table_name(prefix)
    try:
        # Drop and create temp table
        drop_temp_table(prefix)
        apidbpool.execute(("CREATE TABLE IF NOT EXISTS {0}( "
                           "ceph_bucket VARCHAR(32), "
                           "ceph_name VARCHAR(128), "
                           "ceph_date TIMESTAMP WITHOUT TIME ZONE, "
                           "ceph_bytes bigint, "
                           "ceph_etag uuid, "
                           "ceph_status VARCHAR(8));").format(table))
        return table
    except Exception:
        logger.error("Failed to create temp table for prefix {0} {1}".format(
            prefix, traceback.format_exc()))
        return False
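make_temp_table_name() is referenced throughout but not shown; presumably it just derives a per-prefix working table name, something along these lines (the exact naming scheme is a guess):

def make_temp_table_name(prefix):
    # Hypothetical: one working copy of the Ceph listing per prefix.
    return "ceph_objects_temp_{0}".format(prefix)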
def copy_new_to_ceph_objects(prefix):
    """Copy 'new' records from ceph_objects_temp to ceph_objects
    """
    table = make_temp_table_name(prefix)
    logger.info(
        "Copying new ceph records from {0} to main table".format(table))
    return apidbpool.execute(
        """INSERT INTO ceph_objects
        (ceph_bucket, ceph_name, ceph_date, ceph_bytes, ceph_etag)
        SELECT ceph_bucket, ceph_name, ceph_date, ceph_bytes, ceph_etag
        FROM {0}
        WHERE ceph_status='new' AND ceph_etag IS NOT NULL
        """.format(table)
    )  # Some etags might not get filled in; better not to copy those and let them be picked up next time.
Example #8
def keyfn(obj):
    """Verify an original against its stored etag and check each derivative;
    flag the etag for rederive if any derivative is fouled or missing."""
    bucket, etag, mime = obj
    bucket = 'idigbio-' + bucket + '-prod'
    conn = s3connection()

    k = conn.Object(bucket, etag)
    status = check_key(k, mime)
    if status is Status.fouled or status is Status.missing:
        return status
    elif k.e_tag != '"{0}"'.format(etag):
        logger.warning("%s/%s etag doesn't match ceph's etag: %s", bucket,
                       etag, k.e_tag)
        return Status.etagmismatch

    for ext in ['webview', 'thumbnail', 'fullsize']:
        k = conn.Object(bucket + '-' + ext, etag + '.jpg')
        status = check_key(k, 'image/jpeg')
        if status is Status.fouled or status is Status.missing:
            apidbpool.execute(
                "UPDATE objects SET derivatives=false WHERE etag = %s",
                (etag, ))
            return Status.rederive
    return Status.ok
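The Status values used here (and in forcerederive further down) presumably come from a small enum along these lines; the member names are taken from usage, the values are guesses:

from enum import Enum

class Status(Enum):
    ok = "ok"                      # original and derivatives look healthy
    fouled = "fouled"              # object exists but its contents are wrong
    missing = "missing"            # object not found in ceph
    etagmismatch = "etagmismatch"  # stored etag disagrees with ceph's etag
    rederive = "rederive"          # derivatives need to be rebuilt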
def flag_changed_records(prefix):
    """Compare records that exist in both tables and flag things as 'changed' if they differ
    """
    table = make_temp_table_name(prefix)
    logger.info("Flagging changed records in {0}".format(table))
    return apidbpool.execute("""UPDATE {0}
        SET ceph_status='changed'
        WHERE
          ctid IN (
            SELECT t.ctid
             FROM {0} t JOIN ceph_objects o
             ON t.ceph_bucket=o.ceph_bucket AND t.ceph_name=o.ceph_name
             WHERE t.ceph_date!=o.ceph_date OR t.ceph_bytes!=o.ceph_bytes
          )
        """.format(table))
def flag_new_records(prefix):
    """Mark records in the ceph_objects_temp table 'new' if they are not in ceph_objects
    """
    table = make_temp_table_name(prefix)
    logger.info("Flagging new records in {0}".format(table))
    return apidbpool.execute("""UPDATE {0}
        SET ceph_status='new'
        WHERE
          ctid IN (
            SELECT t.ctid
            FROM {0} t LEFT JOIN ceph_objects o
            ON t.ceph_bucket=o.ceph_bucket AND t.ceph_name=o.ceph_name
            WHERE o.ceph_name IS NULL
          )
        """.format(table))
Example #11
def keyfn(obj):
    """Check an original in ceph; if it is zero-length, try to restore it from
    the fullsize derivative, then flag any zero-length derivatives for rederive."""
    bucket, etag, mime = obj
    bucket = 'idigbio-' + bucket + '-prod'
    conn = s3connection()

    k = conn.Object(bucket, etag)
    try:
        k.load()
    except botocore.exceptions.ClientError as ce:
        if ce.response['Error']['Code'] == "404":
            raise MissingOriginalError()
        # Anything other than a 404 is unexpected; don't swallow it.
        raise
    if k.content_length == 0:
        fullk = conn.Object(bucket + '-fullsize', etag + '.jpg')
        try:
            fullk.load()
        except botocore.exceptions.ClientError:
            return "gone"
        if fullk.content_length == 0:
            return "gone"
        if fullk.e_tag == '"{0}"'.format(etag):
            src = fullk.bucket_name + '/' + fullk.key
            k.copy_from(ACL='public-read',
                        ContentType='image/jpeg',
                        CopySource=src)
            logger.debug("Restored %s from fullsize", etag)
        else:
            return "gone"

    for ext in ['webview', 'thumbnail', 'fullsize']:
        k = conn.Object(bucket + '-' + ext, etag + '.jpg')
        if k.content_length == 0:
            apidbpool.execute(
                "UPDATE objects SET derivatives=false WHERE etag LIKE %s",
                (etag, ))
            return "rederive"
    return "fine"
Example #12
def forcerederive(obj):
    "Mark an object in the database as needing rederive"
    bucket, etag, mime = obj
    logger.debug("idigbio-%s-%s/%s forcing rederive", bucket, config.ENV, etag)
    apidbpool.execute("UPDATE objects SET derivatives=false WHERE etag=%s", (etag,))
    return Status.rederive
def drop_temp_table(prefix):
    """Clean up a temporary table when done
    """
    table = make_temp_table_name(prefix)
    apidbpool.execute("DROP TABLE IF EXISTS {0};".format(table))
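Taken together, the temp-table helpers suggest a per-prefix sync driver roughly like the one below; the function that fills the temp table from the Ceph listing is hypothetical and only sketched:

def sync_prefix(prefix):
    """Hypothetical driver tying create/flag/copy/drop together for one prefix."""
    table = create_temp_table(prefix)
    if not table:
        return False
    try:
        load_ceph_listing(table, prefix)   # hypothetical: bulk-insert the Ceph listing
        flag_new_records(prefix)
        flag_changed_records(prefix)
        copy_new_to_ceph_objects(prefix)
    finally:
        drop_temp_table(prefix)
    return True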