def delete_remaining_zeros(fouled):
    conn = s3connection()
    for bucket, etag, mime in fouled:
        k = conn.Object('idigbio-{0}-prod'.format(bucket), etag)
        k.delete()
        for typ in ['thumbnail', 'webview', 'fullsize']:
            k = conn.Object('idigbio-{0}-prod-{1}'.format(bucket, typ), etag + '.jpg')
            if k.content_length == 0:
                k.delete()
        apidbpool.execute("UPDATE objects SET derivatives=false WHERE etag LIKE %s",
                          (etag,))
def migrate():
    """Migrate objects from the old media API

    Specifically the `idb_object_keys` into the new `media` and `objects`
    """
    t1 = datetime.now()
    logger.info("Checking for objects in the old media api")
    try:
        sql = """INSERT INTO objects (bucket, etag)
            (SELECT DISTINCT type, etag
             FROM idb_object_keys
             LEFT JOIN objects USING (etag)
             WHERE objects.etag IS NULL
               AND idb_object_keys.user_uuid <> %s);
        """
        rc = apidbpool.execute(sql, (config.IDB_UUID,))
        logger.info("Objects Migrated: %s", rc)

        sql = """INSERT INTO media (url, type, owner, last_status, last_check)
            (SELECT idb_object_keys.lookup_key,
                    idb_object_keys.type,
                    idb_object_keys.user_uuid::uuid,
                    200,
                    now()
             FROM idb_object_keys
             LEFT JOIN media ON lookup_key = url
             WHERE media.url IS NULL
               AND idb_object_keys.user_uuid <> %s);
        """
        rc = apidbpool.execute(sql, (config.IDB_UUID,))
        logger.info("Media Migrated: %s", rc)

        sql = """INSERT INTO media_objects (url, etag, modified)
            (SELECT idb_object_keys.lookup_key,
                    idb_object_keys.etag,
                    idb_object_keys.date_modified
             FROM idb_object_keys
             JOIN media ON idb_object_keys.lookup_key = media.url
             JOIN objects ON idb_object_keys.etag = objects.etag
             LEFT JOIN media_objects ON lookup_key = media.url
                  AND media_objects.etag = idb_object_keys.etag
             WHERE media_objects.url IS NULL
               AND idb_object_keys.user_uuid <> %s)
        """
        rc = apidbpool.execute(sql, (config.IDB_UUID,))
        logger.info("Media Objects Migrated: %s in %ss", rc, (datetime.now() - t1))
    except Exception:
        logger.exception("Failed migrating from old media api")
def update_db(ceph_bucket, ceph_name, status):
    global test
    if test:
        logger.debug("Skipping database update for {0}/{1}".format(
            ceph_bucket, ceph_name))
        return True
    else:
        logger.debug("Updating database for {0}/{1}".format(
            ceph_bucket, ceph_name))

        cols = []
        vals = {"ceph_name": ceph_name, "ceph_bucket": ceph_bucket}

        cols.append("rest_status=%(status)s")
        vals["status"] = status

        if status == "reconstructed":
            cols.append("rest_last_success=%(timestamp)s")
        else:
            cols.append("rest_last_failure=%(timestamp)s")
        vals["timestamp"] = '{:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now())

        r = apidbpool.execute(
            """UPDATE ceph_objects SET {0}
            WHERE ceph_name=%(ceph_name)s AND ceph_bucket=%(ceph_bucket)s
            """.format(",".join(cols)), vals)
        return r > 0
def update_db(row_obj, key_obj, status):
    """Some db records are incomplete due to the original db being used for
    backups and not a full accounting of object properties; backfill any
    missing information into the database. Verify status is the result of
    this check.
    """
    if TEST:
        logger.debug("Skipping metadata update for {0}:{1}".format(
            key_obj.bucket.name, key_obj.name))
        return True
    else:
        logger.debug("Updating database record for {0}:{1}".format(
            key_obj.bucket.name, key_obj.name))

        cols = []
        vals = {
            "ceph_name": row_obj["ceph_name"],
            "ceph_bucket": row_obj["ceph_bucket"]
        }

        cols.append("ver_status=%(status)s")
        vals["status"] = status

        if status == "verified" or status == "stashed":
            cols.append("ver_last_success=%(timestamp)s")
        else:
            cols.append("ver_last_failure=%(timestamp)s")

        # To save refactoring larger portions of the script and status
        # returns, see if *this* name is in the list of things successfully
        # deleted. Performance won't be great on large numbers of items.
        if key_obj.name in DELETED:
            cols.append("ceph_deleted=%(was_it_deleted)s")
            cols.append("ceph_deleted_date=%(timestamp)s")
            vals["was_it_deleted"] = True

        # Timestamp is referenced by the ver_last_* columns above, so it must
        # be set regardless of whether the object was deleted.
        vals["timestamp"] = '{:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now())

        # Even if the obj does not verify, update the db with what's in ceph
        if not row_obj["ceph_date"]:
            cols.append("ceph_date=%(last_modified)s")
            vals["last_modified"] = key_obj.last_modified
        if not row_obj["ceph_bytes"]:
            cols.append("ceph_bytes=%(size)s")
            vals["size"] = key_obj.size

        # Seems like if object does not transfer fully, etag is not set?
        if (status == "verified") and not row_obj["ceph_etag"]:
            cols.append("ceph_etag=%(etag)s")
            vals["etag"] = key_obj.etag[1:-1]

        return apidbpool.execute(
            """UPDATE ceph_objects SET {0}
            WHERE ceph_name=%(ceph_name)s AND ceph_bucket=%(ceph_bucket)s""".format(
                ",".join(cols)), vals)
def keyfn(obj):
    bucket, etag, mime = obj
    urls = geturls(etag)
    for url in urls:
        fi = FetchItem(url, bucket, mime)
        if not fi.prefix:
            continue
        fi.get_media()
        fi.upload_to_storage(store)
        fi.cleanup()
        if not fi.ok:
            continue
        else:
            apidbpool.execute(
                "UPDATE objects SET derivatives=false WHERE etag LIKE %s",
                (etag,))
            logger.info("Downloaded %s from %s", etag, url)
            return "downloaded"
    else:
        # for-else: no URL yielded a usable copy of the object
        return "novalidurl"
def create_temp_table(prefix):
    """Create temporary table for each prefix to hold list from Ceph

    Probably naming by prefix is ok because we loop over buckets in the
    outer loop so the same prefix should not be getting worked on in two
    buckets.
    """
    table = make_temp_table_name(prefix)
    try:
        # Drop and create temp table
        drop_temp_table(prefix)
        apidbpool.execute(("CREATE TABLE IF NOT EXISTS {0}( "
                           "ceph_bucket VARCHAR(32), "
                           "ceph_name VARCHAR(128), "
                           "ceph_date TIMESTAMP WITHOUT TIME ZONE, "
                           "ceph_bytes bigint, "
                           "ceph_etag uuid, "
                           "ceph_status VARCHAR(8));").format(table))
        return table
    except Exception:
        logger.error("Failed to create temp table for prefix {0} {1}".format(
            prefix, traceback.format_exc()))
        return False
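# The temp-table helpers in this section all call make_temp_table_name(),
# whose definition is not shown here. The function below is only a hedged
# sketch of what such a helper might look like; the real implementation may
# differ (e.g. in the base table name or how the prefix is sanitized).
def make_temp_table_name(prefix):
    # Hypothetical: build a per-prefix staging table name, keeping only
    # characters that are safe to interpolate into SQL identifiers.
    safe_prefix = ''.join(c for c in prefix if c.isalnum() or c == '_')
    return "ceph_objects_temp_{0}".format(safe_prefix)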
def copy_new_to_ceph_objects(prefix):
    """Copy 'new' records from ceph_objects_temp to ceph_objects
    """
    table = make_temp_table_name(prefix)
    logger.info(
        "Copying new ceph records from {0} to main table".format(table))
    # some etags might not get filled in, better to not copy them and let
    # them be picked up next time
    return apidbpool.execute(
        """INSERT INTO ceph_objects
           (ceph_bucket, ceph_name, ceph_date, ceph_bytes, ceph_etag)
           SELECT ceph_bucket, ceph_name, ceph_date, ceph_bytes, ceph_etag
           FROM {0}
           WHERE ceph_status='new'
             AND ceph_etag IS NOT NULL
        """.format(table)
    )
def keyfn(obj):
    bucket, etag, mime = obj
    bucket = 'idigbio-' + bucket + '-prod'
    conn = s3connection()
    k = conn.Object(bucket, etag)
    status = check_key(k, mime)
    if status is Status.fouled or status is Status.missing:
        return status
    elif k.e_tag != '"{0}"'.format(etag):
        logger.warning("%s/%s etag doesn't match ceph's etag: %s",
                       bucket, etag, k.e_tag)
        return Status.etagmismatch

    for ext in ['webview', 'thumbnail', 'fullsize']:
        k = conn.Object(bucket + '-' + ext, etag + '.jpg')
        status = check_key(k, 'image/jpeg')
        if status is Status.fouled or status is Status.missing:
            apidbpool.execute(
                "UPDATE objects SET derivatives=false WHERE etag = %s",
                (etag,))
            return Status.rederive
    return Status.ok
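# How keyfn() gets driven is not shown in this section. A plausible
# (hypothetical) driver maps it over (bucket, etag, mime) tuples with a
# worker pool and tallies the returned Status values; check_all() and its
# 'rows' argument are assumptions, not part of the original code.
from collections import Counter
from multiprocessing.pool import ThreadPool

def check_all(rows, processes=8):
    # rows: iterable of (bucket, etag, mime) tuples pulled from the objects table
    pool = ThreadPool(processes)
    try:
        results = Counter(pool.imap_unordered(keyfn, rows))
    finally:
        pool.close()
        pool.join()
    logger.info("Check summary: %s", dict(results))
    return results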
def flag_changed_records(prefix):
    """Compare records that exist in both tables and flag things as
    'changed' if they differ
    """
    table = make_temp_table_name(prefix)
    logger.info("Flagging changed records in {0}".format(table))
    return apidbpool.execute("""UPDATE {0}
        SET ceph_status='changed'
        WHERE ctid IN (
          SELECT t.ctid
          FROM {0} t
          JOIN ceph_objects o
            ON t.ceph_bucket=o.ceph_bucket AND t.ceph_name=o.ceph_name
          WHERE t.ceph_date!=o.ceph_date
             OR t.ceph_bytes!=o.ceph_bytes
        )
        """.format(table))
def flag_new_records(prefix):
    """Mark records in the ceph_objects_temp table 'new' if they are not in
    ceph_objects
    """
    table = make_temp_table_name(prefix)
    logger.info("Flagging new records in {0}".format(table))
    return apidbpool.execute("""UPDATE {0}
        SET ceph_status='new'
        WHERE ctid IN (
          SELECT t.ctid
          FROM {0} t
          LEFT JOIN ceph_objects o
            ON t.ceph_bucket=o.ceph_bucket AND t.ceph_name=o.ceph_name
          WHERE o.ceph_name IS NULL
        )
        """.format(table))
def keyfn(obj):
    bucket, etag, mime = obj
    bucket = 'idigbio-' + bucket + '-prod'
    conn = s3connection()
    k = conn.Object(bucket, etag)
    try:
        k.load()
    except botocore.exceptions.ClientError as ce:
        if ce.response['Error']['Code'] == "404":
            raise MissingOriginalError()
    if k.content_length == 0:
        fullk = conn.Object(bucket + '-fullsize', etag + '.jpg')
        try:
            fullk.load()
        except botocore.exceptions.ClientError:
            return "gone"
        if fullk.content_length == 0:
            return "gone"
        if fullk.e_tag == '"{0}"'.format(etag):
            src = fullk.bucket_name + '/' + fullk.key
            k.copy_from(ACL='public-read', ContentType='image/jpeg',
                        CopySource=src)
            logger.debug("Restored %s from fullsize", etag)
        else:
            return "gone"
    for ext in ['webview', 'thumbnail', 'fullsize']:
        k = conn.Object(bucket + '-' + ext, etag + '.jpg')
        if k.content_length == 0:
            apidbpool.execute(
                "UPDATE objects SET derivatives=false WHERE etag LIKE %s",
                (etag,))
            return "rederive"
    return "fine"
def forcerederive(obj):
    "Mark an object in the database as needing rederive"
    bucket, etag, mime = obj
    logger.debug("idigbio-%s-%s/%s forcing rederive", bucket, config.ENV, etag)
    apidbpool.execute("UPDATE objects SET derivatives=false WHERE etag=%s",
                      (etag,))
    return Status.rederive
def drop_temp_table(prefix):
    """Clean up a temporary table when done
    """
    table = make_temp_table_name(prefix)
    apidbpool.execute("DROP TABLE IF EXISTS {0};".format(table))
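# The temp-table helpers above imply a per-prefix workflow: create the
# staging table, load the Ceph listing into it, flag new and changed rows,
# copy the new ones into ceph_objects, and drop the staging table. The
# sketch below only wires together functions defined in this section;
# load_ceph_listing is a placeholder for the loading step that is not shown
# here, and the overall sequencing is an assumption.
def process_prefix(prefix, load_ceph_listing):
    table = create_temp_table(prefix)
    if not table:
        return False
    try:
        load_ceph_listing(prefix, table)  # assumed to bulk-insert Ceph keys into the temp table
        flag_new_records(prefix)
        flag_changed_records(prefix)
        copy_new_to_ceph_objects(prefix)
        return True
    finally:
        drop_temp_table(prefix)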