def post_all_deleted_rs():
    """Post a stats delete for every deleted recordset not yet flagged in stats.

    First asks ``stats`` for "digest" documents whose ``deleted`` term is
    true (at most 10000 hits) and collects their ``recordset_id`` values.
    Then walks the ``uuids`` table for deleted recordsets and calls
    ``post_delete_stats`` for each id missing from that collection.
    Progress is reported on stdout.
    """
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "term": {
                        "deleted": True
                    }
                }
            }
        },
        "size": 10000
    }
    response = stats.search(doc_type="digest", body=query_body)
    already_deleted = {
        hit["_source"]["recordset_id"] for hit in response["hits"]["hits"]
    }
    print("{} recordsets already marked as deleted in stats.".format(
        len(already_deleted)))

    posted = 0
    with apidbpool.cursor() as cursor:
        cursor.execute(
            "SELECT id FROM uuids WHERE type='recordset' and deleted=true")
        for row in cursor:
            recordset_id = row["id"]
            # Skip recordsets that stats already knows are deleted.
            if recordset_id in already_deleted:
                continue
            posted += 1
            print("Deleting {}.".format(recordset_id))
            post_delete_stats(recordset_id)
    print("{} recordsets deleted from stats.".format(posted))
def write_urls_to_db(to_insert, to_update):
    """Insert new media URLs and reset existing ones, then log both row counts.

    Args:
        to_insert: mapping of url -> sequence whose first two elements are
            used as the ``type`` and ``mime`` column values.
        to_update: iterable of parameter rows passed straight to the UPDATE
            statement, in ``(type, mime, url)`` placeholder order.
    """
    insert_sql = "INSERT INTO media (url,type,mime) VALUES (%s,%s,%s)"
    update_sql = (
        "UPDATE media SET type=%s, mime=%s, last_status=NULL, last_check=NULL WHERE url=%s"
    )
    insert_rows = (
        (url, info[0], info[1]) for url, info in to_insert.items()
    )

    with apidbpool.cursor(autocommit=True) as cur:
        cur.executemany(insert_sql, insert_rows)
        inserted = cur.rowcount
        # Updating also clears last_status/last_check on the matched rows.
        cur.executemany(update_sql, to_update)
        updated = cur.rowcount
    logger.info("Inserted : %8d, Updated : %8d", inserted, updated)
def update_db_status(items):
    """Store each item's status on its media row and yield the item back.

    This is a generator: for every element of ``items`` it runs one UPDATE
    that sets ``last_status`` and stamps ``last_check`` with ``now()`` for
    the row matching ``url``, then yields the element unchanged. When the
    input is exhausted the total affected-row count is logged.

    Args:
        items: iterable of objects exposing ``status_code`` and ``url``.
            ``status_code`` may be a plain value or an object with a
            ``value`` attribute, in which case ``value`` is stored.

    Yields:
        Each element of ``items``, after its UPDATE has been executed.
    """
    sql = "UPDATE media SET last_status=%s, last_check=now() WHERE url=%s"
    total = 0
    with apidbpool.cursor(autocommit=True) as cur:
        for item in items:
            code = item.status_code
            # Prefer the wrapped ``.value`` when present, else the raw code.
            status = getattr(code, "value", code)
            cur.execute(sql, (status, item.url))
            total += cur.rowcount
            yield item
    logger.info("Finished updating %d records", total)
def get_objects_from_ceph():
    """Scan the production storage buckets and record unseen keys in ``objects``.

    Loads every ``etag`` already present in the ``objects`` table, then lists
    the ``idigbio-datasets-prod`` and ``idigbio-images-prod`` buckets. For
    each key whose name is not among the known etags it fetches the first
    bytes of the object, detects a MIME type with ``magic``, and inserts a
    ``(bucket, etag, detected_mime)`` row unless one with that etag exists.

    Failures on an individual key are logged and skipped so that one bad
    object does not abort the whole scan. Progress is logged whenever the
    inserted-row total reaches a new multiple of 10000 and at the end of
    each bucket.
    """
    import magic

    existing_objects = {
        r[0] for r in apidbpool.fetchiter(
            "SELECT etag FROM objects", cursor_factory=cursor)
    }
    logger.info("Found %d objects", len(existing_objects))

    # Loop-invariant statement text; the NOT EXISTS guard keeps the insert
    # from duplicating an etag that is already stored.
    insert_sql = (
        "INSERT INTO objects (bucket,etag,detected_mime) "
        "SELECT %(bucket)s,%(etag)s,%(dm)s "
        "WHERE NOT EXISTS( SELECT 1 FROM objects WHERE etag=%(etag)s)"
    )

    s = IDigBioStorage()
    buckets = ["datasets", "images"]
    count = 0      # keys examined
    rowcount = 0   # rows actually inserted
    lrc = 0        # rowcount at the last periodic commit/log
    # NOTE(review): ``cur`` is obtained from ``apidbpool.cursor()`` rather
    # than from ``conn``; if the pool hands out a separate connection for
    # it, the ``conn.commit()`` calls below do not commit ``cur``'s inserts.
    # Confirm against the pool implementation.
    with apidbpool.connection() as conn:
        with apidbpool.cursor() as cur:
            for b_k in buckets:
                b = s.get_bucket("idigbio-" + b_k + "-prod")
                for k in b.list():
                    if k.name not in existing_objects:
                        try:
                            # Only the leading bytes are needed for sniffing.
                            ks = k.get_contents_as_string(
                                headers={'Range': 'bytes=0-100'})
                            detected_mime = magic.from_buffer(ks, mime=True)
                            cur.execute(insert_sql, {
                                "bucket": b_k,
                                "etag": k.name,
                                "dm": detected_mime
                            })
                            existing_objects.add(k.name)
                            rowcount += cur.rowcount
                        except Exception:
                            # Best effort per key. Catch Exception rather
                            # than everything so KeyboardInterrupt and
                            # SystemExit can still stop a long scan.
                            logger.exception(
                                "Ceph Error; bucket:%s keyname:%s",
                                b_k, k.name)
                    count += 1

                    # ``rowcount != lrc`` prevents re-logging/committing on
                    # every key while rowcount sits on the same multiple.
                    if rowcount != lrc and rowcount % 10000 == 0:
                        logger.info(
                            "Count: %8d, rowcount: %8d", count, rowcount)
                        conn.commit()
                        lrc = rowcount
                conn.commit()
                logger.info(
                    "Count: %8d, rowcount: %8d (Finished %s)",
                    count, rowcount, b_k)