import csv
import datetime
import json
import logging
import traceback
from io import StringIO

import dateutil.parser
import requests

# Assumed project-internal imports (paths follow the idb-backend layout):
# IDigBioStorage wraps the boto connection to Ceph; apidbpool is the shared
# postgres connection pool and cursor its cursor factory. get_true_ic,
# is_uuid, make_temp_table_name, and the local_pg/local_cur globals are
# defined elsewhere in the surrounding scripts.
from idb.helpers.storage import IDigBioStorage
from idb.postgres_backend import apidbpool, cursor

logger = logging.getLogger(__name__)


def backfill_flagged_worker(rows):
    """Parallel worker for backfill_flagged_etags()

    Expects a list of records to work on and commit as a group.
    """
    storage = IDigBioStorage()
    with apidbpool.connection(autocommit=False) as conn:
        cur = conn.cursor()
        for row in rows:
            try:
                table = make_temp_table_name(row["prefix"])
                b = storage.get_bucket(row["ceph_bucket"])
                # Two-phase here because validate=False in the storage class,
                # so the etag is not populated until the key is fetched.
                # Slicing strips the surrounding double quotes.
                row["etag"] = b.get_key(row["ceph_name"]).etag[1:-1]
                cur.execute(
                    """UPDATE {0}
                       SET ceph_etag=%(etag)s
                       WHERE ceph_bucket=%(ceph_bucket)s
                         AND ceph_name=%(ceph_name)s
                    """.format(table), row)
            except Exception:
                logger.error(
                    "Failed to update etag for {0}:{1} in {2} {3}".format(
                        row["ceph_bucket"], row["ceph_name"],
                        row["prefix"], traceback.format_exc()))
        conn.commit()
    return 0
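# For context, a minimal sketch of what the backfill_flagged_etags() driver
# named in the docstring could look like: it chunks flagged rows (assumed
# here to mean rows with a NULL ceph_etag) and fans them out to the worker
# above. The query, chunk size, pool size, and the apidbpool.fetchall call
# are assumptions, not the repo's actual implementation.
from multiprocessing import Pool


def backfill_flagged_etags(prefix, pool_size=8, chunk_size=1000):
    # Hypothetical driver: gather rows still missing an etag and hand the
    # worker commit-sized groups of plain dicts (it mutates row["etag"]).
    rows = [
        {"ceph_bucket": b, "ceph_name": n, "prefix": prefix}
        for (b, n) in apidbpool.fetchall(
            "SELECT ceph_bucket, ceph_name FROM {0} "
            "WHERE ceph_etag IS NULL".format(make_temp_table_name(prefix)))
    ]
    chunks = [rows[i:i + chunk_size] for i in range(0, len(rows), chunk_size)]
    with Pool(pool_size) as p:
        p.map(backfill_flagged_worker, chunks)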
def bucket_list_worker(work):
    logger.debug("Listing and inserting prefix {0} from bucket {1}".format(
        work["prefix"], work["bucket"]))
    storage = IDigBioStorage()

    # Read through the bucket, inserting into the temp table; use a single
    # connection from the pool to commit the whole listing as one group.
    with apidbpool.connection(autocommit=False) as conn:
        cur = conn.cursor()
        b = storage.get_bucket(work["bucket"])
        for f in b.list(prefix=work["prefix"]):
            # See backfill_new_etags() for why no etag here
            cur.execute(
                ("INSERT INTO {0} "
                 "(ceph_bucket, ceph_name, ceph_date, ceph_bytes) "
                 "VALUES (%s, %s, %s, %s)").format(
                     make_temp_table_name(work["prefix"])),
                (work["bucket"], f.name, f.last_modified, f.size))
        conn.commit()
    return 1
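# make_temp_table_name() is defined elsewhere in the module. Below is a
# hypothetical sketch of the helper and of the staging-table DDL it pairs
# with, reconstructed only from the columns the worker inserts and the
# ceph_etag column the backfill worker fills in; the naming scheme, column
# types, and apidbpool.execute usage are all assumptions.
def make_temp_table_name(prefix):
    # Hypothetical: one staging table per listing prefix; the real helper
    # may use a different naming scheme.
    return "ceph_objects_temp_{0}".format(prefix.replace("/", "_"))


def create_temp_table(prefix):
    # Hypothetical DDL matching the columns bucket_list_worker() inserts.
    # varchar(41) leaves room for multipart etags like "<md5>-<parts>".
    apidbpool.execute(
        """CREATE TABLE IF NOT EXISTS {0} (
               ceph_bucket varchar(32),
               ceph_name varchar(128),
               ceph_date timestamp,
               ceph_bytes bigint,
               ceph_etag varchar(41)
           )""".format(make_temp_table_name(prefix)))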
def set_deriv_from_ceph():
    s = IDigBioStorage()
    b = s.get_bucket("idigbio-images-prod-thumbnail")
    count = 0
    for k in b.list():
        # Thumbnail key names start with the source object's etag, so strip
        # the extension to find the matching objects row.
        local_cur.execute("UPDATE objects SET derivatives=true WHERE etag=%s",
                          (k.name.split(".")[0],))
        count += 1
        if count % 10000 == 0:
            logger.info("Flagged %d derivatives", count)
            local_pg.commit()
    logger.info("Flagged %d derivatives", count)
    local_pg.commit()
def get_objects_from_ceph():
    import magic
    existing_objects = set(
        r[0] for r in apidbpool.fetchiter("SELECT etag FROM objects",
                                          cursor_factory=cursor))
    logger.info("Found %d objects", len(existing_objects))

    s = IDigBioStorage()
    buckets = ["datasets", "images"]
    count = 0
    rowcount = 0
    lrc = 0
    # Take the cursor from the same pooled connection so that conn.commit()
    # applies to these inserts.
    with apidbpool.connection(autocommit=False) as conn:
        cur = conn.cursor()
        for b_k in buckets:
            b = s.get_bucket("idigbio-" + b_k + "-prod")
            for k in b.list():
                if k.name not in existing_objects:
                    try:
                        # Fetch only the first bytes of the object; that is
                        # enough for libmagic to sniff the media type.
                        ks = k.get_contents_as_string(
                            headers={'Range': 'bytes=0-100'})
                        detected_mime = magic.from_buffer(ks, mime=True)
                        cur.execute(
                            """INSERT INTO objects (bucket,etag,detected_mime)
                               SELECT %(bucket)s,%(etag)s,%(dm)s
                               WHERE NOT EXISTS(
                                   SELECT 1 FROM objects WHERE etag=%(etag)s)""",
                            {"bucket": b_k, "etag": k.name, "dm": detected_mime})
                        existing_objects.add(k.name)
                        rowcount += cur.rowcount
                    except Exception:
                        logger.exception("Ceph Error; bucket:%s keyname:%s",
                                         b_k, k.name)
                count += 1
                if rowcount != lrc and rowcount % 10000 == 0:
                    logger.info("Count: %8d, rowcount: %8d", count, rowcount)
                    conn.commit()
                    lrc = rowcount
            conn.commit()
            logger.info("Count: %8d, rowcount: %8d (Finished %s)",
                        count, rowcount, b_k)
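# The objects table itself is created elsewhere. A hypothetical
# reconstruction from the columns these scripts read and write (the types,
# key choice, and apidbpool.execute usage are assumptions):
def create_objects_table():
    apidbpool.execute(
        """CREATE TABLE IF NOT EXISTS objects (
               bucket varchar(32),
               etag varchar(41) PRIMARY KEY,  -- Ceph key name, an md5-style etag
               detected_mime varchar(255),    -- written by get_objects_from_ceph()
               derivatives boolean DEFAULT false  -- set by set_deriv_from_ceph()
           )""")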
def main():
    index_file_name = "index.txt"

    # Aggregate record counts by recordset, then institutioncode, then
    # collectioncode, so each download file can be attributed to codes.
    query = {
        "size": 0,
        "aggs": {
            "rs": {
                "terms": {"field": "recordset", "size": 1000},
                "aggs": {
                    "ic": {
                        "terms": {"field": "institutioncode", "size": 1000},
                        "aggs": {
                            "cc": {
                                "terms": {"field": "collectioncode",
                                          "size": 1000}
                            }
                        }
                    }
                }
            }
        }
    }
    r = requests.post("http://search.idigbio.org/idigbio/records/_search",
                      data=json.dumps(query),
                      headers={"Content-Type": "application/json"})
    r.raise_for_status()
    ro = r.json()

    recordsets = {}
    for rs_b in ro["aggregations"]["rs"]["buckets"]:
        rsid = rs_b["key"]
        if len(rs_b["ic"]["buckets"]) == 0:
            ic = ""
            cc = ""
        elif len(rs_b["ic"]["buckets"]) == 1 or (
                float(rs_b["ic"]["buckets"][0]["doc_count"]) /
                float(rs_b["doc_count"]) > 0.9):
            # Terms buckets are sorted by doc_count, so use the first bucket's
            # institution code when it covers more than 90% of the recordset.
            ic_b = rs_b["ic"]["buckets"][0]
            ic = get_true_ic(ic_b["key"])
            if len(ic_b["cc"]["buckets"]) == 0:
                cc = ""
            elif len(ic_b["cc"]["buckets"]) == 1:
                cc = ic_b["cc"]["buckets"][0]["key"]
            else:
                cc = "MULTIPLE"
        else:
            ic = "MULTIPLE"
            cc = "MULTIPLE"
        recordsets[rsid] = {"institutioncode": ic, "collectioncode": cc}

    s = IDigBioStorage()
    b = s.get_bucket("idigbio-static-downloads")
    headers = ["zipfile", "emlfile", "etag", "modified", "recordset_id",
               "institutioncode", "collectioncode"]
    files = {}
    for k in b.list():
        # Skip the index itself
        if k.name == index_file_name:
            continue
        # Skip files more than a week old
        lm_d = dateutil.parser.parse(k.last_modified).date()
        if lm_d < (datetime.datetime.now() - datetime.timedelta(7)).date():
            continue
        fkey = k.name.split(".")[0]
        if fkey not in files:
            files[fkey] = {h: "" for h in headers}
        if k.name.endswith(".eml"):
            files[fkey]["emlfile"] = k.name
        elif k.name.endswith(".zip"):
            files[fkey]["zipfile"] = k.name
            files[fkey]["modified"] = k.last_modified
            files[fkey]["etag"] = k.etag
        if is_uuid(fkey):
            files[fkey]["recordset_id"] = fkey
            if fkey in recordsets:
                files[fkey]["institutioncode"] = \
                    recordsets[fkey]["institutioncode"]
                files[fkey]["collectioncode"] = \
                    recordsets[fkey]["collectioncode"]
            else:
                files[fkey]["institutioncode"] = ""
                files[fkey]["collectioncode"] = ""

    # Write the TSV index and upload it over the old one.
    fil = StringIO()
    cw = csv.writer(fil, delimiter="\t")
    cw.writerow(headers)
    for k in files:
        if files[k]["zipfile"] != "":
            # Etags come back wrapped in double quotes; strip them here.
            cw.writerow([files[k][h].replace("\"", "") for h in headers])
    fil.seek(0)

    ik = b.get_key(index_file_name, validate=False)
    ik.content_type = 'text/tsv'
    ik.set_contents_from_file(fil)
    ik.make_public()
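# is_uuid() and get_true_ic() are helpers defined elsewhere. For reference,
# a minimal hypothetical is_uuid consistent with how it is used above to
# recognize recordset-named download files; the real helper may differ.
import re

# Matches dashed-hex UUID file keys, e.g.
# "00000000-0000-0000-0000-000000000000".
UUID_RE = re.compile(r"^[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}$", re.I)


def is_uuid(value):
    return bool(UUID_RE.match(value))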