def backfill_flagged_worker(rows):
    """Parallel worker for backfill_flagged_etags()

    Expects a list of records to work on and commit as a group.
    """
    storage = IDigBioStorage()
    with apidbpool.connection(autocommit=False) as conn:
        cur = conn.cursor()
        for row in rows:
            try:
                table = make_temp_table_name(row['prefix'])
                b = storage.get_bucket(row['ceph_bucket'])
                # Two phase here because validate=False in storage class so etag is not populated
                row["etag"] = b.get_key(row["ceph_name"]).etag[1:-1]
                cur.execute(
                    """UPDATE {0}
                       SET ceph_etag=%(etag)s
                       WHERE ceph_bucket=%(ceph_bucket)s
                         AND ceph_name=%(ceph_name)s
                    """.format(table), row)
            except:
                logger.error(
                    "Failed to update etag for {0}:{1} in {2} {3}".format(
                        row["ceph_bucket"], row["ceph_name"],
                        row["prefix"], traceback.format_exc()))
        conn.commit()
    return 0
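

# --- Hedged driver sketch (not from the source) -------------------------------
# backfill_flagged_etags() is referenced by the docstring above but is not shown
# here; this only illustrates how flagged rows might be chunked and handed to
# backfill_flagged_worker() so that each chunk is committed as a group. The
# chunk size and the use of multiprocessing.Pool are assumptions.
def backfill_flagged_parallel_sketch(rows, chunk_size=1000, processes=4):
    from multiprocessing import Pool
    chunks = [rows[i:i + chunk_size] for i in range(0, len(rows), chunk_size)]
    pool = Pool(processes=processes)
    try:
        # Each worker call opens its own db connection and commits its chunk.
        return pool.map(backfill_flagged_worker, chunks)
    finally:
        pool.close()
        pool.join()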
def get_objects_from_ceph():
    local_cur.execute("SELECT etag FROM objects")
    existing_objects = set()
    for r in local_cur:
        existing_objects.add(r[0])

    print len(existing_objects)

    s = IDigBioStorage()
    buckets = ["datasets", "images"]
    count = 0
    rowcount = 0
    lrc = 0
    for b_k in buckets:
        b = s.get_bucket("idigbio-" + b_k + "-prod")
        for k in b.list():
            if k.name not in existing_objects:
                try:
                    ks = k.get_contents_as_string(headers={'Range': 'bytes=0-100'})
                    detected_mime = magic.from_buffer(ks, mime=True)
                    local_cur.execute(
                        "INSERT INTO objects (bucket,etag,detected_mime) "
                        "SELECT %(bucket)s,%(etag)s,%(dm)s "
                        "WHERE NOT EXISTS (SELECT 1 FROM objects WHERE etag=%(etag)s)",
                        {"bucket": b_k, "etag": k.name, "dm": detected_mime})
                    existing_objects.add(k.name)
                    rowcount += local_cur.rowcount
                except:
                    print "Ceph Error", b_k, k.name
                count += 1

                if rowcount != lrc and rowcount % 10000 == 0:
                    print count, rowcount
                    local_pg.commit()
                    lrc = rowcount

    print count, rowcount
    local_pg.commit()
def bucket_list_worker(work):
    logger.debug("Listing and inserting prefix {0} from bucket {1}".format(
        work["prefix"], work["bucket"]))
    storage = IDigBioStorage()

    # Read through bucket inserting into temp table
    with apidbpool.connection(autocommit=False) as conn:
        # use a single connection from the pool to commit groups of statements
        cur = conn.cursor()
        # inserted = 1
        #logger.info("Importing bucket listing for {0}.".format(bucket))
        b = storage.get_bucket(work["bucket"])
        for f in b.list(prefix=work["prefix"]):
            # see backfill_new_etags() for why no etag here
            cur.execute(("INSERT INTO {0} "
                         "(ceph_bucket, ceph_name, ceph_date, ceph_bytes) "
                         "VALUES (%s, %s, %s, %s)").format(
                             make_temp_table_name(work["prefix"])),
                        (work["bucket"], f.name, f.last_modified, f.size))
            # inserted += 1
            # if (inserted % 10000) == 0:
            #     logger.info("Committing {0}".format(inserted))
            #     conn.commit()
        conn.commit()
    return 1
def upload_download_file_to_ceph(filename):
    s = IDigBioStorage()
    keyname, bucket = os.path.basename(filename), "idigbio-downloads"
    fkey = s.upload(s.get_key(keyname, bucket), filename,
                    content_type='application/zip', public=True)
    return "http://s.idigbio.org/idigbio-downloads/" + fkey.name
def upload_all(gr):
    if not gr:
        return
    try:
        for item in gr.items:
            IDigBioStorage.retry_loop(lambda: upload_item(item))
        return gr
    except (BotoServerError, BotoClientError):
        logger.exception("%s failed uploading derivatives", gr.etag)
    except Exception:
        logger.exception("%s Unexpected error", gr.etag)
def get_key_object(bucket, name):
    """Get a key object from Ceph for the requested object.

    Note that most of the metadata with the key won't be populated
    until after it has been fetched.
    """
    global STORAGE_HOST
    storage = IDigBioStorage(host=STORAGE_HOST)
    logger.debug("Retrieving key for {0}:{1}".format(bucket, name))
    key = storage.get_key(name, bucket)
    return key
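

# Hedged usage sketch (an assumption, not part of the original module): the key
# returned above has its name and bucket set, but metadata such as size and
# etag is only filled in once the object is fetched, so exists() is a cheap way
# to confirm the object is really in Ceph before doing any work with it. The
# bucket/name arguments are expected to look like "idigbio-images-prod" and an
# object etag.
def object_exists(bucket, name):
    key = get_key_object(bucket, name)
    return key.exists()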
def set_deriv_from_ceph():
    s = IDigBioStorage()
    b = s.get_bucket("idigbio-images-prod-thumbnail")
    count = 0
    for k in b.list():
        local_cur.execute("UPDATE objects SET derivatives=true WHERE etag=%s",
                          (k.name.split(".")[0],))
        count += 1
        if count % 10000 == 0:
            print count
            local_pg.commit()
    print count
    local_pg.commit()
def get_objects_from_ceph():
    import magic
    existing_objects = set(r[0] for r in apidbpool.fetchiter(
        "SELECT etag FROM objects", cursor_factory=cursor))
    logger.info("Found %d objects", len(existing_objects))

    s = IDigBioStorage()
    buckets = ["datasets", "images"]
    count = 0
    rowcount = 0
    lrc = 0
    with apidbpool.connection() as conn:
        with apidbpool.cursor() as cur:
            for b_k in buckets:
                b = s.get_bucket("idigbio-" + b_k + "-prod")
                for k in b.list():
                    if k.name not in existing_objects:
                        try:
                            ks = k.get_contents_as_string(
                                headers={'Range': 'bytes=0-100'})
                            detected_mime = magic.from_buffer(ks, mime=True)
                            cur.execute(
                                """INSERT INTO objects (bucket,etag,detected_mime)
                                   SELECT %(bucket)s,%(etag)s,%(dm)s
                                   WHERE NOT EXISTS(
                                       SELECT 1 FROM objects WHERE etag=%(etag)s)""",
                                {
                                    "bucket": b_k,
                                    "etag": k.name,
                                    "dm": detected_mime
                                })
                            existing_objects.add(k.name)
                            rowcount += cur.rowcount
                        except:
                            logger.exception(
                                "Ceph Error; bucket:%s keyname:%s", b_k, k.name)
                        count += 1

                        if rowcount != lrc and rowcount % 10000 == 0:
                            logger.info("Count: %8d, rowcount: %8d",
                                        count, rowcount)
                            conn.commit()
                            lrc = rowcount

                conn.commit()
                logger.info("Count: %8d, rowcount: %8d (Finished %s)",
                            count, rowcount, b_k)
def fetch_media(key):
    try:
        return IDigBioStorage.get_contents_to_mem(key, md5=key.name)
    except BotoServerError as e:
        logger.error("%r failed downloading with %r %s %s",
                     key, e.status, e.reason, key.name)
        raise
    except S3DataError as e:
        logger.error("%r failed downloading on md5 mismatch", key)
        raise
    except BotoClientError as e:
        logger.exception("%r failed downloading because...", key)
        raise
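

# Hedged sketch (assumption, not from the source): fetch_media() re-raises on
# failure, so a caller could wrap it in the IDigBioStorage.retry_loop() helper
# used elsewhere in this codebase to retry transient Boto errors before giving
# up for good.
def fetch_media_with_retry(key):
    return IDigBioStorage.retry_loop(lambda: fetch_media(key))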
def get_file(rsid):
    fname = rsid
    if not os.path.exists(fname):
        try:
            RecordSet.fetch_file(rsid, fname,
                                 media_store=IDigBioStorage(),
                                 logger=logger.getChild(rsid))
        except (S3ResponseError, S3DataError):
            getrslogger(rsid).exception("failed fetching archive")
            raise
    mime = magic.from_file(fname, mime=True)
    return (fname, mime)
def upload_recordset(rsid, fname, idbmodel):
    filereference = "http://api.idigbio.org/v1/recordsets/" + rsid
    logger.debug("Starting Upload of %r", rsid)
    stor = IDigBioStorage()
    with open(fname, 'rb') as fobj:
        mo = MediaObject.fromobj(
            fobj, url=filereference, type='datasets', owner=config.IDB_UUID)
        k = mo.get_key(stor)
        if k.exists():
            logger.debug("ETAG %s already present in Storage.", mo.etag)
        else:
            mo.upload(stor, fobj)
            logger.debug("ETAG %s uploading from %r", mo.etag, fname)

        mo.ensure_media(idbmodel)
        mo.ensure_object(idbmodel)
        mo.ensure_media_object(idbmodel)
        logger.debug("Finished Upload of %r, etag = %s", rsid, mo.etag)
        return mo.etag
def process_list(fetchitems, forprefix=''):
    """Process a list of FetchItems.

    This is intended to be the toplevel entry point of a subprocess
    working on a list of one domain's urls.
    """
    try:
        store = IDigBioStorage()
        fetchrpool = gevent.pool.Pool(get_fetcher_count(forprefix))
        uploadpool = gevent.pool.Pool(8)
        items = fetchrpool.imap_unordered(
            lambda fi: fi.get_media(), fetchitems, maxsize=10)
        items = uploadpool.imap_unordered(
            lambda fi: fi.upload_to_storage(store), items, maxsize=10)
        items = itertools.imap(FetchItem.cleanup, items)
        items = update_db_status(items)
        items = count_result_types(items, forprefix=forprefix)
        return ilen(items)  # consume the generator
    except StandardError:
        logger.exception("Unhandled error forprefix:%s", forprefix)
        raise
def verify_object(row_obj, key_obj):
    """Download an object and check it against the expected metadata.

    Return is a string status of the result of checking the file:

    verified - Object downloads and all available data matches
    stashed - Object is verified and a copy has been kept in the stash directory
    timeout - Download times out, probably due to the file being truncated
    nosuchkey - Object does not exist, 404 error when downloading
    invalid - Some of the metadata does not match
    S3ResponseError - Some other S3 error occurred while downloading
    503Error - Service unavailable while downloading
    failed - No longer used; when this function was boolean this was False
    """
    storage = IDigBioStorage(host=STORAGE_HOST)

    try:
        if not os.path.exists(TMP_DIR):
            os.makedirs(TMP_DIR)

        fn = os.path.join(TMP_DIR, key_obj.name)
        logger.debug("Fetching file {0}:{1}".format(key_obj.bucket.name,
                                                    key_obj.name))
        storage.get_contents_to_filename(key_obj, fn)
        md5 = calc_md5(fn)
        size = os.stat(fn).st_size
    except S3ResponseError as ex:
        if "NoSuchKey" in str(ex):
            logger.error("No such key when getting {0}:{1}".format(
                key_obj.bucket.name, key_obj.name))
            return "nosuchkey"
        else:
            logger.error(
                "Exception while attempting to get file {0}:{1} {2}".format(
                    key_obj.bucket.name, key_obj.name, traceback.format_exc()))
            # raise
            return "S3ResponseError"
    except (HTTPException, socket_error) as ex:
        # Timeout can be controlled by /etc/boto.cfg - see
        # http://boto.cloudhackers.com/en/latest/boto_config_tut.html
        logger.error(
            "Socket timeout when getting {0}:{1}, file is probably corrupt in ceph"
            .format(key_obj.bucket.name, key_obj.name))
        if os.path.exists(fn):
            os.unlink(fn)
        return "timeout"
    except Exception as ex:
        if "503 Service Unavailable" in str(ex):
            logger.error("Service unavailable getting {0}:{1}".format(
                key_obj.bucket.name, key_obj.name))
            return "503Error"
        else:
            logger.error(
                "Exception while attempting to get file {0}:{1} {2}".format(
                    key_obj.bucket.name, key_obj.name, traceback.format_exc()))
            raise

    # The db may have partial information so we need to support it being
    # empty, but if it exists, it should match. Use logging to say what's
    # wrong with the file, maintain a return value if anything fails.
    retval = False

    if not retval and (not size == key_obj.size):
        logger.error(
            "File size {0} does not match ceph size {1} for {2}:{3}".format(
                size, key_obj.size, key_obj.bucket.name, key_obj.name))
        retval = "invalid"

    if not retval and (row_obj["ceph_bytes"] and
                       (not size == row_obj["ceph_bytes"])):
        logger.error(
            "File size {0} does not match db size {1} for {2}:{3}".format(
                size, row_obj["ceph_bytes"], key_obj.bucket.name, key_obj.name))
        retval = "invalid"

    if not retval and (not md5 == key_obj.etag[1:-1]):  # etag is wrapped in ""
        logger.error(
            "File md5 {0} does not match ceph etag {1} for {2}:{3}".format(
                md5, key_obj.etag[1:-1], key_obj.bucket.name, key_obj.name))
        retval = "invalid"

    # db etag has extra '-' chars
    if not retval and (row_obj["ceph_etag"] and
                       (not md5 == row_obj["ceph_etag"].replace('-', ''))):
        logger.error(
            "File md5 {0} does not match db etag {1} for {2}:{3}".format(
                md5, row_obj["ceph_etag"], key_obj.bucket.name, key_obj.name))
        retval = "invalid"

    if not retval:
        logger.debug("Object {0}:{1} verified".format(key_obj.bucket.name,
                                                      key_obj.name))
        #global args
        if STASH and stash_file(fn, key_obj):
            retval = "stashed"
            if DELETE and not TEST:
                logger.debug("Deleting {0}:{1} from ceph.".format(
                    key_obj.bucket.name, key_obj.name))
                try:
                    key_obj.delete()
                    DELETED.append(key_obj.name)
                except:
                    logger.error(
                        "Unable to delete object {0}:{1} in ceph.".format(
                            key_obj.bucket.name, key_obj.name))
        else:
            retval = "verified"
    else:
        logger.warn("Object {0}:{1} failed verification".format(
            key_obj.bucket.name, key_obj.name))

    try:
        os.unlink(fn)
    except:
        pass

    return retval
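

# Illustrative caller sketch (not from the source): tally the string statuses
# documented in verify_object()'s docstring. The (row_obj, key_obj) pairs are
# assumed to come from the temp-table row listing and get_key_object() above.
def verify_and_tally(rows_and_keys):
    from collections import Counter
    results = Counter()
    for row_obj, key_obj in rows_and_keys:
        results[verify_object(row_obj, key_obj)] += 1
    # e.g. Counter({'verified': 98, 'stashed': 12, 'invalid': 1})
    return results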
from idb.helpers.storage import IDigBioStorage
from idb.helpers import gipcpool

from idigbio_ingestion.lib.dwca import Dwca
from idigbio_ingestion.lib.delimited import DelimitedFile

bad_chars = u"\ufeff"
bad_char_re = re.compile("[%s]" % re.escape(bad_chars))

logger = idblogger.getChild("db-check")

uuid_re = re.compile(
    "([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})")

s = IDigBioStorage()


class RecordException(Exception):
    pass


def getrslogger(rsid):
    return logger.getChild(rsid)


def mungeid(s):
    return bad_char_re.sub('', s).strip()


identifier_fields = {
from __future__ import division, absolute_import, print_function

from idb.helpers.logging import getLogger, configure_app_log
from idb.helpers.storage import IDigBioStorage
from idb.helpers.media_validation import sniff_mime
from idb.helpers.memoize import filecached

import hashlib

logger = getLogger("restore")
store = IDigBioStorage()


def get_object_from_backup(etag):
    """Return a buffer that is the contents of the object that should be in
    `idigbio-images-prod/$etag`."""
    obj = open('reestores/' + etag, 'rb').read()
    md5 = hashlib.md5()
    md5.update(obj)
    assert etag == md5.hexdigest()
    return obj


@filecached("/tmp/restore-from-backup.picklecache")
def get_fouled():
    """Return the original list, cached in a file that is written back as we
    make progress. This way, as we make progress and kill/rerun the script,
    it only tries new ones.
    """
    return {(u'images', u'06fbc3c99d7d9f06e1487adbbe171f82', u'image/jpeg'),
            (u'images', u'0f4ceba6d970ad48e43e740a794b288a', u'image/jpeg'),
            (u'images', u'1155e9e1d4af33ebf3ab841523bb9a4c', None),
def main():
    index_file_name = "index.txt"

    query = {
        "size": 0,
        "aggs": {
            "rs": {
                "terms": {
                    "field": "recordset",
                    "size": 1000
                },
                "aggs": {
                    "ic": {
                        "terms": {
                            "field": "institutioncode",
                            "size": 1000,
                        },
                        "aggs": {
                            "cc": {
                                "terms": {
                                    "field": "collectioncode",
                                    "size": 1000,
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    r = requests.post("http://search.idigbio.org/idigbio/records/_search",
                      data=json.dumps(query),
                      headers={"Content-Type": "application/json"})
    r.raise_for_status()
    ro = r.json()

    recordsets = {}
    for rs_b in ro["aggregations"]["rs"]["buckets"]:
        rsid = rs_b["key"]
        ic = ""
        cc = ""
        if len(rs_b["ic"]["buckets"]) == 0:
            ic = ""
            cc = ""
        elif len(rs_b["ic"]["buckets"]) == 1 or (
                float(rs_b["ic"]["buckets"][0]["doc_count"]) /
                float(rs_b["doc_count"]) > 0.9):
            ic_b = rs_b["ic"]["buckets"][0]
            ic = get_true_ic(ic_b["key"])
            if len(ic_b["cc"]["buckets"]) == 0:
                cc = ""
            elif len(ic_b["cc"]["buckets"]) == 1:
                cc = ic_b["cc"]["buckets"][0]["key"]
            else:
                cc = "MULTIPLE"
        else:
            # print(rs_b)
            ic = "MULTIPLE"
            cc = "MULTIPLE"

        recordsets[rsid] = {
            "institutioncode": ic,
            "collectioncode": cc
        }

    s = IDigBioStorage()
    b = s.get_bucket("idigbio-static-downloads")

    headers = ["zipfile", "emlfile", "etag", "modified", "recordset_id",
               "institutioncode", "collectioncode"]

    files = {}
    for k in b.list():
        # Skip the index itself
        if k.name == index_file_name:
            continue

        # Skip files older than 8 days
        lm_d = dateutil.parser.parse(k.last_modified).date()
        if lm_d < (datetime.datetime.now() - datetime.timedelta(7)).date():
            continue

        fkey = k.name.split(".")[0]
        if fkey not in files:
            files[fkey] = {h: "" for h in headers}

        if k.name.endswith(".eml"):
            files[fkey]["emlfile"] = k.name
        elif k.name.endswith(".zip"):
            files[fkey]["zipfile"] = k.name
            files[fkey]["modified"] = k.last_modified
            files[fkey]["etag"] = k.etag

        if is_uuid(fkey):
            files[fkey]["recordset_id"] = fkey
            if fkey in recordsets:
                files[fkey]["institutioncode"] = recordsets[fkey]["institutioncode"]
                files[fkey]["collectioncode"] = recordsets[fkey]["collectioncode"]
            else:
                files[fkey]["institutioncode"] = ""
                files[fkey]["collectioncode"] = ""

    fil = StringIO()
    cw = csv.writer(fil, delimiter="\t")
    cw.writerow(headers)
    for k in files:
        if files[k]["zipfile"] != "":
            cw.writerow([files[k][h].replace("\"", "") for h in headers])
    fil.seek(0)

    ik = b.get_key(index_file_name, validate=False)
    ik.content_type = 'text/tsv'
    ik.set_contents_from_file(fil)
    ik.make_public()
def thumb_key(img_etag):
    from idb.helpers.storage import IDigBioStorage
    return IDigBioStorage().get_key(img_etag + '.jpg',
                                    'idigbio-images-prod-thumbnail')


def img_key(img_etag):
    from idb.helpers.storage import IDigBioStorage
    return IDigBioStorage().get_key(img_etag, 'idigbio-images-prod')


def sounds_key(sounds_etag):
    from idb.helpers.storage import IDigBioStorage
    return IDigBioStorage().get_key(sounds_etag, 'idigbio-sounds-prod')
def upload():
    vals = {}
    j = request.get_json()
    if j is not None:
        vals.update(j)
    for k, v in request.values.iteritems():
        vals[k] = v

    filereference = vals.get("filereference")
    if not filereference:
        logger.warning("No filereference specified")
        return json_error(400, "Missing filereference")

    obj = request.files.get('file')
    etag = vals.get('etag')
    media_type = vals.get("media_type")
    mime = vals.get("mime")

    try:
        mime, media_type = validate_mime_for_type(mime, media_type)
    except MediaValidationError as mve:
        logger.warning("Bad mime/media_type combo: %r/%r", mime, media_type)
        return json_error(400, str(mve))

    r = MediaObject.fromurl(filereference, idbmodel=idbmodel)
    if r:
        logger.warning("Found existing object for %r", r.url)
        if r.owner != request.authorization.username:
            return json_error(403)

    if obj:
        # if either type or mime are null it will be ignored, if
        # present they change the behavior of fromobj
        try:
            mo = MediaObject.fromobj(
                obj, type=media_type, mime=mime, url=filereference, etag=etag)
        except MediaValidationError as mve:
            logger.warning("Validation failure, %r", mve)
            return json_error(400, str(mve))

        mo.upload(IDigBioStorage(), obj)
        mo.insert_object(idbmodel)
    elif etag:
        mo = MediaObject.frometag(etag, idbmodel)
        if not mo or not mo.get_key(IDigBioStorage()).exists():
            return json_error(404, "Unknown etag {0!r}".format(etag))
        mo.last_status = 200
        mo.last_check = datetime.now()
        mo.mime = mime or mo.detected_mime
        mo.type = media_type or mo.bucket
    else:
        mo = r or MediaObject()
        mo.last_status = None
        mo.last_check = None
        try:
            mo.mime, mo.type = validate_mime_for_type(
                mo.mime or mime, mo.type or media_type)
            if not (mo.mime and mo.type):
                logger.warning("Missing either mime(%r) or type(%r)",
                               mo.mime, mo.type)
                return json_error(400, "Incomplete request")
        except MediaValidationError as mve:
            logger.warning("Validation Failure, %r", mve)
            return json_error(400, str(mve))

    mo.url = filereference
    mo.owner = request.authorization.username

    if r:
        mo.update_media(idbmodel)
    else:
        mo.insert_media(idbmodel)

    if mo.etag:
        mo.ensure_media_object(idbmodel)

    idbmodel.commit()
    return respond_to_record(mo, format='json')
                c['erred'] += 1
            elif len(result.items) > 0:
                c['generated'] += 1
            else:
                c['existed'] += 1
            if count % update_freq == 0:
                output()
            yield result
    except KeyboardInterrupt:
        output()
        raise
    output()


get_store = memoized()(lambda: IDigBioStorage())


def get_keys(obj):
    etag, bucket = obj.etag, obj.bucket
    etag = unicode(etag)
    s = get_store()
    bucketbase = u"idigbio-{0}-{1}".format(bucket, config.ENV)
    mediakey = s.get_key(etag, bucketbase)
    keys = [
        s.get_key(etag + ".jpg", bucketbase + '-' + dtype)
        for dtype in DTYPES
    ]
    return CheckItem(etag, bucket, mediakey, keys)


def generate_all(item):
def main():
    s = IDigBioStorage()

    # static_queries = [
    #     ({},"idigbio"),
    #     ({"hasImage": True},"idigbio-images"),
    #     ({"geopoint":{"type":"exists"},"taxonid":{"type":"exists"}},"idigbio-geotaxon")
    # ]

    # rsquery = {
    #     "query": {
    #         "match_all": {}
    #     },
    #     "size": 0,
    #     "aggs": {
    #         "recordset_counts": {
    #             "terms": {
    #                 "field": "recordset",
    #                 "size": 10000
    #             }
    #         }
    #     }
    # }

    # ro = runQuery(rsquery)
    # if ro is not None:
    #     print(len(ro["aggregations"]["recordset_counts"]["buckets"]))
    #     for b in ro["aggregations"]["recordset_counts"]["buckets"]:
    #         #print(b["key"], b["doc_count"], b["doc_count"] * 7 / 10000)
    #         static_queries.append(({
    #             "recordset": b["key"]
    #         },b["key"]))

    # print(len(static_queries))

    # count = 0
    # for q in reversed(static_queries):
    #     print(count, q)
    #     file_name = generate_files(record_query=queryFromShim(q[0])["query"],form="dwca-csv",filename=q[1])
    #     print(q[1], file_name)
    #     u = upload_download_file_to_ceph(s,file_name)
    #     # # rseml = eml_from_recordset(q[1],env="prod")
    #     # # e = upload_eml_file_to_ceph(s,q[1],rseml)
    #     print(q[1], u)
    #     count += 1

    # NOTE: the dict below repeats the "geopoint" key; the later
    # geo_bounding_box value is the one that takes effect.
    file_name = generate_files(record_query=queryFromShim({
        "geopoint": {
            "type": "exists"
        },
        "taxonomicstatus": "accepted",
        "taxonid": {
            "type": "exists"
        },
        "flags": "gbif_taxon_corrected",
        "kingdom": "plantae",
        "geopoint": {
            "type": "geo_bounding_box",
            "top_left": {
                "lat": 89,
                "lon": -179
            },
            "bottom_right": {
                "lat": -89,
                "lon": -33
            }
        }
    })["query"], form="dwca-csv", filename="idigbio-plantae-w")

    u = upload_download_file_to_ceph(s, file_name)
def store(request):
    from idb.helpers.storage import IDigBioStorage
    store = IDigBioStorage()
    return store
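

# Hedged usage sketch, assuming `store` above is registered as a pytest fixture
# in the original test module. The etag and bucket names below are hypothetical
# placeholders, not values from the source.
def test_store_can_build_keys(store):
    key = store.get_key("0123456789abcdef0123456789abcdef", "idigbio-images-prod")
    assert key.name == "0123456789abcdef0123456789abcdef"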