def get_postgres_media_objects(prefix):
    assert prefix is None, "prefix isn't implemented on this function"
    count, rowcount, lrc = 0, 0, 0
    sql = "SELECT lookup_key, etag, date_modified FROM idb_object_keys"
    with apidbpool.connection() as insertconn:
        with insertconn.cursor(cursor_factory=cursor) as cur:
            for r in apidbpool.fetchiter(sql, name="get_postgres_media_objects"):
                cur.execute(
                    """
                     INSERT INTO media_objects (url, etag, modified)
                       SELECT %(url)s, %(etag)s, %(modified)s
                       WHERE EXISTS (SELECT 1 FROM media WHERE url=%(url)s)
                         AND EXISTS (SELECT 1 FROM objects WHERE etag=%(etag)s)
                         AND NOT EXISTS (SELECT 1 FROM media_objects
                                         WHERE url=%(url)s AND etag=%(etag)s)
                    """,
                    {"url": r[0], "etag": r[1], "modified": r[2]})
                count += 1
                rowcount += cur.rowcount

                if rowcount != lrc and rowcount % 10000 == 0:
                    insertconn.commit()
                    logger.info("Count: %8d, rowcount: %8d", count, rowcount)
                    lrc = rowcount
        insertconn.commit()
        logger.info("Count: %8d, rowcount: %8d", count, rowcount)
def get_annotations(u):
    annotations = []
    # Pass the uuid as a bound parameter rather than interpolating it
    # into the SQL string.
    for r in apidbpool.fetchiter(
            "select * from annotations where uuids_id=%s", (u,),
            name=str(uuid.uuid4()),
            cursor_factory=NamedTupleCursor):
        annotations.append(r)
    return annotations
def find_new_urls(media_urls, prefix=None, since=None):
    """Iterate through mediarecords' urls and ensure they are in existing urls"""
    logger.info("Searching for new URLs")

    scanned = 0
    to_insert = {}  # prevent duplication
    to_update = []  # just accumulate

    itersql = """SELECT data
                 FROM idigbio_uuids_data
                 WHERE type='mediarecord' AND deleted=false"""
    params = []
    if since:
        logger.debug("Filtering mediarecords modified > %s", since)
        itersql += "\n AND modified > %s"
        params.append(since)

    results = apidbpool.fetchiter(itersql, params, name='write_urls_to_db',
                                  cursor_factory=cursor)
    for row in results:
        data = row[0]
        if scanned % 100000 == 0:
            logger.info("Inserting: %8d, Updating: %8d, Scanned: %8d",
                        len(to_insert), len(to_update), scanned)
        scanned += 1

        url = get_accessuri('mediarecords', data)["accessuri"]
        if url is None:
            continue
        # Undo HTML entity escaping and trim whitespace before matching.
        url = url.replace("&amp;", "&").strip()
        if prefix and not url.startswith(prefix):
            continue
        if check_ignore_media(url):
            continue

        o = get_media_type('mediarecords', data)
        t, mime = o["mediatype"], o["format"]

        entry = media_urls.get(url)
        if entry:
            # We're going to change something, but only if we're
            # adding/replacing things, not nulling existing values.
            if (t, mime) != entry and mime and (t or entry[0] is None):
                to_update.append((t, mime, url))
        elif url not in to_insert:
            to_insert[url] = (t, mime)
        else:
            logger.debug("Skipping url %s, already queued for insert", url)

    logger.info("Inserting: %8d, Updating: %8d, Scanned: %8d; Finished Scan",
                len(to_insert), len(to_update), scanned)
    return to_insert, to_update
def type_yield(ei, rc, typ, yield_record=False):
    # drop the trailing s
    pg_typ = "".join(typ[:-1])

    sql = "SELECT * FROM idigbio_uuids_data WHERE type=%s AND deleted=false"
    results = apidbpool.fetchiter(sql, (pg_typ,), named=True,
                                  cursor_factory=DictCursor)
    for r in rate_logger(typ, results):
        if yield_record:
            yield r
        else:
            yield index_record(ei, rc, typ, r, do_index=False)
def existing_media_urls(prefix=None):
    "Find existing media urls"
    logger.info("Get Media URLs, prefix: %r", prefix)
    sql = "SELECT url,type,mime FROM media"
    params = []
    if prefix:
        sql += " WHERE url LIKE %s"
        params.append(prefix + '%')

    rows = apidbpool.fetchiter(sql, params, cursor_factory=cursor)
    media_urls = {r[0]: (r[1], r[2]) for r in rows}
    logger.info("Found %d urls already in DB", len(media_urls))
    return media_urls
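# A possible driver tying existing_media_urls() and find_new_urls() together.
# This is a sketch under assumptions: the media table columns (url, type, mime)
# are taken from the SELECT in existing_media_urls above, and each to_update row
# is a (type, mime, url) tuple as accumulated in find_new_urls. sync_media_urls
# itself is hypothetical and not part of this module.
def sync_media_urls(prefix=None, since=None):
    media_urls = existing_media_urls(prefix)
    to_insert, to_update = find_new_urls(media_urls, prefix=prefix, since=since)
    with apidbpool.connection() as conn:
        with conn.cursor() as cur:
            cur.executemany(
                "INSERT INTO media (url, type, mime) VALUES (%s, %s, %s)",
                [(url, t, mime) for url, (t, mime) in to_insert.items()])
            cur.executemany(
                "UPDATE media SET type=%s, mime=%s WHERE url=%s",
                to_update)
        conn.commit()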
def get_objects_from_ceph():
    import magic
    existing_objects = set(
        r[0] for r in apidbpool.fetchiter("SELECT etag FROM objects",
                                          cursor_factory=cursor))
    logger.info("Found %d objects", len(existing_objects))

    s = IDigBioStorage()
    buckets = ["datasets", "images"]
    count = 0
    rowcount = 0
    lrc = 0
    with apidbpool.connection() as conn:
        # The cursor must come from the same connection that gets committed below.
        with conn.cursor() as cur:
            for b_k in buckets:
                b = s.get_bucket("idigbio-" + b_k + "-prod")
                for k in b.list():
                    if k.name not in existing_objects:
                        try:
                            # Sniff the mime type from the first 100 bytes of the object.
                            ks = k.get_contents_as_string(
                                headers={'Range': 'bytes=0-100'})
                            detected_mime = magic.from_buffer(ks, mime=True)
                            cur.execute(
                                """INSERT INTO objects (bucket,etag,detected_mime)
                                   SELECT %(bucket)s,%(etag)s,%(dm)s
                                   WHERE NOT EXISTS(
                                      SELECT 1 FROM objects WHERE etag=%(etag)s)""",
                                {"bucket": b_k, "etag": k.name, "dm": detected_mime})
                            existing_objects.add(k.name)
                            rowcount += cur.rowcount
                        except Exception:
                            logger.exception("Ceph Error; bucket:%s keyname:%s",
                                             b_k, k.name)
                    count += 1

                    if rowcount != lrc and rowcount % 10000 == 0:
                        logger.info("Count: %8d, rowcount: %8d", count, rowcount)
                        conn.commit()
                        lrc = rowcount
                conn.commit()
                logger.info("Count: %8d, rowcount: %8d (Finished %s)",
                            count, rowcount, b_k)
def type_yield_resume(ei, rc, typ, also_delete=False, yield_record=False):
    es_ids = get_resume_cache(ei, typ)

    logger.info("%s: Indexing", typ)
    pg_typ = "".join(typ[:-1])

    sql = "SELECT * FROM idigbio_uuids_data WHERE type=%s"
    if not also_delete:
        sql += " AND deleted=false"
    results = apidbpool.fetchiter(sql, (pg_typ,), named=True,
                                  cursor_factory=DictCursor)
    for r in rate_logger(typ + " indexing", results):
        es_etag = es_ids.get(r["uuid"])
        pg_etag = r['etag']
        # Skip records already indexed with the same etag, and tombstoned
        # records that were never indexed in the first place.
        if es_etag == pg_etag or (pg_etag == tombstone_etag and es_etag is None):
            continue
        if yield_record:
            yield r
        else:
            yield index_record(ei, rc, typ, r, do_index=False)
def delete(ei, rc, no_index=False):
    logger.info("Running deletes")
    count = 0
    sql = "SELECT id,type FROM uuids WHERE deleted=true"
    results = apidbpool.fetchiter(sql, named=True, cursor_factory=DictCursor)
    for r in results:
        count += 1
        if not no_index:
            ei.es.delete(**{
                "index": ei.indexName,
                "doc_type": r["type"] + 's',
                "id": r["id"]
            })

        if count % 10000 == 0:
            logger.info("%s", count)
    logger.info("%s", count)
    try:
        ei.optimize()
    except Exception:
        # optimize is best-effort; ignore failures
        pass
def type_yield_modified(ei, rc, typ, yield_record=False):
    pg_typ = "".join(typ[:-1])

    # Find the most recent datemodified already in the index for this type.
    q = {
        "index": ei.indexName,
        "size": 0,
        "doc_type": typ,
        "body": {
            "aggs": {
                "mm": {
                    "max": {
                        "field": "datemodified"
                    }
                }
            }
        }
    }
    o = ei.es.search(**q)
    after = datetime.datetime.utcfromtimestamp(
        math.ceil(o["aggregations"]["mm"]["value"] / 1000))
    logger.info("Indexing %s after %s", typ, after.isoformat())

    # Note, a subtle distinction: the query below will index every
    # _version_ of every record modified since that date; it is thus
    # imperative that the records are processed in ascending modified
    # order. In practice, this is unlikely to index more than one
    # version of a record in a single run, but it is possible.
    sql = """SELECT
            uuids.id as uuid,
            type,
            deleted,
            data_etag as etag,
            version,
            modified,
            parent,
            recordids,
            siblings,
            uuids_data.id as vid,
            data
        FROM uuids_data
        LEFT JOIN uuids
          ON uuids.id = uuids_data.uuids_id
        LEFT JOIN data
          ON data.etag = uuids_data.data_etag
        LEFT JOIN LATERAL (
            SELECT uuids_id, array_agg(identifier) as recordids
            FROM uuids_identifier
            WHERE uuids_id=uuids.id
            GROUP BY uuids_id
        ) as ids ON ids.uuids_id=uuids.id
        LEFT JOIN LATERAL (
            SELECT count(*) AS annotation_count
            FROM annotations
            WHERE uuids_id = uuids.id
        ) AS ac ON TRUE
        LEFT JOIN LATERAL (
            SELECT subject, json_object_agg(rel,array_agg) as siblings
            FROM (
                SELECT subject, rel, array_agg(object)
                FROM (
                    SELECT r1 as subject, type as rel, r2 as object
                    FROM (
                        SELECT r1,r2 FROM uuids_siblings
                        UNION
                        SELECT r2,r1 FROM uuids_siblings
                    ) as rel_union
                    JOIN uuids ON r2=id
                    WHERE uuids.deleted = false
                ) as rel_table
                WHERE subject=uuids.id
                GROUP BY subject, rel
            ) as rels
            GROUP BY subject
        ) as sibs ON sibs.subject=uuids.id
        WHERE type=%s and modified>%s
        ORDER BY modified ASC;
    """
    results = apidbpool.fetchiter(sql, (pg_typ, after), named=True,
                                  cursor_factory=DictCursor)
    for r in rate_logger(typ, results):
        if yield_record:
            yield r
        else:
            yield index_record(ei, rc, typ, r, do_index=False)
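# A minimal consumption sketch for the type_yield* generators above, assuming
# they are driven with yield_record=True and fed to elasticsearch's bulk
# helpers. The action shape (_index/_type/_id from ei.indexName, typ, and the
# record's uuid) mirrors the delete() call above, but this is an illustration,
# not the module's real indexing path (index_record); bulk_index_modified is a
# hypothetical helper.
def bulk_index_modified(ei, rc, typ):
    from elasticsearch import helpers

    actions = (
        {
            "_index": ei.indexName,
            "_type": typ,
            "_id": rec["uuid"],
            "_source": dict(rec),
        }
        for rec in type_yield_modified(ei, rc, typ, yield_record=True)
    )
    # streaming_bulk keeps memory flat by sending records as they arrive
    for ok, item in helpers.streaming_bulk(ei.es, actions):
        if not ok:
            logger.warning("Failed to index %r", item)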