Example #1
def get_postgres_media_objects(prefix):
    assert prefix is None, "prefix isn't implemented on this function"
    count, rowcount, lrc = 0, 0, 0
    sql = "SELECT lookup_key, etag, date_modified FROM idb_object_keys"
    with apidbpool.connection() as insertconn:
        with insertconn.cursor(cursor_factory=cursor) as cur:
            for r in apidbpool.fetchiter(sql,
                                         name="get_postgres_media_objects"):
                cur.execute(
                    """
                     INSERT INTO media_objects (url, etag, modified)
                     SELECT %(url)s, %(etag)s, %(modified)s
                     WHERE EXISTS (SELECT 1 FROM media WHERE url=%(url)s)
                       AND EXISTS (SELECT 1 FROM objects WHERE etag=%(etag)s)
                       AND NOT EXISTS (SELECT 1 FROM media_objects WHERE url=%(url)s AND etag=%(etag)s)
                """, {
                        "url": r[0],
                        "etag": r[1],
                        "modified": r[2]
                    })
                count += 1
                rowcount += cur.rowcount

                if rowcount != lrc and rowcount % 10000 == 0:
                    insertconn.commit()
                    logger.info("Count: %8d,  rowcount: %8d", count, rowcount)
                    lrc = rowcount
        insertconn.commit()
        logger.info("Count: %8d,  rowcount: %8d", count, rowcount)
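The INSERT ... SELECT ... WHERE EXISTS / NOT EXISTS pattern above inserts a row only when both referenced rows exist and the (url, etag) pair is not already present, without relying on a unique constraint. A minimal standalone sketch of the same conditional-insert idiom with plain psycopg2 follows; the DSN and sample values are placeholders, not the project's configuration.

import psycopg2

# Placeholder DSN; any reachable PostgreSQL database with a media_objects table works.
conn = psycopg2.connect("dbname=idb_test user=idb")
try:
    with conn.cursor() as cur:
        cur.execute(
            """INSERT INTO media_objects (url, etag, modified)
               SELECT %(url)s, %(etag)s, %(modified)s
               WHERE NOT EXISTS (
                   SELECT 1 FROM media_objects
                   WHERE url = %(url)s AND etag = %(etag)s)""",
            {"url": "http://example.com/img.jpg",
             "etag": "deadbeef",
             "modified": "2017-01-01"})
        print("rows inserted:", cur.rowcount)  # 0 when the (url, etag) pair already exists
    conn.commit()
finally:
    conn.close()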
Example #2
def get_annotations(u):
    annotations = []
    for r in apidbpool.fetchiter(
            "select * from annotations where uuids_id=%s", (u,),
            name=str(uuid.uuid4()),
            cursor_factory=NamedTupleCursor):
        annotations.append(r)
    return annotations
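The name= and cursor_factory= arguments to fetchiter presumably map onto psycopg2's server-side (named) cursors, which stream rows instead of materializing the whole result set; the query itself should use %s placeholders rather than string formatting. A rough standalone equivalent, assuming plain psycopg2 rather than the project's pool wrapper:

import psycopg2
from psycopg2.extras import NamedTupleCursor

conn = psycopg2.connect("dbname=idb_test user=idb")  # placeholder DSN
with conn.cursor(name="annotations_iter",             # server-side cursor
                 cursor_factory=NamedTupleCursor) as cur:
    # The %s placeholder keeps the value out of the SQL string itself.
    cur.execute("SELECT * FROM annotations WHERE uuids_id = %s",
                ("00000000-0000-0000-0000-000000000000",))
    for row in cur:  # rows are fetched from the server in batches of cur.itersize
        print(row.uuids_id)
conn.close()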
Example #3
def find_new_urls(media_urls, prefix=None, since=None):
    """Iterate through mediarecords' urls and ensure they are in existing urls"""
    logger.info("Searching for new URLs")

    scanned = 0
    to_insert = {}  # prevent duplication
    to_update = []  # just accumulate
    itersql = """SELECT data
                 FROM idigbio_uuids_data
                 WHERE type='mediarecord' AND deleted=false"""
    params = []
    if since:
        logger.debug("Filtering mediarecords modified > %s", since)
        itersql += "\n AND modified > %s"
        params.append(since)

    results = apidbpool.fetchiter(itersql,
                                  params,
                                  name='write_urls_to_db',
                                  cursor_factory=cursor)
    for row in results:
        data = row[0]
        if scanned % 100000 == 0:
            logger.info("Inserting: %8d, Updating: %8d, Scanned: %8d",
                        len(to_insert), len(to_update), scanned)

        scanned += 1
        url = get_accessuri('mediarecords', data)["accessuri"]
        if url is None:
            continue
        url = url.replace("&amp;", "&").strip()
        if prefix and not url.startswith(prefix): continue
        if check_ignore_media(url): continue

        o = get_media_type('mediarecords', data)
        t, mime = o["mediatype"], o["format"]

        entry = media_urls.get(url)
        if entry:
            # We're going to change something, but only if we're
            # adding/replacing things, not nulling existing values.
            if (t, mime) != entry and mime and (t or entry[0] is None):
                to_update.append((t, mime, url))
        elif url not in to_insert:
            to_insert[url] = (t, mime)
        else:
            logger.debug("Repeated insert from ")

    logger.info("Inserting: %8d, Updating: %8d, Scanned: %8d; Finished Scan",
                len(to_insert), len(to_update), scanned)

    return to_insert, to_update
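The replace("&amp;", "&") call above only reverses escaped ampersands; if the source data can contain other HTML entities, Python 3's html module can decode them all. A tiny alternative sketch, not what the function above actually does:

import html

url = "http://example.com/img?a=1&amp;b=2"
print(html.unescape(url).strip())  # http://example.com/img?a=1&b=2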
def type_yield(ei, rc, typ, yield_record=False):
    # drop the trailing s
    pg_typ = "".join(typ[:-1])

    sql = "SELECT * FROM idigbio_uuids_data WHERE type=%s AND deleted=false"
    results = apidbpool.fetchiter(sql, (pg_typ, ),
                                  named=True,
                                  cursor_factory=DictCursor)
    for r in rate_logger(typ, results):
        if yield_record:
            yield r
        else:
            yield index_record(ei, rc, typ, r, do_index=False)
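rate_logger is not shown in these examples; from its use here it wraps an iterable, passes items through unchanged, and periodically logs progress. A minimal compatible sketch, offered as an assumption about its behavior rather than the project's implementation:

import logging
import time

logger = logging.getLogger(__name__)

def rate_logger(label, iterable, every=10000):
    """Yield items unchanged, logging throughput every `every` items."""
    start = time.time()
    count = 0
    for item in iterable:
        count += 1
        if count % every == 0:
            elapsed = max(time.time() - start, 1e-6)
            logger.info("%s: %d items, %.1f items/s", label, count, count / elapsed)
        yield item
    logger.info("%s: finished, %d items total", label, count)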
Example #5
def existing_media_urls(prefix=None):
    "Find existing media urls"
    logger.info("Get Media URLs, prefix: %r", prefix)
    sql = "SELECT url,type,mime FROM media"
    params = []
    if prefix:
        sql += " WHERE url LIKE %s"
        params.append(prefix + '%')

    rows = apidbpool.fetchiter(sql, params, cursor_factory=cursor)
    media_urls = {r[0]: (r[1], r[2]) for r in rows}
    logger.info("Found %d urls already in DB", len(media_urls))
    return media_urls
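Together with find_new_urls from Example #3, a driver would presumably first load the known urls and then scan the mediarecords against them; the orchestration below is an assumption, only the two signatures come from the examples.

# Hypothetical driver; prefix filtering is optional in both functions.
prefix = "http://example.com/"
media_urls = existing_media_urls(prefix=prefix)
to_insert, to_update = find_new_urls(media_urls, prefix=prefix)
print("would insert %d urls, update %d urls" % (len(to_insert), len(to_update)))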
Example #6
def get_objects_from_ceph():
    import magic
    existing_objects = set(r[0] for r in apidbpool.fetchiter(
        "SELECT etag FROM objects", cursor_factory=cursor))

    logger.info("Found %d objects", len(existing_objects))

    s = IDigBioStorage()
    buckets = ["datasets", "images"]
    count = 0
    rowcount = 0
    lrc = 0
    with apidbpool.connection() as conn:
        with conn.cursor() as cur:
            for b_k in buckets:
                b = s.get_bucket("idigbio-" + b_k + "-prod")
                for k in b.list():
                    if k.name not in existing_objects:
                        try:
                            ks = k.get_contents_as_string(
                                headers={'Range': 'bytes=0-100'})
                            detected_mime = magic.from_buffer(ks, mime=True)
                            cur.execute(
                                """INSERT INTO objects (bucket,etag,detected_mime)
                                   SELECT %(bucket)s,%(etag)s,%(dm)s
                                   WHERE NOT EXISTS(
                                      SELECT 1 FROM objects WHERE etag=%(etag)s)""",
                                {
                                    "bucket": b_k,
                                    "etag": k.name,
                                    "dm": detected_mime
                                })
                            existing_objects.add(k.name)
                            rowcount += cur.rowcount
                        except Exception:
                            logger.exception(
                                "Ceph Error; bucket:%s keyname:%s", b_k,
                                k.name)
                    count += 1

                    if rowcount != lrc and rowcount % 10000 == 0:
                        logger.info("Count: %8d,  rowcount: %8d", count,
                                    rowcount)

                        conn.commit()
                        lrc = rowcount
                conn.commit()
                logger.info("Count: %8d,  rowcount: %8d  (Finished %s)", count,
                            rowcount, b_k)
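Note that only the first 100 bytes of each key are downloaded (the Range header) before libmagic sniffs the type; python-magic only needs a short prefix of the file. A standalone sketch of that detection step against a local file, with a placeholder path:

import magic

# Read just the first 100 bytes, mirroring the ranged request above.
with open("/tmp/example.jpg", "rb") as f:  # placeholder path
    head = f.read(100)

print(magic.from_buffer(head, mime=True))  # e.g. "image/jpeg"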
def type_yield_resume(ei, rc, typ, also_delete=False, yield_record=False):
    es_ids = get_resume_cache(ei, typ)
    logger.info("%s: Indexing", typ)
    pg_typ = "".join(typ[:-1])
    sql = "SELECT * FROM idigbio_uuids_data WHERE type=%s"
    if not also_delete:
        sql += " AND deleted=false"
    results = apidbpool.fetchiter(sql, (pg_typ, ),
                                  named=True,
                                  cursor_factory=DictCursor)
    for r in rate_logger(typ + " indexing", results):
        es_etag = es_ids.get(r["uuid"])
        pg_etag = r['etag']
        if es_etag == pg_etag or (pg_etag == tombstone_etag
                                  and es_etag is None):
            continue

        if yield_record:
            yield r
        else:
            yield index_record(ei, rc, typ, r, do_index=False)
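get_resume_cache is not shown here; the loop only needs a uuid -> etag mapping for documents that are already indexed, so it could plausibly be built with the elasticsearch-py scan helper. A sketch under that assumption, not the project's implementation:

from elasticsearch import helpers

def build_resume_cache(es, index_name, doc_type):
    """Map document id -> etag for everything already in the index."""
    cache = {}
    for hit in helpers.scan(es, index=index_name, doc_type=doc_type,
                            _source=["etag"]):
        cache[hit["_id"]] = hit["_source"].get("etag")
    return cache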
def delete(ei, rc, no_index=False):
    logger.info("Running deletes")

    count = 0
    sql = "SELECT id,type FROM uuids WHERE deleted=true"
    results = apidbpool.fetchiter(sql, named=True, cursor_factory=DictCursor)
    for r in results:
        count += 1
        if not no_index:
            ei.es.delete(**{
                "index": ei.indexName,
                "doc_type": r["type"] + 's',
                "id": r["id"]
            })

        if count % 10000 == 0:
            logger.info("%s", count)

    logger.info("%s", count)
    try:
        ei.optimize()
    except Exception:
        pass
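delete() above issues one HTTP request per tombstoned uuid. On the same (older, type-per-document) index layout the elasticsearch-py bulk helper can batch those deletions; the sketch below is an alternative, not the project's code:

from elasticsearch import helpers

def bulk_delete(es, index_name, rows):
    """rows: iterable of dicts with 'id' and 'type', as yielded by the uuids query above."""
    actions = ({"_op_type": "delete",
                "_index": index_name,
                "_type": r["type"] + "s",
                "_id": r["id"]} for r in rows)
    # raise_on_error=False keeps going past documents that are already gone.
    return helpers.bulk(es, actions, raise_on_error=False)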
def type_yield_modified(ei, rc, typ, yield_record=False):
    pg_typ = "".join(typ[:-1])

    q = {
        "index": ei.indexName,
        "size": 0,
        "doc_type": typ,
        "body": {
            "aggs": {
                "mm": {
                    "max": {
                        "field": "datemodified"
                    }
                }
            }
        }
    }

    o = ei.es.search(**q)

    after = datetime.datetime.utcfromtimestamp(
        math.ceil(o["aggregations"]["mm"]["value"] / 1000))

    logger.info("Indexing %s after %s", typ, after.isoformat())

    # Note, a subtle distinction: the query below will index every
    # _version_ of every record modified since that date, so it is
    # imperative that the records are processed in ascending modified
    # order. In practice this is unlikely to index more than one
    # version of a record in a single run, but it is possible.
    sql = """SELECT
            uuids.id as uuid,
            type,
            deleted,
            data_etag as etag,
            version,
            modified,
            parent,
            recordids,
            siblings,
            uuids_data.id as vid,
            data
        FROM uuids_data
        LEFT JOIN uuids
        ON uuids.id = uuids_data.uuids_id
        LEFT JOIN data
        ON data.etag = uuids_data.data_etag
        LEFT JOIN LATERAL (
            SELECT uuids_id, array_agg(identifier) as recordids
            FROM uuids_identifier
            WHERE uuids_id=uuids.id
            GROUP BY uuids_id
        ) as ids
        ON ids.uuids_id=uuids.id
        LEFT JOIN LATERAL (
            SELECT count(*) AS annotation_count
            FROM annotations
            WHERE uuids_id = uuids.id
        ) AS ac ON TRUE
        LEFT JOIN LATERAL (
            SELECT subject, json_object_agg(rel,array_agg) as siblings
            FROM (
                SELECT subject, rel, array_agg(object)
                FROM (
                    SELECT
                        r1 as subject,
                        type as rel,
                        r2 as object
                    FROM (
                        SELECT r1,r2
                        FROM uuids_siblings
                        UNION
                        SELECT r2,r1
                        FROM uuids_siblings
                    ) as rel_union
                    JOIN uuids
                    ON r2=id
                    WHERE uuids.deleted = false
                ) as rel_table
                WHERE subject=uuids.id
                GROUP BY subject, rel
            ) as rels
            GROUP BY subject
        ) as sibs
        ON sibs.subject=uuids.id
        WHERE type=%s and modified>%s
        ORDER BY modified ASC;
        """

    results = apidbpool.fetchiter(sql, (pg_typ, after),
                                  named=True,
                                  cursor_factory=DictCursor)

    for r in rate_logger(typ, results):
        if yield_record:
            yield r
        else:
            yield index_record(ei, rc, typ, r, do_index=False)
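The max aggregation returns datemodified as epoch milliseconds, so the value is divided by 1000 and rounded up to the next whole second before being compared with the modified column in Postgres. A quick worked example of that conversion:

import datetime
import math

value = 1494023485123.0  # epoch milliseconds, as returned by the "mm" aggregation
after = datetime.datetime.utcfromtimestamp(math.ceil(value / 1000))
print(after.isoformat())  # 2017-05-05T22:31:26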