Example #1
def get_items(prefix=None, ignores=IGNORE_PREFIXES, last_check_interval=None):
    """Return the FetchItems that should be processed

    This takes into account ITEMCLASSES and will generate the
    appropriate subclass based on registered prefixes.

    """
    sql = """
        SELECT url, type, mime
        FROM media
        WHERE type IS NOT NULL
          AND (last_status IS NULL
               OR (last_status >= 400 AND last_check < now() - %s::interval))
    """
    params = []
    params.append(last_check_interval or LAST_CHECK_INTERVAL)
    if prefix:
        sql += "\n AND url LIKE %s"
        params.append(prefix + '%')
    else:
        for i in ignores:
            assert i, "Can't ignore {0!r}".format(i)
            sql += "\n AND url NOT LIKE %s"
            params.append(i + '%')
    sql += "\n ORDER BY url"
    logger.debug("Querying %r", apidbpool.mogrify(sql, params))
    url_rows = apidbpool.fetchall(sql, params, cursor_factory=cursor)
    logger.info("Found %d urls to check", len(url_rows))

    for r in url_rows:
        m = PREFIX_RE.search(r[0])
        prefix = m and m.group()
        cls = ITEMCLASSES.get(prefix, FetchItem)
        yield cls(*r, prefix=prefix)
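A minimal usage sketch for the generator above, assuming apidbpool, IGNORE_PREFIXES and the ITEMCLASSES registrations are already set up by the surrounding module; the prefix, interval string and process() consumer are illustrative placeholders.
# Hypothetical usage: fetch items for a single URL prefix; the interval string
# is cast to a Postgres interval by the %s::interval placeholder above.
for item in get_items(prefix="http://example.org/media/",
                      last_check_interval="7 days"):
    process(item)  # process() is a placeholder for whatever consumes FetchItems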
Example #2
def objects_for_etags(etags):
    assert isinstance(etags, (tuple, list))
    sql = """SELECT etag, bucket
             FROM objects
             WHERE derivatives=false AND etag IN %s
    """
    return apidbpool.fetchall(sql, (etags, ), cursor_factory=NamedTupleCursor)
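A short usage sketch, assuming apidbpool is the configured connection pool; psycopg2 expands a Python tuple bound to the IN %s placeholder into a parenthesized value list, which is why the etags are passed inside a single-element parameter tuple.
# Hypothetical usage: the etag values are placeholders.
for obj in objects_for_etags(("etag-1", "etag-2")):
    print(obj.etag, obj.bucket)  # NamedTupleCursor rows expose columns as attributes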
Example #3
def find_parts_on_servers(parts_obj):
    # Using the index of all files on all servers, return a list of dicts with
    # information about where each file is stored on disk(s)

    cols = ["server", "fullname", "filename", "size"]
    q = """SELECT
            {}
           FROM ceph_server_files
           WHERE
            filename LIKE %s
        """.format(','.join(cols))

    for i, part in enumerate(parts_obj):
        logger.debug("Looking up filenames for {0}".format(part["pattern"]))

        # When in doubt, add more backslashes!
        rows = apidbpool.fetchall(
            q, ("{0}%".format(part["pattern"].replace('\\', '\\\\')), ))

        copies = []
        for c in rows:
            copies.append(dict(zip(cols, c)))
        parts_obj[i]["copies"] = copies

    return parts_obj
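A usage sketch, assuming each element of parts_obj is a dict carrying at least the "pattern" key read above (the filename prefix matched against ceph_server_files); the pattern value is illustrative.
# Hypothetical input: one entry per file part, keyed by filename pattern.
parts = [{"pattern": "somebucket.file.00001"}]
for part in find_parts_on_servers(parts):
    for copy in part["copies"]:
        print(copy["server"], copy["fullname"], copy["size"])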
Example #4
def getitems():
    sql = """SELECT objects.bucket, objects.etag, objects.detected_mime as mime
             FROM objects
             JOIN media_objects USING (etag)
             WHERE media_objects.modified > '2016-08-01'
               AND derivatives = false
    """
    return set(apidbpool.fetchall(sql, cursor_factory=cursor))
Example #5
def objects_for_buckets(buckets):
    assert isinstance(buckets, (tuple, list))
    sql = """SELECT etag, bucket
             FROM objects
             WHERE derivatives=false AND bucket IN %s
             ORDER BY random()
    """
    return apidbpool.fetchall(sql, (buckets, ),
                              cursor_factory=NamedTupleCursor)
Example #6
def get_paused_rsids():
    sql = """
        SELECT uuid
        FROM recordsets
        WHERE ingest=true
          AND uuid IS NOT NULL
          AND ingest_is_paused = true
    """
    params = []
    return [
        r[0] for r in apidbpool.fetchall(sql, params, cursor_factory=cursor)
    ]
Example #7
def uuidsIter(uuid_l, ei, rc, typ, yield_record=False, children=False):
    for rid in uuid_l:
        if children:
            logger.debug("Selecting children of %s.", rid)
            sql = "SELECT * FROM idigbio_uuids_data WHERE parent=%s and type=%s"
        else:
            sql = "SELECT * FROM idigbio_uuids_data WHERE uuid=%s and type=%s"
        # typ is the plural type name; drop the trailing "s" to match the
        # singular type value stored in idigbio_uuids_data
        params = (rid.strip(), typ[:-1])
        results = apidbpool.fetchall(sql, params, cursor_factory=DictCursor)
        for rec in results:
            if yield_record:
                yield rec
            else:
                yield index_record(ei, rc, typ, rec, do_index=False)
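A usage sketch; with yield_record=True the ei and rc arguments are never touched (they are only forwarded to index_record otherwise), so None placeholders are passed here, and the uuid value is illustrative.
# Hypothetical usage: yield the raw DictCursor rows for a list of record uuids.
uuids = ["00000000-0000-0000-0000-000000000000"]
for rec in uuidsIter(uuids, None, None, "records", yield_record=True):
    print(rec["uuid"])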
Example #8
def get_active_rsids(since=None):
    sql = """
        SELECT uuid
        FROM recordsets
        WHERE ingest=true
          AND uuid IS NOT NULL
          AND file_harvest_date IS NOT NULL
    """
    params = []
    if since:
        # Leading newline keeps the appended clause from running into the
        # %s placeholder when the ORDER BY is added below
        sql += "\n  AND file_harvest_date >= %s"
        params.append(since)
    sql += "\n  ORDER BY file_harvest_date DESC"
    return [
        r[0] for r in apidbpool.fetchall(sql, params, cursor_factory=cursor)
    ]
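A one-line usage sketch; the date string is illustrative and is bound to the file_harvest_date placeholder.
rsids = get_active_rsids(since="2016-08-01")  # newest harvests first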
Example #9
def get_row_objs_from_db(args):
    """Get a list of objects to reconstruct from the database.

    Uses the user's arguments to build the query that selects which
    objects to verify.
    """
    cols = [
        "ceph_bucket", "ceph_name", "ceph_date", "ceph_bytes", "ceph_etag",
        "ver_status", "ver_last_success", "ver_last_failure"
    ]

    wheres = []
    wheres.append("length(ceph_name)>=10")
    #wheres.append("ceph_bytes IS NOT NULL") # for initial testing
    #wheres.append("ceph_date IS NULL") # for testing date updates

    if args["start"]:
        wheres.append("ceph_date>=%(start)s")
    if args["end"]:
        wheres.append("ceph_date<=%(end)s")
    if args["name"]:
        wheres.append("ceph_name like %(name)s")
    if args["bucket"]:
        wheres.append("ceph_bucket=%(bucket)s")
    if args["verify"]:
        wheres.append("ver_status=%(verify)s")
    else:
        wheres.append("ver_status='timeout'")
    if args["rereconstruct"]:
        wheres.append("rest_status=%(rereconstruct)s")
    else:
        wheres.append("rest_status IS NULL")

    rows = apidbpool.fetchall(
        """SELECT {0}
           FROM ceph_objects
           WHERE {1}
           LIMIT %(count)s""".format(','.join(cols), ' AND '.join(wheres)),
        args)
    row_objs = []
    for row in rows:
        row_objs.append(dict(zip(cols, row)))
    logger.info("Found {0} objects to work on".format(len(row_objs)))

    return row_objs
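A usage sketch showing the keys the function reads from args (all values are illustrative); count is bound to the LIMIT %(count)s placeholder, and keys left as None fall through to the default WHERE clauses.
# Hypothetical argument dict, e.g. vars() of a parsed argparse namespace.
args = {
    "start": "2016-01-01",
    "end": None,
    "name": None,
    "bucket": "idigbio-images",   # placeholder bucket name
    "verify": None,               # falls back to ver_status='timeout'
    "rereconstruct": None,        # falls back to rest_status IS NULL
    "count": 100,
}
for obj in get_row_objs_from_db(args):
    print(obj["ceph_bucket"], obj["ceph_name"], obj["ceph_bytes"])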
Example #10
def backfill_flagged_etags(prefix):
    """Update ceph_objects_temp etags where records have a flag of any kind."""
    table = make_temp_table_name(prefix)
    logger.info(
        "Backfilling etags on new/changed records in {0}".format(table))
    cols = ["ceph_bucket", "ceph_name"]
    results = apidbpool.fetchall("""SELECT {0}
        FROM {1}
        WHERE ceph_status IS NOT NULL
        """.format(','.join(cols), table))

    row_objs = []  # Convert to something we can use with pool.imap_unordered
    for row in results:
        # Pack the prefix in so we know which temp table the work goes with
        row_objs.append(dict(zip(cols + ["prefix"], list(row) + [prefix])))

    # Batching up rows saves a bit of CPU time compared to greenlet switching
    # and committing each row individually
    pools = 3
    batches = int(max(math.floor(len(row_objs) / 5000), pools))
    work = batch_work(row_objs, batches)
    p = pool.Pool(pools)
    results = p.imap_unordered(backfill_flagged_worker, work)
    return sum(results)
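batch_work and backfill_flagged_worker come from elsewhere in this module and are not shown; below is a minimal sketch of what batch_work could look like, purely as an assumption about its contract (split the rows into roughly equal batches for the gevent pool).
# Hypothetical stand-in for batch_work: split items into at most n batches of
# roughly equal size (ceiling division so nothing is dropped).
def batch_work(items, n):
    size = max(-(-len(items) // n), 1)
    return [items[i:i + size] for i in range(0, len(items), size)]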
Example #11
def geturls(etag):
    sql = """SELECT DISTINCT url
             FROM media_objects
             WHERE etag LIKE %s"""
    return set(u[0] for u in apidbpool.fetchall(sql, (etag,), cursor_factory=cursor))