Code Example #1
def backfill_flagged_worker(rows):
    """Parallel worker for backfill_flagged_etags()
    Expects a list of records to work on and commit as a group
    """
    storage = IDigBioStorage()
    with apidbpool.connection(autocommit=False) as conn:
        cur = conn.cursor()
        for row in rows:
            try:
                table = make_temp_table_name(row['prefix'])
                b = storage.get_bucket(
                    row['ceph_bucket']
                )  # Two-phase lookup: the storage class uses validate=False, so the etag is not populated yet
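                # etag comes back wrapped in double quotes; [1:-1] strips them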
                row["etag"] = b.get_key(row["ceph_name"]).etag[1:-1]
                cur.execute(
                    """UPDATE {0}
                    SET ceph_etag=%(etag)s
                    WHERE ceph_bucket=%(ceph_bucket)s AND ceph_name=%(ceph_name)s
                    """.format(table), row)
            except:
                logger.error(
                    "Failed to update etag for {0}:{1} in {2} {3}".format(
                        row["ceph_bucket"], row["ceph_name"], row["prefix"],
                        traceback.format_exc()))
        conn.commit()
    return 0
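
The worker above commits each chunk of rows on a single pooled connection, so a driver only needs to split the flagged rows into chunks and hand them out. A minimal driver sketch, assuming rows are plain dicts with prefix, ceph_bucket and ceph_name keys; the chunk size, process count and the driver function itself are illustrative, not the project's actual backfill_flagged_etags():

from multiprocessing import Pool

def backfill_flagged_etags_sketch(rows, chunk_size=1000, processes=4):
    # Split the flagged rows into chunks and run one worker per chunk.
    chunks = [rows[i:i + chunk_size] for i in range(0, len(rows), chunk_size)]
    pool = Pool(processes)
    try:
        return sum(pool.map(backfill_flagged_worker, chunks))
    finally:
        pool.close()
        pool.join()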
Code Example #2
def get_objects_from_ceph():
    local_cur.execute("SELECT etag FROM objects")
    existing_objects = set()
    for r in local_cur:
        existing_objects.add(r[0])

    print len(existing_objects)

    s = IDigBioStorage()
    buckets = ["datasets","images"]
    count = 0
    rowcount = 0
    lrc = 0
    for b_k in buckets:
        b = s.get_bucket("idigbio-" + b_k + "-prod")
        for k in b.list():
            if k.name not in existing_objects:
                try:
                    ks = k.get_contents_as_string(headers={'Range' : 'bytes=0-100'})
                    detected_mime = magic.from_buffer(ks, mime=True)
                    local_cur.execute("INSERT INTO objects (bucket,etag,detected_mime) SELECT %(bucket)s,%(etag)s,%(dm)s WHERE NOT EXISTS (SELECT 1 FROM objects WHERE etag=%(etag)s)", {"bucket": b_k, "etag": k.name, "dm": detected_mime})
                    existing_objects.add(k.name)
                    rowcount += local_cur.rowcount
                except:
                    print "Ceph Error", b_k, k.name
            count += 1


            if rowcount != lrc and rowcount % 10000 == 0:
                print count, rowcount
                local_pg.commit()
                lrc = rowcount
        print count, rowcount
        local_pg.commit()
Code Example #3
def bucket_list_worker(work):
    logger.debug("Listing and inserting prefix {0} from bucket {1}".format(
        work["prefix"], work["bucket"]))

    storage = IDigBioStorage()
    # Read through bucket inserting into temp table
    with apidbpool.connection(
            autocommit=False
    ) as conn:  # use a single connection from the pool to commit groups of statements
        cur = conn.cursor()
        #        inserted = 1
        #logger.info("Importing bucket listing for {0}.".format(bucket))
        b = storage.get_bucket(work["bucket"])
        for f in b.list(prefix=work["prefix"]):
            # see backfill_new_etags() for why no etag here
            cur.execute(("INSERT INTO {0} "
                         "(ceph_bucket, ceph_name, ceph_date, ceph_bytes) "
                         "VALUES (%s, %s, %s, %s)").format(
                             make_temp_table_name(work["prefix"])),
                        (work["bucket"], f.name, f.last_modified, f.size))


#            inserted += 1

#            if (inserted % 10000) == 0:
#                logger.info("Committing {0}".format(inserted))
#                conn.commit()
        conn.commit()
        return 1
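
bucket_list_worker() above takes a single work dict with "prefix" and "bucket" keys and lists only that slice of the bucket. A hedged sketch of how listing work could be split up; the hex prefixes and the sequential loop are illustrative (the real caller presumably fans the items out to a pool):

# Illustrative only: object names in these buckets are md5 etags, so single
# hex characters make roughly even prefixes.
work_items = [{"bucket": "idigbio-images-prod", "prefix": "%x" % i}
              for i in range(16)]
results = [bucket_list_worker(w) for w in work_items]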
Code Example #4
File: download.py  Project: roncanepa/idb-backend
def upload_download_file_to_ceph(filename):
    s = IDigBioStorage()
    keyname, bucket = os.path.basename(filename), "idigbio-downloads"
    fkey = s.upload(s.get_key(keyname, bucket),
                    filename,
                    content_type='application/zip',
                    public=True)
    return "http://s.idigbio.org/idigbio-downloads/" + fkey.name
Code Example #5
File: derivatives.py  Project: roncanepa/idb-backend
def upload_all(gr):
    if not gr:
        return
    try:
        for item in gr.items:
            IDigBioStorage.retry_loop(lambda: upload_item(item))
        return gr
    except (BotoServerError, BotoClientError):
        logger.exception("%s failed uploading derivatives", gr.etag)
    except Exception:
        logger.exception("%s Unexpected error", gr.etag)
Code Example #6
def get_key_object(bucket, name):
    """Get a key object from Ceph for the requested object.

    Note that most of the metadata with the key won't be populated
    until after it has been fetched.
    """
    global STORAGE_HOST
    storage = IDigBioStorage(host=STORAGE_HOST)
    logger.debug("Retreiving key for {0}:{1}".format(bucket, name))
    key = storage.get_key(name, bucket)
    return key
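
As the docstring notes, the returned boto key is mostly unpopulated until the object is actually fetched. An illustrative follow-up, reusing get_contents_to_filename() from the verification code later on this page; the bucket and key name are placeholders:

# Illustrative only: size and etag are filled in once the object has been fetched.
key = get_key_object("idigbio-images-prod", "some-object-name")
storage = IDigBioStorage(host=STORAGE_HOST)
storage.get_contents_to_filename(key, "/tmp/" + key.name)
logger.debug("size=%s etag=%s", key.size, key.etag)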
Code Example #7
def set_deriv_from_ceph():
    s = IDigBioStorage()
    b = s.get_bucket("idigbio-images-prod-thumbnail")
    count = 0
    for k in b.list():
        local_cur.execute("UPDATE objects SET derivatives=true WHERE etag=%s", (k.name.split(".")[0],))
        count += 1

        if count % 10000 == 0:
            print count
            local_pg.commit()
    print count
    local_pg.commit()
Code Example #8
File: updatedb.py  Project: roncanepa/idb-backend
def get_objects_from_ceph():
    import magic
    existing_objects = set(r[0] for r in apidbpool.fetchiter(
        "SELECT etag FROM objects", cursor_factory=cursor))

    logger.info("Found %d objects", len(existing_objects))

    s = IDigBioStorage()
    buckets = ["datasets", "images"]
    count = 0
    rowcount = 0
    lrc = 0
    with apidbpool.connection() as conn:
        with apidbpool.cursor() as cur:
            for b_k in buckets:
                b = s.get_bucket("idigbio-" + b_k + "-prod")
                for k in b.list():
                    if k.name not in existing_objects:
                        try:
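                            # Only the first ~100 bytes are fetched (HTTP Range
                            # header), enough for libmagic to sniff the MIME type.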
                            ks = k.get_contents_as_string(
                                headers={'Range': 'bytes=0-100'})
                            detected_mime = magic.from_buffer(ks, mime=True)
                            cur.execute(
                                """INSERT INTO objects (bucket,etag,detected_mime)
                                   SELECT %(bucket)s,%(etag)s,%(dm)s
                                   WHERE NOT EXISTS(
                                      SELECT 1 FROM objects WHERE etag=%(etag)s)""",
                                {
                                    "bucket": b_k,
                                    "etag": k.name,
                                    "dm": detected_mime
                                })
                            existing_objects.add(k.name)
                            rowcount += cur.rowcount
                        except:
                            logger.exception(
                                "Ceph Error; bucket:%s keyname:%s", b_k,
                                k.name)
                    count += 1

                    if rowcount != lrc and rowcount % 10000 == 0:
                        logger.info("Count: %8d,  rowcount: %8d", count,
                                    rowcount)

                        conn.commit()
                        lrc = rowcount
                conn.commit()
                logger.info("Count: %8d,  rowcount: %8d  (Finished %s)", count,
                            rowcount, b_k)
Code Example #9
File: derivatives.py  Project: roncanepa/idb-backend
def fetch_media(key):
    try:
        return IDigBioStorage.get_contents_to_mem(key, md5=key.name)
    except BotoServerError as e:
        logger.error("%r failed downloading with %r %s %s", key, e.status,
                     e.reason, key.name)
        raise
    except S3DataError as e:
        logger.error("%r failed downloading on md5 mismatch", key)
        raise
    except BotoClientError as e:
        logger.exception("%r failed downloading because...")
        raise
Code Example #10
File: db_check.py  Project: roncanepa/idb-backend
def get_file(rsid):
    fname = rsid
    if not os.path.exists(fname):
        try:
            RecordSet.fetch_file(rsid,
                                 fname,
                                 media_store=IDigBioStorage(),
                                 logger=logger.getChild(rsid))
        except (S3ResponseError, S3DataError):
            getrslogger(rsid).exception("failed fetching archive")
            raise
    mime = magic.from_file(fname, mime=True)
    return (fname, mime)
Code Example #11
def upload_recordset(rsid, fname, idbmodel):
    filereference = "http://api.idigbio.org/v1/recordsets/" + rsid
    logger.debug("Starting Upload of %r", rsid)
    stor = IDigBioStorage()
    with open(fname, 'rb') as fobj:
        mo = MediaObject.fromobj(fobj,
                                 url=filereference,
                                 type='datasets',
                                 owner=config.IDB_UUID)
        k = mo.get_key(stor)
        if k.exists():
            logger.debug("ETAG %s already present in Storage.", mo.etag)
        else:
            mo.upload(stor, fobj)
            logger.debug("ETAG %s uploading from %r", mo.etag, fname)

        mo.ensure_media(idbmodel)
        mo.ensure_object(idbmodel)
        mo.ensure_media_object(idbmodel)
        logger.debug("Finished Upload of %r, etag = %s", rsid, mo.etag)
        return mo.etag
Code Example #12
def process_list(fetchitems, forprefix=''):
    """Process a list of FetchItems.

    This is intended to be the toplevel entry point of a subprocess
    working on a list of one domain's urls

    """
    try:
        store = IDigBioStorage()
        fetchrpool = gevent.pool.Pool(get_fetcher_count(forprefix))
        uploadpool = gevent.pool.Pool(8)
        items = fetchrpool.imap_unordered(lambda fi: fi.get_media(),
                                          fetchitems,
                                          maxsize=10)
        items = uploadpool.imap_unordered(
            lambda fi: fi.upload_to_storage(store), items, maxsize=10)
        items = itertools.imap(FetchItem.cleanup, items)
        items = update_db_status(items)
        items = count_result_types(items, forprefix=forprefix)
        return ilen(items)  # consume the generator
    except StandardError:
        logger.exception("Unhandled error forprefix:%s", forprefix)
        raise
Code Example #13
def verify_object(row_obj, key_obj):
    """Download an object and check it against the expected metadata.

    Return is a string status of result of checking the file:

    verified - Object downloads and all available data matches
    stashed - Object is verified and a copy has been kept in stash directory
    timeout - Download times out, probably due to file being truncated
    nosuchkey - Object does not exist, 404 error when downloading
    invalid - Some of the metadata does not match
    failed - No longer used; kept from when this function returned a boolean and this was False
    """

    storage = IDigBioStorage(host=STORAGE_HOST)

    try:
        if not os.path.exists(TMP_DIR):
            os.makedirs(TMP_DIR)
        fn = os.path.join(TMP_DIR, key_obj.name)

        logger.debug("Fetching file {0}:{1}".format(key_obj.bucket.name,
                                                    key_obj.name))
        storage.get_contents_to_filename(key_obj, fn)
        md5 = calc_md5(fn)
        size = os.stat(fn).st_size
    except (S3ResponseError) as ex:
        if "NoSuchKey" in str(ex):
            logger.error("No such key when getting {0}:{1}".format(
                key_obj.bucket.name, key_obj.name))
            return "nosuchkey"
        else:
            logger.error(
                "Exception while attempting to get file {0}:{1} {2}".format(
                    key_obj.bucket.name, key_obj.name, traceback.format_exc()))
            #            raise
            return "S3ResponseError"
    except (HTTPException, socket_error) as ex:
        # Timeout can be controlled by /etc/boto.cfg - see http://boto.cloudhackers.com/en/latest/boto_config_tut.html
        logger.error(
            "Socket timeout when getting {0}:{1}, file is probably corrupt in ceph"
            .format(key_obj.bucket.name, key_obj.name))
        if os.path.exists(fn):
            os.unlink(fn)
        return "timeout"
    except Exception as ex:
        if "503 Service Unavailable" in str(ex):
            logger.error("Service unavailable getting {0}:{1}".format(
                key_obj.bucket.name, key_obj.name))
            return "503Error"
        else:
            logger.error(
                "Exception while attempting to get file {0}:{1} {2}".format(
                    key_obj.bucket.name, key_obj.name, traceback.format_exc()))
            raise

    # The db may have partial information so we need to support it being
    # empty, but if it exists, it should match. Use logging to say what's
    # wrong with file, maintain a return value if anything fails.
    retval = False

    if not retval and (not size == key_obj.size):
        logger.error(
            "File size {0} does not match ceph size {1} for {2}:{3}".format(
                size, key_obj.size, key_obj.bucket.name, key_obj.name))
        retval = "invalid"

    if not retval and (row_obj["ceph_bytes"] and
                       (not size == row_obj["ceph_bytes"])):
        logger.error(
            "File size {0} does not match db size {1} for {2}:{3}".format(
                size, row_obj["ceph_bytes"], key_obj.bucket.name,
                key_obj.name))
        retval = "invalid"

    if not retval and (not md5 == key_obj.etag[1:-1]):  # etag is wrapped in ""
        logger.error(
            "File md5 {0} does not match ceph etag {1} for {2}:{3}".format(
                md5, key_obj.etag[1:-1], key_obj.bucket.name, key_obj.name))
        retval = "invalid"

    # db etag has extra '-' chars
    if not retval and (row_obj["ceph_etag"] and
                       (not md5 == row_obj["ceph_etag"].replace('-', ''))):
        logger.error(
            "File md5 {0} does not match db etag {1} for {2}:{3}".format(
                md5, row_obj["ceph_etag"], key_obj.bucket.name, key_obj.name))
        retval = "invalid"

    if not retval:
        logger.debug("Object {0}:{1} verified".format(key_obj.bucket.name,
                                                      key_obj.name))
        #global args
        if STASH and stash_file(fn, key_obj):
            retval = "stashed"
            if DELETE and not TEST:
                logger.debug("Deleting {0}:{1} from ceph.".format(
                    key_obj.bucket.name, key_obj.name))
                try:
                    key_obj.delete()
                    DELETED.append(key_obj.name)
                except:
                    logger.error(
                        "Unable to delete object {0}:{1} in ceph.".format(
                            key_obj.bucket.name, key_obj.name))
        else:
            retval = "verified"
    else:
        logger.warn("Object {0}:{1} failed verification".format(
            key_obj.bucket.name, key_obj.name))

    try:
        os.unlink(fn)
    except:
        pass

    return retval
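
verify_object() reports its outcome as one of the status strings listed in its docstring, so a caller can simply tally them before deciding what to do with a batch. A hedged sketch; the pairing of database rows with key objects is assumed here, not taken from the source:

from collections import Counter

def tally_verifications(pairs):
    # pairs: iterable of (row_obj, key_obj) tuples; returns a count per status.
    counts = Counter()
    for row_obj, key_obj in pairs:
        counts[verify_object(row_obj, key_obj)] += 1
    return counts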
Code Example #14
File: db_check.py  Project: roncanepa/idb-backend
from idb.helpers.storage import IDigBioStorage
from idb.helpers import gipcpool

from idigbio_ingestion.lib.dwca import Dwca
from idigbio_ingestion.lib.delimited import DelimitedFile

bad_chars = u"\ufeff"
bad_char_re = re.compile("[%s]" % re.escape(bad_chars))

logger = idblogger.getChild("db-check")

uuid_re = re.compile(
    "([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
)

s = IDigBioStorage()


class RecordException(Exception):
    pass


def getrslogger(rsid):
    return logger.getChild(rsid)


def mungeid(s):
    return bad_char_re.sub('', s).strip()


identifier_fields = {
Code Example #15
from __future__ import division, absolute_import, print_function

from idb.helpers.logging import getLogger, configure_app_log
from idb.helpers.storage import IDigBioStorage
from idb.helpers.media_validation import sniff_mime
from idb.helpers.memoize import filecached
import hashlib

logger = getLogger("restore")
store = IDigBioStorage()


def get_object_from_backup(etag):
    "need to return a buffer that is the contents of the object that should be in `idigbio-images-prod/$etag`"
    obj = open('reestores/' + etag, 'rb').read()
    md5 = hashlib.md5()
    md5.update(obj)
    assert etag == md5.hexdigest()
    return obj


@filecached("/tmp/restore-from-backup.picklecache")
def get_fouled():
    """Return the original list, cached in a file that is written back as
    we make progress. That way, as we make progress and kill/rerun the
    script, it only tries the ones not yet done.

    """
    return {(u'images', u'06fbc3c99d7d9f06e1487adbbe171f82', u'image/jpeg'),
            (u'images', u'0f4ceba6d970ad48e43e740a794b288a', u'image/jpeg'),
            (u'images', u'1155e9e1d4af33ebf3ab841523bb9a4c', None),
Code Example #16
def main():
    index_file_name = "index.txt"

    query = {
        "size": 0,
        "aggs": {
            "rs": {
                "terms": {
                    "field": "recordset",
                    "size": 1000
                },
                "aggs": {
                    "ic":{
                        "terms": {
                            "field": "institutioncode",
                            "size": 1000,
                        },
                        "aggs": {
                            "cc": {
                                "terms":{
                                    "field": "collectioncode",
                                    "size": 1000,
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    r = requests.post("http://search.idigbio.org/idigbio/records/_search",
                      data=json.dumps(query),
                      headers={"Content-Type": "application/json"})
    r.raise_for_status()
    ro = r.json()

    recordsets = {}
    for rs_b in ro["aggregations"]["rs"]["buckets"]:
        rsid = rs_b["key"]
        ic = ""
        cc = ""
        if len(rs_b["ic"]["buckets"]) == 0:
            ic = ""
            cc = ""
        elif len(rs_b["ic"]["buckets"]) == 1 or (
                float(rs_b["ic"]["buckets"][0]["doc_count"]) / float(rs_b["doc_count"]) > 0.9
            ):
            ic_b = rs_b["ic"]["buckets"][0]
            ic = get_true_ic(ic_b["key"])
            if len(ic_b["cc"]["buckets"]) == 0:
                cc = ""
            elif len(ic_b["cc"]["buckets"]) == 1:
                cc = ic_b["cc"]["buckets"][0]["key"]
            else:
                cc = "MULTIPLE"
        else:
            # print(rs_b)
            ic = "MULTIPLE"
            cc = "MULTIPLE"
        recordsets[rsid] = {
            "institutioncode": ic,
            "collectioncode": cc
        }

    s = IDigBioStorage()
    b = s.get_bucket("idigbio-static-downloads")

    headers = ["zipfile","emlfile","etag","modified","recordset_id", "institutioncode", "collectioncode"]
    files = {}

    for k in b.list():
        # Skip the index itself
        if k.name == index_file_name:
            continue

        # Skip files last modified more than 7 days ago
        lm_d = dateutil.parser.parse(k.last_modified).date()
        if lm_d < (datetime.datetime.now() - datetime.timedelta(7)).date():
            continue

        fkey = k.name.split(".")[0]
        if fkey not in files:
            files[fkey] = {h: "" for h in headers}

        if k.name.endswith(".eml"):
            files[fkey]["emlfile"] = k.name
        elif k.name.endswith(".zip"):
            files[fkey]["zipfile"] = k.name
            files[fkey]["modified"] = k.last_modified
            files[fkey]["etag"] = k.etag
            if is_uuid(fkey):
                files[fkey]["recordset_id"] = fkey
                if fkey in recordsets:
                    files[fkey]["institutioncode"] = recordsets[fkey]["institutioncode"]
                    files[fkey]["collectioncode"] = recordsets[fkey]["collectioncode"]
                else:
                    files[fkey]["institutioncode"] = ""
                    files[fkey]["collectioncode"] = ""

    fil = StringIO()

    cw = csv.writer(fil,delimiter="\t")

    cw.writerow(headers)
    for k in files:
        if files[k]["zipfile"] != "":
            cw.writerow([files[k][h].replace("\"","") for h in headers])

    fil.seek(0)

    ik = b.get_key(index_file_name,validate=False)
    ik.content_type = 'text/tsv'
    ik.set_contents_from_file(fil)
    ik.make_public()
Code Example #17
def thumb_key(img_etag):
    from idb.helpers.storage import IDigBioStorage
    return IDigBioStorage().get_key(img_etag + '.jpg',
                                    'idigbio-images-prod-thumbnail')
Code Example #18
def img_key(img_etag):
    from idb.helpers.storage import IDigBioStorage
    return IDigBioStorage().get_key(img_etag, 'idigbio-images-prod')
Code Example #19
def sounds_key(sounds_etag):
    from idb.helpers.storage import IDigBioStorage
    return IDigBioStorage().get_key(sounds_etag, 'idigbio-sounds-prod')
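
thumb_key(), img_key() and sounds_key() above all build unfetched keys for the corresponding prod buckets. An illustrative existence check; the etag value is just one copied from elsewhere on this page:

# Illustrative only: building the key is cheap; exists() issues the HEAD request.
etag = "06fbc3c99d7d9f06e1487adbbe171f82"
if not thumb_key(etag).exists():
    pass  # e.g. queue thumbnail generation for this image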
Code Example #20
File: v2_media.py  Project: roncanepa/idb-backend
def upload():
    vals = {}
    j = request.get_json()
    if j is not None:
        vals.update(j)
    for k, v in request.values.iteritems():
        vals[k] = v

    filereference = vals.get("filereference")
    if not filereference:
        logger.warning("No filereference specified")
        return json_error(400, "Missing filereference")

    obj = request.files.get('file')
    etag = vals.get('etag')
    media_type = vals.get("media_type")
    mime = vals.get("mime")

    try:
        mime, media_type = validate_mime_for_type(mime, media_type)
    except MediaValidationError as mve:
        logger.warning("Bad mime/media_type combo: %r/%r", mime, media_type)
        return json_error(400, str(mve))

    r = MediaObject.fromurl(filereference, idbmodel=idbmodel)
    if r:
        logger.warning("Found existing object for %r", r.url)
        if r.owner != request.authorization.username:
            return json_error(403)

    if obj:
        # if either type or mime are null it will be ignored, if
        # present they change the behavior of fromobj
        try:
            mo = MediaObject.fromobj(obj, type=media_type, mime=mime, url=filereference, etag=etag)
        except MediaValidationError as mve:
            logger.warning("Validation failure, %r", mve)
            return json_error(400, str(mve))
        mo.upload(IDigBioStorage(), obj)
        mo.insert_object(idbmodel)
    elif etag:
        mo = MediaObject.frometag(etag, idbmodel)
        if not mo or not mo.get_key(IDigBioStorage()).exists():
            return json_error(404, "Unknown etag {0!r}".format(etag))

        mo.last_status = 200
        mo.last_check = datetime.now()
        mo.mime = mime or mo.detected_mime
        mo.type = media_type or mo.bucket

    else:
        mo = r or MediaObject()
        mo.last_status = None
        mo.last_check = None
        try:
            mo.mime, mo.type = validate_mime_for_type(mo.mime or mime, mo.type or media_type)
            if not (mo.mime and mo.type):
                logger.warning("Missing either mime(%r) or type(%r)", mo.mime, mo.type)
                return json_error(400, "Incomplete request")
        except MediaValidationError as mve:
            logger.warning("Validation Failure, %r", mve)
            return json_error(400, str(mve))

    mo.url = filereference
    mo.owner = request.authorization.username

    if r:
        mo.update_media(idbmodel)
    else:
        mo.insert_media(idbmodel)
    if mo.etag:
        mo.ensure_media_object(idbmodel)

    idbmodel.commit()
    return respond_to_record(mo, format='json')
Code Example #21
File: derivatives.py  Project: roncanepa/idb-backend
                c['erred'] += 1
            elif len(result.items) > 0:
                c['generated'] += 1
            else:
                c['existed'] += 1

            if count % update_freq == 0:
                output()
            yield result
    except KeyboardInterrupt:
        output()
        raise
    output()


get_store = memoized()(lambda: IDigBioStorage())


def get_keys(obj):
    etag, bucket = obj.etag, obj.bucket
    etag = unicode(etag)
    s = get_store()
    bucketbase = u"idigbio-{0}-{1}".format(bucket, config.ENV)
    mediakey = s.get_key(etag, bucketbase)
    keys = [
        s.get_key(etag + ".jpg", bucketbase + '-' + dtype) for dtype in DTYPES
    ]
    return CheckItem(etag, bucket, mediakey, keys)


def generate_all(item):
Code Example #22
def main():
    s = IDigBioStorage()
    # static_queries = [
    #     ({},"idigbio"),
    #     ({"hasImage": True},"idigbio-images"),
    #     ({"geopoint":{"type":"exists"},"taxonid":{"type":"exists"}},"idigbio-geotaxon")
    # ]
    # rsquery = {
    #     "query": {
    #         "match_all": {}
    #     },
    #     "size": 0,
    #     "aggs": {
    #         "recordset_counts": {
    #             "terms": {
    #                 "field": "recordset",
    #                 "size": 10000
    #             }
    #         }
    #     }
    # }
    # ro = runQuery(rsquery)
    # if ro is not None:
    #     print(len(ro["aggregations"]["recordset_counts"]["buckets"]))
    #     for b in ro["aggregations"]["recordset_counts"]["buckets"]:
    #         #print(b["key"], b["doc_count"], b["doc_count"] * 7 / 10000)
    #         static_queries.append(({
    #             "recordset": b["key"]
    #         },b["key"]))

    # print(len(static_queries))
    # count = 0
    # for q in reversed(static_queries):
    #     print(count, q)
    #     file_name = generate_files(record_query=queryFromShim(q[0])["query"],form="dwca-csv",filename=q[1])
    #     print(q[1], file_name)
    #     u = upload_download_file_to_ceph(s,file_name)
    #     # # rseml = eml_from_recordset(q[1],env="prod")
    #     # # e = upload_eml_file_to_ceph(s,q[1],rseml)
    #     print(q[1], u)
    #     count += 1

    file_name = generate_files(record_query=queryFromShim({
        "geopoint": {
            "type": "exists"
        },
        "taxonomicstatus": "accepted",
        "taxonid": {
            "type": "exists"
        },
        "flags": "gbif_taxon_corrected",
        "kingdom": "plantae",
        "geopoint": {
            "type": "geo_bounding_box",
            "top_left": {
                "lat": 89,
                "lon": -179
            },
            "bottom_right": {
                "lat": -89,
                "lon": -33
            }
        }
    })["query"],
                               form="dwca-csv",
                               filename="idigbio-plantae-w")
    u = upload_download_file_to_ceph(s, file_name)
Code Example #23
def store(request):
    from idb.helpers.storage import IDigBioStorage
    store = IDigBioStorage()
    return store