Example #1
0
def test_largefile_upload(store, bucketname, tmpdir, monkeypatch):
    """Round-trip a file larger than the (patched) maximum chunk size.

    The store's ``MAX_CHUNK_SIZE`` is patched to 16 MiB and a file of
    22 MiB + 34923 bytes is uploaded, deleted locally, downloaded again and
    compared by hash against the original.
    """
    mib = 1024 ** 2
    monkeypatch.setattr(store, 'MAX_CHUNK_SIZE', 16 * mib)

    local_path = tmpdir / "testfile"
    # truncate() on the freshly created file grows it to the requested size.
    with local_path.open('ab') as handle:
        handle.truncate(22 * mib + 34923)
    expected_md5 = calcFileHash(str(local_path))

    remote_key = store.upload(
        store.get_key('largefile', bucketname), str(local_path), public=False)

    # Drop the local copy so the download below has to recreate it.
    local_path.remove()
    store.get_contents_to_filename(remote_key, str(local_path), md5=expected_md5)
    remote_key.delete()

    assert calcFileHash(str(local_path)) == expected_md5
Example #2
0
def test_file_upload_download(store, bucketname, tmpdir):
    """Upload this test module, then fetch it back and verify hash and type."""
    uploaded = store.upload(
        store.get_key('foobar', bucketname),
        __file__,
        content_type="x-foo/bar",
        public=False)
    source_md5 = calcFileHash(__file__)
    assert uploaded.md5 == source_md5

    # Look the key up again rather than reusing the handle returned by upload.
    fetched = store.get_key('foobar', bucketname)
    assert fetched.exists()

    target = tmpdir / 'foobar'
    store.get_contents_to_filename(fetched, str(target), source_md5)
    assert target.exists()
    assert source_md5 == calcFileHash(str(target))
    assert fetched.content_type == 'x-foo/bar'
Example #3
0
def harvest_eml(r, db):
    """Download a recordset's EML file and store its parsed description.

    ``r`` is a mapping describing the recordset; the keys read here are
    ``id``, ``name``, ``eml_link``, ``uuid``, ``recordids``, ``ingest``,
    ``file_link``, ``pub_date`` and ``publisher_uuid``.  ``db`` must provide
    ``get_uuid``, ``set_record`` and ``execute``.

    The EML is downloaded to ``<id>.eml`` in the current directory.  If the
    download fails an error is logged and the function returns without
    touching the database.  Otherwise the parsed description is stored via
    ``db.set_record`` and the ``recordsets`` row is updated with the file
    hash, the harvest time and the UUID.  The downloaded file is removed
    afterwards, whether or not processing succeeded.
    """
    logger.info("Harvest EML %s '%s' @ '%s'", r["id"], r["name"],
                r["eml_link"])
    eml_path = "{0}.eml".format(r["id"])
    downloaded = download_file(r["eml_link"], eml_path)
    if not downloaded:
        logger.error("failed Harvest EML %s '%s' @ '%s'", r["id"], r["name"],
                     r["eml_link"])
        return

    try:
        etag = calcFileHash(eml_path)

        recordset_uuid = r["uuid"]
        if recordset_uuid is None:
            logger.debug("No uuid, using get_uuid on recordids")
            # get_uuid returns a 3-tuple; only the first element is used.
            recordset_uuid, _, _ = db.get_uuid(r["recordids"])
        logger.debug("Using recordset UUID: {0}".format(recordset_uuid))

        with open(eml_path, "rb") as eml_file:
            description = parseEml(r["recordids"][0], eml_file.read())
        description["ingest"] = r["ingest"]
        description["link"] = r["file_link"]
        description["eml_link"] = r["eml_link"]
        description["update"] = r["pub_date"].isoformat()

        db.set_record(recordset_uuid, "recordset", r["publisher_uuid"],
                      description, r["recordids"], [])

        query = """UPDATE recordsets
                  SET eml_harvest_etag=%s, eml_harvest_date=%s, uuid=%s
                  WHERE id=%s"""
        params = (etag, datetime.datetime.now(), recordset_uuid, r["id"])
        db.execute(query, params)
    finally:
        # Always clean up the downloaded file.
        if os.path.exists(eml_path):
            os.unlink(eml_path)
Example #4
0
    def fromobj(cls, obj, **attrs):
        """Build an instance of ``cls`` describing the file-like ``obj``.

        ``attrs`` are passed to the constructor after filling in defaults for
        ``last_status`` (200), ``last_check`` (now) and ``derivatives``
        (False).  If the instance lacks a detected mime or a bucket, both are
        obtained from ``validate`` using the first 1024 bytes of ``obj``.
        Missing ``type``/``bucket``/``mime`` values are then filled in from
        their counterparts, and the etag is computed from the full contents.

        Raises ``EtagMismatchError`` when ``attrs`` carries a truthy ``etag``
        that differs from the computed one.  ``obj`` is rewound to position 0
        before returning.
        """
        obj.seek(0)
        defaults = (
            ('last_status', 200),
            ('last_check', datetime.now()),
            ('derivatives', False),
        )
        for attr_name, attr_default in defaults:
            attrs.setdefault(attr_name, attr_default)

        mo = cls(**attrs)
        if not (mo.detected_mime and mo.bucket):
            head = obj.read(1024)
            mo.detected_mime, mo.bucket = validate(
                head,
                url=mo.url,
                type=mo.type or mo.bucket,
                mime=mo.mime or mo.detected_mime)

        # type and bucket mirror each other when only one of them is set.
        if mo.type and not mo.bucket:
            mo.bucket = mo.type
        if mo.bucket and not mo.type:
            mo.type = mo.bucket
        if not mo.mime:
            mo.mime = mo.detected_mime

        # Hash the whole object, not just the sniffed header.
        obj.seek(0)
        mo.etag = calcFileHash(obj, op=False, return_size=False)
        claimed_etag = attrs.get('etag')
        if claimed_etag and mo.etag != claimed_etag:
            raise EtagMismatchError(claimed_etag, mo.etag)

        obj.seek(0)
        return mo
def test_upload_etag_validate(client, testmedia_result, valid_auth_header,
                              mock, jpgpath):
    """POST a JPEG with its matching etag and expect the upload to succeed."""
    # Patch boto's Key.exists so it always returns True.
    mock.patch.object(boto.s3.key.Key, 'exists', return_value=True)

    filereference = "http://test.idigbio.org/test.jpg"
    expected_etag = calcFileHash(str(jpgpath), return_size=False)
    endpoint = url_for('idb.data_api.v2_media.upload',
                       filereference=filereference)
    form = {
        'file': (jpgpath.open('rb'), 'file'),
        'etag': expected_etag
    }

    response = client.post(endpoint, headers=[valid_auth_header], data=form)

    assert response.status_code == 200
    assert response.json['last_status'] == 200
    assert response.json['filereference'] == filereference
    assert response.json['etag'] == expected_etag
Example #6
0
def existingkey(store, bucketname, pngpath):
    """Return a key in ``bucketname`` that already holds the PNG's contents.

    The key is named after the hash of the file at ``pngpath``.
    """
    png_file = str(pngpath)
    key = store.get_key(calcFileHash(png_file), bucketname)
    key.set_contents_from_filename(png_file)
    assert key.exists()
    return key
Example #7
0
def process_file(fname,
                 mime,
                 rsid,
                 existing_etags,
                 existing_ids,
                 ingest=False,
                 commit_force=False,
                 ispaused=False):
    """Process one recordset file and, when ingesting, commit or roll back.

    Parameters:
        fname: path of the file to process.
        mime: ``"application/zip"`` (handled as a Darwin Core archive via
            ``Dwca``) or ``"text/plain"`` (handled as a comma- or
            tab-delimited file); any other value processes nothing.
        rsid: recordset id, used for the per-recordset logger and passed to
            ``process_subfile``.
        existing_etags, existing_ids: passed through to ``process_subfile``.
        ingest: when false the transaction is always rolled back.
        commit_force: when true (and ``ingest`` is true) commit even if the
            per-type ratio checks fail.
        ispaused: echoed back in the result under ``"paused"``.

    Returns:
        A dict with the keys ``name``, ``filemd5``, ``recordset_id``,
        ``counts``, ``processing_start_datetime``, ``total_processing_time``,
        ``commited`` and ``paused``.

    Any exception raised while processing is logged and swallowed; the
    transaction is rolled back and the result reports ``commited`` as False.
    """
    rlogger = getrslogger(rsid)
    rlogger.info("Processing %s, type: %s", fname, mime)
    counts = {}
    t = datetime.datetime.now()
    filehash = calcFileHash(fname)
    db = PostgresDB()
    commited = False

    def run_subfile(rf):
        # Every subfile is processed with the same recordset context.
        return process_subfile(rf,
                               rsid,
                               existing_etags,
                               existing_ids,
                               ingest=ingest,
                               db=db)

    try:
        if mime == "application/zip":
            dwcaobj = Dwca(fname, skipeml=True, logname="idb")
            for dwcrf in dwcaobj.extensions:
                rlogger.debug("Processing %r", dwcrf.name)
                counts[dwcrf.name] = run_subfile(dwcrf)
                dwcrf.close()
            rlogger.debug("processing core %r", dwcaobj.core.name)
            counts[dwcaobj.core.name] = run_subfile(dwcaobj.core)
            dwcaobj.core.close()
            dwcaobj.close()
        elif mime == "text/plain":
            # Sniff the delimiter from the first line.  The file is opened in
            # binary mode, so compare against a bytes literal; this behaves
            # the same on Python 2 and avoids a TypeError on Python 3.
            with open(fname, "rb") as testf:
                commas = b"," in testf.readline()

            if commas:
                rf = DelimitedFile(fname, logname="idigbio")
            else:
                rf = DelimitedFile(fname,
                                   delimiter="\t",
                                   fieldenc=None,
                                   logname="idigbio")
            counts[fname] = run_subfile(rf)

        if ingest:
            # NOTE(review): a type passes only when BOTH the created and the
            # deleted ratios are at least 0.5 -- confirm this is the intended
            # direction of the check.
            type_commits = []
            for k in counts:
                if k not in ingestion_types:
                    continue
                processed = float(counts[k]["processed_line_count"])
                type_commits.append(
                    counts[k]["create"] / processed >= 0.5
                    and counts[k]["delete"] / processed >= 0.5)

            # commit_force overrides the per-type checks instead of being
            # discarded by them.
            commit_ok = commit_force or all(type_commits)

            if commit_ok:
                rlogger.info("Ready to Commit")
                db.commit()
                commited = True
            else:
                rlogger.error("Rollback")
                db.rollback()
        else:
            db.rollback()
        db.close()
    except Exception:
        logger.exception("Unhandled Exception when processing %s", fname)
        db.rollback()
        db.close()

    # Clear after processing an archive
    unconsumed_extensions.clear()
    core_siblings.clear()

    return {
        "name": fname,
        "filemd5": filehash,
        "recordset_id": rsid,
        "counts": counts,
        "processing_start_datetime": t.isoformat(),
        "total_processing_time": (datetime.datetime.now() - t).total_seconds(),
        "commited": commited,
        "paused": ispaused
    }