def test_largefile_upload(store, bucketname, tmpdir, monkeypatch):
    monkeypatch.setattr(store, 'MAX_CHUNK_SIZE', 16 * (1024 ** 2))
    keyname = 'largefile'
    testfile = tmpdir / "testfile"
    with testfile.open('ab') as f:
        f.truncate(22 * (1024 ** 2) + 34923)
    md5 = calcFileHash(str(testfile))
    k = store.upload(store.get_key(keyname, bucketname), str(testfile), public=False)
    testfile.remove()
    store.get_contents_to_filename(k, str(testfile), md5=md5)
    k.delete()
    assert calcFileHash(str(testfile)) == md5


def test_file_upload_download(store, bucketname, tmpdir):
    k = store.upload(store.get_key('foobar', bucketname), __file__,
                     content_type="x-foo/bar", public=False)
    localmd5 = calcFileHash(__file__)
    assert k.md5 == localmd5
    k2 = store.get_key('foobar', bucketname)
    assert k2.exists()
    localdownload = tmpdir / 'foobar'
    store.get_contents_to_filename(k2, str(localdownload), localmd5)
    assert localdownload.exists()
    assert localmd5 == calcFileHash(str(localdownload))
    assert k2.content_type == 'x-foo/bar'


def harvest_eml(r, db):
    """Download a recordset's EML document, parse it, and update the recordset row."""
    logger.info("Harvest EML %s '%s' @ '%s'", r["id"], r["name"], r["eml_link"])
    fname = "{0}.eml".format(r["id"])
    if not download_file(r["eml_link"], fname):
        logger.error("failed Harvest EML %s '%s' @ '%s'", r["id"], r["name"], r["eml_link"])
        return
    try:
        etag = calcFileHash(fname)
        u = r["uuid"]
        if u is None:
            logger.debug("No uuid, using get_uuid on recordids")
            u, _, _ = db.get_uuid(r["recordids"])
        logger.debug("Using recordset UUID: {0}".format(u))
        desc = {}
        with open(fname, "rb") as inf:
            desc = parseEml(r["recordids"][0], inf.read())
        desc["ingest"] = r["ingest"]
        desc["link"] = r["file_link"]
        desc["eml_link"] = r["eml_link"]
        desc["update"] = r["pub_date"].isoformat()
        parent = r["publisher_uuid"]
        db.set_record(u, "recordset", parent, desc, r["recordids"], [])
        sql = ("""UPDATE recordsets
                  SET eml_harvest_etag=%s, eml_harvest_date=%s, uuid=%s
                  WHERE id=%s""",
               (etag, datetime.datetime.now(), u, r["id"]))
        db.execute(*sql)
    finally:
        # Always remove the downloaded EML file, even if parsing or the DB update failed.
        if os.path.exists(fname):
            os.unlink(fname)


def fromobj(cls, obj, **attrs):
    """Build a media object from an open file object, filling in defaults and the etag."""
    obj.seek(0)
    attrs.setdefault('last_status', 200)
    attrs.setdefault('last_check', datetime.now())
    attrs.setdefault('derivatives', False)
    mo = cls(**attrs)
    if not mo.detected_mime or not mo.bucket:
        # Sniff the first 1KB of content to detect the mime type and storage bucket.
        mo.detected_mime, mo.bucket = validate(
            obj.read(1024),
            url=mo.url,
            type=mo.type or mo.bucket,
            mime=mo.mime or mo.detected_mime)
    # Backfill type/bucket/mime from whichever of them is already known.
    if mo.type and not mo.bucket:
        mo.bucket = mo.type
    if mo.bucket and not mo.type:
        mo.type = mo.bucket
    if not mo.mime:
        mo.mime = mo.detected_mime
    obj.seek(0)
    mo.etag = calcFileHash(obj, op=False, return_size=False)
    if attrs.get('etag') and mo.etag != attrs.get('etag'):
        # A caller-supplied etag must match the hash of the uploaded content.
        raise EtagMismatchError(attrs.get('etag'), mo.etag)
    obj.seek(0)
    return mo


def test_upload_etag_validate(client, testmedia_result, valid_auth_header, mock, jpgpath):
    mock.patch.object(boto.s3.key.Key, 'exists', return_value=True)
    filereference = "http://test.idigbio.org/test.jpg"
    etag = calcFileHash(str(jpgpath), return_size=False)
    url = url_for('idb.data_api.v2_media.upload', filereference=filereference)
    r = client.post(url,
                    headers=[valid_auth_header],
                    data={
                        'file': (jpgpath.open('rb'), 'file'),
                        'etag': etag
                    })
    assert r.status_code == 200
    assert r.json['last_status'] == 200
    assert r.json['filereference'] == filereference
    assert r.json['etag'] == etag


def existingkey(store, bucketname, pngpath):
    kn = calcFileHash(str(pngpath))
    k = store.get_key(kn, bucketname)
    k.set_contents_from_filename(str(pngpath))
    assert k.exists()
    return k


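# calcFileHash is imported from the codebase's helpers and is not defined in this
# file. For readers of these excerpts, the following is only a minimal sketch of
# what the call sites above imply (a path when op is truthy, an open binary file
# object when op=False, and an optional (digest, size) return); the real helper's
# argument names, hash choice, and behavior may differ.
import hashlib


def calcFileHash_sketch(f, op=True, return_size=False, chunk_size=65536):
    """MD5-hash a file given a path (op truthy) or an open binary file object."""
    md5 = hashlib.md5()
    size = 0
    fobj = open(f, "rb") if op else f
    try:
        for chunk in iter(lambda: fobj.read(chunk_size), b""):
            md5.update(chunk)
            size += len(chunk)
    finally:
        if op:
            fobj.close()
    if return_size:
        return md5.hexdigest(), size
    return md5.hexdigest()

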
def process_file(fname, mime, rsid, existing_etags, existing_ids,
                 ingest=False, commit_force=False, ispaused=False):
    rlogger = getrslogger(rsid)
    rlogger.info("Processing %s, type: %s", fname, mime)
    counts = {}
    t = datetime.datetime.now()
    filehash = calcFileHash(fname)
    db = PostgresDB()
    commited = False
    try:
        if mime == "application/zip":
            # Darwin Core archive: process every extension file, then the core file.
            dwcaobj = Dwca(fname, skipeml=True, logname="idb")
            for dwcrf in dwcaobj.extensions:
                rlogger.debug("Processing %r", dwcrf.name)
                counts[dwcrf.name] = process_subfile(
                    dwcrf, rsid, existing_etags, existing_ids, ingest=ingest, db=db)
                dwcrf.close()
            rlogger.debug("processing core %r", dwcaobj.core.name)
            counts[dwcaobj.core.name] = process_subfile(
                dwcaobj.core, rsid, existing_etags, existing_ids, ingest=ingest, db=db)
            dwcaobj.core.close()
            dwcaobj.close()
        elif mime == "text/plain":
            # Delimited text: guess CSV vs TSV from the first line.
            commas = False
            with open(fname, "rb") as testf:
                commas = "," in testf.readline()
            if commas:
                csvrf = DelimitedFile(fname, logname="idigbio")
                counts[fname] = process_subfile(
                    csvrf, rsid, existing_etags, existing_ids, ingest=ingest, db=db)
            else:
                tsvrf = DelimitedFile(fname, delimiter="\t", fieldenc=None, logname="idigbio")
                counts[fname] = process_subfile(
                    tsvrf, rsid, existing_etags, existing_ids, ingest=ingest, db=db)

        if ingest:
            commit_ok = commit_force
            type_commits = []
            for k in counts:
                if k not in ingestion_types:
                    continue
                if (counts[k]["create"] / float(counts[k]["processed_line_count"]) >= 0.5 and
                        counts[k]["delete"] / float(counts[k]["processed_line_count"]) >= 0.5):
                    type_commits.append(True)
                else:
                    type_commits.append(False)
            commit_ok = all(type_commits)
            if commit_ok:
                rlogger.info("Ready to Commit")
                db.commit()
                commited = True
            else:
                rlogger.error("Rollback")
                db.rollback()
        else:
            db.rollback()
        db.close()
    except Exception:
        logger.exception("Unhandled Exception when processing {0}".format(fname))
        db.rollback()
        db.close()

    # Clear after processing an archive
    unconsumed_extensions.clear()
    core_siblings.clear()

    return {
        "name": fname,
        "filemd5": filehash,
        "recordset_id": rsid,
        "counts": counts,
        "processing_start_datetime": t.isoformat(),
        "total_processing_time": (datetime.datetime.now() - t).total_seconds(),
        "commited": commited,
        "paused": ispaused
    }
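

# A small, hypothetical illustration of the commit gate in process_file above.
# The per-type counts shape (create / delete / processed_line_count) is taken from
# the keys used there; the example values and the members of ingestion_types are
# made up for demonstration only.
def _demo_commit_gate():
    ingestion_types = {"occurrence", "mediarecord"}  # assumed members, for illustration
    counts = {
        "occurrence": {"create": 900, "delete": 800, "processed_line_count": 1000},
        "mediarecord": {"create": 200, "delete": 100, "processed_line_count": 1000},
    }
    type_commits = []
    for k in counts:
        if k not in ingestion_types:
            continue
        c = counts[k]
        type_commits.append(
            c["create"] / float(c["processed_line_count"]) >= 0.5 and
            c["delete"] / float(c["processed_line_count"]) >= 0.5)
    # "occurrence" clears both 0.5 thresholds, "mediarecord" does not, so
    # all(type_commits) is False and the recordset would be rolled back.
    return all(type_commits)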