def upload_recordset_from_file(rsid, fname):
    """
    Given a recordset uuid and a local dataset filename, upload the local
    dataset file as the "current" file for that uuid.

    Parameters
    ----------
    rsid : uuid
        An iDigBio recordset uuid
    fname : string
        Filename (full path or current directory only)

    Returns
    -------
    bool
        True if successful, False otherwise
    """
    # Convert the rsid uuid to a string here because otherwise we hit either:
    #   psycopg2.ProgrammingError: can't adapt type 'UUID'
    # or
    #   TypeError: 'UUID' object does not support indexing
    rsuuid = str(rsid)

    logger.info("Manual upload of '{0}' from file '{1}' requested.".format(
        rsuuid, fname))

    # Verify the file is readable before touching the database.
    try:
        with open(fname):
            pass
    except IOError:
        logger.error(
            "Cannot access file: '{0}'. Aborting upload.".format(fname))
        raise

    db = PostgresDB()
    sql = ("""SELECT id FROM recordsets WHERE uuid=%s""", (rsuuid, ))
    # execute() returns the matched row count here.
    idcount = db.execute(*sql)
    if idcount < 1:
        logger.error(
            "Cannot find uuid '{0}' in db. Aborting upload.".format(rsuuid))
        db.rollback()
        return False

    # Log the "before" state of the recordset row.
    results = db.fetchall(
        """SELECT id,file_harvest_date,file_harvest_etag FROM recordsets WHERE uuid=%s""",
        (rsuuid, ))
    for each in results:
        logger.debug("{0}".format(each))

    try:
        etag = upload_recordset(rsuuid, fname, db)
        assert etag
        sql = ("""UPDATE recordsets SET file_harvest_etag=%s, file_harvest_date=%s WHERE uuid=%s""",
               (etag, datetime.datetime.now(), rsuuid))
        update_count = db.execute(*sql)
        db.commit()
        logger.info("UPDATED {0} rows.".format(update_count))
        logger.info(
            "Finished manual upload of file '{0}', result etag = '{1}', saved to db."
            .format(fname, etag))
    except Exception:
        logger.error(
            "An exception occurred during upload of file or db update for '{0}'"
            .format(fname))
        raise

    # Log the "after" state of the recordset row.
    results = db.fetchall(
        """SELECT id,file_harvest_date,file_harvest_etag FROM recordsets WHERE uuid=%s""",
        (rsuuid, ))
    for each in results:
        logger.debug("{0}".format(each))

    return True
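# A minimal usage sketch (illustrative, not part of the original module):
# how upload_recordset_from_file() might be driven from a one-off maintenance
# script. The _example_* name and the command-line shape are assumptions made
# for illustration; only upload_recordset_from_file itself comes from this
# module.
def _example_upload_main(argv=None):
    import sys
    import uuid
    argv = argv if argv is not None else sys.argv[1:]
    # Usage: upload.py <recordset-uuid> <dataset-file>
    rsid = uuid.UUID(argv[0])   # raises ValueError on a malformed uuid
    fname = argv[1]
    ok = upload_recordset_from_file(rsid, fname)
    return 0 if ok else 1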
def process_file(fname, mime, rsid, existing_etags, existing_ids,
                 ingest=False, commit_force=False, ispaused=False):
    rlogger = getrslogger(rsid)
    rlogger.info("Processing %s, type: %s", fname, mime)
    counts = {}
    t = datetime.datetime.now()
    filehash = calcFileHash(fname)
    db = PostgresDB()
    committed = False
    try:
        if mime == "application/zip":
            # Darwin Core Archive: process every extension file, then the core.
            dwcaobj = Dwca(fname, skipeml=True, logname="idb")
            for dwcrf in dwcaobj.extensions:
                rlogger.debug("Processing %r", dwcrf.name)
                counts[dwcrf.name] = process_subfile(
                    dwcrf, rsid, existing_etags, existing_ids,
                    ingest=ingest, db=db)
                dwcrf.close()
            rlogger.debug("Processing core %r", dwcaobj.core.name)
            counts[dwcaobj.core.name] = process_subfile(
                dwcaobj.core, rsid, existing_etags, existing_ids,
                ingest=ingest, db=db)
            dwcaobj.core.close()
            dwcaobj.close()
        elif mime == "text/plain":
            # Sniff the delimiter from the first line: treat the file as CSV
            # if it contains a comma, otherwise assume tab-separated.
            commas = False
            with open(fname, "rb") as testf:
                commas = b"," in testf.readline()
            if commas:
                csvrf = DelimitedFile(fname, logname="idigbio")
                counts[fname] = process_subfile(
                    csvrf, rsid, existing_etags, existing_ids,
                    ingest=ingest, db=db)
            else:
                tsvrf = DelimitedFile(fname, delimiter="\t", fieldenc=None,
                                      logname="idigbio")
                counts[fname] = process_subfile(
                    tsvrf, rsid, existing_etags, existing_ids,
                    ingest=ingest, db=db)

        if ingest:
            # Commit only if every ingested type passes the per-type
            # create/delete ratio check, or if commit_force is set.
            type_commits = []
            for k in counts:
                if k not in ingestion_types:
                    continue
                if (counts[k]["create"] /
                        float(counts[k]["processed_line_count"]) >= 0.5 and
                        counts[k]["delete"] /
                        float(counts[k]["processed_line_count"]) >= 0.5):
                    type_commits.append(True)
                else:
                    type_commits.append(False)
            commit_ok = commit_force or all(type_commits)

            if commit_ok:
                rlogger.info("Ready to Commit")
                db.commit()
                committed = True
            else:
                rlogger.error("Rollback")
                db.rollback()
        else:
            db.rollback()
        db.close()
    except Exception:
        logger.exception(
            "Unhandled Exception when processing {0}".format(fname))
        db.rollback()
        db.close()

    # Clear module-level caches after processing an archive.
    unconsumed_extensions.clear()
    core_siblings.clear()

    return {
        "name": fname,
        "filemd5": filehash,
        "recordset_id": rsid,
        "counts": counts,
        "processing_start_datetime": t.isoformat(),
        "total_processing_time":
            (datetime.datetime.now() - t).total_seconds(),
        "committed": committed,
        "paused": ispaused,
    }
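# A hedged caller sketch (illustrative only): how process_file() might be
# invoked for a single recordset. load_existing_etags() and
# load_existing_ids() are hypothetical stand-ins for however the surrounding
# pipeline snapshots the current database state; they are not defined in this
# module.
def _example_ingest_one(rsid, fname, mime="application/zip"):
    existing_etags = load_existing_etags(rsid)  # hypothetical: {id: etag}
    existing_ids = load_existing_ids(rsid)      # hypothetical: known record ids
    summary = process_file(fname, mime, rsid, existing_etags, existing_ids,
                           ingest=True)  # ingest=False parses without writing
    if not summary["committed"]:
        # The create/delete ratio check failed and commit_force was not set,
        # so the transaction was rolled back.
        logger.warning("Rolled back ingest of %s", fname)
    return summary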