Example #1
0
def upload_recordset_from_file(rsid, fname):
    """
    Given a recordset uuid and a local dataset filename, upload the local
    dataset file as the "current" file for that uuid.

    The recordset row's ``file_harvest_etag`` and ``file_harvest_date`` are
    updated and committed once the upload has produced an etag.

    Parameters
    ----------
    rsid : uuid
        An iDigBio recordset uuid
    fname : string
        Filename (full path or current directory only)

    Returns
    -------
    bool
        True if the upload and db update succeeded, False if the uuid is
        not present in the recordsets table.

    Raises
    ------
    Exception
        Whatever ``open`` raises when the file cannot be opened, and any
        exception raised by the upload or the db update.  Both are logged
        and re-raised unchanged.
    AssertionError
        If the upload returns an empty etag.
    """
    # convert rsid uuid to string here because of either:
    #   psycopg2.ProgrammingError: can't adapt type 'UUID'
    # or
    #  TypeError: 'UUID' object does not support indexing
    rsuuid = str(rsid)

    logger.info("Manual upload of '{0}' from file '{1}' requested.".format(
        rsuuid, fname))

    # Fail early, before touching the db, if the file cannot be opened.
    try:
        with open(fname):
            pass
    except Exception:
        logger.error(
            "Cannot access file: '{0}'. Aborting upload.".format(fname))
        raise

    db = PostgresDB()
    sql = ("""SELECT id FROM recordsets WHERE uuid=%s""", (rsuuid, ))
    idcount = db.execute(*sql)
    if idcount < 1:
        logger.error(
            "Cannot find uuid '{0}' in db.  Aborting upload.".format(rsuuid))
        db.rollback()
        return False

    state_sql = (
        """SELECT id,file_harvest_date,file_harvest_etag FROM recordsets WHERE uuid=%s""",
        (rsuuid, ))

    # output the "before" state
    for each in db.fetchall(*state_sql):
        logger.debug("{0}".format(each))

    try:
        etag = upload_recordset(rsuuid, fname, db)
        # Explicit check instead of ``assert`` so the guard survives
        # ``python -O``; AssertionError is kept for existing callers.
        if not etag:
            raise AssertionError(
                "No etag returned from upload of '{0}'".format(fname))
        sql = ("""UPDATE recordsets
                  SET file_harvest_etag=%s, file_harvest_date=%s
                  WHERE uuid=%s""", (etag, datetime.datetime.now(), rsuuid))
        update_count = db.execute(*sql)
        db.commit()
        logger.info("UPDATED {0} rows.".format(update_count))
        logger.info(
            "Finished manual upload of file '{0}', result etag = '{1}', saved to db."
            .format(fname, etag))
    except Exception:
        logger.error(
            "An exception occurred during upload of file or db update for '{0}'"
            .format(fname))
        # Do not leave a half-finished transaction behind; this mirrors the
        # rollback on the "uuid not found" path above.
        db.rollback()
        raise

    # output the "after" state
    for each in db.fetchall(*state_sql):
        logger.debug("{0}".format(each))

    return True
Example #2
0
def process_file(fname,
                 mime,
                 rsid,
                 existing_etags,
                 existing_ids,
                 ingest=False,
                 commit_force=False,
                 ispaused=False):
    """
    Process one dataset file for a recordset and report what happened.

    ``application/zip`` files are opened with ``Dwca``; every extension and
    then the core are handed to ``process_subfile``.  ``text/plain`` files
    are opened with ``DelimitedFile``, comma-delimited when the first line
    contains a comma and tab-delimited otherwise.  Any other mime type is
    not processed and leaves ``counts`` empty.

    Parameters
    ----------
    fname : string
        Path of the file to process
    mime : string
        Mime type of the file; selects the reader as described above
    rsid : uuid or string
        Recordset identifier, passed to ``getrslogger`` and
        ``process_subfile`` and echoed in the result
    existing_etags, existing_ids
        Passed through unchanged to ``process_subfile``
    ingest : bool
        When False the db transaction is always rolled back.  When True
        the transaction is committed if the commit check passes.
    commit_force : bool
        When True (and ``ingest`` is True) commit even if the per-type
        commit check fails.
    ispaused : bool
        Echoed back in the result under ``"paused"``.

    Returns
    -------
    dict
        Keys: ``name``, ``filemd5``, ``recordset_id``, ``counts``,
        ``processing_start_datetime``, ``total_processing_time`` (seconds),
        ``commited`` and ``paused``.

    Notes
    -----
    Exceptions raised while processing are logged and swallowed: the
    transaction is rolled back and the result is still returned with
    ``commited`` False.
    """
    rlogger = getrslogger(rsid)
    rlogger.info("Processing %s, type: %s", fname, mime)
    counts = {}
    t = datetime.datetime.now()
    filehash = calcFileHash(fname)
    db = PostgresDB()
    commited = False

    try:
        if mime == "application/zip":
            dwcaobj = Dwca(fname, skipeml=True, logname="idb")
            for dwcrf in dwcaobj.extensions:
                rlogger.debug("Processing %r", dwcrf.name)
                counts[dwcrf.name] = process_subfile(dwcrf,
                                                     rsid,
                                                     existing_etags,
                                                     existing_ids,
                                                     ingest=ingest,
                                                     db=db)
                dwcrf.close()
            rlogger.debug("processing core %r", dwcaobj.core.name)
            counts[dwcaobj.core.name] = process_subfile(dwcaobj.core,
                                                        rsid,
                                                        existing_etags,
                                                        existing_ids,
                                                        ingest=ingest,
                                                        db=db)
            dwcaobj.core.close()
            dwcaobj.close()
        elif mime == "text/plain":
            # The file is read in binary mode, so test against a bytes
            # literal; a str literal raises TypeError on Python 3.
            with open(fname, "rb") as testf:
                commas = b"," in testf.readline()

            if commas:
                delimited = DelimitedFile(fname, logname="idigbio")
            else:
                delimited = DelimitedFile(fname,
                                          delimiter="\t",
                                          fieldenc=None,
                                          logname="idigbio")
            counts[fname] = process_subfile(delimited,
                                            rsid,
                                            existing_etags,
                                            existing_ids,
                                            ingest=ingest,
                                            db=db)

        if ingest:
            # One verdict per ingested type.  A zero processed_line_count
            # raises ZeroDivisionError here, which the handler below turns
            # into a rollback.
            # NOTE(review): a type passes only when BOTH its create and
            # delete ratios are >= 0.5 -- confirm this is the intended test.
            type_commits = []
            for k in counts:
                if k not in ingestion_types:
                    continue
                processed = float(counts[k]["processed_line_count"])
                type_commits.append(
                    counts[k]["create"] / processed >= 0.5
                    and counts[k]["delete"] / processed >= 0.5)

            # all([]) is True, so with no ingestion-type counts this commits.
            # commit_force overrides a failed check; it used to be
            # overwritten here and therefore had no effect.
            commit_ok = commit_force or all(type_commits)

            if commit_ok:
                rlogger.info("Ready to Commit")
                db.commit()
                commited = True
            else:
                rlogger.error("Rollback")
                db.rollback()
        else:
            db.rollback()
    except Exception:
        logger.exception(
            "Unhandled Exception when processing {0}".format(fname))
        db.rollback()
    finally:
        # Always release the connection, even if the rollback above fails.
        db.close()

    # Clear after processing an archive
    unconsumed_extensions.clear()
    core_siblings.clear()

    return {
        "name": fname,
        "filemd5": filehash,
        "recordset_id": rsid,
        "counts": counts,
        "processing_start_datetime": t.isoformat(),
        "total_processing_time": (datetime.datetime.now() - t).total_seconds(),
        "commited": commited,
        "paused": ispaused
    }