def create_tables():
    """
    This function is out of sync with the actual database and is unmaintained.
    All statements in it are commented out; calling it does nothing until it
    is brought up to date again.
    """
    db = PostgresDB()
    logger.error("create_tables called but has no valid code to run.")
    # db.execute("""CREATE TABLE IF NOT EXISTS publishers (
    #     id BIGSERIAL NOT NULL PRIMARY KEY,
    #     uuid uuid UNIQUE,
    #     name text NOT NULL,
    #     recordids text[] NOT NULL DEFAULT '{}',
    #     pub_type varchar(20) NOT NULL DEFAULT 'rss',
    #     portal_url text,
    #     rss_url text NOT NULL,
    #     auto_publish boolean NOT NULL DEFAULT false,
    #     first_seen timestamp NOT NULL DEFAULT now(),
    #     last_seen timestamp NOT NULL DEFAULT now(),
    #     pub_date timestamp
    # )""")

    # Columns: pubid, rsid, ingest, rs_record_id, eml_link, file_link,
    # first seen date, last seen date, feed date, harvest date, harvest etag
    # db.execute("""CREATE TABLE IF NOT EXISTS recordsets (
    #     id BIGSERIAL NOT NULL PRIMARY KEY,
    #     uuid uuid UNIQUE,
    #     publisher_uuid uuid REFERENCES publishers(uuid),
    #     name text NOT NULL,
    #     recordids text[] NOT NULL DEFAULT '{}',
    #     eml_link text,
    #     file_link text NOT NULL,
    #     ingest boolean NOT NULL DEFAULT false,
    #     first_seen timestamp NOT NULL DEFAULT now(),
    #     last_seen timestamp NOT NULL DEFAULT now(),
    #     pub_date timestamp,
    #     harvest_date timestamp,
    #     harvest_etag varchar(41)
    # )""")
    # db.commit()
    db.close()
def process_file(fname, mime, rsid, existing_etags, existing_ids,
                 ingest=False, commit_force=False, ispaused=False):
    rlogger = getrslogger(rsid)
    rlogger.info("Processing %s, type: %s", fname, mime)
    counts = {}
    t = datetime.datetime.now()
    filehash = calcFileHash(fname)
    db = PostgresDB()
    commited = False

    try:
        if mime == "application/zip":
            # Darwin Core Archive: process each extension file, then the core.
            dwcaobj = Dwca(fname, skipeml=True, logname="idb")
            for dwcrf in dwcaobj.extensions:
                rlogger.debug("Processing %r", dwcrf.name)
                counts[dwcrf.name] = process_subfile(
                    dwcrf, rsid, existing_etags, existing_ids,
                    ingest=ingest, db=db)
                dwcrf.close()
            rlogger.debug("Processing core %r", dwcaobj.core.name)
            counts[dwcaobj.core.name] = process_subfile(
                dwcaobj.core, rsid, existing_etags, existing_ids,
                ingest=ingest, db=db)
            dwcaobj.core.close()
            dwcaobj.close()
        elif mime == "text/plain":
            # Sniff the delimiter from the first line: a comma means CSV,
            # otherwise fall back to tab-separated. The file is opened in
            # binary mode, so compare against bytes.
            with open(fname, "rb") as testf:
                commas = b"," in testf.readline()

            if commas:
                csvrf = DelimitedFile(fname, logname="idigbio")
                counts[fname] = process_subfile(
                    csvrf, rsid, existing_etags, existing_ids,
                    ingest=ingest, db=db)
            else:
                tsvrf = DelimitedFile(fname, delimiter="\t", fieldenc=None,
                                      logname="idigbio")
                counts[fname] = process_subfile(
                    tsvrf, rsid, existing_etags, existing_ids,
                    ingest=ingest, db=db)

        if ingest:
            # Commit only if, for every ingestion type, creates and deletes
            # each account for at least half of the processed lines, or if
            # the caller forces the commit.
            type_commits = []
            for k in counts:
                if k not in ingestion_types:
                    continue
                lines = float(counts[k]["processed_line_count"])
                if (counts[k]["create"] / lines >= 0.5 and
                        counts[k]["delete"] / lines >= 0.5):
                    type_commits.append(True)
                else:
                    type_commits.append(False)

            # commit_force was previously a dead assignment (immediately
            # overwritten); honor it as an override, as its name implies.
            commit_ok = commit_force or all(type_commits)

            if commit_ok:
                rlogger.info("Ready to Commit")
                db.commit()
                commited = True
            else:
                rlogger.error("Rollback")
                db.rollback()
        else:
            db.rollback()
        db.close()
    except Exception:
        logger.exception("Unhandled Exception when processing %s", fname)
        db.rollback()
        db.close()

    # Clear module-level leftovers after processing an archive.
    unconsumed_extensions.clear()
    core_siblings.clear()

    return {
        "name": fname,
        "filemd5": filehash,
        "recordset_id": rsid,
        "counts": counts,
        "processing_start_datetime": t.isoformat(),
        "total_processing_time": (datetime.datetime.now() - t).total_seconds(),
        "commited": commited,
        "paused": ispaused,
    }
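
# A minimal usage sketch of process_file. This is illustrative only: the
# filename and recordset uuid below are hypothetical placeholders, and it
# assumes existing_etags/existing_ids have already been loaded elsewhere in
# the module (that loading is not shown in this section).
#
#     summary = process_file(
#         "dwca.zip", "application/zip",
#         "00000000-0000-0000-0000-000000000000",
#         existing_etags, existing_ids,
#         ingest=False,  # dry run: work is rolled back, counts still returned
#     )
#     print(summary["counts"], summary["commited"])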