def update_db_from_rss():
    # existing_recordsets maps each recordid to the recordsets table id;
    # recordsets maps each table id to its full row.
    existing_recordsets = {}
    recordsets = {}
    with PostgresDB() as db:
        for r in db.fetchall("SELECT * FROM recordsets"):
            for recordid in r["recordids"]:
                existing_recordsets[recordid] = r["id"]
            recordsets[r["id"]] = r
        #logger.debug("***existing_recordsets DUMP ***\n")
        #logger.debug("{0}".format(existing_recordsets))

        pub_recs = db.fetchall("SELECT * FROM publishers")
        logger.debug("Checking %d publishers", len(pub_recs))
        for r in pub_recs:
            uuid, rss_url = r['uuid'], r['rss_url']
            logger.info("Starting Publisher Feed: %s %s", uuid, rss_url)
            rsscontents = get_feed(rss_url)
            if rsscontents:
                try:
                    _do_rss(rsscontents, r, db, recordsets, existing_recordsets)
                    db.commit()
                except Exception:
                    logger.exception("Error with %s %s", uuid, rss_url)
                    db.rollback()
                except:
                    # Non-Exception interrupts (e.g. KeyboardInterrupt):
                    # roll back and re-raise.
                    db.rollback()
                    raise
    logger.info("Finished processing add publisher RSS feeds")
def upload_to_storage(self, store, attempt=1):
    if not self.ok:
        return self
    try:
        mo = self.media_object
        k = mo.get_key(store)
        if k.exists() and k.read() and k.etag == '"{0}"'.format(mo.etag):
            logger.debug("NoUpload %s etag %s, already present", self.url, mo.etag)
        else:
            try:
                mo.upload(store, StringIO(self.content), force=True)
                logger.debug("Uploaded %s etag %s", self.url, mo.etag)
            except (BotoServerError, BotoClientError) as e:
                logger.exception("Failed uploading to storage: %s", self.url)
                self.reason = str(e)
                self.status_code = Status.STORAGE_ERROR
                return self

        with PostgresDB() as idbmodel:
            # Don't need to ensure_media, we're processing
            # through media entries
            mo.ensure_object(idbmodel)
            mo.ensure_media_object(idbmodel)
            idbmodel.commit()
    except Exception:
        logger.exception("DbSaveErr %s", self.url)
        self.status_code = Status.STORAGE_ERROR
    return self
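# Usage sketch (an assumption, not part of this module): upload_to_storage
# flags transient failures via Status.STORAGE_ERROR, so a caller could retry
# a bounded number of times. `upload_with_retries` and `item` are
# hypothetical names introduced only for illustration.
def upload_with_retries(item, store, max_attempts=3):
    for attempt in range(1, max_attempts + 1):
        item = item.upload_to_storage(store, attempt=attempt)
        if item.status_code != Status.STORAGE_ERROR:
            break
    return item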
def create_tables():
    """
    This function is out-of-sync with actual database, unmaintained.

    Commenting out all action in this function, it will do nothing until
    modified again.
    """
    db = PostgresDB()
    logger.error('create_tables called but has no valid code to run.')

    # db.execute("""CREATE TABLE IF NOT EXISTS publishers (
    #     id BIGSERIAL NOT NULL PRIMARY KEY,
    #     uuid uuid UNIQUE,
    #     name text NOT NULL,
    #     recordids text[] NOT NULL DEFAULT '{}',
    #     pub_type varchar(20) NOT NULL DEFAULT 'rss',
    #     portal_url text,
    #     rss_url text NOT NULL,
    #     auto_publish boolean NOT NULL DEFAULT false,
    #     first_seen timestamp NOT NULL DEFAULT now(),
    #     last_seen timestamp NOT NULL DEFAULT now(),
    #     pub_date timestamp
    # )""")

    # #pubid, rsid Ingest, rs_record_id, eml_link, file_link, First Seen Date, Last Seen Date, Feed Date, Harvest Date, Harvest Etag
    # db.execute("""CREATE TABLE IF NOT EXISTS recordsets (
    #     id BIGSERIAL NOT NULL PRIMARY KEY,
    #     uuid uuid UNIQUE,
    #     publisher_uuid uuid REFERENCES publishers(uuid),
    #     name text NOT NULL,
    #     recordids text[] NOT NULL DEFAULT '{}',
    #     eml_link text,
    #     file_link text NOT NULL,
    #     ingest boolean NOT NULL DEFAULT false,
    #     first_seen timestamp NOT NULL DEFAULT now(),
    #     last_seen timestamp NOT NULL DEFAULT now(),
    #     pub_date timestamp,
    #     harvest_date timestamp,
    #     harvest_etag varchar(41)
    # )""")
    # db.commit()
    db.close()
def get_db_dicts(rsid):
    # Build per-type lookup dicts for a recordset:
    #   uuid_etag: uuid -> etag, id_uuid: recordid -> uuid
    id_uuid = {}
    uuid_etag = {}
    for t in ["record", "mediarecord"]:
        id_uuid[t + "s"] = {}
        uuid_etag[t + "s"] = {}
        for c in PostgresDB().get_children_list_for_ingest(
                rsid, t, cursor_factory=NamedTupleCursor):
            u = c.uuid
            e = c.etag
            uuid_etag[t + "s"][u] = e
            for i in c.recordids:
                id_uuid[t + "s"][i] = u
    return (uuid_etag, id_uuid)
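# Usage sketch (an assumption, not part of the original module): the two
# dicts returned by get_db_dicts can be used to decide whether an incoming
# record is new, changed, or unchanged. `classify_incoming`,
# `incoming_recordid`, and `incoming_etag` are hypothetical names.
def classify_incoming(rsid, incoming_recordid, incoming_etag):
    uuid_etag, id_uuid = get_db_dicts(rsid)
    known_uuid = id_uuid["records"].get(incoming_recordid)
    if known_uuid is None:
        return "create"      # recordid never seen in this recordset
    if uuid_etag["records"][known_uuid] != incoming_etag:
        return "update"      # content changed since last ingest
    return "unchanged"       # same etag, nothing to do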
def main():
    import argparse
    from idb.config import config

    parser = argparse.ArgumentParser(
        description='Generate a report of newly discovered recordsets.')
    parser.add_argument(
        '-a', '--age',
        help='Number of days ago from today to mark beginning of "recent" period.',
        default=31, type=int)
    parser.add_argument('-w', '--write', action='store_true',
                        help='Write to a file instead of STDOUT.')
    args = parser.parse_args()

    db = PostgresDB()
    sql = """
        select uuid, name, publisher_uuid, file_link, to_json(first_seen),
               to_json(pub_date), to_json(file_harvest_date)
        from recordsets
        where ingest=false and first_seen > now()-'%s days'::interval
        order by first_seen""" % args.age
    results_list = db.fetchall(sql)

    if args.write:
        fh = open("fresh-recordsets-report.tsv", "w")
        output_tsv(results_list, fh)
        fh.close()
    else:
        output_tsv(results_list, sys.stdout)
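# output_tsv is defined elsewhere in this script; a minimal sketch of the
# behavior it implies (an assumption): one tab-separated line per row, in
# the column order of the SELECT above. `_output_tsv_sketch` is a
# hypothetical name used to avoid clashing with the real helper.
def _output_tsv_sketch(results_list, fh):
    for row in results_list:
        fh.write("\t".join("" if v is None else str(v) for v in row) + "\n")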
def update_db_from_rss():
    # existing_recordsets is a dict that holds mapping of recordids to DB id
    existing_recordsets = {}
    # file_links is a dict that holds mapping of file_links to DB id
    file_links = {}
    # recordsets is a dict that holds entire rows based on DB id (not recordid or uuid)
    recordsets = {}

    with PostgresDB() as db:
        logger.debug("Gathering existing recordsets...")
        for row in db.fetchall("SELECT * FROM recordsets"):
            recordsets[row["id"]] = row
            file_links[row["file_link"]] = row["id"]
            for recordid in row["recordids"]:
                logger.debug(
                    "id | recordid | file_link : '{0}' | '{1}' | '{2}'".format(
                        row["id"], recordid, row["file_link"]))
                if recordid in existing_recordsets:
                    logger.error(
                        "recordid '{0}' already in existing recordsets. "
                        "This should never happen.".format(recordid))
                else:
                    existing_recordsets[recordid] = row["id"]

        logger.debug("Gathering existing publishers...")
        pub_recs = db.fetchall("SELECT * FROM publishers")
        logger.debug("Checking %d publishers", len(pub_recs))
        for row in pub_recs:
            uuid, rss_url = row['uuid'], row['rss_url']
            logger.info("Starting Publisher Feed: %s %s", uuid, rss_url)
            rsscontents = get_feed(rss_url)
            if rsscontents:
                try:
                    _do_rss(rsscontents, row, db, recordsets,
                            existing_recordsets, file_links)
                    logger.debug('_do_rss returned, ready to COMMIT...')
                    db.commit()
                except Exception:
                    logger.exception(
                        "An exception occurred processing '{0}' in rss '{1}', "
                        "will try ROLLBACK...".format(uuid, rss_url))
                    db.rollback()
                except:
                    logger.exception(
                        "An unknown exception occurred processing '{0}' in rss "
                        "'{1}', will try ROLLBACK...".format(uuid, rss_url))
                    db.rollback()
                    raise
    logger.info("Finished processing add publisher RSS feeds")
def harvest_all_eml():
    sql = """SELECT *
             FROM recordsets
             WHERE eml_link IS NOT NULL
               AND ingest=true
               AND pub_date < now()
               AND (eml_harvest_date IS NULL OR eml_harvest_date < pub_date)"""
    with PostgresDB() as db:
        recs = db.fetchall(sql, cursor_factory=DictCursor)
        logger.info("Harvesting %d EML files", len(recs))
        for r in recs:
            try:
                harvest_eml(r, db)
                db.commit()
            except KeyboardInterrupt:
                db.rollback()
                raise
            except:
                db.rollback()
                logger.exception("failed Harvest EML %s %s", r["id"], r["name"])
def harvest_all_file():
    sql = """SELECT *
             FROM recordsets
             WHERE file_link IS NOT NULL
               AND uuid IS NOT NULL
               AND ingest=true
               AND pub_date < now()
               AND (file_harvest_date IS NULL OR file_harvest_date < pub_date)"""
    with PostgresDB() as db:
        recs = db.fetchall(sql)
        logger.info("Harvesting %d files", len(recs))
        for r in recs:
            try:
                harvest_file(r, db)
                db.commit()
            except KeyboardInterrupt:
                db.rollback()
                raise
            except:
                logger.exception("Error processing id:%s url:%s",
                                 r['id'], r['file_link'])
                db.rollback()
def main():
    import argparse
    from idb.config import config

    parser = argparse.ArgumentParser(
        description='Delete records and recordset by specifying the recordset uuid')
    parse_group = parser.add_mutually_exclusive_group(required=True)
    parse_group.add_argument('-u', '--uuid', dest='uuid_to_delete',
                             type=str, default=None)
    parse_group.add_argument('--uuid-file', dest='uuid_file',
                             type=str, default=None)
    args = parser.parse_args()

    db = PostgresDB()

    if args.uuid_file:
        # read each line in uuid_file as a uuid
        logger.info("Reading uuid_file '{0}'".format(args.uuid_file))
        with open(args.uuid_file) as f:
            for uuid_string in f:
                uuid = uuid_string.strip()
                if check_uuid(uuid):
                    delete_recordset(uuid, db)
    else:
        # operate on a single uuid
        if check_uuid(args.uuid_to_delete):
            delete_recordset(args.uuid_to_delete, db)
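# check_uuid is defined elsewhere; a minimal sketch of the validation it
# implies (an assumption): accept the string only if it parses as a UUID.
# `_check_uuid_sketch` is a hypothetical name to avoid shadowing the real one.
def _check_uuid_sketch(candidate):
    import uuid as uuid_module
    try:
        uuid_module.UUID(candidate)
        return True
    except (ValueError, TypeError):
        return False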
import os
import requests
from requests.auth import HTTPBasicAuth
import traceback
import json
import datetime
import time

from idb.postgres_backend.db import PostgresDB

s = requests.Session()
adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
s.mount('http://', adapter)
s.mount('https://', adapter)

auth = HTTPBasicAuth(os.environ.get("IDB_UUID"), os.environ.get("IDB_APIKEY"))

db = PostgresDB()
local_pg = db._pg
local_cur = db._cur


def create_schema():
    local_cur.execute("BEGIN")
    local_cur.execute("""CREATE TABLE IF NOT EXISTS media (
        id BIGSERIAL PRIMARY KEY,
        url text UNIQUE,
        type varchar(20),
        mime varchar(255),
        last_status integer,
        last_check timestamp
    )
    """)
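# Sketch (an assumption, not part of the original module): how a single
# media URL check might tie together the shared session, auth, and the
# media table created above. `check_media_url` is a hypothetical name;
# the column names follow the CREATE TABLE statement.
def check_media_url(url):
    r = s.get(url, auth=auth, timeout=30)
    local_cur.execute(
        "UPDATE media SET last_status=%s, last_check=%s WHERE url=%s",
        (r.status_code, datetime.datetime.now(), url))
    local_pg.commit()
    return r.status_code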
import os
import sys

from idb.postgres_backend.db import PostgresDB


def file_list_iter(fn):
    # Each input line is expected to be `find -ls` style output prefixed with
    # a server name; fields[0] is the server and fields[11] the full path.
    # Only paths containing "current" are yielded.
    with open(fn, 'r') as f:
        for l in f:
            fields = l.split()
            if "current" in fields[11]:
                fields[0] = fields[0][:-1]  # trim ":" from end of server name
                fields[11] = fields[11].replace("\\\\", "\\")  # de-escape slashes produced by `find -ls` to get the real file name
                fields.append(os.path.basename(fields[11]))  # only fn for prefix searching
                yield fields


if __name__ == '__main__':
    fn = sys.argv[1]
    with PostgresDB() as db:
        q = """INSERT INTO ceph_server_files
               (server, line, unk, perms, unk2, owner_name, group_name, size,
                month, day, year_time, fullname, filename)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
               ON CONFLICT DO NOTHING
            """
        c = 1
        for l in file_list_iter(fn):
            db.execute(q, l)
            c += 1
            if (c % 100000) == 0:
                db.commit()
                c = 1
                # break
buff = buff + '''
DATA INGESTION STATUS REPORT
'''
buff = buff + datetime.date.today().strftime("%B %d, %Y") + "\n\n"
buff = buff + hr + "\n"

api = idigbio.json()
record_count = locale.format("%d", api.count_records(), grouping=True)
media_record_count = locale.format("%d", api.count_media(), grouping=True)
recordset_count = locale.format("%d", api.count_recordsets(), grouping=True)

# Paused count is the count of recordsets where ingest is true,
# and paused is also true. Will not count paused recordsets where
# ingest is false.
db = PostgresDB()
sql = """
    SELECT count(*) FROM recordsets WHERE ingest = true AND ingest_is_paused = true;
"""
db_r = db.fetchone(sql)
paused_count = db_r["count"]

# Updated recordsets is an approximation based on the number of items
# appearing in the Ingestion summary log, minus 15% to take into account
# that we usually overlap ingest periods for db-check operations by a few weeks.
# Currently the summary is in an expected, hard-coded path:
#   /mnt/data/new_ingestion/summary.pretty.txt
summary_path = "/mnt/data/new_ingestion/summary.pretty.txt"
if os.path.exists(summary_path):
    linecount = len(open(summary_path).readlines())
    updated_recordsets_blurb = (
def process_file(fname, mime, rsid, existing_etags, existing_ids,
                 ingest=False, commit_force=False, ispaused=False):
    rlogger = getrslogger(rsid)
    rlogger.info("Processing %s, type: %s", fname, mime)
    counts = {}
    t = datetime.datetime.now()
    filehash = calcFileHash(fname)
    db = PostgresDB()
    commited = False

    try:
        if mime == "application/zip":
            dwcaobj = Dwca(fname, skipeml=True, logname="idb")
            for dwcrf in dwcaobj.extensions:
                rlogger.debug("Processing %r", dwcrf.name)
                counts[dwcrf.name] = process_subfile(
                    dwcrf, rsid, existing_etags, existing_ids,
                    ingest=ingest, db=db)
                dwcrf.close()
            rlogger.debug("processing core %r", dwcaobj.core.name)
            counts[dwcaobj.core.name] = process_subfile(
                dwcaobj.core, rsid, existing_etags, existing_ids,
                ingest=ingest, db=db)
            dwcaobj.core.close()
            dwcaobj.close()
        elif mime == "text/plain":
            commas = False
            with open(fname, "rb") as testf:
                commas = "," in testf.readline()

            if commas:
                csvrf = DelimitedFile(fname, logname="idigbio")
                counts[fname] = process_subfile(
                    csvrf, rsid, existing_etags, existing_ids,
                    ingest=ingest, db=db)
            else:
                tsvrf = DelimitedFile(fname, delimiter="\t", fieldenc=None,
                                      logname="idigbio")
                counts[fname] = process_subfile(
                    tsvrf, rsid, existing_etags, existing_ids,
                    ingest=ingest, db=db)

        if ingest:
            commit_ok = commit_force

            type_commits = []
            for k in counts:
                if k not in ingestion_types:
                    continue
                if (counts[k]["create"] / float(counts[k]["processed_line_count"]) >= 0.5
                        and counts[k]["delete"] / float(counts[k]["processed_line_count"]) >= 0.5):
                    type_commits.append(True)
                else:
                    type_commits.append(False)

            commit_ok = all(type_commits)

            if commit_ok:
                rlogger.info("Ready to Commit")
                db.commit()
                commited = True
            else:
                rlogger.error("Rollback")
                db.rollback()
        else:
            db.rollback()
        db.close()
    except Exception:
        logger.exception("Unhandled Exception when processing {0}".format(fname))
        db.rollback()
        db.close()

    # Clear after processing an archive
    unconsumed_extensions.clear()
    core_siblings.clear()

    return {
        "name": fname,
        "filemd5": filehash,
        "recordset_id": rsid,
        "counts": counts,
        "processing_start_datetime": t.isoformat(),
        "total_processing_time": (datetime.datetime.now() - t).total_seconds(),
        "commited": commited,
        "paused": ispaused,
    }
def upload_recordset_from_file(rsid, fname):
    """
    Given a recordset uuid and a local dataset filename, upload the
    local dataset file as the "current" file for that uuid.

    Parameters
    ----------
    rsid : uuid
        An iDigBio recordset uuid
    fname : string
        Filename (full path or current directory only)

    Returns
    -------
    bool
        True if successful, False otherwise
    """
    # convert rsid uuid to string here because of either:
    #    psycopg2.ProgrammingError: can't adapt type 'UUID'
    # or
    #    TypeError: 'UUID' object does not support indexing
    rsuuid = str(rsid)

    logger.info("Manual upload of '{0}' from file '{1}' requested.".format(
        rsuuid, fname))

    # do some checks here
    try:
        f = open(fname)
        f.close()
    except:
        logger.error("Cannot access file: '{0}'. Aborting upload.".format(fname))
        raise

    db = PostgresDB()
    sql = ("""SELECT id FROM recordsets WHERE uuid=%s""", (rsuuid, ))
    idcount = db.execute(*sql)
    if idcount < 1:
        logger.error("Cannot find uuid '{0}' in db. Aborting upload.".format(rsuuid))
        db.rollback()
        return False

    # output the "before" state
    results = db.fetchall(
        """SELECT id,file_harvest_date,file_harvest_etag FROM recordsets WHERE uuid=%s""",
        (rsuuid, ))
    for each in results:
        logger.debug("{0}".format(each))

    try:
        etag = upload_recordset(rsuuid, fname, db)
        assert etag
        sql = ("""UPDATE recordsets
                  SET file_harvest_etag=%s, file_harvest_date=%s
                  WHERE uuid=%s""",
               (etag, datetime.datetime.now(), rsuuid))
        update_count = db.execute(*sql)
        db.commit()
        logger.info("UPDATED {0} rows.".format(update_count))
        logger.info(
            "Finished manual upload of file '{0}', result etag = '{1}', "
            "saved to db.".format(fname, etag))
    except:
        logger.error(
            "An exception occurred during upload of file or db update for "
            "'{0}'".format(fname))
        raise

    # output the "after" state
    results = db.fetchall(
        """SELECT id,file_harvest_date,file_harvest_etag FROM recordsets WHERE uuid=%s""",
        (rsuuid, ))
    for each in results:
        logger.debug("{0}".format(each))

    return True
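# Usage sketch (an assumption, not part of the original module): a thin
# wrapper that logs the outcome of a manual upload. `manual_upload_example`
# is a hypothetical name; a caller would pass a real recordset uuid and a
# real local filename.
def manual_upload_example(rsid, fname):
    if upload_recordset_from_file(rsid, fname):
        logger.info("Manual upload of '{0}' for '{1}' completed.".format(fname, rsid))
    else:
        logger.error("Manual upload of '{0}' for '{1}' failed.".format(fname, rsid))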
def get_idb():
    return PostgresDB(pool=current_app.config['DB'])
def main2():
    for r in PostgresDB().get_type_list("recordset", limit=None):
        try:
            print r["uuid"]
        except:
            traceback.print_exc()
import uuid
import hashlib
import string
import random

from idb.postgres_backend.db import PostgresDB
from idb.helpers.encryption import _encrypt
from idb import config

db = PostgresDB()


def getCode(length=100, char=None):
    if char is None:
        char = string.ascii_letters + string.digits
    return ''.join(random.choice(char) for x in range(length))


user_uuid = str(uuid.uuid4())

m = hashlib.md5()
m.update(getCode())
api_key = m.hexdigest()
api_key_e = _encrypt(api_key, config.IDB_CRYPT_KEY)

feeder_parent_id = "520dcbb3-f35a-424c-8778-6df11afc9f95"
rs_rid = "http://feeder.idigbio.org/datasets/{0}".format(user_uuid)

db.execute("INSERT INTO idb_api_keys (user_uuid,apikey) VALUES (%s,%s)",