Exemple #1
0
def update_db_from_rss():
    existing_recordsets = {}
    recordsets = {}
    with PostgresDB() as db:
        for r in db.fetchall("SELECT * FROM recordsets"):
            for recordid in r["recordids"]:
                existing_recordsets[recordid] = r["id"]
            recordsets[r["id"]] = r
        #logger.debug("***existing_recordsets DUMP ***\n")
        #logger.debug("{0}".format(existing_recordsets))
        pub_recs = db.fetchall("SELECT * FROM publishers")
        logger.debug("Checking %d publishers", len(pub_recs))
        for r in pub_recs:
            uuid, rss_url = r['uuid'], r['rss_url']
            logger.info("Starting Publisher Feed: %s %s", uuid, rss_url)
            rsscontents = get_feed(rss_url)
            if rsscontents:
                try:
                    _do_rss(rsscontents, r, db, recordsets,
                            existing_recordsets)
                    db.commit()
                except Exception:
                    logger.exception("Error with %s %s", uuid, rss_url)
                    db.rollback()
                except:
                    db.rollback()
                    raise
    logger.info("Finished processing add publisher RSS feeds")
Exemple #2
0
    def upload_to_storage(self, store, attempt=1):
        if not self.ok:
            return self
        try:
            mo = self.media_object
            k = mo.get_key(store)

            if k.exists() and k.read() and k.etag == '"{0}"'.format(mo.etag):
                logger.debug("NoUpload  %s etag %s, already present", self.url,
                             mo.etag)
            else:
                try:
                    mo.upload(store, StringIO(self.content), force=True)
                    logger.debug("Uploaded  %s etag %s", self.url, mo.etag)
                except (BotoServerError, BotoClientError) as e:
                    logger.exception("Failed uploading to storage: %s",
                                     self.url)
                    self.reason = str(e)
                    self.status_code = Status.STORAGE_ERROR
                    return self

            with PostgresDB() as idbmodel:
                # Don't need to ensure_media, we're processing
                # through media entries
                mo.ensure_object(idbmodel)
                mo.ensure_media_object(idbmodel)
                idbmodel.commit()
        except Exception:
            logger.exception("DbSaveErr %s", self.url)
            self.status_code = Status.STORAGE_ERROR
        return self
Exemple #3
0
def create_tables():
    """
    This function is out-of-sync with actual database, unmaintained.
    Commenting out all action in this function, it will do nothing until modified again.
    """

    db = PostgresDB()
    logger.error('create_tables called but has no valid code to run.')

    # db.execute("""CREATE TABLE IF NOT EXISTS publishers (
    #     id BIGSERIAL NOT NULL PRIMARY KEY,
    #     uuid uuid UNIQUE,
    #     name text NOT NULL,
    #     recordids text[] NOT NULL DEFAULT '{}',
    #     pub_type varchar(20) NOT NULL DEFAULT 'rss',
    #     portal_url text,
    #     rss_url text NOT NULL,
    #     auto_publish boolean NOT NULL DEFAULT false,
    #     first_seen timestamp NOT NULL DEFAULT now(),
    #     last_seen timestamp NOT NULL DEFAULT now(),
    #     pub_date timestamp
    # )""")

    # #pubid, rsid  Ingest, rs_record_id, eml_link, file_link, First Seen Date, Last Seen Date, Feed Date, Harvest Date, Harvest Etag
    # db.execute("""CREATE TABLE IF NOT EXISTS recordsets (
    #     id BIGSERIAL NOT NULL PRIMARY KEY,
    #     uuid uuid UNIQUE,
    #     publisher_uuid uuid REFERENCES publishers(uuid),
    #     name text NOT NULL,
    #     recordids text[] NOT NULL DEFAULT '{}',
    #     eml_link text,
    #     file_link text NOT NULL,
    #     ingest boolean NOT NULL DEFAULT false,
    #     first_seen timestamp NOT NULL DEFAULT now(),
    #     last_seen timestamp NOT NULL DEFAULT now(),
    #     pub_date timestamp,
    #     harvest_date timestamp,
    #     harvest_etag varchar(41)
    # )""")
    # db.commit()
    db.close()
Exemple #4
0
def get_db_dicts(rsid):
    id_uuid = {}
    uuid_etag = {}
    for t in ["record", "mediarecord"]:
        id_uuid[t + "s"] = {}
        uuid_etag[t + "s"] = {}
        for c in PostgresDB().get_children_list_for_ingest(
                rsid, t, cursor_factory=NamedTupleCursor):
            u = c.uuid
            e = c.etag
            uuid_etag[t + "s"][u] = e
            for i in c.recordids:
                id_uuid[t + "s"][i] = u
    return (uuid_etag, id_uuid)
def main():
    import argparse
    from idb.config import config

    parser = argparse.ArgumentParser(
        description='Generate a report of newly discovered recordsets.')

    parser.add_argument(
        '-a',
        '--age',
        help=
        'Number of days ago from today to mark beginning of "recent" period.',
        default=31,
        type=int)

    parser.add_argument('-w',
                        '--write',
                        action='store_true',
                        help='Write to a File instead of STDOUT.')

    args = parser.parse_args()

    db = PostgresDB()

    sql = """
       select uuid, name, publisher_uuid, file_link, to_json(first_seen), 
       to_json(pub_date), to_json(file_harvest_date) from recordsets where ingest=false and 
       first_seen > now()-'%s days'::interval order by first_seen""" % args.age

    results_list = db.fetchall(sql)

    if args.write:
        fh = open("fresh-recordsets-report.tsv", "w")
        output_tsv(results_list, fh)
        fh.close()
    else:
        output_tsv(results_list, sys.stdout)
Exemple #6
0
def update_db_from_rss():
    # existing_recordsets is a dict that holds mapping of recordids to DB id
    existing_recordsets = {}
    # file_links is a dict that holds mapping of file_links to DB id
    file_links = {}
    # recordsets is a dict that holds entire rows based on DB id (not recordid or uuid)
    recordsets = {}

    with PostgresDB() as db:
        logger.debug("Gathering existing recordsets...")
        for row in db.fetchall("SELECT * FROM recordsets"):
            recordsets[row["id"]] = row
            file_links[row["file_link"]] = row["id"]
            for recordid in row["recordids"]:
                logger.debug(
                    "id | recordid | file_link : '{0}' | '{1}' | '{2}'".format(
                        row["id"], recordid, row["file_link"]))
                if recordid in existing_recordsets:
                    logger.error(
                        "recordid '{0}' already in existing recordsets. This should never happen."
                        .format(recordid))
                else:
                    existing_recordsets[recordid] = row["id"]

        logger.debug("Gathering existing publishers...")
        pub_recs = db.fetchall("SELECT * FROM publishers")
        logger.debug("Checking %d publishers", len(pub_recs))
        for row in pub_recs:
            uuid, rss_url = row['uuid'], row['rss_url']
            logger.info("Starting Publisher Feed: %s %s", uuid, rss_url)
            rsscontents = get_feed(rss_url)
            if rsscontents:
                try:
                    _do_rss(rsscontents, row, db, recordsets,
                            existing_recordsets, file_links)
                    logger.debug('_do_rss returned, ready to COMMIT...')
                    db.commit()
                except Exception:
                    logger.exception(
                        "An exception occurred processing '{0}' in rss '{1}', will try ROLLBACK..."
                        .format(uuid, rss_url))
                    db.rollback()
                except:
                    logger.exception(
                        "Unknown exception occurred in rss '{0}' in rss '{1}', will try ROLLBACK..."
                        .format(uuid, rss_url))
                    db.rollback()
                    raise
    logger.info("Finished processing add publisher RSS feeds")
Exemple #7
0
def harvest_all_eml():
    sql = """SELECT *
             FROM recordsets
             WHERE eml_link IS NOT NULL
               AND ingest=true
               AND pub_date < now()
               AND (eml_harvest_date IS NULL OR eml_harvest_date < pub_date)"""
    with PostgresDB() as db:
        recs = db.fetchall(sql, cursor_factory=DictCursor)
        logger.info("Harvesting %d EML files", len(recs))
        for r in recs:
            try:
                harvest_eml(r, db)
                db.commit()
            except KeyboardInterrupt:
                db.rollback()
                raise
            except:
                db.rollback()
                logger.exception("failed Harvest EML %s %s", r["id"],
                                 r["name"])
Exemple #8
0
def harvest_all_file():
    sql = """SELECT *
             FROM recordsets
             WHERE file_link IS NOT NULL
               AND uuid IS NOT NULL
               AND ingest=true
               AND pub_date < now()
               AND (file_harvest_date IS NULL OR file_harvest_date < pub_date)"""

    with PostgresDB() as db:
        recs = db.fetchall(sql)
        logger.info("Harvesting %d files", len(recs))
        for r in recs:
            try:
                harvest_file(r, db)
                db.commit()
            except KeyboardInterrupt:
                db.rollback()
                raise
            except:
                logger.exception("Error processing id:%s url:%s", r['id'],
                                 r['file_link'])
                db.rollback()
Exemple #9
0
def main():
    import argparse
    from idb.config import config

    parser = argparse.ArgumentParser(
        description=
        'Delete records and recordset by specifying the recordset uuid')

    parse_group = parser.add_mutually_exclusive_group(required=True)

    parse_group.add_argument('-u',
                             '--uuid',
                             dest='uuid_to_delete',
                             type=str,
                             default=None)
    parse_group.add_argument('--uuid-file',
                             dest='uuid_file',
                             type=str,
                             default=None)

    args = parser.parse_args()

    db = PostgresDB()

    if args.uuid_file:
        # read each line in uuid_file as a uuid
        logger.info("Reading uuid_file '{0}'".format(args.uuid_file))
        with open(args.uuid_file) as f:
            for uuid_string in f:
                uuid = uuid_string.strip()
                if check_uuid(uuid):
                    delete_recordset(uuid, db)
    else:
        # operate on a single uuid
        if check_uuid(args.uuid_to_delete):
            delete_recordset(args.uuid_to_delete, db)
Exemple #10
0
import os
import requests
from requests.auth import HTTPBasicAuth
import traceback
import json
import datetime
import time

s = requests.Session()
adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
s.mount('http://', adapter)
s.mount('https://', adapter)

auth = HTTPBasicAuth(os.environ.get("IDB_UUID"), os.environ.get("IDB_APIKEY"))

db = PostgresDB()
local_pg = db._pg
local_cur = db._cur


def create_schema():
    local_cur.execute("BEGIN")
    local_cur.execute("""CREATE TABLE IF NOT EXISTS media (
        id BIGSERIAL PRIMARY KEY,
        url text UNIQUE,
        type varchar(20),
        mime varchar(255),
        last_status integer,
        last_check timestamp
    )
    """)
def file_list_iter(fn):
    with open(fn, 'r') as f:
        for l in f:
           fields = l.split()
           if "current" in fields[11]:
               fields[0] = fields[0][:-1] # trim ":" from end of server name
               fields[11] = fields[11].replace("\\\\", "\\") # de-escape slashes produced by `find -ls` to get the real file name
               fields.append(os.path.basename(fields[11])) # only fn for prefix searching
               yield fields

if __name__ == '__main__':

    fn = sys.argv[1]

    with PostgresDB() as db:
        q = """INSERT INTO ceph_server_files
               (server, line, unk, perms, unk2, owner_name, group_name, 
                size, month, day, year_time, fullname, filename)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
               ON CONFLICT DO NOTHING
            """
        c = 1
        for l in file_list_iter(fn):
            db.execute(q, l)
            c += 1
            if (c % 100000) == 0:
                db.commit()
                c = 1
#                break
Exemple #12
0
buff = buff + '''
DATA INGESTION STATUS REPORT
'''

buff = buff + datetime.date.today().strftime("%B %d, %Y") + "\n\n"
buff = buff + hr + "\n"

api = idigbio.json()
record_count = locale.format("%d", api.count_records(), grouping=True)
media_record_count = locale.format("%d", api.count_media(), grouping=True)
recordset_count = locale.format("%d", api.count_recordsets(), grouping=True)

# Paused count is the count of recordsets where ingest is true,
# and paused is also true.  Will not count paused recordsets where
# ingest is false.
db = PostgresDB()
sql = """
      SELECT count(*) FROM recordsets WHERE ingest = true AND ingest_is_paused = true;
"""
db_r = db.fetchone(sql)
paused_count = db_r["count"]

# Updated recordsets is an approximation based on the number of items
# appearing in the Ingestion summary log, minus 15% to take into account
# that we usually overlap ingest periods for db-check operations by a few weeks.
# Currently the summary is in an expected, hard-coded path:
#   /mnt/data/new_ingestion/summary.pretty.txt
summary_path = "/mnt/data/new_ingestion/summary.pretty.txt"
if os.path.exists(summary_path):
    linecount = len(open(summary_path).readlines())
    updated_recordsets_blurb = (
Exemple #13
0
def process_file(fname,
                 mime,
                 rsid,
                 existing_etags,
                 existing_ids,
                 ingest=False,
                 commit_force=False,
                 ispaused=False):
    rlogger = getrslogger(rsid)
    rlogger.info("Processing %s, type: %s", fname, mime)
    counts = {}
    t = datetime.datetime.now()
    filehash = calcFileHash(fname)
    db = PostgresDB()
    commited = False

    try:
        if mime == "application/zip":
            dwcaobj = Dwca(fname, skipeml=True, logname="idb")
            for dwcrf in dwcaobj.extensions:
                rlogger.debug("Processing %r", dwcrf.name)
                counts[dwcrf.name] = process_subfile(dwcrf,
                                                     rsid,
                                                     existing_etags,
                                                     existing_ids,
                                                     ingest=ingest,
                                                     db=db)
                dwcrf.close()
            rlogger.debug("processing core %r", dwcaobj.core.name)
            counts[dwcaobj.core.name] = process_subfile(dwcaobj.core,
                                                        rsid,
                                                        existing_etags,
                                                        existing_ids,
                                                        ingest=ingest,
                                                        db=db)
            dwcaobj.core.close()
            dwcaobj.close()
        elif mime == "text/plain":
            commas = False
            with open(fname, "rb") as testf:
                commas = "," in testf.readline()

            if commas:
                csvrf = DelimitedFile(fname, logname="idigbio")
                counts[fname] = process_subfile(csvrf,
                                                rsid,
                                                existing_etags,
                                                existing_ids,
                                                ingest=ingest,
                                                db=db)
            else:
                tsvrf = DelimitedFile(fname,
                                      delimiter="\t",
                                      fieldenc=None,
                                      logname="idigbio")
                counts[fname] = process_subfile(tsvrf,
                                                rsid,
                                                existing_etags,
                                                existing_ids,
                                                ingest=ingest,
                                                db=db)

        if ingest:
            commit_ok = commit_force

            type_commits = []
            for k in counts:
                if k not in ingestion_types:
                    continue
                if (counts[k]["create"] /
                        float(counts[k]["processed_line_count"]) >= 0.5
                        and counts[k]["delete"] /
                        float(counts[k]["processed_line_count"]) >= 0.5):
                    type_commits.append(True)
                else:
                    type_commits.append(False)

            commit_ok = all(type_commits)

            if commit_ok:
                rlogger.info("Ready to Commit")
                db.commit()
                commited = True
            else:
                rlogger.error("Rollback")
                db.rollback()
        else:
            db.rollback()
        db.close()
    except Exception:
        logger.exception(
            "Unhandled Exception when processing {0}".format(fname))
        db.rollback()
        db.close()

    # Clear after processing an archive
    unconsumed_extensions.clear()
    core_siblings.clear()

    return {
        "name": fname,
        "filemd5": filehash,
        "recordset_id": rsid,
        "counts": counts,
        "processing_start_datetime": t.isoformat(),
        "total_processing_time": (datetime.datetime.now() - t).total_seconds(),
        "commited": commited,
        "paused": ispaused
    }
Exemple #14
0
def upload_recordset_from_file(rsid, fname):
    """
    Given a recordset uuid and a local dataset filename, upload the local
    dataset file as the "current" file for that uuid.

    Parameters
    ----------
    rsid : uuid
        An iDigBio recordset uuid
    fname : string
        Filename (full path or current directory only)

    Returns
    -------
    bool
        True if successful, False otherwise
    """
    # convert rsid uuid to string here because of either:
    #   psycopg2.ProgrammingError: can't adapt type 'UUID'
    # or
    #  TypeError: 'UUID' object does not support indexing
    rsuuid = str(rsid)

    logger.info("Manual upload of '{0}' from file '{1}' requested.".format(
        rsuuid, fname))

    # do some checks here
    try:
        f = open(fname)
        f.close()
    except:
        logger.error(
            "Cannot access file: '{0}'. Aborting upload.".format(fname))
        raise
    db = PostgresDB()
    sql = ("""SELECT id FROM recordsets WHERE uuid=%s""", (rsuuid, ))
    idcount = db.execute(*sql)
    if idcount < 1:
        logger.error(
            "Cannot find uuid '{0}' in db.  Aborting upload.".format(rsuuid))
        db.rollback()
        return False

    # output the "before" state
    results = db.fetchall(
        """SELECT id,file_harvest_date,file_harvest_etag FROM recordsets WHERE uuid=%s""",
        (rsuuid, ))
    for each in results:
        logger.debug("{0}".format(each))

    try:
        etag = upload_recordset(rsuuid, fname, db)
        assert etag
        sql = ("""UPDATE recordsets
                  SET file_harvest_etag=%s, file_harvest_date=%s
                  WHERE uuid=%s""", (etag, datetime.datetime.now(), rsuuid))
        update_count = db.execute(*sql)
        db.commit()
        logger.info("UPDATED {0} rows.".format(update_count))
        logger.info(
            "Finished manual upload of file '{0}', result etag = '{1}', saved to db."
            .format(fname, etag))
    except:
        logger.error(
            "An exception occurred during upload of file or db update for '{0}'"
            .format(fname))
        raise
    # output the "after" state
    results = db.fetchall(
        """SELECT id,file_harvest_date,file_harvest_etag FROM recordsets WHERE uuid=%s""",
        (rsuuid, ))
    for each in results:
        logger.debug("{0}".format(each))

    return True
Exemple #15
0
def get_idb():
    return PostgresDB(pool=current_app.config['DB'])
Exemple #16
0
def main2():
    for r in PostgresDB().get_type_list("recordset", limit=None):
        try:
            print r["uuid"]
        except:
            traceback.print_exc()
import uuid
import hashlib
import string
import random

from idb.postgres_backend.db import PostgresDB
from idb.helpers.encryption import _encrypt

from idb import config

db = PostgresDB()


def getCode(length=100, char=None):
    if char is None:
        char = string.ascii_letters + string.digits
    return ''.join(random.choice(char) for x in range(length))


user_uuid = str(uuid.uuid4())

m = hashlib.md5()
m.update(getCode())
api_key = m.hexdigest()

api_key_e = _encrypt(api_key, config.IDB_CRYPT_KEY)

feeder_parent_id = "520dcbb3-f35a-424c-8778-6df11afc9f95"
rs_rid = "http://feeder.idigbio.org/datasets/{0}".format(user_uuid)

db.execute("INSERT INTO idb_api_keys (user_uuid,apikey) VALUES (%s,%s)",