Example #1
0
    def __init__(self,
                 fh,
                 encoding="utf8",
                 delimiter=",",
                 fieldenc="\"",
                 header=None,
                 rowtype=None,
                 logname=None):
        super(DelimitedFile, self).__init__()

        self.encoding = encoding
        self.fieldenc = fieldenc
        self.delimiter = delimiter
        self.rowtype = rowtype
        self.lineCount = 0
        self.lineLength = None

        if isinstance(fh, (str, unicode)):
            self.name = fh
        else:
            self.name = fh.name
        # Open by name so both path strings and already-open handles work.
        self.filehandle = io.open(self.name,
                                  "r",
                                  encoding=encoding,
                                  errors="flag_error")

        if logname is None:
            self.logger = idblogger.getChild('df')
        else:
            self.logger = getLogger(logname)

        encoded_lines = (l.encode("utf-8") for l in self.filehandle)
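        # NOTE: csv.reader below is given an encoding= keyword, which the
        # stdlib csv module does not accept; this presumably relies on a csv
        # drop-in replacement such as unicodecsv being imported as csv.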
        if self.fieldenc is None or self.fieldenc == "":
            self._reader = csv.reader(encoded_lines,
                                      encoding="utf-8",
                                      delimiter=self.delimiter,
                                      quoting=csv.QUOTE_NONE)
        else:
            self._reader = csv.reader(encoded_lines,
                                      encoding="utf-8",
                                      delimiter=self.delimiter,
                                      quotechar=self.fieldenc)

        t = defaultdict(int)
        if header is not None:
            self.fields = header
            for k, v in header.items():
                cn = get_canonical_name(v)
                t[cn[1]] += 1
        else:
            headerline = self._reader.next()
            self.lineLength = len(headerline)
            self.fields = {}
            for k, v in enumerate(headerline):
                cn = get_canonical_name(v)
                if cn[0] is not None:
                    t[cn[1]] += 1
                    self.fields[k] = cn[0]

        if self.rowtype is None:
            items = t.items()
            items.sort(key=lambda item: (item[1], item[0]), reverse=True)
            self.rowtype = items[0][0]
            self.logger.info("Setting row type to %s", self.rowtype)
        elif self.rowtype in types:
            self.rowtype = types[self.rowtype]["shortname"]
        else:
            raise TypeError("{} not mapped to short name".format(self.rowtype))
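
# Usage sketch (hypothetical file name; the rest of the class is not shown
# here, so this only exercises the constructor above):
#
#   df = DelimitedFile("occurrence.csv", delimiter=",", fieldenc='"')
#   rowtype, fields = df.rowtype, df.fields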
Example #2
0
import logging
from datetime import datetime
from collections import Counter
from functools import wraps

import boto3
import botocore.exceptions

from idb import config
from idb.helpers.memoize import memoized, filecached
from idb.helpers.storage import IDigBioStorage
from idb.helpers.logging import getLogger, configure_app_log
from idb.postgres_backend import apidbpool, cursor
from idigbio_ingestion.mediaing.fetcher import FetchItem

logger = getLogger("cephwalker")
store = IDigBioStorage()


@memoized()
def s3connection():
    return boto3.resource(
        's3',
        aws_access_key_id=config.IDB_STORAGE_ACCESS_KEY,
        aws_secret_access_key=config.IDB_STORAGE_SECRET_KEY,
        endpoint_url="https://s.idigbio.org")


def process_keys(fn, keylist, poolsize=20):

    def wkfn(k):
Example #3
0
import os
import sys
from idb.helpers.logging import getLogger, configure_app_log
logger = getLogger("restore")
from idb.postgres_backend.db import PostgresDB

# When Ceph chewed up a bunch of objects (ticket #2605), we figured out how to
# rebuild them by searching for the file parts on disk. We generated lists of
# files on all ceph nodes with:
#  find /srv/ceph -ls > <text file>
# which we then import below into a postgres table in idb-api-beta for rapid
# lookups and searching.


# Table was created manually with below:
#\connect idb_api_beta
#CREATE TABLE ceph_server_files (
#    server VARCHAR(16) NOT NULL,
#    line INTEGER,
#    unk INTEGER,
#    perms VARCHAR(16),
#    unk2 INTEGER,
#    owner_name VARCHAR(16),
#    group_name VARCHAR(16),
#    size BIGINT,
#    day INTEGER,
#    month VARCHAR(3),
#    year_time VARCHAR(8),
#    fullname TEXT NOT NULL,
#    filename TEXT NOT NULL
#);
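

# Hedged sketch (not the original loader, which is not shown here): one way to
# turn a "find /srv/ceph -ls" listing into rows for the table above. It assumes
# each line splits into inode, blocks, perms, links, owner, group, size, month,
# day, time-or-year, and path, and that the unk/unk2 columns hold the inode and
# link count; the real column mapping may differ. The DSN is a placeholder.
def import_find_listing(server, listing_path, dsn="dbname=idb_api_beta"):
    import psycopg2
    conn = psycopg2.connect(dsn)
    with conn, conn.cursor() as cur:
        with open(listing_path) as listing:
            for lineno, raw in enumerate(listing, start=1):
                parts = raw.split(None, 10)
                if len(parts) < 11:
                    continue
                (inode, _blocks, perms, links, owner, group,
                 size, month, day, year_time, fullname) = parts
                fullname = fullname.strip()
                cur.execute(
                    """INSERT INTO ceph_server_files
                       (server, line, unk, perms, unk2, owner_name, group_name,
                        size, day, month, year_time, fullname, filename)
                       VALUES (%s, %s, %s, %s, %s, %s, %s,
                               %s, %s, %s, %s, %s, %s)""",
                    (server, lineno, int(inode), perms, int(links), owner, group,
                     int(size), int(day), month, year_time, fullname,
                     fullname.rsplit("/", 1)[-1]))
    conn.close()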
Example #4
0
    def __init__(self, filedict, fh, logname=None):
        """
            Construct a DwcaRecordFile from a dict describing the data file
            (the parsed <core>/<extension> element of meta.xml, including the
            <location> tag with the data file name) and a file handle or path
            string pointing to the data file.
        """

        # Avoid setting attributes on self that conflict with attributes in
        # DelimitedFile, to enforce namespace separation.
        if isinstance(filedict["files"]["location"], list):
            for l in filedict["files"]["location"]:
                if fh.endswith(l):
                    self.name = l
                    break
            else:
                raise Exception("Name not found.")
        else:
            self.name = filedict['files']['location']

        if logname:
            logbase = getLogger(logname)
        else:
            logbase = idblogger.getChild('dwca')
        self.logger = logbase.getChild(self.name.split(".")[0])

        fields = {}
        self.linebuf = deque()

        idtag = "id"
        idfld = None
        if 'id' in filedict:
            self.filetype = "core"
            idfld = filedict["id"]
        elif "coreid" in filedict:
            idtag = "coreid"
            idfld = filedict["coreid"]
            self.filetype = "extension"
        else:
            self.filetype = "core"

        if idfld is not None:
            fields[int(idfld['#index'])] = idtag

        rowtype = filedict["#rowType"]
        encoding = filedict.get("#encoding", "UTF-8")
        linesplit = filedict["#linesTerminatedBy"].decode('string_escape')
        fieldsplit = filedict["#fieldsTerminatedBy"].decode('string_escape')
        fieldenc = filedict["#fieldsEnclosedBy"].decode('string_escape')
        ignoreheader = int(filedict.get("#ignoreHeaderLines","0"))

        self.defaults = {}
        if "field" not in filedict:
            filedict["field"] = []
        elif not isinstance(filedict['field'],list):
            filedict['field'] = [filedict['field']]
        for fld in filedict['field']:
            # drop any extra quote characters
            term = fld['#term'].replace("\"","")

            # map full term URIs into short prefixed form (xxx:fieldName),
            # matching the longest namespace first
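            # e.g., assuming the namespaces dict maps the Darwin Core terms URI
            # to "dwc": "http://rs.tdwg.org/dwc/terms/scientificName" becomes
            # "dwc:scientificName"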
            for ns in sorted(namespaces.keys(),key=lambda x: len(x), reverse=True):
                if term.startswith(ns):
                    term = term.replace(ns,namespaces[ns]+":")
                    break
            if '#index' in fld:
                if int(fld['#index']) not in fields:
                    fields[int(fld['#index'])] = term
                else:
                    self.logger.error("Duplicate field index ignored {0}".format(str(fld)))
            if '#default' in fld:
                self.defaults[term] = fld['#default']

        super(DwcaRecordFile, self).__init__(
            fh, encoding=encoding, delimiter=fieldsplit, fieldenc=fieldenc,
            header=fields, rowtype=rowtype, logname=self.logger.name)

        while ignoreheader > 0:
            self._reader.next()
            ignoreheader -= 1
Example #5
0
    def __init__(self,name="dwca.zip",skipeml=False,logname=None):
        self.path = name.split(".")[0]
        if self.path == name:
            self.path += "_extracted"

        if logname:
            logbase = getLogger(logname)
        else:
            logbase = idblogger.getChild('dwca')
        self.logger = logbase.getChild(name.split("/")[-1].split(".")[0])

        try:
            self.archive = zipfile.ZipFile(name, 'r')
            self.archive.extractall(self.path)
        except zipfile.BadZipfile:
            self.logger.fatal("Couldn't extract '%s'", name)
            raise

        root = None
        meta_filename = self.path + "/" + archiveFile(self.archive,"meta.xml")
        try:
            schema_parser = etree.XMLParser(no_network=False)
            # unclear why this parser/network combination is needed here;
            # see https://redmine.idigbio.org/issues/3042
            schema = etree.XMLSchema(etree.parse(DWC_SCHEMA_URL, parser=schema_parser))
            parser = etree.XMLParser(schema=schema, no_network=False)

            with open(meta_filename,'r') as meta:
                try:
                    root = etree.parse(meta, parser=parser).getroot()
                except Exception:
                    self.logger.info("Schema validation failed against '%s', continuing unvalidated.", DWC_SCHEMA_URL)
                    self.logger.debug(traceback.format_exc())
                    meta.seek(0)
                    root = etree.parse(meta).getroot()
        except Exception:
            self.logger.info("Failed to fetch schema '%s', continuing unvalidated.", DWC_SCHEMA_URL)
            self.logger.debug(traceback.format_exc())
            with open(meta_filename,'r') as meta:
                root = etree.parse(meta).getroot()
        rdict = xml2d(root)

        self.archdict = rdict["archive"]

        if not skipeml and "#metadata" in self.archdict:
            metadata = archiveFile(self.archive,self.archdict["#metadata"])
            with open(self.path + "/" + metadata,'r') as mf:
                mdtree = etree.parse(mf).getroot()
                self.metadata = xml2d(mdtree)
        else:
            self.metadata = None

        corefile = archiveFile(self.archive,self.archdict["core"]["files"]["location"])
        self.core = DwcaRecordFile(self.archdict["core"],
                                   self.path + "/" + corefile,
                                   logname=self.logger.name)

        self.extensions = []
        if "extension" in self.archdict:
            if isinstance(self.archdict["extension"],list):
                for x in self.archdict["extension"]:
                    if isinstance(x["files"]["location"], list):
                        for loc in x["files"]["location"]:
                            extfile = archiveFile(self.archive,loc)
                            self.logger.debug("Extension file: %s", extfile)
                            try:
                                self.extensions.append(
                                    DwcaRecordFile(x,
                                                   self.path + "/" + extfile,
                                                   logname=self.logger.name))
                            except Exception:
                                self.logger.error("Failed to load extension file '%s'", extfile)
                                self.logger.debug(traceback.format_exc())
                    else:
                        extfile = archiveFile(self.archive,x["files"]["location"])
                        try:
                            self.extensions.append(
                                DwcaRecordFile(x,
                                               self.path + "/" + extfile,
                                               logname=self.logger.name))
                        except Exception:
                            self.logger.error("Failed to load extension file '%s', skipping.", extfile)
                            self.logger.debug(traceback.format_exc())
            else:
                extfile = archiveFile(self.archive,self.archdict["extension"]["files"]["location"])
                self.extensions.append(
                    DwcaRecordFile(self.archdict["extension"],
                                   self.path + "/" + extfile,
                                   logname=self.logger.name))
Example #6
0
from __future__ import division, absolute_import
from __future__ import print_function

from functools import wraps
import cPickle
from idb.helpers.logging import getLogger
from atomicfile import AtomicFile

logger = getLogger('memoize')

def _memoize_0args(fn):
    "A memoizer for a no arg function; only need a single cell storage"
    @wraps(fn)
    def memo():
        if memo.__cache__ is memo:
            memo.__cache__ = fn()
        return memo.__cache__
    memo.__cache__ = memo
    return memo
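
# Usage sketch (hypothetical function, not from this module): because the memo
# function itself serves as the "empty" sentinel, even a None result is cached
# after the first call.
#
#   @_memoize_0args
#   def get_settings():
#       return read_settings_file()   # hypothetical expensive call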


def _memoize_nargs_error(fn):
    "A memoizer for function w/ arbitrary length arguments."
    memory = {}

    @wraps(fn)
    def memo(*args, **kwargs):
        key = hash((hash(args), hash(tuple(sorted(kwargs.items())))))
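        # NOTE: every positional and keyword argument value must be hashable;
        # unhashable arguments raise TypeError here before the cache lookup.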
        try:
            v = memory[key]
        except KeyError:
Example #7
0
        if do_a_file(r["ceph_bucket"], r["ceph_name"], outdir):
            status = "reconstructed"
        else:
            status = "invalid"
        return update_db(r["ceph_bucket"], r["ceph_name"], status)
    except Exception:
        logger.error("Exception in worker on {0}/{1} {2}".format(
            r["ceph_bucket"], r["ceph_name"], traceback.format_exc()))
        update_db(r["ceph_bucket"], r["ceph_name"], "exception")
        return False


if __name__ == '__main__':

    configure_app_log(2, logfile="./reconstruct.log", journal="auto")
    getLogger('paramiko').setLevel(ERROR)
    logger = getLogger("reconstruct")
    logger.info("Begining reconstruction of objects")

    argparser = argparse.ArgumentParser(
        description="Reconstruct a ceph object from files on disk")
    argparser.add_argument("-b",
                           "--bucket",
                           required=False,
                           help="Bucket name eg 'idigbio-images-prod'")
    argparser.add_argument("-n",
                           "--name",
                           required=False,
                           help="Verify only this one name")
    argparser.add_argument(
        "-o",
Example #8
0
import sys
import argparse
import string
import math
import traceback

from idb.postgres_backend import apidbpool
from idb.helpers.logging import getLogger, configure_app_log
from idb.helpers.storage import IDigBioStorage

BUCKETS = [
    "idigbio-datasets-prod", "idigbio-images-prod", "idigbio-models-prod",
    "idigbio-sounds-prod", "idigbio-static-downloads", "idigbio-video-prod"
]

logger = getLogger("update-ceph-files")


def append_prefixes(bucket, prefix=""):
    """Generate prefixes for bucket files so we can do little sections at a time.
    Remember that files can be named anything, most just happen to be md5sums
    but we still need to go through all possible letters and numbers unless
    we only parallelize buckets that use etags.
    """
    if prefix != "":
        return [{"bucket": bucket, "prefix": prefix}]
    else:
        valid_chars = string.hexdigits[0:-6]  # no capitals, we lowercase
        prefixes = []
        for l_1 in valid_chars:
            for l_2 in valid_chars:
Example #9
0
# Import the old sqlite database that tracked backups of ceph objects into a
# new table in the prod database that will track integrity operations and
# backup status on important ceph buckets.

import os
import sys
from idb.helpers.logging import getLogger, configure_app_log
logger = getLogger("import")
from idb.postgres_backend.db import PostgresDB

# no utf-8 here
import csv

# ceph_* -> comes from listing the ceph bucket
# tsm_* -> from the TSM backup process
# ver_* -> from retrieve-from-ceph verification
# rest_* -> from test restores
create_query = """CREATE TABLE IF NOT EXISTS
ceph_objects (
 ceph_bucket varchar(32) NOT NULL,
 ceph_name varchar(128) NOT NULL,
 ceph_date timestamp without time zone,
 ceph_bytes BIGINT,
 ceph_etag uuid,
 tsm_eligible BOOLEAN,
 tsm_status VARCHAR(16),
 tsm_last_success timestamp without time zone,
 tsm_last_failure timestamp without time zone,
 tsm_bytes BIGINT,
 tsm_path VARCHAR(32),
 ver_status VARCHAR(16),
Example #10
0
import os
import sys

from socket import error as socket_error

from idb.postgres_backend import apidbpool
from idb.helpers.logging import getLogger, configure_app_log
from idb.helpers.storage import IDigBioStorage
from boto.exception import S3ResponseError

TMP_DIR = os.path.join("/tmp", os.path.basename(sys.argv[0]))
STORAGE_HOST = "10.13.44.95:7480"

TEST = False
DELETE = False
STASH = None
DELETED = []

logger = getLogger("verify-ceph-files")


def check_args_and_set_global_flags(args):
    """Check the command-line arguments and set some global flags to control
    processing."""

    global DELETE
    global TEST
    global STASH

    try:
        if args["names_from_file"] is not None:
            logger.error(
                "'--names-from-file' not yet implemented.  Exiting...")
            # Future: verify the file exists and is readable