def __init__(self, fh, encoding="utf8", delimiter=",", fieldenc="\"",
             header=None, rowtype=None, logname=None):
    super(DelimitedFile, self).__init__()

    self.encoding = encoding
    self.fieldenc = fieldenc
    self.delimiter = delimiter
    self.rowtype = rowtype
    self.lineCount = 0
    self.lineLength = None

    # Accept either a path or an open file handle; record the name either way.
    if isinstance(fh, str) or isinstance(fh, unicode):
        self.name = fh
    else:
        self.name = fh.name
    # Open by name so both cases work (io.open() rejects file objects).
    # "flag_error" is assumed to be a custom error handler registered
    # elsewhere via codecs.register_error().
    self.filehandle = io.open(self.name, "r", encoding=encoding,
                              errors="flag_error")

    if logname is None:
        self.logger = idblogger.getChild('df')
    else:
        self.logger = getLogger(logname)

    # "csv" is assumed to be unicodecsv, which accepts an encoding kwarg.
    encoded_lines = (l.encode("utf-8") for l in self.filehandle)
    if self.fieldenc is None or self.fieldenc == "":
        self._reader = csv.reader(encoded_lines, encoding="utf-8",
                                  delimiter=self.delimiter,
                                  quoting=csv.QUOTE_NONE)
    else:
        self._reader = csv.reader(encoded_lines, encoding="utf-8",
                                  delimiter=self.delimiter,
                                  quotechar=self.fieldenc)

    # Tally canonical field types so the row type can be guessed if not given.
    t = defaultdict(int)
    if header is not None:
        self.fields = header
        for k, v in header.items():
            cn = get_canonical_name(v)
            t[cn[1]] += 1
    else:
        headerline = self._reader.next()
        self.lineLength = len(headerline)
        self.fields = {}
        for k, v in enumerate(headerline):
            cn = get_canonical_name(v)
            if cn[0] is not None:
                t[cn[1]] += 1
                self.fields[k] = cn[0]

    if self.rowtype is None:
        # Pick the most frequent canonical type, ties broken by name.
        items = t.items()
        items.sort(key=lambda item: (item[1], item[0]), reverse=True)
        self.rowtype = items[0][0]
        self.logger.info("Setting row type to %s", self.rowtype)
    elif self.rowtype in types:
        self.rowtype = types[self.rowtype]["shortname"]
    else:
        raise TypeError("{} not mapped to short name".format(self.rowtype))
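# --- Hedged usage sketch (not from the original module) ---------------------
# Construct a DelimitedFile from a path and inspect the inferred metadata.
# "occurrence.csv" is a hypothetical file; the class's module is assumed to
# provide io, unicodecsv-as-csv, defaultdict, get_canonical_name, types, and
# idblogger.
#
#   df = DelimitedFile("occurrence.csv", delimiter=",", fieldenc="\"")
#   df.logger.info("inferred row type %s", df.rowtype)
#   for idx in sorted(df.fields):
#       df.logger.info("column %s -> %s", idx, df.fields[idx])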
import logging
from datetime import datetime
from collections import Counter
from functools import wraps

import boto3
import botocore.exceptions

from idb import config
from idb.helpers.memoize import memoized, filecached
from idb.helpers.storage import IDigBioStorage
from idb.helpers.logging import getLogger, configure_app_log
from idb.postgres_backend import apidbpool, cursor
from idigbio_ingestion.mediaing.fetcher import FetchItem

logger = getLogger("cephwalker")
store = IDigBioStorage()


@memoized()
def s3connection():
    return boto3.resource(
        's3',
        aws_access_key_id=config.IDB_STORAGE_ACCESS_KEY,
        aws_secret_access_key=config.IDB_STORAGE_SECRET_KEY,
        endpoint_url="https://s.idigbio.org")


def process_keys(fn, keylist, poolsize=20):
    def wkfn(k):
import os
import sys

from idb.helpers.logging import getLogger, configure_app_log
logger = getLogger("restore")

from idb.postgres_backend.db import PostgresDB

# When Ceph chewed up a bunch of objects (ticket #2605), we figured out how to
# rebuild them by searching for the file parts on disk. We generated lists of
# files on all ceph nodes with:
#
#   find /srv/ceph -ls > <text file>
#
# which we then import below into a postgres table in idb-api-beta for rapid
# lookups and searching.
#
# Table was created manually with:
#
# \connect idb_api_beta
# CREATE TABLE ceph_server_files (
#     server     VARCHAR(16) NOT NULL,
#     line       INTEGER,
#     unk        INTEGER,
#     perms      VARCHAR(16),
#     unk2       INTEGER,
#     owner_name VARCHAR(16),
#     group_name VARCHAR(16),
#     size       BIGINT,
#     day        INTEGER,
#     month      VARCHAR(3),
#     year_time  VARCHAR(8),
#     fullname   TEXT NOT NULL,
#     filename   TEXT NOT NULL
# );
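# Hedged sketch of importing one "find /srv/ceph -ls" listing into the table
# above. The column order follows the CREATE TABLE; mapping find's inode and
# nlink fields onto the unk/unk2 columns is an assumption, the server name
# and listing path are hypothetical, and PostgresDB is assumed to expose
# execute()/commit() as used elsewhere in this codebase.
def _load_find_ls_sketch(server, listing_path):
    db = PostgresDB()
    with open(listing_path, "r") as f:
        for lineno, raw in enumerate(f, start=1):
            parts = raw.split(None, 10)  # find -ls emits 11 whitespace fields
            if len(parts) < 11:
                continue  # skip blank or malformed lines
            (inode, blocks, perms, links, owner, group,
             size, month, day, year_time, fullname) = parts
            fullname = fullname.strip()
            db.execute(
                """INSERT INTO ceph_server_files
                   (server, line, unk, perms, unk2, owner_name, group_name,
                    size, day, month, year_time, fullname, filename)
                   VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                (server, lineno, int(inode), perms, int(links), owner, group,
                 int(size), int(day), month, year_time, fullname,
                 fullname.split("/")[-1]))
    db.commit()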
def __init__(self, filedict, fh, logname=None):
    """
        Construct a DwcaRecordFile from an xml tree pointer to the
        <location> tag containing the data file name, and a file handle
        or string pointing to the data file.
    """
    # Avoid setting attributes on self that conflict with attributes in
    # DelimitedFile, to enforce namespace separation.

    if isinstance(filedict["files"]["location"], list):
        for l in filedict["files"]["location"]:
            if fh.endswith(l):
                self.name = l
                break
        else:
            raise Exception("Name not found.")
    else:
        self.name = filedict['files']['location']

    if logname:
        logbase = getLogger(logname)
    else:
        logbase = idblogger.getChild('dwca')
    self.logger = logbase.getChild(self.name.split(".")[0])

    fields = {}
    self.linebuf = deque()

    # A "core" file carries an <id> column; an "extension" file carries
    # <coreid> pointing back at the core.
    idtag = "id"
    idfld = None
    if 'id' in filedict:
        self.filetype = "core"
        idfld = filedict["id"]
    elif "coreid" in filedict:
        idtag = "coreid"
        idfld = filedict["coreid"]
        self.filetype = "extension"
    else:
        self.filetype = "core"

    if idfld is not None:
        fields[int(idfld['#index'])] = idtag

    rowtype = filedict["#rowType"]
    encoding = filedict.get("#encoding", "UTF-8")
    linesplit = filedict["#linesTerminatedBy"].decode('string_escape')
    fieldsplit = filedict["#fieldsTerminatedBy"].decode('string_escape')
    fieldenc = filedict["#fieldsEnclosedBy"].decode('string_escape')
    ignoreheader = int(filedict.get("#ignoreHeaderLines", "0"))

    self.defaults = {}
    if "field" not in filedict:
        filedict["field"] = []
    elif not isinstance(filedict['field'], list):
        filedict['field'] = [filedict['field']]
    for fld in filedict['field']:
        # drop any extra quote characters
        term = fld['#term'].replace("\"", "")
        # map xmp namespaces into short code form (xxx:fieldName),
        # longest namespaces first
        for ns in sorted(namespaces.keys(), key=lambda x: len(x), reverse=True):
            if term.startswith(ns):
                term = term.replace(ns, namespaces[ns] + ":")
                break
        if '#index' in fld:
            if int(fld['#index']) not in fields:
                fields[int(fld['#index'])] = term
            else:
                self.logger.error("Duplicate field index ignored %s", str(fld))
        if '#default' in fld:
            self.defaults[term] = fld['#default']

    super(DwcaRecordFile, self).__init__(
        fh, encoding=encoding, delimiter=fieldsplit, fieldenc=fieldenc,
        header=fields, rowtype=rowtype, logname=self.logger.name)

    # Skip declared header lines before handing rows to callers.
    while ignoreheader > 0:
        self._reader.next()
        ignoreheader -= 1
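# Hedged illustration of the filedict shape this constructor expects, as
# parsed from a DwC-A meta.xml (keys follow the xml2d conventions used in
# this module; the concrete terms and file name are hypothetical):
#
#   filedict = {
#       "files": {"location": "occurrence.txt"},
#       "id": {"#index": "0"},
#       "#rowType": "http://rs.tdwg.org/dwc/terms/Occurrence",
#       "#encoding": "UTF-8",
#       "#linesTerminatedBy": "\\n",
#       "#fieldsTerminatedBy": "\\t",
#       "#fieldsEnclosedBy": "",
#       "#ignoreHeaderLines": "1",
#       "field": [
#           {"#index": "1",
#            "#term": "http://rs.tdwg.org/dwc/terms/scientificName"},
#           {"#term": "http://rs.tdwg.org/dwc/terms/basisOfRecord",
#            "#default": "PreservedSpecimen"},
#       ],
#   }
#   rf = DwcaRecordFile(filedict, "extracted/occurrence.txt")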
def __init__(self, name="dwca.zip", skipeml=False, logname=None):
    self.path = name.split(".")[0]
    if self.path == name:
        self.path += "_extracted"

    if logname:
        logbase = getLogger(logname)
    else:
        logbase = idblogger.getChild('dwca')
    self.logger = logbase.getChild(name.split("/")[-1].split(".")[0])

    try:
        self.archive = zipfile.ZipFile(name, 'r')
        self.archive.extractall(self.path)
    except zipfile.BadZipfile:
        self.logger.fatal("Couldn't extract '%s'", name)
        raise

    root = None
    meta_filename = self.path + "/" + archiveFile(self.archive, "meta.xml")
    try:
        # wut is going on. see https://redmine.idigbio.org/issues/3042
        schema_parser = etree.XMLParser(no_network=False)
        schema = etree.XMLSchema(etree.parse(DWC_SCHEMA_URL, parser=schema_parser))
        parser = etree.XMLParser(schema=schema, no_network=False)
        with open(meta_filename, 'r') as meta:
            try:
                root = etree.parse(meta, parser=parser).getroot()
            except:
                self.logger.info(
                    "Schema validation failed against '%s', continuing unvalidated.",
                    DWC_SCHEMA_URL)
                self.logger.debug(traceback.format_exc())
                meta.seek(0)
                root = etree.parse(meta).getroot()
    except:
        self.logger.info("Failed to fetch schema '%s', continuing unvalidated.",
                         DWC_SCHEMA_URL)
        self.logger.debug(traceback.format_exc())
        with open(meta_filename, 'r') as meta:
            root = etree.parse(meta).getroot()

    rdict = xml2d(root)
    self.archdict = rdict["archive"]

    if not skipeml and "#metadata" in self.archdict:
        metadata = archiveFile(self.archive, self.archdict["#metadata"])
        with open(self.path + "/" + metadata, 'r') as mf:
            mdtree = etree.parse(mf).getroot()
            self.metadata = xml2d(mdtree)
    else:
        self.metadata = None

    corefile = archiveFile(self.archive, self.archdict["core"]["files"]["location"])
    self.core = DwcaRecordFile(self.archdict["core"],
                               self.path + "/" + corefile,
                               logname=self.logger.name)

    self.extensions = []
    if "extension" in self.archdict:
        if isinstance(self.archdict["extension"], list):
            for x in self.archdict["extension"]:
                if isinstance(x["files"]["location"], list):
                    for loc in x["files"]["location"]:
                        extfile = archiveFile(self.archive, loc)
                        self.logger.debug("Adding extension file %s", extfile)
                        try:
                            self.extensions.append(
                                DwcaRecordFile(x, self.path + "/" + extfile,
                                               logname=self.logger.name))
                        except:
                            self.logger.debug(traceback.format_exc())
                else:
                    extfile = archiveFile(self.archive, x["files"]["location"])
                    try:
                        self.extensions.append(
                            DwcaRecordFile(x, self.path + "/" + extfile,
                                           logname=self.logger.name))
                    except:
                        self.logger.debug(traceback.format_exc())
        else:
            extfile = archiveFile(self.archive,
                                  self.archdict["extension"]["files"]["location"])
            self.extensions.append(
                DwcaRecordFile(self.archdict["extension"],
                               self.path + "/" + extfile,
                               logname=self.logger.name))
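# Hedged usage sketch (assumes this constructor belongs to the archive class
# of this module, here called Dwca, and that "specimens.zip" is a
# hypothetical DwC-A download):
#
#   dwca = Dwca("specimens.zip", skipeml=True, logname="idb")
#   for row in dwca.core:
#       pass  # rows come from the core DwcaRecordFile
#   for ext in dwca.extensions:
#       dwca.logger.info("extension %s rowtype %s", ext.name, ext.rowtype)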
from __future__ import division, absolute_import
from __future__ import print_function

from functools import wraps
import cPickle

from idb.helpers.logging import getLogger
from atomicfile import AtomicFile

logger = getLogger('memoize')


def _memoize_0args(fn):
    "A memoizer for a no-arg function; only needs a single cell of storage"
    @wraps(fn)
    def memo():
        # The wrapper itself is the "not yet computed" sentinel, so fn() may
        # legitimately return None and still be cached.
        if memo.__cache__ is memo:
            memo.__cache__ = fn()
        return memo.__cache__
    memo.__cache__ = memo
    return memo


def _memoize_nargs_error(fn):
    "A memoizer for functions w/ arbitrary-length arguments."
    memory = {}

    @wraps(fn)
    def memo(*args, **kwargs):
        # NB: relies on all argument values being hashable.
        key = hash((hash(args), hash(tuple(sorted(kwargs.items())))))
        try:
            v = memory[key]
        except KeyError:
            # Assumed completion -- the original is truncated here; compute,
            # cache, and return, mirroring the single-cell memoizer above.
            v = memory[key] = fn(*args, **kwargs)
        return v
    return memo
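# Hedged usage sketch of the decorators above (expensive_lookup and its body
# are hypothetical; the module's public memoized() wrapper is assumed to
# dispatch to these helpers):
#
#   @_memoize_0args
#   def connection():
#       return make_connection()    # computed once, then cached forever
#
#   @_memoize_nargs_error
#   def expensive_lookup(bucket, name):
#       return fetch(bucket, name)  # cached per (args, kwargs) key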
        if do_a_file(r["ceph_bucket"], r["ceph_name"], outdir):
            status = "reconstructed"
        else:
            status = "invalid"
        return update_db(r["ceph_bucket"], r["ceph_name"], status)
    except:
        logger.error("Exception in worker on {0}/{1} {2}".format(
            r["ceph_bucket"], r["ceph_name"], traceback.format_exc()))
        update_db(r["ceph_bucket"], r["ceph_name"], "exception")
        return False


if __name__ == '__main__':
    configure_app_log(2, logfile="./reconstruct.log", journal="auto")
    getLogger('paramiko').setLevel(ERROR)
    logger = getLogger("reconstruct")
    logger.info("Beginning reconstruction of objects")

    argparser = argparse.ArgumentParser(
        description="Reconstruct a ceph object from files on disk")
    argparser.add_argument("-b", "--bucket", required=False,
                           help="Bucket name eg 'idigbio-images-prod'")
    argparser.add_argument("-n", "--name", required=False,
                           help="Verify only this one name")
    argparser.add_argument(
        "-o",
import sys
import argparse
import string
import math
import traceback

from idb.postgres_backend import apidbpool
from idb.helpers.logging import getLogger, configure_app_log
from idb.helpers.storage import IDigBioStorage

BUCKETS = [
    "idigbio-datasets-prod",
    "idigbio-images-prod",
    "idigbio-models-prod",
    "idigbio-sounds-prod",
    "idigbio-static-downloads",
    "idigbio-video-prod",
]

logger = getLogger("update-ceph-files")


def append_prefixes(bucket, prefix=""):
    """Generate prefixes for bucket files so we can do little sections at a time.

    Remember that files can be named anything; most just happen to be md5sums,
    but we still need to go through all possible letters and numbers unless we
    only parallelize buckets that use etags.
    """
    if prefix != "":
        return [{"bucket": bucket, "prefix": prefix}]
    else:
        valid_chars = string.hexdigits[0:-6]  # no capitals, we lowercase
        prefixes = []
        for l_1 in valid_chars:
            for l_2 in valid_chars:
                # Assumed completion (the original is truncated here): emit
                # one work unit per two-character hex prefix.
                prefixes.append({"bucket": bucket, "prefix": l_1 + l_2})
        return prefixes
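# Hedged usage sketch: fan a bucket out into per-prefix work units (256
# two-character hex prefixes under the assumed completion above):
#
#   units = append_prefixes("idigbio-images-prod")
#   len(units)    # -> 256
#   units[0]      # -> {"bucket": "idigbio-images-prod", "prefix": "00"}
#   append_prefixes("idigbio-images-prod", "ab")
#                 # -> a single unit covering just that prefix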
# Import the old sqlite database that tracked backups of ceph objects into
# a new table in the prod database that will track integrity operations
# and backup status on important ceph buckets.
import os
import sys

from idb.helpers.logging import getLogger, configure_app_log
logger = getLogger("import")

from idb.postgres_backend.db import PostgresDB

import csv  # stdlib csv; no utf-8 handling here

# Column prefixes:
#   ceph_* -> comes from listing the ceph bucket
#   tsm_*  -> from the TSM backup process
#   ver_*  -> from retrieve-from-ceph verification
#   rest_* -> from test restores
create_query = """CREATE TABLE IF NOT EXISTS ceph_objects (
    ceph_bucket varchar(32) NOT NULL,
    ceph_name varchar(128) NOT NULL,
    ceph_date timestamp without time zone,
    ceph_bytes BIGINT,
    ceph_etag uuid,
    tsm_eligible BOOLEAN,
    tsm_status VARCHAR(16),
    tsm_last_success timestamp without time zone,
    tsm_last_failure timestamp without time zone,
    tsm_bytes BIGINT,
    tsm_path VARCHAR(32),
    ver_status VARCHAR(16),
from socket import error as socket_error

from idb.postgres_backend import apidbpool
from idb.helpers.logging import getLogger, configure_app_log
from idb.helpers.storage import IDigBioStorage
from boto.exception import S3ResponseError

TMP_DIR = os.path.join("/tmp", os.path.basename(sys.argv[0]))
STORAGE_HOST = "10.13.44.95:7480"
TEST = False
DELETE = False
STASH = None
DELETED = []

logger = getLogger("verify-ceph-files")


def check_args_and_set_global_flags(args):
    """Check the command-line arguments and set some global flags to
    control processing."""
    global DELETE
    global TEST
    global STASH

    try:
        if args["names_from_file"] is not None:
            logger.error(
                "'--names-from-file' not yet implemented. Exiting...")
            # Future: verify the file exists and is readable