def test_log_bad_level(self):
    os.environ['LOG_LEVEL'] = 'bad_level'
    logger = createLog('tester')
    level = logger.getEffectiveLevel()
    self.assertEqual(level, logging.WARNING)
    del os.environ['LOG_LEVEL']

def test_log_debug(self):
    os.environ['LOG_LEVEL'] = 'debug'
    logger = createLog('tester')
    level = logger.getEffectiveLevel()
    self.assertEqual(level, logging.DEBUG)
    del os.environ['LOG_LEVEL']
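# The two tests above assume a createLog() helper that reads LOG_LEVEL from
# the environment and falls back to WARNING for missing or unrecognized
# values. The sketch below is a minimal illustration of that behavior; the
# level map and default are assumptions, not the project's implementation.
import logging
import os


def createLog(module):
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR
    }
    logger = logging.getLogger(module)
    requestedLevel = os.environ.get('LOG_LEVEL', 'warning').lower()
    # Unknown values fall back to WARNING, matching test_log_bad_level
    logger.setLevel(levels.get(requestedLevel, logging.WARNING))
    return logger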
def __init__(self, isbn):
    self.isbn = isbn
    self.logger = createLog('unglueit')
    self.unglueitSearch = os.environ['ISBN_LOOKUP']
    self.unglueitFetch = os.environ['OPDS_LOOKUP']
    self.unglueitUser = os.environ['USERNAME']
    self.unglueitApiKey = os.environ['API_KEY']
    self.logger.info('Fetching summary for {}'.format(self.isbn))
def __init__(self, queryName, queryType):
    self.queryName = queryName
    self.queryType = queryType.lower() if queryType else 'personal'
    self.validateType()
    self.logger = createLog('viafSearch')
    self.viaf_endpoint = os.environ['VIAF_API']
    self.redis = redis.Redis(
        host=os.environ['REDIS_ARN'],
        port=6379,
        socket_timeout=5
    )
def main():
    os.environ["COLLECTIVEACCESS_HOME"] = "/data/idigpaleo/admin"
    config = json.load(open('./config.json'))

    # Get the parameters provided in the args
    parser = ingestHelpers.createParser()
    args = parser.parse_args()
    ingestAll = args.ALL
    ingestSources = args.sources
    ingestTargets = args.projects
    testRun = args.test
    logLevel = args.logLevel

    # Create the log
    logger = logHelpers.createLog('ingest', logLevel)
    logger.info("Starting " + str(ingestTargets) + " ingest")

    # If we just want a list of institutions and targets, do that
    if args.listInstitutions:
        sources = config['sources']
        for source in sources:
            sys.stdout.write(source + "\nPossible Target Projects: ")
            for project in sources[source]["validTargets"]:
                sys.stdout.write(project + " ")
            sys.stdout.write("\n")
        return True

    # If we don't have any sources or targets throw errors
    if (ingestSources is None or ingestTargets is None) and ingestAll is False:
        logger.error("You must define at least one source and one target")
        sys.exit(1)

    # If the --ALL flag got set, disregard anything else and do a full import
    if ingestAll is True:
        logger.info("Importing from all sources for all targets")
        ingestTargets = config['targetProjects']
        ingestSources = []
        for source in config['sources']:
            ingestSources.append(source)

    # Start the relevant imports
    for source in ingestSources:
        logger.info("Starting ingest for " + source)
        importCSVs = coreIngest.createCSVSource(source, ingestTargets, testRun)
        logger.debug(importCSVs)
        importResults = coreIngest.importCSVFiles(source, ingestTargets, importCSVs)
        if importResults is True:
            logger.info(
                "Successfully completed import. Check logs for partial "
                "import failures or skipped datasets"
            )
        else:
            logger.error("Import was not completed!")
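# main() above relies on an argument parser returned by
# ingestHelpers.createParser(). The sketch below is a hypothetical
# reconstruction based only on the attributes main() reads (args.ALL,
# args.sources, args.projects, args.test, args.logLevel,
# args.listInstitutions); the real flag names and help text may differ.
import argparse


def createParser():
    parser = argparse.ArgumentParser(description='Ingest collection data')
    parser.add_argument('--ALL', action='store_true',
                        help='Import every source for every target project')
    parser.add_argument('--sources', nargs='+',
                        help='Source institutions to ingest from')
    parser.add_argument('--projects', nargs='+',
                        help='Target projects to ingest into')
    parser.add_argument('--test', action='store_true',
                        help='Run the ingest without writing results')
    parser.add_argument('--logLevel', default='info',
                        help='Log level for this run')
    parser.add_argument('--listInstitutions', action='store_true',
                        help='List sources and their valid targets, then exit')
    return parser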
class KModel:
    LOGGER = createLog('kMeans')

    def __init__(self, instances):
        self.instances = instances
        self.df = None
        self.clusters = defaultdict(list)

    def createPipeline(self):
        return Pipeline([
            ('union', FeatureUnion(
                transformer_list=[
                    ('place', Pipeline([
                        ('selector', TextSelector(key='place')),
                        ('tfidf', TfidfVectorizer(
                            preprocessor=KModel.pubProcessor,
                            stop_words='english',
                            strip_accents='unicode',
                            analyzer='char_wb',
                            ngram_range=(2, 4)
                        ))
                    ])),
                    ('publisher', Pipeline([
                        ('selector', TextSelector(key='publisher')),
                        ('tfidf', TfidfVectorizer(
                            preprocessor=KModel.pubProcessor,
                            stop_words='english',
                            strip_accents='unicode',
                            analyzer='char_wb',
                            ngram_range=(2, 4)
                        ))
                    ])),
                    ('date', Pipeline([
                        ('selector', NumberSelector(key='pubDate')),
                        ('scaler', MinMaxScaler())
                    ]))
                ],
                transformer_weights={
                    'place': 0.5,
                    'publisher': 1.0,
                    'date': 2.0
                }
            )),
            ('kmeans', KMeans(
                n_clusters=self.currentK,
                n_jobs=-1
            ))
        ])

    @classmethod
    def pubProcessor(cls, raw):
        if isinstance(raw, list):
            raw = ', '.join(filter(None, raw))
        if raw is not None:
            raw = raw.replace('&', 'and')
            cleanStr = raw.translate(
                str.maketrans('', '', string.punctuation)
            ).lower()
            cleanStr = cleanStr\
                .replace('sn', '')\
                .replace('place of publication not identified', '')\
                .replace('publisher not identified', '')
            cleanStr = re.sub(r'\s+', ' ', cleanStr)
            cls.LOGGER.debug('Cleaned string {} to {} for processing'.format(
                raw, cleanStr
            ))
            return cleanStr

        cls.LOGGER.debug('Unable to clean NoneType, returning empty string')
        return ''

    def createDF(self):
        self.LOGGER.info('Generating DataFrame from instance data')
        self.df = pd.DataFrame([
            {
                'place': i.pub_place if i.pub_place else '',
                'publisher': KModel.getPublisher(i.agent_instances),
                'pubDate': KModel.getPubDateFloat(i.dates),
                'edition': i.edition_statement,
                'volume': i.volume,
                'table_of_contents': i.table_of_contents,
                'extent': i.extent,
                'summary': i.summary,
                'rowID': i.id
            }
            for i in self.instances if KModel.emptyInstance(i)
        ])

        self.maxK = len(self.df.index) if len(self.df.index) > 1 else 2
        if self.maxK > 1000:
            self.maxK = int(self.maxK * (2/9))
        elif self.maxK > 500:
            self.maxK = int(self.maxK * (3/9))
        elif self.maxK > 250:
            self.maxK = int(self.maxK * (4/9))

    @staticmethod
    def emptyInstance(instance):
        return bool(
            instance.pub_place or
            KModel.getPubDateFloat(instance.dates) or
            KModel.getPublisher(instance.agent_instances)
        )

    @classmethod
    def getPubDateFloat(cls, dates):
        for d in dates:
            if d.date_type == 'pub_date' and d.date_range:
                cls.LOGGER.debug('Found publication date {}'.format(
                    d.display_date
                ))
                lowerYear = d.date_range.lower.year if d.date_range.lower else None
                upperYear = d.date_range.upper.year if d.date_range.upper else None
                if lowerYear and upperYear:
                    return (upperYear + lowerYear) / 2
                elif lowerYear:
                    return lowerYear
                elif upperYear:
                    return upperYear

        cls.LOGGER.debug('Unable to locate publication date')
        return 0

    @classmethod
    def getPublisher(cls, agent_rels):
        publishers = [
            a.agent.name for a in agent_rels if a.role == 'publisher'
        ]
        return '; '.join(sorted(list(set(publishers))))

    def generateClusters(self):
        self.LOGGER.info('Generating Clusters from instances')

        try:
            # Calculate the step for the first run at determining k
            # Use the natural log of the value to get a reasonable scale
            # for different values
            step = int(np.log(self.maxK)**1.5 - 1) if np.log(self.maxK) > 1.6 else 1

            # First pass at finding best value for k, using the step value
            # derived above
            self.getK(1, self.maxK, step)

            # Get narrower band of possible k values, based off the initial
            # step value
            startK = self.k - (step - 1) if self.k > (step - 1) else 1
            stopK = self.k + step if (self.k + step) <= self.maxK else self.maxK

            # Get the final k value by iterating through the much narrower
            # range returned above
            self.getK(startK, stopK, 1)
            self.LOGGER.debug('Setting K to {}'.format(self.k))
        except ZeroDivisionError:
            self.LOGGER.debug('Single instance found setting K to 1')
            self.k = 1

        try:
            labels = self.cluster(self.k)
        except ValueError as err:
            labels = [0] * len(self.instances)

        for n, item in enumerate(labels):
            try:
                self.clusters[item].append(self.df.loc[[n]])
            except KeyError:
                continue

    def getK(self, start, stop, step):
        self.LOGGER.info('Calculating number of clusters, max {}'.format(
            self.maxK
        ))
        warnings.filterwarnings('error', category=ConvergenceWarning)

        wcss = []
        for i in range(start, stop, step):
            try:
                wcss.append((self.cluster(i, score=True), i))
            except ConvergenceWarning:
                self.LOGGER.info('Exceeded number of distinct clusters, break')
                break
            except ValueError:
                self.k = 1
                return None

        x1, y1 = wcss[0][1], wcss[0][0]
        x2, y2 = wcss[len(wcss) - 1][1], wcss[len(wcss) - 1][0]

        distances = []
        denominator = sqrt((y2 - y1)**2 + (x2 - x1)**2)
        for i in range(len(wcss)):
            x0 = i + 1
            y0 = wcss[i][0]
            numerator = abs((y2 - y1)*x0 - (x2 - x1)*y0 + x2*y1 - y2*x1)
            distances.append(numerator/denominator)

        finalStart = 1 if start < 2 else start + 1
        self.k = distances.index(max(distances)) + finalStart
        return None

    def cluster(self, k, score=False):
        self.currentK = k
        self.LOGGER.info('Generating cluster for k={}'.format(k))
        pipeline = self.createPipeline()

        if score is True:
            self.LOGGER.debug('Returning score for n_clusters estimation')
            pipeline.fit(self.df)
            return pipeline['kmeans'].inertia_
        else:
            self.LOGGER.debug('Returning model prediction')
            return pipeline.fit_predict(self.df)

    def parseEditions(self):
        eds = []
        self.LOGGER.info('Generating editions from clusters')

        for clust in dict(self.clusters):
            yearEds = defaultdict(list)
            self.LOGGER.info('Parsing cluster {}'.format(clust))
            for ed in self.clusters[clust]:
                self.LOGGER.info('Adding instance to {} edition'.format(
                    ed.iloc[0]['pubDate']
                ))
                yearEds[ed.iloc[0]['pubDate']].append({
                    'pubDate': ed.iloc[0]['pubDate'],
                    'publisher': ed.iloc[0]['publisher'],
                    'pubPlace': ed.iloc[0]['place'],
                    'rowID': ed.iloc[0]['rowID'],
                    'edition': ed.iloc[0]['edition'],
                    'volume': ed.iloc[0]['volume'],
                    'table_of_contents': ed.iloc[0]['table_of_contents'],
                    'extent': ed.iloc[0]['extent'],
                    'summary': ed.iloc[0]['summary']
                })

            eds.extend([(year, data) for year, data in yearEds.items()])

        eds.sort(key=lambda x: x[0])
        return eds
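# A minimal usage sketch for the class above. It assumes `instances` is a
# list of ORM instance records exposing the attributes read in createDF()
# (pub_place, dates, agent_instances, etc.); everything beyond the KModel
# method names is illustrative.
model = KModel(instances)
model.createDF()           # build the DataFrame and derive maxK
model.generateClusters()   # estimate k via the elbow heuristic, then cluster
for pubYear, editionRecords in model.parseEditions():
    print(pubYear, len(editionRecords))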
import requests
import os

from lxml import etree

from helpers.errorHelpers import OCLCError, DataError
from helpers.logHelpers import createLog
from lib.outputManager import OutputManager

logger = createLog('classify_read')

NAMESPACE = {
    None: 'http://classify.oclc.org'
}


def classifyRecord(searchType, searchFields, workUUID, start=0):
    """Generates a query for the OCLC Classify service and returns the raw
    XML response received from that service. This method takes the following
    arguments:
    - searchType: identifier|authorTitle
    - searchFields: identifier+idType|authors+title
    - workUUID: UUID of the parent work record
    - start: query start position, defaulting to 0
    """
    try:
        classifyQuery = QueryManager(
            searchType,
            searchFields.get('identifier', None),
            searchFields.get('idType', None),
            searchFields.get('authors', None),
            searchFields.get('title', None),
            start
        )
from base64 import b64decode
from binascii import Error as base64Error
import boto3
from botocore.exceptions import ClientError
import os
import yaml

from helpers.logHelpers import createLog

logger = createLog('configHelpers')


def loadEnvFile(runType, fileString):
    envDict = None
    fileLines = []

    if fileString:
        openFile = fileString.format(runType)
    else:
        openFile = 'config.yaml'

    try:
        with open(openFile) as envStream:
            try:
                envDict = yaml.safe_load(envStream)
            except yaml.YAMLError as err:
                logger.error('{} Invalid! Please review'.format(openFile))
                raise err

            envStream.seek(0)
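# Illustrative call, not taken from the source: clientHelpers below unpacks
# two return values from loadEnvFile(), so this sketch assumes the function
# ultimately returns the parsed YAML dict alongside the raw file lines. The
# 'config-{}.yaml' filename pattern and the 'region' key are assumptions.
envDict, envLines = loadEnvFile('development', 'config-{}.yaml')
print(envDict['region'])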
import subprocess
import sys
import os
import re

from helpers.logHelpers import createLog
from helpers.errorHelpers import InvalidExecutionType
from helpers.clientHelpers import createEventMapping
from helpers.configHelpers import setEnvVars

logger = createLog('runScripts')


def main():
    """Invoked by the makefile's arguments, controls the overall execution
    of the Lambda function. h/t to nonword for inspiration to use a makefile.

    Raises:
        InvalidExecutionType: If the args do not contain a valid execution
        type, raise an error.
    """
    if len(sys.argv) != 2:
        logger.warning('This script takes one, and only one, argument!')
        sys.exit(1)

    runType = sys.argv[1]

    if re.match(r'^(?:local|development|qa|production)', runType):
        logger.info('Deploying lambda to {} environment'.format(runType))
        setEnvVars(runType)
        subprocess.run([
            'lambda',
            'deploy',
            '--config-file',
            'run_config.yaml',
import boto3
import json

from helpers.logHelpers import createLog
from helpers.errorHelpers import InvalidExecutionType
from helpers.configHelpers import loadEnvFile

logger = createLog('clientHelpers')


def createAWSClient(service, configDict=None):
    if configDict is None:
        configDict, configLines = loadEnvFile(None, None)

    clientKwargs = {
        'region_name': configDict['region']
    }

    if (
        'aws_access_key_id' in configDict
        and configDict['aws_access_key_id'] is not None
    ):
        clientKwargs['aws_access_key_id'] = configDict['aws_access_key_id']
        clientKwargs['aws_secret_access_key'] = configDict['aws_secret_access_key']  # noqa: E501

    lambdaClient = boto3.client(
        service,
        **clientKwargs
    )
from sfrCore import SessionManager

from lib.fetchers.openLibraryFetcher import OLSessionManager
from lib.coverManager import CoverManager
from helpers.logHelpers import createLog

# The logger can be passed the name of the current module
# It can also be instantiated on a class/method basis using dot notation
logger = createLog('handler')

"""This block will create the database if necessary and otherwise run any
new migrations. This is placed here because Lambdas will "freeze" any methods
that are executed before the main handler block, meaning that we can run
migrations and generate a db connection for multiple invocations, at least
until AWS decides to regenerate the container.
"""
MANAGER = SessionManager()
MANAGER.generateEngine()

OL_MANAGER = OLSessionManager()
OL_MANAGER.generateEngine()


def handler(event, context):
    """Method invoked by the Lambda event. Verifies that records were
    received and, if so, passes them to be parsed."""
    logger.debug('Starting Lambda Execution')

    coverManager = CoverManager(MANAGER, OL_MANAGER)
    coverManager.getInstancesForSearch()
    coverManager.getCoversForInstances()
import os

from elasticsearch import Elasticsearch
from elasticsearch.exceptions import (ConnectionError, TransportError,
                                      ConflictError)
from elasticsearch_dsl import connections
from elasticsearch_dsl.wrappers import Range
from sqlalchemy.orm import configure_mappers

from model.elasticModel import (Work, Instance, Subject, Identifier, Agent,
                                Language, Rights)
from helpers.logHelpers import createLog
from helpers.errorHelpers import ESError

logger = createLog('es_manager')


class ESConnection():
    def __init__(self):
        self.index = os.environ['ES_INDEX']
        self.client = None
        self.tries = 0
        self.batch = []

        self.createElasticConnection()
        self.createIndex()

        configure_mappers()

    def createElasticConnection(self):
from datetime import datetime
import os

from sfrCore import Item, Instance, Identifier

from lib.importers.abstractImporter import AbstractImporter
from helpers.logHelpers import createLog

logger = createLog('itemImporter')


class ItemImporter(AbstractImporter):
    def __init__(self, record, session, kinesisMsgs, sqsMsgs):
        self.data = record['data']
        self.item = None
        self.kinesisMsgs = kinesisMsgs
        self.sqsMsgs = sqsMsgs
        self.logger = self.createLogger()
        super().__init__(record, session)

    @property
    def identifier(self):
        return self.item.id

    def lookupRecord(self):
        self.logger.info('Ingesting item record')

        itemID = Identifier.getByIdentifier(
            Item,
            self.session,
            self.data.get('identifiers', [])
        )
from datetime import datetime
import json
import os

from sfrCore import Instance, Identifier

from lib.importers.abstractImporter import AbstractImporter
from helpers.logHelpers import createLog

logger = createLog('instanceImporter')


class InstanceImporter(AbstractImporter):
    def __init__(self, record, session, kinesisMsgs, sqsMsgs):
        self.source = record.get('source', 'unknown')
        self.data = record['data']
        self.instance = None
        self.kinesisMsgs = kinesisMsgs
        self.sqsMsgs = sqsMsgs
        self.logger = self.createLogger()
        super().__init__(record, session)

    @property
    def identifier(self):
        return self.instance.id

    def lookupRecord(self):
        self.logger.info('Ingesting instance record')

        instanceID = Identifier.getByIdentifier(
            Instance,
            self.session,
            self.data.get('identifiers', [])
        )

        if instanceID is not None:
from lxml import etree
import marcalyx

from helpers.errorHelpers import OAIFeedError, MARCXMLError
from helpers.logHelpers import createLog

logger = createLog('oai_parser')

# Standard namespaces used by the DOAB OAI-PMH feed
OAI_NS = '{http://www.openarchives.org/OAI/2.0/}'
MARC_NS = '{http://www.loc.gov/MARC21/slim}'


def parseOAI(oaiFeed):
    """Parse a supplied lxml object into a set of records, which are then
    read into a list of marcalyx records. This also checks the provided feed
    for a resumption token which, if found, will be used to recursively
    retrieve the next page of DOAB records.
    """
    try:
        logger.info('Parsing OAI-PMH feed of MARCXML records')
        res = etree.XML(oaiFeed)
    except etree.ParseError as err:
        logger.error('Unable to parse OAI-PMH Feed with lxml')
        logger.debug(err)
        raise OAIFeedError('Unable to parse XML from OAI-PMH feed')

    logger.info('Loading all retrieved MARCXML records')
    records = res.findall('.//{}record'.format(OAI_NS))
    marcRecords = list(filter(None, (readRecord(r) for r in records)))
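    # Hypothetical continuation sketch for the recursion the docstring
    # describes: if the feed includes a resumptionToken, fetch and parse the
    # next page. fetchNextPage() is an assumed helper, not part of the
    # source shown here.
    resumption = res.find('.//{}resumptionToken'.format(OAI_NS))
    if resumption is not None and resumption.text:
        marcRecords.extend(parseOAI(fetchNextPage(resumption.text)))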
import os

from helpers.errorHelpers import OCLCError, DataError
from helpers.logHelpers import createLog
from lib.dataModel import Identifier
from lib.readers.oclcClassify import classifyRecord
from lib.parsers.parseOCLC import readFromClassify, extractAndAppendEditions
from lib.outputManager import OutputManager

logger = createLog('enhancer')


def enhanceRecord(record):
    """Takes a single input record and retrieves data from the OCLC Classify
    service. Manages the overall workflow of the function."""
    try:
        workUUID = record['uuid']
        searchType = record['type']
        searchFields = record['fields']
        startPos = record.get('start', 0)
    except KeyError as e:
        logger.error('Missing attribute in data block!')
        logger.debug(e)
        raise DataError('Required attribute missing from data block')
    except TypeError as e:
        logger.error('Could not read data from source')
        logger.debug(e)
        raise DataError('Kinesis data contains non-dictionary value')

    logger.info('Starting to enhance work record {}'.format(workUUID))
import boto3
import json
import datetime
import time

from helpers.errorHelpers import KinesisError
from helpers.logHelpers import createLog
from helpers.clientHelpers import createAWSClient

logger = createLog('kinesis_write')


class OutputManager():
    """Class for managing connections and operations with AWS Kinesis"""
    KINESIS_CLIENT = createAWSClient('kinesis')

    def __init__(self):
        pass

    @classmethod
    def putRecord(cls, outputObject, stream, workUUID):
        """Put an event into the specific Kinesis stream"""
        logger.info("Writing results to Kinesis")

        # The default lambda function here converts all objects into dicts
        kinesisStream = OutputManager._convertToJSON(outputObject)

        try:
            cls.KINESIS_CLIENT.put_record(
                StreamName=stream,
                Data=kinesisStream,
                PartitionKey=workUUID
            )
from datetime import datetime

from sfrCore import Item

from lib.importers.abstractImporter import AbstractImporter
from helpers.logHelpers import createLog

logger = createLog('accessImporter')


class AccessReportImporter(AbstractImporter):
    def __init__(self, record, session, kinesisMsgs, sqsMsgs):
        self.data = record['data']
        self.item = None
        self.logger = self.createLogger()
        super().__init__(record, session)

    @property
    def identifier(self):
        return self.item.id

    def lookupRecord(self):
        self.logger.info('Ingest Accessibility Report')
        return self.insertRecord()

    def insertRecord(self):
        accessReport = Item.addReportData(self.session, self.data)

        if not accessReport:
            return 'error'
import json

from helpers.errorHelpers import OutputError
from helpers.logHelpers import createLog
from helpers.clientHelpers import createAWSClient

logger = createLog('output_write')


class OutputManager():
    """Controls the output formats and streams from this function. Valid
    output targets are:
    Kinesis: for processing in the enhancement pipeline and epub storage
    SQS: for queuing and processing by the ElasticSearch manager"""
    KINESIS_CLIENT = createAWSClient('kinesis')

    def __init__(self):
        pass

    @classmethod
    def putKinesis(cls, data, stream, recType='work'):
        """Puts records into a Kinesis stream for processing by other parts
        of the SFR data pipeline. Takes data as an object and converts it
        into a JSON string. This is then passed to the specified stream.

        This will raise any error, as failure to pass an output should halt
        the process."""
        logger.info('Writing results to Kinesis')
        outputObject = {
            'status': 200,
            'data': data,
            'type': recType
        }
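# Hedged usage sketch for putKinesis(); the stream name and payload shown
# here are illustrative values, not configuration taken from this project.
OutputManager.putKinesis(
    {'title': 'Example Work', 'identifiers': []},
    'example-kinesis-stream',
    recType='work'
)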
from json.decoder import JSONDecodeError
import os
import requests

from .abstractReader import AbsSourceReader
from lib.models.metRecord import MetItem
from helpers.logHelpers import createLog

logger = createLog('metReader')


class MetReader(AbsSourceReader):
    INDEX_URL = 'https://libmma.contentdm.oclc.org/digital/api/search/collection/p15324coll10/order/title/ad/asc/page/{}/maxRecords/50'
    ITEM_API = 'https://libmma.contentdm.oclc.org/digital/api/collections/p15324coll10/items/{}/false'

    def __init__(self, updateSince):
        self.updateSince = updateSince
        self.startPage = 1
        self.stopPage = 48
        self.source = 'Metropolitan Museum of Art'
        self.works = []
        self.itemIDs = []

    def collectResourceURLs(self):
        logger.info('Fetching records from MET Digital Collections')
        for page in range(self.startPage, self.stopPage):
            logger.debug('Fetching page {}'.format(page))
            indexResp = requests.get(self.INDEX_URL.format(page))
            indexData = indexResp.json()

            for item in indexData['items']:
                itemID = item['itemId']
                logger.debug('Found record with ID {}'.format(itemID))
from collections import defaultdict

from lib.importers.workImporter import WorkImporter
from lib.importers.instanceImporter import InstanceImporter
from lib.importers.itemImporter import ItemImporter
from lib.importers.accessImporter import AccessReportImporter
from lib.importers.coverImporter import CoverImporter
from lib.outputManager import OutputManager
from helpers.logHelpers import createLog

logger = createLog('db_manager')


class DBManager:
    # Load importers for specific types of records, all based on AbstractImporter
    IMPORTERS = {
        'work': WorkImporter,
        'instance': InstanceImporter,
        'item': ItemImporter,
        'access_report': AccessReportImporter,
        'cover': CoverImporter
    }

    def __init__(self, session):
        self.session = session
        self.logger = logger
        self.kinesisMsgs = defaultdict(list)
        self.sqsMsgs = defaultdict(list)

    def importRecord(self, record):
import re

import pycountry
import requests
from requests.exceptions import ConnectionError, MissingSchema, InvalidURL
from urllib.parse import quote_plus

from helpers.errorHelpers import MARCXMLError, DataError
from helpers.logHelpers import createLog
from lib.linkParser import LinkParser
from lib.dataModel import (WorkRecord, Identifier, Subject, Language,
                           InstanceRecord, Date, Agent, Format, Rights, Link)

logger = createLog('marc_parser')

SUBJECT_INDICATORS = {
    '0': 'lcsh',
    '1': 'lcch',
    '2': 'msh',
    '3': 'nalsaf',
    '4': None,
    '5': 'csh',
    '6': 'rvm',
}


def parseMARC(records, marcRels):
    """Accepts a list of MARCXML records and invokes the parser for each.
    If an error occurs, None is returned and filter() removes it from the
    list.
    """
    logger.info('Transforming MARCXML records into SFR objects')
from datetime import datetime

import pycountry
import requests
from urllib.parse import quote_plus

from ..dataModel import (WorkRecord, InstanceRecord, Format, Identifier,
                         Language, Agent, Subject, Link, Date, Rights)
from helpers.logHelpers import createLog

logger = createLog('metItem')


class MetItem(object):
    ROOT_URL = 'https://libmma.contentdm.oclc.org/digital'
    ITEM_UI = 'https://libmma.contentdm.oclc.org/digital/collection/p15324coll10/id/{}/rec/1'

    SFR_CROSSWALK = {
        'title': [
            {
                'level': 'work',
                'field': 'title'
            },
            {
                'level': 'instance',
                'field': 'title'
            },
        ],
        'creato': [{
            'level': 'work',
            'field': 'author'
        }],
        'descri': [{
def __init__(self, record):
    self.logger = createLog('CoverParse')
    self.remoteURL = record.get('url', None)
    self.source = record.get('source', 'unk')
    self.sourceID = record.get('identifier', None)
    self.s3CoverURL = None
from datetime import datetime
import re

import requests
from urllib.parse import quote_plus

from helpers.logHelpers import createLog
from helpers.errorHelpers import HoldingError
from lib.dataModel import InstanceRecord, Agent, Link, Identifier
from lib.parsers.parse856Holding import HoldingParser

logger = createLog('classify_parse')

MEASUREMENT_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

SUBJECT_INDICATORS = {
    '0': 'lcsh',
    '1': 'lcch',
    '2': 'msh',
    '3': 'nalsaf',
    '4': None,
    '5': 'csh',
    '6': 'rvm',
}


def readFromMARC(marcRecord):
    """Parse a marcalyx Record object representing an OCLC record."""
    logger.debug('Parsing Returned Edition')

    instance = InstanceRecord()
from datetime import datetime
import os

from .abstractUpdater import AbstractUpdater
from sfrCore import Instance

from helpers.errorHelpers import DBError
from helpers.logHelpers import createLog

logger = createLog('instanceUpdater')


class InstanceUpdater(AbstractUpdater):
    def __init__(self, record, session, kinesisMsgs, sqsMsgs):
        self.data = record.get('data')
        self.attempts = int(record.get('attempts', 0))
        self.instance = None
        self.logger = self.createLogger()
        super().__init__(record, session, kinesisMsgs, sqsMsgs)

    @property
    def identifier(self):
        return self.instance.id

    def lookupRecord(self):
        existingID = Instance.lookup(
            self.session,
            self.data.get('identifiers', []),
            self.data.get('volume', None),
            self.data.pop('primary_identifier', None)
        )

        if existingID is None:
from copy import copy
from datetime import datetime
import json
import os

from .abstractUpdater import AbstractUpdater
from sfrCore import Link

from helpers.errorHelpers import DBError
from helpers.logHelpers import createLog

logger = createLog('coverUpdater')


class CoverUpdater(AbstractUpdater):
    def __init__(self, record, session, kinesisMsgs, sqsMsgs):
        self.data = record.get('data')
        self.attempts = int(record.get('attempts', 0))
        self.link = None
        self.logger = self.createLogger()
        super().__init__(record, session, kinesisMsgs, sqsMsgs)

    @property
    def identifier(self):
        return self.link.id

    def lookupRecord(self):
        currentURL = self.data.pop('originalURL', None)
        self.logger.debug('Updating Cover from {}'.format(
            currentURL if currentURL else '[unknown]'
        ))

        dbURL = Link.httpRegexSub(currentURL)
# stdnbr
LOOKUP_IDENTIFIERS = [
    'oclc',  # OCLC Number
    'isbn',  # ISBN (10 or 13)
    'issn',  # ISSN
    'swid',  # OCLC Work Identifier
]

IDENTIFIER_TYPES = {
    'oclc': OCLC,
    'swid': OWI,
    'isbn': ISBN,
    'issn': ISSN,
}

logger = createLog('query_constructor')


def queryWork(session, work, workUUID):
    """This takes a work record that has not been queried for enhanced data
    and begins that process. It extracts one of two things from the work
    record to allow for this lookup. If it contains an identifier in the
    list defined in LOOKUP_IDENTIFIERS, it will pass that identifier to the
    Kinesis stream. If not, it will pass the author and title of the work.
    It will also pass the UUID of the database record, which will be used to
    match the returned data with the existing record."""
    lookupIDs = getIdentifiers(session, work)

    classifyQueries = []
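    # Hypothetical continuation sketch of the selection logic the docstring
    # describes, not the project's code: build one Classify query per lookup
    # identifier, falling back to an author/title query when none of the
    # LOOKUP_IDENTIFIERS are present. The shape of lookupIDs and the
    # attribute names on `work` are assumptions.
    if lookupIDs:
        for idType, identifier in lookupIDs:
            classifyQueries.append({
                'type': 'identifier',
                'uuid': workUUID,
                'fields': {'identifier': identifier, 'idType': idType}
            })
    else:
        classifyQueries.append({
            'type': 'authorTitle',
            'uuid': workUUID,
            'fields': {'authors': work.authors, 'title': work.title}
        })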
from datetime import datetime
import os

from .abstractUpdater import AbstractUpdater
from sfrCore import Item

from helpers.errorHelpers import DBError
from helpers.logHelpers import createLog

logger = createLog('itemUpdater')


class ItemUpdater(AbstractUpdater):
    def __init__(self, record, session, kinesisMsgs, sqsMsgs):
        self.data = record.get('data')
        self.attempts = int(record.get('attempts', 0))
        self.item = None
        self.logger = self.createLogger()
        super().__init__(record, session, kinesisMsgs, sqsMsgs)

    @property
    def identifier(self):
        return self.item.id

    def lookupRecord(self):
        primaryID = self.data.get('primary_identifier', None)
        self.logger.debug('Ingesting Item #{}'.format(
            primaryID['identifier'] if primaryID else 'unknown'
        ))

        self.item = Item.lookup(
            self.session,
            self.data.get('identifiers', []),
            primaryID
        )

        if self.item is None:
from datetime import datetime, timedelta
import os

from sqlalchemy import text

from sfrCore import Instance

from .fetchers.openLibraryFetcher import OLCoverFetcher
from .fetchers.googleBooksFetcher import GBCoverFetcher
from .fetchers.contentCafeFetcher import CCCoverFetcher
from .cover import SFRCover
from .outputManager import OutputManager
from helpers.logHelpers import createLog

logger = createLog('coverManager')


class CoverManager:
    """Manager class for finding cover images for Instance records and
    returning the resulting Cover objects to the database ingest manager.

    Methods:
        getInstancesForSearch -- Retrieve cover-less Instances from the database
        getCoversForInstances -- Search fetchers for covers and generate covers
        queryFetchers -- Query defined fetchers and break if a cover is found
        getValidIDs -- Parse an Instance's identifiers into usable types
        sendCoversToKinesis -- Place covers in the stream for the database manager
    """

    def __init__(self, manager, olManager):
        """Initialize CoverManager with database managers and fetchers. This
        generates a logger, sets the update period and creates the array of
        fetcher objects which are used to retrieve cover URIs.