Example #1
    def test_log_bad_level(self):

        os.environ['LOG_LEVEL'] = 'bad_level'
        logger = createLog('tester')
        level = logger.getEffectiveLevel()
        self.assertEqual(level, logging.WARNING)
        del os.environ['LOG_LEVEL']
Example #2
    def test_log_debug(self):

        os.environ['LOG_LEVEL'] = 'debug'
        logger = createLog('tester')
        level = logger.getEffectiveLevel()
        self.assertEqual(level, logging.DEBUG)
        del os.environ['LOG_LEVEL']
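The createLog helper these tests exercise is not shown in the snippets. A minimal sketch consistent with their expectations (LOG_LEVEL read from the environment, unrecognized values falling back to WARNING) could look like the following; the level map and defaults are assumptions, not the project's actual implementation.

import logging
import os


def createLog(module):
    # Assumed level mapping; unrecognized LOG_LEVEL values fall back to WARNING
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
    }
    logger = logging.getLogger(module)
    rawLevel = os.environ.get('LOG_LEVEL', 'warning').lower()
    logger.setLevel(levels.get(rawLevel, logging.WARNING))
    return logger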
Example #3
    def __init__(self, isbn):
        self.isbn = isbn
        self.logger = createLog('unglueit')
        self.unglueitSearch = os.environ['ISBN_LOOKUP']
        self.unglueitFetch = os.environ['OPDS_LOOKUP']
        self.unglueitUser = os.environ['USERNAME']
        self.unglueitApiKey = os.environ['API_KEY']

        self.logger.info('Fetching summary for {}'.format(self.isbn))
Example #4
    def __init__(self, queryName, queryType):
        self.queryName = queryName
        self.queryType = queryType.lower() if queryType else 'personal'
        self.validateType()

        self.logger = createLog('viafSearch')
        self.viaf_endpoint = os.environ['VIAF_API']
        self.redis = redis.Redis(host=os.environ['REDIS_ARN'],
                                 port=6379,
                                 socket_timeout=5)
Example #5
def main():
    os.environ["COLLECTIVEACCESS_HOME"] = "/data/idigpaleo/admin"
    with open('./config.json') as configFile:
        config = json.load(configFile)
    # Get the parameters provided in the args
    parser = ingestHelpers.createParser()
    args = parser.parse_args()
    ingestAll = args.ALL
    ingestSources = args.sources
    ingestTargets = args.projects
    testRun = args.test
    logLevel = args.logLevel

    # Create the log
    logger = logHelpers.createLog('ingest', logLevel)
    logger.info("Starting " + str(ingestTargets) + " ingest")

    # If we just want a list of institutions and targets, do that
    if args.listInstitutions:
        sources = config['sources']
        for source in sources:
            sys.stdout.write(source + "\nPossible Target Projects: ")
            for project in sources[source]["validTargets"]:
                sys.stdout.write(project + " ")
            sys.stdout.write("\n")
        return True

    # If we don't have any sources or targets throw errors
    if (ingestSources is None or ingestTargets is None) and not ingestAll:
        logger.error("You must define at least one source and one target")
        sys.exit(1)

    # If the --ALL flag got set, disregard anything else and just do a full import
    if ingestAll:
        logger.info("Importing from all sources for all targets")
        ingestTargets = config['targetProjects']
        ingestSources = list(config['sources'])

    # Start the relevant imports
    for source in ingestSources:
        logger.info("Starting ingest for " + source)
        importCSVs = coreIngest.createCSVSource(source, ingestTargets, testRun)
        logger.debug(importCSVs)

        importResults = coreIngest.importCSVFiles(source, ingestTargets, importCSVs)
        if importResults is True:
            logger.info("Successfully completed import. Check logs for partial import failures or skipped datasets")
        else:
            logger.error("Import was not completed!")
Example #6
# Imports required by this snippet (the original example begins at the class
# definition). TextSelector and NumberSelector are project-specific
# transformers assumed to be defined elsewhere in the codebase.
from collections import defaultdict
from math import sqrt
import re
import string
import warnings

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import MinMaxScaler

from helpers.logHelpers import createLog


class KModel:
    LOGGER = createLog('kMeans')
    
    def __init__(self, instances):
        self.instances = instances
        self.df = None
        self.clusters = defaultdict(list)
    
    def createPipeline(self):
        return Pipeline([
            ('union', FeatureUnion(
                transformer_list=[
                    ('place', Pipeline([
                        ('selector', TextSelector(key='place')),
                        ('tfidf', TfidfVectorizer(
                            preprocessor=KModel.pubProcessor,
                            stop_words='english',
                            strip_accents='unicode',
                            analyzer='char_wb',
                            ngram_range=(2,4))
                        )
                    ])),
                    ('publisher', Pipeline([
                        ('selector', TextSelector(key='publisher')),
                        ('tfidf', TfidfVectorizer(
                            preprocessor=KModel.pubProcessor,
                            stop_words='english',
                            strip_accents='unicode',
                            analyzer='char_wb',
                            ngram_range=(2,4))
                        )
                    ])),
                    ('date', Pipeline([
                        ('selector', NumberSelector(key='pubDate')),
                        ('scaler', MinMaxScaler())
                    ]))
                ],
                transformer_weights={
                    'place': 0.5,
                    'publisher': 1.0,
                    'date': 2.0 
                }
            )),
            ('kmeans', KMeans(
                n_clusters=self.currentK,
                n_jobs=-1
            ))
        ])
    
    @classmethod
    def pubProcessor(cls, raw):
        if isinstance(raw, list):
            raw = ', '.join(filter(None, raw))
        if raw is not None:
            raw = raw.replace('&', 'and')
            cleanStr = raw.translate(
                str.maketrans('', '', string.punctuation)
            ).lower()
            cleanStr = cleanStr\
                .replace('sn', '')\
                .replace('place of publication not identified', '')\
                .replace('publisher not identified', '')
            cleanStr = re.sub(r'\s+', ' ', cleanStr)
            cls.LOGGER.debug('Cleaned string {} to {} for processing'.format(
                raw, cleanStr
            ))
            return cleanStr
        cls.LOGGER.debug('Unable to clean NoneType, returning empty string')
        return ''

    def createDF(self):
        self.LOGGER.info('Generating DataFrame from instance data')
        self.df = pd.DataFrame([
            {
                'place': i.pub_place if i.pub_place else '',
                'publisher': KModel.getPublisher(i.agent_instances),
                'pubDate': KModel.getPubDateFloat(i.dates),
                'edition': i.edition_statement,
                'volume': i.volume,
                'table_of_contents': i.table_of_contents,
                'extent': i.extent,
                'summary': i.summary,
                'rowID': i.id
            }
            for i in self.instances
            if KModel.emptyInstance(i)
        ])
        self.maxK = len(self.df.index) if len(self.df.index) > 1 else 2
        if self.maxK > 1000:
            self.maxK = int(self.maxK * (2/9))
        elif self.maxK > 500:
            self.maxK = int(self.maxK * (3/9))
        elif self.maxK > 250:
            self.maxK = int(self.maxK * (4/9))
    
    @staticmethod
    def emptyInstance(instance):
        return bool(
            instance.pub_place or
            KModel.getPubDateFloat(instance.dates) or
            KModel.getPublisher(instance.agent_instances)
        )

    @classmethod
    def getPubDateFloat(cls, dates):
        for d in dates:
            if d.date_type == 'pub_date' and d.date_range:
                cls.LOGGER.debug('Found publication date {}'.format(
                    d.display_date
                ))
                lowerYear = d.date_range.lower.year if d.date_range.lower else None
                upperYear = d.date_range.upper.year if d.date_range.upper else None
                if lowerYear and upperYear:
                    return (upperYear + lowerYear) / 2
                elif lowerYear:
                    return lowerYear
                elif upperYear:
                    return upperYear
        
        cls.LOGGER.debug('Unable to locate publication date')
        return 0
    
    @classmethod
    def getPublisher(cls, agent_rels):
        publishers = [
            a.agent.name for a in agent_rels
            if a.role == 'publisher'
        ]
        return '; '.join(sorted(list(set(publishers))))
    
    def generateClusters(self):
        self.LOGGER.info('Generating Clusters from instances')
        try:
            # Calculate the step for the first run at determining k
            # Use the natural log of the value to get a reasonable scale
            # for different values
            step = int(np.log(self.maxK)**1.5 - 1) if np.log(self.maxK) > 1.6 else 1
            # First pass at finding best value for k, using the step value
            # derived above
            self.getK(1, self.maxK, step)
            # Get narrower band of possible k values, based off the initial
            # step value
            startK = self.k - (step - 1) if self.k > (step - 1) else 1
            stopK = self.k + step if (self.k + step) <= self.maxK else self.maxK
            # Get the final k value by iterating through the much narrower
            # range returned above
            self.getK(startK, stopK, 1)
            self.LOGGER.debug('Setting K to {}'.format(self.k))
        except ZeroDivisionError:
            self.LOGGER.debug('Single instance found, setting K to 1')
            self.k = 1
        
        try:
            labels = self.cluster(self.k)
        except ValueError:
            labels = [0] * len(self.instances)
        
        for n, item in enumerate(labels):
            try:
                self.clusters[item].append(self.df.loc[[n]])
            except KeyError:
                continue
    
    def getK(self, start, stop, step):
        self.LOGGER.info('Calculating number of clusters, max {}'.format(
            self.maxK
        ))
        warnings.filterwarnings('error', category=ConvergenceWarning)
        wcss = []
        for i in range(start, stop, step):
            try:
                wcss.append((self.cluster(i, score=True), i))
            except ConvergenceWarning:
                self.LOGGER.info('Exceeded number of distinct clusters, break')
                break
            except ValueError:
                self.k = 1
                return None
        
        x1, y1 = wcss[0][1], wcss[0][0]
        x2, y2 = wcss[-1][1], wcss[-1][0]

        distances = []
        denominator = sqrt((y2 - y1)**2 + (x2 - x1)**2)
        for i in range(len(wcss)):
            x0 = i + 1
            y0 = wcss[i][0]

            numerator = abs((y2 - y1)*x0 - (x2 - x1)*y0 + x2*y1 - y2*x1)
            distances.append(numerator/denominator)
        
        finalStart = 1 if start < 2 else start + 1 
        self.k = distances.index(max(distances)) + finalStart
        return None
    
    def cluster(self, k, score=False):
        self.currentK = k
        self.LOGGER.info('Generating cluster for k={}'.format(k))
        pipeline = self.createPipeline()
        if score is True:
            self.LOGGER.debug('Returning score for n_clusters estimation')
            pipeline.fit(self.df)
            return pipeline['kmeans'].inertia_
        else:
            self.LOGGER.debug('Returning model prediction')
            return pipeline.fit_predict(self.df)
    
    def parseEditions(self):
        eds = []
        self.LOGGER.info('Generating editions from clusters')
        for clust in dict(self.clusters):
            yearEds = defaultdict(list)
            self.LOGGER.info('Parsing cluster {}'.format(clust))
            for ed in self.clusters[clust]:
                self.LOGGER.info('Adding instance to {} edition'.format(
                    ed.iloc[0]['pubDate']
                ))
                yearEds[ed.iloc[0]['pubDate']].append({
                    'pubDate': ed.iloc[0]['pubDate'],
                    'publisher': ed.iloc[0]['publisher'],
                    'pubPlace': ed.iloc[0]['place'],
                    'rowID': ed.iloc[0]['rowID'],
                    'edition': ed.iloc[0]['edition'],
                    'volume': ed.iloc[0]['volume'],
                    'table_of_contents': ed.iloc[0]['table_of_contents'],
                    'extent': ed.iloc[0]['extent'],
                    'summary': ed.iloc[0]['summary']
                })
            eds.extend([(year, data) for year, data in yearEds.items()])
            eds.sort(key=lambda x: x[0])

        return eds
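The getK method above applies the standard "knee" heuristic: draw a line between the first and last (k, WCSS) points and choose the k whose point sits farthest from that line. A self-contained illustration of the same geometry, using made-up WCSS values:

from math import sqrt

# (k, wcss) pairs from a hypothetical elbow curve
points = [(1, 400.0), (2, 180.0), (3, 90.0), (4, 70.0), (5, 60.0), (6, 55.0)]

x1, y1 = points[0]
x2, y2 = points[-1]
denominator = sqrt((y2 - y1) ** 2 + (x2 - x1) ** 2)

# Perpendicular distance from each point to the line through the endpoints
distances = [
    abs((y2 - y1) * x0 - (x2 - x1) * y0 + x2 * y1 - y2 * x1) / denominator
    for x0, y0 in points
]

bestK = points[distances.index(max(distances))][0]
print(bestK)  # the knee of this made-up curve is at k=3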
            
Example #7
import requests
import os
from lxml import etree

from helpers.errorHelpers import OCLCError, DataError
from helpers.logHelpers import createLog

from lib.outputManager import OutputManager

logger = createLog('classify_read')

NAMESPACE = {
    None: 'http://classify.oclc.org'
}


def classifyRecord(searchType, searchFields, workUUID, start=0):
    """Generates a query for the OCLC Classify service and returns the raw
    XML response received from that service. This method takes 3 arguments:
    - searchType: identifier|authorTitle
    - searchFields: identifier+idType|authors+title
    - uuid: UUID of the parent work record"""
    try:
        classifyQuery = QueryManager(
            searchType,
            searchFields.get('identifier', None),
            searchFields.get('idType', None),
            searchFields.get('authors', None),
            searchFields.get('title', None),
            start
        )
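The docstring above describes two query shapes. Hypothetical invocations of classifyRecord with placeholder values (not real identifiers or records):

# Identifier-based lookup
classifyRecord(
    'identifier',
    {'identifier': '9780123456789', 'idType': 'isbn'},
    'c9a87a6a-0000-0000-0000-000000000000'
)

# Author/title-based lookup
classifyRecord(
    'authorTitle',
    {'authors': 'Example Author', 'title': 'An Example Title'},
    'c9a87a6a-0000-0000-0000-000000000000',
    start=0
)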
Example #8
from base64 import b64decode
from binascii import Error as base64Error
import boto3
from botocore.exceptions import ClientError
import os
import yaml

from helpers.logHelpers import createLog

logger = createLog('configHelpers')


def loadEnvFile(runType, fileString):

    envDict = None
    fileLines = []

    if fileString:
        openFile = fileString.format(runType)
    else:
        openFile = 'config.yaml'

    try:
        with open(openFile) as envStream:
            try:
                envDict = yaml.safe_load(envStream)
            except yaml.YAMLError as err:
                logger.error('{} Invalid! Please review'.format(openFile))
                raise err

            envStream.seek(0)
Example #9
import subprocess
import sys
import os
import re

from helpers.logHelpers import createLog
from helpers.errorHelpers import InvalidExecutionType
from helpers.clientHelpers import createEventMapping
from helpers.configHelpers import setEnvVars

logger = createLog('runScripts')


def main():
    """Invoked by the makefile's arguments, controls the overall execution of
    the Lambda function. h/t to nonword for inspiration to use a makefile.

    Raises:
        InvalidExecutionType: If the args do not contain a valid execution type
        raise an error.
    """
    if len(sys.argv) != 2:
        logger.warning('This script takes one, and only one, argument!')
        sys.exit(1)
    runType = sys.argv[1]

    if re.match(r'^(?:local|development|qa|production)', runType):
        logger.info('Deploying lambda to {} environment'.format(runType))
        setEnvVars(runType)
        subprocess.run([
            'lambda', 'deploy', '--config-file', 'run_config.yaml',
Example #10
import boto3
import json

from helpers.logHelpers import createLog
from helpers.errorHelpers import InvalidExecutionType
from helpers.configHelpers import loadEnvFile

logger = createLog('clientHelpers')

def createAWSClient(service, configDict=None):

    if configDict is None:
        configDict, configLines = loadEnvFile(None, None)

    clientKwargs = {
        'region_name': configDict['region']
    }

    if (
        'aws_access_key_id' in configDict
        and
        configDict['aws_access_key_id'] is not None
    ):
        clientKwargs['aws_access_key_id'] = configDict['aws_access_key_id']
        clientKwargs['aws_secret_access_key'] = configDict['aws_secret_access_key']  # noqa: E501

    lambdaClient = boto3.client(
        service,
        **clientKwargs
    )
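A possible call site for createAWSClient, assuming the truncated portion ends by returning the boto3 client it builds; the config values are placeholders mirroring the keys the helper reads:

# Hypothetical usage; with no access key in the config, boto3 falls back to
# its default credential chain.
kinesis = createAWSClient('kinesis', {
    'region': 'us-east-1',
    'aws_access_key_id': None
})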
Example #11
from sfrCore import SessionManager

from lib.fetchers.openLibraryFetcher import OLSessionManager
from lib.coverManager import CoverManager
from helpers.logHelpers import createLog

# Logger can be passed name of current module
# Can also be instantiated on a class/method basis using dot notation
logger = createLog('handler')

"""This method will create the database if necessary and otherwise run any
new migrations. This is placed here because Lambdas will "freeze" any methods
that are executed before the main handler block, meaning that we can run
migrations and generate a db connection for multiple invocations, at least
until AWS decides to regenerate the container
"""
MANAGER = SessionManager()
MANAGER.generateEngine()

OL_MANAGER = OLSessionManager()
OL_MANAGER.generateEngine()


def handler(event, context):
    """Method invoked by Lambda event. Verifies that records were received and,
    if so, passes them to be parsed"""
    logger.debug('Starting Lambda Execution')

    coverManager = CoverManager(MANAGER, OL_MANAGER)
    coverManager.getInstancesForSearch()
    coverManager.getCoversForInstances()
Example #12
import os

from elasticsearch import Elasticsearch
from elasticsearch.exceptions import (ConnectionError, TransportError,
                                      ConflictError)
from elasticsearch_dsl import connections
from elasticsearch_dsl.wrappers import Range

from sqlalchemy.orm import configure_mappers

from model.elasticModel import (Work, Instance, Subject, Identifier, Agent,
                                Language, Rights)
from helpers.logHelpers import createLog
from helpers.errorHelpers import ESError

logger = createLog('es_manager')


class ESConnection():
    def __init__(self):
        self.index = os.environ['ES_INDEX']
        self.client = None
        self.tries = 0
        self.batch = []

        self.createElasticConnection()
        self.createIndex()

        configure_mappers()

    def createElasticConnection(self):
Example #13
from datetime import datetime
import os
from sfrCore import Item, Instance, Identifier

from lib.importers.abstractImporter import AbstractImporter
from helpers.logHelpers import createLog

logger = createLog('itemImporter')


class ItemImporter(AbstractImporter):
    def __init__(self, record, session, kinesisMsgs, sqsMsgs):
        self.data = record['data']
        self.item = None
        self.kinesisMsgs = kinesisMsgs
        self.sqsMsgs = sqsMsgs
        self.logger = self.createLogger()
        super().__init__(record, session)

    @property
    def identifier(self):
        return self.item.id

    def lookupRecord(self):
        self.logger.info('Ingesting item record')

        itemID = Identifier.getByIdentifier(
            Item,
            self.session,
            self.data.get('identifiers', [])
        )
Example #14
from datetime import datetime
import json
import os
from sfrCore import Instance, Identifier

from lib.importers.abstractImporter import AbstractImporter
from helpers.logHelpers import createLog

logger = createLog('instanceImporter')


class InstanceImporter(AbstractImporter):
    def __init__(self, record, session, kinesisMsgs, sqsMsgs):
        self.source = record.get('source', 'unknown')
        self.data = record['data']
        self.instance = None
        self.kinesisMsgs = kinesisMsgs
        self.sqsMsgs = sqsMsgs
        self.logger = self.createLogger()
        super().__init__(record, session)

    @property
    def identifier(self):
        return self.instance.id

    def lookupRecord(self):
        self.logger.info('Ingesting instance record')

        instanceID = Identifier.getByIdentifier(
            Instance, self.session, self.data.get('identifiers', []))
        if instanceID is not None:
Example #15
from lxml import etree
import marcalyx

from helpers.errorHelpers import OAIFeedError, MARCXMLError
from helpers.logHelpers import createLog

logger = createLog('oai_parser')

# Standard namespaces used by the DOAB OAI-PMH feed
OAI_NS = '{http://www.openarchives.org/OAI/2.0/}'
MARC_NS = '{http://www.loc.gov/MARC21/slim}'


def parseOAI(oaiFeed):
    """Parse a supplied lxml object into a set of records, which are then read
    into a list of marcalyx records. 

    This also checks the provided feed for a resumption token which, if found,
    will be used to recursively retrieve the next page of DOAB records.
    """
    try:
        logger.info('Parsing OAI-PMH feed of MARCXML records')
        res = etree.XML(oaiFeed)
    except etree.XMLSyntaxError as err:
        logger.error('Unable to parse OAI-PMH Feed with lxml')
        logger.debug(err)
        raise OAIFeedError('Unable to parse XML from OAI-PMH feed')

    logger.info('Loading all retrieved MARCXML records')
    records = res.findall('.//{}record'.format(OAI_NS))
    marcRecords = list(filter(None, (readRecord(r) for r in records)))
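The resumption-token handling mentioned in the docstring falls outside this truncated snippet. A sketch of how that continuation might look, with fetchNextOAIPage standing in for whatever helper actually retrieves the next page:

    # Hypothetical continuation of parseOAI
    token = res.find('.//{}resumptionToken'.format(OAI_NS))
    if token is not None and token.text:
        logger.debug('Following resumptionToken to next page of DOAB records')
        marcRecords.extend(parseOAI(fetchNextOAIPage(token.text)))

    return marcRecords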
Example #16
import os

from helpers.errorHelpers import OCLCError, DataError
from helpers.logHelpers import createLog
from lib.dataModel import Identifier
from lib.readers.oclcClassify import classifyRecord
from lib.parsers.parseOCLC import readFromClassify, extractAndAppendEditions
from lib.outputManager import OutputManager

logger = createLog('enhancer')


def enhanceRecord(record):
    """Takes a single input record and retrieves data from the OCLC Classify
    service. Manages the overall workflow of the function."""

    try:
        workUUID = record['uuid']
        searchType = record['type']
        searchFields = record['fields']
        startPos = record.get('start', 0)
    except KeyError as e:
        logger.error('Missing attribute in data block!')
        logger.debug(e)
        raise DataError('Required attribute missing from data block')
    except TypeError as e:
        logger.error('Could not read data from source')
        logger.debug(e)
        raise DataError('Kinesis data contains non-dictionary value')

    logger.info('Starting to enhance work record {}'.format(workUUID))
Example #17
import boto3
import json
import datetime
import time

from helpers.errorHelpers import KinesisError
from helpers.logHelpers import createLog
from helpers.clientHelpers import createAWSClient

logger = createLog('kinesis_write')


class OutputManager():
    """Class for managing connections and operations with AWS Kinesis"""
    KINESIS_CLIENT = createAWSClient('kinesis')

    def __init__(self):
        pass

    @classmethod
    def putRecord(cls, outputObject, stream, workUUID):
        """Put an event into the specific Kinesis stream"""
        logger.info("Writing results to Kinesis")

        # The default lambda function here converts all objects into dicts
        kinesisStream = OutputManager._convertToJSON(outputObject)

        try:
            cls.KINESIS_CLIENT.put_record(StreamName=stream,
                                          Data=kinesisStream,
                                          PartitionKey=workUUID)
Example #18
from datetime import datetime
from sfrCore import Item

from lib.importers.abstractImporter import AbstractImporter
from helpers.logHelpers import createLog

logger = createLog('accessImporter')


class AccessReportImporter(AbstractImporter):
    def __init__(self, record, session, kinesisMsgs, sqsMsgs):
        self.data = record['data']
        self.item = None
        self.logger = self.createLogger()
        super().__init__(record, session)

    @property
    def identifier(self):
        return self.item.id

    def lookupRecord(self):
        self.logger.info('Ingest Accessibility Report')

        return self.insertRecord()

    def insertRecord(self):
        accessReport = Item.addReportData(self.session, self.data)

        if not accessReport:
            return 'error'
Example #19
import json

from helpers.errorHelpers import OutputError
from helpers.logHelpers import createLog
from helpers.clientHelpers import createAWSClient

logger = createLog('output_write')


class OutputManager():
    """Controls the output formats and streams from this function. Valid output
    targets are:
    Kinesis: for processing in the enhancement pipeline and epub storage
    SQS: For queuing and processing by the ElasticSearch manager"""

    KINESIS_CLIENT = createAWSClient('kinesis')

    def __init__(self):
        pass

    @classmethod
    def putKinesis(cls, data, stream, recType='work'):
        """Puts records into a Kinesis stream for processing by other parts of
        the SFR data pipeline. Takes data as an object and converts it into a
        JSON string. This is then passed to the specified stream.

        This will raise any error, as failure to pass an output should halt the
        process."""

        logger.info('Writing results to Kinesis')
        outputObject = {'status': 200, 'data': data, 'type': recType}
Example #20
from json.decoder import JSONDecodeError
import os
import requests

from .abstractReader import AbsSourceReader
from lib.models.metRecord import MetItem
from helpers.logHelpers import createLog

logger = createLog('metReader')


class MetReader(AbsSourceReader):
    INDEX_URL = 'https://libmma.contentdm.oclc.org/digital/api/search/collection/p15324coll10/order/title/ad/asc/page/{}/maxRecords/50'
    ITEM_API = 'https://libmma.contentdm.oclc.org/digital/api/collections/p15324coll10/items/{}/false'
    def __init__(self, updateSince):
        self.updateSince = updateSince
        self.startPage = 1
        self.stopPage = 48
        self.source = 'Metropolitan Museum of Art'
        self.works = []
        self.itemIDs = []
    
    def collectResourceURLs(self):
        logger.info('Fetching records from MET Digital Collections')
        for page in range(self.startPage, self.stopPage):
            logger.debug('Fetching page {}'.format(page))
            indexResp = requests.get(self.INDEX_URL.format(page))
            indexData = indexResp.json()
            for item in indexData['items']:
                itemID = item['itemId']
                logger.debug('Found record with ID {}'.format(itemID))
Example #21
from collections import defaultdict

from lib.importers.workImporter import WorkImporter
from lib.importers.instanceImporter import InstanceImporter
from lib.importers.itemImporter import ItemImporter
from lib.importers.accessImporter import AccessReportImporter
from lib.importers.coverImporter import CoverImporter
from lib.outputManager import OutputManager

from helpers.logHelpers import createLog

logger = createLog('db_manager')


class DBManager:
    # Load Updaters for specific types of records, all based on AbstractUpdater
    IMPORTERS = {
        'work': WorkImporter,
        'instance': InstanceImporter,
        'item': ItemImporter,
        'access_report': AccessReportImporter,
        'cover': CoverImporter
    }

    def __init__(self, session):
        self.session = session
        self.logger = logger
        self.kinesisMsgs = defaultdict(list)
        self.sqsMsgs = defaultdict(list)

    def importRecord(self, record):
Example #22
import re
import pycountry
import requests
from requests.exceptions import ConnectionError, MissingSchema, InvalidURL
from urllib.parse import quote_plus

from helpers.errorHelpers import MARCXMLError, DataError
from helpers.logHelpers import createLog

from lib.linkParser import LinkParser
from lib.dataModel import (WorkRecord, Identifier, Subject, Language,
                           InstanceRecord, Date, Agent, Format, Rights, Link)

logger = createLog('marc_parser')

SUBJECT_INDICATORS = {
    '0': 'lcsh',
    '1': 'lcch',
    '2': 'msh',
    '3': 'nalsaf',
    '4': None,
    '5': 'csh',
    '6': 'rvm',
}


def parseMARC(records, marcRels):
    """Accepts list of MARCXML records and invokes the parser for each. If
    an error occurs None is returned and filter() removes them from the list
    """
    logger.info('Transforming MARCXML records into SFR objects')
Example #23
from datetime import datetime
import pycountry
import requests
from urllib.parse import quote_plus

from ..dataModel import (WorkRecord, InstanceRecord, Format, Identifier,
                         Language, Agent, Subject, Link, Date, Rights)
from helpers.logHelpers import createLog

logger = createLog('metItem')


class MetItem(object):
    ROOT_URL = 'https://libmma.contentdm.oclc.org/digital'
    ITEM_UI = 'https://libmma.contentdm.oclc.org/digital/collection/p15324coll10/id/{}/rec/1'
    SFR_CROSSWALK = {
        'title': [
            {
                'level': 'work',
                'field': 'title'
            },
            {
                'level': 'instance',
                'field': 'title'
            },
        ],
        'creato': [{
            'level': 'work',
            'field': 'author'
        }],
        'descri': [{
Example #24
    def __init__(self, record):
        self.logger = createLog('CoverParse')
        self.remoteURL = record.get('url', None)
        self.source = record.get('source', 'unk')
        self.sourceID = record.get('identifier', None)
        self.s3CoverURL = None
Example #25
from datetime import datetime
import re
import requests
from urllib.parse import quote_plus

from helpers.logHelpers import createLog
from helpers.errorHelpers import HoldingError
from lib.dataModel import InstanceRecord, Agent, Link, Identifier
from lib.parsers.parse856Holding import HoldingParser

logger = createLog('classify_parse')

MEASUREMENT_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

SUBJECT_INDICATORS = {
    '0': 'lcsh',
    '1': 'lcch',
    '2': 'msh',
    '3': 'nalsaf',
    '4': None,
    '5': 'csh',
    '6': 'rvm',
}


def readFromMARC(marcRecord):
    """Parse marcalyx Record object representing oclc record"""
    logger.debug('Parsing Returned Edition')

    instance = InstanceRecord()
Example #26
from datetime import datetime
import os

from .abstractUpdater import AbstractUpdater
from sfrCore import Instance
from helpers.errorHelpers import DBError
from helpers.logHelpers import createLog

logger = createLog('instanceUpdater')


class InstanceUpdater(AbstractUpdater):
    def __init__(self, record, session, kinesisMsgs, sqsMsgs):
        self.data = record.get('data')
        self.attempts = int(record.get('attempts', 0))
        self.instance = None
        self.logger = self.createLogger()
        super().__init__(record, session, kinesisMsgs, sqsMsgs)

    @property
    def identifier(self):
        return self.instance.id

    def lookupRecord(self):
        existingID = Instance.lookup(
            self.session,
            self.data.get('identifiers', []),
            self.data.get('volume', None),
            self.data.pop('primary_identifier', None)
        )
        if existingID is None:
Example #27
from copy import copy
from datetime import datetime
import json
import os

from .abstractUpdater import AbstractUpdater
from sfrCore import Link
from helpers.errorHelpers import DBError
from helpers.logHelpers import createLog

logger = createLog('coverUpdater')


class CoverUpdater(AbstractUpdater):
    def __init__(self, record, session, kinesisMsgs, sqsMsgs):
        self.data = record.get('data')
        self.attempts = int(record.get('attempts', 0))
        self.link = None
        self.logger = self.createLogger()
        super().__init__(record, session, kinesisMsgs, sqsMsgs)

    @property
    def identifier(self):
        return self.link.id

    def lookupRecord(self):
        currentURL = self.data.pop('originalURL', None)
        self.logger.debug('Updating Cover from {}'.format(
            currentURL if currentURL else '[unknown]'
        ))
        dbURL = Link.httpRegexSub(currentURL)
Example #28
# stdnbr
LOOKUP_IDENTIFIERS = [
    'oclc',  # OCLC Number
    'isbn',  # ISBN (10 or 13)
    'issn',  # ISSN
    'swid',  # OCLC Work Identifier
]

IDENTIFIER_TYPES = {
    'oclc': OCLC,
    'swid': OWI,
    'isbn': ISBN,
    'issn': ISSN,
}

logger = createLog('query_constructor')


def queryWork(session, work, workUUID):
    """This takes a work record that has not been queried for enhanced data
    and begins that process. It extracts one of two things from the work record
    to allow for this lookup.
    If it contains an identifier in the list defined
    in LOOKUP_IDENTIFIERS, it will pass that identifier to the Kinesis stream.
    If not, it will pass the author and title of the work.
    It will also pass the UUID of the database record, which will be used to
    match the returned data with the existing record."""

    lookupIDs = getIdentifiers(session, work)
    classifyQueries = []
Example #29
from datetime import datetime
import os

from .abstractUpdater import AbstractUpdater
from sfrCore import Item
from helpers.errorHelpers import DBError
from helpers.logHelpers import createLog

logger = createLog('itemUpdater')


class ItemUpdater(AbstractUpdater):
    def __init__(self, record, session, kinesisMsgs, sqsMsgs):
        self.data = record.get('data')
        self.attempts = int(record.get('attempts', 0))
        self.item = None
        self.logger = self.createLogger()
        super().__init__(record, session, kinesisMsgs, sqsMsgs)

    @property
    def identifier(self):
        return self.item.id

    def lookupRecord(self):
        primaryID = self.data.get('primary_identifier', None)
        self.logger.debug('Ingesting Item #{}'.format(
            primaryID['identifier'] if primaryID else 'unknown'))
        self.item = Item.lookup(self.session, self.data.get('identifiers', []),
                                primaryID)

        if self.item is None:
Example #30
from datetime import datetime, timedelta
import os
from sqlalchemy import text
from sfrCore import Instance

from .fetchers.openLibraryFetcher import OLCoverFetcher
from .fetchers.googleBooksFetcher import GBCoverFetcher
from .fetchers.contentCafeFetcher import CCCoverFetcher
from .cover import SFRCover
from .outputManager import OutputManager
from helpers.logHelpers import createLog

logger = createLog('coverManager')


class CoverManager:
    """Manager class for finding cover images for Instance records and
    returning the resulting Cover objects to the database ingest manager.

    Methods:
    getInstancesForSearch -- Retrieve cover-less Instances from the database
    getCoversForInstances -- Search fetchers for covers and generate covers
    queryFetchers -- Query defined fetchers and break if a cover is found
    getValidIDs -- Parse list of identifiers for Instance into usable types
    sendCoversToKinesis -- Place covers in stream for database manager
    """
    def __init__(self, manager, olManager):
        """Initialize CoverManager with database managers and fetchers. This
        generates a logger, sets the update period and creates the array of
        fetcher objects which are used to retrieve cover URIs.