Example #1
def parseRecord(encodedRec, outManager):
    """Parse an individual record. Verifies that an object was able to be
    decoded from the input base64 encoded string and if so, hands this to the
    enhancer method"""
    try:
        record = json.loads(encodedRec['body'])
    except json.decoder.JSONDecodeError as jsonErr:
        logger.error('Invalid JSON block received')
        logger.error(jsonErr)
        raise DataError('Malformed JSON block received from SQS')
    except KeyError as err:
        logger.error('Missing body attribute in SQS message')
        logger.debug(err)
        raise DataError('Body object missing from SQS message')

    logger.info('Storing cover from {}'.format(record['url']))

    coverParser = CoverParse(record)
    coverParser.storeCover()

    outManager.putKinesis(
        {
            'originalURL': coverParser.remoteURL.lower(),
            'storedURL': coverParser.s3CoverURL
        },
        os.environ['DB_UPDATE_STREAM'],
        recType='cover')

    return coverParser.s3CoverURL
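The decode guard above is the recurring pattern in these examples. A minimal, self-contained sketch of just that guard, with DataError and the SQS message shape defined locally as stand-ins for the project's own classes:

import json
import logging

logger = logging.getLogger(__name__)

class DataError(Exception):
    # Stand-in for the project's DataError; stores the message attribute
    # that other examples below read back
    def __init__(self, message):
        super().__init__(message)
        self.message = message

def loadMessageBody(encodedRec):
    # Mirrors the decode guard in parseRecord above
    try:
        return json.loads(encodedRec['body'])
    except json.decoder.JSONDecodeError as jsonErr:
        logger.error('Invalid JSON block received')
        logger.error(jsonErr)
        raise DataError('Malformed JSON block received from SQS')
    except KeyError as err:
        logger.error('Missing body attribute in SQS message')
        logger.debug(err)
        raise DataError('Body object missing from SQS message')

print(loadMessageBody({'body': '{"url": "example.com/cover.jpg"}'}))
# Both of the following raise DataError:
#   loadMessageBody({'body': '{"bad": json'})
#   loadMessageBody({'notBody': '{}'})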
Example #2
def parseRecord(encodedRec):
    """Parse an individual record. Verifies that an object was able to be
    decoded from the input base64 encoded string and if so, hands this to the
    enhancer method"""
    try:
        record = json.loads(encodedRec['body'])
        logger.info('Creating editions for work {}'.format(
            record['identifier']
        ))
        
        try:
            clustManager = ClusterManager(record, MANAGER)
            clustManager.clusterInstances()
            clustManager.deleteExistingEditions()
            clustManager.storeEditions()
        except Exception as err:  # noqa: Q000
            # There are a large number of SQLAlchemy errors that can be thrown
            # These should be handled elsewhere, but this should catch anything
            # and rollback the session if we encounter something unexpected
            MANAGER.session.rollback()  # Rollback current record only
            logger.error('Failed to store record {}'.format(
                record['identifier']
            ))
            logger.debug(err)
            logger.debug(traceback.format_exc())
            # Report the source identifier here; clustManager.work may not
            # be set if the failure occurred before clustering completed
            return ('failure', record['identifier'])

        session = MANAGER.createSession()
        session.add(clustManager.work)
        esManager = ElasticManager(clustManager.work)
        esManager.enhanceWork()
        esManager.saveWork()
        session.close()
        return ('success', '{}|{}'.format(
            clustManager.work.uuid,
            clustManager.work.title
        ))

    except json.decoder.JSONDecodeError as jsonErr:
        logger.error('Invalid JSON block received')
        logger.error(jsonErr)
        raise DataError('Malformed JSON block received from SQS')
    except KeyError as err:
        logger.error('Missing body attribute in SQS message')
        logger.debug(err)
        raise DataError('Body object missing from SQS message')
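The broad except clause above exists to guarantee a per-record rollback. A sketch of that guarantee with stand-in session and manager objects (ClusterManager and the real MANAGER are not reproduced here):

import logging
import traceback

logger = logging.getLogger(__name__)

class FakeSession:
    def rollback(self):
        print('session rolled back')

class FakeManager:
    session = FakeSession()

MANAGER = FakeManager()

def failingStore(record):
    raise RuntimeError('simulated SQLAlchemy error')

def storeRecord(record, storeFn):
    try:
        storeFn(record)
        return ('success', record['identifier'])
    except Exception:
        # Roll back the current record only, as in Example #2
        MANAGER.session.rollback()
        logger.debug(traceback.format_exc())
        return ('failure', record['identifier'])

print(storeRecord({'identifier': 'abc'}, lambda rec: None))  # success
print(storeRecord({'identifier': 'xyz'}, failingStore))      # failure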
Example #3
    def addClassItem(self, listAttrib, classType, **identifierDict):
        if listAttrib not in dir(self):
            raise DataError('Field {} not valid for {}'.format(
                listAttrib,
                self.__class__.__name__
            ))
        self[listAttrib].append(classType.createFromDict(**identifierDict))
Example #4
    def test_record_parse_write_err(self, mockManager, mockSession):
        testRec = base64.b64encode(json.dumps({
            'status': 200,
            'data': 'data'
        }).encode('utf-8'))
        mockManager.importRecord.side_effect = DataError('test err')
        res = self.parseRecord({'kinesis': {'data': testRec}}, mockManager)
        self.assertNotEqual(res, True)
Example #5
def parseRecord(encodedRec):
    """Parse an individual record. Verifies that an object was able to be
    decoded from the input base64 encoded string and if so, hands this to the
    enhancer method"""
    try:
        record = json.loads(encodedRec['body'])
        return enhanceRecord(record)
    except json.decoder.JSONDecodeError as jsonErr:
        logger.error('Invalid JSON block received')
        logger.error(jsonErr)
        raise DataError('Malformed JSON block received from SQS')
    except KeyError as err:
        logger.error('Missing body attribute in SQS message')
        logger.debug(err)
        raise DataError('Body object missing from SQS message')
    except (DataError, OCLCError) as err:
        logger.error(err.message)

    return False
Example #6
def readFromClassify(workXML, workUUID):
    """Parse Classify XML document into a object that complies with the
    SFR data model. Accepts a single XML document and returns a WorkRecord."""
    logger.debug('Parsing Returned Work')

    work = workXML.find('.//work', namespaces=NAMESPACE)
    start = workXML.find('.//start', namespaces=NAMESPACE)

    oclcTitle = work.get('title')
    oclcNo = Identifier('oclc', work.text, 1)
    owiNo = Identifier('owi', work.get('owi'), 1)

    if OutputManager.checkRecentQueries('lookup/{}/{}/{}'.format(
        'owi', work.get('owi'), start.text
    )) is True:
        raise DataError('Work {} with OWI {} already classified'.format(
            workUUID, work.get('owi')
        ))

    measurements = []
    for measure in ['editions', 'holdings', 'eholdings']:
        measurements.append(Measurement(
            measure,
            work.get(measure),
            1,
            MEASUREMENT_TIME,
            work.text
        ))

    authors = workXML.findall('.//author', namespaces=NAMESPACE)
    authorList = list(map(parseAuthor, authors))

    editions = workXML.findall('.//edition', namespaces=NAMESPACE)
    editionList = loadEditions(editions)

    headings = workXML.findall('.//heading', namespaces=NAMESPACE)
    headingList = list(map(parseHeading, headings))

    workDict = {
        'title': oclcTitle,
        'agents': authorList,
        'instances': editionList,
        'subjects': headingList,
        'identifiers': [
            oclcNo,
            owiNo
        ],
        'measurements': measurements
    }

    instanceCount = int(work.get('editions', 0))

    return WorkRecord.createFromDict(**workDict), instanceCount, work.text
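readFromClassify depends entirely on the shape of the Classify XML response. A rough, runnable sketch of the same extraction against a tiny inline document; the element names follow the code above, but the sample values are invented and the namespace handling is omitted:

from xml.etree import ElementTree

SAMPLE = '''
<classify>
  <work owi="12345" title="Pride and Prejudice" editions="2"
        holdings="10" eholdings="4">37572894</work>
  <start>0</start>
</classify>
'''

root = ElementTree.fromstring(SAMPLE)
work = root.find('.//work')
start = root.find('.//start')

print(work.get('title'))                         # the OCLC title
print(('oclc', work.text), ('owi', work.get('owi')))
for measure in ['editions', 'holdings', 'eholdings']:
    print(measure, work.get(measure))            # measurement values
print(int(work.get('editions', 0)))              # instance count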
Example #7
    @classmethod
    def createFromDict(cls, **kwargs):
        """Take a standard dict object and convert to an instance of the
        provided class. Allows for creation of new instances with arbitrary
        fields set"""
        record = cls()
        for field, value in kwargs.items():
            if field not in dir(record):
                raise DataError('Field {} not valid for {}'.format(
                    field, cls.__name__))
            record[field] = value

        return record
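Because createFromDict validates incoming fields against dir(), the DataError path is easy to see with a dict-backed stand-in class (Agent and its fields here are illustrative, not from the source):

class DataError(Exception):
    pass

class Agent(dict):
    # Declaring valid fields as class attributes makes them visible to dir()
    name = None
    role = None

    @classmethod
    def createFromDict(cls, **kwargs):
        record = cls()
        for field, value in kwargs.items():
            if field not in dir(record):
                raise DataError('Field {} not valid for {}'.format(
                    field, cls.__name__))
            record[field] = value
        return record

print(Agent.createFromDict(name='Jane Austen', role='author'))
try:
    Agent.createFromDict(publisher='unknown')
except DataError as err:
    print(err)  # Field publisher not valid for Agent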
Example #8
    def clusterInstances(self):
        session = self.dbManager.createSession()
        self.work = self.fetchWork(session)
        self.logger.info('Creating editions for {}'.format(self.work))

        if len(self.work.instances) < 1:
            raise DataError('Work Record has no attached instance Records')

        mlModel = KModel(self.work.instances)
        mlModel.createDF()
        session.close()
        mlModel.generateClusters()
        self.editions = mlModel.parseEditions()
Example #9
    def buildDataModel(self, countryCodes):
        logger.debug('Generating work record for bib record {}'.format(
            self.ingest['bib_key']
        ))

        # If we don't have a valid rights code, this means that the row has
        # been improperly formatted (generally fields out of order/misplaced)
        # Raise an error if this is found to be true
        if self.ingest['rights_statement'] not in HathiRecord.rightsReasons:
            raise DataError(
                '{} is malformed (columns missing or incorrect)'.format(
                    self.ingest['htid']
                )
            )

        self.buildWork()

        logger.debug('Generating instance record for hathi record {}'.format(
            self.ingest['htid']
        ))
        self.buildInstance(countryCodes)

        logger.debug('Generating an item record for hathi record {}'.format(
            self.ingest['htid']
        ))
        self.buildItem()

        logger.debug(
            'Generating a rights object for the associated rights '
            'statement {}'.format(self.ingest['rights'])
        )

        # Generate a stand-alone rights object that contains the hathi
        # generated rights information
        self.createRights()

        for agent in self.work.agents:
            self.getVIAF(agent)

        for instance in self.work.instances:
            for agent in instance.agents:
                self.getVIAF(agent)
            for item in instance.formats:
                for agent in item.agents:
                    self.getVIAF(agent)
Example #10
    def generateIdentifierURL(self):
        """Creates a query based on an identifier and its type. If either
        field is missing for this request, default to an author/title search.
        """
        if self.recID is not None and self.recType is not None:
            if self.recType not in QueryManager.LOOKUP_IDENTIFIERS:
                raise DataError(
                    'Unrecognized/invalid identifier type {} received'.format(
                        self.recType
                    )
                )
            self.query = "{}?{}={}".format(
                QueryManager.CLASSIFY_ROOT,
                self.recType,
                self.recID
            )
            self.addClassifyOptions()
        else:
            self.generateAuthorTitleURL()
Example #11
    def generateAuthorTitleURL(self):
        """Generates an author/title query for Classify.

        Raises:
            DataError: Raised if no author is received, which can cause
            unexpectedly large results to be returned for a query.
        """
        if self.author is None or self.title is None:
            raise DataError('Author and title required for search')

        self.cleanTitle()

        titleAuthorParam = 'title={}&author={}'.format(self.title, self.author)

        self.query = "{}?{}".format(
            QueryManager.CLASSIFY_ROOT,
            titleAuthorParam
        )

        self.addClassifyOptions()
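Both query builders reduce to string formatting against the Classify root URL. A standalone sketch of the author/title form; CLASSIFY_ROOT is assumed here, and the plain formatting mirrors the code above even though urlencode would be safer for titles containing reserved characters:

CLASSIFY_ROOT = 'http://classify.oclc.org/classify2/Classify'  # assumed value

def authorTitleQuery(title, author):
    if author is None or title is None:
        raise ValueError('Author and title required for search')
    # urllib.parse.urlencode({'title': title, 'author': author}) would
    # percent-encode reserved characters, which the original skips
    return '{}?{}'.format(
        CLASSIFY_ROOT,
        'title={}&author={}'.format(title, author)
    )

print(authorTitleQuery('Persuasion', 'Austen, Jane'))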
Example #12
def parseHoldingURI(uri):
    logger.info('Loading URI {}'.format(uri))
    try:
        uriHead = requests.head(uri, allow_redirects=False)
        headers = uriHead.headers
    except (MissingSchema, ConnectionError, InvalidURL):
        raise DataError('Invalid Holding URL')

    if uriHead.status_code in [301, 302, 307, 308]:
        redirectTo = headers['Location']
        logger.debug('Found {} Redirect to {}'.format(uriHead.status_code,
                                                      redirectTo))
        return parseHoldingURI(redirectTo)

    try:
        contentType = headers['Content-Type']
    except KeyError:
        logger.warning('Unable to find header Content-Type for {}'.format(uri))
        contentType = 'text/html'

    return uri, contentType
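The redirect chase in parseHoldingURI can be simulated without the network. A sketch with a stubbed lookup table standing in for requests.head (the table contents are invented):

# Fake HEAD responses: uri -> (status_code, headers)
FAKE_RESPONSES = {
    'http://example.org/old': (301, {'Location': 'http://example.org/new'}),
    'http://example.org/new': (200, {'Content-Type': 'application/pdf'}),
    'http://example.org/bare': (200, {}),
}

def resolveHolding(uri):
    status, headers = FAKE_RESPONSES[uri]
    if status in [301, 302, 307, 308]:
        # Follow the redirect recursively, as above
        return resolveHolding(headers['Location'])
    # Default the content type when the header is absent
    return uri, headers.get('Content-Type', 'text/html')

print(resolveHolding('http://example.org/old'))
# ('http://example.org/new', 'application/pdf')
print(resolveHolding('http://example.org/bare'))
# ('http://example.org/bare', 'text/html')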
Example #13
    def loadMARCRelators(self):
        """DOAB identifies contributors to its records using the MARC Relator
        codes. These are not available in a library anywhere and as a result
        these must be translated to human-readable formats. This parses the
        LoC's provided XML file into a dictionary of translated codes.
        """
        relRes = requests.get(self.relators_file)
        if relRes.status_code != 200:
            logger.error('Failed to load MARC21 Relator Authority')
            logger.debug(relRes.text)
            raise DataError('Unable to load necessary MARC21 Authority')

        relJSON = json.loads(relRes.content)

        terms = {}
        rdfLabel = 'http://www.loc.gov/mads/rdf/v1#authoritativeLabel'
        for rel in relJSON:
            try:
                code = rel['@id'].split('/')[-1]
                terms[code] = rel[rdfLabel][0]['@value']
            except KeyError:
                continue

        return terms
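The JSON-LD walk above reduces to pulling a code from each entry's @id and a label from its authoritativeLabel property. A sketch against an inline sample that mimics the LoC structure the code expects; entries missing either key are skipped:

rdfLabel = 'http://www.loc.gov/mads/rdf/v1#authoritativeLabel'

sampleRelators = [
    {
        '@id': 'http://id.loc.gov/vocabulary/relators/aut',
        rdfLabel: [{'@value': 'Author'}]
    },
    {
        '@id': 'http://id.loc.gov/vocabulary/relators/edt',
        rdfLabel: [{'@value': 'Editor'}]
    },
    {'@id': 'http://id.loc.gov/vocabulary/relators/xxx'}  # no label; skipped
]

terms = {}
for rel in sampleRelators:
    try:
        code = rel['@id'].split('/')[-1]
        terms[code] = rel[rdfLabel][0]['@value']
    except KeyError:
        continue

print(terms)  # {'aut': 'Author', 'edt': 'Editor'}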
Example #14
    def test_row_parse_data_error(self, mock_hathi):
        mock_hathi().buildDataModel.side_effect = DataError('Test Error')
        with self.assertRaises(ProcessingError):
            rowParser(['row1'], ['htid'], {})
Example #15
def enhanceRecord(record):
    """Takes a single input record and retrieves data from the OCLC Classify
    service. Manages the overall workflow of the function."""

    try:
        workUUID = record['uuid']
        searchType = record['type']
        searchFields = record['fields']
        startPos = record.get('start', 0)
    except KeyError as e:
        logger.error('Missing attribute in data block!')
        logger.debug(e)
        raise DataError('Required attribute missing from data block')
    except TypeError as e:
        logger.error('Could not read data from source')
        logger.debug(e)
        raise DataError('Kinesis data contains non-dictionary value')

    logger.info('Starting to enhance work record {}'.format(workUUID))

    try:
        # Step 1: Generate a set of XML records retrieved from Classify
        # This step also adds the oclc identifiers to the sourceData record
        classifyData = classifyRecord(searchType,
                                      searchFields,
                                      workUUID,
                                      start=startPos)

        # Step 2: Parse the data received from Classify into the SFR data model
        classifiedWork, instanceCount, oclcNo = readFromClassify(
            classifyData, workUUID)
        logger.debug('Instances found {}'.format(instanceCount))
        if instanceCount > 500:
            iterStop = startPos + instanceCount
            if instanceCount > 1500:
                iterStop = startPos + 1500
            for i in range(startPos + 500, iterStop, 500):
                classifyPage = classifyRecord(searchType,
                                              searchFields,
                                              workUUID,
                                              start=i)
                extractAndAppendEditions(classifiedWork, classifyPage)

        if instanceCount > startPos + 1500:
            OutputManager.putQueue(
                {
                    'type': 'identifier',
                    'uuid': workUUID,
                    'fields': {
                        'idType': 'oclc',
                        'identifier': oclcNo,
                        'start': startPos + 1500
                    }
                }, os.environ['CLASSIFY_QUEUE'])

        # This sets the primary identifier for processing by the db manager
        classifiedWork.primary_identifier = Identifier('uuid', workUUID, 1)

        # Step 3: Output this block to kinesis
        outputObject = {
            'status': 200,
            'type': 'work',
            'method': 'update',
            'data': classifiedWork
        }
        while len(classifiedWork.instances) > 100:
            instanceChunk = classifiedWork.instances[0:100]
            del classifiedWork.instances[0:100]
            OutputManager.putKinesis(
                {
                    'status': 200,
                    'type': 'work',
                    'method': 'update',
                    'data': {
                        'instances': instanceChunk,
                        'primary_identifier': Identifier('uuid', workUUID, 1)
                    }
                }, os.environ['OUTPUT_KINESIS'], workUUID)
        OutputManager.putKinesis(outputObject, os.environ['OUTPUT_KINESIS'],
                                 workUUID)

    except OCLCError as err:
        logger.error('OCLC Query for work {} failed with message: {}'.format(
            workUUID, err.message))
        raise err

    return True
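The paging arithmetic in enhanceRecord caps each invocation at three additional 500-record pages (1,500 instances) and re-queues the remainder. A sketch of just that window calculation, assuming the same page size:

def classifyPageStarts(startPos, instanceCount, pageSize=500, maxPages=3):
    # Offsets fetched in this invocation beyond the first page, plus the
    # offset to re-queue from (None if everything fits)
    iterStop = startPos + min(instanceCount, pageSize * maxPages)
    starts = list(range(startPos + pageSize, iterStop, pageSize))
    requeueAt = None
    if instanceCount > startPos + pageSize * maxPages:
        requeueAt = startPos + pageSize * maxPages
    return starts, requeueAt

print(classifyPageStarts(0, 400))    # ([], None) - first page suffices
print(classifyPageStarts(0, 1200))   # ([500, 1000], None)
print(classifyPageStarts(0, 4000))   # ([500, 1000], 1500) - re-queued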
Example #16
    def parseMessage(self, record):
        self.idType = record.get('type', 'uuid')
        self.identifier = record.get('identifier', None)
        if self.identifier is None:
            self.logger.error('Missing identifier from SQS message')
            raise DataError('Missing identifier for invocation')
Example #17
def parseRecord(encodedRec, updater):
    """Handles each individual record by parsing JSON from the base64 encoded
    string received from the Kinesis stream, creating a database session and
    inserting/updating the database to reflect this new data source. It will
    rollback changes if an error is encountered
    """
    try:
        record = json.loads(base64.b64decode(encodedRec['kinesis']['data']))
        statusCode = record['status']
        if statusCode != 200:
            if statusCode == 204:
                logger.info('No updates received')
                raise NoRecordsReceived(
                    'No records received from {}'.format(record['source']),
                    record
                )
            else:
                logger.error('Received error from pipeline')
                logger.debug(record)
                raise DataError('Received non-200 status code')
    except json.decoder.JSONDecodeError as jsonErr:
        logger.error('Invalid JSON block received')
        logger.error(jsonErr)
        raise DataError('Invalid JSON block')
    except (UnicodeDecodeError, binascii.Error) as b64Err:
        logger.error('Invalid data found in base64 encoded block')
        logger.debug(b64Err)
        raise DataError('Error in base64 encoding of record')

    outRec = None
    try:
        MANAGER.startSession()  # Start transaction
        outRec = updater.importRecord(deepcopy(record))
        MANAGER.commitChanges()
    except OperationalError as opErr:
        logger.error('Conflicting updates caused deadlock, retry')
        logger.debug(opErr)
        OutputManager.putKinesis(
            record.get('data'),
            os.environ['UPDATE_STREAM'],
            recType=record.get('type', 'work'),
        )
        MANAGER.session.rollback()  # Rollback current record only
    except IntegrityError as intErr:
        logger.error('Unique constraint violated, retry')
        logger.debug(intErr)
        OutputManager.putKinesis(
            record.get('data'),
            os.environ['UPDATE_STREAM'],
            recType=record.get('type', 'work'),
        )
        MANAGER.session.rollback()  # Rollback current record only
    except Exception as err:  # noqa: Q000
        # There are a large number of SQLAlchemy errors that can be thrown
        # These should be handled elsewhere, but this should catch anything
        # and rollback the session if we encounter something unexpected
        logger.error('Failed to store record')
        logger.debug(err)
        logger.debug(traceback.format_exc())
        MANAGER.session.rollback()  # Rollback current record only

    return outRec
Example #18
    def test_DataError(self):
        testDataError = DataError('testMessage')
        assert testDataError.message == 'testMessage'
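The assertion above implies that DataError stores its message as an attribute, which matches how err.message is read in Examples #5 and #15. A plausible definition consistent with that behavior (the actual class may differ):

class DataError(Exception):
    def __init__(self, message):
        super().__init__(message)
        self.message = message

testDataError = DataError('testMessage')
assert testDataError.message == 'testMessage'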
Example #19
class TestHandler(unittest.TestCase):
    @patch.multiple(SessionManager,
                    generateEngine=DEFAULT,
                    decryptEnvVar=DEFAULT)
    def setUp(self, generateEngine, decryptEnvVar):
        from service import handler, parseRecords, parseRecord
        self.handler = handler
        self.parseRecords = parseRecords
        self.parseRecord = parseRecord

    @patch('service.parseRecords', return_value=True)
    def test_handler_clean(self, mock_parse):
        testRec = {
            'source': 'Kinesis',
            'Records': [{
                'kinesis': {
                    'data': 'data'
                }
            }]
        }
        resp = self.handler(testRec, None)
        self.assertTrue(resp)

    def test_handler_error(self):
        testRec = {'source': 'Kinesis', 'Records': []}
        with self.assertRaises(NoRecordsReceived):
            self.handler(testRec, None)

    def test_records_none(self):
        testRec = {'source': 'Kinesis'}
        with self.assertRaises(NoRecordsReceived):
            self.handler(testRec, None)

    @patch('service.parseRecord', side_effect=[1, 2, 3])
    @patch('service.MANAGER')
    @patch('service.DBUpdater')
    def test_parseRecords_success(self, mockUpdater, mockManager, mockParse):
        recResults = self.parseRecords(['rec1', 'rec2', 'rec3'])
        self.assertEqual(recResults, [1, 2, 3])
        mockManager.closeConnection.assert_called_once()
        self.assertEqual(mockParse.call_count, 3)

    @patch('service.parseRecord', side_effect=[1, DataError('testing'), 3])
    @patch('service.MANAGER')
    def test_parseRecords_error(self, mockManager, mockParse):
        recResults = self.parseRecords(['rec1', 'rec2', 'rec3'])
        self.assertEqual(recResults[0], 1)
        self.assertEqual(len(recResults), 1)
        mockManager.closeConnection.assert_called_once()

    @patch('service.MANAGER')
    def test_parseRecord_success(self, mockManager):
        encStr = b64encode(
            json.dumps({
                'status': 200,
                'source': 'testing'
            }).encode('utf-8'))
        testRecord = {'kinesis': {'data': encStr}}
        mockUpdater = MagicMock()
        mockUpdater.importRecord.return_value = 'import_record'
        importRec = self.parseRecord(testRecord, mockUpdater)
        mockManager.startSession.assert_called_once()
        mockManager.commitChanges.assert_called_once()
        self.assertEqual(importRec, 'import_record')

    @patch('service.MANAGER')
    def test_parseRecord_dbErr(self, mockManager):
        encStr = b64encode(
            json.dumps({
                'status': 200,
                'source': 'testing'
            }).encode('utf-8'))
        testRecord = {'kinesis': {'data': encStr}}
        mockUpdater = MagicMock()
        mockUpdater.importRecord.side_effect = OperationalError
        importRec = self.parseRecord(testRecord, mockUpdater)
        mockManager.startSession.assert_called_once()
        mockManager.commitChanges.assert_not_called()
        mockManager.session.rollback.assert_called_once()
        self.assertEqual(importRec, None)

    def test_parseRecord_noRecordsErr(self):
        encStr = b64encode(
            json.dumps({
                'status': 204,
                'source': 'testing'
            }).encode('utf-8'))
        testRecord = {'kinesis': {'data': encStr}}
        with self.assertRaises(NoRecordsReceived):
            self.parseRecord(testRecord, 'updater')

    def test_parseRecord_otherRecordErr(self):
        encStr = b64encode(
            json.dumps({
                'status': 500,
                'source': 'testing'
            }).encode('utf-8'))
        testRecord = {'kinesis': {'data': encStr}}
        with self.assertRaises(DataError):
            self.parseRecord(testRecord, 'updater')

    def test_parseRecord_jsonErr(self):
        encStr = b64encode('{"bad: "json"}'.encode('utf-8'))
        testRecord = {'kinesis': {'data': encStr}}
        with self.assertRaises(DataError):
            self.parseRecord(testRecord, 'updater')

    def test_parseRecord_b64Err(self):
        encStr = json.dumps({'bad': 'base64'}).encode('utf-8')
        testRecord = {'kinesis': {'data': encStr}}
        with self.assertRaises(DataError):
            self.parseRecord(testRecord, 'updater')
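All of the tests above build Kinesis-style payloads the same way. A small sketch of that round trip, including the two failure modes the DataError tests target:

import base64
import binascii
import json

payload = {'status': 200, 'source': 'testing'}
encStr = base64.b64encode(json.dumps(payload).encode('utf-8'))
record = {'kinesis': {'data': encStr}}

# Happy path: decode base64, then parse JSON
print(json.loads(base64.b64decode(record['kinesis']['data'])))

# Failure mode 1: valid base64 wrapping malformed JSON
badJSON = base64.b64encode('{"bad: "json"}'.encode('utf-8'))
try:
    json.loads(base64.b64decode(badJSON))
except json.decoder.JSONDecodeError as err:
    print('JSONDecodeError:', err)

# Failure mode 2: bytes that were never base64 encoded; non-alphabet
# characters are discarded, leaving an invalid length
try:
    base64.b64decode(json.dumps({'bad': 'base64'}).encode('utf-8'))
except binascii.Error as err:
    print('binascii.Error:', err)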