Example #1
0
    def matchEbook(self):
        """Match self.uri against the known ebook source patterns and, on a
        hit, attach an ebook format record to the current instance.

        Returns True when a format was added, None when the link was handed
        off to the HathiTrust parser or the Internet Archive item is not
        publicly accessible.
        """
        for sourceName, pattern in self.EBOOK_REGEX.items():
            self.source = sourceName
            if not re.search(pattern, self.uri):
                continue

            # Check if link is accessible (e.g. public domain/open source)
            if sourceName == 'internetarchive':
                if self.checkIAStatus() is True:
                    return None
                linkID = Identifier(
                    identifier='ia.{}'.format(self.identifier),
                    source=None
                )
            elif sourceName == 'hathitrust':
                # Hathi links get their own parsing path; nothing added here
                self.parseHathiLink()
                return None
            else:
                linkID = Identifier(
                    identifier=self.identifier,
                    source='gutenberg'
                )

            ebookLink = self.createLink(
                self.uri, 'text/html',
                local=False, download=False, images=False, ebook=True
            )
            self.instance.addFormat(
                source=sourceName,
                content_type='ebook',
                links=[ebookLink],
                identifiers=[linkID]
            )
            return True
def readFromClassify(workXML, workUUID):
    """Parse a Classify XML document into an object that complies with the
    SFR data model.

    Args:
        workXML: parsed XML root of a single Classify response.
        workUUID: UUID of the work record being enhanced (used for logging
            and error messages only).

    Returns:
        Tuple of (WorkRecord, instanceCount, oclcNumber) where oclcNumber is
        the text of the Classify <work> element.

    Raises:
        DataError: when this OWI/start combination was already classified
            recently (deduplicated via OutputManager.checkRecentQueries).
    """
    logger.debug('Parsing Returned Work')

    work = workXML.find('.//work', namespaces=NAMESPACE)
    start = workXML.find('.//start', namespaces=NAMESPACE)

    oclcTitle = work.get('title')
    oclcNo = Identifier('oclc', work.text, 1)
    owiNo = Identifier('owi', work.get('owi'), 1)

    # Skip works classified recently to avoid duplicate downstream processing
    if OutputManager.checkRecentQueries('lookup/{}/{}/{}'.format(
        'owi', work.get('owi'), start.text
    )) is True:
        raise DataError('Work {} with OWI {} already classified'.format(
            workUUID, work.get('owi')
        ))

    # Edition/holding counts become Measurement objects tied to the OCLC no.
    measurements = [
        Measurement(measure, work.get(measure), 1, MEASUREMENT_TIME, work.text)
        for measure in ('editions', 'holdings', 'eholdings')
    ]

    authors = workXML.findall('.//author', namespaces=NAMESPACE)
    authorList = list(map(parseAuthor, authors))

    editions = workXML.findall('.//edition', namespaces=NAMESPACE)
    editionList = loadEditions(editions)

    headings = workXML.findall('.//heading', namespaces=NAMESPACE)
    headingList = list(map(parseHeading, headings))

    workDict = {
        'title': oclcTitle,
        'agents': authorList,
        'instances': editionList,
        'subjects': headingList,
        'identifiers': [oclcNo, owiNo],
        'measurements': measurements
    }

    # The 'editions' attribute doubles as the total instance count used by
    # callers to drive paged Classify retrieval
    instanceCount = int(work.get('editions', 0))

    return WorkRecord.createFromDict(**workDict), instanceCount, work.text
Example #3
0
    def matchEbook(self):
        """Match self.uri against the known ebook source patterns and, on a
        hit, attach an ebook format record to the current instance.

        Returns True when a format was added, None when the link was handed
        off to the HathiTrust parser or the Internet Archive item is not
        publicly accessible.
        """
        for sourceName, pattern in self.EBOOK_REGEX.items():
            self.source = sourceName
            if not re.search(pattern, self.uri):
                continue

            if sourceName == 'internetarchive':
                # Dark/inaccessible IA items are skipped entirely
                if self.checkIAStatus() is True:
                    return None
            elif sourceName == 'hathitrust':
                # Hathi links get their own parsing path; nothing added here
                self.parseHathiLink()
                return None

            # NOTE(review): the identifier source is always 'hathi' here even
            # though the hathitrust branch returns early above -- confirm
            # this labeling is intentional for IA/Gutenberg matches
            formatLink = self.createLink(
                self.uri,
                'text/html',
                local=False,
                download=False,
                images=False,
                ebook=True
            )
            self.instance.addFormat(
                source=sourceName,
                content_type='ebook',
                links=[formatLink],
                identifiers=[
                    Identifier(identifier=self.identifier, source='hathi')
                ]
            )
            return True
Example #4
0
    def getNewItemLinks(self, recItem):
        """Build a format dict (read-online + PDF download links) for an
        openly-accessible Hathi item.

        Returns None (implicitly) for items whose rights code marks them as
        in-copyright or undetermined.
        """
        rightsCode = recItem.get('rightsCode', 'ic')
        if rightsCode in ('ic', 'icus', 'ic-world', 'und'):
            return

        # The item URL redirects to the canonical HathiTrust location; read
        # the target from the redirect's Location header
        headResponse = requests.head(recItem['itemURL'])
        realURL = headResponse.headers['Location'].replace('https://', '')

        hathiID = re.search(self.HATHI_ID_REGEX, realURL).group(1)
        downloadURL = self.HATHI_DOWNLOAD_URL.format(hathiID)

        viewLink = HoldingParser.createLink(
            realURL,
            'text/html',
            local=False,
            download=False,
            images=True,
            ebook=False
        )
        pdfLink = HoldingParser.createLink(
            downloadURL,
            'application/pdf',
            local=False,
            download=True,
            images=True,
            ebook=False
        )

        return {
            'source': self.source,
            'content_type': 'ebook',
            'links': [viewLink, pdfLink],
            'identifiers': [Identifier(identifier=hathiID, source='hathi')]
        }
    def buildWork(self):
        """Construct the SFR Work object from the Hathi data."""
        ingest = self.ingest
        work = self.work

        work.title = ingest['title']
        work.series = ingest['description']
        logger.info('Creating work record for {}'.format(work.title))

        # The primary identifier for this work is a HathiTrust bib reference
        work.primary_identifier = Identifier(
            type='hathi', identifier=ingest['bib_key'], weight=1)
        logger.debug('Setting primary_identifier to {}'.format(
            work.primary_identifier))

        for idType, key in HathiRecord.identifierFields:
            logger.debug('Setting identifiers {}'.format(idType))
            self.parseIdentifiers(work, idType, key)

        # All government documents should be in the public_domain.
        self.parseGovDoc(ingest['gov_doc'])

        # The copyright date assigned to the work by HathiTrust
        work.addClassItem(
            'dates', Date,
            display_date=ingest['copyright_date'],
            date_range=ingest['copyright_date'],
            date_type='copyright_date'
        )
        logger.debug('Setting copyright date to {}'.format(
            ingest['copyright_date']))

        self.parseAuthor(ingest['author'])
def parseClassification(classification):
    """Parse a classification into an identifier for the work record."""
    marcTag = classification.get('tag')
    # MARC_FIELDS maps the MARC tag to the SFR identifier/subject type
    return Identifier.createFromDict(
        type=MARC_FIELDS[marcTag],
        identifier=classification.get('sfa'),
        weight=1
    )
Example #7
0
def transformMARC(record, marcRels):
    """Accepts a marcalyx object and transforms the MARC record into a SFR
    data object.

    Args:
        record: a triple of (doabID, dateIssued, marcRecord) where the MARC
            record is indexable by tag (e.g. marcRecord['546']).
        marcRels: relator-code mapping passed through to agent extraction.

    Returns:
        Tuple of (WorkRecord, doabID); the work carries one instance, which
        carries one 'doab' ebook Format item.
    """
    # record is an ordered triple: DOAB id, issued date, parsed MARC record
    doabID = record[0]
    dateIssued = record[1]
    marcRecord = record[2]
    logger.info('Transforming record {} into a SFR object'.format(doabID))

    work = WorkRecord()
    instance = InstanceRecord()
    item = Format(source='doab', contentType='ebook')

    # Add issued date to work record
    work.addClassItem(
        'dates', Date, **{
            'display_date': dateIssued,
            'date_range': dateIssued,
            'date_type': 'issued'
        })

    # All DOAB records have the same CreativeCommons license, assign this
    # to Instance/Item records
    # NOTE: the same Rights object is shared by instance and item
    rights = Rights(
        source='doab',
        license='https://creativecommons.org/licenses/by-nc-nd/4.0/',
        statement=
        'Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International'
    )
    instance.rights.append(rights)
    item.rights.append(rights)

    # A single DOAB identifier can be assigned to the work/instance/item records
    doabIdentifier = Identifier(type='doab', identifier=doabID, weight=1)
    work.identifiers.append(doabIdentifier)
    instance.identifiers.append(doabIdentifier)
    item.identifiers.append(doabIdentifier)

    # Code Fields (Identifiers)
    # Tuples are (MARC tag, target attribute, subfield code, identifier type);
    # both $a and $z (canceled/invalid) subfields are captured
    logger.debug('Parsing 0X0-0XX Fields')
    controlData = [('010', 'identifiers', 'a', 'lccn'),
                   ('020', 'identifiers', 'a', 'isbn'),
                   ('022', 'identifiers', 'a', 'issn'),
                   ('050', 'identifiers', 'a', 'lcc'),
                   ('082', 'identifiers', 'a', 'ddc'),
                   ('010', 'identifiers', 'z', 'lccn'),
                   ('020', 'identifiers', 'z', 'isbn'),
                   ('022', 'identifiers', 'z', 'issn'),
                   ('050', 'identifiers', 'z', 'lcc'),
                   ('082', 'identifiers', 'z', 'ddc')]
    for field in controlData:
        extractSubfieldValue(marcRecord, work, field)
        extractSubfieldValue(marcRecord, instance, field)

    # Author/Creator Fields
    logger.debug('Parsing 100, 110 & 111 Fields')
    agentData = ['100', '110', '111', '700', '710', '711']
    for agentField in agentData:
        extractAgentValue(marcRecord, work, agentField, marcRels)

    # Title Fields (main title, subtitle and alternate titles)
    logger.debug('Parsing 21X-24X Fields')
    titleData = [('210', 'alt_titles', 'a'), ('222', 'alt_titles', 'a'),
                 ('242', 'alt_titles', 'a'), ('246', 'alt_titles', 'a'),
                 ('247', 'alt_titles', 'a'), ('245', 'title', 'a'),
                 ('245', 'sub_title', 'b')]
    for field in titleData:
        extractSubfieldValue(marcRecord, work, field)
        extractSubfieldValue(marcRecord, instance, field)

    # Edition Fields (instance-level only)
    logger.debug('Parsing Edition (250 & 260) Fields')
    editionData = [('250', 'edition_statement', 'a'),
                   ('250', 'edition_statement', 'b'),
                   ('260', 'pub_place', 'a'), ('260', 'pub_date', 'c'),
                   ('260', 'agents', 'b', 'publisher'),
                   ('260', 'agents', 'f', 'manufacturer'),
                   ('264', 'copyright_date', 'c')]
    for field in editionData:
        extractSubfieldValue(marcRecord, instance, field)

    # Physical Details
    # TODO Load fields into items/measurements?
    logger.debug('Parsing Extent (300) Field')
    extentData = [('300', 'extent', 'a'), ('300', 'extent', 'b'),
                  ('300', 'extent', 'c'), ('300', 'extent', 'e'),
                  ('300', 'extent', 'f')]
    for field in extentData:
        extractSubfieldValue(marcRecord, instance, field)

    # Series Details (work-level)
    logger.debug('Parsing Series (490) Field')
    seriesData = [('490', 'series', 'a'), ('490', 'series_position', 'v')]
    for field in seriesData:
        extractSubfieldValue(marcRecord, work, field)

    # Notes/Description details
    # TODO What fields should we bring in?
    logger.debug('Parsing TOC (505) Field')
    tocData = [('505', 'table_of_contents', 'a'), ('520', 'summary', 'a')]
    for field in tocData:
        extractSubfieldValue(marcRecord, instance, field)

    # Language Fields
    # 546$a may hold several language names separated by '/' or '|';
    # unresolvable or undetermined ('und') languages are logged and skipped
    if len(marcRecord['546']) > 0:
        for lang in marcRecord['546'][0].subfield('a'):
            langs = re.split(r'/|\|', lang.value)
            for language in langs:
                logger.debug(
                    'Adding language {} to work and instance'.format(language))
                langObj = pycountry.languages.get(name=language.strip())
                if langObj is None or langObj.alpha_3 == 'und':
                    logger.warning(
                        'Unable to parse language {}'.format(language))
                    continue
                # NOTE: the same Language object is shared by work and instance
                sfrLang = Language(language=language,
                                   iso_2=langObj.alpha_2,
                                   iso_3=langObj.alpha_3)
                work.language.append(sfrLang)
                instance.language.append(sfrLang)

    # Subject Details
    logger.debug('Parsing 6XX Subject Fields')
    subjectData = ['600', '610', '648', '650', '651', '655', '656', '657']
    for subjectType in subjectData:
        extractSubjects(marcRecord, work, subjectType)

    # Eletronic Holding Details
    logger.debug('Parsing 856 (Electronic Holding) Field')
    extractHoldingsLinks(marcRecord['856'], instance, item)

    # TODO Load data for these fields
    # 76X-78X
    # 80X-83X
    instance.formats.append(item)
    work.instances.append(instance)
    return work, doabID
Example #8
0
def enhanceRecord(record):
    """Takes a single input record and retrieves data from the OCLC Classify
    service. Manages the overall workflow of the function.

    Args:
        record: dict with required keys 'uuid', 'type' and 'fields', plus an
            optional 'start' offset used for paged Classify retrieval.

    Returns:
        True on success.

    Raises:
        DataError: when required keys are missing or record is not a dict.
        OCLCError: re-raised after logging when the Classify query fails.
    """

    try:
        workUUID = record['uuid']
        searchType = record['type']
        searchFields = record['fields']
        startPos = record.get('start', 0)
    except KeyError as e:
        logger.error('Missing attribute in data block!')
        logger.debug(e)
        raise DataError('Required attribute missing from data block')
    except TypeError as e:
        # record was not a dict-like object (e.g. raw string from Kinesis)
        logger.error('Could not read data from source')
        logger.debug(e)
        raise DataError('Kinesis data contains non-dictionary value')

    logger.info('Starting to enhance work record {}'.format(workUUID))

    try:
        # Step 1: Generate a set of XML records retrieved from Classify
        # This step also adds the oclc identifiers to the sourceData record
        classifyData = classifyRecord(searchType,
                                      searchFields,
                                      workUUID,
                                      start=startPos)

        # Step 2: Parse the data recieved from Classify into the SFR data model
        classifiedWork, instanceCount, oclcNo = readFromClassify(
            classifyData, workUUID)
        logger.debug('Instances found {}'.format(instanceCount))
        # Classify pages editions 500 at a time; fetch at most 1500 editions
        # (3 pages) in this invocation, starting after the first page above
        if instanceCount > 500:
            iterStop = startPos + instanceCount
            if instanceCount > 1500:
                iterStop = startPos + 1500
            for i in range(startPos + 500, iterStop, 500):
                classifyPage = classifyRecord(searchType,
                                              searchFields,
                                              workUUID,
                                              start=i)
                extractAndAppendEditions(classifiedWork, classifyPage)

        # Anything beyond this invocation's window is re-queued with an
        # advanced start offset so a later run picks it up
        if instanceCount > startPos + 1500:
            OutputManager.putQueue(
                {
                    'type': 'identifier',
                    'uuid': workUUID,
                    'fields': {
                        'idType': 'oclc',
                        'identifier': oclcNo,
                        'start': startPos + 1500
                    }
                }, os.environ['CLASSIFY_QUEUE'])

        # This sets the primary identifier for processing by the db manager
        classifiedWork.primary_identifier = Identifier('uuid', workUUID, 1)

        # Step 3: Output this block to kinesis
        outputObject = {
            'status': 200,
            'type': 'work',
            'method': 'update',
            'data': classifiedWork
        }
        # Drain instances in chunks of 100, each emitted as its own message
        # keyed by the same uuid -- presumably to keep individual Kinesis
        # payloads small (TODO confirm the limit this targets)
        while len(classifiedWork.instances) > 100:
            instanceChunk = classifiedWork.instances[0:100]
            del classifiedWork.instances[0:100]
            OutputManager.putKinesis(
                {
                    'status': 200,
                    'type': 'work',
                    'method': 'update',
                    'data': {
                        'instances': instanceChunk,
                        'primary_identifier': Identifier('uuid', workUUID, 1)
                    }
                }, os.environ['OUTPUT_KINESIS'], workUUID)
        # Final message carries the work itself plus any remaining instances
        OutputManager.putKinesis(outputObject, os.environ['OUTPUT_KINESIS'],
                                 workUUID)

    except OCLCError as err:
        logger.error('OCLC Query for work {} failed with message: {}'.format(
            workUUID, err.message))
        raise err

    return True
def parseEdition(edition):
    """Parse an edition into a Instance record.

    Builds identifiers and holdings measurements from the Classify <edition>
    element, optionally enriches the result with a full record fetched from
    the NYPL OCLC catalog endpoint, and returns an InstanceRecord.
    """
    oclcIdentifier = edition.get('oclc')
    oclcNo = Identifier(
        'oclc',
        oclcIdentifier,
        1
    )

    identifiers = [
        oclcNo
    ]

    fullEditionRec = None
    # Only hit the catalog API if this OCLC number wasn't looked up recently
    if OutputManager.checkRecentQueries('lookup/{}/{}'.format('oclc', oclcIdentifier)) is False:
        try:
            logger.info('Querying OCLC lookup for {}'.format(oclcIdentifier))
            oclcRoot = 'https://dev-platform.nypl.org/api/v0.1/research-now/v3/utils/oclc-catalog'
            oclcQuery = '{}?identifier={}&type={}'.format(
                oclcRoot, oclcIdentifier, 'oclc'
            )
            edResp = requests.get(oclcQuery, timeout=10)
            if edResp.status_code == 200:
                logger.debug('Found matching OCLC record')
                fullEditionRec = edResp.json()
        except Exception as err:
            # Best-effort enrichment: any failure falls back to local data
            logger.debug('Error received when querying OCLC catalog')
            logger.error(err)

    # Classification entries become additional identifiers on the instance
    classifications = edition.findall('.//class', namespaces=NAMESPACE)
    classificationList = list(map(parseClassification, classifications))
    identifiers.extend(classificationList)

    holdings = Measurement(
        'holdings',
        edition.get('holdings'),
        1,
        MEASUREMENT_TIME,
        oclcIdentifier
    )

    digHoldings = Measurement(
        'digitalHoldings',
        edition.get('eholdings'),
        1,
        MEASUREMENT_TIME,
        oclcIdentifier
    )

    language = edition.get('language')
    editionTitle = edition.get('title')

    editionDict = {
        'title': editionTitle,
        'language': language,
        'identifiers': identifiers,
        'measurements': [
            holdings,
            digHoldings
        ]
    }

    # Merge: remote record wins, but keeps the local title and accumulates
    # local identifiers/measurements/languages.
    # NOTE(review): assumes the remote JSON has 'identifiers', 'measurements'
    # and 'language' keys -- a missing key would raise KeyError; confirm the
    # catalog endpoint's response schema
    if fullEditionRec is not None:
        outEdition = fullEditionRec
        outEdition['title'] = editionDict['title']
        outEdition['identifiers'].extend(editionDict['identifiers']) 
        outEdition['measurements'].extend(editionDict['measurements'])
        # De-duplicate remote + local language values (order not preserved)
        outEdition['language'] = list(set(
           [outEdition['language'], editionDict['language']]
        ))
    else:
        outEdition = editionDict
    return InstanceRecord.createFromDict(**outEdition)
Example #10
0
 def test_identifier_repr(self):
     """The repr should expose the identifier's type and value."""
     testIdentifier = Identifier(type='test', identifier='1')
     self.assertEqual(str(testIdentifier), '<Identifier(type=test, id=1)>')
Example #11
0
 def test_identifier_create(self):
     """A bare constructor call should yield an Identifier instance."""
     newIdentifier = Identifier()
     self.assertIsInstance(newIdentifier, Identifier)