Example #1
 def append_identifier(d, key, obj, type):
     ids = exists_to_none(d, key)
     if ids:
         if isinstance(ids, list):
             for id in ids:
                 obj.identifiers.append(Identifier(id, type=type))
         else:
             obj.identifiers.append(Identifier(ids, type=type))
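For orientation, a hypothetical call to the helper above. Every name below except append_identifier and Identifier is a placeholder, and exists_to_none is assumed to return the value stored under `key` (or None).

    # Hypothetical usage; `record` and `publication` are illustrative stand-ins.
    record = {'isbn': ['9781234567897', '9780987654321']}
    publication = Publication()   # placeholder class exposing an `identifiers` list
    append_identifier(record, 'isbn', publication, 'ISBN')
    # publication.identifiers now holds one Identifier per value under 'isbn'.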
Example #2
    def import_from_feed(self, feed, even_if_no_author=False, 
                         immediately_presentation_ready=False,
                         feed_url=None):

        # Keep track of editions that were imported. Pools and works
        # for those editions may be looked up or created.
        imported_editions = {}
        pools = {}
        works = {}
        # CoverageFailures that note business logic errors and non-success download statuses
        failures = {}

        # If parsing the overall feed throws an exception, we should address that before
        # moving on. Let the exception propagate.
        metadata_objs, failures = self.extract_feed_data(feed, feed_url)

        # Make editions. If there's a problem, make sure the associated pool and work aren't created.
        for key, metadata in metadata_objs.iteritems():
            # key is identifier.urn here

            # If there's a status message about this item, don't try to import it.
            if key in failures.keys():
                continue

            try:
                # Create an edition. This will also create a pool if there's circulation data.
                edition = self.import_edition_from_metadata(
                    metadata, even_if_no_author, immediately_presentation_ready
                )
                if edition:
                    imported_editions[key] = edition
            except Exception, e:
                # Rather than scratch the whole import, treat this as a failure that only applies
                # to this item.
                self.log.error("Error importing an OPDS item", exc_info=e)
                identifier, ignore = Identifier.parse_urn(self._db, key)
                data_source = self.data_source
                failure = CoverageFailure(identifier, traceback.format_exc(), data_source=data_source, transient=False)
                failures[key] = failure
                # Clean up any edition we might have created.
                if key in imported_editions:
                    del imported_editions[key]
                # Move on to the next item, don't create a work.
                continue

            try:
                pool, work = self.update_work_for_edition(
                    edition, even_if_no_author, immediately_presentation_ready
                )
                if pool:
                    pools[key] = pool
                if work:
                    works[key] = work
            except Exception, e:
                identifier, ignore = Identifier.parse_urn(self._db, key)
                data_source = self.data_source
                failure = CoverageFailure(identifier, traceback.format_exc(), data_source=data_source, transient=False)
                failures[key] = failure
Example #3
    def categories(cls, work):
        """Send out _all_ categories for the work.

        (So long as the category type has a URI associated with it in
        Subject.uri_lookup.)
        """
        _db = Session.object_session(work)
        by_scheme_and_term = dict()
        identifier_ids = work.all_identifier_ids()
        classifications = Identifier.classifications_for_identifier_ids(
            _db, identifier_ids)
        for c in classifications:
            subject = c.subject
            if subject.type in Subject.uri_lookup:
                scheme = Subject.uri_lookup[subject.type]
                term = subject.identifier
                weight_field = AtomFeed.schema_("ratingValue")
                key = (scheme, term)
                if not key in by_scheme_and_term:
                    value = dict(term=subject.identifier)
                    if subject.name:
                        value['label'] = subject.name
                    value[weight_field] = 0
                    by_scheme_and_term[key] = value
                by_scheme_and_term[key][weight_field] += c.weight

        # Collapse by_scheme_and_term to by_scheme
        by_scheme = defaultdict(list)
        for (scheme, term), value in by_scheme_and_term.items():
            by_scheme[scheme].append(value)
        by_scheme.update(super(VerboseAnnotator, cls).categories(work))
        return by_scheme
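For orientation, a sketch of the structure this method builds, with a made-up scheme URI, term, and weight; the weight key is whatever AtomFeed.schema_("ratingValue") evaluates to.

    # Illustrative data only, not output from a real work.
    rating_value = AtomFeed.schema_("ratingValue")   # the weight key used above
    example_by_scheme = {
        "http://example.org/scheme": [
            {"term": "term-1", "label": "Readable name", rating_value: 7},
        ],
    }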
Example #4
 def _retrieve_links(self, publications):
     if self.lamr is None:
         return
     pubs_by_uids = {}
     for pub in publications:
         for id in Identifier.find_by_type(pub.identifiers, 'WOK'):
             pubs_by_uids[id.value] = pub
     uids = pubs_by_uids.keys()
     result_by_uids = self.lamr.retrieve_by_ids(uids)
     for uid, result in result_by_uids.iteritems():
         pub = pubs_by_uids[uid]
         if 'timesCited' in result:
             pub.times_cited = int(result['timesCited'])
         if 'sourceURL' in result:
             pub.source_urls.append(
                 URL(result['sourceURL'],
                     type='WOK',
                     description=u'Web of Science®'))
         if 'citingArticlesURL' in result:
             pub.cite_urls.append(
                 URL(result['citingArticlesURL'],
                     type='WOK',
                     description=u'Web of Science®'))
         if 'message' in result:
             pub.errors.append(u'Failed loading article URLs: ' +
                               unicode(result['message']))
Example #5
    def test_transient_failure_if_requested_book_not_mentioned(self):
        """Test an unrealistic case where we ask Axis 360 about one book and
        it tells us about a totally different book.
        """
        # We're going to ask about abcdef
        identifier = self._identifier(identifier_type=Identifier.AXIS_360_ID)
        identifier.identifier = 'abcdef'

        # But we're going to get told about 0003642860.
        data = self.get_data("single_item.xml")
        self.api.queue_response(200, content=data)
        
        [result] = self.provider.process_batch([identifier])

        # Coverage failed for the book we asked about.
        assert isinstance(result, CoverageFailure)
        eq_(identifier, result.obj)
        eq_("Book not in collection", result.exception)
        
        # And nothing major was done about the book we were told
        # about. We created an Identifier record for its identifier,
        # but no LicensePool or Edition.
        wrong_identifier = Identifier.for_foreign_id(
            self._db, Identifier.AXIS_360_ID, "0003642860"
        )
        eq_([], identifier.licensed_through)
        eq_([], identifier.primarily_identifies)
Example #6
 def process_urn(self, urn, **kwargs):
     """Turn a URN into a Work suitable for use in an OPDS feed.
     """
     try:
         identifier, is_new = Identifier.parse_urn(self._db, urn)
     except ValueError, e:
         identifier = None
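A short sketch of the Identifier.parse_urn contract relied on above; the URN value is a placeholder.

    # parse_urn returns (identifier, is_new) for a URN it recognizes and
    # raises ValueError for one it cannot interpret (handled above by
    # falling back to identifier = None).
    identifier, is_new = Identifier.parse_urn(self._db, "urn:isbn:9781234567897")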
Example #7
 def extract_identifier(cls, identifier_tag):
     """Turn a <dcterms:identifier> tag into an IdentifierData object."""
     try:
         type, identifier = Identifier.type_and_identifier_for_urn(identifier_tag.text.lower())
         return IdentifierData(type, identifier)
     except ValueError:
         return None
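A hypothetical call to the classmethod above. The enclosing class is not shown in the snippet, so SomeImporter is a placeholder name, and the URN is made up; the element is built with the standard library purely for illustration.

    import xml.etree.ElementTree as ET

    tag = ET.Element("{http://purl.org/dc/terms/}identifier")
    tag.text = "urn:isbn:9781234567897"   # placeholder URN
    identifier_data = SomeImporter.extract_identifier(tag)
    # -> IdentifierData(type, identifier), or None if the URN can't be parsed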
Example #8
    def detail_for_elementtree_entry(
            cls, parser, entry_tag, data_source, feed_url=None,
    ):

        """Turn an <atom:entry> tag into a dictionary of metadata that can be
        used as keyword arguments to the Metadata contructor.

        :return: A 2-tuple (identifier, kwargs)
        """

        identifier = parser._xpath1(entry_tag, 'atom:id')
        if identifier is None or not identifier.text:
            # This <entry> tag doesn't identify a book so we 
            # can't derive any information from it.
            return None, None, None
        identifier = identifier.text
            
        try:
            data = cls._detail_for_elementtree_entry(
                parser, entry_tag, feed_url
            )
            return identifier, data, None

        except Exception, e:
            _db = Session.object_session(data_source)
            identifier_obj, ignore = Identifier.parse_urn(_db, identifier)
            failure = CoverageFailure(
                identifier_obj, traceback.format_exc(), data_source,
                transient=True
            )
            return identifier, None, failure
Example #9
    def parse_identifier_list(
            cls, _db, identifier_type, arguments, autocreate=False
    ):
        """Turn a list of identifiers into a list of Identifier objects.

        The list of arguments is probably derived from a command-line
        parser such as the one defined in
        IdentifierInputScript.arg_parser().

        This makes it easy to identify specific identifiers on the
        command line. Examples:

        1 2
        
        a b c
        """
        current_identifier_type = None
        if len(arguments) == 0:
            return []
        if not identifier_type:
            raise ValueError("No identifier type specified!")
        identifiers = []
        for arg in arguments:
            identifier, ignore = Identifier.for_foreign_id(
                _db, identifier_type, arg, autocreate=autocreate
            )
            if not identifier:
                logging.warn(
                    "Could not load identifier %s/%s", identifier_type, arg
                )
            if identifier:
                identifiers.append(identifier)
        return identifiers
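A hypothetical invocation of the classmethod above, assuming `_db` is an open database session; the docstring suggests the class is IdentifierInputScript, but that (and the ISBN values) are assumptions.

    args = ["9781234567897", "9780987654321"]
    identifiers = IdentifierInputScript.parse_identifier_list(
        _db, Identifier.ISBN, args, autocreate=True
    )
    # Arguments that can't be resolved are logged and skipped, so the
    # result may be shorter than the argument list.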
Example #10
    def process_batch(self, identifiers):
        identifier_strings = self.api.create_identifier_strings(identifiers)
        response = self.api.availability(title_ids=identifier_strings)
        seen_identifiers = set()
        batch_results = []
        for metadata, availability in self.parser.process_all(
                response.content):
            identifier, is_new = metadata.primary_identifier.load(self._db)
            if not identifier in identifiers:
                # Theta told us about a book we didn't ask
                # for. This shouldn't happen, but if it does we should
                # do nothing further.
                continue
            seen_identifiers.add(identifier.identifier)
            result = self.set_metadata(identifier, metadata)
            if not isinstance(result, CoverageFailure):
                result = self.handle_success(identifier)
            batch_results.append(result)

        # Create a CoverageFailure object for each original identifier
        # not mentioned in the results.
        for identifier_string in identifier_strings:
            if identifier_string not in seen_identifiers:
                identifier, ignore = Identifier.for_foreign_id(
                    self._db, Identifier.THETA_ID, identifier_string)
                result = CoverageFailure(identifier,
                                         "Book not in collection",
                                         data_source=self.output_source,
                                         transient=False)
                batch_results.append(result)
        return batch_results
Example #11
    def explain_identifier(cls, identifier, primary, seen, strength, level):
        indent = "  " * level
        if primary:
            ident = "Primary identifier"
        else:
            ident = "Identifier"
        if primary:
            strength = 1
        output = "%s %s: %s/%s (q=%s)" % (indent, ident, identifier.type, identifier.identifier, strength)
        print output.encode("utf8")

        _db = Session.object_session(identifier)
        classifications = Identifier.classifications_for_identifier_ids(
            _db, [identifier.id])
        for classification in classifications:
            subject = classification.subject
            genre = subject.genre
            if genre:
                genre = genre.name
            else:
                genre = "(!genre)"
            #print "%s  %s says: %s/%s %s w=%s" % (
            #    indent, classification.data_source.name,
            #    subject.identifier, subject.name, genre, classification.weight
            #)
        seen.add(identifier)
        for equivalency in identifier.equivalencies:
            if equivalency.id in seen:
                continue
            seen.add(equivalency.id)
            output = equivalency.output
            cls.explain_identifier(output, False, seen,
                                    equivalency.strength, level+1)
Example #12
    def items_that_need_coverage(self, identifiers=None, **kwargs):
        """Find all items lacking coverage from this CoverageProvider.

        Items should be Identifiers, though Editions should also work.

        By default, all identifiers of the `input_identifier_types` which
        don't already have coverage are chosen.

        :param identifiers: The batch of Identifier objects to test for
            coverage. `identifiers` and `self.input_identifiers` can
            intersect: if this provider was created to run specific
            Identifiers, and you want to batch within that set, you can
            use both parameters.
        """
        qu = Identifier.missing_coverage_from(
            self._db,
            self.input_identifier_types,
            self.output_source,
            count_as_missing_before=self.cutoff_time,
            operation=self.operation,
            identifiers=self.input_identifiers,
            **kwargs)

        if identifiers:
            qu = qu.filter(Identifier.id.in_([x.id for x in identifiers]))

        return qu
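A minimal sketch of how the two filters described in the docstring can combine; SomeCoverageProvider and the setup names are placeholders.

    # Hypothetical: the provider was created for a fixed set of
    # Identifiers, and we ask about a smaller batch drawn from that set.
    provider = SomeCoverageProvider(_db, input_identifiers=all_identifiers)
    batch = all_identifiers[:10]
    qu = provider.items_that_need_coverage(identifiers=batch)
    # qu yields only members of `batch` that also lack coverage records.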
Example #13
 def _identifier(self,
                 identifier_type=Identifier.GUTENBERG_ID,
                 foreign_id=None):
     if foreign_id:
         id = foreign_id
     else:
         id = self._str
     return Identifier.for_foreign_id(self._db, identifier_type, id)[0]
Example #14
 def search_citations(self, publications):
     for publication in publications:
         ut = list(Identifier.find_by_type(publication.identifiers, 'WOK'))
         if len(ut) == 0:
             continue
         ut = ut[0].value.lstrip(u'WOS:')
         for cite_url in URL.find_by_type(publication.cite_urls, 'WOK'):
             for pub in self._get_citations_from_url(cite_url.value, ut):
                 yield pub
Example #15
    def process_urns(self, urns, **process_urn_kwargs):
        """Processes a list of URNs for a lookup request.

        :return: None or, to override default feed behavior, a ProblemDetail
        or Response
        """
        identifiers_by_urn, failures = Identifier.parse_urns(self._db, urns)
        self.add_urn_failure_messages(failures)

        for urn, identifier in identifiers_by_urn.items():
            self.process_identifier(identifier, urn, **process_urn_kwargs)
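A sketch of the Identifier.parse_urns contract relied on above; the URN strings are placeholders.

    # parse_urns splits its input into URNs it could resolve and URNs it
    # could not; only the former reach process_identifier().
    urns = ["urn:isbn:9781234567897", "not-a-valid-urn"]
    identifiers_by_urn, failures = Identifier.parse_urns(self._db, urns)
    # identifiers_by_urn: {urn: Identifier} for the parseable URNs
    # failures: reported to the client via add_urn_failure_messages()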
Example #16
    def process_urns(self, urns, **process_urn_kwargs):
        """Processes a list of URNs for a lookup request.

        :return: None or, to override default feed behavior, a ProblemDetail
        or Response
        """
        identifiers_by_urn, failures = Identifier.parse_urns(self._db, urns)
        self.add_urn_failure_messages(failures)

        for urn, identifier in identifiers_by_urn.items():
            self.process_identifier(identifier, urn, **process_urn_kwargs)
Example #17
    def test_import_one_feed(self):
        # Check that coverage records are created.

        monitor = OPDSImportMonitor(self._db, "http://url",
                                    DataSource.OA_CONTENT_SERVER,
                                    DoomedOPDSImporter)
        data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

        feed = self.content_server_mini_feed

        monitor.import_one_feed(feed, "http://root-url/")

        editions = self._db.query(Edition).all()

        # One edition has been imported
        eq_(1, len(editions))
        [edition] = editions

        # That edition has a CoverageRecord.
        record = CoverageRecord.lookup(
            editions[0].primary_identifier,
            data_source,
            operation=CoverageRecord.IMPORT_OPERATION)
        eq_(CoverageRecord.SUCCESS, record.status)
        eq_(None, record.exception)

        # The edition's primary identifier has a cover link whose
        # relative URL has been resolved relative to the URL we passed
        # into import_one_feed.
        [cover] = [
            x.resource.url for x in editions[0].primary_identifier.links
            if x.rel == Hyperlink.IMAGE
        ]
        eq_("http://root-url/full-cover-image.png", cover)

        # The 202 status message in the feed caused a transient failure.
        # The exception caused a persistent failure.

        coverage_records = self._db.query(CoverageRecord).filter(
            CoverageRecord.operation == CoverageRecord.IMPORT_OPERATION,
            CoverageRecord.status != CoverageRecord.SUCCESS)
        eq_(
            sorted([
                CoverageRecord.TRANSIENT_FAILURE,
                CoverageRecord.PERSISTENT_FAILURE
            ]), sorted([x.status for x in coverage_records]))

        identifier, ignore = Identifier.parse_urn(
            self._db,
            "urn:librarysimplified.org/terms/id/Gutenberg%20ID/10441")
        failure = CoverageRecord.lookup(
            identifier, data_source, operation=CoverageRecord.IMPORT_OPERATION)
        assert "Utter failure!" in failure.exception
Example #18
    def search_citations(self, publications):
        """Vrati iterator vracajuci zoznam publikacii, ktore cituju publikacie
           v zozname publications
        """
        for publication in publications:
            eid = list(Identifier.find_by_type(publication.identifiers,
                                               'SCOPUS'))
            if len(eid) == 0:
                continue
            eid = eid[0].value

            for pub in self.search_citations_by_eid(eid):
                yield pub
Example #19
    def coveragefailure_from_message(cls, data_source, message):
        """Turn a <simplified:message> tag into a CoverageFailure."""

        _db = Session.object_session(data_source)
        
        # First thing to do is determine which Identifier we're
        # talking about. If we can't do that, we can't create a
        # CoverageFailure object.
        urn = message.urn
        try:
            identifier, ignore = Identifier.parse_urn(_db, urn)
        except ValueError, e:
            identifier = None
Example #20
 def search_citations(self, publications):
     for publication in publications:
         eid = list(
             Identifier.find_by_type(publication.identifiers, 'SCOPUS'))
         if len(eid) == 0:
             continue
         eid = eid[0].value
         detail_url = list(
             URL.find_by_type(publication.source_urls, 'SCOPUS'))
         if len(detail_url) == 0:
             continue
         detail_url = detail_url[0].value
         for pub in self._get_citations_from_detail_url(detail_url, eid):
             yield pub
Example #21
    def items_that_need_coverage(self, identifiers=None, **kwargs):
        """Find all items lacking coverage from this CoverageProvider.

        Items should be Identifiers, though Editions should also work.

        By default, all identifiers of the `input_identifier_types` which
        don't already have coverage are chosen.
        """
        qu = Identifier.missing_coverage_from(
            self._db,
            self.input_identifier_types,
            self.output_source,
            count_as_missing_before=self.cutoff_time,
            operation=self.operation,
            **kwargs)
        if identifiers:
            qu = qu.filter(Identifier.id.in_([x.id for x in identifiers]))
        return qu
Example #22
    def assign_indexes(self, publications):
        pub_by_id = {}

        for pub in publications:
            e = list(Index.find_by_type(pub.indexes, 'WOS'))
            if len(e) > 0:
                continue

            ut = list(Identifier.find_by_type(pub.identifiers, 'WOK'))
            if len(ut) == 0:
                continue
            ut = ut[0].value

            pub_by_id[ut] = pub

        editions = self._find_editions(pub_by_id.keys())
        for ut, edition in editions.iteritems():
            pub_by_id[ut].indexes.append(Index(edition, type='WOS'))
Example #23
    def data_detail_for_feedparser_entry(cls, entry, data_source):
        """Turn an entry dictionary created by feedparser into dictionaries of data
        that can be used as keyword arguments to the Metadata and CirculationData constructors.

        :return: A 3-tuple (identifier, kwargs for Metadata constructor, failure)
        """
        identifier = entry.get('id')
        if not identifier:
            return None, None, None

        # At this point we can assume that we successfully got some
        # metadata, and possibly a link to the actual book.
        try:
            kwargs_meta = cls._data_detail_for_feedparser_entry(entry, data_source)
            return identifier, kwargs_meta, None
        except Exception, e:
            _db = Session.object_session(data_source)
            identifier_obj, ignore = Identifier.parse_urn(_db, identifier)
            failure = CoverageFailure(
                identifier_obj, traceback.format_exc(), data_source,
                transient=True
            )
            return identifier, None, failure
Example #24
import sys

from model import (
    DataSource,
    LicensePool,
    SessionManager,
    Work,
    Identifier,
)
from model import production_session

if __name__ == '__main__':
    session = production_session()

    data_source_name = sys.argv[1]
    identifier = sys.argv[2]
    data_source = DataSource.lookup(session, data_source_name)
    wid, ignore = Identifier.for_foreign_id(
        session, data_source.primary_identifier_type, identifier, False)
    pool = session.query(LicensePool).filter(
        LicensePool.data_source == data_source).filter(
            LicensePool.identifier == wid).one()
    primary_edition = pool.edition()
    old_work = primary_edition.work
    if old_work:
        old_work.license_pools.remove(pool)
    primary_edition.work = None
    pool.calculate_work()
    work = pool.work
    work.calculate_presentation()
    session.commit()
Example #25
from nose.tools import set_trace
import os
import site
import sys
import datetime
d = os.path.split(__file__)[0]
site.addsitedir(os.path.join(d, ".."))
from integration.threem import (
    ThreeMAPI, )
from integration.overdrive import (
    OverdriveAPI, )

from model import (
    production_session,
    DataSource,
    Edition,
    Identifier,
)

if __name__ == '__main__':
    type, identifier_name = sys.argv[1:3]
    db = production_session()
    identifier, is_new = Identifier.for_foreign_id(db, type, identifier_name)
    if identifier.type == Identifier.THREEM_ID:
        source = DataSource.lookup(db, DataSource.THREEM)
        api = ThreeMAPI(db)
        edition, ignore = Edition.for_foreign_id(db, source, type,
                                                 identifier_name)
        data = api.get_bibliographic_info_for([edition])
Example #26
import os
import site
import sys
import datetime
d = os.path.split(__file__)[0]
site.addsitedir(os.path.join(d, ".."))
from integration.threem import (
    ThreeMAPI,
)
from integration.overdrive import (
    OverdriveAPI,
)

from model import (
    production_session,
    DataSource,
    Edition,
    Identifier,
)

if __name__ == '__main__':
    type, identifier_name = sys.argv[1:3]
    db = production_session()
    identifier, is_new = Identifier.for_foreign_id(db, type, identifier_name)
    if identifier.type==Identifier.THREEM_ID:
        source = DataSource.lookup(db, DataSource.THREEM)
        api = ThreeMAPI(db)
        edition, ignore = Edition.for_foreign_id(
            db, source, type, identifier_name)
        data = api.get_bibliographic_info_for([edition])

Example #27
import os
import site
import sys
from nose.tools import set_trace

d = os.path.split(__file__)[0]
site.addsitedir(os.path.join(d, ".."))

from model import DataSource, LicensePool, SessionManager, Work, Identifier
from model import production_session

if __name__ == "__main__":
    session = production_session()

    data_source_name = sys.argv[1]
    identifier = sys.argv[2]
    data_source = DataSource.lookup(session, data_source_name)
    wid, ignore = Identifier.for_foreign_id(session, data_source.primary_identifier_type, identifier, False)
    pool = (
        session.query(LicensePool)
        .filter(LicensePool.data_source == data_source)
        .filter(LicensePool.identifier == wid)
        .one()
    )
    primary_edition = pool.edition()
    old_work = primary_edition.work
    if old_work:
        old_work.license_pools.remove(pool)
    primary_edition.work = None
    pool.calculate_work()
    work = pool.work
    work.calculate_presentation()
    session.commit()
Example #28
 def _identifier(self, identifier_type=Identifier.GUTENBERG_ID):
     id = self._str
     return Identifier.for_foreign_id(self._db, identifier_type, id)[0]
Example #29
    def test_recursively_equivalent_identifiers(self):

        # We start with a Gutenberg book.
        gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
        record, ignore = Edition.for_foreign_id(self._db, gutenberg,
                                                Identifier.GUTENBERG_ID, "100")
        gutenberg_id = record.primary_identifier

        # We use OCLC Classify to do a title/author lookup.
        oclc = DataSource.lookup(self._db, DataSource.OCLC)
        search_id, ignore = Identifier.for_foreign_id(self._db,
                                                      Identifier.OCLC_WORK,
                                                      "60010")
        gutenberg_id.equivalent_to(oclc, search_id, 1)

        # The title/author lookup associates the search term with two
        # different OCLC Numbers.
        oclc_id, ignore = Identifier.for_foreign_id(self._db,
                                                    Identifier.OCLC_NUMBER,
                                                    "9999")
        oclc_id_2, ignore = Identifier.for_foreign_id(self._db,
                                                      Identifier.OCLC_NUMBER,
                                                      "1000")

        search_id.equivalent_to(oclc, oclc_id, 1)
        search_id.equivalent_to(oclc, oclc_id_2, 1)

        # We then use OCLC Linked Data to connect one of the OCLC
        # Numbers with an ISBN.
        linked_data = DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)
        isbn_id, ignore = Identifier.for_foreign_id(self._db, Identifier.ISBN,
                                                    "900100434X")
        oclc_id.equivalent_to(linked_data, isbn_id, 1)

        # As it turns out, we have an Overdrive work record...
        overdrive = DataSource.lookup(self._db, DataSource.OVERDRIVE)
        overdrive_record, ignore = Edition.for_foreign_id(
            self._db, overdrive, Identifier.OVERDRIVE_ID, "{111-222}")
        overdrive_id = overdrive_record.primary_identifier

        # ...which is tied (by Overdrive) to the same ISBN.
        overdrive_id.equivalent_to(overdrive, isbn_id, 1)

        # Finally, here's a completely unrelated Edition, which
        # will not be showing up.
        gutenberg2, ignore = Edition.for_foreign_id(self._db, gutenberg,
                                                    Identifier.GUTENBERG_ID,
                                                    "200")
        gutenberg2.title = "Unrelated Gutenberg record."

        levels = [record.equivalent_identifiers(i) for i in range(0, 5)]

        # At level 0, the only identifier found is the Gutenberg ID.
        eq_(set([gutenberg_id]), set(levels[0]))

        # At level 1, we pick up the title/author lookup.
        eq_(set([gutenberg_id, search_id]), set(levels[1]))

        # At level 2, we pick up the title/author lookup and the two
        # OCLC Numbers.
        eq_(set([gutenberg_id, search_id, oclc_id, oclc_id_2]), set(levels[2]))

        # At level 3, we also pick up the ISBN.
        eq_(set([gutenberg_id, search_id, oclc_id, oclc_id_2, isbn_id]),
            set(levels[3]))

        # At level 4, the recursion starts to go in the other
        # direction: we pick up the Overdrive ID that's equivalent to
        # the same ISBN as the OCLC Number.
        eq_(
            set([
                gutenberg_id, search_id, oclc_id, oclc_id_2, isbn_id,
                overdrive_id
            ]), set(levels[4]))
Example #30
 def do_run(self, _db):
     identifier = Identifier(type='Keep It', identifier='100')
     _db.add(identifier)
Example #31
 def _identifier(self, identifier_type=Identifier.GUTENBERG_ID, foreign_id=None):
     if foreign_id:
         id = foreign_id
     else:
         id = self._str
     return Identifier.for_foreign_id(self._db, identifier_type, id)[0]
Example #32
 def do_run(self, _db):
     identifier = Identifier(type='You Can', identifier='Keep It')
     _db.add(identifier)
     raise RuntimeError
Example #33
    def extract_feed_data(self, feed, feed_url=None):
        """Turn an OPDS feed into lists of Metadata and CirculationData objects, 
        with associated messages and next_links.
        """
        # This is one of these cases where we want to create a
        # DataSource if it doesn't already exist. This way you don't have
        # to predefine a DataSource for every source of OPDS feeds.
        data_source = self.data_source
        fp_metadata, fp_failures = self.extract_data_from_feedparser(feed=feed, data_source=data_source)
        # gets: medium, measurements, links, contributors, etc.
        xml_data_meta, xml_failures = self.extract_metadata_from_elementtree(
            feed, data_source=data_source, feed_url=feed_url
        )

        # translate the id in failures to identifier.urn
        identified_failures = {}
        for id, failure in fp_failures.items() + xml_failures.items():
            external_identifier, ignore = Identifier.parse_urn(self._db, id)
            if self.identifier_mapping:
                internal_identifier = self.identifier_mapping.get(
                    external_identifier, external_identifier)
            else:
                internal_identifier = external_identifier
            identified_failures[internal_identifier.urn] = failure

        # Use one loop for both, since the id will be the same for both dictionaries.
        metadata = {}
        circulationdata = {}
        for id, m_data_dict in fp_metadata.items():
            external_identifier, ignore = Identifier.parse_urn(self._db, id)
            if self.identifier_mapping:
                internal_identifier = self.identifier_mapping.get(
                    external_identifier, external_identifier)
            else:
                internal_identifier = external_identifier

            # Don't process this item if there was already an error
            if internal_identifier.urn in identified_failures.keys():
                continue

            identifier_obj = IdentifierData(
                type=internal_identifier.type,
                identifier=internal_identifier.identifier
            )

            # form the Metadata object
            xml_data_dict = xml_data_meta.get(id, {})
            combined_meta = self.combine(m_data_dict, xml_data_dict)
            if combined_meta.get('data_source') is None:
                combined_meta['data_source'] = self.data_source_name
            
            combined_meta['primary_identifier'] = identifier_obj

            metadata[internal_identifier.urn] = Metadata(**combined_meta)

            # form the CirculationData that would correspond to this Metadata
            c_circulation_dict = m_data_dict.get('circulation')
            xml_circulation_dict = xml_data_dict.get('circulation', {})
            c_data_dict = self.combine(c_circulation_dict, xml_circulation_dict)
            if c_data_dict:
                circ_links_dict = {}
                # extract just the links to pass to CirculationData constructor
                if 'links' in xml_data_dict:
                    circ_links_dict['links'] = xml_data_dict['links']
                combined_circ = self.combine(c_data_dict, circ_links_dict)
                if combined_circ.get('data_source') is None:
                    combined_circ['data_source'] = self.data_source_name
            
                combined_circ['primary_identifier'] = identifier_obj
                circulation = CirculationData(**combined_circ)
                if circulation.formats:
                    metadata[internal_identifier.urn].circulation = circulation
                else:
                    # If the CirculationData has no formats, it
                    # doesn't really offer any way to actually get the
                    # book, and we don't want to create a
                    # LicensePool. All the circulation data is
                    # useless.
                    #
                    # TODO: This will need to be revisited when we add
                    # ODL support.
                    metadata[internal_identifier.urn].circulation = None
        return metadata, identified_failures
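For orientation, a sketch of the shape of the two dictionaries returned above; both are keyed by the internal identifier's URN, and the values shown are placeholders.

    # Hypothetical call and return shape.
    metadata, failures = importer.extract_feed_data(feed, feed_url=feed_url)
    # metadata: {"urn:isbn:9781234567897": Metadata(...), ...}
    # failures: {"urn:isbn:9780987654321": CoverageFailure(...), ...}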
Example #34
    def check_for_new_data(self, feed):
        """Check if the feed contains any entries that haven't been imported
        yet. If force_import is set, every entry in the feed is
        treated as new.
        """

        # If force_reimport is set, we don't even need to check. Always
        # treat the feed as though it contained new data.
        if self.force_reimport:
            return True

        last_update_dates = self.importer.extract_last_update_dates(feed)

        new_data = False
        for identifier, remote_updated in last_update_dates:

            identifier, ignore = Identifier.parse_urn(self._db, identifier)
            data_source = self.importer.data_source
            record = None

            if identifier:
                record = CoverageRecord.lookup(
                    identifier, data_source, operation=CoverageRecord.IMPORT_OPERATION
                )

            # If there was a transient failure last time we tried to
            # import this book, try again regardless of whether the
            # feed has changed.
            if record and record.status == CoverageRecord.TRANSIENT_FAILURE:
                new_data = True
                self.log.info(
                    "Counting %s as new because previous attempt resulted in transient failure: %s", 
                    record.identifier, record.exception
                )
                break

            # If our last attempt was a success or a persistent
            # failure, we only want to import again if something
            # changed since then.

            if record and record.timestamp:
                # We've imported this entry before, so don't import it
                # again unless it's changed.

                if not remote_updated:
                    # The remote isn't telling us whether the entry
                    # has been updated. Import it again to be safe.
                    new_data = True
                    self.log.info(
                        "Counting %s as new because remote has no information about when it was updated.", 
                        record.identifier
                    )
                    break

                if remote_updated >= record.timestamp:
                    # This book has been updated.
                    self.log.info(
                        "Counting %s as new because its coverage date is %s and remote has %s.", 
                        record.identifier, record.timestamp, remote_updated
                    )

                    new_data = True
                    break

            else:
                # There's no record of an attempt to import this book.
                self.log.info(
                    "Counting %s as new because it has no CoverageRecord.", 
                    identifier
                )
                new_data = True
                break
        return new_data
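A minimal sketch of how a monitor might use this check before importing, based on the import_one_feed call shown in Example #17; treat the surrounding workflow as an assumption rather than the actual monitor code.

    # Hypothetical driver: only re-import a feed that contains something
    # new (or whose previous import ended in a transient failure).
    if monitor.check_for_new_data(feed):
        monitor.import_one_feed(feed, feed_url)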