def setup(self):
    super(TestCoverageProvider, self).setup()
    gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
    self.input_identifier_types = gutenberg.primary_identifier_type
    self.output_source = DataSource.lookup(self._db, DataSource.OCLC)
    self.edition = self._edition(gutenberg.name)
    self.identifier = self.edition.primary_identifier
def setup(self):
    super(TestPresentationReadyMonitor, self).setup()
    self.gutenberg = Identifier.GUTENBERG_ID
    self.oclc = DataSource.lookup(self._db, DataSource.OCLC)
    self.overdrive = DataSource.lookup(self._db, DataSource.OVERDRIVE)
    self.edition, self.edition_license_pool = self._edition(
        DataSource.GUTENBERG, with_license_pool=True)
    self.work = self._work(DataSource.GUTENBERG, with_license_pool=True)
    # Don't fake that the work is presentation ready, as we usually do,
    # because presentation readiness is what we're trying to test.
    self.work.presentation_ready = False
def test_cover_image_root(self):
    with self.temp_config():
        gutenberg_illustrated = DataSource.lookup(
            self._db, DataSource.GUTENBERG_COVER_GENERATOR)
        overdrive = DataSource.lookup(
            self._db, DataSource.OVERDRIVE)

        eq_("http://s3.amazonaws.com/test-book-covers-s3-bucket/Gutenberg%20Illustrated/",
            S3Uploader.cover_image_root(gutenberg_illustrated))
        eq_("http://s3.amazonaws.com/test-book-covers-s3-bucket/Overdrive/",
            S3Uploader.cover_image_root(overdrive))
        eq_("http://s3.amazonaws.com/test-book-covers-s3-bucket/scaled/300/Overdrive/",
            S3Uploader.cover_image_root(overdrive, 300))
def test_book_url(self):
    identifier = self._identifier(foreign_id="ABOOK")
    buckets = {S3Uploader.OA_CONTENT_BUCKET_KEY: 'thebooks'}
    uploader = self._uploader(**buckets)
    m = uploader.book_url

    eq_(u'https://s3.amazonaws.com/thebooks/Gutenberg+ID/ABOOK.epub', m(identifier))

    # The default extension is .epub, but a custom extension can
    # be specified.
    eq_(u'https://s3.amazonaws.com/thebooks/Gutenberg+ID/ABOOK.pdf',
        m(identifier, extension='pdf'))
    eq_(u'https://s3.amazonaws.com/thebooks/Gutenberg+ID/ABOOK.pdf',
        m(identifier, extension='.pdf'))

    # If a data source is provided, the book is stored underneath the
    # data source.
    unglueit = DataSource.lookup(self._db, DataSource.UNGLUE_IT)
    eq_(u'https://s3.amazonaws.com/thebooks/unglue.it/Gutenberg+ID/ABOOK.epub',
        m(identifier, data_source=unglueit))

    # If a title is provided, the book's filename incorporates the
    # title, for the benefit of people who download the book onto
    # their hard drive.
    eq_(u'https://s3.amazonaws.com/thebooks/Gutenberg+ID/ABOOK/On+Books.epub',
        m(identifier, title="On Books"))

    # Non-open-access content can't be stored.
    assert_raises(NotImplementedError, m, identifier, open_access=False)
def test_cover_image_root(self):
    bucket = u'test-book-covers-s3-bucket'
    m = S3Uploader.cover_image_root

    gutenberg_illustrated = DataSource.lookup(
        self._db, DataSource.GUTENBERG_COVER_GENERATOR)
    overdrive = DataSource.lookup(self._db, DataSource.OVERDRIVE)

    eq_("https://s3.amazonaws.com/test-book-covers-s3-bucket/Gutenberg+Illustrated/",
        m(bucket, gutenberg_illustrated))
    eq_("https://s3.amazonaws.com/test-book-covers-s3-bucket/Overdrive/",
        m(bucket, overdrive))
    eq_("https://s3.amazonaws.com/test-book-covers-s3-bucket/scaled/300/Overdrive/",
        m(bucket, overdrive, 300))
def _customlist(self, foreign_identifier=None, name=None,
                data_source_name=DataSource.NYT, num_entries=1,
                entries_exist_as_works=True):
    data_source = DataSource.lookup(self._db, data_source_name)
    foreign_identifier = foreign_identifier or self._str
    now = datetime.utcnow()
    customlist, ignore = get_one_or_create(
        self._db, CustomList,
        create_method_kwargs=dict(
            created=now,
            updated=now,
            name=name or self._str,
            description=self._str,
        ),
        data_source=data_source,
        foreign_identifier=foreign_identifier
    )

    editions = []
    for i in range(num_entries):
        if entries_exist_as_works:
            work = self._work(with_open_access_download=True)
            edition = work.presentation_edition
        else:
            edition = self._edition(data_source_name, title="Item %s" % i)
            edition.permanent_work_id = "Permanent work ID %s" % self._str
        customlist.add_entry(edition, "Annotation %s" % i, first_appearance=now)
        editions.append(edition)
    return customlist, editions
def test_extract_data_from_feedparser_handles_exception(self):
    class DoomedFeedparserOPDSImporter(OPDSImporter):
        """An importer that can't extract metadata from feedparser."""
        @classmethod
        def _data_detail_for_feedparser_entry(cls, entry, data_source):
            raise Exception("Utter failure!")

    data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

    values, failures = DoomedFeedparserOPDSImporter.extract_data_from_feedparser(
        self.content_server_mini_feed, data_source)

    # No metadata was extracted.
    eq_(0, len(values.keys()))

    # There are 2 failures, both from exceptions. The 202 message
    # found in content_server_mini.opds is not extracted
    # here--it's extracted by extract_metadata_from_elementtree.
    eq_(2, len(failures))

    # The first error message became a CoverageFailure.
    failure = failures['urn:librarysimplified.org/terms/id/Gutenberg%20ID/10441']
    assert isinstance(failure, CoverageFailure)
    eq_(True, failure.transient)
    assert "Utter failure!" in failure.exception

    # The second error message became a CoverageFailure.
    failure = failures['urn:librarysimplified.org/terms/id/Gutenberg%20ID/10557']
    assert isinstance(failure, CoverageFailure)
    eq_(True, failure.transient)
    assert "Utter failure!" in failure.exception
def test_mirror_404_error(self):
    mirror = DummyS3Uploader()
    h = DummyHTTPClient()
    h.queue_response(404)
    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)

    link = LinkData(
        rel=Hyperlink.IMAGE,
        media_type=Representation.JPEG_MEDIA_TYPE,
        href="http://example.com/",
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel, href=link.href, data_source=data_source,
        license_pool=pool, media_type=link.media_type,
        content=link.content,
    )

    m = Metadata(data_source=data_source)

    m.mirror_link(edition, data_source, link, link_obj, policy)

    # Since we got a 404 error, the cover image was not mirrored.
    eq_(404, link_obj.resource.representation.status_code)
    eq_(None, link_obj.resource.representation.mirror_url)
    eq_([], mirror.uploaded)
def to_customlist(self, _db, dictreader):
    """Turn the CSV file in `dictreader` into a CustomList.

    TODO: Keep track of the list's current members. If any item
    was on the list but is no longer on the list, set its
    last_appeared date to its most recent appearance.
    """
    data_source = DataSource.lookup(_db, self.data_source_name)
    now = datetime.datetime.utcnow()

    # Find or create the CustomList object itself.
    custom_list, was_new = get_one_or_create(
        _db,
        CustomList,
        data_source=data_source,
        foreign_identifier=self.foreign_identifier,
        create_method_kwargs=dict(
            created=now,
        )
    )
    custom_list.updated = now

    # Turn the rows of the CSV file into a sequence of Metadata
    # objects, then turn each Metadata into a CustomListEntry object.
    for metadata in self.to_metadata(dictreader):
        entry = self.metadata_to_list_entry(
            custom_list, data_source, now, metadata)
def test_staff_picks_and_best_sellers_sublane(self):
    staff_picks, ignore = self._customlist(
        foreign_identifier=u"Staff Picks", name=u"Staff Picks!",
        data_source_name=DataSource.LIBRARY_STAFF,
        num_entries=0)
    best_sellers, ignore = self._customlist(
        foreign_identifier=u"NYT Best Sellers", name=u"Best Sellers!",
        data_source_name=DataSource.NYT,
        num_entries=0)

    lane = Lane(self._db, "Everything",
                include_staff_picks=True, include_best_sellers=True)

    # A staff picks sublane and a best-sellers sublane have been
    # created for us.
    best, picks = lane.sublanes.lanes
    eq_("Best Sellers", best.display_name)
    eq_("Everything - Best Sellers", best.name)
    nyt = DataSource.lookup(self._db, DataSource.NYT)
    eq_(nyt.id, best.list_data_source_id)

    eq_("Staff Picks", picks.display_name)
    eq_("Everything - Staff Picks", picks.name)
    eq_([staff_picks.id], picks.list_ids)
def _edition(self, data_source_name=DataSource.GUTENBERG, identifier_type=Identifier.GUTENBERG_ID, with_license_pool=False, with_open_access_download=False, title=None, language="eng", authors=None, identifier_id=None): id = identifier_id or self._str source = DataSource.lookup(self._db, data_source_name) wr = Edition.for_foreign_id( self._db, source, identifier_type, id)[0] if not title: title = self._str wr.title = unicode(title) if language: wr.language = language if authors is None: authors = self._str if isinstance(authors, basestring): authors = [authors] if authors != []: wr.add_contributor(unicode(authors[0]), Contributor.PRIMARY_AUTHOR_ROLE) wr.author = unicode(authors[0]) for author in authors[1:]: wr.add_contributor(unicode(author), Contributor.AUTHOR_ROLE) if with_license_pool or with_open_access_download: pool = self._licensepool(wr, data_source_name=data_source_name, with_open_access_download=with_open_access_download) pool.set_presentation_edition() return wr, pool return wr
def __init__(self, _db, api, datasource, batch_size=10,
             metadata_replacement_policy=None,
             circulationdata_replacement_policy=None,
             cutoff_time=None):
    self._db = _db
    self.api = api
    output_source = DataSource.lookup(_db, datasource)
    input_identifier_types = [output_source.primary_identifier_type]
    service_name = "%s Bibliographic Coverage Provider" % datasource
    metadata_replacement_policy = (
        metadata_replacement_policy or ReplacementPolicy.from_metadata_source())
    circulationdata_replacement_policy = (
        circulationdata_replacement_policy or ReplacementPolicy.from_license_source())
    self.metadata_replacement_policy = metadata_replacement_policy
    self.circulationdata_replacement_policy = circulationdata_replacement_policy
    super(BibliographicCoverageProvider, self).__init__(
        service_name, input_identifier_types, output_source,
        batch_size=batch_size, cutoff_time=cutoff_time)
def test_non_open_access_book_not_mirrored(self):
    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    m = Metadata(data_source=data_source)

    mirror = DummyS3Uploader(fail=True)
    h = DummyHTTPClient()
    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    content = "foo"
    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href="http://example.com/",
        content=content,
        rights_uri=RightsStatus.IN_COPYRIGHT
    )

    identifier = self._identifier()
    link_obj, is_new = identifier.add_link(
        rel=link.rel, href=link.href, data_source=data_source,
        media_type=link.media_type, content=link.content,
    )

    # The Hyperlink object makes it look like an open-access book,
    # but the context we have from the OPDS feed says that it's
    # not.
    m.mirror_link(None, data_source, link, link_obj, policy)

    # No HTTP requests were made.
    eq_([], h.requests)

    # Nothing was uploaded.
    eq_([], mirror.uploaded)
def data_source(self): """Look up or create a DataSource object representing the source of this OPDS feed. """ return DataSource.lookup( self._db, self.data_source_name, autocreate=True, offers_licenses=self.data_source_offers_licenses )
def test_mirror_open_access_link_mirror_failure(self):
    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    m = Metadata(data_source=data_source)

    mirror = DummyS3Uploader(fail=True)
    h = DummyHTTPClient()
    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    content = open(self.sample_cover_path("test-book-cover.png")).read()
    link = LinkData(
        rel=Hyperlink.IMAGE,
        media_type=Representation.JPEG_MEDIA_TYPE,
        href="http://example.com/",
        content=content
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel, href=link.href, data_source=data_source,
        license_pool=pool, media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(200, media_type=Representation.JPEG_MEDIA_TYPE)

    m.mirror_link(edition, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    eq_(None, representation.fetch_exception)
    assert representation.fetched_at != None

    # But mirroring failed.
    assert representation.mirror_exception != None
    eq_(None, representation.mirrored_at)

    eq_(link.media_type, representation.media_type)
    eq_(link.href, representation.url)

    # The mirror url should still be set.
    assert "Gutenberg" in representation.mirror_url
    assert representation.mirror_url.endswith(
        "%s/cover.jpg" % edition.primary_identifier.identifier)

    # Book content is still there since it wasn't mirrored.
    assert representation.content != None

    # The edition's identifier-associated license pool should not be
    # suppressed just because the fetch failed while getting the image.
    eq_(False, pool.suppressed)

    # The license pool only gets its license_exception column filled in
    # if the fetch failed while getting a Hyperlink.OPEN_ACCESS_DOWNLOAD-type epub.
    eq_(None, pool.license_exception)
def _credential(self, data_source_name=DataSource.GUTENBERG,
                type=None, patron=None):
    data_source = DataSource.lookup(self._db, data_source_name)
    type = type or self._str
    patron = patron or self._patron()
    credential, is_new = Credential.persistent_token_create(
        self._db, data_source, type, patron
    )
    return credential
def data_source(self): """Look up the DataSource object corresponding to the service we're running this data through. Out of an excess of caution, we look up the DataSource every time, rather than storing it, in case a CoverageProvider is ever used in an environment where the database session is scoped (e.g. the circulation manager). """ return DataSource.lookup(self._db, self.DATA_SOURCE_NAME)
def test_mirror_with_content_modifier(self):
    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    m = Metadata(data_source=data_source)

    mirror = DummyS3Uploader()

    def dummy_content_modifier(representation):
        representation.content = "Replaced Content"

    h = DummyHTTPClient()
    policy = ReplacementPolicy(mirror=mirror,
                               content_modifier=dummy_content_modifier,
                               http_get=h.do_get)

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href="http://example.com/test.epub",
        content="I'm an epub",
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel, href=link.href, data_source=data_source,
        license_pool=pool, media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

    m.mirror_link(edition, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    eq_(None, representation.fetch_exception)
    assert representation.fetched_at != None

    # The mirror url is set.
    assert "Gutenberg" in representation.mirror_url
    assert representation.mirror_url.endswith(
        "%s/%s.epub" % (edition.primary_identifier.identifier, edition.title))

    # Content isn't there since it was mirrored.
    eq_(None, representation.content)

    # The representation was mirrored, with the modified content.
    eq_([representation], mirror.uploaded)
    eq_(["Replaced Content"], mirror.content)
def test_mirror_open_access_link_mirror_failure(self):
    mirror = DummyS3Uploader(fail=True)
    h = DummyHTTPClient()

    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)
    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel, href=link.href, data_source=data_source,
        license_pool=pool, media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

    circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    eq_(None, representation.fetch_exception)
    assert representation.fetched_at != None

    # But mirroring failed.
    assert representation.mirror_exception != None
    eq_(None, representation.mirrored_at)

    eq_(link.media_type, representation.media_type)
    eq_(link.href, representation.url)

    # The mirror url should still be set.
    assert "Gutenberg" in representation.mirror_url
    assert representation.mirror_url.endswith("%s.epub" % edition.title)

    # Book content is still there since it wasn't mirrored.
    assert representation.content != None

    # The license pool is suppressed when mirroring fails.
    eq_(True, pool.suppressed)
    assert representation.mirror_exception in pool.license_exception
def _licensepool(self, edition, open_access=True,
                 data_source_name=DataSource.GUTENBERG,
                 with_open_access_download=False,
                 set_edition_as_presentation=False,
                 collection=None):
    source = DataSource.lookup(self._db, data_source_name)
    if not edition:
        edition = self._edition(data_source_name)
    collection = collection or self._default_collection
    pool, ignore = get_one_or_create(
        self._db, LicensePool,
        create_method_kwargs=dict(open_access=open_access),
        identifier=edition.primary_identifier,
        data_source=source,
        collection=collection,
        availability_time=datetime.utcnow()
    )

    if set_edition_as_presentation:
        pool.presentation_edition = edition

    if with_open_access_download:
        pool.open_access = True
        url = "http://foo.com/" + self._str
        media_type = MediaTypes.EPUB_MEDIA_TYPE
        link, new = pool.identifier.add_link(
            Hyperlink.OPEN_ACCESS_DOWNLOAD, url, source, media_type
        )

        # Add a DeliveryMechanism for this download
        pool.set_delivery_mechanism(
            media_type,
            DeliveryMechanism.NO_DRM,
            RightsStatus.GENERIC_OPEN_ACCESS,
            link.resource,
        )

        representation, is_new = self._representation(
            url, media_type, "Dummy content", mirrored=True)
        link.resource.representation = representation
    else:
        # Add a DeliveryMechanism for this licensepool
        pool.set_delivery_mechanism(
            MediaTypes.EPUB_MEDIA_TYPE,
            DeliveryMechanism.ADOBE_DRM,
            RightsStatus.UNKNOWN,
            None
        )
        pool.licenses_owned = pool.licenses_available = 1

    return pool
def test_cover_image_url(self):
    identifier = self._identifier(foreign_id="ABOOK")
    buckets = {S3Uploader.BOOK_COVERS_BUCKET_KEY: 'thecovers'}
    uploader = self._uploader(**buckets)
    m = uploader.cover_image_url

    unglueit = DataSource.lookup(self._db, DataSource.UNGLUE_IT)
    identifier = self._identifier(foreign_id="ABOOK")
    eq_(u'https://s3.amazonaws.com/thecovers/scaled/601/unglue.it/Gutenberg+ID/ABOOK/filename',
        m(unglueit, identifier, "filename", scaled_size=601))
def test_import_one_feed(self):
    # Check coverage records are created.
    monitor = OPDSImportMonitor(
        self._db, "http://url", DataSource.OA_CONTENT_SERVER,
        DoomedOPDSImporter)
    data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

    feed = self.content_server_mini_feed

    monitor.import_one_feed(feed, "http://root-url/")

    editions = self._db.query(Edition).all()

    # One edition has been imported
    eq_(1, len(editions))
    [edition] = editions

    # That edition has a CoverageRecord.
    record = CoverageRecord.lookup(
        editions[0].primary_identifier, data_source,
        operation=CoverageRecord.IMPORT_OPERATION)
    eq_(CoverageRecord.SUCCESS, record.status)
    eq_(None, record.exception)

    # The edition's primary identifier has a cover link whose
    # relative URL has been resolved relative to the URL we passed
    # into import_one_feed.
    [cover] = [
        x.resource.url for x in editions[0].primary_identifier.links
        if x.rel == Hyperlink.IMAGE
    ]
    eq_("http://root-url/full-cover-image.png", cover)

    # The 202 status message in the feed caused a transient failure.
    # The exception caused a persistent failure.
    coverage_records = self._db.query(CoverageRecord).filter(
        CoverageRecord.operation == CoverageRecord.IMPORT_OPERATION,
        CoverageRecord.status != CoverageRecord.SUCCESS)
    eq_(
        sorted([CoverageRecord.TRANSIENT_FAILURE,
                CoverageRecord.PERSISTENT_FAILURE]),
        sorted([x.status for x in coverage_records]))

    identifier, ignore = Identifier.parse_urn(
        self._db, "urn:librarysimplified.org/terms/id/Gutenberg%20ID/10441")
    failure = CoverageRecord.lookup(
        identifier, data_source,
        operation=CoverageRecord.IMPORT_OPERATION)
    assert "Utter failure!" in failure.exception
def test_classifications_from_another_source_not_updated(self):
    # Set up an edition whose primary identifier has two
    # classifications.
    source1 = DataSource.lookup(self._db, DataSource.AXIS_360)
    source2 = DataSource.lookup(self._db, DataSource.METADATA_WRANGLER)
    edition = self._edition()
    identifier = edition.primary_identifier
    c1 = identifier.classify(source1, Subject.TAG, "i will persist")
    c2 = identifier.classify(source2, Subject.TAG, "i will perish")

    # Now we get some new metadata from source #2.
    subjects = [SubjectData(type=Subject.TAG, identifier="i will conquer")]
    metadata = Metadata(subjects=subjects, data_source=source2)

    replace = ReplacementPolicy(subjects=True)
    metadata.apply(edition, replace=replace)

    # The old classification from source #2 has been destroyed.
    # The old classification from source #1 is still there.
    eq_(['i will conquer', 'i will persist'],
        sorted([x.subject.identifier for x in identifier.classifications]))
def to_customlist(self, _db):
    """Turn this NYTBestSeller list into a CustomList object."""
    data_source = DataSource.lookup(_db, DataSource.NYT)
    l, was_new = get_one_or_create(
        _db,
        CustomList,
        data_source=data_source,
        foreign_identifier=self.foreign_identifier,
        create_method_kwargs=dict(created=self.created,))
    l.name = self.name
    l.updated = self.updated
    self.update_custom_list(l)
    return l
def test_parse_list_as_identifiers_with_data_source(self):
    lp1 = self._licensepool(None, data_source_name=DataSource.UNGLUE_IT)
    lp2 = self._licensepool(None, data_source_name=DataSource.FEEDBOOKS)
    lp3 = self._licensepool(None, data_source_name=DataSource.FEEDBOOKS)
    i1, i2, i3 = [lp.identifier for lp in [lp1, lp2, lp3]]

    i1.type = i2.type = Identifier.URI
    source = DataSource.lookup(self._db, DataSource.FEEDBOOKS)

    # Only URIs with a FeedBooks LicensePool are selected.
    identifiers = IdentifierInputScript.parse_identifier_list(
        self._db, Identifier.URI, source, [])
    eq_([i2], identifiers)
def __init__(self, manager_class, data_source_name, list_identifier,
             list_name, primary_language, description, **manager_kwargs):
    data_source = DataSource.lookup(self._db, data_source_name)
    self.custom_list, is_new = get_one_or_create(
        self._db, CustomList,
        data_source_id=data_source.id,
        foreign_identifier=list_identifier,
    )
    self.custom_list.primary_language = primary_language
    self.custom_list.description = description
    self.membership_manager = manager_class(self.custom_list, **manager_kwargs)
def setup(self):
    super(TestCustomListFromCSV, self).setup()
    self.data_source = DataSource.lookup(self._db, DataSource.LIBRARY_STAFF)
    self.metadata = DummyMetadataClient()
    self.metadata.lookups['Octavia Butler'] = 'Butler, Octavia'
    self.l = CustomListFromCSV(
        self.data_source.name, "Test list",
        metadata_client=self.metadata,
        display_author_field='author',
        identifier_fields={Identifier.ISBN: "isbn"})
    self.custom_list, ignore = self._customlist(
        data_source_name=self.data_source.name, num_entries=0)
    self.now = datetime.datetime.utcnow()
def test_register_equivalency(self):
    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    id = "549"

    # We've got a record.
    record, was_new = Edition.for_foreign_id(
        self._db, data_source, Identifier.GUTENBERG_ID, id)

    # Then we look it up and discover another identifier for it.
    data_source_2 = DataSource.lookup(self._db, DataSource.OCLC)
    record2, was_new = Edition.for_foreign_id(
        self._db, data_source_2, Identifier.OCLC_NUMBER, "22")

    eq = record.primary_identifier.equivalent_to(
        data_source_2, record2.primary_identifier, 1)

    eq_(eq.input, record.primary_identifier)
    eq_(eq.output, record2.primary_identifier)
    eq_(eq.data_source, data_source_2)

    eq_([eq], record.primary_identifier.equivalencies)
    eq_(set([record, record2]), set(record.equivalent_editions().all()))
def test_mirror_open_access_link_fetch_failure(self):
    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    m = Metadata(data_source=data_source)

    mirror = DummyS3Uploader()
    h = DummyHTTPClient()
    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    link = LinkData(
        rel=Hyperlink.IMAGE,
        media_type=Representation.JPEG_MEDIA_TYPE,
        href="http://example.com/",
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel, href=link.href, data_source=data_source,
        license_pool=pool, media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(403)

    m.mirror_link(edition, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # Fetch failed, so we should have a fetch exception but no mirror url.
    assert representation.fetch_exception != None
    eq_(None, representation.mirror_exception)
    eq_(None, representation.mirror_url)
    eq_(link.href, representation.url)
    assert representation.fetched_at != None
    eq_(None, representation.mirrored_at)

    # The edition's identifier-associated license pool should not be
    # suppressed just because the fetch failed while getting the image.
    eq_(False, pool.suppressed)

    # The license pool only gets its license_exception column filled in
    # if the fetch failed while getting a Hyperlink.OPEN_ACCESS_DOWNLOAD-type epub.
    eq_(None, pool.license_exception)
def test_mirror_open_access_link_fetch_failure(self):
    mirror = DummyS3Uploader()
    h = DummyHTTPClient()

    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)
    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel, href=link.href, data_source=data_source,
        license_pool=pool, media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(403)

    circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # Fetch failed, so we should have a fetch exception but no mirror url.
    assert representation.fetch_exception != None
    eq_(None, representation.mirror_exception)
    eq_(None, representation.mirror_url)
    eq_(link.href, representation.url)
    assert representation.fetched_at != None
    eq_(None, representation.mirrored_at)

    # The license pool is suppressed when fetch fails.
    eq_(True, pool.suppressed)
    assert representation.fetch_exception in pool.license_exception
def __init__(self, _db=None, cmd_args=None):
    _db = _db or self._db
    args = self.parse_command_line(_db, cmd_args=cmd_args)
    self.identifier_type = args.identifier_type
    self.identifiers = args.identifiers

    subject_type = args.subject_type
    subject_identifier = args.subject_identifier
    subject_name = args.subject_name
    if not subject_name and not subject_identifier:
        raise ValueError(
            "Either subject-name or subject-identifier must be provided.")

    self.data_source = DataSource.lookup(_db, args.data_source)
    self.weight = args.weight
    self.subject, ignore = Subject.lookup(
        _db, subject_type, subject_identifier, subject_name,
        autocreate=args.create_subject)
def _edition(self, data_source_name=DataSource.GUTENBERG, identifier_type=Identifier.GUTENBERG_ID, with_license_pool=False, with_open_access_download=False, title=None, language="eng", authors=None, identifier_id=None, series=None, collection=None, publicationDate=None ): id = identifier_id or self._str source = DataSource.lookup(self._db, data_source_name) wr = Edition.for_foreign_id( self._db, source, identifier_type, id)[0] if not title: title = self._str wr.title = unicode(title) wr.medium = Edition.BOOK_MEDIUM if series: wr.series = series if language: wr.language = language if authors is None: authors = self._str if isinstance(authors, basestring): authors = [authors] if authors != []: wr.add_contributor(unicode(authors[0]), Contributor.PRIMARY_AUTHOR_ROLE) wr.author = unicode(authors[0]) for author in authors[1:]: wr.add_contributor(unicode(author), Contributor.AUTHOR_ROLE) if publicationDate: wr.published = publicationDate if with_license_pool or with_open_access_download: pool = self._licensepool( wr, data_source_name=data_source_name, with_open_access_download=with_open_access_download, collection=collection ) pool.set_presentation_edition() return wr, pool return wr
import os
import site
import sys
from nose.tools import set_trace

d = os.path.split(__file__)[0]
site.addsitedir(os.path.join(d, ".."))

from model import DataSource, LicensePool, SessionManager, Work, Identifier
from model import production_session

if __name__ == "__main__":
    session = production_session()
    data_source_name = sys.argv[1]
    identifier = sys.argv[2]
    data_source = DataSource.lookup(session, data_source_name)
    wid, ignore = Identifier.for_foreign_id(
        session, data_source.primary_identifier_type, identifier, False)
    pool = (
        session.query(LicensePool)
        .filter(LicensePool.data_source == data_source)
        .filter(LicensePool.identifier == wid)
        .one()
    )
    primary_edition = pool.edition()
    old_work = primary_edition.work
    if old_work:
        old_work.license_pools.remove(pool)
    primary_edition.work = None
    pool.calculate_work()
    work = pool.work
    work.calculate_presentation()
import os
import site

from nose.tools import set_trace

d = os.path.split(__file__)[0]
site.addsitedir(os.path.join(d, ".."))

from model import (
    Edition,
    production_session,
    DataSource,
    Work,
)
from sqlalchemy.orm import joinedload

a = 0
db = production_session()
start = 0
batch_size = 1000
source = DataSource.lookup(db, DataSource.THREEM)
base_query = db.query(Work).join(Work.primary_edition).filter(
    Edition.data_source==source).order_by(Work.id).options(
        joinedload('summary'),
        joinedload('primary_edition', 'cover')).limit(batch_size)

batch = base_query.offset(start).all()
while batch:
    for work in batch:
        if not work.primary_edition:
            continue
        if work.primary_edition.cover:
            work.primary_edition.set_cover(work.primary_edition.cover)
            print work.primary_edition.cover_thumbnail_url
        else:
            print "!COVER %s" % work.primary_edition.primary_identifier
        if work.summary:
            work.set_summary(work.summary)
            print work.summary.content[:70]
        genre = classification.genre.name
        genredata = classifier.genres[genre]
        parentage = [x.name for x in genredata.parents] + [genre]
        parentage.reverse()
        while len(parentage) < 3:
            parentage.append("")
        stats[tuple(parentage)][source] += 1
    return stats

if __name__ == '__main__':
    _db = production_session()
    out = csv.writer(sys.stdout)
    sources = [DataSource.lookup(_db, x) for x in [
        DataSource.GUTENBERG, DataSource.OVERDRIVE, DataSource.THREEM]]
    out.writerow(
        ["Classification", "Parent", "Grandparent"]
        + [x.name for x in sources] + ["Total"])
    for audience in "Adult", "Young Adult", "Children":
        base_query = _db.query(Work).filter(Work.audience==audience)
        by_source = count_for_each_data_source(base_query, sources)
        row = [by_source[source] for source in sources]
        row += [sum(row)]
        row = [audience, "", ""] + row
        out.writerow(row)
    out.writerow([])
    for fiction, name in (True, "Fiction"), (False, "Nonfiction"), (None, "No Fiction Status"):
        base_query = _db.query(Work).filter(Work.fiction==fiction)
    modified = datetime.datetime.fromtimestamp(os.stat(path).st_mtime)
    data = open(path).read()
    representation, ignore = get_one_or_create(
        db, Representation, url=url, data_source=data_source)
    representation.status_code = 200
    representation.content = data
    representation.media_type = 'application/xml'
    representation.fetched_at = modified
    print url

if __name__ == '__main__':
    data_dir = sys.argv[1]
    template = "http://cloudlibraryapi.3m.com/cirrus/library/a4tmf/data/cloudevents?startdate=%s&enddate=%s"
    db = production_session()
    threem = DataSource.lookup(db, DataSource.THREEM)
    cache_path = os.path.join(data_dir, "3M", "cache", "events")
    a = 0
    for filename in os.listdir(cache_path):
        path = os.path.join(cache_path, filename)
        start_date = filename[:19]
        end_date = filename[20:]
        url = template % (start_date, end_date)
        imp(db, threem, path, url)
        a += 1
        if not a % 10:
            db.commit()
    db.commit()
import sys
import json
import gzip

from model import (
    CirculationEvent,
    DataSource,
    CoverageRecord,
    production_session,
    Identifier,
    Measurement,
    LicensePool,
)

database = production_session()
data_dir = sys.argv[1]

OVERDRIVE = DataSource.lookup(database, DataSource.OVERDRIVE)

TIME_FORMAT = "%Y-%m-%dT%H:%M:%S+00:00"

def process_item(_db, item):
    overdrive_id = item['id']
    event_name = item['event']
    old_value = item.get('old_value', 0)
    new_value = item.get('new_value', 0)
    if event_name in ('check_out', 'check_in'):
        x = new_value
        new_value = old_value
        old_value = x
    elif event_name in ('hold_release', 'hold_place', 'license_remove'):
        pass
    representation.status_code = status_code
    representation.content = data
    representation.location = location
    representation.media_type = media_type
    representation.fetched_at = modified
    return True

if __name__ == '__main__':
    data_dir = sys.argv[1]
    db = production_session()
    oclc = OCLCLinkedData(db)
    d = os.path.join(data_dir, "OCLC Linked Data", "cache", "OCLC Number")
    cache = FakeCache(d, 4, False)
    source = DataSource.lookup(db, DataSource.OCLC_LINKED_DATA)
    min_oclc = 1284796
    max_oclc = 2052405
    batch_size = 10000
    type = Identifier.OCLC_NUMBER
    cursor = min_oclc
    while cursor < max_oclc:
        first_time = time.time()
        processed = 0
        max_batch = cursor + batch_size
        q = db.query(Identifier).filter(
            Identifier.type==Identifier.OCLC_NUMBER).filter(
                Identifier.id >= cursor).filter(Identifier.id < max_batch)
        for identifier in q:
            if imp(db, source, identifier, cache):
                processed += 1
    fn = cache._filename(fn)
    modified = datetime.datetime.fromtimestamp(os.stat(fn).st_mtime)
    data = cache.open(fn).read()
    a = dict(collection_token=library['collectionToken'], item_id=i)
    url = OverdriveAPI.METADATA_ENDPOINT % a
    representation, ignore = get_one_or_create(
        db, Representation, url=url, data_source=data_source,
        identifier=identifier)
    representation.status_code = 200
    representation.content = data
    representation.media_type = 'application/json'
    representation.fetched_at = modified
    print identifier

if __name__ == '__main__':
    data_dir = sys.argv[1]
    overdrive = OverdriveAPI(data_dir)
    library = overdrive.get_library()
    db = production_session()
    b = overdrive.bibliographic_cache
    source = DataSource.lookup(db, DataSource.OVERDRIVE)
    q = db.query(Identifier).filter(Identifier.type==Identifier.OVERDRIVE_ID)
    a = 0
    for i in q:
        imp(db, source, i, b, library)
        a += 1
        if not a % 1000:
            db.commit()
def __init__(self, db):
    self._db = db
    self.overdrive = DataSource.lookup(self._db, DataSource.OVERDRIVE)