def request(self, path, body=None, method="GET", identifier=None,
            max_age=None):
    path = self.full_path(path)
    url = self.full_url(path)
    if method == 'GET':
        headers = {"Accept": "application/xml"}
    else:
        headers = {"Content-Type": "application/xml"}
    self.sign(method, headers, path)
    # print headers
    # self.log.debug("3M request: %s %s", method, url)
    if max_age and method == 'GET':
        representation, cached = Representation.get(
            self._db, url, extra_request_headers=headers,
            do_get=self._simple_http_get, max_age=max_age,
            exception_handler=Representation.reraise_exception,
        )
        return representation.content
    else:
        return self._request_with_timeout(
            method, url, data=body, headers=headers,
            allow_redirects=False)
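# A minimal usage sketch (hypothetical path, assumed client setup for a
# ThreeMAPI-style object): when max_age is set, GET responses are served
# from the Representation cache, so repeated calls within the window do
# not hit the 3M service again.
#
#   api = ThreeMAPI(...)                        # assumed constructor
#   xml = api.request("items", max_age=60 * 60) # cached up to an hour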
def get_library(self):
    url = self.LIBRARY_ENDPOINT % dict(library_id=self.library_id)
    representation, cached = Representation.get(
        self._db, url, self.get,
        exception_handler=Representation.reraise_exception,
    )
    return json.loads(representation.content)
def get_library(self):
    """Get basic information about the collection, including
    a link to the titles in the collection.
    """
    url = self._library_endpoint
    representation, cached = Representation.get(
        self._db, url, self.get,
        exception_handler=Representation.reraise_exception,
    )
    return json.loads(representation.content)
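# Usage sketch, assuming `api` is a configured client exposing
# get_library(). The "links"/"href" shape mirrors the structure consumed
# by get_advantage_accounts() below; the "titles" key specifically is an
# assumption based on the docstring, not confirmed by this excerpt.
#
#   library = api.get_library()
#   titles_url = library.get('links', {}).get('titles', {}).get('href')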
def main():
    FORMAT = '%(asctime)s %(levelname)s %(message)s'
    logging.basicConfig(format=FORMAT)
    logging.getLogger().setLevel(logging.INFO)
    args = parse_args()
    lang_map = {i: fn for i, fn in enumerate(sorted(listdir(args.lang_map)))}
    if args.train.endswith('.mtx'):
        mtx = mmread(args.train).todense()
        t_mtx = mmread(args.test).todense()
    else:
        with open(args.train) as stream:
            mtx = np.loadtxt(stream, np.float64)
        with open(args.test) as stream:
            t_mtx = np.loadtxt(stream, np.float64)
    labels = np.ravel(mtx[:, 0])
    test_labels = t_mtx[:, 0]
    test_mtx = t_mtx[:, 1:]
    if args.scale:
        train = scale(mtx[:, 1:], with_mean=False)
    else:
        train = mtx[:, 1:]
    kwargs = {}
    for a in args.params:
        k, v = a.split('=')
        try:
            v = int(v)
        except ValueError:
            # keep non-numeric parameter values as strings
            pass
        kwargs[k] = v
    r = Representation(args.encoder, args.classifier, **kwargs)
    r.encode(train)
    logging.info('Matrix encoded')
    r.train_classifier(labels)
    logging.info('Model trained')
    acc = 0
    N = 0
    for vec_ in test_mtx:
        vec = np.ravel(vec_)
        cl = r.classify_vector(vec, with_probs=args.with_probs)
        try:
            lab = test_labels[N, 0]
        except IndexError:
            lab = test_labels[N]
        N += 1
        if args.with_probs:
            guess = max(enumerate(cl[0, :]), key=lambda x: x[1])[0]
            print('{0}\t{1}\t{2}'.format(
                '\t'.join(map(str, cl[0, :])),
                lang_map[guess], lang_map[int(lab)]))
        else:
            try:
                guess = int(cl[0, 0])
            except IndexError:
                guess = int(cl + 0.5)
            print('{0}\t{1}'.format(lang_map[guess], lang_map[int(lab)]))
        if int(guess) == int(lab):
            acc += 1
def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
    if not path.startswith(self.BASE_URL):
        if not path.startswith("/"):
            path = "/" + path
        url = self.BASE_URL + path
    else:
        url = path
    joiner = '?'
    if '?' in url:
        joiner = '&'
    url += joiner + "api-key=" + self.api_key
    representation, cached = Representation.get(
        self._db, url, do_get=self.do_get, max_age=max_age,
        debug=True, pause_before=0.1)
    return json.loads(representation.content)
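# A self-contained sketch of the api-key joining rule above (key and
# URLs here are hypothetical): "?" starts the query string unless the
# URL already has one, in which case "&" appends to it.
def _append_api_key(url, api_key):
    joiner = '&' if '?' in url else '?'
    return url + joiner + "api-key=" + api_key

assert _append_api_key("http://example.com/lists.json", "KEY") == \
    "http://example.com/lists.json?api-key=KEY"
assert _append_api_key("http://example.com/lists.json?list=fiction", "KEY") == \
    "http://example.com/lists.json?list=fiction&api-key=KEY"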
def get_advantage_accounts(self):
    """Find all the Overdrive Advantage accounts managed by this
    library.

    :return: A sequence of OverdriveAdvantageAccount objects, empty
        if this library has no Advantage accounts.
    """
    library = self.get_library()
    advantage = library.get('links', {}).get('advantageAccounts')
    if not advantage:
        return []
    # This library has Overdrive Advantage accounts, or at
    # least a link where some may be found.
    advantage_url = advantage.get('href')
    if not advantage_url:
        return []
    representation, cached = Representation.get(
        self._db, advantage_url, self.get,
        exception_handler=Representation.reraise_exception,
    )
    return OverdriveAdvantageAccount.from_representation(
        representation.content
    )
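# Usage sketch, assuming `api` is a configured Overdrive client; the
# result is empty when the library has no Advantage accounts, so plain
# iteration is safe.
#
#   for account in api.get_advantage_accounts():
#       print(account)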
def book_info_to_metadata(cls, book, include_bibliographic=True,
                          include_formats=True):
    """Turn Overdrive's JSON representation of a book into a
    Metadata object.

    Note: the JSON data passed into this method comes from a
    different file/stream than the JSON data that goes into the
    book_info_to_circulation() method.
    """
    if 'id' not in book:
        return None
    overdrive_id = book['id']
    primary_identifier = IdentifierData(
        Identifier.OVERDRIVE_ID, overdrive_id)

    if include_bibliographic:
        title = book.get('title', None)
        sort_title = book.get('sortTitle')
        subtitle = book.get('subtitle', None)
        series = book.get('series', None)
        publisher = book.get('publisher', None)
        imprint = book.get('imprint', None)

        if 'publishDate' in book:
            published = datetime.datetime.strptime(
                book['publishDate'][:10], cls.DATE_FORMAT)
        else:
            published = None

        languages = [l['code'] for l in book.get('languages', [])]
        if 'eng' in languages or not languages:
            language = 'eng'
        else:
            language = sorted(languages)[0]

        contributors = []
        for creator in book.get('creators', []):
            sort_name = creator['fileAs']
            display_name = creator['name']
            role = creator['role']
            roles = cls.parse_roles(overdrive_id, role) or [
                Contributor.UNKNOWN_ROLE]
            contributor = ContributorData(
                sort_name=sort_name, display_name=display_name,
                roles=roles, biography=creator.get('bioText', None))
            contributors.append(contributor)

        subjects = []
        for sub in book.get('subjects', []):
            subject = SubjectData(
                type=Subject.OVERDRIVE, identifier=sub['value'],
                weight=100)
            subjects.append(subject)

        for sub in book.get('keywords', []):
            subject = SubjectData(
                type=Subject.TAG, identifier=sub['value'], weight=1)
            subjects.append(subject)

        extra = dict()
        if 'grade_levels' in book:
            # n.b. Grade levels are measurements of reading level, not
            # age appropriateness. We can use them as a measure of age
            # appropriateness in a pinch, but we weight them less
            # heavily than other information from Overdrive.
            for i in book['grade_levels']:
                subject = SubjectData(
                    type=Subject.GRADE_LEVEL, identifier=i['value'],
                    weight=10)
                subjects.append(subject)

        overdrive_medium = book.get('mediaType', None)
        if (overdrive_medium and overdrive_medium not in
                cls.overdrive_medium_to_simplified_medium):
            cls.log.error("Could not process medium %s for %s",
                          overdrive_medium, overdrive_id)

        medium = cls.overdrive_medium_to_simplified_medium.get(
            overdrive_medium, Edition.BOOK_MEDIUM)

        measurements = []
        if 'awards' in book:
            extra['awards'] = book.get('awards', [])
            num_awards = len(extra['awards'])
            measurements.append(
                MeasurementData(Measurement.AWARDS, str(num_awards)))

        for name, subject_type in (
                ('ATOS', Subject.ATOS_SCORE),
                ('lexileScore', Subject.LEXILE_SCORE),
                ('interestLevel', Subject.INTEREST_LEVEL)):
            if name not in book:
                continue
            identifier = str(book[name])
            subjects.append(
                SubjectData(type=subject_type, identifier=identifier,
                            weight=100))

        for grade_level_info in book.get('gradeLevels', []):
            grade_level = grade_level_info.get('value')
            subjects.append(
                SubjectData(type=Subject.GRADE_LEVEL,
                            identifier=grade_level, weight=100))

        identifiers = []
        links = []
        for format in book.get('formats', []):
            for new_id in format.get('identifiers', []):
                t = new_id['type']
                v = new_id['value']
                orig_v = v
                type_key = None
                if t == 'ASIN':
                    type_key = Identifier.ASIN
                elif t == 'ISBN':
                    type_key = Identifier.ISBN
                    if len(v) == 10:
                        v = isbnlib.to_isbn13(v)
                    if v is None or not isbnlib.is_isbn13(v):
                        # Overdrive sometimes uses invalid values like
                        # "n/a" as placeholders. Ignore such values to
                        # avoid a situation where hundreds of books
                        # appear to have the same ISBN. ISBNs that fail
                        # the check digit test, or are otherwise
                        # invalid, also occur. Log them for review.
                        cls.log.info("Bad ISBN value provided: %s",
                                     orig_v)
                        continue
                elif t == 'DOI':
                    type_key = Identifier.DOI
                elif t == 'UPC':
                    type_key = Identifier.UPC
                elif t == 'PublisherCatalogNumber':
                    continue
                if type_key and v:
                    identifiers.append(IdentifierData(type_key, v, 1))

            # Samples become links.
            if 'samples' in format:
                if format['id'] not in cls.format_data_for_overdrive_format:
                    # Useless to us.
                    continue
                content_type, drm_scheme = (
                    cls.format_data_for_overdrive_format.get(format['id']))
                if Representation.is_media_type(content_type):
                    for sample_info in format['samples']:
                        href = sample_info['url']
                        links.append(
                            LinkData(rel=Hyperlink.SAMPLE, href=href,
                                     media_type=content_type))

        # A cover and its thumbnail become a single LinkData.
        if 'images' in book:
            images = book['images']
            image_data = cls.image_link_to_linkdata(
                images.get('cover'), Hyperlink.IMAGE)
            for name in ['cover300Wide', 'cover150Wide', 'thumbnail']:
                # Try to get a thumbnail that's as close as possible
                # to the size we use.
                image = images.get(name)
                thumbnail_data = cls.image_link_to_linkdata(
                    image, Hyperlink.THUMBNAIL_IMAGE)
                if not image_data:
                    image_data = cls.image_link_to_linkdata(
                        image, Hyperlink.IMAGE)
                if thumbnail_data:
                    break

            if image_data:
                if thumbnail_data:
                    image_data.thumbnail = thumbnail_data
                links.append(image_data)

        # Descriptions become links.
        short = book.get('shortDescription')
        full = book.get('fullDescription')
        if full:
            links.append(
                LinkData(
                    rel=Hyperlink.DESCRIPTION,
                    content=full,
                    media_type="text/html",
                ))
        if short and (not full or not full.startswith(short)):
            links.append(
                LinkData(
                    rel=Hyperlink.SHORT_DESCRIPTION,
                    content=short,
                    media_type="text/html",
                ))

        # Add measurements: rating and popularity.
        if book.get('starRating') is not None and book['starRating'] > 0:
            measurements.append(
                MeasurementData(quantity_measured=Measurement.RATING,
                                value=book['starRating']))

        if book.get('popularity'):
            measurements.append(
                MeasurementData(quantity_measured=Measurement.POPULARITY,
                                value=book['popularity']))

        metadata = Metadata(
            data_source=DataSource.OVERDRIVE,
            title=title,
            subtitle=subtitle,
            sort_title=sort_title,
            language=language,
            medium=medium,
            series=series,
            publisher=publisher,
            imprint=imprint,
            published=published,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            subjects=subjects,
            contributors=contributors,
            measurements=measurements,
            links=links,
        )
    else:
        metadata = Metadata(
            data_source=DataSource.OVERDRIVE,
            primary_identifier=primary_identifier,
        )

    if include_formats:
        formats = []
        for format in book.get('formats', []):
            format_id = format['id']
            if format_id in cls.format_data_for_overdrive_format:
                content_type, drm_scheme = (
                    cls.format_data_for_overdrive_format.get(format_id))
                formats.append(FormatData(content_type, drm_scheme))
            elif format_id not in cls.ignorable_overdrive_formats:
                cls.log.error(
                    "Could not process Overdrive format %s for %s",
                    format_id, overdrive_id)

        # Also make a CirculationData so we can write the formats.
        circulationdata = CirculationData(
            data_source=DataSource.OVERDRIVE,
            primary_identifier=primary_identifier,
            formats=formats,
        )
        metadata.circulation = circulationdata

    return metadata
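# A minimal sketch of the conversion above. The enclosing class is not
# shown in this excerpt, so `Extractor` stands in for it, and the book
# dict is a hypothetical, heavily abbreviated Overdrive record.
#
#   book = {
#       'id': '3896665d-9d81-4cac-bd43-ffc5066de1f5',  # hypothetical
#       'title': 'Example Title',
#       'languages': [{'code': 'eng'}],
#       'formats': [],
#   }
#   metadata = Extractor.book_info_to_metadata(book)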
def _do_get(self, url, headers):
    """This method is overridden in MockOverdriveAPI."""
    return Representation.simple_http_get(url, headers)
def _simple_http_get(self, url, headers, *args, **kwargs):
    """This will be overridden in MockThreeMAPI."""
    return Representation.simple_http_get(url, headers, *args, **kwargs)
train_CURL = ContrastiveDataset(
    *build_CURL_dataset(X_train, y_train, CURL_TRAIN_SIZE))
assert len(train_CURL) == CURL_TRAIN_SIZE
test_CURL = ContrastiveDataset(
    *build_CURL_dataset(X_test, y_test, CURL_TEST_SIZE))

train_data = GMMDataset(X_train, y_train)
test_data = GMMDataset(X_test, y_test)
train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)
curl_train_loader = DataLoader(train_CURL, shuffle=True,
                               batch_size=BATCH_SIZE)
curl_test_loader = DataLoader(test_CURL, shuffle=False,
                              batch_size=BATCH_SIZE)

# Model
curl_model = Representation(INPUT_DIM, HIDDEN_DIM, OUT_DIM)
sup_model = ClassificationNet(
    Representation(INPUT_DIM, HIDDEN_DIM, OUT_DIM),
    FCLayer(OUT_DIM, N_CENTERS))

writer_str = ("CURL/GMM-"
              + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
              + "-" + str(INPUT_DIM) + "-" + str(HIDDEN_DIM)
              + "-" + str(OUT_DIM) + "-" + str(LR)
              + "-" + str(BATCH_SIZE))
writer = SummaryWriter(writer_str)

"""
sup_model = train_multiclass_sup(
    train_loader,
    sup_model,
    writer,
    N_EPOCH,
    LR,
    verbose=True,
)
test_data = MNIST(
    ROOT_DIR, train=False, download=DOWNLOAD,
    transform=transforms.ToTensor()
)
X_train, y_train = MNIST_pre_processing(train_data)
X_test, y_test = MNIST_pre_processing(test_data)

train_CURL = ContrastiveDataset(
    *build_CURL_dataset(X_train, y_train, CURL_TRAIN_SIZE))
test_CURL = ContrastiveDataset(
    *build_CURL_dataset(X_test, y_test, CURL_TEST_SIZE))
curl_train_loader = DataLoader(train_CURL, shuffle=True,
                               batch_size=BATCH_SIZE)
curl_test_loader = DataLoader(test_CURL, shuffle=False,
                              batch_size=BATCH_SIZE)

curl_model = Representation(INPUT_DIM, HIDDEN_DIM, OUT_DIM)

writer_str = (
    "CURL/MNIST-CURL-"
    + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    + "-" + str(INPUT_DIM)
    + "-" + str(HIDDEN_DIM)
    + "-" + str(OUT_DIM)
    + "-" + str(LR)
    + "-" + str(BATCH_SIZE)
def main():
    r = Representation('pca', 'naive_bayes', dimension=3)
    raw_mtx = numpy.array([[1, 1, 1, 0, 0, 0],
                           [1, 0, 1, 0, 0, 0],
                           [1, 1, 1, 0, 0, 0],
                           [0, 0, 1, 1, 1, 0],
                           [0, 0, 1, 1, 0, 0],
                           [0, 0, 1, 1, 1, 0]])
    r.encode(raw_mtx)
    r.train_classifier([0, 0, 0, 1, 1, 1])
    print(r.classify_vector([1, 2, 1, 0, 1, 0]))