Ejemplo n.º 1
0
 def request(self,
             path,
             body=None,
             method="GET",
             identifier=None,
             max_age=None):
     """Send a signed XML request, routing cacheable GETs through Representation.

     :param path: Request path; expanded with ``self.full_path`` and then
         turned into a URL with ``self.full_url``.
     :param body: Payload handed to ``self._request_with_timeout`` as
         ``data`` on the uncached path.
     :param method: HTTP method name. ``'GET'`` sends an ``Accept`` header;
         any other method sends a ``Content-Type`` header instead.
     :param identifier: Not used by this method.
     :param max_age: When truthy and the method is ``'GET'``, the request is
         made through ``Representation.get`` with this ``max_age``.
     :return: The representation's ``content`` on the cached path, otherwise
         whatever ``self._request_with_timeout`` returns.
     """
     path = self.full_path(path)
     url = self.full_url(path)

     header_name = "Accept" if method == 'GET' else "Content-Type"
     headers = {header_name: "application/xml"}
     # sign() receives the headers dict and the expanded path.
     self.sign(method, headers, path)

     if not max_age or method != 'GET':
         # Uncached path: issue the request directly, redirects disabled.
         return self._request_with_timeout(
             method,
             url,
             data=body,
             headers=headers,
             allow_redirects=False,
         )

     representation, _cached = Representation.get(
         self._db,
         url,
         extra_request_headers=headers,
         do_get=self._simple_http_get,
         max_age=max_age,
         exception_handler=Representation.reraise_exception,
     )
     return representation.content
Ejemplo n.º 2
0
 def get_library(self):
     """Fetch this library's document and return it parsed from JSON.

     The URL is ``self.LIBRARY_ENDPOINT`` with ``self.library_id`` filled
     in. The fetch goes through ``Representation.get`` using ``self.get``
     as the retrieval callable and ``Representation.reraise_exception`` as
     the exception handler.
     """
     endpoint = self.LIBRARY_ENDPOINT % {"library_id": self.library_id}
     library_doc, _was_cached = Representation.get(
         self._db,
         endpoint,
         self.get,
         exception_handler=Representation.reraise_exception,
     )
     raw_json = library_doc.content
     return json.loads(raw_json)
Ejemplo n.º 3
0
 def get_library(self):
     """Return basic information about the collection as parsed JSON.

     The returned document includes a link to the titles in the
     collection. It is fetched from ``self._library_endpoint`` through
     ``Representation.get``, with ``self.get`` as the retrieval callable
     and ``Representation.reraise_exception`` as the exception handler.
     """
     fetch_result = Representation.get(
         self._db,
         self._library_endpoint,
         self.get,
         exception_handler=Representation.reraise_exception,
     )
     library_doc, _was_cached = fetch_result
     return json.loads(library_doc.content)
Ejemplo n.º 4
0
def main():
    """Train a Representation on one matrix and print predictions for another.

    The training and test matrices are read from the paths in the parsed
    arguments: with ``mmread`` when the training path ends in ``.mtx``,
    otherwise with ``np.loadtxt``. Column 0 of each matrix is the label and
    the remaining columns are the features. One tab-separated line is
    printed per test row: the predicted language and the gold language,
    preceded by the per-class scores when ``args.with_probs`` is set.
    """
    FORMAT = '%(asctime)s %(levelname)s %(message)s'
    logging.basicConfig(format=FORMAT)
    logging.getLogger().setLevel(logging.INFO)
    args = parse_args()
    # Label index -> file name, in sorted directory order.
    lang_map = dict(enumerate(sorted(listdir(args.lang_map))))
    if args.train.endswith('.mtx'):
        mtx = mmread(args.train).todense()
        t_mtx = mmread(args.test).todense()
    else:
        with open(args.train) as stream:
            mtx = np.loadtxt(stream, np.float64)
        with open(args.test) as stream:
            t_mtx = np.loadtxt(stream, np.float64)
    labels = np.ravel(mtx[:, 0])
    test_labels = t_mtx[:, 0]
    test_mtx = t_mtx[:, 1:]
    if args.scale:
        train = scale(mtx[:, 1:], with_mean=False)
    else:
        train = mtx[:, 1:]

    # Extra "key=value" parameters are forwarded to Representation.
    kwargs = {}
    for param in args.params:
        # Split on the first '=' only, so a value may itself contain '='.
        key, value = param.split('=', 1)
        try:
            value = int(value)
        except ValueError:
            # Not an integer literal: pass the raw string through unchanged.
            pass
        kwargs[key] = value

    r = Representation(args.encoder, args.classifier, **kwargs)
    r.encode(train)
    logging.info('Matrix encoded')
    r.train_classifier(labels)
    logging.info('Model trained')

    # NOTE(review): `acc` and `N` are accumulated but not reported within
    # the visible part of this function -- confirm against the full file.
    acc = 0
    N = 0
    for vec_ in test_mtx:
        vec = np.ravel(vec_)
        cl = r.classify_vector(vec, with_probs=args.with_probs)
        # The label column is 2-D in one input layout and 1-D in the other.
        try:
            lab = test_labels[N, 0]
        except IndexError:
            lab = test_labels[N]
        N += 1
        if args.with_probs:
            # Pick the class index with the highest score.
            guess = max(enumerate(cl[0, :]), key=lambda x: x[1])[0]
            print('{0}\t{1}\t{2}'.format('\t'.join(map(str, cl[0, :])), lang_map[guess], lang_map[int(lab)]))
        else:
            try:
                guess = int(cl[0, 0])
            except IndexError:
                # Non-indexable prediction: round to the nearest label.
                guess = int(cl + 0.5)
            print('{0}\t{1}'.format(lang_map[guess], lang_map[int(lab)]))
        if int(guess) == int(lab):
            acc += 1
Ejemplo n.º 5
0
 def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
     """Fetch ``path`` with the API key appended and return the parsed JSON.

     :param path: Either a full URL starting with ``self.BASE_URL`` or a
         path relative to it; a leading ``/`` is added when missing.
     :param identifier: Not used by this method.
     :param max_age: Passed through to ``Representation.get``.
     :return: The response content decoded with ``json.loads``.
     """
     if path.startswith(self.BASE_URL):
         url = path
     else:
         relative = path if path.startswith("/") else "/" + path
         url = self.BASE_URL + relative

     # Append the key as a new query string or as an extra parameter.
     separator = '&' if '?' in url else '?'
     url = url + separator + "api-key=" + self.api_key

     representation, _cached = Representation.get(
         self._db,
         url,
         do_get=self.do_get,
         max_age=max_age,
         debug=True,
         pause_before=0.1,
     )
     return json.loads(representation.content)
Ejemplo n.º 6
0
    def get_advantage_accounts(self):
        """Find all the Overdrive Advantage accounts managed by this library.

        :return: The result of
            ``OverdriveAdvantageAccount.from_representation`` applied to the
            content of the library's Advantage accounts document, or an
            empty list when the library document has no
            ``advantageAccounts`` link or that link has no ``href``.
        """
        library = self.get_library()
        links = library.get('links', {})
        advantage = links.get('advantageAccounts')
        if not advantage:
            return []

        # This library has Overdrive Advantage accounts, or at
        # least a link where some may be found.
        advantage_url = advantage.get('href')
        if not advantage_url:
            # Return an empty list rather than None so that every "nothing
            # found" outcome is iterable by the caller.
            return []

        representation, cached = Representation.get(
            self._db, advantage_url, self.get,
            exception_handler=Representation.reraise_exception,
        )
        return OverdriveAdvantageAccount.from_representation(
            representation.content
        )
Ejemplo n.º 7
0
    def book_info_to_metadata(cls,
                              book,
                              include_bibliographic=True,
                              include_formats=True):
        """Turn Overdrive's JSON representation of a book into a Metadata
        object.

        Note:  The json data passed into this method is from a different file/stream
        from the json data that goes into the book_info_to_circulation() method.

        :param book: Dict-like Overdrive book document. Must contain an
            ``id`` key for anything to be returned.
        :param include_bibliographic: When true, the Metadata carries title,
            language, contributors, subjects, identifiers, measurements and
            links derived from ``book``. When false, it carries only the
            data source and the primary identifier.
        :param include_formats: When true, a CirculationData listing the
            recognized formats is attached as ``metadata.circulation``.
        :return: A Metadata object, or None when ``book`` has no ``id``.
        """
        if not 'id' in book:
            return None
        overdrive_id = book['id']
        primary_identifier = IdentifierData(Identifier.OVERDRIVE_ID,
                                            overdrive_id)

        if include_bibliographic:
            title = book.get('title', None)
            sort_title = book.get('sortTitle')
            subtitle = book.get('subtitle', None)
            series = book.get('series', None)
            publisher = book.get('publisher', None)
            imprint = book.get('imprint', None)

            if 'publishDate' in book:
                # Only the first ten characters of the value are parsed.
                published = datetime.datetime.strptime(
                    book['publishDate'][:10], cls.DATE_FORMAT)
            else:
                published = None

            # Prefer 'eng' when it is listed or when no language is given;
            # otherwise take the alphabetically first language code.
            languages = [l['code'] for l in book.get('languages', [])]
            if 'eng' in languages or not languages:
                language = 'eng'
            else:
                language = sorted(languages)[0]

            contributors = []
            for creator in book.get('creators', []):
                sort_name = creator['fileAs']
                display_name = creator['name']
                role = creator['role']
                # Fall back to the unknown role when parse_roles yields
                # nothing truthy.
                roles = cls.parse_roles(overdrive_id,
                                        role) or [Contributor.UNKNOWN_ROLE]
                contributor = ContributorData(sort_name=sort_name,
                                              display_name=display_name,
                                              roles=roles,
                                              biography=creator.get(
                                                  'bioText', None))
                contributors.append(contributor)

            # Overdrive subjects get a high weight; free-form keywords
            # become tags with a low weight.
            subjects = []
            for sub in book.get('subjects', []):
                subject = SubjectData(type=Subject.OVERDRIVE,
                                      identifier=sub['value'],
                                      weight=100)
                subjects.append(subject)

            for sub in book.get('keywords', []):
                subject = SubjectData(type=Subject.TAG,
                                      identifier=sub['value'],
                                      weight=1)
                subjects.append(subject)

            # NOTE(review): `extra` only ever receives 'awards' below and is
            # not passed to Metadata within this method.
            extra = dict()
            if 'grade_levels' in book:
                # n.b. Grade levels are measurements of reading level, not
                # age appropriateness. We can use them as a measure of age
                # appropriateness in a pinch, but we weight them less
                # heavily than other information from Overdrive.
                for i in book['grade_levels']:
                    subject = SubjectData(type=Subject.GRADE_LEVEL,
                                          identifier=i['value'],
                                          weight=10)
                    subjects.append(subject)

            # An unrecognized medium is logged and then treated as a book.
            overdrive_medium = book.get('mediaType', None)
            if overdrive_medium and overdrive_medium not in cls.overdrive_medium_to_simplified_medium:
                cls.log.error("Could not process medium %s for %s",
                              overdrive_medium, overdrive_id)

            medium = cls.overdrive_medium_to_simplified_medium.get(
                overdrive_medium, Edition.BOOK_MEDIUM)

            measurements = []
            if 'awards' in book:
                # The measurement is the number of awards, as a string.
                extra['awards'] = book.get('awards', [])
                num_awards = len(extra['awards'])
                measurements.append(
                    MeasurementData(Measurement.AWARDS, str(num_awards)))

            # Reading-level scores present on the book become subjects.
            for name, subject_type in (('ATOS', Subject.ATOS_SCORE),
                                       ('lexileScore', Subject.LEXILE_SCORE),
                                       ('interestLevel',
                                        Subject.INTEREST_LEVEL)):
                if not name in book:
                    continue
                identifier = str(book[name])
                subjects.append(
                    SubjectData(type=subject_type,
                                identifier=identifier,
                                weight=100))

            # Unlike 'grade_levels' above, 'gradeLevels' entries are
            # weighted at 100.
            for grade_level_info in book.get('gradeLevels', []):
                grade_level = grade_level_info.get('value')
                subjects.append(
                    SubjectData(type=Subject.GRADE_LEVEL,
                                identifier=grade_level,
                                weight=100))

            # Each format may contribute extra identifiers and sample links.
            identifiers = []
            links = []
            for format in book.get('formats', []):
                for new_id in format.get('identifiers', []):
                    t = new_id['type']
                    v = new_id['value']
                    orig_v = v
                    # Identifier types not handled below leave type_key as
                    # None and are dropped.
                    type_key = None
                    if t == 'ASIN':
                        type_key = Identifier.ASIN
                    elif t == 'ISBN':
                        type_key = Identifier.ISBN
                        # Ten-character ISBNs are converted to ISBN-13.
                        if len(v) == 10:
                            v = isbnlib.to_isbn13(v)
                        if v is None or not isbnlib.is_isbn13(v):
                            # Overdrive sometimes uses invalid values
                            # like "n/a" as placeholders. Ignore such
                            # values to avoid a situation where hundreds of
                            # books appear to have the same ISBN. ISBNs
                            # which fail check digit checks or are invalid
                            # also can occur. Log them for review.
                            cls.log.info("Bad ISBN value provided: %s", orig_v)
                            continue
                    elif t == 'DOI':
                        type_key = Identifier.DOI
                    elif t == 'UPC':
                        type_key = Identifier.UPC
                    elif t == 'PublisherCatalogNumber':
                        continue
                    if type_key and v:
                        identifiers.append(IdentifierData(type_key, v, 1))

                # Samples become links.
                if 'samples' in format:

                    if not format['id'] in cls.format_data_for_overdrive_format:
                        # Useless to us.
                        continue
                    content_type, drm_scheme = cls.format_data_for_overdrive_format.get(
                        format['id'])
                    # Samples are linked only when the content type passes
                    # Representation.is_media_type.
                    if Representation.is_media_type(content_type):
                        for sample_info in format['samples']:
                            href = sample_info['url']
                            links.append(
                                LinkData(rel=Hyperlink.SAMPLE,
                                         href=href,
                                         media_type=content_type))

            # A cover and its thumbnail become a single LinkData.
            if 'images' in book:
                images = book['images']
                image_data = cls.image_link_to_linkdata(
                    images.get('cover'), Hyperlink.IMAGE)
                for name in ['cover300Wide', 'cover150Wide', 'thumbnail']:
                    # Try to get a thumbnail that's as close as possible
                    # to the size we use.
                    image = images.get(name)
                    thumbnail_data = cls.image_link_to_linkdata(
                        image, Hyperlink.THUMBNAIL_IMAGE)
                    # With no usable 'cover', the same image also serves as
                    # the full-size image.
                    if not image_data:
                        image_data = cls.image_link_to_linkdata(
                            image, Hyperlink.IMAGE)
                    if thumbnail_data:
                        break

                if image_data:
                    if thumbnail_data:
                        image_data.thumbnail = thumbnail_data
                    links.append(image_data)

            # Descriptions become links.
            short = book.get('shortDescription')
            full = book.get('fullDescription')
            if full:
                links.append(
                    LinkData(
                        rel=Hyperlink.DESCRIPTION,
                        content=full,
                        media_type="text/html",
                    ))

            # The short description is skipped when the full description
            # begins with the same text.
            if short and (not full or not full.startswith(short)):
                links.append(
                    LinkData(
                        rel=Hyperlink.SHORT_DESCRIPTION,
                        content=short,
                        media_type="text/html",
                    ))

            # Add measurements: rating and popularity
            # A star rating that is missing or not above zero is left out.
            if book.get('starRating') is not None and book['starRating'] > 0:
                measurements.append(
                    MeasurementData(quantity_measured=Measurement.RATING,
                                    value=book['starRating']))

            if book.get('popularity'):
                measurements.append(
                    MeasurementData(quantity_measured=Measurement.POPULARITY,
                                    value=book['popularity']))

            metadata = Metadata(
                data_source=DataSource.OVERDRIVE,
                title=title,
                subtitle=subtitle,
                sort_title=sort_title,
                language=language,
                medium=medium,
                series=series,
                publisher=publisher,
                imprint=imprint,
                published=published,
                primary_identifier=primary_identifier,
                identifiers=identifiers,
                subjects=subjects,
                contributors=contributors,
                measurements=measurements,
                links=links,
            )
        else:
            # Identifier-only Metadata when bibliographic data is not wanted.
            metadata = Metadata(
                data_source=DataSource.OVERDRIVE,
                primary_identifier=primary_identifier,
            )

        if include_formats:
            formats = []
            for format in book.get('formats', []):
                format_id = format['id']
                if format_id in cls.format_data_for_overdrive_format:
                    content_type, drm_scheme = cls.format_data_for_overdrive_format.get(
                        format_id)
                    formats.append(FormatData(content_type, drm_scheme))
                elif format_id not in cls.ignorable_overdrive_formats:
                    # Neither recognized nor explicitly ignorable.
                    cls.log.error(
                        "Could not process Overdrive format %s for %s",
                        format_id, overdrive_id)

            # Also make a CirculationData so we can write the formats,
            circulationdata = CirculationData(
                data_source=DataSource.OVERDRIVE,
                primary_identifier=primary_identifier,
                formats=formats,
            )

            metadata.circulation = circulationdata

        return metadata
Ejemplo n.º 8
0
 def _do_get(self, url, headers):
     """Perform an HTTP GET through ``Representation.simple_http_get``.

     This is a separate method so that MockOverdriveAPI can override it.
     """
     response = Representation.simple_http_get(url, headers)
     return response
Ejemplo n.º 9
0
 def _simple_http_get(self, url, headers, *args, **kwargs):
     """Perform an HTTP GET through ``Representation.simple_http_get``.

     Extra positional and keyword arguments are forwarded unchanged.
     This is a separate method so that MockThreeMAPI can override it.
     """
     response = Representation.simple_http_get(
         url, headers, *args, **kwargs
     )
     return response
Ejemplo n.º 10
0
    *build_CURL_dataset(X_train, y_train, CURL_TRAIN_SIZE))
# Sanity check: the contrastive training set has exactly the requested size.
assert len(train_CURL) == CURL_TRAIN_SIZE
# Contrastive test set, built from the held-out split.
test_CURL = ContrastiveDataset(
    *build_CURL_dataset(X_test, y_test, CURL_TEST_SIZE))

# Labelled datasets for the supervised baseline.
train_data = GMMDataset(X_train, y_train)
test_data = GMMDataset(X_test, y_test)

# Training loaders shuffle; test loaders keep a fixed order.
train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)

curl_train_loader = DataLoader(train_CURL, shuffle=True, batch_size=BATCH_SIZE)
curl_test_loader = DataLoader(test_CURL, shuffle=False, batch_size=BATCH_SIZE)

# Models: a bare Representation for contrastive training, and a second,
# separately constructed Representation wrapped with a classification
# layer for the supervised model.
curl_model = Representation(INPUT_DIM, HIDDEN_DIM, OUT_DIM)
sup_model = ClassificationNet(Representation(INPUT_DIM, HIDDEN_DIM, OUT_DIM),
                              FCLayer(OUT_DIM, N_CENTERS))

# Run name passed to SummaryWriter: timestamp plus the hyperparameters.
writer_str = ("CURL/GMM-" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") +
              "-" + str(INPUT_DIM) + "-" + str(HIDDEN_DIM) + "-" +
              str(OUT_DIM) + "-" + str(LR) + "-" + str(BATCH_SIZE))
writer = SummaryWriter(writer_str)
"""
sup_model = train_multiclass_sup(
    train_loader,
    sup_model,
    writer,
    N_EPOCH,
    LR,
    verbose=True,
Ejemplo n.º 11
0
)
# Held-out MNIST split, converted to tensors on load.
test_data = MNIST(
    ROOT_DIR, train=False, download=DOWNLOAD, transform=transforms.ToTensor()
)

X_train, y_train = MNIST_pre_processing(train_data)
X_test, y_test = MNIST_pre_processing(test_data)

# Contrastive datasets for the training and held-out splits.
train_CURL = ContrastiveDataset(*build_CURL_dataset(X_train, y_train, CURL_TRAIN_SIZE))
# NOTE(review): the test set is sized with CURL_TRAIN_SIZE; confirm whether a
# separate test-size constant was intended.
test_CURL = ContrastiveDataset(*build_CURL_dataset(X_test, y_test, CURL_TRAIN_SIZE))


curl_train_loader = DataLoader(train_CURL, shuffle=True, batch_size=BATCH_SIZE)
# The test loader must wrap the held-out set (it previously wrapped
# train_CURL) and keep a fixed order so evaluation is reproducible.
curl_test_loader = DataLoader(test_CURL, shuffle=False, batch_size=BATCH_SIZE)

curl_model = Representation(INPUT_DIM, HIDDEN_DIM, OUT_DIM)


writer_str = (
    "CURL/MNIST-CURL-"
    + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    + "-"
    + str(INPUT_DIM)
    + "-"
    + str(HIDDEN_DIM)
    + "-"
    + str(OUT_DIM)
    + "-"
    + str(LR)
    + "-"
    + str(BATCH_SIZE)
Ejemplo n.º 12
0
def main():
    """Smoke-test Representation on a toy matrix and print one prediction.

    Encodes a 6x6 matrix with the 'pca' encoder, trains the 'naive_bayes'
    classifier on two classes of three rows each, then prints the result
    of classifying a single vector.
    """
    r = Representation('pca',  'naive_bayes', dimension=3)
    raw_mtx = numpy.array([
        [1, 1, 1, 0, 0, 0],
        [1, 0, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [0, 0, 1, 1, 1, 0],
        [0, 0, 1, 1, 0, 0],
        [0, 0, 1, 1, 1, 0],
    ])
    r.encode(raw_mtx)
    r.train_classifier([0, 0, 0, 1, 1, 1])
    # Call form: prints the same on Python 2 for a single argument, and is
    # valid syntax on Python 3.
    print(r.classify_vector([1, 2, 1, 0, 1, 0]))