Example #1
0
 def _load_solrdata(self):
     """Compute aggregate stats for this seed: ebook/edition/work counts
     and the last-update time.

     Editions are counted directly; other seed types query solr using
     this seed's query term. Returns an empty dict when no query term
     can be built.
     """
     if self.type == "edition":
         # An edition counts itself once; it is an ebook iff it has an
         # Internet Archive identifier (ocaid).
         return {
             'ebook_count': int(bool(self.document.ocaid)),
             'edition_count': 1,
             'work_count': 1,
             'last_update': self.document.last_modified
         }
     else:
         q = self.get_solr_query_term()
         if q:
             solr = get_solr()
             result = solr.select(q, fields=["edition_count", "ebook_count_i"])
             # Collect last_update_i from only those docs that carry it.
             last_update_i = [doc['last_update_i'] for doc in result.docs if 'last_update_i' in doc]
             if last_update_i:
                 # NOTE(review): the whole list is passed here — presumably
                 # _inttime_to_datetime expects a single int timestamp
                 # (e.g. max(last_update_i)); confirm against its definition.
                 last_update = self._inttime_to_datetime(last_update_i)
             else:
                 # if last_update is not present in solr, consider last_modified of
                 # that document as last_update
                 if self.type in ['work', 'author']:
                     last_update = self.document.last_modified
                 else:
                     last_update = None
             return {
                 'ebook_count': sum(doc.get('ebook_count_i', 0) for doc in result.docs),
                 'edition_count': sum(doc.get('edition_count', 0) for doc in result.docs),
                 'work_count': result.num_found,
                 'last_update': last_update
             }
     # No usable solr query term for this seed type.
     return {}
Example #2
0
def get_solr_works(work_key: Iterable[str]) -> dict[str, dict]:
    """Fetch the solr documents for the given work keys.

    Returns a mapping of work key -> solr doc; keys missing from solr
    are simply absent from the result.
    """
    from openlibrary.plugins.worksearch.search import get_solr

    unique_keys = set(work_key)
    solr_docs = get_solr().get_many(unique_keys, fields=DEFAULT_SEARCH_FIELDS)
    keyed: dict[str, dict] = {}
    for solr_doc in solr_docs:
        keyed[solr_doc['key']] = solr_doc
    return keyed
Example #3
0
    def GET(self):
        """Work autocomplete: search by title, or look a work up directly
        when the query is a work OLID.
        """
        i = web.input(q="", limit=5)
        i.limit = safeint(i.limit, 5)

        solr = get_solr()
        q = solr.escape(i.q).strip()

        if is_work_olid(q.upper()):
            # ensure uppercase; key is case sensitive in solr
            solr_q = 'key:"/works/%s"' % q.upper()
        else:
            # boost exact title matches over prefix matches
            solr_q = 'title:"%s"^2 OR title:(%s*)' % (q, q)

        data = solr.select(
            solr_q,
            q_op='AND',
            sort='edition_count desc',
            rows=i.limit,
            fq='type:work',
            # limit the fields returned for better performance
            fl='key,title,subtitle,cover_i,first_publish_year,author_name,edition_count',
        )

        # exclude fake works that actually have an edition key
        docs = []
        for doc in data['docs']:
            if doc['key'][-1] == 'W':
                docs.append(doc)

        for doc in docs:
            # Required by the frontend
            doc['name'] = doc['key'].split('/')[-1]
            doc['full_title'] = doc['title']
            if 'subtitle' in doc:
                doc['full_title'] += ": " + doc['subtitle']

        return to_json(docs)
Example #4
0
    def _get_solr_data(self):
        """Fetch this document's solr record with a limited field set.

        Returns the first matching solr doc, or None if no doc matched or
        the solr request failed. The result is cached by writing it into
        ``self.__dict__['_solr_data']`` (replacing the property lookup).
        """
        fields = [
            "cover_edition_key", "cover_id", "edition_key",
            "first_publish_year", "has_fulltext", "lending_edition_s",
            "checked_out", "public_scan_b", "ia"
        ]

        solr = get_solr()
        stats.begin("solr", query={"key": self.key}, fields=fields)
        try:
            d = solr.select({"key": self.key}, fields=fields)
        except Exception:
            # Broad catch is deliberate: a solr outage must not break the page.
            logging.getLogger("openlibrary").exception(
                "Failed to get solr data")
            return None
        finally:
            stats.end()

        w = d.docs[0] if d.num_found > 0 else None

        # Replace _solr_data property with the attribute
        self.__dict__['_solr_data'] = w
        return w
Example #5
0
    def GET(self):
        """Author autocomplete: prefix search on names, or direct lookup
        when the query is an author OLID.
        """
        i = web.input(q="", limit=5)
        i.limit = safeint(i.limit, 5)

        solr = get_solr()
        q = solr.escape(i.q).strip()

        if is_author_olid(q.upper()):
            # ensure uppercase; key is case sensitive in solr
            solr_q = 'key:"/authors/%s"' % q.upper()
        else:
            prefix_q = q + "*"
            solr_q = 'name:(%s) OR alternate_names:(%s)' % (prefix_q, prefix_q)

        data = solr.select(
            solr_q,
            q_op='AND',
            sort='work_count desc',
            rows=i.limit,
            fq='type:author',
        )
        docs = data['docs']

        for doc in docs:
            # Frontend expects a `works` list and a `subjects` list.
            doc['works'] = [doc.pop('top_work')] if 'top_work' in doc else []
            doc['subjects'] = doc.pop('top_subjects', [])

        return to_json(docs)
Example #6
0
    def find_matches(self, i):
        """
        Tries to find an edition, or work, or multiple work candidates that match the given input data.

        Case#1: No match. None is returned.
        Case#2: Work match but not edition. Work is returned.
        Case#3: Work match and edition match. Edition is returned
        Case#4: Multiple work match. List of works is returned.

        :param web.utils.Storage i: addbook user supplied formdata
        :rtype: None or list or Work or Edition
        :return: None or Work or Edition or list of Works that are likely matches.
        """

        i.publish_year = i.publish_date and self.extract_year(i.publish_date)
        author_key = i.authors and i.authors[0].author.key

        # work is set from the templates/books/check.html page.
        work_key = i.get('work')

        # work_key is set to none-of-these when user selects none-of-these link.
        if work_key == 'none-of-these':
            return None  # Case 1, from check page

        work = work_key and web.ctx.site.get(work_key)
        if work:
            edition = self.try_edition_match(work=work,
                                             publisher=i.publisher,
                                             publish_year=i.publish_year,
                                             id_name=i.id_name,
                                             id_value=i.id_value)
            return edition or work  # Case 3 or 2, from check page

        edition = self.try_edition_match(title=i.title,
                                         author_key=author_key,
                                         publisher=i.publisher,
                                         publish_year=i.publish_year,
                                         id_name=i.id_name,
                                         id_value=i.id_value)

        if edition:
            return edition  # Case 2 or 3 or 4, from add page

        solr = get_solr()
        # Less exact solr search than try_edition_match(), search by supplied title and author only.
        query = {'title': i.title}
        # BUGFIX: author_key may be None/falsy when the form carried no
        # authors; calling .split() on it raised AttributeError. Only add
        # the author constraint when an author key was actually supplied.
        if author_key:
            query['author_key'] = author_key.split("/")[-1]
        result = solr.select(query, doc_wrapper=make_work, q_op="AND")

        if result.num_found == 0:
            return None  # Case 1, from add page
        elif result.num_found == 1:
            return result.docs[0]  # Case 2
        else:
            return result.docs  # Case 4
Example #7
0
 def _get_edition_keys_from_solr(self, query_terms):
     """Yield "/books/OL...M" keys of editions matching any of the given
     solr query terms. Yields nothing when query_terms is empty.
     """
     if not query_terms:
         return
     solr = get_solr()
     query = " OR ".join(query_terms)
     result = solr.select(query, fields=["edition_key"], rows=10000)
     for doc in result['docs']:
         # Some docs carry no edition_key field at all.
         for olid in doc.get('edition_key', []):
             yield "/books/" + olid
Example #8
0
 def _get_edition_keys_from_solr(self, query_terms):
     """Generator of full "/books/..." keys for editions matching the
     OR-combined query terms; empty input yields nothing.
     """
     if not query_terms:
         return
     combined_query = " OR ".join(query_terms)
     response = get_solr().select(combined_query, fields=["edition_key"], rows=10000)
     for doc in response['docs']:
         if 'edition_key' in doc:
             for edition_olid in doc['edition_key']:
                 yield "/books/" + edition_olid
Example #9
0
 def get_solr_query_term(self):
     """Return the solr query term that selects this seed's records.

     Implicitly returns None for unrecognized seed types.
     """
     if self.type == 'edition':
         return "edition_key:" + self._get_document_basekey()
     elif self.type == 'work':
         return 'key:/works/' + self._get_document_basekey()
     elif self.type == 'author':
         return "author_key:" + self._get_document_basekey()
     elif self.type == 'subject':
         # renamed from `type` to avoid shadowing the builtin
         subject_type, value = self.key.split(":", 1)
         # escaping value as it can have special chars like : etc.
         value = get_solr().escape(value)
         return "%s_key:%s" % (subject_type, value)
Example #10
0
 def get_solr_query_term(self):
     """Build the solr query term that matches this seed's records;
     falls through (returning None) for unknown seed types.
     """
     if self.type == 'edition':
         return f"edition_key:{self._get_document_basekey()}"
     if self.type == 'work':
         return f'key:/works/{self._get_document_basekey()}'
     if self.type == 'author':
         return f"author_key:{self._get_document_basekey()}"
     if self.type == 'subject':
         subject_type, value = self.key.split(":", 1)
         # escaping value as it can have special chars like : etc.
         value = get_solr().escape(value)
         return f"{subject_type}_key:{value}"
Example #11
0
    def _get_all_subjects(self):
        """Return the subject/place/person/time facets of the works matched
        by this list's seeds, sorted by count descending.

        Returns [] when there are too many seeds or the solr query fails.
        """
        solr = get_solr()
        q = self._get_solr_query_for_subjects()

        # Solr has a maxBooleanClauses limit; a query built from too many
        # seeds would exceed it, so skip the facet query entirely.
        if len(self.seeds) > 500:
            # logger.warn is deprecated; use logger.warning
            logger.warning(
                "More than 500 seeds. skipping solr query for finding subjects."
            )
            return []

        facet_names = [
            'subject_facet', 'place_facet', 'person_facet', 'time_facet'
        ]
        try:
            # fields=[] — only the facet counts are needed, not the docs.
            result = solr.select(q,
                                 fields=[],
                                 facets=facet_names,
                                 facet_limit=20,
                                 facet_mincount=1)
        except OSError:
            logger.error("Error in finding subjects of list %s",
                         self.key,
                         exc_info=True)
            return []

        def get_subject_prefix(facet_name):
            # Plain subjects have no prefix; others are namespaced, e.g. "place:".
            name = facet_name.replace("_facet", "")
            if name == 'subject':
                return ''
            else:
                return name + ":"

        def process_subject(facet_name, title, count):
            # Build the web.storage record expected by the templates.
            prefix = get_subject_prefix(facet_name)
            key = prefix + title.lower().replace(" ", "_")
            url = "/subjects/" + key
            return web.storage({
                "title": title,
                "name": title,
                "count": count,
                "key": key,
                "url": url
            })

        def process_all():
            facets = result['facets']
            for k in facet_names:
                for f in facets.get(k, []):
                    yield process_subject(f.name, f.value, f.count)

        return sorted(process_all(), reverse=True, key=lambda s: s["count"])
Example #12
0
    def _solr_data(self):
        """Fetch this document's solr record with a limited field set.

        Returns None if the solr request fails; the stats timer is
        closed either way via the finally clause.
        """
        fields = [
            "cover_edition_key", "cover_id", "edition_key", "first_publish_year",
            "has_fulltext", "lending_edition_s", "public_scan_b", "ia"]

        solr = get_solr()
        stats.begin("solr", get=self.key, fields=fields)
        try:
            return solr.get(self.key, fields=fields)
        except Exception:
            # Broad catch is deliberate: solr being down must not break rendering.
            logging.getLogger("openlibrary").exception("Failed to get solr data")
            return None
        finally:
            stats.end()
Example #13
0
    def GET(self):
        """Work autocomplete: title search, or direct lookup when the query
        is a work OLID; falls back to the db for works not yet in solr.
        """
        i = web.input(q="", limit=5)
        i.limit = safeint(i.limit, 5)

        solr = get_solr()
        q = solr.escape(i.q).strip()

        query_is_key = is_work_olid(q.upper())
        if query_is_key:
            # ensure uppercase; key is case sensitive in solr
            solr_q = 'key:"/works/%s"' % q.upper()
        else:
            solr_q = f'title:"{q}"^2 OR title:({q}*)'

        data = solr.select(
            solr_q,
            q_op='AND',
            sort='edition_count desc',
            rows=i.limit,
            fq='type:work',
            # limit the fields returned for better performance
            fl='key,title,subtitle,cover_i,first_publish_year,author_name,edition_count',
        )

        # exclude fake works that actually have an edition key
        docs = [doc for doc in data['docs'] if doc['key'][-1] == 'W']

        if query_is_key and not docs:
            # Grumble! Work not in solr yet. Create a dummy.
            key = '/works/%s' % q.upper()
            work = web.ctx.site.get(key)
            if work:
                docs = [work.as_fake_solr_record()]

        for doc in docs:
            # Required by the frontend
            doc['name'] = doc['key'].split('/')[-1]
            doc['full_title'] = doc['title']
            if 'subtitle' in doc:
                doc['full_title'] += ": " + doc['subtitle']

        return to_json(docs)
Example #14
0
def random_ebooks(limit=2000):
    """Return formatted work data for up to *limit* fulltext, publicly
    scannable works that have an Internet Archive identifier.
    """
    solr = search.get_solr()
    response = solr.select(
        query='has_fulltext:true -public_scan_b:false',
        rows=limit,
        sort="edition_count desc",
        fields=[
            'has_fulltext',
            'key',
            'ia',
            "title",
            "cover_edition_key",
            "author_key", "author_name",
        ])

    docs = response.get('docs', [])
    return [format_work_data(doc) for doc in docs if doc.get('ia')]
Example #15
0
    def _get_all_subjects(self):
        """Return the subject/place/person/time facets of the works matched
        by this list's seeds, sorted by count descending.

        Returns [] when there are too many seeds or the solr query fails.
        """
        solr = get_solr()
        q = self._get_solr_query_for_subjects()

        # Solr has a maxBooleanClauses limit; a query built from too many
        # seeds would exceed it, so skip the facet query entirely.
        if len(self.seeds) > 500:
            # logger.warn is deprecated; use logger.warning
            logger.warning("More than 500 seeds. skipping solr query for finding subjects.")
            return []

        facet_names = ['subject_facet', 'place_facet', 'person_facet', 'time_facet']
        try:
            # fields=[] — only the facet counts are needed, not the docs.
            result = solr.select(q,
                fields=[],
                facets=facet_names,
                facet_limit=20,
                facet_mincount=1)
        except OSError:  # IOError is just an alias for OSError in Python 3
            logger.error("Error in finding subjects of list %s", self.key, exc_info=True)
            return []

        def get_subject_prefix(facet_name):
            # Plain subjects have no prefix; others are namespaced, e.g. "place:".
            name = facet_name.replace("_facet", "")
            if name == 'subject':
                return ''
            else:
                return name + ":"

        def process_subject(facet_name, title, count):
            # Build the web.storage record expected by the templates.
            prefix = get_subject_prefix(facet_name)
            key = prefix + title.lower().replace(" ", "_")
            url = "/subjects/" + key
            return web.storage({
                "title": title,
                "name": title,
                "count": count,
                "key": key,
                "url": url
            })

        def process_all():
            facets = result['facets']
            for k in facet_names:
                for f in facets.get(k, []):
                    yield process_subject(f.name, f.value, f.count)

        return sorted(process_all(), reverse=True, key=lambda s: s["count"])
Example #16
0
def random_ebooks(limit=2000):
    """Pick up to *limit* fulltext, publicly scannable works (sorted by
    edition count) and return their formatted data.
    """
    solr = search.get_solr()
    result = solr.select(
        query='has_fulltext:true -public_scan_b:false',
        rows=limit,
        sort="edition_count desc",
        fields=[
            'has_fulltext',
            'key',
            'ia',
            "title",
            "cover_edition_key",
            "author_key", "author_name",
        ])

    ebooks = []
    for doc in result.get('docs', []):
        # Only works with an Internet Archive identifier are usable.
        if doc.get('ia'):
            ebooks.append(format_work_data(doc))
    return ebooks
Example #17
0
    def GET(self):
        """Rebuild store docs for books that solr reports as not borrowed.

        Writes an "ebook" store doc per lending edition and returns a
        plain-text "ok" response.
        """
        from openlibrary.plugins.worksearch.search import get_solr
        result = get_solr().select(query='borrowed_b:false', fields=['key', 'lending_edition_s'], limit=100)

        def make_doc(d):
            # Makes a store doc from solr doc
            return {
                "_key": "ebooks/books/" + d['lending_edition_s'],
                "_rev": None,  # Don't worry about consistency
                "type": "ebook",
                "book_key": "/books/" + d['lending_edition_s'],
                "borrowed": "false"
            }

        docs = [make_doc(d) for d in result['docs']]
        # dict comprehension instead of the dict((k, v) ...) generator form
        docdict = {d['_key']: d for d in docs}
        web.ctx.site.store.update(docdict)
        return delegate.RawText("ok\n")
Example #18
0
def random_ebooks(limit=2000):
    """Select up to *limit* fulltext, publicly scannable works from solr
    and convert each into a template-friendly dict.
    """
    solr = search.get_solr()
    result = solr.select(query='has_fulltext:true -public_scan_b:false',
                         rows=limit,
                         sort="edition_count desc",
                         fields=[
                             'has_fulltext',
                             'key',
                             'ia',
                             "title",
                             "cover_edition_key",
                             "author_key",
                             "author_name",
                         ])

    def format_doc(doc):
        """Build the display dict for one solr work doc."""
        key = doc.get('key', '')
        # New solr stores the key as /works/OLxxxW
        if not key.startswith("/works/"):
            key = "/works/" + key

        data = {'url': key, 'title': doc.get('title', '')}

        if 'author_key' in doc and 'author_name' in doc:
            data['authors'] = [
                {"key": author_key, "name": author_name}
                for author_key, author_name in zip(doc['author_key'], doc['author_name'])
            ]

        if 'cover_edition_key' in doc:
            data['cover_url'] = (
                h.get_coverstore_url() + "/b/olid/%s-M.jpg" % doc['cover_edition_key']
            )

        data['read_url'] = "//archive.org/stream/" + doc['ia'][0]
        return data

    return [
        format_doc(doc) for doc in result.get('docs', []) if doc.get('ia')
    ]
Example #19
0
 def get_solr_query_term(self):
     """Return the solr query term selecting this seed's records, or None
     (with a warning logged) for unsupported seed types.
     """
     if self.type == 'subject':
         typ, value = self.key.split(":", 1)
         # escaping value as it can have special chars like : etc.
         value = get_solr().escape(value)
         return f"{typ}_key:{value}"

     doc_basekey = self.document.key.split("/")[-1]
     if self.type == 'edition':
         return f"edition_key:{doc_basekey}"
     if self.type == 'work':
         return f'key:/works/{doc_basekey}'
     if self.type == 'author':
         return f"author_key:{doc_basekey}"

     logger.warning(
         f"Cannot get solr query term for seed type {self.type}",
         extra={'list': self._list.key, 'seed': self.key},
     )
     return None
Example #20
0
    def GET(self):
        """Author autocomplete: name prefix search, or direct lookup when
        the query is an author OLID; falls back to the db for brand-new
        authors not yet indexed in solr.
        """
        i = web.input(q="", limit=5)
        i.limit = safeint(i.limit, 5)

        solr = get_solr()
        q = solr.escape(i.q).strip()

        query_is_key = is_author_olid(q.upper())
        if query_is_key:
            # ensure uppercase; key is case sensitive in solr
            solr_q = 'key:"/authors/%s"' % q.upper()
        else:
            prefix_q = q + "*"
            solr_q = f'name:({prefix_q}) OR alternate_names:({prefix_q})'

        data = solr.select(
            solr_q,
            q_op='AND',
            sort='work_count desc',
            rows=i.limit,
            fq='type:author',
        )
        docs = data['docs']

        if query_is_key and not docs:
            # Grumble! Must be a new author. Fetch from db, and build a "fake" solr resp
            author = web.ctx.site.get('/authors/%s' % q.upper())
            if author:
                docs = [author.as_fake_solr_record()]

        for doc in docs:
            doc['works'] = [doc.pop('top_work')] if 'top_work' in doc else []
            doc['subjects'] = doc.pop('top_subjects', [])

        return to_json(docs)
Example #21
0
    def try_edition_match(
        self,
        work=None,
        title=None,
        author_key=None,
        publisher=None,
        publish_year=None,
        id_name=None,
        id_value=None,
    ):
        """
        Searches solr for potential edition matches.

        :param web.Storage work:
        :param str title:
        :param str author_key: e.g. /author/OL1234A
        :param str publisher:
        :param str publish_year: yyyy
        :param str id_name: from list of values in mapping below
        :param str id_value:
        :rtype: None or Edition or list
        :return: None, an Edition, or a list of Works
        """
        # insufficient data
        if not publisher and not publish_year and not id_value:
            return

        # Build the solr query dict. The `x and q.setdefault(...)` idiom adds
        # a field only when its value is truthy.
        q = {}
        work and q.setdefault('key', work.key.split("/")[-1])
        title and q.setdefault('title', title)
        author_key and q.setdefault('author_key', author_key.split('/')[-1])
        publisher and q.setdefault('publisher', publisher)
        # There are some errors indexing of publish_year. Use publish_date until it is fixed
        publish_year and q.setdefault('publish_date', publish_year)

        # Maps form id_name values to the corresponding solr field name.
        mapping = {
            'isbn_10': 'isbn',
            'isbn_13': 'isbn',
            'lccn': 'lccn',
            'oclc_numbers': 'oclc',
            'ocaid': 'ia',
        }
        if id_value and id_name in mapping:
            if id_name.startswith('isbn'):
                # solr stores ISBNs without hyphens
                id_value = id_value.replace('-', '')
            q[mapping[id_name]] = id_value

        solr = get_solr()
        result = solr.select(q, doc_wrapper=make_work, q_op="AND")

        if len(result.docs) > 1:
            # found multiple work matches
            return result.docs
        elif len(result.docs) == 1:
            # found one work match
            work = result.docs[0]
            # NOTE(review): presumably fuzzy_find returns the canonical
            # publisher name from work.publisher when one matches — confirm.
            publisher = publisher and fuzzy_find(
                publisher,
                work.publisher,
                stopwords=("publisher", "publishers", "and"))

            # Check each edition of the matched work against the supplied
            # publisher / year / identifier constraints.
            editions = web.ctx.site.get_many(
                ["/books/" + key for key in work.edition_key])
            for e in editions:
                d = {}  # NOTE(review): unused; candidate for removal
                if publisher:
                    if not e.publishers or e.publishers[0] != publisher:
                        continue
                if publish_year:
                    if not e.publish_date or publish_year != self.extract_year(
                            e.publish_date):
                        continue
                if id_value and id_name in mapping:
                    if not id_name in e or id_value not in e[id_name]:
                        continue
                # return the first good likely matching Edition
                return e