Exemple #1
0
def make_index(dump_file):
    """Make index with "path", "title", "created" and "last_modified" columns.

    Iterates over the rows produced by ``read_tsv(dump_file)`` and prints one
    tab-separated line per row: path, title, the record's "created" value
    (or "-" when absent) and the row's timestamp. Progress is reported
    through ``log`` before and after the run.
    """
    log(f"make_index({dump_file})")
    start_time = datetime.now()
    # Stays 0 when the dump yields no rows, so the summary log below never
    # touches an unbound loop variable.
    count = 0
    # enumerate() yields (index, row) pairs, so the row has to be unpacked in
    # its own nested tuple. Counting from 1 makes `count` the number of rows.
    for count, (doc_type, key, _revision, timestamp, json_data) in enumerate(
            read_tsv(dump_file), start=1):
        data = json.loads(json_data)
        if doc_type in ("/type/edition", "/type/work"):
            title = data.get("title", "untitled")
            path = key + "/" + urlsafe(title)
        elif doc_type == "/type/author":
            title = data.get("name", "unnamed")
            path = key + "/" + urlsafe(title)
        else:
            title = data.get("title", key)
            path = key

        # A tab inside the title would break the tab-separated output.
        title = title.replace("\t", " ")

        if "created" in data:
            created = data["created"]["value"]
        else:
            created = "-"
        print("\t".join(
            [web.safestr(path),
             web.safestr(title), created, timestamp]))
    # total_seconds() includes whole days; timedelta.seconds would drop them.
    minutes = int((datetime.now() - start_time).total_seconds() // 60)
    log(f"make_index() processed {count:,} records in {minutes:,} minutes.")
Exemple #2
0
def make_index(dump_file):
    """Print an index with "path", "title", "created" and "last_modified" columns.

    One tab-separated line is written to stdout for every row returned by
    ``read_tsv(dump_file)``.
    """

    from openlibrary.plugins.openlibrary.processors import urlsafe

    for doc_type, key, _revision, timestamp, raw_json in read_tsv(dump_file):
        data = simplejson.loads(raw_json)

        if doc_type in ('/type/edition', '/type/work'):
            title = data.get('title', 'untitled')
            has_slug = True
        elif doc_type == '/type/author':
            title = data.get('name', 'unnamed')
            has_slug = True
        else:
            title = data.get('title', key)
            has_slug = False

        # The slug is derived from the title before tabs are replaced.
        path = key + '/' + urlsafe(title) if has_slug else key
        title = title.replace("\t", " ")

        created = data['created']['value'] if 'created' in data else "-"
        columns = [web.safestr(path), web.safestr(title), created, timestamp]
        print("\t".join(columns))
Exemple #3
0
def get_url(doc):
    """Return the absolute URL of *doc*.

    Keys under /books/ and /works/ get a url-safe title appended, keys under
    /authors/ get a url-safe name appended, and every other key is used as is.
    """
    site_root = web.ctx.get("home", "http://openlibrary.org")
    key = doc['key']
    if key.startswith(("/books/", "/works/")):
        slug = urlsafe(doc.get("title", "untitled"))
    elif key.startswith("/authors/"):
        slug = urlsafe(doc.get("name", "unnamed"))
    else:
        return site_root + key
    return site_root + key + "/" + slug
Exemple #4
0
def get_url(doc):
    """Build the full URL of a document from its key.

    The site root comes from ``web.ctx`` ("home"), falling back to
    "http://openlibrary.org". Books, works and authors additionally get a
    url-safe title or name segment.
    """
    home = web.ctx.get("home", "http://openlibrary.org")
    url = home + doc['key']

    is_book_or_work = (
        doc['key'].startswith("/books/") or doc['key'].startswith("/works/")
    )
    if is_book_or_work:
        return url + "/" + urlsafe(doc.get("title", "untitled"))
    if doc['key'].startswith("/authors/"):
        return url + "/" + urlsafe(doc.get("name", "unnamed"))
    return url
Exemple #5
0
def get_doc(doc): # called from work_search template
    """Convert one search-result element into a ``web.storage`` work record.

    ``doc`` is queried with ``find()`` and XPath-like expressions such as
    ``str[@name='title']`` (presumably one ``<doc>`` element of a Solr XML
    response -- confirm against the caller).

    The "key", "title", "edition_count" and "has_fulltext" children are
    required; a missing one raises AttributeError. All other fields are
    optional and fall back to None, an empty list or an empty set.

    Returns the record with ``url`` and a boolean ``checked_out`` added.
    """
    def text_of(xpath):
        # Text of the first matching child, or None when there is no match.
        element = doc.find(xpath)
        return element.text if element is not None else None

    e_ia = doc.find("arr[@name='ia']")
    first_pub = text_of("int[@name='first_publish_year']")
    first_edition = text_of("str[@name='first_edition']")
    work_subtitle = text_of("str[@name='subtitle']")

    e_author_keys = doc.find("arr[@name='author_key']")
    if e_author_keys is None:
        assert doc.find("arr[@name='author_name']") is None
        authors = []
    else:
        ak = [e.text for e in e_author_keys]
        an = [e.text for e in doc.find("arr[@name='author_name']")]
        authors = [
            web.storage(
                key=key,
                name=name,
                # An author element without text gets a placeholder slug.
                url="/authors/%s/%s" % (key, (urlsafe(name) if name is not None else 'noname')),
            )
            for key, name in zip(ak, an)
        ]

    e_public_scan = doc.find("bool[@name='public_scan_b']")
    e_overdrive = doc.find("str[@name='overdrive_s']")
    e_collection = doc.find("str[@name='ia_collection_s']")
    collections = set()
    if e_collection is not None:
        collections = set(e_collection.text.split(';'))

    work = web.storage(
        key = doc.find("str[@name='key']").text,
        title = doc.find("str[@name='title']").text,
        edition_count = int(doc.find("int[@name='edition_count']").text),
        ia = [e.text for e in (e_ia if e_ia is not None else [])],
        has_fulltext = (doc.find("bool[@name='has_fulltext']").text == 'true'),
        # Without an explicit flag, the presence of an "ia" array decides.
        public_scan = ((e_public_scan.text == 'true') if e_public_scan is not None else (e_ia is not None)),
        overdrive = (e_overdrive.text.split(';') if e_overdrive is not None else []),
        lending_edition = text_of("str[@name='lending_edition_s']"),
        collections = collections,
        authors = authors,
        first_publish_year = first_pub,
        first_edition = first_edition,
        subtitle = work_subtitle,
        cover_edition_key = text_of("str[@name='cover_edition_key']"),
    )

    work.url = '/works/' + work.key + '/' + urlsafe(work.title)

    if not work.public_scan and work.lending_edition:
        store_doc = web.ctx.site.store.get("ebooks/books/" + work.lending_edition) or {}
        work.checked_out = store_doc.get("borrowed") == "true"
    else:
        # Must be the boolean False: the string "false" is truthy and would
        # disagree with the boolean produced by the branch above.
        work.checked_out = False
    return work
Exemple #6
0
def find_path(key, type, json):
    """Return the index path for a record.

    Editions and works map to ``key + '/' + urlsafe(title)`` and authors to
    ``key + '/' + urlsafe(name)``, with the title or name read from the
    JSON document in *json*. Any other type maps to *key* unchanged, and
    *json* is not parsed at all in that case.
    """
    if type in ('/type/edition', '/type/work'):
        field, fallback = 'title', 'untitled'
    elif type == '/type/author':
        field, fallback = 'name', 'unnamed'
    else:
        # The original referenced an undefined `doc` here and in the author
        # check; the record's type and key are the function's own arguments.
        return key
    data = simplejson.loads(json)
    return key + '/' + urlsafe(data.get(field, fallback))
Exemple #7
0
def find_path(key, type, json):
    """Compute the path of a record from its key, type and JSON text.

    - ``/type/edition`` and ``/type/work``: key plus the url-safe title
      ("untitled" when the document has no title).
    - ``/type/author``: key plus the url-safe name ("unnamed" when missing).
    - anything else: the key itself; *json* is left unparsed.
    """
    if type in ['/type/edition', '/type/work']:
        data = simplejson.loads(json)
        return key + '/' + urlsafe(data.get('title', 'untitled'))
    # Compare the `type` argument: there is no `doc` object in this scope,
    # so `doc.type` raised NameError for every non-edition/work record.
    elif type == '/type/author':
        data = simplejson.loads(json)
        return key + '/' + urlsafe(data.get('name', 'unnamed'))
    else:
        # Likewise return the `key` argument rather than `doc.key`.
        return key
Exemple #8
0
def get_url(doc: Mapping[str, str]) -> str:
    """Return the absolute URL for *doc* based on its "key".

    The site root is taken from ``web.ctx`` ("home"); when that is missing
    or equals 'http://[unknown]', "https://openlibrary.org" is used. Book
    and work keys get a url-safe title appended, author keys a url-safe name.
    """
    home = web.ctx.get("home", "https://openlibrary.org")
    base = "https://openlibrary.org" if home == 'http://[unknown]' else home

    key = doc['key']
    if key.startswith(("/books/", "/works/")):
        suffix = "/" + urlsafe(doc.get("title", "untitled"))
    elif key.startswith("/authors/"):
        suffix = "/" + urlsafe(doc.get("name", "unnamed"))
    else:
        suffix = ""
    return base + key + suffix
Exemple #9
0
def work_object(w): # called by works_by_author
    """Build a ``web.storage`` work record from the search result mapping *w*.

    The work key is used as is when the "single_core_solr" config flag is
    set and is prefixed with '/works/' otherwise. ``checked_out`` is looked
    up in the site store when the work has a lending identifier.
    """
    ia = w.get('ia', [])
    key = w['key'] if config.get("single_core_solr") else '/works/' + w['key']

    obj = {
        'authors': [
            web.storage(key='/authors/' + author_key, name=author_name)
            for author_key, author_name in zip(w['author_key'], w['author_name'])
        ],
        'edition_count': w['edition_count'],
        'key': key,
        'title': w['title'],
        'public_scan': w.get('public_scan_b', bool(ia)),
        'lending_edition': w.get('lending_edition_s', ''),
        'lending_identifier': w.get('lending_identifier_s', ''),
        'overdrive': w['overdrive_s'].split(';') if 'overdrive_s' in w else [],
        'collections': set(w['ia_collection_s'].split(';') if 'ia_collection_s' in w else []),
        'url': key + '/' + urlsafe(w['title']),
        'cover_edition_key': w.get('cover_edition_key'),
        'first_publish_year': w['first_publish_year'] if 'first_publish_year' in w else None,
        'ia': w.get('ia', []),
        'cover_i': w.get('cover_i'),
    }

    checked_out = False
    lending_identifier = obj['lending_identifier']
    if lending_identifier:
        ebook = web.ctx.site.store.get("ebooks/" + lending_identifier) or {}
        checked_out = ebook.get("borrowed") == "true"
    obj['checked_out'] = checked_out

    # Optional fields are copied only when they hold a truthy value.
    obj.update({f: w[f] for f in ('has_fulltext', 'subtitle') if w.get(f)})
    return web.storage(obj)
Exemple #10
0
def work_object(w):  # called by works_by_author
    """Turn the search result mapping *w* into a ``web.storage`` work record.

    Required entries of *w*: 'author_key', 'author_name', 'edition_count',
    'key' and 'title'. Everything else is optional.
    """
    ia = w.get('ia', [])

    authors = []
    for author_key, author_name in zip(w['author_key'], w['author_name']):
        authors.append(web.storage(key='/authors/' + author_key, name=author_name))

    obj = {
        'authors': authors,
        'edition_count': w['edition_count'],
        'key': w['key'],
        'title': w['title'],
        'public_scan': w.get('public_scan_b', bool(ia)),
        'lending_edition': w.get('lending_edition_s', ''),
        'lending_identifier': w.get('lending_identifier_s', ''),
    }

    if 'ia_collection_s' in w:
        obj['collections'] = set(w['ia_collection_s'].split(';'))
    else:
        obj['collections'] = set()

    obj['url'] = w['key'] + '/' + urlsafe(w['title'])
    obj['cover_edition_key'] = w.get('cover_edition_key')
    obj['first_publish_year'] = (
        w['first_publish_year'] if 'first_publish_year' in w else None
    )
    obj['ia'] = w.get('ia', [])
    obj['cover_i'] = w.get('cover_i')

    # External identifier fields are passed through; None when absent.
    for id_field in (
        'id_project_gutenberg',
        'id_librivox',
        'id_standard_ebooks',
        'id_openstax',
    ):
        obj[id_field] = w.get(id_field)

    # Optional fields are copied only when they hold a truthy value.
    for optional in ('has_fulltext', 'subtitle'):
        if w.get(optional):
            obj[optional] = w[optional]
    return web.storage(obj)
Exemple #11
0
def work_object(w):  # called by works_by_author
    """Create a ``web.storage`` work record from the search result mapping *w*.

    Besides the copied fields, ``checked_out`` is set from the site store's
    "ebooks/<lending identifier>" document when a lending identifier exists,
    and to False otherwise.
    """
    ia = w.get('ia', [])
    author_pairs = zip(w['author_key'], w['author_name'])
    obj = {
        'authors': [web.storage(key='/authors/' + k, name=n) for k, n in author_pairs],
        'edition_count': w['edition_count'],
        'key': w['key'],
        'title': w['title'],
        'public_scan': w.get('public_scan_b', bool(ia)),
        'lending_edition': w.get('lending_edition_s', ''),
        'lending_identifier': w.get('lending_identifier_s', ''),
        'collections': (
            set(w['ia_collection_s'].split(';')) if 'ia_collection_s' in w else set()
        ),
        'url': w['key'] + '/' + urlsafe(w['title']),
        'cover_edition_key': w.get('cover_edition_key'),
        'first_publish_year': (
            w['first_publish_year'] if 'first_publish_year' in w else None
        ),
        'ia': w.get('ia', []),
        'cover_i': w.get('cover_i'),
    }

    if not obj['lending_identifier']:
        obj['checked_out'] = False
    else:
        store_key = "ebooks/" + obj['lending_identifier']
        ebook = web.ctx.site.store.get(store_key) or {}
        obj['checked_out'] = ebook.get("borrowed") == "true"

    # Optional fields are copied only when they hold a truthy value.
    for optional in ('has_fulltext', 'subtitle'):
        if w.get(optional):
            obj[optional] = w[optional]
    return web.storage(obj)
Exemple #12
0
def make_index(dump_file):
    """Write an index with "path", "title", "created" and "last_modified" columns.

    Every row from ``read_tsv(dump_file)`` becomes one tab-separated line on
    stdout.
    """

    for doc_type, key, _revision, timestamp, json_data in read_tsv(dump_file):
        data = json.loads(json_data)

        is_book = doc_type in ("/type/edition", "/type/work")
        is_author = not is_book and doc_type == "/type/author"

        if is_book:
            title = data.get("title", "untitled")
        elif is_author:
            title = data.get("name", "unnamed")
        else:
            title = data.get("title", key)

        # Only editions, works and authors get a slug appended to their key.
        path = key + "/" + urlsafe(title) if (is_book or is_author) else key

        title = title.replace("\t", " ")
        created = data["created"]["value"] if "created" in data else "-"

        row = (web.safestr(path), web.safestr(title), created, timestamp)
        print("\t".join(row))
Exemple #13
0
def make_index(dump_file):
    """Emit an index with "path", "title", "created" and "last_modified" columns.

    Rows are read with ``read_tsv(dump_file)`` and printed tab-separated.
    """

    from openlibrary.plugins.openlibrary.processors import urlsafe

    def path_and_title(doc_type, key, data):
        # Editions, works and authors get "<key>/<slug>"; others keep the key.
        if doc_type in ('/type/edition', '/type/work'):
            title = data.get('title', 'untitled')
            return key + '/' + urlsafe(title), title
        if doc_type == '/type/author':
            title = data.get('name', 'unnamed')
            return key + '/' + urlsafe(title), title
        return key, data.get('title', key)

    for doc_type, key, _revision, timestamp, raw_json in read_tsv(dump_file):
        data = simplejson.loads(raw_json)
        path, title = path_and_title(doc_type, key, data)
        title = title.replace("\t", " ")

        created = "-"
        if 'created' in data:
            created = data['created']['value']

        print("\t".join([web.safestr(path), web.safestr(title), created, timestamp]))
Exemple #14
0
def work_object(w):
    """Build a ``web.storage`` work record from the search result mapping *w*.

    The work key and URL are prefixed with '/works/'; the URL also carries a
    url-safe form of the title.
    """
    obj = {}
    obj['authors'] = [
        web.storage(key='/authors/' + author_key, name=author_name)
        for author_key, author_name in zip(w['author_key'], w['author_name'])
    ]
    obj['edition_count'] = w['edition_count']
    obj['key'] = '/works/' + w['key']
    obj['title'] = w['title']
    obj['url'] = obj['key'] + '/' + urlsafe(obj['title'])
    obj['cover_edition_key'] = w.get('cover_edition_key')
    obj['first_publish_year'] = (
        w['first_publish_year'] if 'first_publish_year' in w else None
    )
    obj['ia'] = w.get('ia', [])

    # Optional fields are copied only when they hold a truthy value.
    for optional in ('has_fulltext', 'subtitle'):
        if w.get(optional):
            obj[optional] = w[optional]
    return web.storage(obj)
Exemple #15
0
def make_index(dump_file):
    """Make index with "path", "title", "created" and "last_modified" columns.

    For every row returned by ``read_tsv(dump_file)`` one tab-separated line
    is printed: path, title, the record's "created" value (or "-" when the
    record has none) and the row's timestamp.
    """

    from openlibrary.plugins.openlibrary.processors import urlsafe

    for type, key, revision, timestamp, json in read_tsv(dump_file):
        data = simplejson.loads(json)
        if type in ("/type/edition", "/type/work"):
            title = data.get("title", "untitled")
            path = key + "/" + urlsafe(title)
        elif type == "/type/author":
            title = data.get("name", "unnamed")
            path = key + "/" + urlsafe(title)
        else:
            title = data.get("title", key)
            path = key

        # A tab inside the title would break the tab-separated output.
        title = title.replace("\t", " ")

        if "created" in data:
            created = data["created"]["value"]
        else:
            created = "-"
        # Call form with a single argument: the Python 2 print statement is
        # a SyntaxError on Python 3, while this prints the same text on both.
        print("\t".join([web.safestr(path), web.safestr(title), created, timestamp]))
Exemple #16
0
def work_object(w):
    """Convert the search result mapping *w* into a ``web.storage`` work record.

    ``public_scan`` defaults to whether *w* lists any "ia" identifiers when
    no explicit 'public_scan_b' value is present.
    """
    ia = w.get('ia', [])

    authors = [
        web.storage(key='/authors/' + author_key, name=author_name)
        for author_key, author_name in zip(w['author_key'], w['author_name'])
    ]
    edition_count = w['edition_count']
    work_key = '/works/' + w['key']
    title = w['title']
    public_scan = w.get('public_scan_b', bool(ia))
    lending_edition = w.get('lending_edition_s', '')

    overdrive = []
    if 'overdrive_s' in w:
        overdrive = w['overdrive_s'].split(';')

    collections = set()
    if 'ia_collection_s' in w:
        collections = set(w['ia_collection_s'].split(';'))

    obj = {
        'authors': authors,
        'edition_count': edition_count,
        'key': work_key,
        'title': title,
        'public_scan': public_scan,
        'lending_edition': lending_edition,
        'overdrive': overdrive,
        'collections': collections,
        'url': work_key + '/' + urlsafe(title),
        'cover_edition_key': w.get('cover_edition_key'),
        'first_publish_year': (
            w['first_publish_year'] if 'first_publish_year' in w else None
        ),
        'ia': w.get('ia', []),
    }

    # Optional fields are copied only when they hold a truthy value.
    obj.update({f: w[f] for f in ('has_fulltext', 'subtitle') if w.get(f)})
    return web.storage(obj)
Exemple #17
0
def get_doc(doc):
    """Convert one search-result element into a ``web.storage`` work record.

    ``doc`` is queried with ``find()``. The "key", "title", "edition_count"
    and "has_fulltext" children must exist; the remaining fields default to
    None or an empty list. The returned record also carries a ``url`` of the
    form '/works/<key>/<url-safe title>'.
    """
    def optional_text(xpath):
        # Text of the first matching child, or None when nothing matches.
        node = doc.find(xpath)
        return None if node is None else node.text

    e_ia = doc.find("arr[@name='ia']")
    first_pub = optional_text("int[@name='first_publish_year']")
    first_edition = optional_text("str[@name='first_edition']")
    work_subtitle = optional_text("str[@name='subtitle']")

    authors = []
    if doc.find("arr[@name='author_key']") is None:
        assert doc.find("arr[@name='author_name']") is None
    else:
        author_keys = [e.text for e in doc.find("arr[@name='author_key']")]
        author_names = [e.text for e in doc.find("arr[@name='author_name']")]
        for author_key, author_name in zip(author_keys, author_names):
            display_name = tidy_name(author_name)
            # An author element without text gets a placeholder slug.
            slug = 'noname' if author_name is None else urlsafe(author_name)
            authors.append(
                web.storage(
                    key=author_key,
                    name=display_name,
                    url="/authors/%s/%s" % (author_key, slug),
                )
            )

    record = web.storage(
        key=doc.find("str[@name='key']").text,
        title=doc.find("str[@name='title']").text,
        edition_count=int(doc.find("int[@name='edition_count']").text),
        ia=[] if e_ia is None else [e.text for e in e_ia],
        has_fulltext=(doc.find("bool[@name='has_fulltext']").text == 'true'),
        authors=authors,
        first_publish_year=first_pub,
        first_edition=first_edition,
        subtitle=work_subtitle,
        cover_edition_key=optional_text("str[@name='cover_edition_key']"),
    )
    record.url = '/works/' + record.key + '/' + urlsafe(record.title)
    return record
Exemple #18
0
def work_object(w):  # called by works_by_author
    """Assemble a ``web.storage`` work record from the search result mapping *w*.

    Key and URL are prefixed with '/works/'. 'overdrive_s' and
    'ia_collection_s' are split on ';' when present.
    """
    ia = w.get('ia', [])

    def split_field(name):
        # Values of a ';'-separated field, or an empty list when it is absent.
        return w[name].split(';') if name in w else []

    obj = {
        'authors': [
            web.storage(key='/authors/' + author_key, name=author_name)
            for author_key, author_name in zip(w['author_key'], w['author_name'])
        ],
        'edition_count': w['edition_count'],
        'key': '/works/' + w['key'],
        'title': w['title'],
        'public_scan': w.get('public_scan_b', bool(ia)),
        'lending_edition': w.get('lending_edition_s', ''),
        'overdrive': split_field('overdrive_s'),
        'collections': set(split_field('ia_collection_s')),
        'url': '/works/' + w['key'] + '/' + urlsafe(w['title']),
        'cover_edition_key': w.get('cover_edition_key'),
        'first_publish_year': (
            w['first_publish_year'] if 'first_publish_year' in w else None
        ),
        'ia': w.get('ia', []),
    }

    # Optional fields are copied only when they hold a truthy value.
    for optional in ('has_fulltext', 'subtitle'):
        if w.get(optional):
            obj[optional] = w[optional]
    return web.storage(obj)
Exemple #19
0
def get_doc(doc):  # called from work_search template
    """Convert one search-result element into a ``web.storage`` work record.

    ``doc`` is queried with ``find()`` and expressions such as
    ``str[@name='title']`` (presumably an ElementTree/lxml element of a Solr
    XML response -- confirm against the caller).

    The "key", "title", "edition_count" and "has_fulltext" children are
    required; a missing one raises AttributeError. Optional fields fall back
    to None, an empty list or an empty set. ``languages`` is None when the
    element has no "language" array.

    Returns the record with ``url`` set to '<key>/<url-safe title>'.
    """
    def text_of(xpath):
        # Text of the first matching child, or None when there is no match.
        element = doc.find(xpath)
        return element.text if element is not None else None

    def texts_of(xpath):
        # Texts of the children of the first match; [] when there is none.
        # Uses an explicit None check: truth-testing an element is
        # deprecated and reports a childless element as false.
        element = doc.find(xpath)
        return [e.text for e in element] if element is not None else []

    first_pub = text_of("int[@name='first_publish_year']")
    first_edition = text_of("str[@name='first_edition']")
    work_subtitle = text_of("str[@name='subtitle']")

    e_author_keys = doc.find("arr[@name='author_key']")
    if e_author_keys is None:
        assert doc.find("arr[@name='author_name']") is None
        authors = []
    else:
        ak = [e.text for e in e_author_keys]
        an = [e.text for e in doc.find("arr[@name='author_name']")]
        authors = [
            web.storage(
                key=key,
                name=name,
                # An author element without text gets a placeholder slug.
                url="/authors/{}/{}".format(
                    key, (urlsafe(name) if name is not None else 'noname')
                ),
            )
            for key, name in zip(ak, an)
        ]

    e_ia = doc.find("arr[@name='ia']")
    languages = doc.find("arr[@name='language']")
    e_public_scan = doc.find("bool[@name='public_scan_b']")
    e_collection = doc.find("str[@name='ia_collection_s']")
    collections = set()
    if e_collection is not None:
        collections = set(e_collection.text.split(';'))

    work = web.storage(
        key=doc.find("str[@name='key']").text,
        title=doc.find("str[@name='title']").text,
        edition_count=int(doc.find("int[@name='edition_count']").text),
        ia=[e.text for e in e_ia] if e_ia is not None else [],
        has_fulltext=(doc.find("bool[@name='has_fulltext']").text == 'true'),
        # Without an explicit flag, the presence of an "ia" array decides.
        public_scan=(
            (e_public_scan.text == 'true')
            if e_public_scan is not None
            else (e_ia is not None)
        ),
        lending_edition=text_of("str[@name='lending_edition_s']"),
        lending_identifier=text_of("str[@name='lending_identifier_s']"),
        collections=collections,
        authors=authors,
        first_publish_year=first_pub,
        first_edition=first_edition,
        subtitle=work_subtitle,
        cover_edition_key=text_of("str[@name='cover_edition_key']"),
        languages=(
            [lang.text for lang in languages] if languages is not None else None
        ),
        id_project_gutenberg=texts_of("arr[@name='id_project_gutenberg']"),
        id_librivox=texts_of("arr[@name='id_librivox']"),
        id_standard_ebooks=texts_of("arr[@name='id_standard_ebooks']"),
    )

    work.url = work.key + '/' + urlsafe(work.title)
    return work