def make_index(dump_file):
    """Make index with "path", "title", "created" and "last_modified" columns.

    Reads rows from the dump TSV, derives a human-readable path per record
    type, and prints one tab-separated row per record to stdout.  Logs a
    start message and a final processed-count summary.
    """
    log(f"make_index({dump_file})")
    start_time = datetime.now()
    count = 0  # stays defined even when the dump yields no records
    # Bug fix: enumerate() yields (index, row) pairs, so the row tuple must
    # be unpacked with parentheses — the original flat 6-name unpacking
    # raised ValueError on the first record.  Starting enumerate at 1 makes
    # the final count the number of records processed, not the last index.
    for count, (type, key, revision, timestamp, json_data) in enumerate(
            read_tsv(dump_file), 1):
        data = json.loads(json_data)
        if type in ("/type/edition", "/type/work"):
            title = data.get("title", "untitled")
            path = key + "/" + urlsafe(title)
        elif type == "/type/author":
            title = data.get("name", "unnamed")
            path = key + "/" + urlsafe(title)
        else:
            title = data.get("title", key)
            path = key
        # Tabs inside the title would corrupt the TSV columns.
        title = title.replace("\t", " ")
        if "created" in data:
            created = data["created"]["value"]
        else:
            created = "-"
        print("\t".join(
            [web.safestr(path), web.safestr(title), created, timestamp]))
    minutes = (datetime.now() - start_time).seconds // 60
    log(f"make_index() processed {count:,} records in {minutes:,} minutes.")
def make_index(dump_file):
    """Make index with "path", "title", "created" and "last_modified" columns."""
    from openlibrary.plugins.openlibrary.processors import urlsafe

    for type, key, revision, timestamp, json in read_tsv(dump_file):
        data = simplejson.loads(json)
        if type in ('/type/edition', '/type/work'):
            title = data.get('title', 'untitled')
            path = key + '/' + urlsafe(title)
        elif type == '/type/author':
            title = data.get('name', 'unnamed')
            path = key + '/' + urlsafe(title)
        else:
            title = data.get('title', key)
            path = key
        # Tabs inside the title would break the TSV column boundaries.
        title = title.replace("\t", " ")
        created = data['created']['value'] if 'created' in data else "-"
        row = [web.safestr(path), web.safestr(title), created, timestamp]
        print("\t".join(row))
def get_url(doc):
    """Return the absolute URL for a books/works/authors document dict."""
    base = web.ctx.get("home", "http://openlibrary.org")
    key = doc['key']
    if key.startswith(("/books/", "/works/")):
        return base + key + "/" + urlsafe(doc.get("title", "untitled"))
    if key.startswith("/authors/"):
        return base + key + "/" + urlsafe(doc.get("name", "unnamed"))
    return base + key
def get_url(doc):
    """Return the absolute URL for a books/works/authors document dict."""
    base = web.ctx.get("home", "http://openlibrary.org")
    key = doc['key']
    if key.startswith("/books/") or key.startswith("/works/"):
        slug = urlsafe(doc.get("title", "untitled"))
    elif key.startswith("/authors/"):
        slug = urlsafe(doc.get("name", "unnamed"))
    else:
        return base + key
    return base + key + "/" + slug
def get_doc(doc):  # called from work_search template
    """Convert a Solr XML result ``<doc>`` element into a web.storage record.

    Child elements are looked up by their typed tag and ``name`` attribute
    (str/int/bool/arr); optional fields fall back to None / empty values.
    Returns a web.storage with a computed ``url`` and a ``checked_out`` flag.
    """
    e_ia = doc.find("arr[@name='ia']")
    # first_publish_year is optional.
    first_pub = None
    e_first_pub = doc.find("int[@name='first_publish_year']")
    if e_first_pub is not None:
        first_pub = e_first_pub.text
    # first_edition is optional.
    e_first_edition = doc.find("str[@name='first_edition']")
    first_edition = None
    if e_first_edition is not None:
        first_edition = e_first_edition.text
    # subtitle is optional.
    work_subtitle = None
    e_subtitle = doc.find("str[@name='subtitle']")
    if e_subtitle is not None:
        work_subtitle = e_subtitle.text
    if doc.find("arr[@name='author_key']") is None:
        # author_key and author_name are expected to be present together.
        assert doc.find("arr[@name='author_name']") is None
        authors = []
    else:
        # Pair up the parallel author key/name arrays.
        ak = [e.text for e in doc.find("arr[@name='author_key']")]
        an = [e.text for e in doc.find("arr[@name='author_name']")]
        authors = [web.storage(key=key, name=name,
                               url="/authors/%s/%s" % (
                                   key,
                                   (urlsafe(name) if name is not None else 'noname')))
                   for key, name in zip(ak, an)]
    cover = doc.find("str[@name='cover_edition_key']")
    e_public_scan = doc.find("bool[@name='public_scan_b']")
    e_overdrive = doc.find("str[@name='overdrive_s']")
    e_lending_edition = doc.find("str[@name='lending_edition_s']")
    e_collection = doc.find("str[@name='ia_collection_s']")
    # ia_collection_s is a single semicolon-separated string.
    collections = set()
    if e_collection is not None:
        collections = set(e_collection.text.split(';'))
    # Rebind `doc` from the XML element to the result record.
    doc = web.storage(
        key=doc.find("str[@name='key']").text,
        title=doc.find("str[@name='title']").text,
        edition_count=int(doc.find("int[@name='edition_count']").text),
        ia=[e.text for e in (e_ia if e_ia is not None else [])],
        has_fulltext=(doc.find("bool[@name='has_fulltext']").text == 'true'),
        # When public_scan_b is absent, fall back to whether an ia array exists.
        public_scan=((e_public_scan.text == 'true') if e_public_scan is not None
                     else (e_ia is not None)),
        overdrive=(e_overdrive.text.split(';') if e_overdrive is not None else []),
        lending_edition=(e_lending_edition.text if e_lending_edition is not None
                         else None),
        collections=collections,
        authors=authors,
        first_publish_year=first_pub,
        first_edition=first_edition,
        subtitle=work_subtitle,
        cover_edition_key=(cover.text if cover is not None else None),
    )
    doc.url = '/works/' + doc.key + '/' + urlsafe(doc.title)
    if not doc.public_scan and doc.lending_edition:
        store_doc = web.ctx.site.store.get("ebooks/books/" + doc.lending_edition) or {}
        doc.checked_out = store_doc.get("borrowed") == "true"
    else:
        # NOTE(review): this branch sets the string "false" while the other
        # sets a boolean — looks inconsistent; confirm what templates expect.
        doc.checked_out = "false"
    return doc
def find_path(key, type, json):
    """Return the site path for a record.

    Editions/works get ``<key>/<urlsafe title>``, authors get
    ``<key>/<urlsafe name>``, everything else just the key.

    Bug fix: the author and fallback branches referenced an undefined
    ``doc`` variable (``doc.type`` / ``doc.key``), raising NameError at
    runtime; they now use the ``type`` and ``key`` parameters.
    """
    if type in ['/type/edition', '/type/work']:
        data = simplejson.loads(json)
        return key + '/' + urlsafe(data.get('title', 'untitled'))
    elif type == '/type/author':
        data = simplejson.loads(json)
        return key + '/' + urlsafe(data.get('name', 'unnamed'))
    else:
        return key
def find_path(key, type, json):
    """Return the site path for a record.

    Editions/works get ``<key>/<urlsafe title>``, authors get
    ``<key>/<urlsafe name>``, everything else just the key.

    Bug fix: the author and fallback branches referenced an undefined
    ``doc`` variable (``doc.type`` / ``doc.key``), raising NameError at
    runtime; they now use the ``type`` and ``key`` parameters.
    """
    if type in ['/type/edition', '/type/work']:
        data = simplejson.loads(json)
        return key + '/' + urlsafe(data.get('title', 'untitled'))
    elif type == '/type/author':
        data = simplejson.loads(json)
        return key + '/' + urlsafe(data.get('name', 'unnamed'))
    else:
        return key
def get_url(doc: Mapping[str, str]) -> str:
    """Return the absolute URL for a books/works/authors document mapping."""
    base = web.ctx.get("home", "https://openlibrary.org")
    if base == 'http://[unknown]':
        # Replace the placeholder host with the canonical one.
        base = "https://openlibrary.org"
    key = doc['key']
    if key.startswith("/books/") or key.startswith("/works/"):
        return base + key + "/" + urlsafe(doc.get("title", "untitled"))
    if key.startswith("/authors/"):
        return base + key + "/" + urlsafe(doc.get("name", "unnamed"))
    return base + key
def work_object(w):  # called by works_by_author
    """Build a web.storage summary of a work from a Solr result dict."""
    ia = w.get('ia', [])
    # Single-core solr keys already include the /works/ prefix.
    if config.get("single_core_solr"):
        key = w['key']
    else:
        key = '/works/' + w['key']
    authors = [
        web.storage(key='/authors/' + k, name=n)
        for k, n in zip(w['author_key'], w['author_name'])
    ]
    obj = {
        'authors': authors,
        'edition_count': w['edition_count'],
        'key': key,
        'title': w['title'],
        'public_scan': w.get('public_scan_b', bool(ia)),
        'lending_edition': w.get('lending_edition_s', ''),
        'lending_identifier': w.get('lending_identifier_s', ''),
        'overdrive': w['overdrive_s'].split(';') if 'overdrive_s' in w else [],
        'collections': set(
            w['ia_collection_s'].split(';') if 'ia_collection_s' in w else []
        ),
        'url': key + '/' + urlsafe(w['title']),
        'cover_edition_key': w.get('cover_edition_key'),
        'first_publish_year': (
            w['first_publish_year'] if 'first_publish_year' in w else None
        ),
        'ia': w.get('ia', []),
        'cover_i': w.get('cover_i'),
    }
    obj['checked_out'] = False
    if obj['lending_identifier']:
        store_doc = web.ctx.site.store.get("ebooks/" + obj['lending_identifier']) or {}
        obj['checked_out'] = store_doc.get("borrowed") == "true"
    # Optional fields copied through only when truthy.
    for field in ('has_fulltext', 'subtitle'):
        if w.get(field):
            obj[field] = w[field]
    return web.storage(obj)
def work_object(w):  # called by works_by_author
    """Build a web.storage summary of a work from a Solr result dict."""
    scanned = w.get('ia', [])
    collections = (
        set(w['ia_collection_s'].split(';')) if 'ia_collection_s' in w else set()
    )
    summary = {
        'authors': [
            web.storage(key='/authors/' + k, name=n)
            for k, n in zip(w['author_key'], w['author_name'])
        ],
        'edition_count': w['edition_count'],
        'key': w['key'],
        'title': w['title'],
        'public_scan': w.get('public_scan_b', bool(scanned)),
        'lending_edition': w.get('lending_edition_s', ''),
        'lending_identifier': w.get('lending_identifier_s', ''),
        'collections': collections,
        'url': w['key'] + '/' + urlsafe(w['title']),
        'cover_edition_key': w.get('cover_edition_key'),
        'first_publish_year': w.get('first_publish_year'),
        'ia': w.get('ia', []),
        'cover_i': w.get('cover_i'),
        'id_project_gutenberg': w.get('id_project_gutenberg'),
        'id_librivox': w.get('id_librivox'),
        'id_standard_ebooks': w.get('id_standard_ebooks'),
        'id_openstax': w.get('id_openstax'),
    }
    # Optional fields copied through only when truthy.
    for field in ('has_fulltext', 'subtitle'):
        if w.get(field):
            summary[field] = w[field]
    return web.storage(summary)
def work_object(w):  # called by works_by_author
    """Build a web.storage summary of a work from a Solr result dict."""
    ia_ids = w.get('ia', [])
    obj = dict(
        authors=[
            web.storage(key='/authors/' + k, name=n)
            for k, n in zip(w['author_key'], w['author_name'])
        ],
        edition_count=w['edition_count'],
        key=w['key'],
        title=w['title'],
        public_scan=w.get('public_scan_b', bool(ia_ids)),
        lending_edition=w.get('lending_edition_s', ''),
        lending_identifier=w.get('lending_identifier_s', ''),
        collections=set(
            w['ia_collection_s'].split(';') if 'ia_collection_s' in w else []
        ),
        url=w['key'] + '/' + urlsafe(w['title']),
        cover_edition_key=w.get('cover_edition_key'),
        first_publish_year=w.get('first_publish_year'),
        ia=w.get('ia', []),
        cover_i=w.get('cover_i'),
    )
    lending_id = obj['lending_identifier']
    if lending_id:
        store_doc = web.ctx.site.store.get("ebooks/" + lending_id) or {}
        obj['checked_out'] = store_doc.get("borrowed") == "true"
    else:
        obj['checked_out'] = False
    # Optional fields copied through only when truthy.
    for field in ('has_fulltext', 'subtitle'):
        if w.get(field):
            obj[field] = w[field]
    return web.storage(obj)
def make_index(dump_file):
    """Make index with "path", "title", "created" and "last_modified" columns."""
    for type, key, revision, timestamp, json_data in read_tsv(dump_file):
        data = json.loads(json_data)
        if type == "/type/author":
            title = data.get("name", "unnamed")
            path = key + "/" + urlsafe(title)
        elif type in ("/type/edition", "/type/work"):
            title = data.get("title", "untitled")
            path = key + "/" + urlsafe(title)
        else:
            title = data.get("title", key)
            path = key
        # Tabs inside the title would break the TSV columns.
        title = title.replace("\t", " ")
        created = data["created"]["value"] if "created" in data else "-"
        print("\t".join([web.safestr(path), web.safestr(title), created, timestamp]))
def make_index(dump_file):
    """Make index with "path", "title", "created" and "last_modified" columns."""
    from openlibrary.plugins.openlibrary.processors import urlsafe

    for rec_type, rec_key, revision, timestamp, json_text in read_tsv(dump_file):
        record = simplejson.loads(json_text)
        if rec_type in ('/type/edition', '/type/work'):
            title = record.get('title', 'untitled')
            path = rec_key + '/' + urlsafe(title)
        elif rec_type == '/type/author':
            title = record.get('name', 'unnamed')
            path = rec_key + '/' + urlsafe(title)
        else:
            title = record.get('title', rec_key)
            path = rec_key
        # Tabs inside the title would break the TSV columns.
        title = title.replace("\t", " ")
        created = record['created']['value'] if 'created' in record else "-"
        print("\t".join([web.safestr(path), web.safestr(title), created, timestamp]))
def work_object(w):
    """Build a web.storage summary of a work from a Solr result dict."""
    work_key = '/works/' + w['key']
    obj = {
        'authors': [
            web.storage(key='/authors/' + k, name=n)
            for k, n in zip(w['author_key'], w['author_name'])
        ],
        'edition_count': w['edition_count'],
        'key': work_key,
        'title': w['title'],
        'url': work_key + '/' + urlsafe(w['title']),
        'cover_edition_key': w.get('cover_edition_key'),
        'first_publish_year': w.get('first_publish_year'),
        'ia': w.get('ia', []),
    }
    # Optional fields copied through only when truthy.
    for field in ('has_fulltext', 'subtitle'):
        if w.get(field):
            obj[field] = w[field]
    return web.storage(obj)
def make_index(dump_file):
    """Make index with "path", "title", "created" and "last_modified" columns.

    Reads rows from the dump TSV, derives a human-readable path per record
    type, and prints one tab-separated row per record to stdout.
    """
    from openlibrary.plugins.openlibrary.processors import urlsafe

    for type, key, revision, timestamp, json in read_tsv(dump_file):
        data = simplejson.loads(json)
        if type == "/type/edition" or type == "/type/work":
            title = data.get("title", "untitled")
            path = key + "/" + urlsafe(title)
        elif type == "/type/author":
            title = data.get("name", "unnamed")
            path = key + "/" + urlsafe(title)
        else:
            title = data.get("title", key)
            path = key
        # Tabs inside the title would break the TSV columns.
        title = title.replace("\t", " ")
        if "created" in data:
            created = data["created"]["value"]
        else:
            created = "-"
        # Bug fix: the Python 2 `print` statement is a SyntaxError under
        # Python 3; use the print() function.
        print("\t".join([web.safestr(path), web.safestr(title), created, timestamp]))
def work_object(w):
    """Build a web.storage summary of a work from a Solr result dict."""
    ia_list = w.get('ia', [])
    work_key = '/works/' + w['key']
    obj = dict(
        authors=[web.storage(key='/authors/' + k, name=n)
                 for k, n in zip(w['author_key'], w['author_name'])],
        edition_count=w['edition_count'],
        key=work_key,
        title=w['title'],
        public_scan=w.get('public_scan_b', bool(ia_list)),
        lending_edition=w.get('lending_edition_s', ''),
        overdrive=w['overdrive_s'].split(';') if 'overdrive_s' in w else [],
        collections=set(
            w['ia_collection_s'].split(';') if 'ia_collection_s' in w else []
        ),
        url=work_key + '/' + urlsafe(w['title']),
        cover_edition_key=w.get('cover_edition_key'),
        first_publish_year=w.get('first_publish_year'),
        ia=w.get('ia', []),
    )
    # Optional fields copied through only when truthy.
    for name in ('has_fulltext', 'subtitle'):
        if w.get(name):
            obj[name] = w[name]
    return web.storage(obj)
def get_doc(doc):
    """Convert a Solr XML result ``<doc>`` element into a web.storage record.

    Child elements are looked up by their typed tag and ``name`` attribute
    (str/int/bool/arr); optional fields fall back to None / empty values.
    Returns a web.storage with a computed ``url``.
    """
    e_ia = doc.find("arr[@name='ia']")
    # first_publish_year is optional.
    first_pub = None
    e_first_pub = doc.find("int[@name='first_publish_year']")
    if e_first_pub is not None:
        first_pub = e_first_pub.text
    # first_edition is optional.
    e_first_edition = doc.find("str[@name='first_edition']")
    first_edition = None
    if e_first_edition is not None:
        first_edition = e_first_edition.text
    # subtitle is optional.
    work_subtitle = None
    e_subtitle = doc.find("str[@name='subtitle']")
    if e_subtitle is not None:
        work_subtitle = e_subtitle.text
    if doc.find("arr[@name='author_key']") is None:
        # author_key and author_name are expected to be present together.
        assert doc.find("arr[@name='author_name']") is None
        authors = []
    else:
        # Pair up the parallel author key/name arrays.
        ak = [e.text for e in doc.find("arr[@name='author_key']")]
        an = [e.text for e in doc.find("arr[@name='author_name']")]
        authors = [web.storage(key=key, name=tidy_name(name),
                               url="/authors/%s/%s" % (
                                   key,
                                   (urlsafe(name) if name is not None else 'noname')))
                   for key, name in zip(ak, an)]
    cover = doc.find("str[@name='cover_edition_key']")
    # Rebind `doc` from the XML element to the result record.
    doc = web.storage(
        key=doc.find("str[@name='key']").text,
        title=doc.find("str[@name='title']").text,
        edition_count=int(doc.find("int[@name='edition_count']").text),
        ia=[e.text for e in (e_ia if e_ia is not None else [])],
        has_fulltext=(doc.find("bool[@name='has_fulltext']").text == 'true'),
        authors=authors,
        first_publish_year=first_pub,
        first_edition=first_edition,
        subtitle=work_subtitle,
        cover_edition_key=(cover.text if cover is not None else None),
    )
    doc.url = '/works/' + doc.key + '/' + urlsafe(doc.title)
    return doc
def work_object(w):  # called by works_by_author
    """Build a web.storage summary of a work from a Solr result dict."""
    scans = w.get('ia', [])
    prefixed_key = '/works/' + w['key']
    obj = {}
    obj['authors'] = [
        web.storage(key='/authors/' + k, name=n)
        for k, n in zip(w['author_key'], w['author_name'])
    ]
    obj['edition_count'] = w['edition_count']
    obj['key'] = prefixed_key
    obj['title'] = w['title']
    obj['public_scan'] = w.get('public_scan_b', bool(scans))
    obj['lending_edition'] = w.get('lending_edition_s', '')
    obj['overdrive'] = w['overdrive_s'].split(';') if 'overdrive_s' in w else []
    obj['collections'] = set(
        w['ia_collection_s'].split(';') if 'ia_collection_s' in w else []
    )
    obj['url'] = prefixed_key + '/' + urlsafe(w['title'])
    obj['cover_edition_key'] = w.get('cover_edition_key')
    obj['first_publish_year'] = w.get('first_publish_year')
    obj['ia'] = w.get('ia', [])
    # Optional fields copied through only when truthy.
    for extra in ('has_fulltext', 'subtitle'):
        if w.get(extra):
            obj[extra] = w[extra]
    return web.storage(obj)
def get_doc(doc):  # called from work_search template
    """Convert a Solr XML result ``<doc>`` element into a web.storage record.

    Child elements are looked up by their typed tag and ``name`` attribute
    (str/int/bool/arr); optional fields fall back to None / empty values.
    Returns a web.storage with a computed ``url``.
    """
    e_ia = doc.find("arr[@name='ia']")
    # These arrays are optional; `or []` lets the later comprehensions
    # run on an empty sequence when the element is missing.
    e_id_project_gutenberg = doc.find("arr[@name='id_project_gutenberg']") or []
    e_id_librivox = doc.find("arr[@name='id_librivox']") or []
    e_id_standard_ebooks = doc.find("arr[@name='id_standard_ebooks']") or []
    # first_publish_year is optional.
    first_pub = None
    e_first_pub = doc.find("int[@name='first_publish_year']")
    if e_first_pub is not None:
        first_pub = e_first_pub.text
    # first_edition is optional.
    e_first_edition = doc.find("str[@name='first_edition']")
    first_edition = None
    if e_first_edition is not None:
        first_edition = e_first_edition.text
    # subtitle is optional.
    work_subtitle = None
    e_subtitle = doc.find("str[@name='subtitle']")
    if e_subtitle is not None:
        work_subtitle = e_subtitle.text
    if doc.find("arr[@name='author_key']") is None:
        # author_key and author_name are expected to be present together.
        assert doc.find("arr[@name='author_name']") is None
        authors = []
    else:
        # Pair up the parallel author key/name arrays.
        ak = [e.text for e in doc.find("arr[@name='author_key']")]
        an = [e.text for e in doc.find("arr[@name='author_name']")]
        authors = [
            web.storage(
                key=key,
                name=name,
                url="/authors/{}/{}".format(
                    key, (urlsafe(name) if name is not None else 'noname')
                ),
            )
            for key, name in zip(ak, an)
        ]
    cover = doc.find("str[@name='cover_edition_key']")
    languages = doc.find("arr[@name='language']")
    e_public_scan = doc.find("bool[@name='public_scan_b']")
    e_lending_edition = doc.find("str[@name='lending_edition_s']")
    e_lending_identifier = doc.find("str[@name='lending_identifier_s']")
    e_collection = doc.find("str[@name='ia_collection_s']")
    # ia_collection_s is a single semicolon-separated string.
    collections = set()
    if e_collection is not None:
        collections = set(e_collection.text.split(';'))
    # Rebind `doc` from the XML element to the result record.
    doc = web.storage(
        key=doc.find("str[@name='key']").text,
        title=doc.find("str[@name='title']").text,
        edition_count=int(doc.find("int[@name='edition_count']").text),
        ia=[e.text for e in (e_ia if e_ia is not None else [])],
        has_fulltext=(doc.find("bool[@name='has_fulltext']").text == 'true'),
        # When public_scan_b is absent, fall back to whether an ia array exists.
        public_scan=(
            (e_public_scan.text == 'true')
            if e_public_scan is not None
            else (e_ia is not None)
        ),
        lending_edition=(
            e_lending_edition.text if e_lending_edition is not None else None
        ),
        lending_identifier=(
            e_lending_identifier.text if e_lending_identifier is not None else None
        ),
        collections=collections,
        authors=authors,
        first_publish_year=first_pub,
        first_edition=first_edition,
        subtitle=work_subtitle,
        cover_edition_key=(cover.text if cover is not None else None),
        # `and` short-circuits: stays None when the language array is missing.
        languages=languages and [lang.text for lang in languages],
        id_project_gutenberg=[e.text for e in e_id_project_gutenberg],
        id_librivox=[e.text for e in e_id_librivox],
        id_standard_ebooks=[e.text for e in e_id_standard_ebooks],
    )
    doc.url = doc.key + '/' + urlsafe(doc.title)
    return doc