def main(
    keys: list[str],
    src="http://openlibrary.org/",
    dest="http://localhost:8080",
    comment="",
    recursive=True,
    editions=True,
    lists: list[str] = None,
    search: str = None,
    search_limit: int = 10,
):
    """
    Script to copy docs from one OL instance to another.
    Typically used to copy templates, macros, css and js from
    openlibrary.org to dev instance. paths can end with wildcards.

    USAGE:
        # Copy all templates
        ./scripts/copydocs.py --src http://openlibrary.org /templates/*

        # Copy specific records
        ./scripts/copydocs.py /authors/OL113592A /works/OL1098727W?v=2

        # Copy search results
        ./scripts/copydocs.py --search "publisher:librivox" --search-limit 10

    :param src: URL of the source open library server
    :param dest: URL of the destination open library server
    :param recursive: Recursively fetch all the referred docs
    :param editions: Also fetch all the editions of works
    :param lists: Copy docs from list(s)
    :param search: Run a search on open library and copy docs from the results
    """
    # Mypy doesn't handle union-ing types across if statements -_-
    # https://github.com/python/mypy/issues/6233
    src_ol: Union[Disk, OpenLibrary] = (
        OpenLibrary(src) if src.startswith("http://") else Disk(src))
    dest_ol: Union[Disk, OpenLibrary] = (
        OpenLibrary(dest) if dest.startswith("http://") else Disk(dest))

    if isinstance(dest_ol, OpenLibrary):
        section = "[%s]" % web.lstrips(dest, "http://").strip("/")
        if section in read_lines(os.path.expanduser("~/.olrc")):
            dest_ol.autologin()
        else:
            dest_ol.login("admin", "admin123")

    for list_key in (lists or []):
        copy_list(src_ol, dest_ol, list_key, comment=comment)

    if search:
        assert isinstance(src_ol, OpenLibrary), "Search only works with OL src"
        keys += [
            doc['key']
            for doc in src_ol.search(search, limit=search_limit, fields=['key'])['docs']
        ]

    keys = list(expand(src_ol, ('/' + k.lstrip('/') for k in keys)))

    copy(src_ol, dest_ol, keys, comment=comment,
         recursive=recursive, editions=editions)
def main():
    options, args = parse_args()

    if options.src.startswith("http://"):
        src = OpenLibrary(options.src)
    else:
        src = Disk(options.src)

    if options.dest.startswith("http://"):
        dest = OpenLibrary(options.dest)
        section = "[%s]" % web.lstrips(options.dest, "http://").strip("/")
        if section in read_lines(os.path.expanduser("~/.olrc")):
            dest.autologin()
        else:
            dest.login("admin", "admin123")
    else:
        dest = Disk(options.dest)

    for list_key in options.lists:
        copy_list(src, dest, list_key, comment=options.comment)

    keys = args
    keys = list(expand(src, keys))

    copy(src, dest, keys, comment=options.comment, recursive=options.recursive)
def load():
    """Loads documents to http://0.0.0.0:8080"""
    documents = {}
    for f in find("data"):
        doc = simplejson.load(open(f))
        documents[doc['key']] = doc

    keys = topological_sort(
        documents.keys(),
        get_children=lambda key: [
            k for k in get_references(documents[key]) if not k.startswith("/type/")
        ])

    from openlibrary.api import OpenLibrary
    ol = OpenLibrary("http://0.0.0.0:8080")
    ol.autologin()

    print ol.save_many([documents[k] for k in keys],
                       comment="documents copied from openlibrary.org")
def main():
    global options
    options, args = parse_options()
    ol = OpenLibrary(options.server)

    for pattern in args:
        docs = ol.query({"key~": pattern, "*": None}, limit=1000)
        for doc in marshal(docs):
            if doc['type']['key'] == '/type/template':
                write(make_path(doc), get_value(doc, 'body'))
            # the second branch tested '/type/template' again, making the
            # macro case unreachable; it should match '/type/macro'
            elif doc['type']['key'] == '/type/macro':
                write(make_path(doc), get_value(doc, 'macro'))
            else:
                delete(make_path(doc))
def add_cover_to_work(w):
    # ol is the module-level connection; declare it global so the lazy
    # login below rebinds it instead of raising UnboundLocalError
    global ol
    if 'cover_edition' in w:
        return
    q = {'type': '/type/edition', 'works': w['key'],
         'publish_date': None, 'languages': '/l/eng'}
    cover_edition = pick_cover(query_iter(q))
    if not cover_edition:
        q = {'type': '/type/edition', 'works': w['key'], 'publish_date': None}
        cover_edition = pick_cover(query_iter(q))
        if not cover_edition:
            return
    w['cover_edition'] = Reference(cover_edition)
    if ol is None:
        rc = read_rc()
        ol = OpenLibrary("http://openlibrary.org")
        ol.login('WorkBot', rc['WorkBot'])
    print ol.save(w['key'], w, 'added cover to work')
def write_to_ol(olkey, oljson):
    ol = OpenLibrary("http://openlibrary.org")
    # Log in [daniel, sam]
    logged_in = False
    for attempt in range(5):
        try:
            ol.autologin()
            logged_in = True
            break
        except:
            print 'ol.autologin() error; retrying'
    if not logged_in:
        sys.exit('Failed to log in.')
    ol.save(olkey, oljson, 'Adding Table of Contents')
def main(server):
    ol = OpenLibrary(server)
    ol.autologin()

    volumes = ol.query({'type': '/type/volume', 'limit': False, '*': None,
                        'edition': {'*': None}})
    volumes = dict((v['key'], v) for v in volumes)
    editions = dict((v['edition']['key'], v['edition'])
                    for v in volumes.values() if v['edition'])

    def make_volume(v):
        d = {}
        v.pop('edition')
        v['type'] = {'key': '/type/volume'}
        for k in ['type', 'ia_id', 'volume_number']:
            if k in v:
                d[k] = v[k]
        return d

    for e in editions.values():
        e['volumes'] = []

    for v in volumes.values():
        if v.get('edition'):
            e = editions[v.get('edition')['key']]
            e['volumes'].append(make_volume(v))

    for e in editions.values():
        e['volumes'] = sorted(e['volumes'], key=lambda v: v['volume_number'])

    print 'linking volumes to %d editions' % len(editions)
    ol.save_many(editions.values(), 'link volumes')
def main():
    global options
    options, args = parse_options()
    ol = OpenLibrary(options.server)

    for pattern in args:
        docs = ol.query({"key~": pattern, "*": None}, limit=1000)
        for doc in marshal(docs):
            # Anand: special care to ignore bad documents in the database.
            if "--duplicate" in doc['key']:
                continue

            if doc['type']['key'] == '/type/template':
                write(make_path(doc), get_value(doc, 'body'))
            elif doc['type']['key'] == '/type/macro':
                write(make_path(doc), get_value(doc, 'macro'))
            else:
                delete(make_path(doc))
def __init__(self, username, password):
    """Takes a username and password of a bot account to establish
    a connection to OL.
    """
    self.ol = OpenLibrary()
    self.ol.login(username, password)
    self.pagreg = re.compile(r"[^\s]\s+[:;]$")
    self.emptypagreg = re.compile(r"[,.:;]+$")
    self.formatdict = simplejson.load(codecs.open("formatdict.json", "rb", "utf-8"))
    self.enc2 = codecs.getencoder("ascii")
    self.savebuffer = {}
    self.badrecords = []
    self.aucache = {}
    self.wocache = {}
    #self.formatcache = NKCache("ol_books_formats", api_key = "cfdeaeda-4a22-4ae7-a2bf-1634da98fa1b")
    self.logfile = codecs.EncodedFile(open("vacuumbot-log.tsv", "ab"),
                                      "unicode_internal", "utf-8", "replace")
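# Hypothetical instantiation only: the snippet above shows just the
# constructor, and the enclosing class name is not visible here.
# "VacuumBot" and the credentials below are assumed stand-ins, not taken
# from the original file.
bot = VacuumBot("bot-username", "bot-password")  # logs into OL via OpenLibrary.login()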
def main():
    ol = OpenLibrary()
    ol.autologin()

    plugin = sys.argv[1]

    all_docs = []
    for pattern in sys.argv[2:]:
        docs = ol.query({"key~": pattern, "*": None}, limit=1000)
        all_docs.extend(docs)

    for doc in all_docs:
        doc['plugin'] = plugin

    print ol.save_many(all_docs, comment="Marked as part of %s plugin." % plugin)
def main(server):
    ol = OpenLibrary(server)
    ol.autologin()

    scan_records = ol.query(type='/type/scan_record', limit=False,
                            edition={'*': None})
    editions = (r['edition'] for r in scan_records)

    # process 1000 editions at a time.
    while True:
        chunk = take(1000, editions)
        if not chunk:
            break

        print 'linking %d editions' % len(chunk)
        for e in chunk:
            e['scan_records'] = [{'key': '/scan_record' + e['key']}]
        ol.save_many(chunk, 'link scan records')
logstring += "\tPublisher %s not found.\n" % before

if os.path.exists("pubbot_lock.txt"):
    print "Bot already running. Exiting."
    exit()
i = open("pubbot_lock.txt", 'w')
t = asctime()
s = time.time()
try:
    global conn
    global c
    conn = psycopg2.connect('dbname=vandalism user=dmontalvo password=iawatchbot')
    c = conn.cursor()
    ol = OpenLibrary("http://openlibrary.org")
    ol.autologin()
    global logstring
    logstring = 'Started at: %s\n' % t
    c.execute('select * from pubqueue')
    queue = c.fetchall()
    for item in queue:
        master = item[0].decode('utf-8')
        x = 0
        titlecased = ''
        for letter in master:
            val = ord(letter)
            if (x == 0 or master[x-1] == ' ') and val >= 97 and val <= 122:
                titlecased += string.upper(letter)
            else:
from openlibrary.catalog.merge.merge_marc import build_marc
from openlibrary.catalog.importer.db_read import get_mc, withKey
from openlibrary.catalog.marc.marc_subject import subjects_for_work
from openlibrary.api import OpenLibrary, unmarshal
from openlibrary.catalog.read_rc import read_rc

rc = read_rc()

marc_index = web.database(dbn='postgres', db='marc_index')
marc_index.printing = True

db_amazon = web.database(dbn='postgres', db='amazon')
db_amazon.printing = False

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

t0 = time()
t_prev = time()
rec_no = 0
chunk = 50
load_count = 0

re_edition_key = re.compile('^/(?:books|b)/(OL\d+M)$')

archive_id = sys.argv[1]
def get_ol():
    ol = OpenLibrary()
    ol.autologin()
    return ol
from __future__ import print_function
import MySQLdb, datetime, re, sys
sys.path.append('/1/src/openlibrary')
from openlibrary.api import OpenLibrary, Reference
from pprint import pprint
import six

conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()

re_edition_key = re.compile('^/books/OL(\d+)M$')
re_work_key = re.compile('^/works/OL(\d+)W$')

ol = OpenLibrary('http://openlibrary.org/')
ol.login('EdwardBot', 'As1Wae9b')

re_iso_date = re.compile('^(\d{4})-\d\d-\d\d$')
re_end_year = re.compile('(\d{4})$')

def get_publish_year(d):
    if not d:
        return
    m = re_iso_date.match(d)
    if m:
        return int(m.group(1))
    m = re_end_year.match(d)
    if m:
        return int(m.group(1))

{'lc_classifications': ['PZ7.H558 Ru'], 'dewey_number': ['[E]']}
from openlibrary.catalog.marc.fast_parse import get_subfield_values, get_first_tag, get_tag_lines, get_subfields, BadDictionary
from openlibrary.catalog.utils.query import query_iter, set_staging, query
from openlibrary.catalog.utils import mk_norm
from openlibrary.catalog.read_rc import read_rc
from collections import defaultdict
from pprint import pprint, pformat
from openlibrary.catalog.utils.edit import fix_edition
from openlibrary.catalog.importer.db_read import get_mc
import urllib2
from openlibrary.api import OpenLibrary, Reference
from lxml import etree
from time import sleep, time

rc = read_rc()

ol = OpenLibrary("http://openlibrary.org")
ol.login('WorkBot', rc['WorkBot'])

fh_log = open('/1/edward/logs/WorkBot', 'a')

def write_log(cat, key, title):
    print >> fh_log, (("%.2f" % time()), cat, key, title)

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon|etc)\.$')
re_ia_marc = re.compile('^(?:.*/)?([^/]+)_(marc\.xml|meta\.mrc)(:0:\d+)?$')

ns = '{http://www.loc.gov/MARC21/slim}'
ns_leader = ns + 'leader'
ns_data = ns + 'datafield'
#!/usr/bin/python
import MySQLdb
import datetime
import re
import sys
sys.path.append('/1/src/openlibrary')
from openlibrary.api import OpenLibrary, Reference
from flask import Flask, render_template, request, flash, redirect, url_for, g
from collections import defaultdict

app = Flask(__name__)

re_edition_key = re.compile('^/books/OL(\d+)M$')

ol = OpenLibrary('http://openlibrary.org/')
ol.login('EdwardBot', 'As1Wae9b')

@app.before_request
def before_request():
    g.db = MySQLdb.connect(db='merge_editions')

@app.after_request
def after_request(r):
    g.db.close()
    return r

re_nonword = re.compile(r'\W', re.U)
rows = 200

app.secret_key = 'rt9%s#)5kid$!u*5_@*$f2f_%jq++nl3@d%=7f%v4&78^m4p7c'
from __future__ import print_function
import MySQLdb
import re
from openlibrary.api import OpenLibrary, Reference
from collections import defaultdict
import six

re_edition_key = re.compile('^/books/OL(\d+)M$')
re_nonword = re.compile(r'\W', re.U)

conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()
cur2 = conn.cursor()

ol = OpenLibrary('http://openlibrary.org/')
ol.login('EdwardBot', 'As1Wae9b')

cur.execute(
    'select ia, editions, done from merge where done is null and unmerge_count=0'
)
for ia, ekeys, done in cur.fetchall():
    updates = []
    ekeys = [
        '/books/OL%dM' % x
        for x in sorted(
            int(re_edition_key.match(ekey).group(1)) for ekey in ekeys.split(' '))
    ]
    print((ia, ekeys))
    min_ekey = ekeys[0]
    editions = [ol.get(ekey) for ekey in ekeys]
from openlibrary.catalog.works.find_work_for_edition import find_matching_work
from openlibrary.catalog.marc import fast_parse, is_display_marc
from openlibrary.catalog.marc.parse import read_edition, NoTitle
from openlibrary.catalog.marc.marc_subject import subjects_for_work
from time import time, sleep
from openlibrary.api import OpenLibrary, unmarshal
from pprint import pprint
import argparse

parser = argparse.ArgumentParser(description="scribe loader")
parser.add_argument("--skip_hide_books", action="store_true")
parser.add_argument("--item_id")
args = parser.parse_args()

rc = read_rc()

ol = OpenLibrary("http://openlibrary.org")
ol.login("ImportBot", rc["ImportBot"])

db_amazon = web.database(dbn="postgres", db="amazon")
db_amazon.printing = False

db = web.database(dbn="mysql", host=rc["ia_db_host"], user=rc["ia_db_user"],
                  passwd=rc["ia_db_pass"], db="archive")
db.printing = False

re_census = re.compile("^\d+(st|nd|rd|th)census")
re_edition_key = re.compile("^/(?:books|b)/(OL\d+M)$")

def read_short_title(title):
    return str(fast_parse.normalize_str(title)[:25])
from openlibrary.catalog.marc.fast_parse import get_subfield_values, get_first_tag, get_tag_lines, get_subfields, BadDictionary
from openlibrary.catalog.utils.query import query_iter, withKey
from openlibrary.catalog.utils import mk_norm
from openlibrary.catalog.read_rc import read_rc
from collections import defaultdict
from pprint import pprint, pformat
from openlibrary.catalog.utils.edit import fix_edition
from openlibrary.catalog.importer.db_read import get_mc
from urllib import urlopen
from openlibrary.api import OpenLibrary
from lxml import etree
from time import sleep, time, strftime
from openlibrary.catalog.marc.marc_subject import get_work_subjects, four_types
import simplejson as json

ol = OpenLibrary("http://openlibrary.org")

re_skip = re.compile(r'\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon|etc)\.$')
re_work_key = re.compile('^/works/OL(\d+)W$')
re_lang_key = re.compile('^/(?:l|languages)/([a-z]{3})$')
re_author_key = re.compile('^/(?:a|authors)/(OL\d+A)$')
re_ia_marc = re.compile('^(?:.*/)?([^/]+)_(marc\.xml|meta\.mrc)(:0:\d+)?$')

ns = '{http://www.loc.gov/MARC21/slim}'
ns_leader = ns + 'leader'
ns_data = ns + 'datafield'

def has_dot(s):
    return s.endswith('.') and not re_skip.search(s)
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.api import OpenLibrary, unmarshal
from openlibrary.catalog.read_rc import read_rc

rc = read_rc()

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

to_fix = []

num = 0
for line in open('no_index'):
    for e in query({'type': '/type/edition', 'title': None, 'ocaid': line[:-1]}):
        num += 1
        print num, e['key'], `e['title']`, line[:-1]
        e2 = ol.get(e['key'])
        del e2['ocaid']
        to_fix.append(e2)

ol.save_many(to_fix, 'remove link')
import MySQLdb, datetime, re, sys
sys.path.append('/1/src/openlibrary')
from openlibrary.api import OpenLibrary, Reference
from collections import defaultdict
from pprint import pprint

re_edition_key = re.compile('^/books/OL(\d+)M$')
re_nonword = re.compile(r'\W', re.U)
re_edition = re.compile(' ed edition$')

ol = OpenLibrary('http://openlibrary.org/')

conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()

skip = 'guineapigscomple00elwa'
skip = None
total = 5601

cur.execute("select ia, editions, done, unmerge_count from merge where unmerge_count != 0")  # and ia='hantayo00hillrich'")

unmerge_field_counts = defaultdict(int)

num = 0
for ia, ekeys, done, unmerge_count in cur.fetchall():
    # if unmerge_count == 0:
    #     continue
    num += 1
    if num % 100 == 0:
        print '%d/%d %.2f%%' % (num, total, ((float(num) * 100) / total)), ia
    if skip:
        if skip == ia:
            skip = None
        continue
def main(site, date=None):
    ol = OpenLibrary(site)
    ol.autologin("StatsBot")

    today = date or datetime.date.today().isoformat()
    print ol._request("/admin/stats/" + today, method='POST', data="").read()
#!/usr/bin/python
from __future__ import print_function
from openlibrary.api import OpenLibrary
from subprocess import Popen, PIPE
import MySQLdb

ia_db_host = 'dbmeta.us.archive.org'
ia_db_user = '******'
ia_db_pass = Popen(["/opt/.petabox/dbserver"], stdout=PIPE).communicate()[0]

ol = OpenLibrary('http://openlibrary.org/')

local_db = MySQLdb.connect(db='merge_editions')
local_cur = local_db.cursor()  # was conn.cursor(); conn is never defined here

archive_db = MySQLdb.connect(host=ia_db_host, user=ia_db_user,
                             passwd=ia_db_pass, db='archive')
archive_cur = archive_db.cursor()  # likewise, cursor must come from archive_db

fields = ['identifier', 'updated', 'collection']
sql_fields = ', '.join(fields)

archive_cur.execute("select " + sql_fields +
                    " from metadata" +
                    " where scanner is not null and mediatype='texts'" +
                    " and (not curatestate='dark' or curatestate is null)" +
                    " and collection is not null and boxid is not null"
                    " and identifier not like 'zdanh_test%' and scandate is not null " +
                    " order by updated")

for num, (ia, updated, collection) in enumerate(archive_cur.fetchall()):
from __future__ import print_function
import MySQLdb
import re
import sys
sys.path.append('/1/src/openlibrary')
from openlibrary.api import OpenLibrary, Reference
from collections import defaultdict

re_edition_key = re.compile('^/books/OL(\d+)M$')
re_nonword = re.compile(r'\W', re.U)
re_edition = re.compile(' ed edition$')

ol = OpenLibrary('http://openlibrary.org/')

conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()

skip = 'guineapigscomple00elwa'
skip = None
total = 5601

cur.execute(
    "select ia, editions, done, unmerge_count from merge where unmerge_count != 0"
)  # and ia='hantayo00hillrich'")

unmerge_field_counts = defaultdict(int)

num = 0
for ia, ekeys, done, unmerge_count in cur.fetchall():
    # if unmerge_count == 0:
    #     continue
    num += 1
    if num % 100 == 0:
        print('%d/%d %.2f%%' % (num, total, ((float(num) * 100) / total)), ia)
from subprocess import Popen, PIPE
import argparse

parser = argparse.ArgumentParser(description='scribe loader')
parser.add_argument('--skip_hide_books', action='store_true')
parser.add_argument('--item_id')
parser.add_argument('--config', default='openlibrary.yml')
args = parser.parse_args()

config_file = args.config
config.load(config_file)

import_bot_password = config.runtime_config['load_scribe']['import_bot_password']

# '/1/var/log/openlibrary/load_scribe'
load_scribe_log = config.runtime_config['load_scribe']['log']

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', import_bot_password)

password = Popen(["/opt/.petabox/dbserver"], stdout=PIPE).communicate()[0]
db = web.database(dbn='mysql', host='dbmeta.us.archive.org', user='******',
                  passwd=password, db='archive')
db.printing = False

re_census = re.compile('^\d+(st|nd|rd|th)census')
re_edition_key = re.compile('^/(?:books|b)/(OL\d+M)$')

def read_short_title(title):
    return str(fast_parse.normalize_str(title)[:25])

def make_index_fields(rec):
from openlibrary.api import OpenLibrary

ol = OpenLibrary('http://openlibrary.org/')

data = eval(open('update').read())

done = []
for author in data:
    print(author['key'], author['name'])
    akey = author['key']
    a = ol.get(akey)
    if not a.get('bio') and author['bio']:
        a['bio'] = author['bio']
        ol.save(akey, a, 'Add author bio from Smashwords.')
    for edition in author['editions']:
        # wkey = ol.new({
        #     'type': '/type/work',
        #     'title': edition['title'],
        #     'authors': [{'author': {'key': akey}}],
        #     'description': edition['description'],
        #     'subjects': ['Lending library'],
        # })
        wkey = edition['work']
        w = ol.get(wkey)
        assert edition['description']
        if not w.get('description'):
            w['description'] = edition['description']
        if 'Lending library' not in w.get('subjects', []):
from openlibrary.catalog.marc.fast_parse import get_subfield_values, get_first_tag, get_tag_lines, get_subfields, BadDictionary
from openlibrary.catalog.utils.query import query_iter, withKey
from openlibrary.catalog.utils import mk_norm
from openlibrary.catalog.read_rc import read_rc
from collections import defaultdict
from pprint import pprint, pformat
from openlibrary.catalog.utils.edit import fix_edition
from openlibrary.catalog.importer.db_read import get_mc
from urllib import urlopen
from openlibrary.api import OpenLibrary
from lxml import etree
from time import sleep, time, strftime
from openlibrary.catalog.marc.marc_subject import get_work_subjects, four_types
import simplejson as json

ol = OpenLibrary("http://openlibrary.org")

re_skip = re.compile(
    r'\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon|etc)\.$')
re_work_key = re.compile('^/works/OL(\d+)W$')
re_lang_key = re.compile('^/(?:l|languages)/([a-z]{3})$')
re_author_key = re.compile('^/(?:a|authors)/(OL\d+A)$')
re_ia_marc = re.compile('^(?:.*/)?([^/]+)_(marc\.xml|meta\.mrc)(:0:\d+)?$')

ns = '{http://www.loc.gov/MARC21/slim}'
ns_leader = ns + 'leader'
ns_data = ns + 'datafield'

def has_dot(s):
from __future__ import print_function
import sys
import codecs
from openlibrary.catalog.utils.query import query_iter, set_staging, query
from openlibrary.api import OpenLibrary, Reference
from openlibrary.catalog.read_rc import read_rc
from time import sleep

set_staging(True)

rc = read_rc()
ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

work_q = {
    'type': '/type/work',
    'authors': None,
    'title': None,
}

queue = []
for w in query_iter(work_q):
    if not w.get('authors'):
        print('no authors')
        continue
    if any(isinstance(a, dict) and 'author' in a for a in w['authors']):
        continue
    print(len(queue), w['key'], w['title'])  # , ol.get(w['authors'][0]['key'])['name']
    full = ol.get(w['key'])
from __future__ import print_function
import MySQLdb, datetime, re, sys
from openlibrary.api import OpenLibrary, Reference
from collections import defaultdict
import six

re_edition_key = re.compile('^/books/OL(\d+)M$')
re_nonword = re.compile(r'\W', re.U)

conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()
cur2 = conn.cursor()

ol = OpenLibrary('http://openlibrary.org/')
ol.login('EdwardBot', 'As1Wae9b')

cur.execute('select ia, editions, done from merge where done is null and unmerge_count=0')
for ia, ekeys, done in cur.fetchall():
    updates = []
    ekeys = ['/books/OL%dM' % x
             for x in sorted(int(re_edition_key.match(ekey).group(1))
                             for ekey in ekeys.split(' '))]
    print((ia, ekeys))
    min_ekey = ekeys[0]
    editions = [ol.get(ekey) for ekey in ekeys]
    master = editions[0]
    for e in editions:
        for k in 'classifications', 'identifiers', 'table_of_contents':
            if k in e and not e[k]:
                del e[k]
from __future__ import print_function
import sys
import codecs
from openlibrary.catalog.utils.query import query_iter, set_staging, query
from openlibrary.api import OpenLibrary, Reference
from openlibrary.catalog.read_rc import read_rc
from time import sleep

set_staging(True)

rc = read_rc()
ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

work_q = {
    'type': '/type/work',
    'authors': None,
    'title': None,
}

queue = []
for w in query_iter(work_q):
    if not w.get('authors'):
        print('no authors')
        continue
    if any(isinstance(a, dict) and 'author' in a for a in w['authors']):
        continue
    print(len(queue), w['key'], w['title'])  # , ol.get(w['authors'][0]['key'])['name']
from openlibrary.catalog.marc.parse_xml import parse, xml_rec
from openlibrary.catalog.marc import read_xml
from openlibrary.api import OpenLibrary, unmarshal
from openlibrary.catalog.marc import fast_parse
from openlibrary.catalog.importer import pool
from openlibrary.catalog.merge.merge_marc import build_marc
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.merge import try_merge
import os, re, sys

ol = OpenLibrary("http://openlibrary.org")

re_census = re.compile('^\d+(st|nd|rd|th)census')
re_edition_key = re.compile('^/(?:b|books)/(OL\d+M)$')

scan_dir = '/2/edward/20century/scans/'

def read_short_title(title):
    return str(fast_parse.normalize_str(title)[:25])

def make_index_fields(rec):
    fields = {}
    for k, v in rec.iteritems():
        if k in ('lccn', 'oclc', 'isbn'):
            fields[k] = v
            continue
        if k == 'full_title':
            fields['title'] = [read_short_title(v)]
    return fields

# no maps or cartograph
"""Sample script to update OL records in bulk. """ from openlibrary.api import OpenLibrary import web import sys ol = OpenLibrary() def read_mapping(filename, chunksize=1000): """Reads OLID, OCLC_NUMBER mapping from given file. Assumes that the file contains one OLID, OCLC_NUMBER per line, separated by tab. """ for line in open(filename): olid, oclc = line.strip().split("\t") yield olid, oclc def add_identifier(doc, id_name, id_value): if id_name in ["isbn_10", "isbn_13", "oclc_numbers", "lccn"]: ids = doc.setdefault(id_name, []) else: ids = doc.setdefault("identifiers", {}).setdefault(id_name, []) if id_value not in ids: ids.append(id_value) def has_identifier(doc, id_name, id_value):
from openlibrary.catalog.merge.merge_marc import *
from openlibrary.catalog.read_rc import read_rc
import openlibrary.catalog.merge.amazon as amazon
from openlibrary.catalog.get_ia import *
from openlibrary.catalog.importer.db_read import withKey, get_mc
from openlibrary.api import OpenLibrary, Reference
import openlibrary.catalog.marc.fast_parse as fast_parse
import xml.parsers.expat
import web, sys
from time import sleep
import six

rc = read_rc()

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

ia_db = web.database(dbn='mysql', db='archive', user=rc['ia_db_user'],
                     pw=rc['ia_db_pass'], host=rc['ia_db_host'])
ia_db.printing = False

re_meta_marc = re.compile('([^/]+)_(meta|marc)\.(mrc|xml)')

threshold = 875
amazon.set_isbn_match(225)
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.api import OpenLibrary, unmarshal
from openlibrary.catalog.read_rc import read_rc

rc = read_rc()

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

to_fix = []

num = 0
for line in open('no_index'):
    for e in query({
            'type': '/type/edition',
            'title': None,
            'ocaid': line[:-1]
    }):
        num += 1
        print(num, e['key'], repr(e['title']), line[:-1])
        e2 = ol.get(e['key'])
        del e2['ocaid']
        to_fix.append(e2)

ol.save_many(to_fix, 'remove link')
              default="openlibrary.org:80",
              help="The openlibrary API host")
op.add_option("-k", "--nyt-api-key", dest="nyt_api_key",
              help="API key for use with the nyt bestsellers api")
op.add_option("-u", "--bot-username", dest="username",
              default="nyt_bestsellers_bot",
              help="The bot username for accessing the Open Library API")
op.add_option("-p", "--bot-password", dest="password",
              help="The bot password for accessing the Open Library API")
options, _ = op.parse_args()

global NYT_API_KEY
NYT_API_KEY = options.nyt_api_key

global OL
OL = OpenLibrary("http://%s" % options.openlibrary_host)
OL.login(options.username, options.password)

results = collections.defaultdict(list)
for ln in get_nyt_bestseller_list_names():
    LOG("INFO", "processing list %s" % ln)
    for i, book in enumerate(load_nyt_bestseller_list(ln)):
        ol_keys = reconcile_book(book)
        if not ol_keys:
            LOG("WARN", "unable to reconcile '%s' by %s - no OL book found" % (
                book['book_details'][0]['title'],
                book['book_details'][0]['author']
            ))
        # any() is needed here: a bare generator expression is always truthy,
        # so the original "if not (key for key in ...)" never fired
        if not any(key.startswith("/works/") for key in ol_keys):
            LOG("WARN", "only editions for '%s' by %s: %s" % (
                book['book_details'][0]['title'],
                book['book_details'][0]['author'],
                ol_keys
            ))
        results[ln].append({
#!/usr/bin/python
import MySQLdb, datetime, re, sys
sys.path.append('/1/src/openlibrary')
from openlibrary.api import OpenLibrary, Reference
from flask import Flask, render_template, request, flash, redirect, url_for, g
from collections import defaultdict

app = Flask(__name__)

re_edition_key = re.compile('^/books/OL(\d+)M$')

ol = OpenLibrary('http://openlibrary.org/')
ol.login('EdwardBot', 'As1Wae9b')

@app.before_request
def before_request():
    g.db = MySQLdb.connect(db='merge_editions')

@app.after_request
def after_request(r):
    g.db.close()
    return r

re_nonword = re.compile(r'\W', re.U)
rows = 200

app.secret_key = 'rt9%s#)5kid$!u*5_@*$f2f_%jq++nl3@d%=7f%v4&78^m4p7c'
def get_ol(servername=None):
    ol = OpenLibrary(base_url=servername)
    ol.autologin()
    return ol
# IdentifierBot
# by Daniel Montalvo

csvfile = 'LibraryThing_to_OpenLibrary.csv'
batch_size = 500

import traceback
import csv
import string
import _init_path
import sys
sys.path.append('/petabox/sw/lib/python')
from openlibrary.api import OpenLibrary, marshal

ol = OpenLibrary("http://openlibrary.org")

reader = csv.reader(open(csvfile), delimiter='\t', quotechar='|')
f = open('authors.txt', 'w')

def fix_toc(doc):
    doc = marshal(doc)

    def f(d):
        """function to fix one toc entry."""
        if d.get('type') == '/type/text':
            return dict(title=d['value'])
        else:
            return d

    toc = doc.get('table_of_contents')
#!/usr/bin/python
from __future__ import print_function
from subprocess import Popen, PIPE
from openlibrary.utils.ia import find_item, FindItemError
from openlibrary.api import OpenLibrary
from openlibrary.catalog.read_rc import read_rc
from openlibrary.catalog.get_ia import marc_formats, get_marc_ia_data
from openlibrary.catalog.marc import is_display_marc
from time import sleep, time
import MySQLdb
import re, urllib2, httplib, json, codecs, socket, sys

ol = OpenLibrary('http://openlibrary.org/')
rc = read_rc()

base_url = 'http://openlibrary.org'

ia_db_host = 'dbmeta.us.archive.org'
ia_db_user = '******'
ia_db_pass = Popen(["/opt/.petabox/dbserver"], stdout=PIPE).communicate()[0]

re_census = re.compile('^\d+(st|nd|rd|th)census')

fields = [
    'identifier', 'contributor', 'updated', 'noindex', 'collection', 'format',
    'boxid'
]
sql_fields = ', '.join(fields)
csvfile = 'LibraryThing_to_OpenLibrary.csv'
db = 'ids.sqlite'

import csv
import string
import sqlite3
import _init_path
from openlibrary.api import OpenLibrary

ol = OpenLibrary('http://openlibrary.org/')
for attempt in range(5):
    try:
        ol.autologin()
        break
    except:
        print 'ol.autologin() error; retrying'

conn = sqlite3.connect(db)
c = conn.cursor()

reader = csv.reader(open(csvfile), delimiter='\t', quotechar='|')
for row in reader:
    ltid = row[0]
    olid = row[1]
    key = '/books' + olid[olid.rindex('/'):len(olid)]
    c.execute('select * from ids where key = ?', (key,))
    x = c.fetchone()
    if x != None:
        continue
    print 'Trying to get key: %r' % key
    for attempt in range(5):
        try:
op.add_option("-u", "--bot-username", dest="username",
              default="nyt_bestsellers_bot",
              help="The bot username for accessing the Open Library API")
op.add_option("-p", "--bot-password", dest="password",
              help="The bot password for accessing the Open Library API")
options, _ = op.parse_args()

global NYT_API_KEY
NYT_API_KEY = options.nyt_api_key

global OL
OL = OpenLibrary("http://%s" % options.openlibrary_host)
OL.login(options.username, options.password)

results = collections.defaultdict(list)
for ln in get_nyt_bestseller_list_names():
    LOG("INFO", "processing list %s" % ln)
    for i, book in enumerate(load_nyt_bestseller_list(ln)):
        ol_keys = reconcile_book(book)
        if not ol_keys:
            LOG(
                "WARN", "unable to reconcile '%s' by %s - no OL book found" %
                (book['book_details'][0]['title'],
                 book['book_details'][0]['author']))
        # any() is needed: a bare generator expression is always truthy
        if not any(key.startswith("/works/") for key in ol_keys):
            LOG(
                "WARN", "only editions for '%s' by %s: %s" %
from __future__ import print_function
# try and find an existing work for a book
from openlibrary.api import OpenLibrary
from openlibrary.catalog.utils import mk_norm
import sys
from time import time

ol = OpenLibrary("http://openlibrary.org")

def find_matching_work(e):
    norm_title = mk_norm(e['title'])

    seen = set()
    for akey in e['authors']:
        q = {
            'type': '/type/work',
            'authors': {'author': {'key': akey}},
            'limit': 0,
            'title': None,
        }
        t0 = time()
        work_keys = list(ol.query(q))
        t1 = time() - t0
        print('time to find books by author: %.1f seconds' % t1)
        for w in work_keys:
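# Illustrative call shape only, not from the original file: the edition dict
# fields ('title' plus a list of author keys) are inferred from how
# find_matching_work() reads e['title'] and iterates e['authors']; whether it
# returns a work key or None is an assumption, since the body is truncated above.
edition = {
    'title': 'A Tale of Two Cities',
    'authors': ['/authors/OL24638A'],  # hypothetical author key
}
work_key = find_matching_work(edition)
print('matched work:' if work_key else 'no match', work_key)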
#!/usr/bin/python
from subprocess import Popen, PIPE
from openlibrary.utils.ia import find_item, FindItemError
from openlibrary.api import OpenLibrary
from openlibrary.catalog.read_rc import read_rc
from openlibrary.catalog.get_ia import marc_formats, get_marc_ia_data
from openlibrary.catalog.marc import is_display_marc
from time import sleep, time
from pprint import pprint
import MySQLdb
import re, urllib2, httplib, json, codecs, socket, sys

ol = OpenLibrary('http://openlibrary.org/')
rc = read_rc()

base_url = 'http://openlibrary.org'

ia_db_host = 'dbmeta.us.archive.org'
ia_db_user = '******'
ia_db_pass = Popen(["/opt/.petabox/dbserver"], stdout=PIPE).communicate()[0]

re_census = re.compile('^\d+(st|nd|rd|th)census')

fields = ['identifier', 'contributor', 'updated', 'noindex', 'collection',
          'format', 'boxid']
sql_fields = ', '.join(fields)

scanned_start = open('scanned_start').readline()[:-1]

ignore_noindex = set(['printdisabled', 'lendinglibrary', 'inlibrary'])
import argparse

parser = argparse.ArgumentParser(description='scribe loader')
parser.add_argument('--skip_hide_books', action='store_true')
parser.add_argument('--item_id')
parser.add_argument('--config', default='openlibrary.yml')
args = parser.parse_args()

config_file = args.config
config.load(config_file)

import_bot_password = config.runtime_config['load_scribe']['import_bot_password']

# '/1/var/log/openlibrary/load_scribe'
load_scribe_log = config.runtime_config['load_scribe']['log']

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', import_bot_password)

password = Popen(["/opt/.petabox/dbserver"], stdout=PIPE).communicate()[0]
db = web.database(dbn='mysql', host='dbmeta.us.archive.org', user='******',
                  passwd=password, db='archive')
db.printing = False

re_census = re.compile('^\d+(st|nd|rd|th)census')
re_edition_key = re.compile('^/(?:books|b)/(OL\d+M)$')

def read_short_title(title):
    return str(fast_parse.normalize_str(title)[:25])
# IdentifierBot
# by Daniel Montalvo

csvfile = 'LibraryThing_to_OpenLibrary.csv'
batch_size = 500

import traceback
import csv
import string
import _init_path
import sys
sys.path.append('/petabox/sw/lib/python')
from openlibrary.api import OpenLibrary, marshal

ol = OpenLibrary("http://openlibrary.org")

reader = csv.reader(open(csvfile), delimiter='\t', quotechar='|')
f = open('authors.txt', 'w')

def fix_toc(doc):
    doc = marshal(doc)

    def f(d):
        """function to fix one toc entry."""
        if d.get('type') == '/type/text':
            return dict(title=d['value'])
        else:
            return d

    toc = doc.get('table_of_contents')
    if toc:
parser.add_argument('--no_author_updates', action='store_true')
parser.add_argument('--just_consider_authors', action='store_true')
parser.add_argument('--limit', default=None)
args = parser.parse_args()

handle_author_merge = args.handle_author_merge
only_author_merge = args.only_author_merge
skip_author_merge = args.skip_author_merge

if only_author_merge:
    handle_author_merge = True

if handle_author_merge:
    from openlibrary.catalog.works.find_works import find_title_redirects, find_works, get_books, books_query, update_works

ol = OpenLibrary("http://" + args.server)
set_query_host(args.server)

done_login = False

config_file = args.config
config.load(config_file)

base = 'http://%s/openlibrary.org/log/' % config.runtime_config['infobase_server']

skip_user = set(u.lower() for u in args.skip_user)
only_user = set(u.lower() for u in args.only_user)

if 'state_dir' not in config.runtime_config:
    print 'state_dir missing from ' + config_file
    sys.exit(0)
import csv, httplib, sys, codecs, re
from openlibrary.api import OpenLibrary
from pprint import pprint, pformat

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

h1 = httplib.HTTPConnection('www.archive.org')
h1.connect()

ol = OpenLibrary('http://openlibrary.org/')
#ol.login('EdwardBot', 'As1Wae9b')

input_file = '/home/edward/Documents/smashwords_ia_20110325.csv'
input_file = '/home/edward/Documents/smashwords_ia_20110325-extended-20110406.csv'

#headings = ['Distributor', 'Author Name', 'Author Bio', 'Publisher', 'SWID',
#            'Book Title', 'Price', 'Short Book Description', 'Long Book Description',
#            'ISBN', 'BISAC I', 'BISAC II', 'Where to buy']

# ['Distributor', 'Author Name', 'Author Bio', 'Publisher', 'SWID',
#  'Book Title', 'Pub. Date @ Smashwords', 'Price', 'Short Book Description',
#  'Long Book Description', 'ISBN', 'BISAC I', 'BISAC II', 'Word Count (Approx.)',
#  'Where to buy']

authors = {}
titles = set()
isbn = set()

name_map = {
    'JA Konrath': 'J. A. Konrath'
}
import MySQLdb, datetime, re, sys
sys.path.append('/1/src/openlibrary')
from openlibrary.api import OpenLibrary, Reference
from pprint import pprint

conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()

re_edition_key = re.compile('^/books/OL(\d+)M$')
re_work_key = re.compile('^/works/OL(\d+)W$')

ol = OpenLibrary('http://openlibrary.org/')
ol.login('EdwardBot', 'As1Wae9b')

re_iso_date = re.compile('^(\d{4})-\d\d-\d\d$')
re_end_year = re.compile('(\d{4})$')

def get_publish_year(d):
    if not d:
        return
    m = re_iso_date.match(d)
    if m:
        return int(m.group(1))
    m = re_end_year.match(d)
    if m:
        return int(m.group(1))

{'lc_classifications': ['PZ7.H558 Ru'], 'dewey_number': ['[E]']}
from openlibrary.catalog.utils import cmp, mk_norm
from openlibrary.catalog.read_rc import read_rc
from collections import defaultdict
from pprint import pformat
from openlibrary.catalog.utils.edit import fix_edition
from openlibrary.catalog.importer.db_read import get_mc
import urllib2
from openlibrary.api import OpenLibrary, Reference
from lxml import etree
from time import sleep, time
import six

rc = read_rc()

ol = OpenLibrary("http://openlibrary.org")
ol.login('WorkBot', rc['WorkBot'])

def write_log(cat, key, title):
    print((("%.2f" % time()), cat, key, title), file=fh_log)
    fh_log.flush()

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

re_skip = re.compile(
    '\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon|etc)\.$')
re_ia_marc = re.compile('^(?:.*/)?([^/]+)_(marc\.xml|meta\.mrc)(:0:\d+)?$')

ns = '{http://www.loc.gov/MARC21/slim}'
from catalog.importer.db_read import get_mc, withKey
from openlibrary.api import OpenLibrary
from catalog.read_rc import read_rc
import six

rc = read_rc()

marc_index = web.database(dbn='postgres', db='marc_index')
marc_index.printing = False

db_amazon = web.database(dbn='postgres', db='amazon')
db_amazon.printing = False

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

t0 = time()
t_prev = time()
rec_no = 0
chunk = 50
load_count = 0

archive_id = sys.argv[1]

def percent(a, b):
    return float(a * 100.0) / b
from openlibrary.catalog.get_ia import files, read_marc_file
from openlibrary.catalog.merge.merge_marc import build_marc
from openlibrary.catalog.importer.db_read import get_mc, withKey
from openlibrary.api import OpenLibrary, unmarshal
from openlibrary.catalog.read_rc import read_rc

rc = read_rc()

marc_index = web.database(dbn='postgres', db='marc_index')
marc_index.printing = False

db_amazon = web.database(dbn='postgres', db='amazon')
db_amazon.printing = False

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

t0 = time()
t_prev = time()
rec_no = 0
chunk = 50
load_count = 0

archive_id = sys.argv[1]

def get_with_retry(key):
    for i in range(3):
        try: