Example #1
0
def main(
        keys: list[str],
        src="http://openlibrary.org/",
        dest="http://localhost:8080",
        comment="",
        recursive=True,
        editions=True,
        lists: Union[list[str], None] = None,
        search: Union[str, None] = None,
        search_limit: int = 10,
):
    """
    Script to copy docs from one OL instance to another.
    Typically used to copy templates, macros, css and js from
    openlibrary.org to dev instance. paths can end with wildcards.

    USAGE:
        # Copy all templates
        ./scripts/copydocs.py --src http://openlibrary.org /templates/*
        # Copy specific records
        ./scripts/copydocs.py /authors/OL113592A /works/OL1098727W?v=2
        # Copy search results
        ./scripts/copydocs.py --search "publisher:librivox" --search-limit 10

    :param keys: doc keys (paths) to copy; paths can end with wildcards
    :param src: URL of the source open library server
    :param dest: URL of the destination open library server
    :param comment: commit comment used when saving the copied docs
    :param recursive: Recursively fetch all the referred docs
    :param editions: Also fetch all the editions of works
    :param lists: Copy docs from list(s)
    :param search: Run a search on open library and copy docs from the results
    :param search_limit: maximum number of search results to copy
    """
    # Note: the defaults for `lists` and `search` are None, so the
    # annotations are explicit Optionals (PEP 484 deprecates the implicit
    # `list[str] = None` form the original used).

    # Mypy doesn't handle union-ing types across if statements -_-
    # https://github.com/python/mypy/issues/6233
    src_ol: Union[Disk, OpenLibrary] = (
        OpenLibrary(src) if src.startswith("http://") else Disk(src))
    dest_ol: Union[Disk, OpenLibrary] = (
        OpenLibrary(dest) if dest.startswith("http://") else Disk(dest))

    if isinstance(dest_ol, OpenLibrary):
        # Autologin when ~/.olrc has a section for the destination host,
        # otherwise fall back to the default dev admin credentials.
        section = "[%s]" % web.lstrips(dest, "http://").strip("/")
        if section in read_lines(os.path.expanduser("~/.olrc")):
            dest_ol.autologin()
        else:
            dest_ol.login("admin", "admin123")

    for list_key in (lists or []):
        copy_list(src_ol, dest_ol, list_key, comment=comment)

    if search:
        assert isinstance(src_ol, OpenLibrary), "Search only works with OL src"
        keys += [
            doc['key']
            for doc in src_ol.search(search, limit=search_limit, fields=['key'])['docs']
        ]

    # Normalize keys to absolute paths and expand any wildcards.
    keys = list(expand(src_ol, ('/' + k.lstrip('/') for k in keys)))

    copy(src_ol, dest_ol, keys, comment=comment, recursive=recursive, editions=editions)
Example #2
0
def main():
    """Copy docs from a source OL instance (or disk directory) to a destination."""
    options, args = parse_args()

    # An http:// source is a live OL server; anything else is a disk path.
    src = OpenLibrary(options.src) if options.src.startswith("http://") else Disk(options.src)

    if not options.dest.startswith("http://"):
        dest = Disk(options.dest)
    else:
        dest = OpenLibrary(options.dest)
        section = "[%s]" % web.lstrips(options.dest, "http://").strip("/")
        if section in read_lines(os.path.expanduser("~/.olrc")):
            dest.autologin()
        else:
            dest.login("admin", "admin123")

    for list_key in options.lists:
        copy_list(src, dest, list_key, comment=options.comment)

    keys = list(expand(src, args))

    copy(src, dest, keys, comment=options.comment, recursive=options.recursive)
Example #3
0
def load():
    """Loads documents to http://0.0.0.0:8080

    Documents are saved in dependency order (referenced docs first) so
    that references resolve; /type/* references are excluded from the
    dependency graph, i.e. assumed to exist already.
    """
    documents = {}
    # Read every JSON file under data/ into a key -> doc mapping.
    for f in find("data"):
        doc = simplejson.load(open(f))
        documents[doc['key']] = doc

    keys = topological_sort(documents.keys(), 
            get_children=lambda key: [k for k in get_references(documents[key]) if not k.startswith("/type/")])

    from openlibrary.api import OpenLibrary
    ol = OpenLibrary("http://0.0.0.0:8080")
    ol.autologin()
    # Python 2 print statement; echoes the save_many result for inspection.
    print ol.save_many([documents[k] for k in keys], comment="documents copied from openlibrary.org")
Example #4
0
def main():
    """Sync docs matching the CLI key patterns from an OL server to disk.

    Templates are written from their ``body`` field and macros from their
    ``macro`` field; docs of any other type are passed to ``delete()``.
    """
    global options
    options, args = parse_options()
    ol = OpenLibrary(options.server)

    for pattern in args:
        docs = ol.query({"key~": pattern, "*": None}, limit=1000)
        for doc in marshal(docs):
            if doc['type']['key'] == '/type/template':
                write(make_path(doc), get_value(doc, 'body'))
            # Bug fix: this branch previously repeated '/type/template'
            # (making it unreachable), so macros fell through to delete().
            elif doc['type']['key'] == '/type/macro':
                write(make_path(doc), get_value(doc, 'macro'))
            else:
                delete(make_path(doc))
Example #5
0
def add_cover_to_work(w):
    """Pick a cover edition for work ``w`` and save the work back to OL.

    Prefers an edition in English; falls back to any edition. Does
    nothing when the work already has a cover_edition or none is found.
    """
    # Bug fix: ``ol`` is assigned below, which made it a local variable,
    # so the original ``if ol is None`` raised UnboundLocalError.
    global ol
    if 'cover_edition' in w:
        return
    q = {'type': '/type/edition', 'works': w['key'], 'publish_date': None, 'languages': '/l/eng'}
    cover_edition = pick_cover(query_iter(q))
    if not cover_edition:
        # Retry without the English-language restriction.
        q = {'type': '/type/edition', 'works': w['key'], 'publish_date': None}
        cover_edition = pick_cover(query_iter(q))
        if not cover_edition:
            return
    w['cover_edition'] = Reference(cover_edition)
    if ol is None:
        # Lazily create and log in the shared client on first use.
        rc = read_rc()
        ol = OpenLibrary("http://openlibrary.org")
        ol.login('WorkBot', rc['WorkBot'])

    # print() instead of the original Python 2 print statement.
    print(ol.save(w['key'], w, 'added cover to work'))
Example #6
0
def write_to_ol(olkey, oljson):
    """Save ``oljson`` under ``olkey`` on openlibrary.org.

    Retries autologin up to 5 times, then exits the process on failure.
    """
    ol = OpenLibrary("http://openlibrary.org")

    # Log in [daniel, sam]
    logged_in = False
    for attempt in range(5):
        try:
            ol.autologin()
            logged_in = True
            break
        # Bug fix: was a bare `except:`, which also swallowed
        # SystemExit/KeyboardInterrupt.
        except Exception:
            # print() instead of the original Python 2 print statement.
            print('ol.autologin() error; retrying')
    if not logged_in:
        sys.exit('Failed to log in.')

    ol.save(olkey, oljson, 'Adding Table of Contents')
Example #7
0
def main(server):
    """Attach /type/volume docs to their parent editions on ``server``.

    Groups each edition's volumes into its ``volumes`` list, sorted by
    volume_number, then bulk-saves the updated editions.
    """
    ol = OpenLibrary(server)
    ol.autologin()

    # '*': None expands all fields; the nested dict inlines the linked edition.
    volumes = ol.query({'type': '/type/volume', 'limit': False, '*': None, 'edition': {'*': None}})

    volumes = dict((v['key'], v) for v in volumes)
    editions = dict((v['edition']['key'], v['edition']) for v in volumes.values() if v['edition'])

    def make_volume(v):
        # Reduce a volume doc to the fields embedded inside the edition.
        d = {}
        v.pop('edition')  # NOTE: mutates v; safe because each v is processed once below
        v['type'] = {'key': '/type/volume'}
        for k in ['type', 'ia_id', 'volume_number']:
            if k in v:
                d[k] = v[k]
        return d

    for e in editions.values():
        e['volumes'] = []

    for v in volumes.values():
        if v.get('edition'):
            e = editions[v.get('edition')['key']]
            e['volumes'].append(make_volume(v))

    for e in editions.values():
        e['volumes'] = sorted(e['volumes'], key=lambda v: v['volume_number'])

    # Python 2 print statement.
    print 'linking volumes to %d editions' % len(editions)
    ol.save_many(editions.values(), 'link volumes')
def main():
    """Pull templates and macros matching the CLI key patterns to disk."""
    global options
    options, args = parse_options()
    ol = OpenLibrary(options.server)

    for pattern in args:
        for doc in marshal(ol.query({"key~": pattern, "*": None}, limit=1000)):
            # Anand: special care to ignore bad documents in the database.
            if "--duplicate" in doc['key']:
                continue

            doc_type = doc['type']['key']
            path = make_path(doc)
            if doc_type == '/type/template':
                write(path, get_value(doc, 'body'))
            elif doc_type == '/type/macro':
                write(path, get_value(doc, 'macro'))
            else:
                delete(path)
def main():
    """Sync docs matching the CLI key patterns from an OL server to disk.

    Templates are written from ``body``, macros from ``macro``; docs of
    any other type are passed to delete().
    """
    global options
    options, args = parse_options()
    ol = OpenLibrary(options.server)

    for pattern in args:
        docs = ol.query({"key~": pattern, "*": None}, limit=1000)
        for doc in marshal(docs):
            # Anand: special care to ignore bad documents in the database.
            if "--duplicate" in doc['key']:
                continue

            if doc['type']['key'] == '/type/template':
                write(make_path(doc), get_value(doc, 'body'))
            elif doc['type']['key'] == '/type/macro':
                write(make_path(doc), get_value(doc, 'macro'))
            else:
                delete(make_path(doc))
Example #10
0
def main():
    """Copy docs from a source OL instance (or disk directory) to a destination."""
    options, args = parse_args()

    # An http:// source/destination is a live OL server; anything else is
    # treated as a path on disk.
    if options.src.startswith("http://"):
        src = OpenLibrary(options.src)
    else:
        src = Disk(options.src)

    if options.dest.startswith("http://"):
        dest = OpenLibrary(options.dest)
        # Autologin when ~/.olrc has a section for the destination host,
        # otherwise fall back to the default dev admin credentials.
        section = "[%s]" % web.lstrips(options.dest, "http://").strip("/")
        if section in read_lines(os.path.expanduser("~/.olrc")):
            dest.autologin()
        else:
            dest.login("admin", "admin123")
    else:
        dest = Disk(options.dest)

    for list_key in options.lists:
        copy_list(src, dest, list_key, comment=options.comment)

    keys = args
    keys = list(expand(src, keys))

    copy(src, dest, keys, comment=options.comment, recursive=options.recursive)
Example #11
0
 def __init__(self, username, password):
   """Log a bot account into Open Library and set up helper state.

   Compiles pagination-cleanup regexes, loads formatdict.json, prepares
   an ASCII encoder, empty buffers/caches, and opens the bot's TSV log.
   """
   self.ol = OpenLibrary()
   self.ol.login(username, password)
   # Non-space char, whitespace, then ':' or ';' at end of string.
   self.pagreg = re.compile(r"[^\s]\s+[:;]$")
   # Trailing run of punctuation ([,.:;]) at end of string.
   self.emptypagreg = re.compile(r"[,.:;]+$")
   self.formatdict = simplejson.load(codecs.open("formatdict.json", "rb", "utf-8"))
   self.enc2 = codecs.getencoder("ascii")
   self.savebuffer = {}  # NOTE(review): populated elsewhere — not shown in this chunk
   self.badrecords = []
   self.aucache = {}  # author cache, judging by name — confirm against other methods
   self.wocache = {}  # work cache, judging by name — confirm against other methods
   #self.formatcache = NKCache("ol_books_formats", api_key = "cfdeaeda-4a22-4ae7-a2bf-1634da98fa1b")
   self.logfile = codecs.EncodedFile(open("vacuumbot-log.tsv", "ab"), "unicode_internal", "utf-8", "replace")
Example #12
0
def main():
    """Tag all docs matching the CLI key patterns with a plugin name.

    Usage: script.py <plugin-name> <key-pattern> [<key-pattern> ...]
    """
    ol = OpenLibrary()
    ol.autologin()

    plugin = sys.argv[1]

    all_docs = []
    for pattern in sys.argv[2:]:
        all_docs.extend(ol.query({"key~": pattern, "*": None}, limit=1000))

    for doc in all_docs:
        doc['plugin'] = plugin

    # print() instead of the original Python 2 print statement.
    print(ol.save_many(all_docs, comment="Marked as part of %s plugin." % plugin))
Example #13
0
def main():
    """Tag all docs matching the CLI key patterns with a plugin name.

    Usage: script.py <plugin-name> <key-pattern> [<key-pattern> ...]
    """
    ol = OpenLibrary()
    ol.autologin()

    plugin = sys.argv[1]

    all_docs = []

    for pattern in sys.argv[2:]:
        docs = ol.query({"key~": pattern, "*": None}, limit=1000)
        all_docs.extend(docs)

    for doc in all_docs:
        doc['plugin'] = plugin

    # Python 2 print statement; echoes the save_many result.
    print ol.save_many(all_docs, comment="Marked as part of %s plugin." % plugin)
def main(server):
    """Link each edition referenced by a scan record back to that record.

    Streams editions out of the scan-record query and bulk-saves them in
    batches of 1000.
    """
    ol = OpenLibrary(server)
    ol.autologin()

    scan_records = ol.query(type='/type/scan_record',
                            limit=False,
                            edition={'*': None})
    editions = (r['edition'] for r in scan_records)

    # process 1000 editions at a time.
    while True:
        chunk = take(1000, editions)
        if not chunk:
            break

        # Python 2 print statement.
        print 'linking %d editions' % len(chunk)

        for e in chunk:
            # Scan record keys mirror the edition key under /scan_record.
            e['scan_records'] = [{'key': '/scan_record' + e['key']}]

        ol.save_many(chunk, 'link scan records')
Example #15
0
def load():
    """Loads documents to http://0.0.0.0:8080

    Documents are saved in dependency order (referenced docs first);
    /type/* references are excluded from the dependency graph, i.e.
    assumed to exist already.
    """
    documents = {}
    # Read every JSON file under data/ into a key -> doc mapping.
    for f in find("data"):
        doc = simplejson.load(open(f))
        documents[doc['key']] = doc

    keys = topological_sort(documents.keys(),
                            get_children=lambda key: [
                                k for k in get_references(documents[key])
                                if not k.startswith("/type/")
                            ])

    from openlibrary.api import OpenLibrary
    ol = OpenLibrary("http://0.0.0.0:8080")
    ol.autologin()
    # Python 2 print statement; echoes the save_many result.
    print ol.save_many([documents[k] for k in keys],
                       comment="documents copied from openlibrary.org")
Example #16
0
def main(server):
    ol = OpenLibrary(server)
    ol.autologin()

    scan_records = ol.query(type='/type/scan_record', limit=False, edition={'*': None})
    editions = (r['edition'] for r in scan_records)

    # process 1000 editions at a time.
    while True:
        chunk = take(1000, editions)
        if not chunk:
            break

        print 'linking %d editions' % len(chunk)
        
        for e in chunk:
            e['scan_records'] = [{'key': '/scan_record' + e['key']}]
            
        ol.save_many(chunk, 'link scan records')
Example #17
0
        logstring += "\tPublisher %s not found.\n" % before

if os.path.exists("pubbot_lock.txt"):
    print "Bot already running. Exiting."
    exit()
i = open("pubbot_lock.txt", 'w')

t = asctime()
s = time.time()

try:
    global conn
    global c
    conn = psycopg2.connect('dbname=vandalism user=dmontalvo password=iawatchbot')
    c = conn.cursor()
    ol = OpenLibrary("http://openlibrary.org")
    ol.autologin()
    global logstring
    logstring = 'Started at: %s\n' % t
    c.execute('select * from pubqueue')
    queue = c.fetchall()

    for item in queue:
        master = item[0].decode('utf-8')
        x = 0
        titlecased = ''
        for letter in master:
            val = ord(letter)
            if (x == 0 or master[x-1] == ' ') and val >= 97 and val <= 122:
                titlecased += string.upper(letter)
            else:
Example #18
0
from openlibrary.catalog.merge.merge_marc import build_marc
from openlibrary.catalog.importer.db_read import get_mc, withKey
from openlibrary.catalog.marc.marc_subject import subjects_for_work
from openlibrary.api import OpenLibrary, unmarshal

from openlibrary.catalog.read_rc import read_rc

rc = read_rc()

marc_index = web.database(dbn='postgres', db='marc_index')
marc_index.printing = True

db_amazon = web.database(dbn='postgres', db='amazon')
db_amazon.printing = False

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

t0 = time()
t_prev = time()
rec_no = 0
chunk = 50
load_count = 0

re_edition_key = re.compile('^/(?:books|b)/(OL\d+M)$')

archive_id = sys.argv[1]

Example #19
0
def get_ol():
    """Return an OpenLibrary client authenticated via autologin()."""
    ol = OpenLibrary()
    ol.autologin()
    return ol
Example #20
0
from __future__ import print_function
import MySQLdb, datetime, re, sys
sys.path.append('/1/src/openlibrary')
from openlibrary.api import OpenLibrary, Reference
from pprint import pprint

import six


conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()

re_edition_key = re.compile('^/books/OL(\d+)M$')
re_work_key = re.compile('^/works/OL(\d+)W$')
ol = OpenLibrary('http://openlibrary.org/')
ol.login('EdwardBot', 'As1Wae9b')

re_iso_date = re.compile('^(\d{4})-\d\d-\d\d$')
re_end_year = re.compile('(\d{4})$')

def get_publish_year(d):
    """Return the 4-digit year from a publish-date string, or None.

    Accepts a full ISO date (YYYY-MM-DD) or a bare 4-digit year.
    """
    if not d:
        return None
    m = re_iso_date.match(d) or re_end_year.match(d)
    return int(m.group(1)) if m else None

{'lc_classifications': ['PZ7.H558 Ru'], 'dewey_number': ['[E]']}
Example #21
0
from openlibrary.catalog.marc.fast_parse import get_subfield_values, get_first_tag, get_tag_lines, get_subfields, BadDictionary
from openlibrary.catalog.utils.query import query_iter, set_staging, query
from openlibrary.catalog.utils import mk_norm
from openlibrary.catalog.read_rc import read_rc
from collections import defaultdict
from pprint import pprint, pformat
from openlibrary.catalog.utils.edit import fix_edition
from openlibrary.catalog.importer.db_read import get_mc
import urllib2
from openlibrary.api import OpenLibrary, Reference
from lxml import etree
from time import sleep, time

rc = read_rc()

ol = OpenLibrary("http://openlibrary.org")
ol.login('WorkBot', rc['WorkBot'])
fh_log = open('/1/edward/logs/WorkBot', 'a')

def write_log(cat, key, title):
    # Append a (timestamp-with-2dp, category, key, title) tuple entry to
    # the module-level WorkBot log file, using the Python 2
    # "print >> file" redirection syntax.
    print >> fh_log, (("%.2f" % time()), cat, key, title)

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon|etc)\.$')

re_ia_marc = re.compile('^(?:.*/)?([^/]+)_(marc\.xml|meta\.mrc)(:0:\d+)?$')

ns = '{http://www.loc.gov/MARC21/slim}'
ns_leader = ns + 'leader'
ns_data = ns + 'datafield'
Example #22
0
#!/usr/bin/python

import MySQLdb
import datetime
import re
import sys
sys.path.append('/1/src/openlibrary')
from openlibrary.api import OpenLibrary, Reference
from flask import Flask, render_template, request, flash, redirect, url_for, g
from collections import defaultdict
app = Flask(__name__)

re_edition_key = re.compile('^/books/OL(\d+)M$')

ol = OpenLibrary('http://openlibrary.org/')
ol.login('EdwardBot', 'As1Wae9b')

@app.before_request
def before_request():
    # Open a per-request connection to the local merge_editions MySQL db.
    g.db = MySQLdb.connect(db='merge_editions')

@app.after_request
def after_request(r):
    # Close the per-request db connection and pass the response through unchanged.
    g.db.close()
    return r

re_nonword = re.compile(r'\W', re.U)

rows = 200

app.secret_key = 'rt9%s#)5kid$!u*5_@*$f2f_%jq++nl3@d%=7f%v4&78^m4p7c'
Example #23
0
from __future__ import print_function
import MySQLdb
import re
from openlibrary.api import OpenLibrary, Reference
from collections import defaultdict

import six

re_edition_key = re.compile('^/books/OL(\d+)M$')
re_nonword = re.compile(r'\W', re.U)

conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()
cur2 = conn.cursor()

ol = OpenLibrary('http://openlibrary.org/')
ol.login('EdwardBot', 'As1Wae9b')

cur.execute(
    'select ia, editions, done from merge where done is null and unmerge_count=0'
)
for ia, ekeys, done in cur.fetchall():
    updates = []
    ekeys = [
        '/books/OL%dM' % x for x in sorted(
            int(re_edition_key.match(ekey).group(1))
            for ekey in ekeys.split(' '))
    ]
    print((ia, ekeys))
    min_ekey = ekeys[0]
    editions = [ol.get(ekey) for ekey in ekeys]
Example #24
0
from openlibrary.catalog.works.find_work_for_edition import find_matching_work
from openlibrary.catalog.marc import fast_parse, is_display_marc
from openlibrary.catalog.marc.parse import read_edition, NoTitle
from openlibrary.catalog.marc.marc_subject import subjects_for_work
from time import time, sleep
from openlibrary.api import OpenLibrary, unmarshal
from pprint import pprint
import argparse

parser = argparse.ArgumentParser(description="scribe loader")
parser.add_argument("--skip_hide_books", action="store_true")
parser.add_argument("--item_id")
args = parser.parse_args()

rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
ol.login("ImportBot", rc["ImportBot"])

db_amazon = web.database(dbn="postgres", db="amazon")
db_amazon.printing = False

db = web.database(dbn="mysql", host=rc["ia_db_host"], user=rc["ia_db_user"], passwd=rc["ia_db_pass"], db="archive")
db.printing = False

re_census = re.compile("^\d+(st|nd|rd|th)census")

re_edition_key = re.compile("^/(?:books|b)/(OL\d+M)$")


def read_short_title(title):
    # Normalize the title via catalog fast_parse and keep the first 25 chars.
    return str(fast_parse.normalize_str(title)[:25])
Example #25
0
from openlibrary.catalog.marc.fast_parse import get_subfield_values, get_first_tag, get_tag_lines, get_subfields, BadDictionary
from openlibrary.catalog.utils.query import query_iter, withKey
from openlibrary.catalog.utils import mk_norm
from openlibrary.catalog.read_rc import read_rc
from collections import defaultdict
from pprint import pprint, pformat
from openlibrary.catalog.utils.edit import fix_edition
from openlibrary.catalog.importer.db_read import get_mc
from urllib import urlopen
from openlibrary.api import OpenLibrary
from lxml import etree
from time import sleep, time, strftime
from openlibrary.catalog.marc.marc_subject import get_work_subjects, four_types
import simplejson as json

ol = OpenLibrary("http://openlibrary.org")

re_skip = re.compile(r'\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon|etc)\.$')
re_work_key = re.compile('^/works/OL(\d+)W$')
re_lang_key = re.compile('^/(?:l|languages)/([a-z]{3})$')
re_author_key = re.compile('^/(?:a|authors)/(OL\d+A)$')

re_ia_marc = re.compile('^(?:.*/)?([^/]+)_(marc\.xml|meta\.mrc)(:0:\d+)?$')

ns = '{http://www.loc.gov/MARC21/slim}'
ns_leader = ns + 'leader'
ns_data = ns + 'datafield'

def has_dot(s):
    """True when ``s`` ends in a '.' not accounted for by the re_skip
    pattern (single initials and common abbreviations)."""
    if not s.endswith('.'):
        return False
    return re_skip.search(s) is None
Example #26
0
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.api import OpenLibrary, unmarshal
from openlibrary.catalog.read_rc import read_rc

rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

to_fix = []
num = 0
for line in open('no_index'):
    for e in query({'type': '/type/edition', 'title': None, 'ocaid': line[:-1]}):
        num += 1
        print num, e['key'], `e['title']`, line[:-1]
        e2 = ol.get(e['key'])
        del e2['ocaid']
        to_fix.append(e2)

ol.save_many(to_fix, 'remove link')
Example #27
0
import MySQLdb, datetime, re, sys
sys.path.append('/1/src/openlibrary')
from openlibrary.api import OpenLibrary, Reference
from collections import defaultdict
from pprint import pprint

re_edition_key = re.compile('^/books/OL(\d+)M$')
re_nonword = re.compile(r'\W', re.U)
re_edition = re.compile(' ed edition$')

ol = OpenLibrary('http://openlibrary.org/')

conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()

skip = 'guineapigscomple00elwa'
skip = None
total = 5601
cur.execute("select ia, editions, done, unmerge_count from merge where unmerge_count != 0") # and ia='hantayo00hillrich'")
unmerge_field_counts = defaultdict(int)
num = 0
for ia, ekeys, done, unmerge_count in cur.fetchall():
#    if unmerge_count == 0:
#        continue
    num += 1
    if num % 100 == 0:
        print '%d/%d %.2f%%' % (num, total, ((float(num) * 100) / total)), ia
    if skip:
        if skip == ia:
            skip = None
        continue
Example #28
0
def main(site, date=None):
    """Trigger admin stats generation on ``site`` for ``date``.

    :param site: URL of the OL server
    :param date: ISO date string; defaults to today
    """
    ol = OpenLibrary(site)
    ol.autologin("StatsBot")

    today = date or datetime.date.today().isoformat()
    # Python 2 print statement; POSTs to the admin stats endpoint and
    # echoes the response body.
    print ol._request("/admin/stats/" + today, method='POST', data="").read()
Example #29
0
#!/usr/bin/python

from __future__ import print_function
from openlibrary.api import OpenLibrary
from subprocess import Popen, PIPE
import MySQLdb

ia_db_host = 'dbmeta.us.archive.org'
ia_db_user = '******'
ia_db_pass = Popen(["/opt/.petabox/dbserver"], stdout=PIPE).communicate()[0]

ol = OpenLibrary('http://openlibrary.org/')

local_db = MySQLdb.connect(db='merge_editions')
local_cur = conn.cursor()

archive_db = MySQLdb.connect(host=ia_db_host, user=ia_db_user, \
        passwd=ia_db_pass, db='archive')
archive_cur = conn.cursor()

fields = ['identifier', 'updated', 'collection']
sql_fields = ', '.join(fields)

archive_cur.execute("select " + sql_fields + \
    " from metadata" + \
    " where scanner is not null and mediatype='texts'" + \
        " and (not curatestate='dark' or curatestate is null)" + \
        " and collection is not null and boxid is not null and identifier not like 'zdanh_test%' and scandate is not null " + \
        " order by updated")

for num, (ia, updated, collection) in enumerate(cur.fetchall()):
Example #30
0
from __future__ import print_function
import MySQLdb
import re
import sys
sys.path.append('/1/src/openlibrary')
from openlibrary.api import OpenLibrary, Reference
from collections import defaultdict

re_edition_key = re.compile('^/books/OL(\d+)M$')
re_nonword = re.compile(r'\W', re.U)
re_edition = re.compile(' ed edition$')

ol = OpenLibrary('http://openlibrary.org/')

conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()

skip = 'guineapigscomple00elwa'
skip = None
total = 5601
cur.execute(
    "select ia, editions, done, unmerge_count from merge where unmerge_count != 0"
)  # and ia='hantayo00hillrich'")
unmerge_field_counts = defaultdict(int)
num = 0
for ia, ekeys, done, unmerge_count in cur.fetchall():
    #    if unmerge_count == 0:
    #        continue
    num += 1
    if num % 100 == 0:
        print('%d/%d %.2f%%' % (num, total, ((float(num) * 100) / total)), ia)
Example #31
0
from subprocess import Popen, PIPE
import argparse

parser = argparse.ArgumentParser(description='scribe loader')
parser.add_argument('--skip_hide_books', action='store_true')
parser.add_argument('--item_id')
parser.add_argument('--config', default='openlibrary.yml')
args = parser.parse_args()

config_file = args.config
config.load(config_file)
import_bot_password = config.runtime_config['load_scribe']['import_bot_password']
# '/1/var/log/openlibrary/load_scribe'
load_scribe_log = config.runtime_config['load_scribe']['log']

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', import_bot_password)

password = Popen(["/opt/.petabox/dbserver"], stdout=PIPE).communicate()[0]
db = web.database(dbn='mysql', host='dbmeta.us.archive.org', user='******', \
        passwd=password, db='archive')
db.printing = False

re_census = re.compile('^\d+(st|nd|rd|th)census')

re_edition_key = re.compile('^/(?:books|b)/(OL\d+M)$')

def read_short_title(title):
    # Normalize the title via catalog fast_parse and keep the first 25 chars.
    return str(fast_parse.normalize_str(title)[:25])

def make_index_fields(rec):
from openlibrary.api import OpenLibrary

ol = OpenLibrary('http://openlibrary.org/')

data = eval(open('update').read())

done = []

for author in data:
    print(author['key'], author['name'])
    akey = author['key']
    a = ol.get(akey)
    if not a.get('bio') and author['bio']:
        a['bio'] = author['bio']
        ol.save(akey, a, 'Add author bio from Smashwords.')

    for edition in author['editions']:
        #        wkey = ol.new({
        #            'type': '/type/work',
        #            'title': edition['title'],
        #            'authors': [{'author': {'key': akey}}],
        #            'description': edition['description'],
        #            'subjects': ['Lending library'],
        #        })
        #
        wkey = edition['work']
        w = ol.get(wkey)
        assert edition['description']
        if not w.get('description'):
            w['description'] = edition['description']
        if 'Lending library' not in w.get('subjects', []):
Example #33
0
from openlibrary.catalog.marc.fast_parse import get_subfield_values, get_first_tag, get_tag_lines, get_subfields, BadDictionary
from openlibrary.catalog.utils.query import query_iter, withKey
from openlibrary.catalog.utils import mk_norm
from openlibrary.catalog.read_rc import read_rc
from collections import defaultdict
from pprint import pprint, pformat
from openlibrary.catalog.utils.edit import fix_edition
from openlibrary.catalog.importer.db_read import get_mc
from urllib import urlopen
from openlibrary.api import OpenLibrary
from lxml import etree
from time import sleep, time, strftime
from openlibrary.catalog.marc.marc_subject import get_work_subjects, four_types
import simplejson as json

ol = OpenLibrary("http://openlibrary.org")

re_skip = re.compile(
    r'\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon|etc)\.$')
re_work_key = re.compile('^/works/OL(\d+)W$')
re_lang_key = re.compile('^/(?:l|languages)/([a-z]{3})$')
re_author_key = re.compile('^/(?:a|authors)/(OL\d+A)$')

re_ia_marc = re.compile('^(?:.*/)?([^/]+)_(marc\.xml|meta\.mrc)(:0:\d+)?$')

ns = '{http://www.loc.gov/MARC21/slim}'
ns_leader = ns + 'leader'
ns_data = ns + 'datafield'


def has_dot(s):
from __future__ import print_function
import sys
import codecs
from openlibrary.catalog.utils.query import query_iter, set_staging, query
from openlibrary.api import OpenLibrary, Reference
from openlibrary.catalog.read_rc import read_rc
from time import sleep

set_staging(True)
rc = read_rc()

ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

work_q = {
    'type': '/type/work',
    'authors': None,
    'title': None,
}

queue = []

for w in query_iter(work_q):
    if not w.get('authors'):
        print('no authors')
        continue
    if any(isinstance(a, dict) and 'author' in a for a in w['authors']):
        continue
    print(len(queue), w['key'], w['title']) # , ol.get(w['authors'][0]['key'])['name']
    full = ol.get(w['key'])
Example #35
0
from __future__ import print_function
import MySQLdb, datetime, re, sys
from openlibrary.api import OpenLibrary, Reference
from collections import defaultdict

import six


re_edition_key = re.compile('^/books/OL(\d+)M$')
re_nonword = re.compile(r'\W', re.U)

conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()
cur2 = conn.cursor()

ol = OpenLibrary('http://openlibrary.org/')
ol.login('EdwardBot', 'As1Wae9b')

cur.execute('select ia, editions, done from merge where done is null and unmerge_count=0')
for ia, ekeys, done in cur.fetchall():
    updates = []
    ekeys = ['/books/OL%dM' % x for x in sorted(int(re_edition_key.match(ekey).group(1)) for ekey in ekeys.split(' '))]
    print((ia, ekeys))
    min_ekey = ekeys[0]
    editions = [ol.get(ekey) for ekey in ekeys]
    master = editions[0]

    for e in editions:
        for k in 'classifications', 'identifiers', 'table_of_contents':
            if k in e and not e[k]:
                del e[k]
from __future__ import print_function
import sys
import codecs
from openlibrary.catalog.utils.query import query_iter, set_staging, query
from openlibrary.api import OpenLibrary, Reference
from openlibrary.catalog.read_rc import read_rc
from time import sleep

set_staging(True)
rc = read_rc()

ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

work_q = {
    'type': '/type/work',
    'authors': None,
    'title': None,
}

queue = []

for w in query_iter(work_q):
    if not w.get('authors'):
        print('no authors')
        continue
    if any(isinstance(a, dict) and 'author' in a for a in w['authors']):
        continue
    print(len(queue), w['key'],
          w['title'])  # , ol.get(w['authors'][0]['key'])['name']
from openlibrary.catalog.marc.parse_xml import parse, xml_rec
from openlibrary.catalog.marc import read_xml
from openlibrary.api import OpenLibrary, unmarshal
from openlibrary.catalog.marc import fast_parse
from openlibrary.catalog.importer import pool
from openlibrary.catalog.merge.merge_marc import build_marc
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.merge import try_merge
import os, re, sys

ol = OpenLibrary("http://openlibrary.org")

re_census = re.compile('^\d+(st|nd|rd|th)census')
re_edition_key = re.compile('^/(?:b|books)/(OL\d+M)$')

scan_dir = '/2/edward/20century/scans/'

def read_short_title(title):
    # Normalize the title via catalog fast_parse and keep the first 25 chars.
    return str(fast_parse.normalize_str(title)[:25])

def make_index_fields(rec):
    """Build the match-index fields for a parsed MARC record dict.

    Identifier lists under 'lccn', 'oclc' and 'isbn' are copied through
    unchanged; 'full_title' becomes a single-element 'title' list of the
    shortened title. All other keys are ignored.
    """
    fields = {}
    # .items() instead of the Python-2-only .iteritems(); identical
    # behavior for this read-only iteration, and Python 3 compatible.
    for k, v in rec.items():
        if k in ('lccn', 'oclc', 'isbn'):
            fields[k] = v
            continue
        if k == 'full_title':
            fields['title'] = [read_short_title(v)]
    return fields

# no maps or cartograph
"""Sample script to update OL records in bulk.
"""

from openlibrary.api import OpenLibrary
import web
import sys

ol = OpenLibrary()


def read_mapping(filename, chunksize=1000):
    """Reads OLID, OCLC_NUMBER mapping from given file.

    Assumes that the file contains one OLID, OCLC_NUMBER per line, separated by tab.

    :param filename: path to the tab-separated mapping file
    :param chunksize: unused; kept for backward compatibility
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original leaked the handle until GC).
    with open(filename) as f:
        for line in f:
            olid, oclc = line.strip().split("\t")
            yield olid, oclc


def add_identifier(doc, id_name, id_value):
    """Record ``id_value`` on ``doc`` under ``id_name``, skipping duplicates.

    Well-known identifier types live in top-level list fields; all others
    are nested under doc["identifiers"].
    """
    top_level = ("isbn_10", "isbn_13", "oclc_numbers", "lccn")
    if id_name in top_level:
        ids = doc.setdefault(id_name, [])
    else:
        ids = doc.setdefault("identifiers", {}).setdefault(id_name, [])

    if id_value not in ids:
        ids.append(id_value)


def has_identifier(doc, id_name, id_value):
Example #39
0
from openlibrary.catalog.merge.merge_marc import *
from openlibrary.catalog.read_rc import read_rc
import openlibrary.catalog.merge.amazon as amazon
from openlibrary.catalog.get_ia import *
from openlibrary.catalog.importer.db_read import withKey, get_mc
from openlibrary.api import OpenLibrary, Reference
import openlibrary.catalog.marc.fast_parse as fast_parse
import xml.parsers.expat
import web, sys
from time import sleep

import six

rc = read_rc()

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

ia_db = web.database(dbn='mysql',
                     db='archive',
                     user=rc['ia_db_user'],
                     pw=rc['ia_db_pass'],
                     host=rc['ia_db_host'])
ia_db.printing = False

re_meta_marc = re.compile('([^/]+)_(meta|marc)\.(mrc|xml)')

threshold = 875
amazon.set_isbn_match(225)

Example #40
0
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.api import OpenLibrary, unmarshal
from openlibrary.catalog.read_rc import read_rc

rc = read_rc()  # local credentials file for bot accounts
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

to_fix = []
num = 0
# Each line of 'no_index' is an archive.org identifier; line[:-1] drops the
# trailing newline.
for line in open('no_index'):
    # Find editions linked to this ocaid. 'title': None presumably asks the
    # query API to include the title field in results -- confirm against the
    # query helper.
    for e in query({
            'type': '/type/edition',
            'title': None,
            'ocaid': line[:-1]
    }):
        num += 1
        print(num, e['key'], repr(e['title']), line[:-1])
        # Fetch the full record and drop its archive.org link.
        e2 = ol.get(e['key'])
        del e2['ocaid']
        to_fix.append(e2)

# One bulk save with an edit comment, instead of per-record saves.
ol.save_many(to_fix, 'remove link')
                  default="openlibrary.org:80",
                  help="The openlibrary API host")
    op.add_option("-k", "--nyt-api-key", dest="nyt_api_key", 
                  help="API key for use with the nyt bestsellers api")
    op.add_option("-u", "--bot-username", dest="username", 
                  default="nyt_bestsellers_bot",
                  help="The bot username for accessing the Open Library API")
    op.add_option("-p", "--bot-password", dest="password", 
                  help="The bot password for accessing the Open Library API")

    options, _ = op.parse_args()

    global NYT_API_KEY
    NYT_API_KEY = options.nyt_api_key
    global OL
    OL = OpenLibrary("http://%s" % options.openlibrary_host)
    OL.login(options.username, options.password)
    results = collections.defaultdict(list)
    for ln in get_nyt_bestseller_list_names():
        LOG("INFO", "processing list %s" % ln)
        for i, book in enumerate(load_nyt_bestseller_list(ln)):
            ol_keys = reconcile_book(book)
            if not ol_keys:
                LOG("WARN", "unable to reconcile '%s' by %s - no OL book found" % (
                    book['book_details'][0]['title'], book['book_details'][0]['author']
                ))
            if not (key for key in ol_keys if key.startswith("/works/")):
                LOG("WARN", "only editions for '%s' by %s: %s" % (
                    book['book_details'][0]['title'], book['book_details'][0]['author'], ol_keys
                ))
            results[ln].append({
Example #42
0
#!/usr/bin/python

import MySQLdb, datetime, re, sys
sys.path.append('/1/src/openlibrary')
from openlibrary.api import OpenLibrary, Reference
from flask import Flask, render_template, request, flash, redirect, url_for, g
from collections import defaultdict
app = Flask(__name__)

# Edition keys look like '/books/OL123M'; group 1 captures the number.
re_edition_key = re.compile('^/books/OL(\d+)M$')

ol = OpenLibrary('http://openlibrary.org/')
# SECURITY(review): bot credentials hardcoded in source.
ol.login('EdwardBot', 'As1Wae9b')


@app.before_request
def before_request():
    # Open a fresh connection to the local 'merge_editions' MySQL database
    # for each incoming request; handlers access it via flask.g.
    g.db = MySQLdb.connect(db='merge_editions')


@app.after_request
def after_request(r):
    # Close the per-request database connection, then return the response
    # unchanged.
    g.db.close()
    return r


# Matches any single non-word character (unicode-aware).
re_nonword = re.compile(r'\W', re.U)

rows = 200  # page size for listings

# SECURITY(review): Flask session secret key hardcoded in source.
app.secret_key = 'rt9%s#)5kid$!u*5_@*$f2f_%jq++nl3@d%=7f%v4&78^m4p7c'
Example #43
0
def get_ol(servername=None):
    """Return an OpenLibrary client for *servername*, authenticated via autologin.

    When servername is None the client falls back to its own default base URL.
    """
    client = OpenLibrary(base_url=servername)
    client.autologin()
    return client
Example #44
0
# IdentifierBot
# by Daniel Montalvo

# Input: LibraryThing -> Open Library id mapping, tab-separated, '|'-quoted.
csvfile = 'LibraryThing_to_OpenLibrary.csv'
batch_size = 500

import traceback
import csv
import string
import _init_path
import sys

sys.path.append('/petabox/sw/lib/python')
from openlibrary.api import OpenLibrary, marshal

ol = OpenLibrary("http://openlibrary.org")
reader = csv.reader(open(csvfile), delimiter='\t', quotechar='|')
# NOTE(review): output handle deliberately left open for the module lifetime.
f = open('authors.txt', 'w')


def fix_toc(doc):
    doc = marshal(doc)

    def f(d):
        """function to fix one toc entry."""
        if d.get('type') == '/type/text':
            return dict(title=d['value'])
        else:
            return d

    toc = doc.get('table_of_contents')
Example #45
0
#!/usr/bin/python
from __future__ import print_function
from subprocess import Popen, PIPE
from openlibrary.utils.ia import find_item, FindItemError
from openlibrary.api import OpenLibrary
from openlibrary.catalog.read_rc import read_rc
from openlibrary.catalog.get_ia import marc_formats, get_marc_ia_data
from openlibrary.catalog.marc import is_display_marc
from time import sleep, time

import MySQLdb
import re, urllib2, httplib, json, codecs, socket, sys

ol = OpenLibrary('http://openlibrary.org/')

rc = read_rc()  # local credentials/config

base_url = 'http://openlibrary.org'

# archive.org metadata database; the password is fetched at runtime from the
# petabox helper binary (username appears masked by the scraper).
ia_db_host = 'dbmeta.us.archive.org'
ia_db_user = '******'
ia_db_pass = Popen(["/opt/.petabox/dbserver"], stdout=PIPE).communicate()[0]

# Identifiers like '12thcensus' (US census scans).
re_census = re.compile('^\d+(st|nd|rd|th)census')

# Columns fetched from the IA metadata table.
fields = [
    'identifier', 'contributor', 'updated', 'noindex', 'collection', 'format',
    'boxid'
]
sql_fields = ', '.join(fields)
csvfile = 'LibraryThing_to_OpenLibrary.csv'
db = 'ids.sqlite'

import csv
import string
import sqlite3
import _init_path
from openlibrary.api import OpenLibrary

# NOTE(review): 'ol' is re-assigned here -- looks like two scripts were
# concatenated; the second assignment supersedes the first.
ol = OpenLibrary('http://openlibrary.org/')

# Retry autologin up to 5 times before carrying on. The original used a bare
# 'except:', which also swallows KeyboardInterrupt/SystemExit, and a Python 2
# print statement; 'except Exception' plus a parenthesized print preserve the
# retry behaviour while remaining valid on Python 3.
# NOTE(review): after 5 failed attempts execution continues unauthenticated.
for attempt in range(5):
    try:
        ol.autologin()
        break
    except Exception:
        print('ol.autologin() error; retrying')
conn = sqlite3.connect(db)
c = conn.cursor()
reader = csv.reader(open(csvfile), delimiter='\t', quotechar='|')
for row in reader:
    ltid = row[0]
    olid = row[1]
    key = '/books' + olid[olid.rindex('/'):len(olid)]
    c.execute('select * from ids where key = ?', (key, ))
    x = c.fetchone()
    if x != None:
        continue
    print 'Trying to get key: %r' % key
    for attempt in range(5):
        try:
Example #47
0
    op.add_option("-u",
                  "--bot-username",
                  dest="username",
                  default="nyt_bestsellers_bot",
                  help="The bot username for accessing the Open Library API")
    op.add_option("-p",
                  "--bot-password",
                  dest="password",
                  help="The bot password for accessing the Open Library API")

    options, _ = op.parse_args()

    global NYT_API_KEY
    NYT_API_KEY = options.nyt_api_key
    global OL
    OL = OpenLibrary("http://%s" % options.openlibrary_host)
    OL.login(options.username, options.password)
    results = collections.defaultdict(list)
    for ln in get_nyt_bestseller_list_names():
        LOG("INFO", "processing list %s" % ln)
        for i, book in enumerate(load_nyt_bestseller_list(ln)):
            ol_keys = reconcile_book(book)
            if not ol_keys:
                LOG(
                    "WARN",
                    "unable to reconcile '%s' by %s - no OL book found" %
                    (book['book_details'][0]['title'],
                     book['book_details'][0]['author']))
            if not (key for key in ol_keys if key.startswith("/works/")):
                LOG(
                    "WARN", "only editions for '%s' by %s: %s" %
from __future__ import print_function
# try and find an existing work for a book

from openlibrary.api import OpenLibrary
from openlibrary.catalog.utils import mk_norm
import sys
from time import time

# Client used by the work-matching helpers below.
ol = OpenLibrary("http://openlibrary.org")


def find_matching_work(e):
    norm_title = mk_norm(e['title'])

    seen = set()
    for akey in e['authors']:
        q = {
            'type': '/type/work',
            'authors': {
                'author': {
                    'key': akey
                }
            },
            'limit': 0,
            'title': None,
        }
        t0 = time()
        work_keys = list(ol.query(q))
        t1 = time() - t0
        print('time to find books by author: %.1f seconds' % t1)
        for w in work_keys:
Example #49
0
#!/usr/bin/python
from subprocess import Popen, PIPE
from openlibrary.utils.ia import find_item, FindItemError
from openlibrary.api import OpenLibrary
from openlibrary.catalog.read_rc import read_rc
from openlibrary.catalog.get_ia import marc_formats, get_marc_ia_data
from openlibrary.catalog.marc import is_display_marc
from time import sleep, time
from pprint import pprint

import MySQLdb
import re, urllib2, httplib, json, codecs, socket, sys

ol = OpenLibrary('http://openlibrary.org/')

rc = read_rc()  # local credentials/config

base_url = 'http://openlibrary.org'

# archive.org metadata database; password fetched from the petabox helper
# binary at runtime (username appears masked by the scraper).
ia_db_host = 'dbmeta.us.archive.org'
ia_db_user = '******'
ia_db_pass = Popen(["/opt/.petabox/dbserver"], stdout=PIPE).communicate()[0]

# Identifiers like '12thcensus' (US census scans).
re_census = re.compile('^\d+(st|nd|rd|th)census')

# Columns fetched from the IA metadata table.
fields = ['identifier', 'contributor', 'updated', 'noindex', 'collection', 'format', 'boxid']
sql_fields = ', '.join(fields)

# Resume point: last processed identifier, one per line; [:-1] strips newline.
scanned_start = open('scanned_start').readline()[:-1]

# Collections whose 'noindex' flag should be ignored.
ignore_noindex = set(['printdisabled', 'lendinglibrary', 'inlibrary'])
Example #50
0
import argparse

parser = argparse.ArgumentParser(description='scribe loader')
parser.add_argument('--skip_hide_books', action='store_true')
parser.add_argument('--item_id')
parser.add_argument('--config', default='openlibrary.yml')
args = parser.parse_args()

config_file = args.config
config.load(config_file)
# Bot password comes from the runtime config file, not from source.
import_bot_password = config.runtime_config['load_scribe'][
    'import_bot_password']
# '/1/var/log/openlibrary/load_scribe'
load_scribe_log = config.runtime_config['load_scribe']['log']

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', import_bot_password)

# archive.org metadata DB; password fetched from the petabox helper binary.
password = Popen(["/opt/.petabox/dbserver"], stdout=PIPE).communicate()[0]
db = web.database(dbn='mysql', host='dbmeta.us.archive.org', user='******', \
        passwd=password, db='archive')
db.printing = False  # suppress query echoing

# Identifiers like '12thcensus' (US census scans).
re_census = re.compile('^\d+(st|nd|rd|th)census')

# Accepts both current (/books/OL..M) and legacy (/b/OL..M) edition keys.
re_edition_key = re.compile('^/(?:books|b)/(OL\d+M)$')


def read_short_title(title):
    """Return the MARC-normalized form of *title*, truncated to 25 chars, as str."""
    normalized = fast_parse.normalize_str(title)
    return str(normalized[:25])
Example #51
0
# IdentifierBot
# by Daniel Montalvo

# Input: LibraryThing -> Open Library id mapping, tab-separated, '|'-quoted.
csvfile = 'LibraryThing_to_OpenLibrary.csv'
batch_size = 500

import traceback
import csv
import string
import _init_path
import sys

sys.path.append('/petabox/sw/lib/python')
from openlibrary.api import OpenLibrary, marshal

ol = OpenLibrary("http://openlibrary.org")
reader = csv.reader(open(csvfile), delimiter='\t', quotechar='|')
# NOTE(review): output handle deliberately left open for the module lifetime.
f = open('authors.txt', 'w')

def fix_toc(doc):
     doc = marshal(doc)

     def f(d):
         """function to fix one toc entry."""
         if d.get('type') == '/type/text':
             return dict(title=d['value'])
         else:
             return d

     toc = doc.get('table_of_contents')
     if toc:
Example #52
0
parser.add_argument('--no_author_updates', action='store_true')
parser.add_argument('--just_consider_authors', action='store_true')
parser.add_argument('--limit', default=None)

args = parser.parse_args()
handle_author_merge = args.handle_author_merge
only_author_merge = args.only_author_merge
skip_author_merge = args.skip_author_merge

# --only_author_merge implies handling author merges.
if only_author_merge:
    handle_author_merge = True

# Import the works-merging helpers only when actually needed.
if handle_author_merge:
    from openlibrary.catalog.works.find_works import find_title_redirects, find_works, get_books, books_query, update_works

ol = OpenLibrary("http://" + args.server)
set_query_host(args.server)
done_login = False  # login is deferred until the first write

config_file = args.config
config.load(config_file)

# Infobase edit-log endpoint used to tail site changes.
base = 'http://%s/openlibrary.org/log/' % config.runtime_config[
    'infobase_server']

# Case-insensitive user filters.
skip_user = set(u.lower() for u in args.skip_user)
only_user = set(u.lower() for u in args.only_user)

# NOTE(review): Python 2 print statement; also exits with status 0 despite
# being an error path -- confirm intended.
if 'state_dir' not in config.runtime_config:
    print 'state_dir missing from ' + config_file
    sys.exit(0)
Example #53
0
import csv, httplib, sys, codecs, re
from openlibrary.api import OpenLibrary
from pprint import pprint, pformat

# Emit UTF-8 on stdout (Python 2 idiom).
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

h1 = httplib.HTTPConnection('www.archive.org')
h1.connect()

ol = OpenLibrary('http://openlibrary.org/')
#ol.login('EdwardBot', 'As1Wae9b')

# Smashwords export dumps; the second assignment supersedes the first.
input_file = '/home/edward/Documents/smashwords_ia_20110325.csv'
input_file = '/home/edward/Documents/smashwords_ia_20110325-extended-20110406.csv'

#headings = ['Distributor', 'Author Name', 'Author Bio', 'Publisher', 'SWID',
#    'Book Title', 'Price', 'Short Book Description', 'Long Book Description',
#    'ISBN', 'BISAC I', 'BISAC II', 'Where to buy']

# ['Distributor', 'Author Name', 'Author Bio', 'Publisher', 'SWID',
# 'Book Title', 'Pub. Date @ Smashwords', 'Price', 'Short Book Description',
# 'Long Book Description', 'ISBN', 'BISAC I', 'BISAC II', 'Word Count (Approx.)',
# 'Where to buy']

# Accumulators for de-duplication while scanning the CSV.
authors = {}
titles = set()
isbn = set()

# Canonicalize known author-name variants.
name_map = {
    'JA Konrath': 'J. A. Konrath'
}
Example #54
0
import MySQLdb, datetime, re, sys
sys.path.append('/1/src/openlibrary')
from openlibrary.api import OpenLibrary, Reference
from pprint import pprint

conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()

# Capture the numeric part of edition (/books/OL..M) and work (/works/OL..W) keys.
re_edition_key = re.compile('^/books/OL(\d+)M$')
re_work_key = re.compile('^/works/OL(\d+)W$')
ol = OpenLibrary('http://openlibrary.org/')
# SECURITY(review): bot credentials hardcoded in source.
ol.login('EdwardBot', 'As1Wae9b')

# Patterns used to pull a 4-digit year out of publish-date strings.
re_iso_date = re.compile('^(\d{4})-\d\d-\d\d$')
re_end_year = re.compile('(\d{4})$')


def get_publish_year(d):
    """Extract a 4-digit year from a publish-date string.

    Tries a full ISO date ('YYYY-MM-DD') first, then a bare 4-digit year
    (both matched from the start of the string). Returns None for empty
    input or when neither pattern matches.
    """
    if not d:
        return
    for pattern in (re_iso_date, re_end_year):
        m = pattern.match(d)
        if m:
            return int(m.group(1))


{'lc_classifications': ['PZ7.H558 Ru'], 'dewey_number': ['[E]']}

Example #55
0
from openlibrary.catalog.utils import cmp, mk_norm
from openlibrary.catalog.read_rc import read_rc
from collections import defaultdict
from pprint import pformat
from openlibrary.catalog.utils.edit import fix_edition
from openlibrary.catalog.importer.db_read import get_mc
import urllib2
from openlibrary.api import OpenLibrary, Reference
from lxml import etree
from time import sleep, time

import six

rc = read_rc()  # local rc file holding bot credentials

ol = OpenLibrary("http://openlibrary.org")
ol.login('WorkBot', rc['WorkBot'])


def write_log(cat, key, title):
    """Append a timestamped (category, key, title) record to the shared
    log file handle fh_log, flushing immediately so the log survives crashes."""
    timestamp = "%.2f" % time()
    print((timestamp, cat, key, title), file=fh_log)
    fh_log.flush()


# Emit UTF-8 on stdout (Python 2 idiom).
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
# BUG FIX: the pattern was written as a plain (non-raw) string, where '\b'
# is the backspace character (0x08), not the regex word-boundary
# metacharacter -- so the pattern could only match after a literal backspace
# and never matched an honorific/initial as intended. A raw string keeps
# '\b' as the word boundary.
re_skip = re.compile(
    r'\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon|etc)\.$')

# Matches IA MARC filenames like 'item_marc.xml' or 'item_meta.mrc',
# optionally prefixed by a path and suffixed with a ':0:<length>' byte range.
re_ia_marc = re.compile('^(?:.*/)?([^/]+)_(marc\.xml|meta\.mrc)(:0:\d+)?$')

# MARC21 slim XML namespace, in ElementTree '{uri}tag' form.
ns = '{http://www.loc.gov/MARC21/slim}'
from catalog.importer.db_read import get_mc, withKey
from openlibrary.api import OpenLibrary

from catalog.read_rc import read_rc

import six

rc = read_rc()  # local credentials/config

# Local Postgres databases used during import.
marc_index = web.database(dbn='postgres', db='marc_index')
marc_index.printing = False

db_amazon = web.database(dbn='postgres', db='amazon')
db_amazon.printing = False

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

# Emit UTF-8 on stdout (Python 2 idiom).
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Timing/progress counters for batch loading.
t0 = time()
t_prev = time()
rec_no = 0
chunk = 50
load_count = 0

# archive.org item id supplied on the command line.
archive_id = sys.argv[1]

def percent(a, b):
    """Return *a* as a percentage of *b* (float division; raises
    ZeroDivisionError when b is 0)."""
    return a * 100.0 / b
Example #57
0
from openlibrary.catalog.get_ia import files, read_marc_file
from openlibrary.catalog.merge.merge_marc import build_marc
from openlibrary.catalog.importer.db_read import get_mc, withKey
from openlibrary.api import OpenLibrary, unmarshal

from openlibrary.catalog.read_rc import read_rc

rc = read_rc()  # local credentials/config

# Local Postgres databases used during import.
marc_index = web.database(dbn='postgres', db='marc_index')
marc_index.printing = False

db_amazon = web.database(dbn='postgres', db='amazon')
db_amazon.printing = False

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

# Emit UTF-8 on stdout (Python 2 idiom).
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Timing/progress counters for batch loading.
t0 = time()
t_prev = time()
rec_no = 0
chunk = 50
load_count = 0

# archive.org item id supplied on the command line.
archive_id = sys.argv[1]

def get_with_retry(key):
    for i in range(3):
        try: