Esempio n. 1
0
def main(
        keys: list[str],
        src="http://openlibrary.org/",
        dest="http://localhost:8080",
        comment="",
        recursive=True,
        editions=True,
        lists: list[str] = None,
        search: str = None,
        search_limit: int = 10,
):
    """
    Script to copy docs from one OL instance to another.
    Typically used to copy templates, macros, css and js from
    openlibrary.org to dev instance. paths can end with wildcards.

    USAGE:
        # Copy all templates
        ./scripts/copydocs.py --src http://openlibrary.org /templates/*
        # Copy specific records
        ./scripts/copydocs.py /authors/OL113592A /works/OL1098727W?v=2
        # Copy search results
        ./scripts/copydocs.py --search "publisher:librivox" --search-limit 10


    :param keys: Keys of the docs to copy; each is normalised to start with
        a single "/" before being expanded. The list passed in is not modified.
    :param src: URL of the source open library server
    :param dest: URL of the destination open library server
    :param comment: Comment passed along to the list copy and doc copy calls
    :param recursive: Recursively fetch all the referred docs
    :param editions: Also fetch all the editions of works
    :param lists: Copy docs from list(s)
    :param search: Run a search on open library and copy docs from the results
    :param search_limit: Passed as ``limit`` to the search
    """

    # Mypy doesn't handle union-ing types across if statements -_-
    # https://github.com/python/mypy/issues/6233
    src_ol: Union[Disk, OpenLibrary] = (
        OpenLibrary(src) if src.startswith("http://") else Disk(src))
    dest_ol: Union[Disk, OpenLibrary] = (
        OpenLibrary(dest) if dest.startswith("http://") else Disk(dest))

    if isinstance(dest_ol, OpenLibrary):
        # Use the stored login when ~/.olrc has a "[host]" section for the
        # destination; otherwise fall back to the hard-coded admin account.
        section = "[%s]" % web.lstrips(dest, "http://").strip("/")
        if section in read_lines(os.path.expanduser("~/.olrc")):
            dest_ol.autologin()
        else:
            dest_ol.login("admin", "admin123")

    for list_key in (lists or []):
        copy_list(src_ol, dest_ol, list_key, comment=comment)

    if search:
        assert isinstance(src_ol, OpenLibrary), "Search only works with OL src"
        found = src_ol.search(search, limit=search_limit, fields=['key'])
        # Build a new list instead of ``keys += ...`` so the caller's list
        # is never mutated as a side effect.
        keys = [*keys, *(doc['key'] for doc in found['docs'])]

    keys = list(expand(src_ol, ('/' + k.lstrip('/') for k in keys)))

    copy(src_ol, dest_ol, keys, comment=comment, recursive=recursive, editions=editions)
Esempio n. 2
0
def main():
    """Copy docs (and any requested lists) from a source to a destination.

    Source and destination come from the parsed command-line options; a value
    starting with "http://" is wrapped in ``OpenLibrary``, anything else in
    ``Disk``. Positional arguments are the keys to copy.
    """
    options, args = parse_args()

    src_is_remote = options.src.startswith("http://")
    src = OpenLibrary(options.src) if src_is_remote else Disk(options.src)

    if not options.dest.startswith("http://"):
        dest = Disk(options.dest)
    else:
        dest = OpenLibrary(options.dest)
        # Log in with the stored login when ~/.olrc has a "[host]" section,
        # otherwise with the default admin account.
        host = web.lstrips(options.dest, "http://").strip("/")
        section = "[%s]" % host
        olrc_lines = read_lines(os.path.expanduser("~/.olrc"))
        if section not in olrc_lines:
            dest.login("admin", "admin123")
        else:
            dest.autologin()

    for list_key in options.lists:
        copy_list(src, dest, list_key, comment=options.comment)

    expanded_keys = list(expand(src, args))

    copy(src, dest, expanded_keys,
         comment=options.comment, recursive=options.recursive)
Esempio n. 3
0
def main(server):
    """Attach volume records to their editions and save the editions.

    Queries ``/type/volume`` documents together with their ``edition``,
    appends a trimmed copy of each volume to its edition's ``volumes`` list,
    sorts each list by ``volume_number`` and saves all collected editions with
    a single ``save_many`` call.

    :param server: value passed to ``OpenLibrary(...)``
    """
    ol = OpenLibrary(server)
    ol.autologin()

    volumes = ol.query({'type': '/type/volume', 'limit': False, '*': None, 'edition': {'*': None}})

    # Index volumes by key, and the editions they point at by edition key.
    volumes = {v['key']: v for v in volumes}
    editions = {
        v['edition']['key']: v['edition']
        for v in volumes.values() if v['edition']
    }

    def make_volume(v):
        """Return the subset of ``v`` that is embedded in the edition.

        Note: this removes ``edition`` from ``v`` and overwrites its ``type``.
        """
        d = {}
        v.pop('edition')
        v['type'] = {'key': '/type/volume'}
        for k in ['type', 'ia_id', 'volume_number']:
            if k in v:
                d[k] = v[k]
        return d

    for e in editions.values():
        e['volumes'] = []

    for v in volumes.values():
        if v.get('edition'):
            e = editions[v.get('edition')['key']]
            e['volumes'].append(make_volume(v))

    for e in editions.values():
        e['volumes'] = sorted(e['volumes'], key=lambda v: v['volume_number'])

    # Parenthesised single-argument print works on both Python 2 and 3.
    print('linking volumes to %d editions' % len(editions))
    # Pass a real list; on Python 3 ``dict.values()`` is only a view.
    ol.save_many(list(editions.values()), 'link volumes')
Esempio n. 4
0
def main():
    """Mark every document matching the given key patterns as part of a plugin.

    ``sys.argv[1]`` is the plugin name; each remaining argument is used as a
    ``key~`` pattern in a query limited to 1000 results. The ``plugin`` field
    of every matched document is set and all documents are saved in one call.
    """
    ol = OpenLibrary()
    ol.autologin()

    plugin = sys.argv[1]

    all_docs = []

    for pattern in sys.argv[2:]:
        docs = ol.query({"key~": pattern, "*": None}, limit=1000)
        all_docs.extend(docs)

    for doc in all_docs:
        doc['plugin'] = plugin

    # Parenthesised single-argument print works on both Python 2 and 3.
    print(ol.save_many(all_docs, comment="Marked as part of %s plugin." % plugin))
Esempio n. 5
0
def load():
    """Loads documents to http://0.0.0.0:8080"""
    documents = {}
    for f in find("data"):
        # Close each file as soon as it has been parsed instead of leaking
        # the handle until garbage collection.
        with open(f) as fp:
            doc = simplejson.load(fp)
        documents[doc['key']] = doc

    # Order the keys with topological_sort, using each document's references
    # (excluding "/type/..." keys) as its children.
    keys = topological_sort(documents.keys(),
                            get_children=lambda key: [
                                k for k in get_references(documents[key])
                                if not k.startswith("/type/")
                            ])

    from openlibrary.api import OpenLibrary
    ol = OpenLibrary("http://0.0.0.0:8080")
    ol.autologin()
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(ol.save_many([documents[k] for k in keys],
                       comment="documents copied from openlibrary.org"))
def main():
    """Mirror templates and macros matching the given patterns to disk.

    For every pattern argument, up to 1000 matching documents are queried.
    Templates have their ``body`` written and macros their ``macro`` written
    to the path from ``make_path``; for any other type that path is deleted.
    """
    global options
    options, args = parse_options()
    ol = OpenLibrary(options.server)

    # Which field holds the text to write, per document type.
    content_field = {
        '/type/template': 'body',
        '/type/macro': 'macro',
    }

    for pattern in args:
        matches = ol.query({"key~": pattern, "*": None}, limit=1000)
        for doc in marshal(matches):
            # Anand: special care to ignore bad documents in the database.
            if "--duplicate" in doc['key']:
                continue

            field = content_field.get(doc['type']['key'])
            if field is None:
                delete(make_path(doc))
            else:
                write(make_path(doc), get_value(doc, field))
def main(server):
    """Link editions to their scan records, saving 1000 editions at a time.

    Queries ``/type/scan_record`` documents with their ``edition``, sets each
    edition's ``scan_records`` to a single reference whose key is
    ``'/scan_record' + edition key``, and saves the editions chunk by chunk.

    :param server: value passed to ``OpenLibrary(...)``
    """
    ol = OpenLibrary(server)
    ol.autologin()

    scan_records = ol.query(type='/type/scan_record',
                            limit=False,
                            edition={'*': None})
    # Lazy: editions are pulled from the query result only as chunks are taken.
    editions = (r['edition'] for r in scan_records)

    # process 1000 editions at a time.
    while True:
        chunk = take(1000, editions)
        if not chunk:
            break

        # Parenthesised single-argument print works on both Python 2 and 3.
        print('linking %d editions' % len(chunk))

        for e in chunk:
            e['scan_records'] = [{'key': '/scan_record' + e['key']}]

        ol.save_many(chunk, 'link scan records')
Esempio n. 8
0
from __future__ import print_function
import MySQLdb
import re
from openlibrary.api import OpenLibrary, Reference
from collections import defaultdict

import six

# Raw strings keep ``\d`` / ``\W`` as regex escapes rather than (invalid)
# string escapes, which newer Python versions warn about.
re_edition_key = re.compile(r'^/books/OL(\d+)M$')
re_nonword = re.compile(r'\W', re.U)

conn = MySQLdb.connect(db='merge_editions')
cur = conn.cursor()
cur2 = conn.cursor()

ol = OpenLibrary('http://openlibrary.org/')
# NOTE(review): the bot password is hard-coded in source; consider loading it
# from configuration instead.
ol.login('EdwardBot', 'As1Wae9b')

# Rows that are not yet done and have no unmerges recorded.
cur.execute(
    'select ia, editions, done from merge where done is null and unmerge_count=0'
)
for ia, ekeys, done in cur.fetchall():
    updates = []
    # ``editions`` is a space-separated list of edition keys; re-order them by
    # their numeric OL id so the lowest id comes first.
    ekeys = [
        '/books/OL%dM' % x for x in sorted(
            int(re_edition_key.match(ekey).group(1))
            for ekey in ekeys.split(' '))
    ]
    print((ia, ekeys))
    min_ekey = ekeys[0]
    editions = [ol.get(ekey) for ekey in ekeys]
Esempio n. 9
0
parser.add_argument('--no_author_updates', action='store_true')
parser.add_argument('--just_consider_authors', action='store_true')
parser.add_argument('--limit', default=None)

args = parser.parse_args()
handle_author_merge = args.handle_author_merge
only_author_merge = args.only_author_merge
skip_author_merge = args.skip_author_merge

# Asking for only author merges implies that author merges are handled.
if only_author_merge:
    handle_author_merge = True

# The works helpers are only imported when author merges are handled.
if handle_author_merge:
    from openlibrary.catalog.works.find_works import (
        find_title_redirects,
        find_works,
        get_books,
        books_query,
        update_works,
    )

ol = OpenLibrary("http://" + args.server)
set_query_host(args.server)
done_login = False

config_file = args.config
config.load(config_file)

base = 'http://%s/openlibrary.org/log/' % config.runtime_config[
    'infobase_server']

# User names are compared case-insensitively, so store them lower-cased.
skip_user = {u.lower() for u in args.skip_user}
only_user = {u.lower() for u in args.only_user}

if 'state_dir' not in config.runtime_config:
    # Parenthesised single-argument print works on both Python 2 and 3.
    print('state_dir missing from ' + config_file)
    # NOTE(review): exits with status 0 even though the configuration is
    # incomplete; kept as-is so existing callers see the same exit code.
    sys.exit(0)
Esempio n. 10
0
import argparse

parser = argparse.ArgumentParser(description='scribe loader')
parser.add_argument('--skip_hide_books', action='store_true')
parser.add_argument('--item_id')
parser.add_argument('--config', default='openlibrary.yml')
args = parser.parse_args()

# Bot password and log location both come from the ``load_scribe`` section of
# the runtime config.
config_file = args.config
config.load(config_file)
import_bot_password = config.runtime_config['load_scribe'][
    'import_bot_password']
# '/1/var/log/openlibrary/load_scribe'
load_scribe_log = config.runtime_config['load_scribe']['log']

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', import_bot_password)

# The database password is whatever the helper program writes to stdout.
password = Popen(["/opt/.petabox/dbserver"], stdout=PIPE).communicate()[0]
db = web.database(dbn='mysql', host='dbmeta.us.archive.org', user='******',
                  passwd=password, db='archive')
db.printing = False

# Raw strings keep ``\d`` as a regex escape rather than an (invalid) string
# escape, which newer Python versions warn about.
re_census = re.compile(r'^\d+(st|nd|rd|th)census')

re_edition_key = re.compile(r'^/(?:books|b)/(OL\d+M)$')


def read_short_title(title):
    """Return ``str`` of the first 25 items of the normalized ``title``.

    Normalization is delegated to ``fast_parse.normalize_str``.
    """
    normalized = fast_parse.normalize_str(title)
    prefix = normalized[:25]
    return str(prefix)
Esempio n. 11
0
def get_ol(servername=None):
    """Create an ``OpenLibrary`` client for ``servername`` and auto-login.

    :param servername: passed as ``base_url`` to ``OpenLibrary``
    :return: the logged-in client
    """
    client = OpenLibrary(base_url=servername)
    client.autologin()
    return client
Esempio n. 12
0
def get_ol():
    """Create an ``OpenLibrary`` client with default arguments and auto-login.

    :return: the logged-in client
    """
    client = OpenLibrary()
    client.autologin()
    return client
Esempio n. 13
0
from __future__ import print_function
import sys
import codecs
from openlibrary.catalog.utils.query import query_iter, set_staging, query
from openlibrary.api import OpenLibrary, Reference
from openlibrary.catalog.read_rc import read_rc
from time import sleep

# Switch the query helpers to staging and read the bot credentials.
set_staging(True)
rc = read_rc()

ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])
# Route everything printed below through a UTF-8 stream writer.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Query for works, asking for their authors and title.
work_q = dict(type='/type/work', authors=None, title=None)

queue = []

for w in query_iter(work_q):
    if not w.get('authors'):
        print('no authors')
        continue
    # Skip works where any author entry is a dict carrying an 'author' key.
    has_author_dict = any(
        'author' in a for a in w['authors'] if isinstance(a, dict)
    )
    if has_author_dict:
        continue
    print(len(queue), w['key'], w['title'])
Esempio n. 14
0
    op.add_option("-u",
                  "--bot-username",
                  dest="username",
                  default="nyt_bestsellers_bot",
                  help="The bot username for accessing the Open Library API")
    op.add_option("-p",
                  "--bot-password",
                  dest="password",
                  help="The bot password for accessing the Open Library API")

    options, _ = op.parse_args()

    global NYT_API_KEY
    NYT_API_KEY = options.nyt_api_key
    global OL
    OL = OpenLibrary("http://%s" % options.openlibrary_host)
    OL.login(options.username, options.password)
    results = collections.defaultdict(list)
    for ln in get_nyt_bestseller_list_names():
        LOG("INFO", "processing list %s" % ln)
        for i, book in enumerate(load_nyt_bestseller_list(ln)):
            ol_keys = reconcile_book(book)
            if not ol_keys:
                LOG(
                    "WARN",
                    "unable to reconcile '%s' by %s - no OL book found" %
                    (book['book_details'][0]['title'],
                     book['book_details'][0]['author']))
            if not (key for key in ol_keys if key.startswith("/works/")):
                LOG(
                    "WARN", "only editions for '%s' by %s: %s" %
Esempio n. 15
0
def main(site, date=None):
    """POST to the admin stats endpoint for a date and print the response.

    :param site: value passed to ``OpenLibrary(...)``
    :param date: date string appended to ``/admin/stats/``; when falsy,
        today's date in ISO format is used
    """
    ol = OpenLibrary(site)
    ol.autologin("StatsBot")

    today = date or datetime.date.today().isoformat()
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(ol._request("/admin/stats/" + today, method='POST', data="").read())
Esempio n. 16
0
"""Sample script to update OL records in bulk.
"""

from openlibrary.api import OpenLibrary
import web
import sys

# Module-level Open Library API client, constructed with default arguments.
ol = OpenLibrary()


def read_mapping(filename, chunksize=1000):
    """Reads OLID, OCLC_NUMBER mapping from given file.

    Assumes that the file contains one OLID, OCLC_NUMBER per line, separated by tab.
    Blank lines are skipped.

    :param filename: path of the mapping file
    :param chunksize: currently unused; kept so existing callers keep working
    :return: generator of ``(olid, oclc)`` string pairs
    :raises ValueError: if a non-blank line does not split into exactly two
        tab-separated fields
    """
    # The context manager closes the file once the generator is exhausted or
    # closed, instead of leaking the handle.
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                # e.g. a trailing empty line; nothing to unpack.
                continue
            olid, oclc = line.split("\t")
            yield olid, oclc


def add_identifier(doc, id_name, id_value):
    """Add ``id_value`` under ``id_name`` in ``doc`` unless already present.

    ``isbn_10``, ``isbn_13``, ``oclc_numbers`` and ``lccn`` are stored as
    top-level lists on ``doc``; every other name is stored as a list inside
    ``doc["identifiers"]``. Missing containers are created on demand.
    """
    top_level_names = ["isbn_10", "isbn_13", "oclc_numbers", "lccn"]

    container = doc
    if id_name not in top_level_names:
        container = doc.setdefault("identifiers", {})

    values = container.setdefault(id_name, [])
    if id_value in values:
        return
    values.append(id_value)


def has_identifier(doc, id_name, id_value):