from catalog.utils.query import query_iter, set_staging, withKey, get_mc
import sys, codecs, re
# Make the local olapi checkout importable before importing from it.
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, Reference
from catalog.read_rc import read_rc
from catalog.get_ia import get_from_archive, get_from_local
from catalog.marc.fast_parse import get_first_tag, get_all_subfields

rc = read_rc()

# Wrap stdout in a UTF-8 stream writer so printed text is encoded as UTF-8.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
# NOTE(review): presumably points the query helpers at a staging server - confirm.
set_staging(True)
ol = OpenLibrary("http://dev.openlibrary.org")
# The password is looked up in the rc data under the bot's name.
ol.login('EdwardBot', rc['EdwardBot'])

# Edition query; the None values presumably ask for those fields to be
# returned - confirm against query_iter.
q = { 'type': '/type/edition', 'table_of_contents': None, 'subjects': None }
# queue and count are not used anywhere in the visible code.
queue = []
count = 0
for e in query_iter(q, limit=100):
    key = e['key']
    mc = get_mc(key)
    # Skip editions for which get_mc() returns nothing.
    if not mc:
        continue
    data = get_from_local(mc)
    # Presumably the first MARC field with tag 041 - confirm against fast_parse.
    line = get_first_tag(data, set(['041']))
    if not line:
        continue
    # Report the edition key, the first two characters of the field, and
    # everything get_all_subfields() yields for it.
    print key, line[0:2], list(get_all_subfields(line))
#!/usr/bin/env python
# NondescriptBot
# by John Shutt (http://shutt.in)

import sys

from olapi import OpenLibrary

# secrets.py holds the login info, and is excluded from version control
from secrets import login_name, password

ol = OpenLibrary()

# Log in, retrying up to five times before giving up.
logged_in = False
print('Trying to log in...')
for attempt in range(5):
    try:
        ol.login(login_name, password)
    except Exception:
        # Only ordinary errors are retried; KeyboardInterrupt and SystemExit
        # propagate so the script can still be stopped during the retry loop.
        print('ol.login() error; retrying')
    else:
        logged_in = True
        print('Login successful.')
        break

if not logged_in:
    sys.exit('Failed to log in.')
import simplejson as json
from collections import defaultdict
from catalog.read_rc import read_rc
from catalog.utils.query import query, query_iter, set_staging, base_url
from catalog.utils import mk_norm, get_title
from six.moves import urllib
import six

# Wrap stdout in a UTF-8 stream writer so printed text is encoded as UTF-8.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
# NOTE(review): presumably switches the query helpers to a staging server - confirm.
set_staging(True)

rc = read_rc()

ol = OpenLibrary(base_url())
# The password is looked up in the rc data under the bot's name.
ol.login('EdwardBot', rc['EdwardBot'])

# Captures a run of three or more digits at the very end of a string.
re_year = re.compile('(\d{3,})$')

queue = []

def iter_works(fields):
    """Return query_iter() for a /type/work query.

    The query always contains 'key' and additionally one entry, set to
    None, for every name in `fields` (presumably requesting that those
    fields be returned - confirm against query_iter).
    """
    q = {'type': '/type/work', 'key': None}
    for f in fields:
        q[f] = None
    return query_iter(q)

def dates():
    global queue
from openlibrary.catalog.importer.load import build_query, east_in_by_statement, import_author
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.merge import try_merge
from openlibrary.catalog.importer.lang import add_lang
from openlibrary.catalog.importer.update import add_source_records
from openlibrary.catalog.get_ia import get_ia, urlopen_keep_trying, NoMARCXML
from openlibrary.catalog.importer.db_read import get_mc
import openlibrary.catalog.marc.parse_xml as parse_xml
from time import time, sleep
import openlibrary.catalog.marc.fast_parse as fast_parse
# Make the local olapi checkout importable before importing from it.
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, unmarshal

rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
# The password is looked up in the rc data under the bot's name.
ol.login('ImportBot', rc['ImportBot'])

# Postgres database named 'amazon'.
db_amazon = web.database(dbn='postgres', db='amazon')
# NOTE(review): presumably silences web.py's query echo - confirm.
db_amazon.printing = False

# MySQL database named 'archive'; host and credentials come from the rc data.
db = web.database(dbn='mysql', host=rc['ia_db_host'], user=rc['ia_db_user'], \
        passwd=rc['ia_db_pass'], db='archive')
db.printing = False

# NOTE(review): presumably the timestamp from which records are loaded - confirm.
start = '2009-10-11 22:04:57'
# Log file opened in append mode.
fh_log = open('/1/edward/logs/load_scribe', 'a')

# Both timers start at the current time.
t0 = time()
t_prev = time()
rec_no = 0
chunk = 50
from catalog.read_rc import read_rc
from catalog.marc.fast_parse import get_tag_lines, get_all_subfields, get_first_tag
from catalog.marc.new_parser import read_edition
from catalog.utils.query import query_iter
from catalog.marc.utils import files
# Make the local olapi checkout importable before importing from it.
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, unmarshal
import simplejson as json
from catalog.importer.load import build_query, east_in_by_statement, import_author

rc = read_rc()

# Postgres database named 'marc_index'.
marc_index = web.database(dbn='postgres', db='marc_index')
# NOTE(review): presumably silences web.py's query echo - confirm.
marc_index.printing = False

ol = OpenLibrary("http://openlibrary.org")
# The password is looked up in the rc data under the bot's name.
ol.login('ImportBot', rc['ImportBot'])

# Wrap stdout in a UTF-8 stream writer so printed text is encoded as UTF-8.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Matches identifiers such as ocm04775229: 'ocm' or 'ocn', optional leading
# zeros, then the digits captured in group 1.
re_oclc = re.compile('^oc[mn]0*(\d+)$')

def get_keys(loc):
    assert loc.startswith('marc:')
    vars = {'loc': loc[5:]}
    db_iter = marc_index.query('select k from machine_comment where v=$loc', vars)
    mc = list(db_iter)
    if mc:
        return [r.k for r in mc]
from catalog.utils.query import query_iter, set_staging, withKey import sys, codecs, re sys.path.append("/home/edward/src/olapi") from olapi import OpenLibrary, Reference from catalog.read_rc import read_rc rc = read_rc() sys.stdout = codecs.getwriter("utf-8")(sys.stdout) set_staging(True) ol = OpenLibrary("http://dev.openlibrary.org") ol.login("EdwardBot", rc["EdwardBot"]) re_skip = re.compile("\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$") def has_dot(s): return s.endswith(".") and not re_skip.search(s) q = {"type": "/type/edition", "table_of_contents": None, "subjects": None} queue = [] count = 0 for e in query_iter(q): if not e.get("subjects", None) or not any(has_dot(s) for s in e["subjects"]): continue subjects = [s[:-1] if has_dot(s) else s for s in e["subjects"]] q = {"key": e["key"], "subjects": {"connect": "update_list", "value": subjects}} if e.get("table_of_contents", None) and e["table_of_contents"][0]["type"] == "/type/text":
#!/usr/bin/env python from time import localtime, sleep, strftime from olapi import OpenLibrary ol = OpenLibrary() ol.login("someBot", "somePassword") def print_log(msg): timestamp = strftime("%Y%m%d_%H:%M:%S", localtime()) print("[" + timestamp + "] " + msg) def set_identifier(book, id_name, id_value): ids = book.setdefault("identifiers", {}) ids[id_name] = [id_value] def set_goodreads_id(olid, goodreads_id): book = ol.get(olid) set_identifier(book, "goodreads", goodreads_id) ol.save(book['key'], book, "Added goodreads ID.") def map_id(olid, isbn, goodreads_id): book = ol.get(olid) if book.has_key('identifiers'): if book['identifiers'].has_key('goodreads'): if goodreads_id in book['identifiers']['goodreads']: return print_log("Adding Goodreads ID \"" + goodreads_id + "\" to Openlibrary ID \"" + olid + "\"") set_goodreads_id(olid, goodreads_id) def load(filename):
class VacuumBot:
    """VacuumBot can help clean up Open Library, just tell him what to do!

    The VacuumBot essentially has methods to do specific cleanup tasks.
    It needs the credentials of a bot account on Open Library and some
    instructions.
    """

    def __init__(self, username, password):
        """Create an OpenLibrary client and log in with the given credentials."""
        self.ol = OpenLibrary()
        self.ol.login(username, password)

    def remove_classification_value(self, obj, type, value):
        """Removes a value from the list of <type> classifications.

        For example, can be used to remove the "B" value from
        Dewey Decimal classifications.
        If the classifications list is empty afterwards, it is removed.
        If the classifications object in the record is empty (because
        the deleted list was the only one in it), it is removed as well.

        `obj` is modified in place; nothing is returned. Records that do not
        contain the classification are left untouched.
        """
        special = ["lc_classifications", "dewey_decimal_class"]
        if type in special and type in obj:
            # These two classifications live at the top level of the record.
            container = obj
        elif "classifications" in obj and type in obj["classifications"]:
            container = obj["classifications"]
        else:
            return

        values = container[type]
        while value in values:
            values.remove(value)
        if not values:
            del container[type]
        # Only the nested dict is dropped when it ends up empty.
        if container is not obj and not container:
            del obj["classifications"]

    def deduplicate_list(self, li):
        """Sorts a list and removes duplicate values in place.

        The elements must be mutually orderable because the list is sorted
        first; after sorting, each run of equal neighbours is collapsed to
        one element.
        """
        li.sort()
        # Build the filtered list first, then write it back into the same
        # list object so callers holding a reference see the result.
        li[:] = [v for i, v in enumerate(li) if i == 0 or not (li[i - 1] == v)]

    def dedup(self, obj):
        """Removes duplicate values from an object.

        Calls deduplicate_list for lists.
        Calls itself on the values of dicts.
        Does nothing with strings or other types.
        """
        if isinstance(obj, str):
            return
        if isinstance(obj, dict):
            for k in obj:
                self.dedup(obj[k])
        elif isinstance(obj, list):
            self.deduplicate_list(obj)

    def remove_key(self, olid, key):
        """Removes a key from a record and saves the record.

        Nothing is saved when the record does not contain `key`.
        Use with caution :)
        """
        record = self.ol.get(olid)
        if key in record:
            del record[key]
            self.ol.save(record['key'], record, "Sucked up \"" + key + "\".")

    def deduplicate_values(self, olid, key):
        """Removes duplicate values

        Reads the values of a key and removes duplicate values, leaving 1.

        NOTE(review): only the fetched copy is changed; the record is not
        saved back - confirm whether a save is intended.
        """
        record = self.ol.get(olid)
        if key in record:
            self.dedup(record[key])

    def remove_classification(self, obj, classification):
        """Deletes obj["classifications"][classification] when it exists."""
        if "classifications" in obj:
            if classification in obj["classifications"]:
                del obj["classifications"][classification]

    def clean_lccn_permalink(self, olid):
        """Removes lccn_permalink from classifications

        The permalink is removed only when the record already has an "lccn"
        entry under "identifiers".

        NOTE(review): copying the LCCN into the identifiers and saving the
        record are not implemented here - confirm the intended behaviour.
        """
        record = self.ol.get(olid)
        if "classifications" in record:
            if "lccn_permalink" in record["classifications"]:
                if "identifiers" in record:
                    if "lccn" in record["identifiers"]:
                        self.remove_classification(record, "lccn_permalink")

    def vacuum(self, filename):
        """Main execution

        Reads `filename` line by line. Every non-blank line must hold three
        whitespace-separated fields (olid, isbn, goodreads_id), which are
        passed to map_id(). A failing map_id() call is logged and retried
        every 30 seconds until it succeeds.

        Relies on map_id, print_log and sleep being available at module
        level. A non-blank line without exactly three fields raises
        ValueError.
        """
        with open(filename) as map_file:
            for n, line in enumerate(map_file, 1):
                fields = line.split()
                if not fields:
                    # Tolerate blank lines, e.g. at the end of the file.
                    continue
                olid, isbn, goodreads_id = fields
                if (n % 100000) == 0:
                    print_log("(just read line " + str(n) + " from the map file)")
                is_good = False
                while not is_good:
                    try:
                        map_id(olid, isbn, goodreads_id)
                        is_good = True
                    except Exception as err:
                        # KeyboardInterrupt/SystemExit are not caught, so the
                        # retry loop can still be interrupted.
                        print_log("Exception for Goodreads ID \"" + goodreads_id + "\", message: \"" + str(err) + "\"")
                        sleep(30)
#!/usr/bin/env python
from time import localtime, sleep, strftime
from olapi import OpenLibrary

# Module-level client, logged in with hard-coded credentials.
ol = OpenLibrary()
ol.login("someBot", "somePassword")

def print_log(msg):
    """Print `msg` prefixed with the local time as [YYYYmmdd_HH:MM:SS]."""
    stamp = strftime("%Y%m%d_%H:%M:%S", localtime())
    prefix = "[" + stamp + "] "
    print(prefix + msg)

def set_identifier(book, id_name, id_value):
    """Record `id_value` under `id_name` on the `book` dict.

    The four standard identifiers are lists stored directly on the book and
    get the value appended when it is not already there. Any other name is
    stored under book["identifiers"] as a one-element list, replacing what
    was there.
    """
    # OL handles the standard identifiers in a different way.
    standard_names = ("isbn_10", "isbn_13", "oclc_numbers", "lccn")
    if id_name not in standard_names:
        book.setdefault("identifiers", {})[id_name] = [id_value]
        return
    values = book.setdefault(id_name, [])
    if id_value not in values:
        values.append(id_value)

def set_goodreads_id(olid, goodreads_id):
    """Fetch the record `olid`, set its goodreads identifier and save it."""
    record = ol.get(olid)
    set_identifier(record, "goodreads", goodreads_id)
    ol.save(record['key'], record, "Added goodreads ID.")
#!/usr/bin/env python from olapi import OpenLibrary ol = OpenLibrary() ol.login("VacuumBot", "somePassword") # Top level classifications don't go in the classifications dict. tl_classifications = ["lc_classifications", "dewey_decimal_class"] def upgrade_classifications(olid): """Changes classification from list of (name,value)-dict to dict of lists. """ record = ol.get(olid) # Check if the classifications are a list: if not isinstance(record["classifications"], list): return # Create a new dict to replace the list: c = {} # Read the dicts from the classifications list: for k in record["classifications"]: if k["name"] in tl_classifications: if k["name"] in record.keys(): record[k["name"]].append(k["value"]) else: record[k["name"]] = [k["value"]] elif k["name"] not in c.keys():
#!/usr/bin/env python
from olapi import OpenLibrary

# Module-level client, logged in with hard-coded credentials.
ol = OpenLibrary()
ol.login("VacuumBot", "somePassword")

# Top level classifications don't go in the classifications dict.
tl_classifications = ["lc_classifications","dewey_decimal_class"]

def upgrade_classifications(olid):
    """Changes classification from list of (name,value)-dict to dict of lists.

    Records whose "classifications" entry is not a list are left alone.
    """
    record = ol.get(olid)
    # Check if the classifications are a list:
    if not isinstance(record["classifications"], list):
        return
    # Create a new dict to replace the list:
    c = {}
    # Read the dicts from the classifications list:
    for k in record["classifications"]:
        if k["name"] in tl_classifications:
            # Top level classifications are stored directly on the record.
            if k["name"] in record.keys():
                record[k["name"]].append(k["value"])
            else:
                record[k["name"]] = [k["value"]]
        elif k["name"] not in c.keys():
            # Key the new list by the classification's own name; the literal
            # key "name" would lump every classification under one entry.
            c[k["name"]] = [k["value"]]
from openlibrary.catalog.read_rc import read_rc
import openlibrary.catalog.merge.amazon as amazon
from openlibrary.catalog.get_ia import *
from openlibrary.catalog.importer.db_read import withKey, get_mc
import openlibrary.catalog.marc.fast_parse as fast_parse
import xml.parsers.expat
import web, sys
# Make the local olapi checkout importable before importing from it.
sys.path.append("/home/edward/src/olapi")
from olapi import OpenLibrary
from time import sleep

rc = read_rc()

ol = OpenLibrary("http://openlibrary.org")
# The password is looked up in the rc data under the bot's name.
ol.login("ImportBot", rc["ImportBot"])

# MySQL database named 'archive'; host and credentials come from the rc data.
ia_db = web.database(dbn="mysql", db="archive", user=rc["ia_db_user"], pw=rc["ia_db_pass"], host=rc["ia_db_host"])
# NOTE(review): presumably silences web.py's query echo - confirm.
ia_db.printing = False

# Matches "<name>_meta" or "<name>_marc" followed by ".mrc" or ".xml"; group 1
# is the slash-free part before the underscore.
# NOTE(review): `re` is not imported explicitly in the visible lines; it
# presumably arrives through the star import above - confirm.
re_meta_marc = re.compile("([^/]+)_(meta|marc)\.(mrc|xml)")

# NOTE(review): the meaning of these two numbers is not visible here;
# presumably match-score settings - confirm against the merge code.
threshold = 875
amazon.set_isbn_match(225)

def try_amazon(thing):
    if "isbn_10" not in thing:
        return None
    if "authors" in thing:
        authors = []