import os, re, sys, codecs
from openlibrary.catalog.read_rc import read_rc
from openlibrary.catalog.importer.db_read import get_mc

sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, unmarshal, marshal

rc = read_rc()
ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot']) 

test_dir = '/home/edward/ol/test_data'

# Matches an Open Library edition key such as "/b/OL123M".
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
re_edition = re.compile(r'^/b/OL\d+M$')

# Matches "<name>_meta.mrc:0:<digits>" and captures <name> (which may not
# contain '/'). The dot is escaped so that it only matches a literal ".".
re_meta_mrc = re.compile(r'^([^/]*)_meta\.mrc:0:\d+$')

#out = open('source_records', 'w')
# Walk the test-data directory and print the key of every entry whose name,
# once converted, looks like an edition key.
for f in os.listdir(test_dir):
    # Turn the file name into a candidate key by replacing every '_' with '/'.
    key = f.replace('_', '/')
    if not re_edition.match(key):
        continue
    print key
    # NOTE(review): this unconditional `continue` makes the rest of the loop
    # body unreachable; it looks like a debugging short-circuit - confirm.
    continue
    mc = get_mc(key)
    print key, mc
    if not mc:
        continue
    e = ol.get(key)
    # Skip editions that already have a non-empty source_records value.
    if e.get('source_records', []):
        continue
Exemple #2
0
from catalog.get_ia import read_marc_file
from catalog.read_rc import read_rc
from catalog.marc.fast_parse import get_tag_lines, get_all_subfields, get_first_tag
from catalog.marc.new_parser import read_edition
from catalog.utils.query import query_iter
from catalog.marc.utils import files
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, unmarshal
import simplejson as json
from catalog.importer.load import build_query, east_in_by_statement, import_author

rc = read_rc()
marc_index = web.database(dbn='postgres', db='marc_index')
marc_index.printing = False

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Matches identifiers such as "ocm04775229" ("ocm"/"ocn" prefix followed by
# digits) and captures the digits after any leading zeros.
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
re_oclc = re.compile(r'^oc[mn]0*(\d+)$')


def get_keys(loc):
    assert loc.startswith('marc:')
    vars = {'loc': loc[5:]}
    db_iter = marc_index.query('select k from machine_comment where v=$loc',
                               vars)
    mc = list(db_iter)
    if mc:
#!/usr/bin/env python

from time import localtime, sleep, strftime
from olapi import OpenLibrary

ol = OpenLibrary()
ol.login("someBot", "somePassword")

def print_log(msg):
  """Print msg to stdout, prefixed with the local time as [YYYYMMDD_HH:MM:SS]."""
  stamp = strftime("%Y%m%d_%H:%M:%S", localtime())
  pieces = ("[", stamp, "] ", msg)
  print("".join(pieces))

def set_identifier(book, id_name, id_value):
  """Record id_value under id_name on the book record, modifying it in place.

  The standard identifiers (isbn_10, isbn_13, oclc_numbers, lccn) are kept
  as top-level lists on the record; the value is appended unless it is
  already present. Any other identifier is stored in book["identifiers"]
  as a one-element list, replacing whatever was stored under that name.
  """
  # OL handles the standard identifiers in a different way: they are not
  # part of the "identifiers" dict.
  if id_name in ("isbn_10", "isbn_13", "oclc_numbers", "lccn"):
    ids = book.setdefault(id_name, [])
    if id_value not in ids:
      ids.append(id_value)
  else:
    ids = book.setdefault("identifiers", {})
    ids[id_name] = [id_value]

def set_goodreads_id(olid, goodreads_id):
  """Fetch the record for olid, set its "goodreads" identifier and save it."""
  record = ol.get(olid)
  set_identifier(record, "goodreads", goodreads_id)
  comment = "Added goodreads ID."
  ol.save(record['key'], record, comment)

def map_id(olid, isbn, goodreads_id):
  """Add goodreads_id to the record olid unless it is already listed there.

  The record is fetched with ol.get(); when its identifiers already contain
  goodreads_id nothing happens, otherwise the addition is logged and
  set_goodreads_id() performs the update. isbn is accepted but not used.
  """
  book = ol.get(olid)
  # Membership tests instead of dict.has_key(), which Python 3 removed.
  identifiers = book['identifiers'] if 'identifiers' in book else {}
  if 'goodreads' in identifiers and goodreads_id in identifiers['goodreads']:
    return
  print_log("Adding Goodreads ID \"" + goodreads_id + "\" to Openlibrary ID \"" + olid + "\"")
  set_goodreads_id(olid, goodreads_id)

def load(filename):
Exemple #4
0
from openlibrary.catalog.read_rc import read_rc
from openlibrary.catalog.importer.load import build_query, east_in_by_statement, import_author
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.merge import try_merge
from openlibrary.catalog.importer.lang import add_lang
from openlibrary.catalog.importer.update import add_source_records
from openlibrary.catalog.get_ia import get_ia, urlopen_keep_trying, NoMARCXML
from openlibrary.catalog.importer.db_read import get_mc
import openlibrary.catalog.marc.parse_xml as parse_xml
from time import time, sleep
import openlibrary.catalog.marc.fast_parse as fast_parse
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, unmarshal

rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

db_amazon = web.database(dbn='postgres', db='amazon')
db_amazon.printing = False

db = web.database(dbn='mysql', host=rc['ia_db_host'], user=rc['ia_db_user'], \
        passwd=rc['ia_db_pass'], db='archive')
db.printing = False

start = '2009-10-11 22:04:57'
fh_log = open('/1/edward/logs/load_scribe', 'a')

t0 = time()
t_prev = time()
rec_no = 0
Exemple #5
0
from openlibrary.catalog.merge.merge_marc import *
from openlibrary.catalog.read_rc import read_rc
import openlibrary.catalog.merge.amazon as amazon
from openlibrary.catalog.get_ia import *
from openlibrary.catalog.importer.db_read import withKey, get_mc
import openlibrary.catalog.marc.fast_parse as fast_parse
import xml.parsers.expat
import web, sys

sys.path.append("/home/edward/src/olapi")
from olapi import OpenLibrary
from time import sleep

rc = read_rc()

ol = OpenLibrary("http://openlibrary.org")
ol.login("ImportBot", rc["ImportBot"])

ia_db = web.database(dbn="mysql", db="archive", user=rc["ia_db_user"], pw=rc["ia_db_pass"], host=rc["ia_db_host"])
ia_db.printing = False

# Captures (<name>, "meta"|"marc", "mrc"|"xml") from "<name>_meta.mrc"-style
# names; <name> may not contain '/'.
# Raw string: '\.' inside a plain literal is an invalid escape sequence.
re_meta_marc = re.compile(r"([^/]+)_(meta|marc)\.(mrc|xml)")

threshold = 875
amazon.set_isbn_match(225)


def try_amazon(thing):
    if "isbn_10" not in thing:
        return None
    if "authors" in thing:
import sys
import codecs
import re
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, Reference
from catalog.read_rc import read_rc

import six


rc = read_rc()

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
set_staging(True)

ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

# Trailing initials and abbreviations whose final full stop must be kept.
# Raw string: in a plain literal '\b' is a backspace character rather than a
# word boundary, so the pattern could never match ordinary text.
re_skip = re.compile(r'\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')

def has_dot(s):
    """Return True when s ends with a full stop that can be stripped.

    A final "." that belongs to a single capital initial or to one of the
    abbreviations listed in re_skip does not count.
    """
    return s.endswith('.') and not re_skip.search(s)

q = { 'type': '/type/edition', 'table_of_contents': None, 'subjects': None }
queue = []
count = 0
for e in query_iter(q):
    if not e.get('subjects', None) or not any(has_dot(s) for s in e['subjects']):
        continue
    subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']]
    q = {
Exemple #7
0
class VacuumBot:
  """VacuumBot can help clean up Open Library, just tell him what to do!

  The VacuumBot essentially has methods to do specific cleanup tasks.
  It needs the credentials of a bot account on Open Library and some instructions.
  """

  def __init__(self, username, password):
    """Create an OpenLibrary client and log in with the bot credentials."""
    self.ol = OpenLibrary()
    self.ol.login(username, password)

  def remove_classification_value(self, obj, type, value):
    """Removes a value from the list of <type> classifications.

    For example, can be used to remove the "B" value from
    Dewey Decimal classifications.
    If the classifications list is empty afterwards, it is removed.
    If the classifications object in the record is empty (because
    the deleted list was the only one in it), it is removed as well.

    obj is the record dict and is modified in place. The parameter name
    `type` shadows the builtin but is kept for existing keyword callers.
    """
    # These two classifications live at the top level of the record.
    special = ["lc_classifications", "dewey_decimal_class"]
    if type in special and type in obj:
      obj[type][:] = [v for v in obj[type] if v != value]
      if not obj[type]:
        del obj[type]
    elif "classifications" in obj and type in obj["classifications"]:
      classifications = obj["classifications"]
      classifications[type][:] = [v for v in classifications[type] if v != value]
      if not classifications[type]:
        del classifications[type]
        if not classifications:
          del obj["classifications"]

  def deduplicate_list(self, li):
    """Sorts a list and removes duplicate values in place."""
    li.sort()
    unique = []
    for item in li:
      # After sorting, equal values are adjacent.
      if not unique or unique[-1] != item:
        unique.append(item)
    # Slice assignment keeps the caller's list object (in-place contract).
    li[:] = unique

  def dedup(self, obj):
    """Removes duplicate values from an object.

    Calls deduplicate_list for lists.
    Calls itself on the values of dicts.
    Does nothing with strings or other types.
    """
    if isinstance(obj, dict):
      for k in obj:
        self.dedup(obj[k])
    elif isinstance(obj, list):
      self.deduplicate_list(obj)

  def remove_key(self, olid, key):
    """Removes a key from a record and saves the record.

    Use with caution :)
    """
    record = self.ol.get(olid)
    if key in record:
      del record[key]
      self.ol.save(record['key'], record, "Sucked up \"" + key + "\".")

  def deduplicate_values(self, olid, key):
    """Removes duplicate values

    Reads the values of a key and removes duplicate values,
    leaving 1. The record is saved afterwards; without the save the
    de-duplicated copy would simply be thrown away.
    """
    record = self.ol.get(olid)
    if key in record:
      self.dedup(record[key])
      self.ol.save(record['key'], record, "Removed duplicate values from \"" + key + "\".")

  def remove_classification(self, obj, classification):
    """Deletes obj["classifications"][classification] when it exists."""
    if "classifications" in obj and classification in obj["classifications"]:
      del obj["classifications"][classification]

  def clean_lccn_permalink(self, olid):
    """Removes lccn_permalink from classifications

    Removes permalink from classifications and adds the LCCN to
    the identifiers, if it isn't there already. The record is saved
    when it had an lccn_permalink classification.
    """
    record = self.ol.get(olid)
    if "classifications" not in record:
      return
    if "lccn_permalink" not in record["classifications"]:
      return
    permalinks = record["classifications"]["lccn_permalink"]
    if not isinstance(permalinks, list):
      permalinks = [permalinks]
    for permalink in permalinks:
      # Assumes permalinks look like "http://lccn.loc.gov/<lccn>", i.e. the
      # LCCN is the last path segment - TODO confirm against real records.
      lccn = permalink.rstrip("/").rsplit("/", 1)[-1]
      if not lccn:
        continue
      known = record.setdefault("identifiers", {}).setdefault("lccn", [])
      if lccn not in known:
        known.append(lccn)
    self.remove_classification(record, "lccn_permalink")
    self.ol.save(record['key'], record, "Moved LCCN permalink to identifiers.")

  def vacuum(self, filename):
    """Main execution

    Vacuums the Open Library based on commands found in the file.
    Each line of the file must hold three whitespace-separated fields:
    olid, isbn and goodreads_id. A line is retried every 30 seconds
    until map_id() succeeds for it.
    """
    with open(filename) as command_file:
      for n, line in enumerate(command_file, 1):
        olid, isbn, goodreads_id = line.strip().split()
        if (n % 100000) == 0:
          print_log("(just read line " + str(n) + " from the map file)")
        is_good = False
        while not is_good:
          try:
            map_id(olid, isbn, goodreads_id)
            is_good = True
          except Exception as exc:
            # Not a bare except: Ctrl-C and SystemExit must be able to
            # break out of this otherwise endless retry loop.
            print_log("Exception for Goodreads ID \"" + goodreads_id + "\", message: \"" + str(exc) + "\"")
            sleep(30)
from olapi import OpenLibrary
import simplejson as json
from collections import defaultdict
from catalog.read_rc import read_rc
from catalog.utils.query import query, query_iter, set_staging, base_url
from catalog.utils import mk_norm, get_title
from six.moves import urllib

import six

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
set_staging(True)

rc = read_rc()

ol = OpenLibrary(base_url())
ol.login('EdwardBot', rc['EdwardBot'])

# Captures a run of three or more digits at the very end of a string.
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
re_year = re.compile(r'(\d{3,})$')

queue = []


def iter_works(fields):
    """Return query_iter() for a '/type/work' query.

    The query sets 'key' and every name in fields to None.
    """
    work_query = {'type': '/type/work', 'key': None}
    # dict.fromkeys maps every field name to None.
    work_query.update(dict.fromkeys(fields))
    return query_iter(work_query)


def dates():
#!/usr/bin/env python

from time import localtime, sleep, strftime
from olapi import OpenLibrary

ol = OpenLibrary()
ol.login("someBot", "somePassword")


def print_log(msg):
    """Print msg to stdout, prefixed with the local time as [YYYYMMDD_HH:MM:SS]."""
    stamp = strftime("%Y%m%d_%H:%M:%S", localtime())
    pieces = ("[", stamp, "] ", msg)
    print("".join(pieces))


def set_identifier(book, id_name, id_value):
    """Record id_value under id_name on the book record, modifying it in place.

    isbn_10, isbn_13, oclc_numbers and lccn are top-level lists on the
    record and get the value appended unless it is already present. Any
    other identifier is written to book["identifiers"] as a one-element
    list, replacing whatever was stored under that name.
    """
    standard_fields = ("isbn_10", "isbn_13", "oclc_numbers", "lccn")
    if id_name not in standard_fields:
        book.setdefault("identifiers", {})[id_name] = [id_value]
        return
    # OL handles the standard identifiers in a different way.
    values = book.setdefault(id_name, [])
    if id_value not in values:
        values.append(id_value)


def set_goodreads_id(olid, goodreads_id):
    """Fetch the record for olid, set its "goodreads" identifier and save it."""
    record = ol.get(olid)
    set_identifier(record, "goodreads", goodreads_id)
    comment = "Added goodreads ID."
    ol.save(record['key'], record, comment)

Exemple #10
0
import os, re, sys, codecs
from openlibrary.catalog.read_rc import read_rc
from openlibrary.catalog.importer.db_read import get_mc

sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, unmarshal, marshal

rc = read_rc()
ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

test_dir = '/home/edward/ol/test_data'

# Matches an Open Library edition key such as "/b/OL123M".
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
re_edition = re.compile(r'^/b/OL\d+M$')

# Matches "<name>_meta.mrc:0:<digits>" and captures <name> (which may not
# contain '/'). The dot is escaped so that it only matches a literal ".".
re_meta_mrc = re.compile(r'^([^/]*)_meta\.mrc:0:\d+$')

#out = open('source_records', 'w')
# Walk the test-data directory and print the key of every entry whose name,
# once converted, looks like an edition key.
for f in os.listdir(test_dir):
    # Turn the file name into a candidate key by replacing every '_' with '/'.
    key = f.replace('_', '/')
    if not re_edition.match(key):
        continue
    print key
    # NOTE(review): this unconditional `continue` makes the rest of the loop
    # body unreachable; it looks like a debugging short-circuit - confirm.
    continue
    mc = get_mc(key)
    print key, mc
    if not mc:
        continue
    e = ol.get(key)
    # Skip editions that already have a non-empty source_records value.
    if e.get('source_records', []):
        continue
Exemple #11
0
from catalog.utils.query import query_iter, set_staging, withKey
import sys
import codecs
import re
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, Reference
from catalog.read_rc import read_rc

import six

rc = read_rc()

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
set_staging(True)

ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

# Trailing initials and abbreviations whose final full stop must be kept.
# Raw string: in a plain literal '\b' is a backspace character rather than a
# word boundary, so the pattern could never match ordinary text.
re_skip = re.compile(r'\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')


def has_dot(s):
    """Return True when s ends with a full stop that can be stripped.

    A final "." that belongs to a single capital initial or to one of the
    abbreviations listed in re_skip does not count.
    """
    return s.endswith('.') and not re_skip.search(s)


q = {'type': '/type/edition', 'table_of_contents': None, 'subjects': None}
queue = []
count = 0
for e in query_iter(q):
    if not e.get('subjects', None) or not any(
            has_dot(s) for s in e['subjects']):
Exemple #12
0
#!/usr/bin/env python

from olapi import OpenLibrary

ol = OpenLibrary()
ol.login("VacuumBot", "somePassword")

# Top level classifications don't go in the classifications dict.
tl_classifications = ["lc_classifications", "dewey_decimal_class"]


def upgrade_classifications(olid):
    """Changes classification from list of (name,value)-dict
  to dict of lists.
  """
    record = ol.get(olid)
    # Check if the classifications are a list:
    if not isinstance(record["classifications"], list):
        return

    # Create a new dict to replace the list:
    c = {}

    # Read the dicts from the classifications list:
    for k in record["classifications"]:
        if k["name"] in tl_classifications:
            if k["name"] in record.keys():
                record[k["name"]].append(k["value"])
            else:
                record[k["name"]] = [k["value"]]
        elif k["name"] not in c.keys():
#!/usr/bin/env python

from olapi import OpenLibrary

ol = OpenLibrary()
ol.login("VacuumBot", "somePassword")

# Top level classifications don't go in the classifications dict.
tl_classifications = ["lc_classifications","dewey_decimal_class"]

def upgrade_classifications(olid):
  """Changes classification from list of (name,value)-dict
  to dict of lists.

  Entries named in tl_classifications are appended to the list of the
  same name at the top level of the record; all other entries are grouped
  by their name in a new dict. Records whose classifications are missing
  or are not a list are left untouched.
  """
  record = ol.get(olid)
  # Check if the classifications are a list (a missing key counts as "no"):
  if "classifications" not in record or not isinstance(record["classifications"], list):
    return

  # Create a new dict to replace the list:
  c = {}

  # Read the dicts from the classifications list:
  for k in record["classifications"]:
    name, value = k["name"], k["value"]
    if name in tl_classifications:
      # Top-level classifications are lists directly on the record.
      record.setdefault(name, []).append(value)
    else:
      # Key by the entry's own name (not the literal string "name") and
      # keep every value when a name occurs more than once.
      c.setdefault(name, []).append(value)
  # NOTE(review): nothing in the visible part of this function stores `c`
  # back on the record or saves the record - confirm where that happens.
Exemple #14
0
#!/usr/bin/env python
# NondescriptBot
# by John Shutt (http://shutt.in)

import sys
from olapi import OpenLibrary
# secrets.py holds the login info, and is excluded from version control
from secrets import login_name, password

ol = OpenLibrary()

# Log in, making up to five attempts before giving up.
# The single-argument print(...) calls behave the same on Python 2 and 3.
logged_in = False
print('Trying to log in...')
for attempt in range(5):
    try:
        ol.login(login_name, password)
    except Exception:
        # Not a bare except: Ctrl-C and SystemExit must still stop the script.
        print('ol.login() error; retrying')
    else:
        logged_in = True
        print('Login successful.')
        break
if not logged_in:
    sys.exit('Failed to log in.')
Exemple #15
0
 def __init__(self, username, password):
   """Create an OpenLibrary client, keep it on self.ol and log in with the given credentials."""
   self.ol = OpenLibrary()
   self.ol.login(username, password)
Exemple #16
0
from openlibrary.catalog.marc.fast_parse import get_subfield_values, get_first_tag, get_tag_lines, get_subfields
from openlibrary.catalog.utils.query import query_iter, set_staging, query
from openlibrary.catalog.utils import cmp, mk_norm
from openlibrary.catalog.read_rc import read_rc
from collections import defaultdict

from catalog.utils.edit import fix_edition
import urllib
from olapi import OpenLibrary, Reference
import olapi

import six

rc = read_rc()

ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
# Matches a trailing capital initial or one of the listed abbreviations
# followed by the final full stop.
# Raw string: in a plain literal '\b' is a backspace character rather than a
# word boundary, so the pattern could never match ordinary text.
re_skip = re.compile(
    r'\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon|etc)\.$')

base_url = "http://dev.openlibrary.org"
query_url = base_url + "/query.json?query="

work_num = 184076

set_staging(True)


def withKey(key):
from catalog.utils.query import query_iter, set_staging, withKey
import sys, codecs, re

sys.path.append("/home/edward/src/olapi")
from olapi import OpenLibrary, Reference
from catalog.read_rc import read_rc

rc = read_rc()

sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
set_staging(True)

ol = OpenLibrary("http://dev.openlibrary.org")
ol.login("EdwardBot", rc["EdwardBot"])

# Trailing initials and abbreviations whose final full stop must be kept.
# Raw string: in a plain literal "\b" is a backspace character rather than a
# word boundary, so the pattern could never match ordinary text.
re_skip = re.compile(r"\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$")


def has_dot(s):
    """Return True when s ends with a full stop that can be stripped.

    A final "." that belongs to a single capital initial or to one of the
    abbreviations listed in re_skip does not count.
    """
    return s.endswith(".") and not re_skip.search(s)


q = {"type": "/type/edition", "table_of_contents": None, "subjects": None}
queue = []
count = 0
for e in query_iter(q):
    if not e.get("subjects", None) or not any(has_dot(s) for s in e["subjects"]):
        continue
    subjects = [s[:-1] if has_dot(s) else s for s in e["subjects"]]
    q = {"key": e["key"], "subjects": {"connect": "update_list", "value": subjects}}
    if e.get("table_of_contents", None) and e["table_of_contents"][0]["type"] == "/type/text":
Exemple #18
0
from catalog.utils.query import query_iter, set_staging, withKey, get_mc
import sys, codecs, re
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, Reference
from catalog.read_rc import read_rc
from catalog.get_ia import get_from_archive, get_from_local
from catalog.marc.fast_parse import get_first_tag, get_all_subfields
rc = read_rc()

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
set_staging(True)

ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

q = { 'type': '/type/edition', 'table_of_contents': None, 'subjects': None }
queue = []
count = 0
# For each edition returned by the query (called with limit=100), look up its
# MARC record and print the first 041 field found in it.
for e in query_iter(q, limit=100):
    key = e['key']
    # NOTE(review): get_mc presumably returns the MARC source location for
    # the key, or a false value when there is none - confirm.
    mc = get_mc(key)
    if not mc:
        continue
    data = get_from_local(mc)
    # Only tag 041 is requested.
    line = get_first_tag(data, set(['041']))
    if not line:
        continue
    # Prints the key, the first two characters of the field and its subfields.
    print key, line[0:2], list(get_all_subfields(line))

Exemple #19
0
from openlibrary.catalog.merge.merge_marc import build_marc
from openlibrary.catalog.importer.db_read import get_mc, withKey
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, unmarshal

from openlibrary.catalog.read_rc import read_rc

rc = read_rc()

marc_index = web.database(dbn='postgres', db='marc_index')
marc_index.printing = False

db_amazon = web.database(dbn='postgres', db='amazon')
db_amazon.printing = False

ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot']) 

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

t0 = time()
t_prev = time()
rec_no = 0
chunk = 50
load_count = 0

archive_id = sys.argv[1]

def get_with_retry(key):
    for i in range(3):
        try: