def process_redirect_dump(writer, redirects_dump):
    """Build a bsddb of redirect locations and emit redirect triples for works."""
    import bsddb

    db = bsddb.btopen('solrdump/redirects.db', 'w', cachesize=1024 * 1024 * 1024)
    for type, key, revision, timestamp, json_data in read_tsv(redirects_dump):
        d = json.loads(json_data)
        if not key.startswith(("/authors/", "/works/")):
            continue
        location = d.get('location')
        if location:
            # Old redirects still start with /a/ instead of /authors/.
            location = location.replace("/a/", "/authors/")
            db[key] = location

    for key in db:
        if key.startswith("/works/"):
            redirect = find_redirect(db, key)
            if redirect:
                writer.write([(redirect, "redirect", key)])
    return db
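# `find_redirect` is not defined in this section. A plausible sketch, assuming
# it follows the chain of locations stored in the redirects db until it
# reaches a key that no longer redirects; the real helper may differ.
def find_redirect(db, key):
    seen = set()
    while key in db:
        # Bail out if the redirect chain loops back on itself.
        if key in seen:
            return None
        seen.add(key)
        key = db[key]
    return key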
def process_triples(path):
    """Takes a file with triples, sorts it by its first column and groups it by that column."""
    print("processing triples from", path)
    cmd = ["sort", path]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    for key, chunk in itertools.groupby(read_tsv(p.stdout), lambda t: t[0]):
        d = collections.defaultdict(list)
        for k, name, value in chunk:
            # JSON-encoded values are parsed back into Python objects.
            if name in ['json', 'edition', 'author']:
                value = json.loads(value)
            d[name].append(value)
        yield key, d
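# `read_tsv` is used throughout but never defined here. A minimal sketch,
# assuming it yields one tuple of fields per tab-separated line and accepts
# either a path or an open file (the call sites pass both); the real helper
# may differ.
def read_tsv(file_or_path):
    f = open(file_or_path) if isinstance(file_or_path, str) else file_or_path
    for line in f:
        yield tuple(line.rstrip("\n").split("\t"))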
def process_author_dump(writer, authors_dump):
    import bsddb

    db = bsddb.btopen('solrdump/authors.db', 'w', cachesize=1024 * 1024 * 1024)
    properties = ['key', 'name', 'alternate_names', 'personal_name']
    for type, key, revision, timestamp, json_data in read_tsv(authors_dump):
        author = json.loads(json_data)
        olid = key.split("/")[-1]
        db[olid] = json.dumps(subdict(author, properties))
        writer.write(process_author(author))
    return db
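# `subdict` is also not shown. A short sketch, assuming it picks the given
# keys out of a dict and skips the ones that are absent:
def subdict(d, keys):
    return {k: d[k] for k in keys if k in d}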
def main(filename):
    for tokens in read_tsv(filename):
        # Renamed from `json` to avoid shadowing the json module.
        json_data = tokens[-1]
        doc = simplejson.loads(json_data)
        cover = doc.get('covers') and doc.get('covers')[0]
        isbns = doc.get('isbn_10', []) + doc.get('isbn_13', [])
        key = doc['key']
        key = web.safestr(key)
        # Use a list, not a generator, so isbns can still be printed on error.
        isbns = [web.safestr(isbn) for isbn in isbns]
        try:
            if cover and cover > 0:
                print("\t".join([str(cover), key, ",".join(isbns)]))
        except Exception:
            print(doc, file=sys.stderr)
            print((key, cover, isbns), file=sys.stderr)
            raise
def process_edition_dump(writer, editions_dump):
    for type, key, revision, timestamp, json_data in read_tsv(editions_dump):
        doc = json.loads(json_data)
        writer.write(process_edition(doc))
def process_work_dump(writer, works_dump, author_db, redirect_db):
    for type, key, revision, timestamp, json_data in read_tsv(works_dump):
        doc = json.loads(json_data)
        writer.write(process_work(doc, author_db, redirect_db))
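# The `writer` passed to the process_* functions above is not defined in this
# section. Judging from the call sites it accepts a list of (key, name, value)
# triples; a hypothetical minimal version that writes them as TSV lines,
# JSON-encoding non-string values so process_triples can parse them back:
class TripleWriter:
    def __init__(self, f):
        self.f = f

    def write(self, triples):
        for key, name, value in triples:
            if not isinstance(value, str):
                value = json.dumps(value)
            self.f.write("%s\t%s\t%s\n" % (key, name, value))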