Example #1
0
def process_redirect_dump(writer, redirects_dump):
    """Build the redirects database and write a redirect row for each work.

    Every record read from ``redirects_dump`` whose key starts with
    ``/authors/`` or ``/works/`` and whose JSON document has a non-empty
    ``location`` is stored in ``solrdump/redirects.db`` as
    ``key -> location``.  Afterwards, for every ``/works/`` key for which
    ``find_redirect`` returns a target, ``[(target, "redirect", key)]`` is
    passed to ``writer.write``.

    :param writer: object with a ``write`` method accepting a list of tuples.
    :param redirects_dump: passed straight to ``read_tsv``; each row must
        unpack into five fields (type, key, revision, timestamp, json).
    :returns: the open ``bsddb`` btree database; the caller owns it.
    """
    import bsddb

    db = bsddb.btopen('solrdump/redirects.db',
                      'w',
                      cachesize=1024 * 1024 * 1024)

    old_prefix = "/a/"
    for _type, key, _revision, _timestamp, json_data in read_tsv(redirects_dump):
        # Filter on the key first so skipped records are never JSON-decoded.
        if not key.startswith(("/authors/", "/works/")):
            continue

        location = json.loads(json_data).get('location')
        if not location:
            continue

        # Old redirects still start with /a/ instead of /authors/.
        # Only the leading prefix is rewritten; an "/a/" occurring later in
        # the location is left alone.
        if location.startswith(old_prefix):
            location = "/authors/" + location[len(old_prefix):]
        db[key] = location

    for key in db:
        if not key.startswith("/works/"):
            continue
        redirect = find_redirect(db, key)
        if redirect:
            writer.write([(redirect, "redirect", key)])

    return db
Example #2
0
def process_triples(path):
    """Sort a triples file and yield its rows grouped by the first column.

    The file at ``path`` is sorted with the external ``sort`` command and the
    sorted output is read through ``read_tsv``.  Consecutive rows sharing the
    same first column are collected into one mapping.

    :param path: path of the triples file, handed to ``sort``.
    :returns: generator of ``(key, d)`` pairs, where ``d`` is a
        ``defaultdict(list)`` mapping each name (second column) to the list
        of its values (third column).  Values whose name is ``json``,
        ``edition`` or ``author`` are JSON-decoded first.
    """
    print("processing triples from", path)
    cmd = ["sort", path]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    try:
        for key, chunk in itertools.groupby(read_tsv(p.stdout), lambda t: t[0]):
            d = collections.defaultdict(list)
            for _key, name, value in chunk:
                if name in ('json', 'edition', 'author'):
                    value = json.loads(value)
                d[name].append(value)
            yield key, d
    finally:
        # Runs on exhaustion, on error and when the generator is closed
        # early: release the pipe and reap the child so no zombie is left.
        p.stdout.close()
        p.wait()
Example #3
0
def process_author_dump(writer, authors_dump):
    """Store a trimmed copy of every author and write its processed form.

    For each five-field row read from ``authors_dump`` the JSON column is
    decoded; the ``key``, ``name``, ``alternate_names`` and ``personal_name``
    entries (selected via ``subdict``) are saved as JSON in
    ``solrdump/authors.db`` under the last path segment of the row key, and
    the result of ``process_author`` is passed to ``writer.write``.

    Returns the open ``bsddb`` btree database.
    """
    import bsddb

    wanted = ['key', 'name', 'alternate_names', 'personal_name']
    author_db = bsddb.btopen('solrdump/authors.db', 'w', cachesize=1024 ** 3)

    for _type, key, _revision, _timestamp, payload in read_tsv(authors_dump):
        author = json.loads(payload)

        # Index by the final path segment of the key.
        olid = key.rsplit("/", 1)[-1]
        trimmed = subdict(author, wanted)
        author_db[olid] = json.dumps(trimmed)

        writer.write(process_author(author))

    return author_db
Example #4
0
def main(filename):
    """Print cover id, key and ISBNs for every record with a positive cover.

    Each row read from ``filename`` via ``read_tsv`` carries a JSON document
    in its last column.  When the first entry of the document's ``covers``
    list is truthy and greater than zero, one line is written to stdout:
    ``<cover>\\t<key>\\t<isbn,isbn,...>`` with ``isbn_10`` values followed by
    ``isbn_13`` values.

    If producing the line fails, the document and the ``(key, cover, isbns)``
    tuple are written to stderr and the exception is re-raised.

    Output uses explicit stream writes rather than ``print`` so the block
    parses under both Python 2 and Python 3.
    """
    for tokens in read_tsv(filename):
        # The JSON document is the last column of the row.
        doc = simplejson.loads(tokens[-1])

        covers = doc.get('covers')
        cover = covers and covers[0]
        # Kept as a plain list so the error report below shows the actual
        # values rather than an already-consumed generator object.
        isbns = doc.get('isbn_10', []) + doc.get('isbn_13', [])
        key = web.safestr(doc['key'])

        try:
            if cover and cover > 0:
                isbn_field = ",".join(web.safestr(isbn) for isbn in isbns)
                line = "\t".join([str(cover), key, isbn_field])
                sys.stdout.write(line + "\n")
        except Exception:
            sys.stderr.write("%s\n" % (doc,))
            sys.stderr.write("%s\n" % ((key, cover, isbns),))
            raise
Example #5
0
def main(filename):
    """Print cover id, key and ISBNs for every record with a positive cover.

    Each row read from ``filename`` via ``read_tsv`` carries a JSON document
    in its last column.  When the first entry of the document's ``covers``
    list is truthy and greater than zero, one line is written to stdout:
    ``<cover>\\t<key>\\t<isbn,isbn,...>`` with ``isbn_10`` values followed by
    ``isbn_13`` values.

    If producing the line fails, the document and the ``(key, cover, isbns)``
    tuple are written to stderr and the exception is re-raised.

    Output uses explicit stream writes rather than ``print`` so the block
    parses under both Python 2 and Python 3.
    """
    for tokens in read_tsv(filename):
        # The JSON document is the last column of the row.
        doc = simplejson.loads(tokens[-1])

        covers = doc.get('covers')
        cover = covers and covers[0]
        # Kept as a plain list so the error report below shows the actual
        # values rather than an already-consumed generator object.
        isbns = doc.get('isbn_10', []) + doc.get('isbn_13', [])
        key = web.safestr(doc['key'])

        try:
            if cover and cover > 0:
                isbn_field = ",".join(web.safestr(isbn) for isbn in isbns)
                line = "\t".join([str(cover), key, isbn_field])
                sys.stdout.write(line + "\n")
        except Exception:
            sys.stderr.write("%s\n" % (doc,))
            sys.stderr.write("%s\n" % ((key, cover, isbns),))
            raise
Example #6
0
def process_edition_dump(writer, editions_dump):
    """Feed every edition record through ``process_edition`` into ``writer``.

    Rows come from ``read_tsv(editions_dump)`` and must unpack into five
    fields; only the last one (a JSON document) is used.
    """
    rows = read_tsv(editions_dump)
    for _type, _key, _revision, _timestamp, payload in rows:
        edition = json.loads(payload)
        processed = process_edition(edition)
        writer.write(processed)
Example #7
0
def process_work_dump(writer, works_dump, author_db, redirect_db):
    """Feed every work record through ``process_work`` into ``writer``.

    Rows come from ``read_tsv(works_dump)`` and must unpack into five fields;
    only the last one (a JSON document) is used.  ``author_db`` and
    ``redirect_db`` are forwarded unchanged to ``process_work``.
    """
    rows = read_tsv(works_dump)
    for _type, _key, _revision, _timestamp, payload in rows:
        work = json.loads(payload)
        processed = process_work(work, author_db, redirect_db)
        writer.write(processed)