Ejemplo n.º 1
0
    def GET(self):
        """Render the import status dashboard as an HTML page.

        Sets the ``Content-Type`` header and a 60-second ``Refresh`` header,
        lists the finished items from ``done``, then emits one table row per
        key returned by ``server_read('keys')`` that is neither finished nor
        ends in ``'2'``.  A row is coloured by the age of the item's last
        update (``q['time']``): red above 600 seconds, yellow above 120
        seconds, green otherwise.  Only green rows contribute to the rate
        figures in the "Total" row.

        Returns:
            The complete HTML document as a single string.
        """
        web.header('Content-Type', 'text/html; charset=utf-8', unique=True)
        web.header('Refresh', '60', unique=True)

        def _ratio(num, den):
            # An import that has only just started can report zero elapsed
            # time or zero records; show 0 instead of failing the whole page.
            return float(num) / den if den else 0.0

        done_links = ', '.join(
            '<a href="http://archive.org/details/%s">%s</a>' % (ia, ia)
            for ia in done
        )
        # Fragments are collected in a list and joined once at the end.
        parts = [
            "<html>\n<head><title>Import status</title>",
            "<style>th { vertical-align: bottom; text-align: left }</style>",
            "</head><body>",
            "<h1>Import status</h1>",
            "<b>Done:</b>",
            done_links + '<br>',
            "<table>",
            "<tr><th>Archive ID</th><th>input<br>(rec/sec)</th>",
            "<th>no match<br>(%)</th>",
            "<th>load<br>(rec/sec)</th>",
            # Every data row emits a running-hours cell, so the header needs
            # the matching column or all later columns are shifted by one.
            "<th>running<br>(hours)</th>",
            "<th>progress</th>",
            "<th>remaining<br>(hours)</th>",
            "<th>remaining<br>(records)</th>",
            "</tr>",
        ]

        cur_time = time()
        total_recs = 0
        total_load = 0
        total_rec_per_sec = 0
        total_load_per_sec = 0
        total_future_load = 0
        for k in server_read('keys'):
            # Keys ending in '2' are not shown; finished items are already
            # listed in the "Done" line above the table.
            if k.endswith('2') or k in done:
                continue

            q = server_read('store/' + k)
            rec_no = q['rec_no']
            load_count = q['load_count']
            elapsed = q['t1']
            rec_per_sec = _ratio(rec_no, elapsed)
            load_per_sec = _ratio(load_count, elapsed)
            # Fraction shown in the "no match" column: loaded / read.
            no_match = _ratio(load_count, rec_no)

            since_update = cur_time - q['time']
            if since_update > 600:
                colour = 'red'
            elif since_update > 120:
                colour = 'yellow'
            else:
                colour = '#00ff00'
                # Stale rows are left out of the totals.
                total_rec_per_sec += rec_per_sec
                total_load_per_sec += load_per_sec
                total_recs += rec_no
                total_load += load_count

            parts.append('<tr bgcolor="%s">' % colour)
            parts.append('<td><a href="http://archive.org/details/%s">%s</a></td>' % (k, k))
            parts.append('<td align="right">%.2f</td>' % rec_per_sec)
            parts.append('<td align="right">%.2f%%</td>' % (no_match * 100))
            parts.append('<td align="right">%.2f</td>' % load_per_sec)
            parts.append('<td align="right">%.2f</td>' % (elapsed / 3600.0))

            (pos, total) = progress(k, q['part'], q['pos'])
            # progress() may give no position or no total; both are needed
            # for a percentage and for the remaining-work estimate.
            have_progress = bool(pos and total)
            if have_progress:
                parts.append('<td align="right">%.2f%%</td>' % (float(pos * 100) / total))
            else:
                parts.append('<td align="right">n/a</td>')

            bytes_per_sec = q['bytes_per_sec_total'] if 'bytes_per_sec_total' in q else None
            if have_progress and bytes_per_sec and rec_no and elapsed:
                remaining_bytes = total - pos
                remaining_hours = remaining_bytes / float(bytes_per_sec) / 3600
                parts.append('<td align="right">%.2f</td>' % remaining_hours)
                # Average record size so far, used to turn the remaining
                # bytes into an expected number of records still to load.
                avg_bytes = bytes_per_sec * elapsed / float(rec_no)
                future_load = (remaining_bytes / avg_bytes) * no_match
                total_future_load += future_load
                parts.append('<td align="right">%s</td>' % web.commify(int(future_load)))
            else:
                # Two columns (remaining hours, remaining records) stay empty.
                parts.append('<td></td><td></td>')

            parts.append('</tr>')

        parts.append('<tr><td>Total</td><td align="right">%.2f</td>' % total_rec_per_sec)
        if total_recs:
            parts.append('<td align="right">%.2f%%</td>' % (float(total_load * 100.0) / total_recs))
        else:
            parts.append('<td align="right"></td>')
        parts.append('<td align="right">%.2f</td>' % total_load_per_sec)
        parts.append('<td></td>' * 3 + '<td align="right">%s</td>' % web.commify(int(total_future_load)))
        parts.append('</tr></table>')

        rec_per_sec, count = read_book_count()
        parts.append("Total records per second: %.2f<br>" % rec_per_sec)
        day = web.commify(int(rec_per_sec * (60 * 60 * 24)))
        parts.append("Total records per day: %s<br>" % day)
        parts.append("Books in Open Library: " + web.commify(count) + "<br>")
        # The document must end with a closing tag, not a second opening one.
        parts.append('</body>\n</html>')
        return ''.join(parts)
Ejemplo n.º 2
0
    yield unescape(xdata)

# Output files, opened for writing in the current directory.  The loop that
# follows prints edition records to out_edition; the other three handles are
# presumably filled further down the script (not verified here).
out_edition, out_author, out_edition_work, out_work = (
    open(filename, 'w')
    for filename in (
        'edition_file',
        'author_file',
        'edition_work_file',
        'work_file',
    )
)
works, authors = [], {}
misc_fields = [
    'created',
    'modified',
    'last_modified',
    'latest_revision',
    'personal_name',
    'id',
    'revision',
    'type',
]
# Record counter and start timestamp used for the periodic progress report.
rec_no = 0
t0 = time()
for data in read_input():
    rec_no += 1
    if rec_no % 100000 == 0:
        t1 = time() - t0
        print("%s %.2f minutes" % (web.commify(rec_no), (float(t1) / 60.0)))
    try:
        d = json.loads(data)
    except:
        print(data)
        raise
    t = d['type']['key']
    k = d['key']
    if t == '/type/edition':
        m = re_edition_key.match(k)
        if not m:
            print('bad edition key:', k)
            print(data)
            continue
        print(data, file=out_edition)
        for w in d.get('works', []):
Ejemplo n.º 3
0
    yield unescape(xdata)

# Output files, opened for writing in the current directory.  The loop that
# follows prints edition records to out_edition; the other three handles are
# presumably filled further down the script (not verified here).
out_edition, out_author, out_edition_work, out_work = (
    open(filename, 'w')
    for filename in (
        'edition_file',
        'author_file',
        'edition_work_file',
        'work_file',
    )
)
works, authors = [], {}
misc_fields = [
    'created',
    'modified',
    'last_modified',
    'latest_revision',
    'personal_name',
    'id',
    'revision',
    'type',
]
# Record counter and start timestamp used for the periodic progress report.
rec_no = 0
t0 = time()
for data in read_input():
    rec_no += 1
    if rec_no % 100000 == 0:
        t1 = time() - t0
        print("%s %.2f minutes" % (web.commify(rec_no), (float(t1) / 60.0)))
    try:
        d = json.loads(data)
    except:
        print(data)
        raise
    t = d['type']['key']
    k = d['key']
    if t == '/type/edition':
        m = re_edition_key.match(k)
        if not m:
            print('bad edition key:', k)
            print(data)
            continue
        print(data, file=out_edition)
        for w in d.get('works', []):
Ejemplo n.º 4
0
    def GET(self):
        """Render the import status dashboard as an HTML page.

        Sets the ``Content-Type`` header and a 60-second ``Refresh`` header,
        lists the finished items from ``done``, then emits one table row per
        key returned by ``server_read('keys')`` that is neither finished nor
        ends in ``'2'``.  A row is coloured by the age of the item's last
        update (``q['time']``): red above 600 seconds, yellow above 120
        seconds, green otherwise.  Only green rows contribute to the rate
        figures in the "Total" row.

        Returns:
            The complete HTML document as a single string.
        """
        web.header('Content-Type', 'text/html; charset=utf-8', unique=True)
        web.header('Refresh', '60', unique=True)

        def _ratio(num, den):
            # An import that has only just started can report zero elapsed
            # time or zero records; show 0 instead of failing the whole page.
            return float(num) / den if den else 0.0

        done_links = ', '.join(
            '<a href="http://archive.org/details/%s">%s</a>' % (ia, ia)
            for ia in done
        )
        # Fragments are collected in a list and joined once at the end.
        parts = [
            "<html>\n<head><title>Import status</title>",
            "<style>th { vertical-align: bottom; text-align: left }</style>",
            "</head><body>",
            "<h1>Import status</h1>",
            "<b>Done:</b>",
            done_links + '<br>',
            "<table>",
            "<tr><th>Archive ID</th><th>input<br>(rec/sec)</th>",
            "<th>no match<br>(%)</th>",
            "<th>load<br>(rec/sec)</th>",
            # Every data row emits a running-hours cell, so the header needs
            # the matching column or all later columns are shifted by one.
            "<th>running<br>(hours)</th>",
            "<th>progress</th>",
            "<th>remaining<br>(hours)</th>",
            "<th>remaining<br>(records)</th>",
            "</tr>",
        ]

        cur_time = time()
        total_recs = 0
        total_load = 0
        total_rec_per_sec = 0
        total_load_per_sec = 0
        total_future_load = 0
        for k in server_read('keys'):
            # Keys ending in '2' are not shown; finished items are already
            # listed in the "Done" line above the table.
            if k.endswith('2') or k in done:
                continue

            q = server_read('store/' + k)
            rec_no = q['rec_no']
            load_count = q['load_count']
            elapsed = q['t1']
            rec_per_sec = _ratio(rec_no, elapsed)
            load_per_sec = _ratio(load_count, elapsed)
            # Fraction shown in the "no match" column: loaded / read.
            no_match = _ratio(load_count, rec_no)

            since_update = cur_time - q['time']
            if since_update > 600:
                colour = 'red'
            elif since_update > 120:
                colour = 'yellow'
            else:
                colour = '#00ff00'
                # Stale rows are left out of the totals.
                total_rec_per_sec += rec_per_sec
                total_load_per_sec += load_per_sec
                total_recs += rec_no
                total_load += load_count

            parts.append('<tr bgcolor="%s">' % colour)
            parts.append('<td><a href="http://archive.org/details/%s">%s</a></td>' % (k, k))
            parts.append('<td align="right">%.2f</td>' % rec_per_sec)
            parts.append('<td align="right">%.2f%%</td>' % (no_match * 100))
            parts.append('<td align="right">%.2f</td>' % load_per_sec)
            parts.append('<td align="right">%.2f</td>' % (elapsed / 3600.0))

            (pos, total) = progress(k, q['part'], q['pos'])
            # progress() may give no position or no total; both are needed
            # for a percentage and for the remaining-work estimate.
            have_progress = bool(pos and total)
            if have_progress:
                parts.append('<td align="right">%.2f%%</td>' % (float(pos * 100) / total))
            else:
                parts.append('<td align="right">n/a</td>')

            bytes_per_sec = q['bytes_per_sec_total'] if 'bytes_per_sec_total' in q else None
            if have_progress and bytes_per_sec and rec_no and elapsed:
                remaining_bytes = total - pos
                remaining_hours = remaining_bytes / float(bytes_per_sec) / 3600
                parts.append('<td align="right">%.2f</td>' % remaining_hours)
                # Average record size so far, used to turn the remaining
                # bytes into an expected number of records still to load.
                avg_bytes = bytes_per_sec * elapsed / float(rec_no)
                future_load = (remaining_bytes / avg_bytes) * no_match
                total_future_load += future_load
                parts.append('<td align="right">%s</td>' % web.commify(int(future_load)))
            else:
                # Two columns (remaining hours, remaining records) stay empty.
                parts.append('<td></td><td></td>')

            parts.append('</tr>')

        parts.append('<tr><td>Total</td><td align="right">%.2f</td>' % total_rec_per_sec)
        if total_recs:
            parts.append('<td align="right">%.2f%%</td>' % (float(total_load * 100.0) / total_recs))
        else:
            parts.append('<td align="right"></td>')
        parts.append('<td align="right">%.2f</td>' % total_load_per_sec)
        parts.append('<td></td>' * 3 + '<td align="right">%s</td>' % web.commify(int(total_future_load)))
        parts.append('</tr></table>')

        rec_per_sec, count = read_book_count()
        parts.append("Total records per second: %.2f<br>" % rec_per_sec)
        day = web.commify(int(rec_per_sec * (60 * 60 * 24)))
        parts.append("Total records per day: %s<br>" % day)
        parts.append("Books in Open Library: " + web.commify(count) + "<br>")
        # The document must end with a closing tag, not a second opening one.
        parts.append('</body>\n</html>')
        return ''.join(parts)
Ejemplo n.º 5
0
class Dollars(Integer):
    """Integer subtype whose display form is a dollar amount."""

    def display(self, x):
        """Return ``web.commify(x)`` prefixed with a dollar sign."""
        return '$' + web.commify(x)
Ejemplo n.º 6
0
class Number(Integer):
    """Integer subtype whose display form is passed through ``web.commify``."""

    def display(self, x):
        """Return ``web.commify(x)``."""
        return web.commify(x)