def save_a_bunch(count=1000):
    import time

    db_name = 'abunch'
    dabase.init(db_name)

    start = time.time()

    page_ids = get_dab_page_ids(count=count)

    pages    = green_call_list(get_articles, page_ids)
    dabblets = sum([ get_dabblets(p) for p in pages ], [])

    # TODO start transaction
    for d in dabblets:
        d.save()

    all_choices = green_call_list(get_dab_choices, dabblets)

    for c in all_choices:
        c.save()
    # TODO end transaction 
    end = time.time()

    print len(dabblets), 'Dabblets saved to', db_name, 'in', end-start, 'seconds'
    print len(set([d.title for d in dabblets])), 'unique titles'
    print len(set([d.source_title for d in dabblets])), 'unique source pages'
    print len(all_choices), 'dabblet choices fetched and saved.'

    print Dabblet.select().count(), 'total records in database'
    print len(set([d.title for d in Dabblet.select()])), 'unique titles in database'

    return dabblets
Exemple #2
0
def get_dabblet():
    """Return the `jsondict` of one Dabblet.

    When the request carries a non-empty `id` query parameter, the Dabblet
    with that integer id is looked up; otherwise the first Dabblet ordered
    by 'priority' is used.
    """
    requested_id = request.GET.get('id')
    if not requested_id:
        # No explicit id: fall back to the first row by priority.
        by_priority = Dabblet.select().order_by('priority')
        return by_priority.limit('1').get().jsondict

    return Dabblet.get(id=int(requested_id)).jsondict
Exemple #3
0
def solve_dabblet():
    """Record a DabSolution for the posted dabblet and return solution counts.

    Reads `dabblet_id` and `choice_id` from the POST data. A negative
    `choice_id` stores the solution with `choice=None`. Returns a dict with
    the total number of DabSolutions ("view_count") and that total minus the
    ones whose choice_id is None ("solution_count").
    """
    # TODO: decorator-ify session
    session = get_session(request.get_cookie('session_id'))
    response.set_cookie('session_id', session['id'])

    dabblet_id = int(request.POST['dabblet_id'])
    choice_id = int(request.POST['choice_id'])

    dabblet = Dabblet.get(id=dabblet_id)
    choice = None if choice_id < 0 else DabChoice.get(id=choice_id)

    solution = DabSolution(
        dabblet=dabblet,
        choice=choice,
        solver_ip=request.get('REMOTE_ADDR'),
        solver_index=session.get('cur_index', 0),
        date_solved=datetime.now(),
    )
    solution.save()
    # replace?

    total = DabSolution.select().count()
    passed = DabSolution.select().where(choice_id=None).count()
    return {"view_count": total, "solution_count": total - passed}
Exemple #4
0
def get_dabblets(parsed_page):
    "Call with a Page object, the type you'd get from get_articles()"
    ret = []
    d = pq(parsed_page.revisiontext)
    page_title = parsed_page.title

    images_found = [img.attrib['src'] 
                    for img in d('img.thumbimage')
                    if img.attrib.get('src')][:3]

    dab_link_markers = d('span:contains("disambiguation needed")')
    for i, dlm in enumerate(dab_link_markers):
        try:
            dab_link = d(dlm).parents("sup")[0].getprevious() # TODO: remove extra d?
            dab_link = d(dab_link)
            if dab_link.is_('a'):
                dab_title = dab_link.attr('title')
                context = get_context(dab_link)
                ctx_html = context.outerHtml()
                ret.append( Dabblet.from_page(title        = dab_title, 
                                              context      = ctx_html, 
                                              source_page  = parsed_page, 
                                              source_order = i,
                                              source_imgs  = images_found))
        except Exception as e:
            print 'nope', e
            pass
            
    return ret
Exemple #5
0
def next_dabblet():
    """Return the Dabblet at the session's current position and advance it.

    The session's 'seq' holds Dabblet ids and 'cur_index' (default 0) is the
    position to serve. The returned dict contains the served position, the
    length of the sequence and the Dabblet's `jsondict`.
    """
    session = get_session(request.get_cookie('session_id'))
    response.set_cookie('session_id', session['id'])

    position = session.get('cur_index', 0)
    dabblet_id = session['seq'][position]
    # Advance only after the lookup above succeeded.
    session['cur_index'] = position + 1

    result = {}
    result['cur_index'] = position
    result['total'] = len(session['seq'])
    result['dabblet'] = Dabblet.get(id=dabblet_id).jsondict
    return result
Exemple #6
0
def get_random_dabblet():
    """Return up to two Dabblets in random order as {'dabs': [jsondict, ...]}."""
    random_query = Dabblet.select().order_by("RANDOM()").limit(2)
    payload = []
    for dab in random_query:
        payload.append(dab.jsondict)
    return {'dabs': payload}
Exemple #7
0
                      choice=choice,
                      solver_ip=request.get('REMOTE_ADDR'),
                      solver_index=session.get('cur_index', 0),
                      date_solved=datetime.now())
    sol.save()
    # replace?

    view_count = DabSolution.select().count()
    pass_count = DabSolution.select().where(choice_id=None).count()
    return { "view_count": view_count,
             "solution_count": view_count-pass_count }
    
@route('/random/')
def get_random_dabblet():
    """Serve '/random/': up to two randomly ordered Dabblets.

    Returns a dict whose 'dabs' key holds each selected Dabblet's `jsondict`.
    """
    return {
        'dabs': [
            dab.jsondict
            for dab in Dabblet.select().order_by("RANDOM()").limit(2)
        ],
    }

class SlashMiddleware(object):
    """WSGI middleware that forces PATH_INFO to end in exactly one slash.

    Any run of trailing slashes is collapsed to a single one, and a path
    without a trailing slash gets one appended, before the request is
    handed to the wrapped application.
    """

    def __init__(self, app):
        """Wrap `app`, the WSGI callable that receives the rewritten request."""
        self.app = app

    def __call__(self, e, h):
        """Normalize PATH_INFO in environ `e`, then call the wrapped app.

        `e` is modified in place; `h` (start_response) is passed through
        untouched. Returns whatever the wrapped app returns.
        """
        # WSGI servers may omit PATH_INFO when it is empty, so do not assume
        # the key exists.
        e['PATH_INFO'] = e.get('PATH_INFO', '').rstrip('/') + '/'
        return self.app(e, h)

if __name__ == '__main__':
    # Script entry point: open the 'abunch' database, snapshot every
    # Dabblet's (id, priority) pair, then serve the app through gevent.
    dabase.init('abunch')

    ALL_DABBLETS = [
        (d.id, d.priority)
        for d in Dabblet.select(['id', 'priority'])
    ]

    # Wrap the default bottle app so every request path ends with a slash.
    app = SlashMiddleware(bottle.app())
    run(
        app=app,
        host='0.0.0.0',
        port=8080,
        server='gevent',
    )
Exemple #8
0
def save_a_bunch(count=DEFAULT_LIMIT, category=DEFAULT_CAT, concurrency=DEFAULT_CONC, 
                 per_call=DEFAULT_PER_CALL, db_name=DEFAULT_DB):
    import time

    page_ids = get_dab_page_ids(category, count)

    dabblets = []
    dpm = ProgressMeter(total=len(page_ids), unit="articles", ticks=30)
    for pages in chunked_pimap(get_articles, page_ids,
                               concurrency=concurrency,
                               chunk_size=per_call):
        for p in pages:
            dpm.update(1)
            cur_dabs = get_dabblets(p)
            dabblets.extend(cur_dabs)
    
    print
    print 'Saving', len(dabblets), 'dabblets.'
    dspm = ProgressMeter(total=len(dabblets), unit="dabblets", ticks=30)
    dsave_start = time.time()
    for d in dabblets:
        d.save()
        for img in d.source_imgs:
            dab_img = DabImage(dabblet=d, src=img)
            dab_img.save()
        dspm.update(1)
    print
    print 'Done saving', len(dabblets), 'Dabblets. (', time.time()-dsave_start,'seconds)'

    print 'Processing choices for', len(dabblets), 'Dabblets.'
    cpm = ProgressMeter(total=len(page_ids), unit="Dabblets", ticks=30)
    all_choices = []
    for choices in chunked_pimap(get_dab_choices, dabblets,
                                 concurrency=concurrency,
                                 chunk_size=per_call):
        cpm.update(per_call)
        all_choices.extend(choices)
    
    print
    print 'Saving', len(all_choices), 'DabChoices.'
    cspm = ProgressMeter(total=len(all_choices), unit="DabChoices", ticks=30)
    csave_start = time.time()
    for c in all_choices:
        c.save()
        cspm.update(1)
    print 'Done saving', len(dabblets), 'DabChoices. (', time.time()-csave_start,'seconds)'

    drank_start = time.time()
    print 'Ranking', len(dabblets), 'Dabblets.'
    for d in dabblets:
        d.priority = d.get_priority()
        d.save()
    print 'Done ranking', len(dabblets), 'DabChoices. (', time.time()-drank_start,'seconds)'

    print len(set([d.title for d in dabblets])), 'unique titles'
    print len(set([d.source_title for d in dabblets])), 'unique source pages'
    print len(all_choices), 'dabblet choices fetched and saved.'

    print Dabblet.select().count(), 'total records in database'
    print len(set([d.title for d in Dabblet.select()])), 'unique titles in database'

    print 'Committing...'
    return dabblets