def save_a_bunch(count=1000):
    """Fetch disambiguation pages, extract their Dabblets and save everything.

    Initializes the 'abunch' database, asks get_dab_page_ids() for page ids
    (forwarding `count`), fetches the articles, extracts Dabblets from every
    page, saves them, then fetches and saves the choices for all Dabblets.
    Timing and summary statistics are printed to stdout.

    Returns the list of Dabblet objects that were extracted and saved.
    """
    import time

    db_name = 'abunch'
    dabase.init(db_name)

    start = time.time()
    page_ids = get_dab_page_ids(count=count)
    pages = green_call_list(get_articles, page_ids)
    # Flatten in a single pass: sum(list_of_lists, []) copies the growing
    # accumulator once per page, which is quadratic in the result size.
    dabblets = [dab for page in pages for dab in get_dabblets(page)]

    # TODO start transaction
    for d in dabblets:
        d.save()
    all_choices = green_call_list(get_dab_choices, dabblets)
    for c in all_choices:
        c.save()
    # TODO end transaction
    end = time.time()

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('%s Dabblets saved to %s in %s seconds'
          % (len(dabblets), db_name, end - start))
    print('%s unique titles' % len(set(d.title for d in dabblets)))
    print('%s unique source pages'
          % len(set(d.source_title for d in dabblets)))
    print('%s dabblet choices fetched and saved.' % len(all_choices))

    print('%s total records in database' % Dabblet.select().count())
    print('%s unique titles in database'
          % len(set(d.title for d in Dabblet.select())))

    return dabblets
def get_dabblet():
    """Return the JSON dict of a single Dabblet.

    When the request has a non-empty `id` query parameter, the Dabblet with
    that integer id is looked up; int() raises ValueError if the value is
    not numeric.  Otherwise the first Dabblet ordered by `priority` is
    returned.
    """
    dab_id = request.GET.get('id')
    if dab_id:
        ret = Dabblet.get(id=int(dab_id))
    else:
        # limit() takes a row count, so pass an integer, not the string '1'.
        ret = Dabblet.select().order_by('priority').limit(1).get()
    return ret.jsondict
def solve_dabblet():
    """Store a DabSolution for the posted dabblet and return running totals.

    Reads `dabblet_id` and `choice_id` from the POST data.  A negative
    `choice_id` stores the solution with no choice attached.  The response
    dict holds the total number of stored solutions (`view_count`) and that
    total minus the ones without a choice (`solution_count`).
    """
    #TODO: decorator-ify session
    session = get_session(request.get_cookie('session_id'))
    response.set_cookie('session_id', session['id'])

    # Parse both ids before touching the database, as the original did.
    dab_pk = int(request.POST['dabblet_id'])
    choice_pk = int(request.POST['choice_id'])

    dabblet = Dabblet.get(id=dab_pk)
    choice = DabChoice.get(id=choice_pk) if choice_pk >= 0 else None

    solution = DabSolution(
        dabblet=dabblet,
        choice=choice,
        solver_ip=request.get('REMOTE_ADDR'),
        solver_index=session.get('cur_index', 0),
        date_solved=datetime.now(),
    )
    solution.save()  # replace?

    total = DabSolution.select().count()
    passed = DabSolution.select().where(choice_id=None).count()
    return {"view_count": total, "solution_count": total - passed}
def get_dabblets(parsed_page):
    """Extract Dabblets from a parsed page.

    Call with a Page object, the type you'd get from get_articles().

    The page's `revisiontext` is parsed with pq().  For every span containing
    "disambiguation needed", the element preceding the span's first `sup`
    ancestor is inspected; when it is an `a` link, a Dabblet is built from
    the link title, its context HTML, the marker's index on the page and the
    first three thumbnail image sources found on the page.

    Markers that cannot be processed are reported on stdout and skipped.

    Returns a list of Dabblet objects (possibly empty).
    """
    ret = []
    d = pq(parsed_page.revisiontext)

    # Keep at most the first three thumbnails that actually have a src.
    images_found = [img.attrib['src'] for img in d('img.thumbimage')
                    if img.attrib.get('src')][:3]

    dab_link_markers = d('span:contains("disambiguation needed")')
    for i, dlm in enumerate(dab_link_markers):
        try:
            dab_link = d(dlm).parents("sup")[0].getprevious()  # TODO: remove extra d?
            dab_link = d(dab_link)
            if dab_link.is_('a'):
                dab_title = dab_link.attr('title')
                context = get_context(dab_link)
                ctx_html = context.outerHtml()
                ret.append(Dabblet.from_page(title=dab_title,
                                             context=ctx_html,
                                             source_page=parsed_page,
                                             source_order=i,
                                             source_imgs=images_found))
        except Exception as e:
            # Deliberately best-effort: one malformed marker (e.g. no `sup`
            # ancestor) must not abort extraction for the rest of the page.
            print('nope %s' % (e,))
    return ret
def next_dabblet():
    """Return the session's current dabblet and advance the session cursor.

    The session's `cur_index` (0 when absent) selects an id from the
    session's `seq`; the stored index is then incremented.  An index past
    the end of `seq` raises IndexError, which is not handled here.

    Returns a dict with the index that was served (`cur_index`), the length
    of the sequence (`total`) and the Dabblet's JSON dict (`dabblet`).
    """
    session = get_session(request.get_cookie('session_id'))
    response.set_cookie('session_id', session['id'])

    position = session.get('cur_index', 0)
    dabblet_id = session['seq'][position]
    session['cur_index'] = position + 1

    payload = {}
    payload['cur_index'] = position
    payload['total'] = len(session['seq'])
    payload['dabblet'] = Dabblet.get(id=dabblet_id).jsondict
    return payload
def get_random_dabblet():
    """Return the JSON dicts of at most two Dabblets, ordered by RANDOM().

    The result is a dict with a single key, 'dabs', holding the list.
    """
    query = Dabblet.select().order_by("RANDOM()").limit(2)
    dabs = []
    for dab in query:
        dabs.append(dab.jsondict)
    return {'dabs': dabs}
choice=choice,
                      solver_ip=request.get('REMOTE_ADDR'),
                      solver_index=session.get('cur_index', 0),
                      date_solved=datetime.now())
    sol.save() # replace?
    view_count = DabSolution.select().count()
    pass_count = DabSolution.select().where(choice_id=None).count()
    return { "view_count": view_count, "solution_count": view_count-pass_count }


@route('/random/')
def get_random_dabblet():
    """Return at most two Dabblets ordered by RANDOM() under the 'dabs' key.

    Each entry is the Dabblet's `jsondict`.
    """
    rdabs = Dabblet.select().order_by("RANDOM()").limit(2)
    return { 'dabs': [ d.jsondict for d in rdabs ] }


class SlashMiddleware(object):
    """Wrap an app and force PATH_INFO to end with exactly one slash.

    Presumably `app` is a WSGI application -- TODO confirm.
    """

    def __init__(self, app):
        """Remember the wrapped application."""
        self.app = app

    def __call__(self, e, h):
        """Normalize e['PATH_INFO'] and delegate to the wrapped app.

        `e` must be a mapping with a 'PATH_INFO' string; `h` is passed
        through untouched (presumably WSGI environ / start_response).
        """
        # Strip every trailing slash, then append one: '/a' and '/a//'
        # both become '/a/'.
        e['PATH_INFO'] = e['PATH_INFO'].rstrip('/')+'/'
        return self.app(e,h)


if __name__ == '__main__':
    # Script entry point: open the 'abunch' database before serving.
    dabase.init('abunch')
    # (id, priority) pairs for every Dabblet, loaded once at startup.
    ALL_DABBLETS = [ (d.id, d.priority) for d in Dabblet.select(['id','priority']) ]
    # Serve the default bottle app behind the trailing-slash middleware.
    app = SlashMiddleware(bottle.app())
    run(app=app, host='0.0.0.0', port=8080, server='gevent')
def save_a_bunch(count=DEFAULT_LIMIT, category=DEFAULT_CAT, concurrency=DEFAULT_CONC,
                 per_call=DEFAULT_PER_CALL, db_name=DEFAULT_DB):
    """Fetch, save and rank Dabblets (with images and choices) for a category.

    Steps, each reported on stdout with a progress meter and/or timing:
      1. get page ids via get_dab_page_ids(category, count);
      2. fetch articles in chunks of `per_call` with `concurrency` workers
         and extract Dabblets from every page;
      3. save each Dabblet and one DabImage per entry in its `source_imgs`;
      4. fetch the choices for all Dabblets (same chunking) and save them;
      5. set each Dabblet's `priority` from get_priority() and save again.

    NOTE(review): `db_name` is accepted but never used in this function;
    presumably the caller initializes the database -- confirm.
    NOTE(review): 'Committing...' is printed but no commit call is visible
    here -- confirm where the commit happens.

    Returns the list of Dabblet objects that were saved.
    """
    import time

    # Single-argument print(...) emits the same text on Python 2 and 3.
    page_ids = get_dab_page_ids(category, count)

    dabblets = []
    dpm = ProgressMeter(total=len(page_ids), unit="articles", ticks=30)
    for pages in chunked_pimap(get_articles, page_ids,
                               concurrency=concurrency,
                               chunk_size=per_call):
        for p in pages:
            dpm.update(1)
            dabblets.extend(get_dabblets(p))
    print('')

    print('Saving %s dabblets.' % len(dabblets))
    dspm = ProgressMeter(total=len(dabblets), unit="dabblets", ticks=30)
    dsave_start = time.time()
    for d in dabblets:
        d.save()
        for img in d.source_imgs:
            dab_img = DabImage(dabblet=d, src=img)
            dab_img.save()
        dspm.update(1)
    print('')
    print('Done saving %s Dabblets. ( %s seconds)'
          % (len(dabblets), time.time() - dsave_start))

    print('Processing choices for %s Dabblets.' % len(dabblets))
    # This loop walks dabblets, so the meter is sized by them, not by pages.
    cpm = ProgressMeter(total=len(dabblets), unit="Dabblets", ticks=30)
    all_choices = []
    remaining = len(dabblets)
    for choices in chunked_pimap(get_dab_choices, dabblets,
                                 concurrency=concurrency,
                                 chunk_size=per_call):
        # The last chunk may hold fewer than per_call dabblets; clamp the
        # step so the meter never runs past its total.
        step = min(per_call, remaining)
        if step > 0:
            cpm.update(step)
            remaining -= step
        all_choices.extend(choices)
    print('')

    print('Saving %s DabChoices.' % len(all_choices))
    cspm = ProgressMeter(total=len(all_choices), unit="DabChoices", ticks=30)
    csave_start = time.time()
    for c in all_choices:
        c.save()
        cspm.update(1)
    print('Done saving %s DabChoices. ( %s seconds)'
          % (len(all_choices), time.time() - csave_start))

    drank_start = time.time()
    print('Ranking %s Dabblets.' % len(dabblets))
    for d in dabblets:
        d.priority = d.get_priority()
        d.save()
    print('Done ranking %s Dabblets. ( %s seconds)'
          % (len(dabblets), time.time() - drank_start))

    print('%s unique titles' % len(set(d.title for d in dabblets)))
    print('%s unique source pages'
          % len(set(d.source_title for d in dabblets)))
    print('%s dabblet choices fetched and saved.' % len(all_choices))

    print('%s total records in database' % Dabblet.select().count())
    print('%s unique titles in database'
          % len(set(d.title for d in Dabblet.select())))

    print('Committing...')
    return dabblets