def index(request):
    """Render the search page.

    Reads ``q`` (query string) and ``p`` (page number, minimum 1) from the
    GET parameters, runs the ElasticSearch query restricted to the selected
    countries, and fills the context with results, pager links and the
    country-checkbox form.
    """
    k = 50  # results per page
    context = {}
    boxes = CountryCheckboxes(
        initial={'country': [cty[0] for cty in get_country_list()]})
    if 'q' in request.GET:
        q = request.GET['q'].encode('utf-8')
        p = 1
        if 'p' in request.GET:
            # BUG FIX: was min(1, ...), which clamped every request to the
            # first page and made pagination a no-op; the page must simply
            # never drop below 1, so use max().
            p = max(1, int(request.GET['p']))
        es = ESController()
        c = get_selected_country_list(request)
        context.update(es.search(q, c, k, p))
        context.update({
            'linklist': pager(k, int(context['result_count']), p, 10),
            'last_search': q,
            'last_search_url': str(reverse('search.views.index'))
                               + '?q=' + str(urllib.quote(q)),
            'page': p + 1,
        })
        if c:
            # A country filter is active: flag it and pre-check those boxes.
            context.update({'custom': True})
            boxes = CountryCheckboxes(initial={'country': c})
    else:
        q = ''
    context.update({'form': boxes})
    return render(request, 'search/index.html', context)
def sites(request):
    """List every Site with its DB/index document counts, plus a per-owner
    scoreboard of site and indexed-document totals.
    """
    sites = Site.objects.all()
    site_list = []
    es = ESController()
    try:
        site_doc_count = es.get_document_count_by_site()
    except Exception:
        # Best effort: ES may be unreachable; fall back to the DB-derived
        # scoreboard computed below.
        site_doc_count = {}
    owners = config_file.get_config().get('bblio', 'owners').split(';')
    scoreboard = {}
    for o in owners:
        if o != '':
            scoreboard.update({o: {'doc': 0, 'site': 0}})
    for site in sites:
        s = model_to_dict(site)
        s.update({'doc_count': Document.objects.filter(site_id=site.id).count()})
        s.update({'zero_count': Document.objects.filter(site_id=site.id).filter(isUsed=0).count()})
        try:
            doc = site_doc_count[site.id]
            s.update({'index_count': doc})
            if site.owner:
                scoreboard[site.owner]['site'] += 1
                scoreboard[site.owner]['doc'] += doc
        except KeyError:
            # Site missing from the ES counts, or its owner is not listed
            # in the config; report a zero index count for this site.
            s.update({'index_count': 0})
        site_list.append(s)
    if not site_doc_count:
        # ES was down: approximate the scoreboard from unused-document
        # counts grouped by site owner.
        d = Document.objects.filter(isUsed=0).values('site__owner').annotate(zero_count=Count('site__owner'))
        scoreboard = d
    context = {'sites': site_list, 'score': scoreboard}
    return render(request, 'operations/sites.html', context)
def site(request, site_id):
    """Display and edit a single Site; ``site_id == '0'`` means a new site.

    On POST, validates and saves the form, optionally kicking off a crawl;
    otherwise (and after an edit) renders the site detail page with its
    document listing and, best-effort, its ES index count and crawler info.
    """
    context = {}
    site = None
    if site_id != '0':
        site = Site.objects.get(pk=site_id)
        # Refresh the "running" flag from the scraper's job queue.
        # (Was two separate calls with identical branch bodies — hoisted
        # into one call and merged the conditions.)
        jobs = scraper.scrapeController.get_jobs_for_site(site_id)
        if not jobs or jobs == 'finished':
            site.running = 0
            site.save(update_fields=['running'])
    if request.method == 'POST':
        site_form = SiteForm(request.POST, instance=site)
        if site_form.is_valid():
            new_site = site_form.save()
            if 'crawl' in request.POST:
                if request.POST['crawl'] == 'yes':
                    crawl(request, new_site.id)
            if site_id == '0':
                # Newly created: jump to the canonical URL for the new id.
                return HttpResponseRedirect(reverse('site', kwargs={'site_id': new_site.id}))
        else:
            return HttpResponse('Error fields: ' + str(site_form.errors))
    if site_id != '0':
        es = ESController()
        # Re-fetch so the page reflects any edit saved above.
        site = Site.objects.get(pk=site_id)
        d = (Document.objects.filter(site_id=site_id)
             .values('id', 'urlAddress', 'isUsed')
             .order_by('isUsed', 'urlAddress'))
        context.update({
            'doc_count': d.count(),
            'zero_count': Document.objects.filter(site_id=site_id).filter(isUsed=0).count(),
            'docs': d,
            'running': site.running})
        try:
            context.update({'index_count': es.get_document_count_for_site_id(site_id)})
        except Exception:
            pass  # ES unreachable; omit the index count.
        try:
            context.update({
                'jobid': site.jobid,
                'instance_ip': aws.ec2.getInstanceFromInstanceName(site.instance).ip_address})
        except Exception:
            pass  # No crawler instance associated; omit job info.
    site_form = SiteForm(instance=site)
    context.update({
        'site_id': site_id,
        'site_form': site_form})
    return render(request, 'operations/site.html', context)
def document(request, doc_id):
    """Show one Document: its stored HTML (escaped, one <code> element per
    source line) together with the parsed text and title.
    """
    # Removed unused local `import HTMLParser` — nothing in the body used it.
    es = ESController()
    doc = Document.objects.get(pk=doc_id)
    # Escape the raw HTML so it displays as text, then wrap every line in
    # its own <code> element for the template.
    escaped = cgi.escape(es.get_body_html(doc.document_html))
    context = {
        'html': '<code>' + re.sub('\n', '</code>\n<code>', escaped) + '</code>',
        'parsed_text': es.text_parse(doc),
        'parsed_title': es.title_parse(doc.document_html),
    }
    return render(request, 'operations/document.html', context)
def navbar_inclusion():
    """Build the context for the operations navbar: total indexed document
    count, the list of crawler instances, and the unused-document count.
    """
    crawler_links = []
    for instance in aws.ec2.getCrawlerInstances():
        crawler_links.append({'url': instance.ip_address, 'name': instance.id})
    return {
        'es_count': ESController().get_document_count(),
        'crawlers': crawler_links,
        'zero_count': Document.objects.filter(isUsed=0).count(),
    }
def es_remove_site_from_index(request, site_id):
    """Delete every indexed document for ``site_id`` from ElasticSearch,
    then redirect back to the referring page.
    """
    es = ESController()
    es.delete_site_id_from_es(site_id)
    # HTTP_REFERER is an optional header: redirecting to None raises, so
    # fall back to the site root when it is absent.
    return HttpResponseRedirect(request.META.get('HTTP_REFERER') or '/')
def index_process(site_id):
    """Push every document belonging to the given site id into the
    ElasticSearch index.
    """
    ESController().index_site_id(site_id)