def run_deny_params(site_id):
    # Build the deny-pattern list for the site, stripping the "r'...'" wrapper
    # that stored patterns may carry.
    denys = Site.objects.get(pk=site_id).deny_parameters.split(';')
    for i, d in enumerate(denys):
        if d.startswith("r'") and d.endswith("'"):
            denys[i] = d[2:-1]
    denys = [i for i in denys if i != '']

    # Add the deny patterns that apply to every site.
    universal_deny = config_file.get_config().get('bblio', 'universal_deny').split(';')
    universal_deny = [i for i in universal_deny if i != '']
    denys.extend(universal_deny)

    # Build a host regex from the allowed domains; an empty setting matches everything.
    host_regex = None
    source_allowed_domains = Site.objects.get(pk=site_id).source_allowed_domains
    if not source_allowed_domains:
        host_regex = re.compile('')
    else:
        regex = r'^(.*\.)?(%s)$' % '|'.join(re.escape(d) for d in source_allowed_domains.split(';'))
        host_regex = re.compile(regex)

    # Mark documents that match a deny pattern or fall outside the allowed hosts.
    for doc in Document.objects.filter(site_id=site_id):
        if any(re.search(d, doc.urlAddress) for d in denys if d is not None):
            doc.isUsed = 6
            doc.save()
        hostname = urlparse(doc.urlAddress).hostname or ''
        if not host_regex.search(hostname):
            doc.isUsed = 6
            doc.save()

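# Hedged sketch (hypothetical values, not a real Site record): how a stored
# deny_parameters string is split and unwrapped into plain regex strings,
# mirroring the cleanup loop in run_deny_params() above.
deny_parameters = "r'/login';r'\?page=\d+';"        # assumed example value
denys = [d[2:-1] if d.startswith("r'") and d.endswith("'") else d
         for d in deny_parameters.split(';') if d]
# denys == ['/login', '\\?page=\\d+'] -- ready for re.search() against a URL
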
def main():
    os.system("touch /tmp/debug")
    config = get_config()
    # Create the Agent and pass all the configuration to it, then run it.
    agent_class = globals()[config.agent.name]
    agent = agent_class(config)
    agent.run(mode='train')

def sites(request):
    sites = Site.objects.all()
    site_list = []
    es = ESController()
    try:
        site_doc_count = es.get_document_count_by_site()
    except Exception:
        site_doc_count = {}

    # Seed the per-owner scoreboard from the configured owner list.
    owners = config_file.get_config().get('bblio', 'owners').split(';')
    scoreboard = {}
    for o in owners:
        if o != '':
            scoreboard.update({o: {'doc': 0, 'site': 0}})

    for site in sites:
        s = model_to_dict(site)
        s.update({'doc_count': Document.objects.filter(site_id=site.id).count()})
        s.update({'zero_count': Document.objects.filter(site_id=site.id).filter(isUsed=0).count()})
        try:
            doc = site_doc_count[site.id]
            s.update({'index_count': doc})
            if site.owner:
                scoreboard[site.owner]['site'] += 1
                scoreboard[site.owner]['doc'] += doc
        except Exception:
            s.update({'index_count': 0})
        site_list.append(s)

    # Fall back to database counts when Elasticsearch returned nothing.
    if not site_doc_count:
        d = Document.objects.filter(isUsed=0).values('site__owner').annotate(zero_count=Count('site__owner'))
        scoreboard = d

    context = {'sites': site_list, 'score': scoreboard}
    return render(request, 'operations/sites.html', context)

def index(request):
    config = config_file.get_config()
    if request.POST:
        for r in request.POST:
            config.set('bblio', r, str(request.POST[r]))
        config_file.set_config(config)

    # Make sure the 'bblio' section exists before reading from it.
    try:
        config.items('bblio')
    except Exception:
        config.add_section('bblio')

    # Pre-fill the form from the config, creating any missing keys as empty values.
    form = ConfigForm()
    for f in form.fields:
        try:
            form.fields[f].initial = config.get('bblio', str(f))
        except Exception:
            config.set('bblio', f, '')
            config_file.set_config(config)

    context = {'form': form}
    return render(request, 'operations/admin.html', context)

def crawl(request, site_id):
    import time
    site = Site.objects.get(pk=site_id)
    if not site.instance:
        return HttpResponse("No working instance selected")
    if scraper.scrapeController.get_jobs_for_site(site_id) == 'Running':
        return HttpResponse("Crawler is running")

    # Count the jobs already queued on the chosen worker instance.
    try:
        c_dict = scraper.scrapeController.get_job_status_count_for_instance(site.instance)
        count = int(c_dict['running']) + int(c_dict['pending'])
    except Exception:
        count = 0

    # Schedule the crawl only if the instance is below its configured job limit.
    if count < int(config_file.get_config().get('bblio', 'crawler_instance_site_limit')):
        ret = scraper.scrapeController.curl_schedule_crawl(site_id, site.instance)
        if 'jobid' in ret:
            site.jobid = ret['jobid']
            site.save()
        else:
            return HttpResponse("Error when scheduling job")
    else:
        return HttpResponse("Worker instance is full. Please try another instance")

    site.running = 2
    site.save()
    time.sleep(3)
    return HttpResponseRedirect(reverse('site', kwargs={'site_id': site_id}))

def getCrawlerInstances():
    crawler_list = []
    for c in [getInstance(i) for i in get_config().get('bblio', 'crawler_instance').split(';')]:
        if c.ip_address:
            crawler_list.append(c)
    return crawler_list

def retrieve_params_from_each_file(self, list):
    """Iterate through the list and retrieve params for each file."""
    # List that contains all params related to LEAVES and TREES' values.
    list_of_files_param = []
    for i in list:
        # Append each file's params.
        list_of_files_param.append(
            get_config(self.map_of_files[int(i[0])], self.entry))
    return list_of_files_param

def main():
    os.system("touch /tmp/debug")
    config = get_config()
    # Create the Agent and pass all the configuration to it, then run it.
    agent_class = globals()[config.agent.name]
    agent = agent_class(config)
    # fn_path is expected to be defined in the surrounding module; kept as in the original.
    agent.test_one_file((fn_path, None))
    note_fn_list = agent.data_source.test_loader.dataset.note_fn_list
    audio_fn_list = agent.data_source.test_loader.dataset.audio_fn_list
    for pair in zip(audio_fn_list, note_fn_list):
        acc = agent.test_one_file(pair)
        print(pair, acc)

def getCrawlerInstances():
    crawler_list = []
    for c in [getInstance(i) for i in get_config().get('bblio', 'crawler_instance').split(';')]:
        if c is None:
            continue
        try:
            if c.ip_address:
                crawler_list.append(c)
        except AttributeError:
            pass
    return crawler_list

def __init__(self, *args, **kwargs):
    super(SiteForm, self).__init__(*args, **kwargs)
    running_limit = config_file.get_config().get('bblio', 'crawler_instance_site_limit')
    instance_list = []
    grouping_list = [('works', 'works'), ('works dirty', 'works dirty'), ('WIP', 'WIP'),
                     ('error', 'error'), ('condemned', 'condemned'), ('start', 'start'),
                     ('test', 'test')]

    # Build the instance dropdown, showing each worker's current load against the limit.
    try:
        for i in getCrawlerInstances():
            status_counts = get_job_status_count_for_instance(i.id)
            count = int(status_counts['pending']) + int(status_counts['running'])
            instance_list.append({'name': i.id,
                                  'choice_name': i.id + ' ' + str(count) + '/' + str(running_limit)})
    except Exception:
        pass
    instance_list.append({'name': '', 'choice_name': ''})
    instance_choices = ((i['name'], i['choice_name']) for i in instance_list)

    self.fields['instance'].widget = Select(attrs={'class': 'form-control input-sm'}, choices=instance_choices)
    self.fields['jurisdiction'].widget = Select(attrs={'class': 'form-control input-sm'}, choices=get_country_list())
    self.fields['owner'].widget = Select(attrs={'class': 'form-control input-sm'},
                                         choices=[(o, o) for o in config_file.get_config().get('bblio', 'owners').split(';')])
    self.fields['grouping'].widget = Select(attrs={'class': 'form-control input-sm'},
                                            choices=[(g, g) for g in config_file.get_config().get('bblio', 'grouping').split(';')])

def getCrawlerInstance():
    instance_id = get_config().get('bblio', 'crawler_instance').split(';')[0]
    return conn().get_all_instances(instance_ids=instance_id)[0].instances[0]

def __init__(self, *a, **kw):
    self.follow = []
    self.parsing = []
    self.deny = []
    site = None

    # Crawl parameters come either from a stored Site record (by id) or directly from kwargs.
    if 'id' in kw:
        self.id = kw['id']
        site = Site.objects.get(pk=self.id)
        self.start_urls = site.source_start_urls.split(';')
        self.allowed_domains = site.source_allowed_domains.split(';')
        if site.parse_parameters:
            self.parsing = site.parse_parameters.strip().encode('utf-8').split(";")
        if site.follow_parameters:
            self.follow = site.follow_parameters.strip().encode('utf-8').split(";")
        if site.deny_parameters:
            self.deny = site.deny_parameters.strip().encode('utf-8').split(";")
    else:
        self.allowed_domains = kw['source_allowed_domains'].split(';')
        self.start_urls = kw['source_start_urls'].split(';')
        if kw['parse_parameters']:
            self.parsing = kw['parse_parameters'].strip().encode('utf-8').split(';')
        if kw['follow_parameters']:
            self.follow = kw['follow_parameters'].strip().encode('utf-8').split(';')
        if kw['deny_parameters']:
            self.deny = kw['deny_parameters'].strip().encode('utf-8').split(';')

    self.parsing = [i for i in self.parsing if i != '']
    self.follow = [i for i in self.follow if i != '']
    self.deny = [i for i in self.deny if i != '']

    # Always deny the universal patterns from the config.
    config = config_file.get_config()
    universal_deny = config.get('bblio', 'universal_deny').strip().split(";")
    universal_deny = [i for i in universal_deny if i != '']
    self.deny.extend(universal_deny)

    # Strip the "r'...'" wrapper that stored patterns may carry.
    for i, d in enumerate(self.follow):
        if d.startswith("r'") and d.endswith("'"):
            self.follow[i] = d[2:-1]
    for i, d in enumerate(self.parsing):
        if d.startswith("r'") and d.endswith("'"):
            self.parsing[i] = d[2:-1]
    for i, d in enumerate(self.deny):
        if d.startswith("r'") and d.endswith("'"):
            self.deny[i] = d[2:-1]

    if Document.objects.filter(site_id=self.id).count() > 0:
        self.url_list = Document.objects.filter(site_id=self.id).values_list('urlAddress')

    # One rule extracts and parses content pages, the other only follows links.
    self.rules = (
        Rule(SgmlLinkExtractor(
                allow=self.parsing,
                deny=self.deny,
                unique=True,
                restrict_xpaths=self._restrict_xpath,
                deny_extensions=self.ignored_extensions,
            ),
            callback='parse_item',
            follow=True),
        Rule(SgmlLinkExtractor(
                allow=self.follow,
                deny=self.deny,
                unique=True,
                restrict_xpaths=self._restrict_xpath,
            ),
            callback='follow_item',
            follow=True),
    )
    super(SpiderAll, self).__init__(*a, **kw)

def get_country_list():
    config = config_file.get_config()
    cl = config.get('bblio', 'country_list')
    return ((x.split("|")[0], x.split("|")[1]) for x in cl.split(";"))

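# Hedged sketch (hypothetical value, not real config): the 'country_list' option
# is assumed to be a semicolon-separated list of "code|label" pairs, which
# get_country_list() turns into (code, label) tuples suitable as Select choices.
country_list = "US|United States;FR|France"          # assumed example value
choices = [(x.split("|")[0], x.split("|")[1]) for x in country_list.split(";")]
# choices == [('US', 'United States'), ('FR', 'France')]
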
def getCrawlerIP():
    instance_ids = get_config().get('bblio', 'crawler_instance')
    return conn().get_all_instances(instance_ids=instance_ids)[0].instances[0].dns_name

def getESip():
    return getInstance(get_config().get('bblio', 'es_instance'), 'dns_name')

def getWebServerInstance():
    return getInstance(get_config().get('bblio', 'web_server_instance'))

def stopES():
    conn().stop_instances(instance_ids=get_config().get('bblio', 'es_instance'))
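
# Hedged sketch (assumed layout, placeholder values): the 'bblio' section read
# throughout these helpers would need at least the keys referenced above. This is
# only an illustration of the expected shape, not a real configuration file.
#
# [bblio]
# crawler_instance = i-0abc123;i-0def456
# crawler_instance_site_limit = 4
# es_instance = i-0es0000
# web_server_instance = i-0web000
# universal_deny = r'/login';r'/logout'
# owners = alice;bob
# grouping = works;WIP;error
# country_list = US|United States;FR|France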