Example No. 1
def run_deny_params(site_id):
    denys = Site.objects.get(pk=site_id).deny_parameters.split(';')
    # Strip a leading r'...' raw-string wrapper from each stored pattern.
    for i, d in enumerate(denys):
        if d.startswith("r'") and d.endswith("'"):
            denys[i] = d[2:-1]

    denys = [i for i in denys if i != '']
    universal_deny = config_file.get_config().get('bblio','universal_deny').split(';')
    universal_deny = [i for i in universal_deny if i != '']
    denys.extend(universal_deny)
    
    host_regex = None
    source_allowed_domains = Site.objects.get(pk=site_id).source_allowed_domains
    
    if not source_allowed_domains:
        host_regex = re.compile('')
    else:
        regex = r'^(.*\.)?(%s)$' % '|'.join(re.escape(d) for d in source_allowed_domains.split(";"))
        host_regex = re.compile(regex)

    # Mark documents as unused (isUsed = 6) when their URL matches a deny
    # pattern or their hostname falls outside the allowed domains.
    for doc in Document.objects.filter(site_id=site_id):
        if any(re.search(d, doc.urlAddress) for d in denys if d is not None):
            doc.isUsed = 6
            doc.save()
        hostname = urlparse(doc.urlAddress).hostname or ''
        if not host_regex.search(hostname):
            doc.isUsed = 6
            doc.save()
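A minimal sketch of the pattern format this function appears to assume: deny_parameters holds a semicolon-separated string whose entries may be written as raw-string literals (the value below is made up for illustration):

# Hypothetical deny_parameters value and the patterns the loop above would extract.
raw = "r'/login';r'/logout$';"
patterns = [p[2:-1] if p.startswith("r'") and p.endswith("'") else p
            for p in raw.split(';') if p != '']
# patterns == ['/login', '/logout$']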
Example No. 2
def main():
    os.system("touch /tmp/debug")
    config = get_config()
    # Create the Agent, pass all the configuration to it, then run it.
    agent_class = globals()[config.agent.name]
    agent = agent_class(config)
    agent.run(mode='train')
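The globals() lookup above resolves the agent class by the name stored in the config; a minimal self-contained sketch of that pattern, using a stand-in config object and agent class (both made up here):

# Hypothetical stand-ins for the config object and agent class used above.
class DummyAgent(object):
    def __init__(self, config):
        self.config = config
    def run(self, mode='train'):
        print('running in %s mode' % mode)

class DummyConfig(object):
    class agent(object):
        name = 'DummyAgent'

config = DummyConfig()
agent_class = globals()[config.agent.name]   # same lookup as in the example
agent_class(config).run(mode='train')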
Example No. 3
def sites(request):
    sites = Site.objects.all()
    site_list = []
    es = ESController()

    # Per-site document counts come from Elasticsearch; fall back to an
    # empty mapping if the index is unreachable.
    try:
        site_doc_count = es.get_document_count_by_site()
    except Exception:
        site_doc_count = {}

    owners = config_file.get_config().get('bblio','owners').split(';')
    scoreboard = {}
    for o in owners:
        if o != '':
            scoreboard.update({o: {'doc':0, 'site':0}})
    for site in sites:
        s = model_to_dict(site)
        s.update({'doc_count': Document.objects.filter(site_id=site.id).count()})
        s.update({'zero_count': Document.objects.filter(site_id=site.id).filter(isUsed=0).count()})
        try:
            doc = site_doc_count[site.id]
            s.update({'index_count': doc})
            if site.owner:
                scoreboard[site.owner]['site'] += 1
                scoreboard[site.owner]['doc'] += doc
        except KeyError:
            s.update({'index_count': 0})

        site_list.append(s)
    # Without ES counts, fall back to a per-owner aggregate of unindexed documents.
    if not site_doc_count:
        d = Document.objects.filter(isUsed=0).values('site__owner').annotate(zero_count=Count('site__owner'))
        scoreboard = d

    context = {'sites':site_list, 'score':scoreboard}
    return render(request, 'operations/sites.html',context)
Example No. 4
def index(request):
    config = config_file.get_config()
    
    # Persist any posted settings back to the config file.
    if request.POST:
        for r in request.POST:
            config.set('bblio', r, str(request.POST[r]))
        config_file.set_config(config)
    
    # Make sure the 'bblio' section exists before reading field defaults.
    try:
        config.items('bblio')
    except Exception:
        config.add_section('bblio')
    
    form = ConfigForm()
    # Pre-populate each field from config, creating missing keys as blanks.
    for f in form.fields:
        try:
            form.fields[f].initial = config.get('bblio', str(f))
        except Exception:
            config.set('bblio', f, '')
    config_file.set_config(config)    

    context = {'form' : form }
    
    return render(request, 'operations/admin.html', context)
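The view above assumes config_file wraps a ConfigParser-style object (get/set/items/add_section); a minimal sketch of that round trip using the standard-library ConfigParser, with an illustrative key and value:

# Standard-library ConfigParser behaving the way the view above relies on.
try:
    from configparser import ConfigParser    # Python 3
except ImportError:
    from ConfigParser import ConfigParser    # Python 2, matching this code's era

config = ConfigParser()
config.add_section('bblio')
config.set('bblio', 'owners', 'alice;bob')   # illustrative value
assert config.get('bblio', 'owners') == 'alice;bob'
assert dict(config.items('bblio'))['owners'] == 'alice;bob'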
Example No. 5
def crawl(request, site_id):
    import time
    site = Site.objects.get(pk=site_id)
    if not site.instance:
        return HttpResponse("No working instance selected")
     
    if scraper.scrapeController.get_jobs_for_site(site_id) == 'Running':
        return HttpResponse("Crawler is running")
    # Jobs already queued on the worker; default to 0 if the status call fails.
    try:
        c_dict = scraper.scrapeController.get_job_status_count_for_instance(site.instance)
        count = int(c_dict['running']) + int(c_dict['pending'])
    except Exception:
        count = 0
    if count < int(config_file.get_config().get('bblio','crawler_instance_site_limit')):
        ret = scraper.scrapeController.curl_schedule_crawl(site_id, site.instance)
        if 'jobid' in ret:
            site.jobid = ret['jobid']
            site.save()
        else:
            return HttpResponse("Error when scheduling job")
    else:
        return HttpResponse("Worker instance is full. Please try another instance")
    
    site.running = 2
    site.save()
    time.sleep(3)
    return HttpResponseRedirect(reverse('site', 
        kwargs={ 'site_id' : site_id}))
Example No. 6
File: ec2.py Project: yuanzai/bblio
def getCrawlerInstances():
    crawler_list = []
    for c in [getInstance(i) for i in get_config().get('bblio','crawler_instance').split(';')]:
        if c.ip_address:
            crawler_list.append(c)

    return crawler_list 
Example No. 7
    def retrieve_params_from_each_file(self, list):
        """Iterate through the list and retrieve params for each file."""
        # List that contains all params related to LEAVES and TREES' values.
        list_of_files_param = []
        for i in list:
            # Append each file's params.
            list_of_files_param.append(
                get_config(self.map_of_files[int(i[0])], self.entry))
        return list_of_files_param
Example No. 8
def main():
    os.system("touch /tmp/debug")
    config = get_config()
    # Create the Agent and pass all the configuration to it.
    agent_class = globals()[config.agent.name]
    agent = agent_class(config)
    agent.test_one_file((fn_path, None))  # fn_path is presumably defined elsewhere in the original module
    note_fn_list = agent.data_source.test_loader.dataset.note_fn_list
    audio_fn_list = agent.data_source.test_loader.dataset.audio_fn_list
    for pair in zip(audio_fn_list, note_fn_list):
        acc = agent.test_one_file(pair)
        print(pair, acc)
Example No. 9
def getCrawlerInstances():
    crawler_list = []
    for c in [getInstance(i) for i in get_config().get('bblio','crawler_instance').split(';')]:
        if c is None:
            continue
        try:
            if c.ip_address:
                crawler_list.append(c)
        except AttributeError:
            pass
       

    return crawler_list 
Example No. 10
    def __init__(self, *args, **kwargs):
        super(SiteForm, self).__init__(*args, **kwargs)
        
        running_limit = config_file.get_config().get('bblio','crawler_instance_site_limit')
    
        instance_list = []
        grouping_list = [('works','works'),('works dirty','works dirty'),('WIP','WIP'),('error','error'),('condemned','condemned'),('start','start'), ('test','test')]
        try:
            for i in getCrawlerInstances():
                # Label each instance with its pending + running jobs against the per-instance limit.
                counts = get_job_status_count_for_instance(i.id)
                count = int(counts['pending']) + int(counts['running'])
                instance_list.append({'name': i.id, 'choice_name': i.id + ' ' + str(count) + '/' + str(running_limit)})
        except Exception:
            pass

        instance_list.append({'name':'','choice_name':''})
        instance_choices = ((i['name'],i['choice_name']) for i in instance_list)
        self.fields['instance'].widget = Select(attrs={'class': 'form-control input-sm'}, choices=instance_choices)
        self.fields['jurisdiction'].widget = Select(attrs={'class': 'form-control input-sm'}, choices=get_country_list())
        self.fields['owner'].widget = Select(attrs={'class': 'form-control input-sm'}, choices=[(o, o) for o in config_file.get_config().get('bblio','owners').split(';')])
        self.fields['grouping'].widget = Select(attrs={'class': 'form-control input-sm'}, choices=[(g,g) for g in config_file.get_config().get('bblio','grouping').split(';')])
Example No. 11
File: ec2.py Project: yuanzai/bblio
def getCrawlerInstance():
    return conn().get_all_instances(instance_ids=get_config().get('bblio','crawler_instance').split(';')[0])[0].instances[0]
Example No. 12
    def __init__(self, *a, **kw):
        self.follow = []
        self.parsing = []
        self.deny = []
        site = None
        if 'id' in kw:
            self.id = kw['id']

            site = Site.objects.get(pk=self.id)
            self.start_urls = site.source_start_urls.split(';')
            self.allowed_domains = site.source_allowed_domains.split(';')
            if site.parse_parameters:
                self.parsing = site.parse_parameters.strip().encode('utf-8').split(";")
            if site.follow_parameters:
                self.follow = site.follow_parameters.strip().encode('utf-8').split(";")
            if site.deny_parameters:
                self.deny = site.deny_parameters.strip().encode('utf-8').split(";")
        else:
            self.allowed_domains = kw['source_allowed_domains'].split(';')
            self.start_urls = kw['source_start_urls'].split(';')
            if kw['parse_parameters']:
                self.parsing = kw['parse_parameters'].strip().encode('utf-8').split(';')
            if kw['follow_parameters']:
                self.follow = kw['follow_parameters'].strip().encode('utf-8').split(';')
            if kw['deny_parameters']:
                self.deny = kw['deny_parameters'].strip().encode('utf-8').split(';')
        self.parsing = [i for i in self.parsing if i !='']
        self.follow = [i for i in self.follow if i !='']
        self.deny = [i for i in self.deny if i !='']
        
        config = config_file.get_config()
        universal_deny = config.get('bblio','universal_deny').strip().split(";")
        universal_deny = [i for i in universal_deny if i != '']
        self.deny.extend(universal_deny)
        # Strip a leading r'...' raw-string wrapper from each stored pattern.
        for patterns in (self.follow, self.parsing, self.deny):
            for i, d in enumerate(patterns):
                if d.startswith("r'") and d.endswith("'"):
                    patterns[i] = d[2:-1]
        if Document.objects.filter(site_id=self.id).count() > 0:
            self.url_list = Document.objects.filter(site_id=self.id).values_list('urlAddress')

        self.rules = (
                Rule(SgmlLinkExtractor(
                    allow=self.parsing,
                    deny=self.deny,
                    unique=True,
                    restrict_xpaths=self._restrict_xpath,
                    deny_extensions=self.ignored_extensions,
                    ), 
                    callback='parse_item', follow=True),
                Rule(SgmlLinkExtractor(
                    allow=self.follow,
                    deny=self.deny,
                    unique=True,
                    restrict_xpaths=self._restrict_xpath,
                    ), 
                    callback='follow_item', follow=True),
                )
        super(SpiderAll, self).__init__(*a, **kw) 
Example No. 13
def get_country_list():
    config = config_file.get_config()
    cl = config.get('bblio','country_list')
    return ((x.split("|")[0], x.split("|")[1]) for x in cl.split(";"))
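get_country_list appears to expect country_list to be a semicolon-separated list of 'code|name' pairs; a minimal sketch with a made-up config value:

# Hypothetical country_list value; the generator yields (code, name) tuples
# ready to be used as Select widget choices (see Example No. 10).
cl = "SG|Singapore;US|United States"
choices = ((x.split("|")[0], x.split("|")[1]) for x in cl.split(";"))
assert list(choices) == [('SG', 'Singapore'), ('US', 'United States')]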
Example No. 14
File: ec2.py Project: yuanzai/bblio
def getCrawlerIP():
    return conn().get_all_instances(instance_ids=get_config().get('bblio','crawler_instance'))[0].instances[0].dns_name
Example No. 15
File: ec2.py Project: yuanzai/bblio
def getESip():
    return getInstance(get_config().get('bblio','es_instance'),'dns_name')
Example No. 16
File: ec2.py Project: yuanzai/bblio
def getWebServerInstance():
    return getInstance(get_config().get('bblio','web_server_instance'))
Example No. 17
File: ec2.py Project: yuanzai/bblio
def stopES():
    conn().stop_instances(instance_ids=get_config().get('bblio','es_instance'))