def check(request):
    """
    Check if the passed parameter named 'url' already exists in the DB.
    If not, create the domain; otherwise display the existing result
    for the url.
    """
    url = request.POST.get('url', request.COOKIES.get('url', None))
    if not url:
        logger.debug('check received a request without a url parameter.')
        messages.error(request, 'URL is missing!')
        return redirect('start')
    domain_name = get_domain_from_url(url)
    obj = Domains.objects.filter(domain__icontains=domain_name).first()
    if obj:
        msg = 'Domain: {} exists in DB.'.format(obj.domain)
        messages.info(request, msg)
        return redirect('display', domain=obj.domain)
    obj = Domains.objects.create(domain=domain_name, url=url)
    msg = 'Domain: {} was created.'.format(obj.domain)
    messages.success(request, msg)
    return redirect('display', domain=obj.domain)

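# `get_domain_from_url` is used throughout but not defined in this module.
# A minimal sketch of what it is assumed to do (strip scheme, port and a
# leading 'www.'), for illustration only:
from urllib.parse import urlparse

def get_domain_from_url(url):
    # urlparse only populates netloc when a scheme is present.
    if '://' not in url:
        url = 'http://' + url
    netloc = urlparse(url).netloc.split(':')[0].lower()
    return netloc[4:] if netloc.startswith('www.') else netloc
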
def add_domain_to_network(request, network_name):
    url = request.POST.get('url', None)
    if not url:
        logger.debug('add_domain_to_network received a request '
                     'without a url parameter.')
        messages.error(request, 'URL is missing!')
        return redirect('start')
    # Bail out before scheduling a crawl if the network does not exist.
    nw = Network.objects.filter(name=network_name).first()
    if not nw:
        msg = 'Network: {} not found!'.format(network_name)
        messages.warning(request, msg)
        return redirect('start')
    domain_name = get_domain_from_url(url)
    obj = Domains.objects.filter(domain__icontains=domain_name).first()
    if not obj:
        obj = Domains.objects.create(domain=domain_name, url=url)
    localhost = 'http://localhost:6800'
    scrapyd = ScrapydAPI(localhost)
    job_id = scrapyd.schedule('default', 'externalspider',
                              started_by_domain=obj.domain,
                              keywords=[])
    ExternalSpider.objects.create(domain=obj, job_id=job_id)
    obj.status = 'external_started'
    obj.save()
    nw.domains.add(obj)
    return redirect('network', network_name=nw.name)

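# Hypothetical follow-up, not part of the view above: python-scrapyd-api
# can poll the scheduled job, e.g. to update `obj.status` once the crawl
# has finished. A sketch, assuming the same local scrapyd instance:
def _external_spider_finished(job_id):
    scrapyd = ScrapydAPI('http://localhost:6800')
    # job_status returns '', 'pending', 'running' or 'finished'.
    return scrapyd.job_status('default', job_id) == 'finished'
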
def _is_suspicious(self):
    domain = Domains.objects.get(domain=self.domain)
    my_domain = get_domain_from_url(self.url)
    if my_domain in domain.to_info_scan:
        return True
    # Cache the related info object instead of querying it twice.
    info = self._info()
    if info and info.is_suspicious:
        return True
    return False

def _recommend_name(self, item):
    domain = get_domain_from_url(item['url'])
    words = re.split(r'\W+', domain)
    common_top_level_domain = [
        'com', 'org', 'de', 'eu', 'net', 'fr',
    ]
    # Filter instead of calling words.remove() while iterating over
    # words, which skips elements.
    words = [w for w in words if w not in common_top_level_domain]
    match = {}
    tags = [
        'title', 'meta_og_title', 'meta_title', 'name', 'alternative_name'
    ]
    for tag in tags:
        if tag not in item or not item[tag]:
            continue
        compare = item[tag].split(' ')
        for w in words:
            for c in compare:
                ratio = similar(w, c.lower())
                if ratio > 0.7:
                    match[tag] = {
                        'ratio': ratio,
                        'w_domain': w,
                        'w_tag': c
                    }
    tmp = 0.0
    recommend = ''
    for key, values in match.items():
        if float(values['ratio']) > tmp:
            tmp = values['ratio']
            recommend = key
        elif values['ratio'] == tmp and tmp != 0.0:
            # On a tie, prefer the tag with the shorter value.
            recommend = key if len(item[key]) < len(item[recommend]) \
                else recommend
    return recommend

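# `similar` is assumed to be a SequenceMatcher-based string similarity
# helper (a common pattern for this kind of ratio comparison); a minimal
# sketch:
from difflib import SequenceMatcher

def similar(a, b):
    # Returns a ratio in [0.0, 1.0]; 1.0 means identical strings.
    return SequenceMatcher(None, a, b).ratio()
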
def errback_urls(self, failure):
    # https://stackoverflow.com/questions/13724730/how-to-get-the-scrapy-failure-urls
    if failure.check(DNSLookupError):
        # DNS lookup for the original host failed; retry the bare domain
        # with a 'www.' prefix.
        request = failure.request
        url = 'http://www.' + get_domain_from_url(request.url)
        return scrapy.Request(url,
                              callback=self.parse_urls,
                              meta={'dont_retry': True, 'domain': url})
    if failure.check(HttpError):
        # HttpError comes from the HttpError spider middleware and
        # carries the non-200 response.
        response = failure.value.response
        self.logger.info('HttpError on %s', response.url)

def parse_urls(self, response):
    item = {}
    # get title
    item['title'] = response.xpath('//title/text()').get()
    item['meta_title'] = response.xpath(
        "//meta[@name='title']/@content").get()
    item['meta_og_title'] = response.xpath(
        "//meta[@property='og:title']/@content").get()
    # clean titles from boilerplate such as 'Startseite', 'Home' etc.
    for k, v in item.items():
        if not v:
            continue
        item[k] = self._clean_title(v)
    # request.meta is the public accessor; request._meta is private API.
    req_url = 'http://' + response.request.meta['download_slot']
    item['domain'] = get_domain_from_url(req_url)
    item['url'] = response.url
    # get description
    item['meta_description'] = response.xpath(
        "//meta[@name='description']/@content").get()
    item['meta_og_description'] = response.xpath(
        "//meta[@property='og:description']/@content").get()
    # get keywords
    item['meta_keywords'] = response.xpath(
        "//meta[@name='keywords']/@content").get()
    item['meta_og_keywords'] = response.xpath(
        "//meta[@property='og:keywords']/@content").get()
    # follow the imprint page to find a potential zip code and company name
    imprint_keywords = [
        'impressum', 'imprint', 'legalnotices', 'privacy', 'policy',
        'legaldisclosure', 'corporate-info', 'terms-of-service',
        'contact', 'kontakt', 'about'
    ]
    # The inline (?i) flag must lead the pattern on Python 3.11+.
    imprint_links = LinkExtractor(
        allow=('(?i)/(' + '|'.join(imprint_keywords) + ')'),
        unique=True).extract_links(response)
    urls_with_imprint_keyword = [link.url for link in imprint_links]
    if urls_with_imprint_keyword:
        # Pick the link matching the highest-priority keyword.
        item['imprint'] = urls_with_imprint_keyword[0]
        for key in imprint_keywords:
            hit = next((url for url in urls_with_imprint_keyword
                        if key in url.lower()), None)
            if hit:
                item['imprint'] = hit
                break
        request = scrapy.Request(item['imprint'],
                                 callback=self.parse_impressum,
                                 dont_filter=True,
                                 errback=self.errback_urls)
        request.meta['item'] = item
        return request
    domain = item['domain']
    item['tip'] = self._recommend_name(item)
    self.domains[domain] = clean_dict(item)
    return self.domains[domain]

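# `clean_dict` is not defined here; it is assumed to drop empty values
# before the item is stored. A minimal sketch of that helper:
def clean_dict(d):
    # Keep only keys whose values are truthy (non-None, non-empty).
    return {k: v for k, v in d.items() if v}
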
def add(request):
    logger.debug('received post: {}'.format(request.POST))
    url = request.POST.get('url', request.COOKIES.get('url', None))
    name = request.POST.get('name', None)
    keywords = request.POST.get('keywords', None)
    if not url or not name:
        messages.error(request, 'Name/URL of network is missing! Retry.')
        return redirect('start')
    # get or create the domain related to the url
    domain_name = get_domain_from_url(url)
    obj_domain = Domains.objects.filter(
        domain__icontains=domain_name).first()
    if obj_domain:
        logger.info('Domain: {} exists in DB.'.format(obj_domain.domain))
    else:
        obj_domain = Domains.objects.create(domain=domain_name, url=url)
        msg = 'Domain: {} was created.'.format(obj_domain.domain)
        messages.success(request, msg)
    # No spider is started here on purpose: a network is primarily for
    # displaying DB results in a graph, not for starting specific scans.
    # Users are advised to visit the given actor to trigger a scan.
    # get or create the related network
    obj_network = Network.objects.filter(name=name).first()
    if obj_network:
        logger.info('Network: {} exists in DB.'.format(obj_network))
    else:
        obj_network = Network.objects.create(name=name, keywords=keywords)
        msg = 'Network: {} successfully created.'.format(obj_network)
        messages.success(request, msg)
    # For extra fields on the m2m relation (e.g. a 'related' flag via
    # through_defaults) see:
    # https://docs.djangoproject.com/en/dev/topics/db/models/#extra-fields-on-many-to-many-relationships
    obj_network.domains.add(obj_domain)
    return redirect('network', network_name=obj_network.name)

def _status(self):
    domain = get_domain_from_url(self.url)
    obj = Domains.objects.filter(domain=domain).first()
    return obj.status if obj else 'not created yet'

def _is_being_crawled(self):
    domain = get_domain_from_url(self.url)
    obj = Domains.objects.filter(domain=domain).first()
    return obj.is_being_crawled if obj else False

def _fullscan(self):
    domain = get_domain_from_url(self.url)
    obj = Domains.objects.filter(domain=domain).first()
    return obj.fullscan if obj else False

def _info(self):
    domain = get_domain_from_url(self.url)
    obj = Domains.objects.filter(domain=domain).first()
    return obj.info if obj else None

def _get_domain(self):
    return get_domain_from_url(self.url)