def aggregate_batch(self, batch):
    """
    Given a particular batch, aggregate the stats from its children
    into the data model and return it
    """
    # find all the invalid sitescans and delete them
    # first, take out those with invalid domains
    requested_domains = [get_domain(s) for s in batch.sites]
    for sitescan in batch.sitescan_set.iterator():
        if not get_domain(sitescan.site_url) in requested_domains:
            sitescan.delete()

    # then take out those with no URLScans (a la mail.ru)
    for sitescan in batch.sitescan_set.iterator():
        if not sitescan.urlscan_set.count():
            sitescan.delete()

    sitescans = model.SiteScan.objects.filter(batch=batch).iterator()

    # Initialize counters
    total_rules = 0
    total_properties = 0
    total_pages_scanned = 0
    total_css_issues = 0
    total_ua_issues = 0

    # Aggregate data for each sitescan
    for sitescan in sitescans:
        sitescan_data = self.aggregate_sitescan(sitescan)
        total_rules += sitescan_data.num_rules
        total_properties += sitescan_data.num_properties
        total_pages_scanned += sitescan_data.scanned_pages
        total_css_issues += sitescan_data.css_issues
        total_ua_issues += 1 if sitescan_data.ua_issues else 0

    # Actually update the batchdata field
    data = model.BatchData.objects.create(
        batch=batch,
        num_rules=total_rules,
        num_properties=total_properties,
        scanned_pages=total_pages_scanned,
        css_issues=total_css_issues,
        ua_issues=total_ua_issues,
    )

    # Count and store regressions and fixes
    prev = DBUtils.get_previous_batch(batch)
    if prev and prev.data_aggregated:
        regressions, fixes = RegressionHunter.get_ua_diffs(prev, batch)
        data.ua_issues_regressed = len(regressions)
        data.ua_issues_fixed = len(fixes)

        regressions, fixes = RegressionHunter.get_css_diffs(prev, batch)
        data.css_issues_regressed = len(regressions)
        data.css_issues_fixed = len(fixes)

        data.save()

    # Mark the batch complete
    batch.data_aggregated = True
    batch.save()

    # return the freshly-created aggregate, as promised by the docstring
    return data
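# aggregate_batch() and the methods below compare sites by domain through
# get_domain(), which is defined elsewhere in the project and not shown in
# this section. The helper below is only a minimal sketch of that idea,
# assuming a bare, scheme-less hostname is enough for comparison and hashing;
# the real helper's normalisation may differ.
from urlparse import urlparse  # Python 2 import, matching the unicode()/u'' usage below


def get_domain_sketch(url):
    """Return the bare hostname of a URL, e.g. 'http://www.example.com/a' -> 'example.com'."""
    host = urlparse(url).netloc.split(':')[0]  # drop any explicit port
    return host[4:] if host.startswith('www.') else host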
def bad_sites(self):
    """ Returns a list of site urls that did not get scraped """
    sitelist = self.sites
    bad_sites = []
    for sitescan in self.sitescan_set.iterator():
        for site in sitelist:
            if get_domain(site) == get_domain(sitescan.site_url):
                # if this domain is ok, take it out
                sitelist.remove(site)
                break
        else:
            bad_sites.append(site)
    # now sitelist contains only "bad sites", since the good ones were
    # removed in the loop
    bad_sites.extend(sitelist)
    return bad_sites
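# bad_sites() above amounts to a set difference on domains between the
# requested site list and the sitescans that actually ran. A simplified,
# self-contained sketch of the same computation without the ORM, using the
# hypothetical get_domain_sketch() helper defined earlier in place of the
# project's get_domain():
def bad_sites_sketch(requested_urls, scanned_urls):
    """Return requested URLs whose domain never shows up among scanned_urls."""
    scanned_domains = set(get_domain_sketch(u) for u in scanned_urls)
    return [u for u in requested_urls
            if get_domain_sketch(u) not in scanned_domains]

# Example: bad_sites_sketch(['http://a.com', 'http://b.com'], ['http://www.a.com/x'])
# returns ['http://b.com'] -- b.com was requested but never scraped.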
def make_requests_from_url(self, url):
    """ Generates one request per user_agent """
    sitescan, _ = model.SiteScan.objects.get_or_create(
        batch=self.batch,
        site_url_hash=sha256(get_domain(url)).hexdigest(),
        defaults={'site_url': url})

    # Generate different UA requests for each UA
    for batch_user_agent in self.batch_user_agents:
        ua = batch_user_agent

        new_request = Request(url, dont_filter=True)
        new_request.headers.setdefault('User-Agent', ua.ua_string)
        new_request.meta['sitescan'] = sitescan
        new_request.meta['user_agent'] = ua

        self.log("Created request for {0} with ua {1}".format(
            url, ua.ua_string))

        yield new_request
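# The site_url_hash passed to get_or_create() above deduplicates sites by
# domain: every URL on the same domain hashes to the same SiteScan key, so
# only one row is created per site. A small illustration, assuming sha256 is
# hashlib.sha256 (as the .hexdigest() call suggests) and again using the
# hypothetical get_domain_sketch() helper:
from hashlib import sha256


def site_hash_sketch(url):
    """Hash a URL by its domain, mirroring the site_url_hash dedupe key above."""
    return sha256(get_domain_sketch(url)).hexdigest()

# site_hash_sketch('http://example.com/a') == site_hash_sketch('http://example.com/b')
# -> True, so both pages would attach to the same SiteScan row.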
def parse(self, response):
    """
    Function called by the scrapy downloader after a site url has been
    visited
    """
    content_type = self.get_content_type(response.headers)

    sitescan = response.meta.get('sitescan')
    if sitescan is None:
        # This sitescan needs to be created
        sitescan, ss_created = model.SiteScan.objects.get_or_create(
            batch=self.batch,
            site_url_hash=sha256(get_domain(response.url)).hexdigest(),
            defaults={'site_url': response.url})

        if not ss_created:
            # Duplicate URL in the text file, ignore this site
            return

    if response.meta.get('user_agent') is None:
        # Generate different UA requests for each UA
        for batch_user_agent in self.batch_user_agents:
            ua = batch_user_agent

            # Generate new request
            new_request = Request(response.url)
            new_request.headers.setdefault('User-Agent', ua.ua_string)
            new_request.meta['referrer'] = response.meta.get('referrer')
            new_request.meta['sitescan'] = sitescan
            new_request.meta['user_agent'] = ua
            new_request.meta['content_type'] = content_type

            yield new_request
    else:
        if 'text/html' not in self.get_content_type(response.headers):
            # For linked content, find the urlscan it linked from
            urlscan = model.URLScan.objects.get(
                site_scan=sitescan,
                page_url_hash=sha256(
                    response.meta['referrer']).hexdigest())
        else:
            # Only create urlscans for text/html
            urlscan, us_created = model.URLScan.objects.get_or_create(
                site_scan=sitescan,
                page_url_hash=sha256(response.url).hexdigest(),
                defaults={'page_url': response.url,
                          'timestamp': self.get_now_time()})

            # Continue crawling
            # Parse stylesheet links, scripts, and hyperlinks
            hxs = HtmlXPathSelector(response)

            # Extract other target links
            try:
                css_links = hxs.select('//link/@href').extract()
            except TypeError:
                css_links = []

            try:
                js_links = hxs.select('//script/@src').extract()
            except TypeError:
                js_links = []

            try:
                hyperlinks = hxs.select('//a/@href').extract()
            except TypeError:
                hyperlinks = []

            # Using a set removes duplicate links.
            all_links = set(hyperlinks + js_links + css_links)

            # Examine links, yield requests if they are valid
            for url in all_links:
                if not url.startswith('http://'):
                    # ensure that links are to real sites
                    if url.startswith('javascript:'):
                        continue
                    else:
                        url = urljoin(response.url, url)

                ua = response.meta['user_agent']
                request = Request(url)
                request.headers.setdefault('User-Agent', ua.ua_string)
                request.meta['referrer'] = response.url
                request.meta['sitescan'] = sitescan
                request.meta['user_agent'] = ua
                request.meta['content_type'] = None

                yield request

        # The response contains a user agent, we should yield an item
        item = MarkupItem()
        item['content_type'] = self.get_content_type(response.headers)
        item['filename'] = os.path.basename(urlparse(response.url).path)
        item['headers'] = unicode(response.headers)
        item['meta'] = response.meta
        item['raw_content'] = response.body
        item['sitescan'] = sitescan
        item['urlscan'] = urlscan
        item['url'] = response.url
        item['user_agent'] = response.meta.get('user_agent')
        item['redirected_from'] = response.meta.get('redirected_from', u'')

        yield item
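# MarkupItem is the Scrapy Item that parse() fills in before handing the
# response off to the pipeline. Its definition is not part of this section;
# the class below is only a sketch reconstructed from the fields assigned
# above, not the project's actual declaration.
from scrapy.item import Item, Field


class MarkupItemSketch(Item):
    content_type = Field()
    filename = Field()
    headers = Field()
    meta = Field()
    raw_content = Field()
    sitescan = Field()
    urlscan = Field()
    url = Field()
    user_agent = Field()
    redirected_from = Field()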