def test_expects_to_filter_by_audit_type(self):
    # Arrange
    domain = 'sub.domain.com'
    site = Site.from_domain_or_url(domain)
    page = Page(site)

    test_axe_report_path = helper.fixture_file_path(
        'httpbin-org-page-all-violations.json')
    with open(test_axe_report_path, "r") as f:
        data = json.loads(f.read())
    axe_errors = data["violations"]

    test_cases = [
        # audit_type, expected_violations_length
        ("design", 2),
        ("code", 3),
        (None, 5)
    ]

    for audit_type, expected_violations_length in test_cases:
        audit = AxePageAudit(page, audit_type)
        sorted_violations = []

        # Act
        for error in axe_errors:
            sorted_violations += Violation.s_from_audit_axe_error(audit, error)

        # Assert
        self.assertEqual(expected_violations_length, len(sorted_violations))
def test_expects_violations_in_csv(self, webmock):
    # Arrange
    test_dir = pathjoin(AUDITS_DIR, "sub-domain-com")
    violations_csv_path = pathjoin(test_dir, "sub-domain-com.csv")

    webmock.get(requests_mock.ANY, text='ok')
    domain = 'sub.domain.com'
    site = Site(domain)
    page = Page(site)
    audit = AxeSiteAudit(site)

    source = 'test'
    identifier = 'test-error'
    severity = 'low'
    violation = Violation(
        page=page,
        source=source,
        identifier=identifier,
        severity=severity
    )
    violation.kind = "error"
    violation.help = "Error must be fixed"
    violation.help_url = "https://help.com"
    violation.html = "<p>Test</p>"
    violation.failure = "This is incorrect"

    # Act
    audit.write_to_violation_csv(violations_csv_path, [violation])
    with open(violations_csv_path, 'r') as file:
        csv_rows = list(csv.reader(file))
    row_count = len(csv_rows)

    # Assert
    self.assertEqual(row_count, 2)
    self.assertEqual(csv_rows[0][0], "page_url")
    self.assertEqual(csv_rows[1][8], violation.failure)
def test_expect_successful_page_audit(self, webmock):
    # Arrange
    domain = 'httpbin.org'
    test_axe_report_path = helper.fixture_file_path(
        'httpbin-org-page-all-violations.json')
    webmock.get(requests_mock.ANY, text='ok')
    site = Site.from_domain_or_url(domain)
    page = Page(site)
    audit_type = None

    # Assume
    self.assertIsNone(page.audit)
    self.assertPathExists(test_axe_report_path)

    # Act
    # Mock the AxeAudit generate_report method to return our test fixture file
    # path when page.axe_audit is called.
    with patch.object(AxePageAudit, 'generate_report') as mocked_method:
        mocked_method.return_value = test_axe_report_path
        page.axe_audit(audit_type)

    # Assert
    self.assertIsNotNone(page.audit)
    self.assertEqual(page.site, site)
    self.assertIn(domain, page.url)
    self.assertEqual(5, len(page.audit.violations))
    self.assertEqual(5, len(page.audit.errors))
    self.assertEqual(0, len(page.audit.warnings))
def load_page_data(self, page_file):
    page_item_hash = {}
    page_item_list = []
    count = 0
    # Use a context manager so the TSV file is closed once loading finishes
    with open(page_file, "r") as con:
        data = csv.DictReader(con, delimiter="\t")
        for row in data:
            page_item = Page(row)
            page_item_hash[page_item.urlid] = page_item
            page_item_list.append(page_item.urlid)
            if page_item.alchemy_category_score:
                self.cat_score_sum += page_item.alchemy_category_score
            if page_item.avglinksize:
                self.avglinksize_sum += page_item.avglinksize
            if page_item.commonLinkRatio_1:
                self.commonlinkratio_1_sum += page_item.commonLinkRatio_1
            if page_item.commonLinkRatio_2:
                self.commonlinkratio_2_sum += page_item.commonLinkRatio_2
            if page_item.commonLinkRatio_3:
                self.commonlinkratio_3_sum += page_item.commonLinkRatio_3
            if page_item.commonLinkRatio_4:
                self.commonlinkratio_4_sum += page_item.commonLinkRatio_4
            # for key in row.keys():
            #     if page_item.compression_ratio:
            #         self.add_val_to_hash(self.field_value_sum, compression_ratio, page_item.compression_ratio)
            count += 1
    self.page_item_list = page_item_list
    self.page_item_hash = page_item_hash
    self.count = count
def crawl(page, visited, pool):
    """Crawl url, build site's map and list assets"""
    logging.info('Crawling {}'.format(page.url))
    visited.add(page)
    if soft:
        time.sleep(random.random())
    try:
        links = page.extract_internal_links()
    except eventlet.Timeout:
        page.retries_left -= 1
        if page.retries_left > 0:
            pool.spawn_n(crawl, page, visited, pool)
        else:
            logging.warning('Couldn\'t fetch {} after {} retries.'.format(
                page.url, Page.MAX_RETRIES))
        return
    for link in links:
        new_page = Page(link)
        if new_page not in visited:
            pool.spawn_n(crawl, new_page, visited, pool)
        else:
            # Url already crawled
            pass
    page.print_assets()
def add_page(self):
    last_page = Page.query.filter_by(book_id=self.book.id).order_by(
        sqlalchemy.desc(Page.id)).first()
    # page_order is spaced by 10, presumably to leave room for reordering
    page_order = last_page.page_order + 10 if last_page is not None else 10
    self.page = Page(page_num=self.page_num,
                     book_id=self.book.id,
                     location_in_book=self.location_in_book,
                     page_image=self.page_image,
                     page_order=page_order)
    db.session.add(self.page)
    db.session.commit()
def audit(self):
    AxeAudit.validate_type(self.audit_type)
    urls = self.extract_site_page_urls_from_sitemap()
    for url in urls:
        page = Page(self, url)
        page.axe_audit(self.audit_type)
        self.pages.append(page)
    return AxeAudit.from_site(self)
def post(self):
    '''This method creates a new page related to a user'''
    params = self.reqparse.parse_args()
    if Commons.isValidId(params['categoryId']):
        Page(title=params['title'],
             url=params['url'],
             categoryId=params['categoryId'],
             userId=auth.user['id']).save()
        return make_response(jsonify({'data': 'Page created'}), 201)
    return make_response(jsonify({'error': 'Invalid categoryId'}), 500)
def test_expects_page_instance(self):
    # Arrange
    domain = 'sub.domain.com'
    site = Site.from_domain_or_url(domain)

    # Act
    page = Page(site)

    # Assert
    self.assertIsInstance(page, Page)
    self.assertEqual(page.site, site)
    self.assertIn(domain, page.url)
def test_expects_new_axe_page_audit(self):
    # Arrange
    url = 'https://sub.domain.com'
    site = Site(url)
    page = Page(site)

    # Act
    page_audit = AxePageAudit(page)

    # Assert
    self.assertIsInstance(page_audit, AxePageAudit)
    self.assertEqual(url, page_audit.url)
    self.assertListEqual([], page_audit.violations)
def test_expects_templates(self):
    # Arrange
    url = 'https://sub.domain.com/path/subpath/subsubpath/index.html'
    site = Site.from_domain_or_url(url)
    page = Page(site)
    expected_templates = [
        'path/subpath/subsubpath/index.html',
        'path/subpath/subsubpath',
        'path/subpath',
        'path'
    ]

    # Act
    templates = page.templates

    # Assert
    self.assertEqual(expected_templates, templates)
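# The test above fixes the expected ordering: the full URL path first, then each
# ancestor directory. A minimal sketch of how such a list could be derived from
# the URL (hypothetical helper; not necessarily how Page.templates is actually
# implemented):
from urllib.parse import urlparse

def candidate_templates(url):
    """Return the URL path plus each ancestor directory, longest first."""
    path = urlparse(url).path.strip('/')
    if not path:
        return []
    parts = path.split('/')
    # 'a/b/c/index.html' -> ['a/b/c/index.html', 'a/b/c', 'a/b', 'a']
    return ['/'.join(parts[:i]) for i in range(len(parts), 0, -1)]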
def get_page(url) -> Optional[Page]:
    try:
        log.info("Started fetching..")
        req = requests.get(url, headers=get_randomized_headers(), timeout=15)

        # Make the received data searchable
        soup = BeautifulSoup(req.text, features="html.parser")
        log.info(f"Souped {req.url} ({req.status_code})")

        title = soup.title.string if soup.title and soup.title.string else "Error: Title not found"
        log.info(f"Title: {title}")

        page = Page(req.url, req.status_code, title, soup)
        return page

    # If the page couldn't be fetched, log the failure and return None
    except (ConnectionError, Timeout, AttributeError, Exception):
        log.debug(traceback.format_exc())
        log.warning(f"Unable to fetch {url}")
        return None
def test_expects_subtemplate(self):
    # Arrange
    test_cases = [
        # url, expected_subtemplate
        ('https://sub.domain.com/path/subpath/subsubpath/index.html', 'path/subpath'),
        ('https://sub.domain.com/path/subpath/', 'path/subpath'),
        ('https://sub.domain.com/path/subpath', 'path/subpath'),
        ('https://sub.domain.com/path/', None),
        ('https://sub.domain.com/', None)
    ]

    # Act / Assert
    for url, expected_subtemplate in test_cases:
        # Act
        site = Site.from_domain_or_url(url)
        page = Page(site)

        # Assert
        self.assertEqual(expected_subtemplate, page.subtemplate, url)
def run(self, max_depth):
    depth = 0
    while len(self.pending) > 0:
        page = self.pending.popleft()

        # get links for the current page
        page_data = self.scraper.run(page)
        page['title'] = page_data['title']
        page['links'] = page_data['links']

        # persist to db
        instance = Page(title=page['title'],
                        url=page['link'],
                        search_term=self.search_term,
                        links=page['links'],
                        created_at=datetime.now(),
                        updated_at=datetime.now())
        if page['parent'] is not None:
            instance.parent = page['parent']
        instance.save()

        if self.curr_parent is None or page['parent'] is not self.curr_parent:
            parent_id = self.curr_parent
            if parent_id is None:
                parent_id = instance.id
            if page['parent'] is not self.curr_parent:
                depth += 1
                parent_id = instance.id
            self.curr_parent = page['parent']

        logger.info('crawl level: %d, max depth: %d, queue size: %d',
                    depth, max_depth, len(self.pending))
        logger.info('crawling page id %s, url: %s', str(instance.id), page['link'])

        # queue up the links for another level of crawling
        if depth < max_depth:
            self.enqueue(page_data['links'], parent_id)
def test_expects_paths(self):
    # Arrange
    test_cases = [
        # url, expected_path
        ('https://sub.domain.com', ''),
        ('https://sub.domain.com/', ''),
        ('https://sub.domain.com/path', 'path'),
        ('https://sub.domain.com/path/', 'path'),
        ('https://sub.domain.com/path/subpath/', 'path/subpath'),
    ]

    # Act / Assert
    for url, expected_path in test_cases:
        # Act
        site = Site.from_domain_or_url(url)
        page = Page(site)

        # Assert
        self.assertEqual(expected_path, page.path, url)
        self.assertEqual(url, page.url)
def get_page(url):
    try:
        # Fetch AMP page
        log.info("Started fetching..")
        req = requests.get(url, headers=random_headers())
        current_url = req.url

        # Make the received data searchable
        soup = BeautifulSoup(req.text, features="html.parser")
        title = soup.title.string if soup.title else "Error: Title not found"
        log.info(f"Made a soup of {current_url} ({req.status_code})")
        log.info(f"Page title = {title}")

        page = Page(current_url, req.status_code, title, soup)
        return page

    # If the page couldn't be fetched, log the failure and return None
    except (ConnectionError, Exception, AttributeError):
        log.warning(traceback.format_exc())
        log.warning("The page could not be fetched")
        return None
def test_expects_violation_string_to_have_correct_information(self):
    # Arrange
    domain = 'sub.domain.com'
    site = Site.from_domain_or_url(domain)
    page = Page(site)
    source = 'test'
    identifier = 'test-error'
    severity = 'low'
    expected_output = 'test reported a low test-error error on http://sub.domain.com'

    # Act
    violation = Violation(page=page,
                          source=source,
                          identifier=identifier,
                          severity=severity)
    # Same as calling the __str__ magic method on Violation
    violation_str = str(violation)

    # Assert
    self.assertEqual(expected_output, violation_str)
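# Judging from expected_output above, Violation.__str__ presumably interpolates
# the source, severity, identifier, and page URL. A minimal sketch that would
# satisfy this particular assertion (assumed format, not confirmed against the
# real Violation class):
def __str__(self):
    # Assumed format: "<source> reported a <severity> <identifier> error on <page url>"
    return '{} reported a {} {} error on {}'.format(
        self.source, self.severity, self.identifier, self.page.url)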
def test_expects_violation_instance(self):
    # Arrange
    domain = 'sub.domain.com'
    site = Site.from_domain_or_url(domain)
    page = Page(site)
    source = 'test'
    identifier = 'test-error'
    severity = 'low'

    # Act
    violation = Violation(page=page,
                          source=source,
                          identifier=identifier,
                          severity=severity)

    # Assert
    self.assertIsInstance(violation, Violation)
    self.assertEqual(page, violation.page)
    self.assertEqual(source, violation.source)
    self.assertEqual(identifier, violation.identifier)
    self.assertEqual(severity, violation.severity)
def get_state(self):
    url = self.driver.current_url
    page = Page.objects(url=url).first()
    if page is None:
        self.driver.get(url)
        default_state = state_builder.get_current_state(self.driver)
        default_state.name = self.page
        default_state.save()
        page = Page(url=url, default_state=default_state, states=[default_state])
        page.name = self.page
        page.save()
    for state in page.states:
        if state.name == self.page:
            print "Found state %s" % state.name
            return state
    print "State not found, creating new state"
    new_state = state_builder.get_current_state(self.driver)
    new_state.save()
    new_state.name = self.page
    page.states.append(new_state)
    page.save()
    return new_state
def test_page(self):
    default_state = State.objects().first()
    states = State.objects[:5]
    page = Page(url="http://www.google.com/",
                default_state=default_state,
                states=states)
    page.save()
    assert len(page.states) > 0
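# Both get_state and test_page above treat Page as a MongoEngine document keyed
# by url, carrying a default_state and a list of states. A minimal sketch of
# such a document (field names inferred from the snippets, types assumed):
from mongoengine import Document, StringField, ReferenceField, ListField

class Page(Document):
    """Assumed schema; the real model may include more fields and validation."""
    url = StringField(required=True, unique=True)
    name = StringField()
    default_state = ReferenceField('State')
    states = ListField(ReferenceField('State'))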
if __name__ == '__main__':
    libfm_predictions_file_name = sys.argv[1]
    output_file_name = sys.argv[1] + ".submit"
    test_file_name = "../data/test.tsv"
    f_write = open(output_file_name, "w")
    print "Starting to read libFM output file from:" + libfm_predictions_file_name
    f_libfm_predicts = open(libfm_predictions_file_name)
    f_test = open(test_file_name, "r")
    test_data = csv.DictReader(f_test, delimiter="\t")
    count = 0
    f_write.write("urlid,label" + "\n")
    libfm_lines = f_libfm_predicts.readlines()
    for row in test_data:
        item = Page(row)
        out_str = str(item.urlid) + "," + get_label(libfm_lines[count])
        print out_str
        f_write.write(out_str + "\n")
        count += 1
    f_libfm_predicts.close()
    f_test.close()
    print "...done :" + output_file_name
    f_write.close()
def test_pages_set():
    pages = set()
    pages.add(Page('www.test.com'))
    pages.add(Page('www.test.com'))
    assert len(pages) == 1
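# For the set to collapse both instances, Page needs value-based equality and
# hashing on its URL. A minimal sketch of the idea (assumed implementation;
# the crawler's Page class may also normalize URLs before comparing):
class Page:
    def __init__(self, url):
        self.url = url

    def __eq__(self, other):
        return isinstance(other, Page) and self.url == other.url

    def __hash__(self):
        # Hash on the same field used for equality so set membership agrees
        return hash(self.url)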
if __name__ == '__main__':
    if soft:
        logging.info('Soft mode enabled.')
    logging.info('Using pool of {} threads.'.format(max_threads))

    root_page = Page(url)
    if not root_page.has_valid_url:
        logging.error('Url {} is not valid'.format(root_page.url))
        exit(1)

    visited = set()
    pool = eventlet.GreenPool(size=max_threads)
    crawl(root_page, visited, pool)
    pool.waitall()

    print '\n', 'Sitemap for {} :'.format(root_page.url)
    for page in sorted(list(visited)):
        if page.path:
            print '\t', page.path