Example #1
    def test_expects_to_filter_by_audit_type(self):
        # Arrange
        domain = 'sub.domain.com'
        site = Site.from_domain_or_url(domain)
        page = Page(site)
        test_axe_report_path = helper.fixture_file_path(
            'httpbin-org-page-all-violations.json')

        with open(test_axe_report_path, "r") as f:
            data = json.loads(f.read())

        axe_errors = data["violations"]

        test_cases = [
            # audit_type   expected_violations_length
            ("design", 2),
            ("code", 3),
            (None, 5)
        ]

        for audit_type, expected_violations_length in test_cases:
            audit = AxePageAudit(page, audit_type)
            sorted_violations = []

            # Act
            for error in axe_errors:
                sorted_violations += Violation.s_from_audit_axe_error(
                    audit, error)

            # Assert
            self.assertEqual(expected_violations_length,
                             len(sorted_violations))
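Example 1 relies on Violation.s_from_audit_axe_error partitioning axe results into "design" and "code" buckets and returning only the violations that match the audit's type (or everything when the type is None). A minimal sketch of that filtering idea, assuming a hypothetical rule-to-category mapping and hypothetical attribute names on the audit object rather than the project's real implementation:

# Hypothetical rule-to-category mapping; the real project presumably keeps
# its own list of which axe rules count as "design" issues.
DESIGN_RULES = {'color-contrast', 'image-alt'}

def violations_from_axe_error(audit, axe_error):
    """Return the violations for one axe error, filtered by audit type."""
    category = 'design' if axe_error['id'] in DESIGN_RULES else 'code'
    if audit.audit_type is not None and audit.audit_type != category:
        return []  # skip violations outside the requested audit type
    # Violation's keyword arguments come from Examples 17 and 18;
    # audit.page and the 'impact' field are assumptions.
    return [Violation(page=audit.page,
                      source='axe',
                      identifier=axe_error['id'],
                      severity=axe_error.get('impact', 'unknown'))]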
Example #2
    def test_expects_violations_in_csv(self, webmock):
        # Arrange
        test_dir = pathjoin(AUDITS_DIR, "sub-domain-com")
        violations_csv_path = pathjoin(test_dir, "sub-domain-com.csv")
        webmock.get(requests_mock.ANY, text='ok')
        domain = 'sub.domain.com'
        site = Site(domain)
        page = Page(site)
        audit = AxeSiteAudit(site)
        source = 'test'
        identifier = 'test-error'
        severity = 'low'
        violation = Violation(
            page=page,
            source=source,
            identifier=identifier,
            severity=severity
        )
        violation.kind = "error"
        violation.help = "Error must be fixed"
        violation.help_url = "https://help.com"
        violation.html = "<p>Test</p>"
        violation.failure = "This is incorrect"

        # Act
        audit.write_to_violation_csv(violations_csv_path, [violation])
        with open(violations_csv_path, 'r') as file:
            csv_rows = list(csv.reader(file))
            row_count = len(csv_rows)

        # Assert
        self.assertEqual(row_count, 2)
        self.assertEqual(csv_rows[0][0], "page_url")
        self.assertEqual(csv_rows[1][8], violation.failure)
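Example 2 only pins down two things about the CSV: the header row starts with "page_url" and the violation's failure text lands in the ninth column. A minimal sketch of such a writer, assuming hypothetical column names for everything the test does not assert on:

import csv

# Column order is an assumption except for "page_url" first and "failure"
# ninth, which are the only positions Example 2 checks.
CSV_FIELDS = ['page_url', 'source', 'identifier', 'severity', 'kind',
              'help', 'help_url', 'html', 'failure']

def write_violation_csv(csv_path, violations):
    """Write one header row plus one row per violation."""
    with open(csv_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(CSV_FIELDS)
        for v in violations:
            writer.writerow([v.page.url, v.source, v.identifier, v.severity,
                             v.kind, v.help, v.help_url, v.html, v.failure])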
Example #3
    def test_expect_successful_page_audit(self, webmock):
        # Arrange
        domain = 'httpbin.org'
        test_axe_report_path = helper.fixture_file_path('httpbin-org-page-all-violations.json')
        webmock.get(requests_mock.ANY, text='ok')
        site = Site.from_domain_or_url(domain)
        page = Page(site)
        audit_type = None

        # Assume
        self.assertIsNone(page.audit)
        self.assertPathExists(test_axe_report_path)

        # Act
        # Mock the AxePageAudit generate_report method to return our test fixture
        # file path when page.axe_audit is called.
        with patch.object(AxePageAudit, 'generate_report') as mocked_method:
            mocked_method.return_value = test_axe_report_path
            page.axe_audit(audit_type)

        # Assert
        self.assertIsNotNone(page.audit)
        self.assertEqual(page.site, site)
        self.assertIn(domain, page.url)
        self.assertEqual(5, len(page.audit.violations))
        self.assertEqual(5, len(page.audit.errors))
        self.assertEqual(0, len(page.audit.warnings))
Example #4
    def load_page_data(self, page_file):
        page_item_hash = {}
        page_item_list = []
        count = 0

        # Read the tab-delimited page file, indexing each Page by urlid and
        # accumulating the per-field sums.
        with open(page_file, "r") as con:
            data = csv.DictReader(con, delimiter="\t")
            for row in data:
                page_item = Page(row)
                page_item_hash[page_item.urlid] = page_item
                page_item_list.append(page_item.urlid)
                if page_item.alchemy_category_score:
                    self.cat_score_sum += page_item.alchemy_category_score
                if page_item.avglinksize:
                    self.avglinksize_sum += page_item.avglinksize
                if page_item.commonLinkRatio_1:
                    self.commonlinkratio_1_sum += page_item.commonLinkRatio_1
                if page_item.commonLinkRatio_2:
                    self.commonlinkratio_2_sum += page_item.commonLinkRatio_2
                if page_item.commonLinkRatio_3:
                    self.commonlinkratio_3_sum += page_item.commonLinkRatio_3
                if page_item.commonLinkRatio_4:
                    self.commonlinkratio_4_sum += page_item.commonLinkRatio_4
                count += 1

        self.page_item_list = page_item_list
        self.page_item_hash = page_item_hash
        self.count = count
Example #5
def crawl(page, visited, pool):
    """Crawl url, build site's map and list assets"""
    logging.info('Crawling {}'.format(page.url))
    visited.add(page)
    if soft:
        time.sleep(random.random())

    try:
        links = page.extract_internal_links()
    except eventlet.Timeout:
        page.retries_left -= 1
        if page.retries_left > 0:
            pool.spawn_n(crawl, page, visited, pool)
        else:
            logging.warning('Couldn\'t fetch {} after {} retries.'.format(
                page.url, Page.MAX_RETRIES))
        return

    for link in links:
        new_page = Page(link)
        if new_page not in visited:
            pool.spawn_n(crawl, new_page, visited, pool)
        else:
            # Url already crawled
            pass

    page.print_assets()
Example #6
 def add_page(self):
     last_page = Page.query.filter_by(book_id=self.book.id).order_by(
         sqlalchemy.desc(Page.id)).first()
     if last_page is not None:
         self.page = Page(page_num=self.page_num,
                          book_id=self.book.id,
                          location_in_book=self.location_in_book,
                          page_image=self.page_image,
                          page_order=last_page.page_order + 10)
     else:
         self.page = Page(page_num=self.page_num,
                          book_id=self.book.id,
                          location_in_book=self.location_in_book,
                          page_image=self.page_image,
                          page_order=10)
     db.session.add(self.page)
     db.session.commit()
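The page_order gap of 10 used above presumably leaves room to slot a page between two existing ones later by giving it any order value inside the gap, without renumbering every page in the book.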
Example #7
    def audit(self):
        AxeAudit.validate_type(self.audit_type)
        urls = self.extract_site_page_urls_from_sitemap()

        for url in urls:
            page = Page(self, url)
            page.axe_audit(self.audit_type)
            self.pages.append(page)

        return AxeAudit.from_site(self)
Example #8
 def post(self):
     '''This method creates a new page related to a user'''
     params = self.reqparse.parse_args()
     if Commons.isValidId(params['categoryId']):
         Page(title=params['title'],
              url=params['url'],
              categoryId=params['categoryId'],
              userId=auth.user['id']).save()
         return make_response(jsonify({'data': 'Page created'}), 201)
     return make_response(jsonify({'error': 'Invalid categoryId'}), 500)
Example #9
    def test_expects_page_instance(self):
        # Arrange
        domain = 'sub.domain.com'
        site = Site.from_domain_or_url(domain)

        # Act
        page = Page(site)

        # Assert
        self.assertIsInstance(page, Page)
        self.assertEqual(page.site, site)
        self.assertIn(domain, page.url)
Example #10
    def test_expects_new_axe_page_audit(self):
        # Arrange
        url = 'https://sub.domain.com'
        site = Site(url)
        page = Page(site)

        # Act
        page_audit = AxePageAudit(page)

        # Assert
        self.assertIsInstance(page_audit, AxePageAudit)
        self.assertEqual(url, page_audit.url)
        self.assertListEqual([], page_audit.violations)
Example #11
    def test_expects_templates(self):
        # Arrange
        url = 'https://sub.domain.com/path/subpath/subsubpath/index.html'
        site = Site.from_domain_or_url(url)
        page = Page(site)
        expected_templates = [
            'path/subpath/subsubpath/index.html',
            'path/subpath/subsubpath',
            'path/subpath',
            'path'
        ]

        # Act
        templates = page.templates

        # Assert
        self.assertEqual(expected_templates, templates)
Example #12
def get_page(url) -> Optional[Page]:
    try:
        log.info(f"Started fetching..")
        req = requests.get(url, headers=get_randomized_headers(), timeout=15)

        # Make the received data searchable
        soup = BeautifulSoup(req.text, features="html.parser")
        log.info(f"Souped {req.url} ({req.status_code})")
        title = soup.title.string if soup.title and soup.title.string else "Error: Title not found"
        log.info(f"Title: {title}")
        page = Page(req.url, req.status_code, title, soup)
        return page

    # If the submitted page couldn't be fetched or parsed, log it and return None
    except (ConnectionError, Timeout, AttributeError, Exception):
        log.debug(traceback.format_exc())
        log.warning(f"Unable to fetch {url}")
        return None
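Example 12 (and the nearly identical fetcher in Example 16 below) builds a Page from the final URL, the HTTP status code, the page title, and the parsed soup, which suggests this particular Page is little more than a small container. A minimal sketch of that container; the field names are assumptions, only their order matches the calls in the examples:

from dataclasses import dataclass
from typing import Any

# A minimal sketch of the container the get_page() helpers construct.
@dataclass
class Page:
    url: str            # final URL after redirects (req.url)
    status_code: int    # HTTP status of the response
    title: str          # contents of <title>, or an error placeholder
    soup: Any           # parsed BeautifulSoup document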
Example #13
    def test_expects_subtemplate(self):
        # Arrange
        test_cases = [
            # url, expected_subtemplate
            ('https://sub.domain.com/path/subpath/subsubpath/index.html', 'path/subpath'),
            ('https://sub.domain.com/path/subpath/', 'path/subpath'),
            ('https://sub.domain.com/path/subpath', 'path/subpath'),
            ('https://sub.domain.com/path/', None),
            ('https://sub.domain.com/', None)
        ]

        # Act / Assert
        for url, expected_subtemplate in test_cases:
            # Act
            site = Site.from_domain_or_url(url)
            page = Page(site)

            # Assert
            self.assertEqual(expected_subtemplate, page.subtemplate, url)
Example #14
    def run(self, max_depth):
        depth = 0
        while len(self.pending) > 0:
            page = self.pending.popleft()

            # get links for the current page
            page_data = self.scraper.run(page)

            page['title'] = page_data['title']
            page['links'] = page_data['links']

            # persist to db
            instance = Page(title=page['title'],
                            url=page['link'],
                            search_term=self.search_term,
                            links=page['links'],
                            created_at=datetime.now(),
                            updated_at=datetime.now())
            if page['parent'] is not None:
                instance.parent = page['parent']
            instance.save()

            if (self.curr_parent is None
                    or page['parent'] is not self.curr_parent):
                parent_id = self.curr_parent

                if parent_id is None:
                    parent_id = instance.id

                if page['parent'] is not self.curr_parent:
                    depth += 1
                    parent_id = instance.id
                    self.curr_parent = page['parent']

            logger.info('crawl level: %d, max depth: %d, queue size: %d',
                        depth, max_depth, len(self.pending))
            logger.info('crawling page id %s, url: %s', str(instance.id),
                        page['link'])

            # queue up the links for another level of crawling
            if depth < max_depth:
                self.enqueue(page_data['links'], parent_id)
Example #15
    def test_expects_paths(self):
        # Arrange
        test_cases = [
            # url, expected_path
            ('https://sub.domain.com', ''),
            ('https://sub.domain.com/', ''),
            ('https://sub.domain.com/path', 'path'),
            ('https://sub.domain.com/path/', 'path'),
            ('https://sub.domain.com/path/subpath/', 'path/subpath'),
        ]

        # Act / Assert
        for url, expected_path in test_cases:
            # Act
            site = Site.from_domain_or_url(url)
            page = Page(site)

            # Assert
            self.assertEqual(expected_path, page.path, url)
            self.assertEqual(url, page.url)
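Taken together, Examples 11, 13, and 15 pin down how this project's path, templates, and subtemplate properties relate: path is the URL path with slashes trimmed, templates is the full path followed by each ancestor directory, and subtemplate is the first two path segments when there are at least two. A minimal sketch of those derivations, assuming plain urlparse on the page URL rather than whatever the project actually does:

from urllib.parse import urlparse

def page_path(url):
    """URL path with leading/trailing slashes stripped (Example 15)."""
    return urlparse(url).path.strip('/')

def page_templates(url):
    """The full path followed by each ancestor directory (Example 11)."""
    segments = [s for s in page_path(url).split('/') if s]
    return ['/'.join(segments[:n]) for n in range(len(segments), 0, -1)]

def page_subtemplate(url):
    """First two path segments, or None when fewer than two (Example 13)."""
    segments = [s for s in page_path(url).split('/') if s]
    return '/'.join(segments[:2]) if len(segments) >= 2 else None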
Example #16
def get_page(url):
    try:
        # Fetch amp page
        log.info(f"Started fetching..")
        req = requests.get(url, headers=random_headers())
        current_url = req.url

        # Make the received data searchable
        soup = BeautifulSoup(req.text, features="html.parser")
        title = soup.title.string if soup.title else "Error: Title not found"
        log.info(f"Made a soup of {current_url} ({req.status_code})")
        log.info(f"Page title = {title}")
        page = Page(current_url, req.status_code, title, soup)
        return page

    # If the submitted page couldn't be fetched or parsed, log it and return None
    except (ConnectionError, Exception, AttributeError):
        log.warning(traceback.format_exc())
        log.warning("the page could not be fetched")
        return None
Example #17
    def test_expects_violation_string_to_have_correct_information(self):
        # Arrange
        domain = 'sub.domain.com'
        site = Site.from_domain_or_url(domain)
        page = Page(site)
        source = 'test'
        identifier = 'test-error'
        severity = 'low'
        expected_output = 'test reported a low test-error error on http://sub.domain.com'

        # Act
        violation = Violation(page=page,
                              source=source,
                              identifier=identifier,
                              severity=severity)

        # Equivalent to invoking the __str__ magic method on Violation
        violation_str = str(violation)

        # Assert
        self.assertEqual(expected_output, violation_str)
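The expected string in Example 17 fixes the shape of Violation's __str__ output: source, severity, identifier, then the page URL. A minimal sketch of a __str__ that would satisfy it, using only the attributes the constructor in Example 18 stores; everything else about the class is omitted:

# A minimal sketch; only __str__ is shown.
def __str__(self):
    return '{source} reported a {severity} {identifier} error on {url}'.format(
        source=self.source,
        severity=self.severity,
        identifier=self.identifier,
        url=self.page.url)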
Example #18
    def test_expects_violation_instance(self):
        # Arrange
        domain = 'sub.domain.com'
        site = Site.from_domain_or_url(domain)
        page = Page(site)
        source = 'test'
        identifier = 'test-error'
        severity = 'low'

        # Act
        violation = Violation(page=page,
                              source=source,
                              identifier=identifier,
                              severity=severity)

        # Assert
        self.assertIsInstance(violation, Violation)
        self.assertEqual(page, violation.page)
        self.assertEqual(source, violation.source)
        self.assertEqual(identifier, violation.identifier)
        self.assertEqual(severity, violation.severity)
Example #19
 def get_state(self):
     url = self.driver.current_url
     page = Page.objects(url=url).first()
     if page is None:
         self.driver.get(url)
         default_state = state_builder.get_current_state(self.driver)
         default_state.name = self.page
         default_state.save()
         page = Page(url=url,
                     default_state=default_state,
                     states=[default_state])
         page.name = self.page
         page.save()
     for state in page.states:
         if state.name == self.page:
             print "Found state %s" % state.name
             return state
     print "State not found, creating new state"
     new_state = state_builder.get_current_state(self.driver)
     new_state.save()
     new_state.name = self.page
     page.states.append(new_state)
     page.save()
     return new_state
Example #20
 def test_page(self):
     default_state = State.objects().first()
     states = State.objects[:5]
     page = Page(url="http://www.google.com/", default_state=default_state, states=states)
     page.save()
     assert len(page.states) > 0
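Examples 19 and 20 query and construct Page through a MongoEngine-style document API (Page.objects(url=url).first(), keyword construction, .save()). A minimal sketch of a document definition consistent with the fields those examples touch; the field types are assumptions, and only the field names url, name, default_state, and states actually appear in the examples:

from mongoengine import Document, StringField, ReferenceField, ListField

class Page(Document):
    url = StringField(required=True, unique=True)
    name = StringField()
    default_state = ReferenceField('State')
    states = ListField(ReferenceField('State'))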
Example #21
if __name__ == '__main__':
    libfm_predictions_file_name = sys.argv[1]
    output_file_name = sys.argv[1] + ".submit"
    test_file_name = "../data/test.tsv"

    f_write = open(output_file_name, "w")
    line = ''
    print "Starting to read libFM output file from:" + libfm_predictions_file_name
    f_libfm_predicts = open(libfm_predictions_file_name)

    f_test = open(test_file_name, "r")
    test_data = csv.DictReader(f_test, delimiter="\t")

    count = 0
    f_write.write("urlid,label" + "\n")
    libfm_lines = f_libfm_predicts.readlines()

    for row in test_data:
        item = Page(row)
        out_str = str(item.urlid) + "," + get_label(libfm_lines[count])
        print(out_str)
        f_write.write(out_str + "\n")
        count += 1

    f_libfm_predicts.close()
    print "...done :" + output_file_name

    f_write.close()
Example #22
def test_pages_set():
    pages = set()
    pages.add(Page('www.test.com'))
    pages.add(Page('www.test.com'))
    assert len(pages) == 1
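Example 22 (and the `new_page not in visited` check in Example 5) only deduplicates if this Page defines equality and hashing in terms of its URL. A minimal sketch of that contract, assuming the URL alone identifies a page; the real class presumably also normalizes the URL before comparing:

class Page:
    def __init__(self, url):
        self.url = url

    def __eq__(self, other):
        # Two pages are the same page if they point at the same URL.
        return isinstance(other, Page) and self.url == other.url

    def __hash__(self):
        return hash(self.url)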
Example #23
        new_page = Page(link)
        if new_page not in visited:
            pool.spawn_n(crawl, new_page, visited, pool)
        else:
            # Url already crawled
            pass

    page.print_assets()


if __name__ == '__main__':

    if soft:
        logging.info('Soft mode enabled.')
    logging.info('Using pool of {} threads.'.format(max_threads))

    root_page = Page(url)
    if not root_page.has_valid_url:
        logging.error('Url {} is not valid'.format(root_page.url))
        exit(1)

    visited = set()
    pool = eventlet.GreenPool(size=max_threads)
    crawl(root_page, visited, pool)
    pool.waitall()

    print('\nSitemap for {}:'.format(root_page.url))
    for page in sorted(list(visited)):
        if page.path:
            print('\t' + page.path)