def test_does_not_add_404_responses(self):
    """A URL that responds with 404 must not be recorded in the site map."""
    gen = SiteMapGenerator('http://www.example.com/404')
    gen.parser_class = TestParserClass
    gen.Process()
    # No pages at all should have been collected for a dead start URL.
    self.assertFalse(gen.internal_pages)
def test_records_redirected_to_pages(self):
    """When the start URL redirects, only the redirect target is recorded."""
    gen = SiteMapGenerator('http://www.example.com/handles_redirects')
    gen.parser_class = TestParserClass
    gen.Process()
    pages = gen.internal_pages
    # The original (redirecting) URL must be absent; the destination present.
    self.assertNotIn('http://www.example.com/handles_redirects', pages)
    self.assertIn('http://www.example.com/actual_page', pages)
def test_finds_pages(self):
    """Crawling a page records both that page and the pages it links to."""
    gen = SiteMapGenerator('http://www.example.com/gets_page')
    gen.parser_class = TestParserClass
    gen.Process()
    for expected in (
        'http://www.example.com/gets_page',
        'http://www.example.com/found_page',
    ):
        self.assertIn(expected, gen.internal_pages)
import sys from SiteCrawler.SiteMapGenerator import SiteMapGenerator logger = logging.getLogger(__name__) if __name__ == '__main__': logging.basicConfig(level=logging.INFO) result_filename = sys.argv[1] if len(sys.argv) > 2: begin_url = sys.argv[2] else: begin_url = 'http://hiring-tests.s3-website-eu-west-1.amazonaws.com/2015_Developer_Scrape/5_products.html' generator = SiteMapGenerator(begin_url) generator.GetProducts() with open(result_filename, 'w') as result_file: result_file.write( json.dumps( { 'results': list(generator.products), 'total': sum([ float(product['unit_price']) for product in generator.products ]) },