def test_does_not_add_404_responses(self):
        """A URL that responds with 404 must leave the site map empty."""
        gen = SiteMapGenerator('http://www.example.com/404')
        gen.parser_class = TestParserClass

        gen.Process()

        # No internal pages should have been recorded for a missing page.
        self.assertFalse(gen.internal_pages)
    def test_records_redirected_to_pages(self):
        """A redirect records the destination URL rather than the requested one."""
        gen = SiteMapGenerator('http://www.example.com/handles_redirects')
        gen.parser_class = TestParserClass

        gen.Process()

        # The originally requested URL is dropped in favour of the redirect target.
        self.assertNotIn('http://www.example.com/handles_redirects', gen.internal_pages)
        self.assertIn('http://www.example.com/actual_page', gen.internal_pages)
    def test_finds_pages(self):
        """Processing records both the fetched page and any page it links to."""
        gen = SiteMapGenerator('http://www.example.com/gets_page')
        gen.parser_class = TestParserClass

        gen.Process()

        # Both the starting page and the discovered link appear in the map.
        for url in ('http://www.example.com/gets_page',
                    'http://www.example.com/found_page'):
            self.assertIn(url, gen.internal_pages)
# --- Example no. 4 ---
import json
import logging
import sys

from SiteCrawler.SiteMapGenerator import SiteMapGenerator

# Module-level logger named after this module, per the standard logging convention.
# NOTE(review): `logging` is not imported in this file's visible import block — confirm.
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    result_filename = sys.argv[1]

    if len(sys.argv) > 2:
        begin_url = sys.argv[2]
    else:
        begin_url = 'http://hiring-tests.s3-website-eu-west-1.amazonaws.com/2015_Developer_Scrape/5_products.html'

    generator = SiteMapGenerator(begin_url)

    generator.GetProducts()

    with open(result_filename, 'w') as result_file:
        result_file.write(
            json.dumps(
                {
                    'results':
                    list(generator.products),
                    'total':
                    sum([
                        float(product['unit_price'])
                        for product in generator.products
                    ])
                },