def testCrawlNoResponse(self):
    # NO CONNECTION RESPONSE, 602 error
    test_url = "https://test"
    test_json = dict()
    outer_links = []
    code = 602
    text = None
    date = None
    proper_response = make_response(test_url, outer_links, code, text, date)
    soup.crawl(test_url, test_json)
    self.assertDictEqual(test_json, proper_response)
    return
def testCrawlGoodIrrelevantResponse(self):
    # GOOD RESPONSE, NO RPI RELEVANCE, 600 error
    test_url = "https://onebananas.com/"
    test_json = dict()
    outer_links = []
    code = 600
    text = None
    date = None
    proper_response = make_response(test_url, outer_links, code, text, date)
    soup.crawl(test_url, test_json)
    self.assertDictEqual(test_json, proper_response)
    return
def testCrawlBadResponse(self):
    # BAD RESPONSE, 404 error
    test_url = "http://paper.com/users"
    test_json = dict()
    outer_links = []
    code = 404
    text = None
    date = None
    proper_response = make_response(test_url, outer_links, code, text, date)
    soup.crawl(test_url, test_json)
    self.assertDictEqual(test_json, proper_response)
    return
def testCrawlGoodRelevantResponse(self):
    # GOOD RESPONSE, RPI-RELEVANT PAGE, 200 status
    test_url = "https://science.rpi.edu/"
    test_json = dict()
    outer_links = []
    code = 200
    text = None
    date = soup.find_recrawl_date()
    proper_response = make_response(test_url, outer_links, code, text, date)
    soup.crawl(test_url, test_json)
    # A successful crawl should populate outbound links and plain text,
    # so the result should differ from the empty response template.
    self.assertNotEqual(sorted(test_json['outbound-links']), sorted(proper_response['outbound-links']))
    self.assertIsNotNone(test_json['plain-text'])
    return
def testCrawlRobots(self):
    # There is no robots.txt at the link below; crawl_robots should return an empty list
    url = "http://blog.davidstea.com/robots.txt"
    disallow_list = soup.crawl_robots(url)
    self.assertEqual(disallow_list, [])

    # There is a valid robots.txt at the link below, which contains disallowed links.
    # Make sure that the disallowed links are scraped and removed during the crawl process
    url = "https://science.rpi.edu"
    disallow_list = soup.crawl_robots(url)
    self.assertNotEqual(disallow_list, [])
    server_response = dict()
    soup.crawl(url, server_response)
    for unallowed_link in disallow_list:
        for link in server_response['outbound-links']:
            self.assertNotIn(unallowed_link, link)

    # There is no robots.txt at the link below; crawl_robots should return an empty list
    url = "www.google.com"
    disallow_list = soup.crawl_robots(url)
    self.assertEqual(disallow_list, [])
    return
def scrape_link(link, json_object):
    # call the crawl algorithm on the given link
    json_object = crawl(link, json_object)
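
# The tests above build their expected result with make_response, which is defined
# elsewhere in the project. A minimal sketch of what such a helper might look like is
# shown below, assuming the response dict carries the 'outbound-links' and 'plain-text'
# keys the tests read, plus hypothetical 'url', 'response-code', and 'recrawl-date'
# fields; the exact key names and structure are assumptions, not the project's
# confirmed format.
def make_response_sketch(url, outer_links, code, text, date):
    # Package one crawl result as a dict keyed by field name.
    return {
        'url': url,
        'outbound-links': outer_links,
        'response-code': code,
        'plain-text': text,
        'recrawl-date': date,
    }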