Example #1
    def testCrawlNoResponse(self):
        # No connection response; expect the crawler's custom 602 error code
        test_url = "https://test"
        test_json = dict()

        outer_links = []
        code = 602
        text = None
        date = None
        proper_response = make_response(test_url, outer_links, code, text,
                                        date)

        soup.crawl(test_url, test_json)
        self.assertDictEqual(test_json, proper_response)
        return
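Every test above builds its expected result with make_response, a helper whose definition is not shown. A minimal sketch of what it might look like, assuming the response dict uses the 'outbound-links' and 'plain-text' keys confirmed by the assertions in Example #4, plus assumed key names for the URL, status code, and recrawl date:

# Hypothetical sketch of the make_response helper used by every test here.
# Only 'outbound-links' and 'plain-text' are confirmed by later assertions;
# 'url', 'code', and 'recrawl-date' are assumed key names.
def make_response(url, outer_links, code, text, date):
    return {
        'url': url,
        'outbound-links': outer_links,
        'code': code,
        'plain-text': text,
        'recrawl-date': date,
    }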
Example #2
    def testCrawlGoodIrrelevantResponse(self):
        # Good response but not RPI-relevant; expect the custom 600 error code
        test_url = "https://onebananas.com/"
        test_json = dict()

        outer_links = []
        code = 600
        text = None
        date = None
        proper_response = make_response(test_url, outer_links, code, text,
                                        date)

        soup.crawl(test_url, test_json)
        self.assertDictEqual(test_json, proper_response)
        return
Example #3
    def testCrawlBadResponse(self):
        # Bad response; expect a standard 404 error
        test_url = "http://paper.com/users"
        test_json = dict()

        outer_links = []
        code = 404
        text = None
        date = None
        proper_response = make_response(test_url, outer_links, code, text,
                                        date)

        soup.crawl(test_url, test_json)
        self.assertDictEqual(test_json, proper_response)
        return
Example #4
    def testCrawlGoodRelevantResponse(self):
        # Good, RPI-relevant response; expect a 200 status with real content
        test_url = "https://science.rpi.edu/"
        test_json = dict()

        outer_links = []
        code = 200
        text = None
        date = soup.find_recrawl_date()
        proper_response = make_response(test_url, outer_links, code, text,
                                        date)

        soup.crawl(test_url, test_json)
        self.assertNotEqual(sorted(test_json['outbound-links']),
                            sorted(proper_response['outbound-links']))
        self.assertIsNotNone(test_json['plain-text'])
        return
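soup.find_recrawl_date() supplies the expected recrawl timestamp but is not shown either. A minimal sketch under the assumption that it simply schedules the next crawl a fixed offset from the current time; the seven-day interval and ISO-8601 format are illustrative guesses, not the project's actual policy:

# Hypothetical sketch of find_recrawl_date(); the interval and return format
# are assumptions.
from datetime import datetime, timedelta

def find_recrawl_date(days=7):
    # Schedule the next crawl a fixed number of days from the current UTC time.
    return (datetime.utcnow() + timedelta(days=days)).isoformat()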
Example #5
    def testCrawlRobots(self):

        # There is no robots.txt at the link below, so crawl_robots should return an empty list
        url = "http://blog.davidstea.com/robots.txt"
        disallow_list = soup.crawl_robots(url)
        self.assertEqual(disallow_list, [])

        # There is a valid robots.txt at the link below, which contains disallowed links.
        # Make sure the disallowed links are scraped and removed during the crawl process.
        url = "https://science.rpi.edu"
        disallow_list = soup.crawl_robots(url)
        self.assertNotEqual(disallow_list, [])
        server_response = dict()
        soup.crawl(url, server_response)
        for unallowed_link in disallow_list:
            for link in server_response['outbound-links']:
                self.assertNotIn(unallowed_link, link)

        # No robots.txt can be fetched for the scheme-less link below, so crawl_robots should return an empty list
        url = "www.google.com"
        disallow_list = soup.crawl_robots(url)
        self.assertEqual(disallow_list, [])

        return
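crawl_robots itself is not shown; the test only pins down its observable behavior: return the Disallow paths from a site's robots.txt, or an empty list when nothing can be fetched (including scheme-less URLs such as "www.google.com"). A minimal sketch under those assumptions; the real implementation may differ:

# Hypothetical sketch of crawl_robots(), reconstructed only from the behavior
# the test asserts.
import urllib.request
import urllib.error

def crawl_robots(url):
    # Accept either a site root or a direct robots.txt URL.
    robots_url = url if url.endswith('robots.txt') else url.rstrip('/') + '/robots.txt'
    try:
        with urllib.request.urlopen(robots_url, timeout=10) as resp:
            body = resp.read().decode('utf-8', errors='replace')
    except (urllib.error.URLError, ValueError):
        # Unreachable robots.txt, or a scheme-less URL: nothing is disallowed.
        return []
    disallowed = []
    for line in body.splitlines():
        if line.lower().startswith('disallow:'):
            path = line.split(':', 1)[1].strip()
            if path:
                disallowed.append(path)
    return disallowed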
Example #6
def scrape_link(link, json_object):
    # Call the crawl algorithm on the given link; crawl fills in json_object
    # and returns it, so pass the populated result back to the caller.
    json_object = crawl(link, json_object)
    return json_object
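A hedged usage sketch for scrape_link; the seed URL and the 'outbound-links' key are assumptions carried over from the earlier tests, not part of the original module:

# Illustrative driver for scrape_link.
if __name__ == '__main__':
    result = scrape_link("https://science.rpi.edu/", dict())
    print(result.get('outbound-links', []))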