Example no. 1

The following Scrapper class recursively follows links from a start URL up to a maximum depth, saving each page's outgoing links through a Saver helper.
import sys

import requests

# Parser and Saver are project helpers assumed to be defined elsewhere:
# Parser.get_links() extracts the links from a page, Saver persists them.


class Scrapper:
    def __init__(self, start_url, savefile, max_depth=10, max_width=100):
        """
        max_depth: maximum recursion depth to follow for each link
        max_width: maximum number of dict keys, i.e. width of the tree
        """
        self.parser = Parser()
        self.start_url = start_url
        self.saver = Saver(savefile, max_width)
        self.max_depth = max_depth
        self.saver.starting_url(self.start_url)

    def start_scrapping(self, depth=0, start_url=None):
        # Stop once the maximum recursion depth has been reached.
        if depth >= self.max_depth:
            return
        if start_url is None:
            start_url = self.start_url
        nested_urls = self.get_urls(start_url)
        # A falsy reply from the saver means saving failed; abort the run.
        if not self.save_data(start_url, nested_urls):
            sys.exit()

        if nested_urls is None:
            return
        # Recurse into every link found on this page.
        for url in nested_urls:
            self.start_scrapping(depth + 1, url)

    def get_urls(self, url):
        # Fetch the page and extract its links; on a network error,
        # log it and return None so the caller stops descending.
        try:
            response = requests.get(url)
            return self.parser.get_links(response.content)
        except requests.exceptions.RequestException as exc:
            print(exc)
            return None

    def save_data(self, start_url, nested_urls):
        # The saver returns a falsy value when the save fails.
        return bool(self.saver.save(start_url, nested_urls))
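
For completeness, here is a minimal sketch of how the class might be exercised. The Parser and Saver below are hypothetical stand-ins, since the real implementations are not shown in the example: Parser collects href attributes using the standard-library HTMLParser, and Saver appends each page's outgoing links to a text file, capped at max_width.

# Hypothetical stand-ins for the helpers the example relies on.
from html.parser import HTMLParser


class Parser(HTMLParser):
    """Collects href attributes from <a> tags on a page."""

    def get_links(self, web_page):
        self.reset()          # clear parser state between pages
        self.links = []
        self.feed(web_page.decode("utf-8", errors="replace"))
        return self.links

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self.links.extend(v for k, v in attrs if k == "href" and v)


class Saver:
    """Appends each page's outgoing links to a file, at most max_width per page."""

    def __init__(self, savefile, max_width):
        self.savefile = savefile
        self.max_width = max_width

    def starting_url(self, url):
        pass  # could record the root URL in the save file

    def save(self, url, nested_urls):
        with open(self.savefile, "a") as f:
            for link in (nested_urls or [])[: self.max_width]:
                f.write(f"{url} -> {link}\n")
        return True  # truthy reply signals the save succeeded


scrapper = Scrapper("https://example.com", "links.txt", max_depth=3)
scrapper.start_scrapping()

Any pair of objects exposing get_links(), starting_url(), and save() with these shapes would slot into the class the same way.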