Example #1
import os

# PageGetter, PathManager and Parser are project-specific collaborators,
# assumed to be defined or imported elsewhere in the surrounding module.


class Scraper:
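    """Recursive site crawler: starting from start_link it walks category
    pages, caches every fetched page on disk and scrapes each product's
    tabs into HTML and CSV result files."""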
    def __init__(self, start_link):
        self.start_link = start_link
        self.page_getter = PageGetter()
        self.links = []
        self.product_tabs = [
            'all', 'characteristics', 'review', 'photo', 'comments'
        ]
        self.curr_path_manager = None

    def crawl(self):
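        """Entry point: start the recursive category crawl from start_link."""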
        self._crawl_by_categories([self.start_link])
        # self.curr_path_manager = PathManager(self.start_link, category=True)
        # self._caching_page()
        # links = Parser(self._read_cached_page()).scrap_top_category_links()
        # for link in links:
        #     self.scrap_product(link)

    def _crawl_by_categories(self, links, parent_dir=''):
        """Depth-first crawl: cache and parse each category page, recurse
        into its subcategories, then scrape the products it lists."""
        for link in links:
            self.curr_path_manager = PathManager(link, parent_dir=parent_dir)
            self._caching_page()
            new_links = Parser(self._read_cached_page()).parse_page()
            # Remember this category's directory before recursing: the
            # recursive calls overwrite self.curr_path_manager.
            category_dir = self.curr_path_manager.category_dir
            self._crawl_by_categories(new_links['categories'], category_dir)
            for product in new_links['products']:
                self.scrap_product(product, category_dir)

    def scrap_product(self, link, category_dir):
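        """Fetch and cache every tab of a product page, then save the
        parsed full page as HTML and its characteristics as CSV."""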
        for tab in self.product_tabs:
            tab_link = '{link}{tab}/'.format(link=link, tab=tab)
            self.curr_path_manager = PathManager(tab_link,
                                                 parent_dir=category_dir,
                                                 product_tab=tab)
            self._caching_page()
            result = Parser(self._read_cached_page()).scrap_product()
            self._save_res_file(result['all'])
            self._save_csv_file(result['characteristics'])

    def _caching_page(self):
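        """Download and cache the current page unless a non-empty cached
        copy already exists."""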
        if not self._is_page_cached():
            page_source = self.page_getter.get_remote_page(
                self.curr_path_manager.get_link())
            self._save_page(page_source)

    def _is_page_cached(self):
        cached_page_path = self.curr_path_manager.get_cached_path()
        return (os.path.isfile(cached_page_path)
                and os.stat(cached_page_path).st_size > 0)

    def _read_cached_page(self):
        # Use a context manager so the file handle is closed after reading.
        with open(self.curr_path_manager.get_cached_path(), 'r') as page:
            return page.read()

    def _save_page(self, page_source):
        with open(self.curr_path_manager.get_cached_path(), 'w') as page:
            page.write(page_source)

    def _save_csv_file(self, result):
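        # Naive CSV serialisation: assumes cells contain no commas,
        # quotes or newlines.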
        self._save_res_file('\n'.join([','.join(row) for row in result]),
                            'csv')

    def _save_res_file(self, res, type_='html'):
        if res:
            with open(self.curr_path_manager.get_result_path(type_),
                      'w') as result_file:
                result_file.write(res)

    def scrap_characteristics_tab(self):
        # Not implemented in this example.
        pass
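
A minimal usage sketch, assuming PageGetter, PathManager and Parser exist in the surrounding project; the start URL below is a placeholder, not a real target:

if __name__ == '__main__':
    # Crawl everything reachable from the root category page; pages are
    # cached on disk, so interrupted runs can resume without re-fetching.
    scraper = Scraper('https://example.com/catalog/')
    scraper.crawl()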