Example #1
0
class ParserLaucher(JsonSaveMixin, UrlProcessMixin, DatabaseSaver):
    """Run a ``Parser`` against ``BASE_URL`` and persist what it returns.

    Every ``fetch_*`` method asks the parser for data and then, depending on
    the ``save_to_json`` and ``save_to_db`` flags, hands the result to
    ``save_to_jsonfile`` and/or one of the ``save_*_to_database`` methods.
    Those helpers, as well as ``get_full_url`` and ``get_url_path``, are not
    defined in this class; presumably they come from the base classes.
    """
    BASE_URL = 'https://www.ozon.ru'

    def __init__(self,
                 result_save_directory,
                 save_to_json=False,
                 save_to_db=False,
                 *args,
                 **kwargs):
        """Remember the output options and create the parser.

        Args:
            result_save_directory: Value passed as the last argument to
                ``save_to_jsonfile`` whenever JSON output is enabled.
            save_to_json: When truthy, fetched data is passed to
                ``save_to_jsonfile``.
            save_to_db: When truthy, fetched data is passed to the matching
                ``save_*_to_database`` method.
            *args: Forwarded unchanged to the next ``__init__`` in the MRO.
            **kwargs: Forwarded unchanged to the next ``__init__`` in the MRO.
        """
        # Options are stored before delegating upwards, so they already exist
        # while the parent initialisers run.
        self.result_save_directory = result_save_directory
        self.save_to_json = save_to_json
        self.save_to_db = save_to_db
        super().__init__(*args, **kwargs)
        self.parser = Parser()

    @staticmethod
    def get_parent_categories_from_db():
        """Return all ``Category`` rows matching ``Category.is_parent()``."""
        parents_query = Category.query.filter(Category.is_parent())
        return parents_query.all()

    def fetch_parent_categories(self):
        """Parse the parent categories of ``BASE_URL`` and save them."""
        print('Fetching parent categories...', file=sys.stdout)

        parsed = self.parser.get_parent_categories(self.BASE_URL)
        # Only the first element of the parser result is used, and within it
        # only the entry stored under the base URL.
        categories = parsed[0][self.BASE_URL]

        # Optional JSON dump
        if self.save_to_json:
            print('Saving to .json file...', file=sys.stdout)
            self.save_to_jsonfile(
                'parent_categories',
                categories,
                self.result_save_directory,
            )

        # Optional database write
        if self.save_to_db:
            print('Saving to database...', file=sys.stdout)
            self.save_categories_to_database(categories)

    def fetch_subcategories(self):
        """Parse the subcategories of every stored parent category and save them.

        The parser is given one full URL per parent category. Each element it
        returns is treated as a mapping whose first key is a URL; the results
        are merged into a single dict keyed by ``get_url_path`` of that URL.
        """
        print('Fetching subcategories...', file=sys.stdout)

        parent_categories = self.get_parent_categories_from_db()

        # One absolute URL per parent category, in database order
        full_urls = []
        for parent in parent_categories:
            full_urls.append(self.get_full_url(self.BASE_URL, parent.url))

        subcategories = self.parser.get_subcategories(full_urls)

        merged_subcategories = {}
        for entry in subcategories:
            # Only the first key of each entry is taken into account
            source_url = list(entry)[0]
            children = entry[source_url]
            url_path = self.get_url_path(source_url)
            merged_subcategories[url_path] = children

        # Optional JSON dump
        if self.save_to_json:
            print('Saving to .json file...', file=sys.stdout)
            self.save_to_jsonfile(
                'subcategories',
                merged_subcategories,
                self.result_save_directory,
            )

        # Optional database write
        if self.save_to_db:
            print('Saving to database...', file=sys.stdout)
            self.save_subcategories_to_database(
                merged_subcategories,
                parent_categories,
            )

    def fetch_items(self):
        """Parse and save the items of every leaf category.

        For each stored parent category, the categories matching
        ``Category.has_no_children()`` whose ``path`` descends from the
        parent's ``path`` are queried. Items are parsed and saved one leaf
        category at a time.
        """
        print('Fetching items...', file=sys.stdout)

        parent_categories = self.get_parent_categories_from_db()

        for parent in parent_categories:
            # Leaves located below the current parent
            leaf_query = Category.query.filter(
                Category.has_no_children(),
                Category.path.descendant_of(parent.path),
            )
            leaf_categories = leaf_query.all()

            for leaf in leaf_categories:
                print('Parsing category:', leaf.name, file=sys.stdout)

                leaf_url = self.get_full_url(self.BASE_URL, leaf.url)
                items = self.parser.get_items(leaf_url)

                # Optional JSON dump, one file name per leaf slug
                if self.save_to_json:
                    print('Saving to .json file...', file=sys.stdout)
                    self.save_to_jsonfile(
                        'items_{}'.format(leaf.slug),
                        items,
                        self.result_save_directory,
                    )

                # Optional database write
                if self.save_to_db:
                    print('Saving to database...', file=sys.stdout)
                    self.save_items_to_database(items, leaf)