Beispiel #1
0
    def wrapped(self, **kwargs):
        """Run ``method`` and queue the child category links as ``Url`` objects.

        Expects a ``url_obj`` keyword argument. Its ``key`` is split on
        ``'@@'`` and one nested ``ul`` step is added to the XPath per segment.
        Each matching anchor becomes a ``BEST_SELL_CATEGORY`` ``Url`` keyed
        ``<parent key>@@<link text>``; the whole list is handed to
        ``self.output`` in a single ``['add', urls]`` message.
        """
        assert 'url_obj' in kwargs

        method(self, **kwargs)

        parent = kwargs.get('url_obj')

        # One 'ul' step per '@@'-separated segment of the parent key.
        depth = len(parent.key.split('@@'))
        nested_uls = '/'.join(['ul'] * depth)

        if nested_uls:
            link_xpath = '//ul[@id="zg_browseRoot"]/ul/%s/li/a' % nested_uls
        else:
            link_xpath = '//ul[@id="zg_browseRoot"]/ul/li/a'

        anchors = self.lr.xpaths(link_xpath)
        has_anchors = anchors is not None and len(anchors) > 0

        found = []
        for anchor in (anchors if has_anchors else ()):
            target = self.wrapped_url(
                urlparse.urljoin(self.lr.current_url, anchor.attrib['href']))
            label = anchor.text.strip()

            found.append(
                Url(url=target,
                    type=URL_TYPE.BEST_SELL_CATEGORY,
                    name=label,
                    key='%s@@%s' % (parent.key, label)))

        # The message is sent even when no link matched (empty list).
        self.output.put(['add', found])
Beispiel #2
0
    def wrapped(self, **kwargs):
        """Run ``method`` and queue every product link found on the page.

        Each anchor under a ``zg_itemImageImmersion`` div is resolved against
        ``self.lr.current_url``. The path segment that follows ``/dp/`` is
        used as the ``Url`` key. All results go to ``self.output`` in one
        ``['add', urls]`` message.

        Raises IndexError if a link does not contain ``/dp/``.
        """
        method(self, **kwargs)

        anchors = self.lr.xpaths('//div[@class="zg_itemImageImmersion"]/a')

        found = []
        for anchor in anchors:
            page = self.lr.current_url
            href = anchor.attrib['href'].strip()
            absolute = urlparse.urljoin(page, href)

            # Text between '/dp/' and the next '/', or to the end of the URL.
            after_dp = absolute.split('/dp/', 1)[1]
            asin = after_dp.partition('/')[0]

            found.append(
                Url(url=absolute, type=URL_TYPE.PRODUCT_URL, key=asin))

        self.output.put(['add', found])
Beispiel #3
0
    def categories_first(self, categories):
        """Store the top-level category links whose text is in *categories*.

        Loads ``self.best_url``, walks the first-level anchors under
        ``zg_browseRoot`` and adds a ``BEST_SELL_CATEGORY`` ``Url`` for every
        anchor whose stripped text is contained in *categories*, unless a
        ``Url`` with the same url already exists. The stripped text is used
        as both ``name`` and ``key``.

        :param categories: container of category names to keep.
        """
        self.load(self.best_url)
        session = Session(autocommit=True)

        category_eles = self.lr.xpaths('//ul[@id="zg_browseRoot"]/ul/li/a')
        for category_ele in category_eles:
            if category_ele.text is None:
                # An anchor without direct text cannot match a requested
                # name; skip it instead of failing on ``None.strip()``.
                continue

            name = category_ele.text.strip()
            if name not in categories:
                continue

            url = self.wrapped_url(
                urlparse.urljoin(self.best_url, category_ele.attrib['href']))

            if session.query(Url).filter_by(url=url).count() < 1:
                session.add(
                    Url(url=url,
                        type=URL_TYPE.BEST_SELL_CATEGORY,
                        name=name,
                        key=name))

        # In autocommit mode there is no enclosing transaction to commit, so
        # pending objects are only written when a flush runs. Nothing queries
        # the session after the final add(), so flush explicitly; otherwise
        # the last category can be left unwritten.
        session.flush()
Beispiel #4
0
    def wrapped(self, **kwargs):
        """Run ``method`` and queue the pagination links of the page.

        The first anchor inside ``zg_paginationWrapper`` is skipped. Every
        other anchor is resolved against ``self.lr.current_url`` and queued
        as a ``BEST_SELL_CATEGORY_NEXT`` ``Url``. The key is
        ``<last path segment before '/ref'>_<text after '&pg='>``.
        """
        method(self, **kwargs)

        pager_links = self.lr.xpaths('//div[@id="zg_paginationWrapper"]//a')

        found = []
        for anchor in pager_links[1:]:
            page = self.lr.current_url
            href = anchor.attrib['href'].strip()
            absolute = urlparse.urljoin(page, href)

            # Last path segment of the part that precedes '/ref'.
            before_ref = absolute.partition('/ref')[0]
            category_id = before_ref.rpartition('/')[2]
            # Falls back to the whole URL when '&pg=' is absent.
            page_no = absolute.split('&pg=', 1)[-1]

            found.append(
                Url(url=absolute,
                    type=URL_TYPE.BEST_SELL_CATEGORY_NEXT,
                    key='%s_%s' % (category_id, page_no)))

        self.output.put(['add', found])
Beispiel #5
0
    def wrapped(self, **kwargs):
        """Run ``method`` and bulk-save product links that are not stored yet.

        Each anchor under a ``zg_itemImageImmersion`` div is resolved against
        ``self.lr.current_url``. The path segment that follows ``/dp/`` is
        used as the ``Url`` key. A link is saved only if no ``Url`` with that
        key exists and the key has not already been seen in this call.

        Raises IndexError if a link does not contain ``/dp/``.
        """
        session = Session(autocommit=True)

        method(self, **kwargs)

        urls = []
        # Keys already handled in this call. The database check alone cannot
        # see objects still waiting in ``urls`` for the bulk save, so a
        # product linked twice on one page would otherwise be inserted twice.
        seen = set()
        product_eles = self.lr.xpaths(
            '//div[@class="zg_itemImageImmersion"]/a')
        for ele in product_eles:
            product_url = urlparse.urljoin(self.lr.current_url,
                                           ele.attrib['href'].strip())
            asin = product_url.split('/dp/', 1)[1].split('/', 1)[0]

            if asin in seen:
                continue
            seen.add(asin)

            if session.query(Url).filter_by(key=asin).count() < 1:
                logger.info('Add Product: %s', asin)
                urls.append(
                    Url(url=product_url, type=URL_TYPE.PRODUCT_URL, key=asin))

        # Nothing to write when every product was already known.
        if urls:
            session.bulk_save_objects(urls)
Beispiel #6
0
    def wrapped(self, **kwargs):
        """Run ``method`` and bulk-save pagination links not stored yet.

        The first anchor inside ``zg_paginationWrapper`` is skipped. Every
        other anchor is resolved against ``self.lr.current_url`` and keyed
        ``<last path segment before '/ref'>_<text after '&pg='>``. A link is
        saved as ``BEST_SELL_CATEGORY_NEXT`` only if no ``Url`` with that key
        exists and the key has not already been seen in this call.
        """
        session = Session(autocommit=True)

        method(self, **kwargs)

        urls = []
        # Keys already handled in this call. The database check alone cannot
        # see objects still waiting in ``urls`` for the bulk save, so a page
        # link that appears twice would otherwise be inserted twice.
        seen = set()
        for ele in self.lr.xpaths('//div[@id="zg_paginationWrapper"]//a')[1:]:
            page_url = urlparse.urljoin(self.lr.current_url,
                                        ele.attrib['href'].strip())

            # ``category_id`` rather than ``id`` to avoid shadowing the builtin.
            category_id = page_url.split('/ref', 1)[0].rsplit('/', 1)[-1]
            # Falls back to the whole URL when '&pg=' is absent.
            index = page_url.split('&pg=', 1)[-1]
            key = '%s_%s' % (category_id, index)

            if key in seen:
                continue
            seen.add(key)

            if session.query(Url).filter_by(key=key).count() < 1:
                logger.info('Add Category Next: %s', page_url)
                urls.append(
                    Url(url=page_url,
                        type=URL_TYPE.BEST_SELL_CATEGORY_NEXT,
                        key=key))

        # Nothing to write when every page link was already known.
        if urls:
            session.bulk_save_objects(urls)
Beispiel #7
0
    def wrapped(self, **kwargs):
        """Run ``method`` and bulk-save child category links not stored yet.

        Expects a ``url_obj`` keyword argument. Its ``key`` is split on
        ``'@@'`` and one nested ``ul`` step is added to the XPath per segment.
        Each matching anchor is keyed ``<parent key>@@<link text>``. It is
        saved as ``BEST_SELL_CATEGORY`` only if no ``Url`` with that key
        exists and the key has not already been seen in this call.
        """
        assert 'url_obj' in kwargs
        session = Session(autocommit=True)

        method(self, **kwargs)

        url_obj = kwargs.get('url_obj')

        # One 'ul' step per '@@'-separated segment of the parent key.
        p_categories = url_obj.key.split('@@')
        floor_ul = '/'.join(['ul'] * len(p_categories))

        category_xpaths = '//ul[@id="zg_browseRoot"]/ul/li/a'
        if floor_ul:
            category_xpaths = '//ul[@id="zg_browseRoot"]/ul/%s/li/a' % floor_ul

        urls = []
        # Keys already handled in this call. The database check alone cannot
        # see objects still waiting in ``urls`` for the bulk save, so two
        # links with the same text would otherwise be inserted twice.
        seen = set()
        category_eles = self.lr.xpaths(category_xpaths)
        if category_eles is not None and len(category_eles) > 0:
            for category_ele in category_eles:
                url = self.wrapped_url(
                    urlparse.urljoin(self.lr.current_url,
                                     category_ele.attrib['href']))
                text = category_ele.text.strip()

                key = '%s@@%s' % (url_obj.key, text)

                if key in seen:
                    continue
                seen.add(key)

                if session.query(Url).filter_by(key=key).count() < 1:
                    logger.info('Add Category: %s: %s', text, url)
                    urls.append(
                        Url(url=url,
                            type=URL_TYPE.BEST_SELL_CATEGORY,
                            name=text,
                            key=key))

        # Nothing to write when every category was already known.
        if urls:
            session.bulk_save_objects(urls)
Beispiel #8
0
    def categories_first2(self, categories):
        """Crawl the category tree from ``self.best_url`` and store what it finds.

        Three phases, all writing through ``session``:

        1. Load ``self.best_url`` and add a ``BEST_SELL_CATEGORY`` ``Url`` for
           every first-level anchor whose stripped text is in *categories*.
        2. While any ``BEST_SELL_CATEGORY`` row has ``has_crawled=False``,
           load it and add its sub-categories, pager links and product links,
           then mark it crawled.
        3. While any ``BEST_SELL_CATEGORY_NEXT`` row has ``has_crawled=False``,
           load it and add its product links, then mark it crawled.

        NOTE(review): ``session`` is not assigned anywhere in this method;
        presumably it is a module-level object -- confirm.

        :param categories: container of top-level category names to keep.
        """
        self.load(self.best_url)

        # Phase 1: top-level categories, de-duplicated by url.
        category_eles = self.lr.xpaths('//ul[@id="zg_browseRoot"]/ul/li/a')
        for category_ele in category_eles:
            if category_ele.text.strip() in categories:
                url = self.wrapped_url(
                    urlparse.urljoin(self.best_url,
                                     category_ele.attrib['href']))

                # The stripped link text is used as both key and name.
                key = category_ele.text.strip()
                if session.query(Url).filter_by(url=url).count() < 1:
                    session.add(
                        Url(url=url,
                            type=URL_TYPE.BEST_SELL_CATEGORY,
                            name=category_ele.text.strip(),
                            key=key))
                    session.commit()

        # Phase 2: the outer loop re-runs the query because the body adds new
        # BEST_SELL_CATEGORY rows (the sub-categories) while it runs.
        while session.query(Url).filter_by(type=URL_TYPE.BEST_SELL_CATEGORY,
                                           has_crawled=False).count() > 0:
            for url_obj in session.query(Url).filter_by(
                    type=URL_TYPE.BEST_SELL_CATEGORY, has_crawled=False):
                self.load(url_obj.url.encode('utf-8'))

                # sub category
                # One 'ul' step per '@@'-separated segment of this row's key.
                p_categories = url_obj.key.split('@@')
                floor_ul = '/'.join(['ul' for i in range(len(p_categories))])

                category_xpaths = '//ul[@id="zg_browseRoot"]/ul/li/a'
                if floor_ul:
                    category_xpaths = '//ul[@id="zg_browseRoot"]/ul/%s/li/a' % floor_ul

                category_eles = self.lr.xpaths(category_xpaths)
                if category_eles is not None and len(category_eles) > 0:
                    for category_ele in category_eles:
                        url = self.wrapped_url(
                            urlparse.urljoin(self.lr.current_url,
                                             category_ele.attrib['href']))
                        text = category_ele.text.strip()

                        # Child key is the parent key plus the link text.
                        key = '%s@@%s' % (url_obj.key, text)

                        if session.query(Url).filter_by(key=key).count() < 1:
                            session.add(
                                Url(url=url,
                                    type=URL_TYPE.BEST_SELL_CATEGORY,
                                    name=text,
                                    key=key))
                            session.commit()

                # end sub category

                # pager
                # NOTE(review): unlike the other sections, these rows get no
                # key and are added without an existence check.
                for ele in self.lr.xpaths(
                        '//div[@id="zg_paginationWrapper"]//a'):
                    page_url = urlparse.urljoin(self.lr.current_url,
                                                ele.attrib['href'].strip())
                    session.add(
                        Url(url=page_url,
                            type=URL_TYPE.BEST_SELL_CATEGORY_NEXT))
                    session.commit()

                # end pager

                # product
                product_eles = self.lr.xpaths(
                    '//div[@class="zg_itemImageImmersion"]/a')
                for ele in product_eles:
                    product_url = urlparse.urljoin(self.lr.current_url,
                                                   ele.attrib['href'].strip())
                    # Key is the path segment after '/dp/'; a link without
                    # '/dp/' raises IndexError here.
                    asin = product_url.split('/dp/', 1)[1].split('/', 1)[0]

                    if session.query(Url).filter_by(key=asin).count() < 1:
                        session.add(
                            Url(url=product_url,
                                type=URL_TYPE.PRODUCT_URL,
                                key=asin))
                        session.commit()
                # end product

                # Marking the row crawled removes it from the outer query.
                url_obj.has_crawled = True
                session.commit()

            session.commit()

        # Phase 3: the remaining pager pages only contribute product links.
        while session.query(Url).filter_by(
                type=URL_TYPE.BEST_SELL_CATEGORY_NEXT,
                has_crawled=False).count() > 0:
            for url_obj in session.query(Url).filter_by(
                    type=URL_TYPE.BEST_SELL_CATEGORY_NEXT, has_crawled=False):
                self.load(url_obj.url.encode('utf-8'))

                # product
                product_eles = self.lr.xpaths(
                    '//div[@class="zg_itemImageImmersion"]/a')
                for ele in product_eles:
                    product_url = urlparse.urljoin(self.lr.current_url,
                                                   ele.attrib['href'].strip())
                    # Same key extraction as in phase 2.
                    asin = product_url.split('/dp/', 1)[1].split('/', 1)[0]

                    if session.query(Url).filter_by(key=asin).count() < 1:
                        session.add(
                            Url(url=product_url,
                                type=URL_TYPE.PRODUCT_URL,
                                key=asin))
                        session.commit()
                # end product

                url_obj.has_crawled = True
                session.commit()