def start_requests(self):
    """Load the spider configuration, then yield the seed list-page requests.

    Steps, in order:

    1. Start file logging.
    2. Read ``configure/smzdm.ini`` and store the ``item_page`` patterns on
       the spider (all compiled as regexes except ``head_separator``, which
       is kept as decoded text).
    3. Read ``configure/shopping_page.ini`` and, for every section, register
       its compiled ``url_pattern`` in the URL-pattern lookup dict together
       with that section's xpaths, price clean-up regex and currency.
    4. Yield one ``WebdriverRequest`` per seed category page, each routed to
       ``self.parse_smzdm_list_page`` with the category in ``meta``.

    Because this is a generator, none of the above runs until the first
    request is pulled from it.
    """
    log.start(logfile=self.__smzdm_log_file, loglevel='INFO', logstdout=False)

    item_config = ConfigParser.RawConfigParser()
    item_config.read("configure/smzdm.ini")

    def item_page_text(option):
        # Every item_page option is decoded from UTF-8 before use.
        return item_config.get("item_page", option).decode("utf-8")

    # The attribute name on the spider matches the option name in the file.
    for option in ("price_pattern", "usd_price_pattern",
                   "jpy_price_pattern", "eur_price_pattern"):
        setattr(self, option, re.compile(item_page_text(option)))
    # The head separator is stored as plain text, not as a compiled regex.
    self.head_separator = item_page_text("head_separator_pattern")
    self.attachment_pattern = re.compile(item_page_text("attachment_pattern"))

    site_config = ConfigParser.RawConfigParser()
    site_config.read("configure/shopping_page.ini")
    read = site_config.get

    for section in site_config.sections():
        url_pattern_text = read(section, "url_pattern").decode('utf8')
        log.msg("Supported url pattern:\t%s" % url_pattern_text,
                level=log.DEBUG, spider=SmzdmSpider)
        compiled_url_pattern = re.compile(url_pattern_text)
        # Tuple layout: (title_xpath, price_xpath, price_redudant_pattern,
        # description_xpath, description_img_xpath, currency,
        # title_img_xpath_list).
        self.__url_pattern_xpath_dict[compiled_url_pattern] = (
            read(section, "title_xpath"),
            read(section, "price_xpath"),
            re.compile(read(section, "price_redudant_pattern").decode('utf8')),
            read(section, "description_xpath"),
            read(section, "description_img_xpath"),
            read(section, "currency"),
            # Comma-separated list of alternative title-image xpaths.
            read(section, "title_img_xpath").split(","),
        )

    # NOTE(review): the return value is discarded here. If
    # CrawlSpider.start_requests is a generator function, this call has no
    # effect -- confirm whether its requests were meant to be yielded too.
    CrawlSpider.start_requests(self)

    # (list-page URL, category tag) pairs, in the order they are requested.
    seed_pages = (
        ('http://www.smzdm.com/fenlei/yingertuiche/youhui/p1', 'stroller'),
        ('http://www.smzdm.com/fenlei/anquanzuoyi/youhui/p1', 'car_seat'),
        ('http://www.smzdm.com/fenlei/lego/youhui/p1', 'lego'),
        ('http://www.smzdm.com/fenlei/huwaibeibao/youhui/p1', 'backpack'),
        ('http://www.smzdm.com/fenlei/yingertuiche/haitao/p1', 'stroller'),
        ('http://www.smzdm.com/fenlei/anquanzuoyi/haitao/p1', 'car_seat'),
        ('http://www.smzdm.com/fenlei/lego/haitao/p1', 'lego'),
        ('http://www.smzdm.com/fenlei/huwaibeibao/haitao/p1', 'backpack'),
    )
    for page_url, category in seed_pages:
        yield WebdriverRequest(page_url, meta={'category': category},
                               callback=self.parse_smzdm_list_page)
Exemple #2
0
    def start_requests(self):
        """Prepare parsing configuration and emit the initial list-page requests.

        Reads the ``item_page`` section of ``configure/smzdm.ini`` into
        spider attributes (price/attachment regexes and the head separator
        text), fills the URL-pattern lookup dict from every section of
        ``configure/shopping_page.ini``, and finally yields a
        ``WebdriverRequest`` for each seed category page with
        ``self.parse_smzdm_list_page`` as its callback.

        This is a generator: configuration loading happens when the first
        request is requested, not when the method is called.
        """
        log.start(logfile=self.__smzdm_log_file,
                  loglevel='INFO',
                  logstdout=False)

        smzdm_settings = ConfigParser.RawConfigParser()
        smzdm_settings.read("configure/smzdm.ini")

        def decoded_item_option(name):
            # item_page values are decoded from UTF-8 before being used.
            return smzdm_settings.get("item_page", name).decode("utf-8")

        # These four options are compiled and stored under their own names.
        regex_options = ("price_pattern", "usd_price_pattern",
                         "jpy_price_pattern", "eur_price_pattern")
        for name in regex_options:
            setattr(self, name, re.compile(decoded_item_option(name)))
        # Unlike the others, the head separator is kept as text.
        self.head_separator = decoded_item_option("head_separator_pattern")
        self.attachment_pattern = re.compile(
            decoded_item_option("attachment_pattern"))

        shop_settings = ConfigParser.RawConfigParser()
        shop_settings.read("configure/shopping_page.ini")
        option_of = shop_settings.get

        for shop in shop_settings.sections():
            pattern_source = option_of(shop, "url_pattern").decode('utf8')
            log.msg("Supported url pattern:\t%s" % pattern_source,
                    level=log.DEBUG,
                    spider=SmzdmSpider)
            shop_url_regex = re.compile(pattern_source)
            # Value layout: (title_xpath, price_xpath,
            # price_redudant_pattern, description_xpath,
            # description_img_xpath, currency, title_img_xpath_list).
            self.__url_pattern_xpath_dict[shop_url_regex] = (
                option_of(shop, "title_xpath"),
                option_of(shop, "price_xpath"),
                re.compile(
                    option_of(shop, "price_redudant_pattern").decode('utf8')),
                option_of(shop, "description_xpath"),
                option_of(shop, "description_img_xpath"),
                option_of(shop, "currency"),
                # Several title-image xpaths may be given, comma-separated.
                option_of(shop, "title_img_xpath").split(","),
            )

        # NOTE(review): the result of this call is thrown away. If
        # CrawlSpider.start_requests is a generator function the call does
        # nothing -- confirm whether its requests should be yielded as well.
        CrawlSpider.start_requests(self)

        # Seed pages as (url, category) pairs; request order is preserved.
        seeds = (
            ('http://www.smzdm.com/fenlei/yingertuiche/youhui/p1', 'stroller'),
            ('http://www.smzdm.com/fenlei/anquanzuoyi/youhui/p1', 'car_seat'),
            ('http://www.smzdm.com/fenlei/lego/youhui/p1', 'lego'),
            ('http://www.smzdm.com/fenlei/huwaibeibao/youhui/p1', 'backpack'),
            ('http://www.smzdm.com/fenlei/yingertuiche/haitao/p1', 'stroller'),
            ('http://www.smzdm.com/fenlei/anquanzuoyi/haitao/p1', 'car_seat'),
            ('http://www.smzdm.com/fenlei/lego/haitao/p1', 'lego'),
            ('http://www.smzdm.com/fenlei/huwaibeibao/haitao/p1', 'backpack'),
        )
        for seed_url, seed_category in seeds:
            yield WebdriverRequest(seed_url,
                                   meta={'category': seed_category},
                                   callback=self.parse_smzdm_list_page)
Exemple #3
0
 def start_requests(self):
     """Return the scrape requests followed by the crawl spider's start requests.

     Both sources are obtained up front (scrape requests first) and joined
     into one iterator with ``itertools.chain``.
     """
     scrape_requests = CallbackMixin.scrape_requests(self)
     crawl_requests = _CrawlSpider.start_requests(self)
     return itertools.chain(scrape_requests, crawl_requests)
 def start_requests(self):
     """Chain the mixin's scrape requests with the crawl spider's start requests.

     ``CallbackMixin.scrape_requests`` is called before
     ``_CrawlSpider.start_requests``; the returned iterator yields the
     former's items first, then the latter's.
     """
     request_sources = (
         CallbackMixin.scrape_requests(self),
         _CrawlSpider.start_requests(self),
     )
     return itertools.chain(*request_sources)
Exemple #5
0
 def start_requests(self):
     """Return whatever ``CrawlSpider.start_requests()`` produces.

     The parent implementation is named explicitly: per the original
     author's note, CrawlSpider's version should take precedence over
     SitemapSpider's.
     """
     crawl_requests = CrawlSpider.start_requests(self)
     return crawl_requests