Esempio n. 1
0
 def parse(self, response):
     """Yield one detail request per URL produced for this spider.

     The URL list comes from
     ``GetFrequentWordsUrl(GetCommodityID(1, spidername=self.name))``;
     ``response`` itself is not read here. Every request is handled by
     ``self.parse_detail`` and carries its own URL under ``meta['url']``.
     """
     commodities = GetCommodityID(1, spidername=self.name)
     # Pagination is carried out right here.
     for target in GetFrequentWordsUrl(commodities):
         yield Request(
             url=target,
             callback=self.parse_detail,
             meta={'url': target},
         )
Esempio n. 2
0
class HM(scrapy.Spider):
    """Spider that turns tag-cloud JSONP responses into tag items.

    ``parse`` schedules one request per URL returned by
    ``GetFrequentWordsUrl``; ``parse_detail`` decodes the JSONP body of each
    response and yields one ``AimshopspiderTagItem`` per entry found under
    ``tags -> tagClouds``.
    """

    name = 'Qimi_tag'
    allowed_domains = ['tmall.com']
    # Evaluated once, when the class body executes.
    # NOTE(review): the meaning of the leading 0 here (vs. the 1 used in
    # ``parse``) is defined by GetCommodityID -- confirm against that helper.
    start_urls = GetFrequentWordsUrl(GetCommodityID(0, spidername=name))

    def parse(self, response):
        """Yield one detail request per URL produced for this spider.

        ``response`` is not read; the URL list is rebuilt from
        ``GetFrequentWordsUrl(GetCommodityID(1, spidername=self.name))``.
        Each request carries its own URL in ``meta['url']`` so that
        ``parse_detail`` can recover the commodity id from it.
        """
        for url in GetFrequentWordsUrl(
                GetCommodityID(1, spidername=self.name)):
            # Pagination is carried out right here.
            yield Request(url=url,
                          callback=self.parse_detail,
                          meta={'url': url})

    @staticmethod
    def _extract_item_id(url):
        """Return the integer value of the ``itemId`` query parameter.

        Only the run of digits directly after ``itemId=`` is captured, so the
        result does not depend on how many other parameters follow it (or on
        whether any follow at all).

        Raises:
            ValueError: if ``url`` has no ``itemId=<digits>`` component.
        """
        found = re.search(r'itemId=(\d+)', url)
        if found is None:
            raise ValueError('no numeric itemId in url: %r' % (url,))
        return int(found.group(1))

    @staticmethod
    def _load_jsonp(text):
        """Decode the JSON document wrapped in a ``callback(...)`` body.

        The payload is everything between the first ``(`` and the last ``)``.

        Raises:
            ValueError: if ``text`` is not of the form ``name(...)`` or the
                wrapped payload is not valid JSON.
        """
        found = re.search(r'^[^(]*?\((.*)\)[^)]*$', text)
        if found is None:
            raise ValueError('response body is not a JSONP document')
        return json.loads(found.group(1))

    def parse_detail(self, response):
        """Yield one ``AimshopspiderTagItem`` per tag-cloud entry.

        The commodity id is taken from ``response.meta['url']`` and the tag
        clouds from the JSONP body in ``response.text``. A missing or empty
        ``tags``/``tagClouds`` section yields nothing.

        Raises:
            ValueError: if the URL carries no numeric ``itemId`` or the body
                is not a decodable JSONP document.
            KeyError: if a tag-cloud entry lacks one of the expected fields.
        """
        # One timestamp is shared by every item of this response.
        crawls_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Extract the commodity id.
        com_id = self._extract_item_id(response.meta['url'])

        jsdict = self._load_jsonp(response.text)
        # Treat an absent section the same as an empty one.
        tag_clouds = (jsdict.get('tags') or {}).get('tagClouds') or []

        for cloud in tag_clouds:
            item = AimshopspiderTagItem()
            item['com_id'] = com_id  # commodity id
            item['tag'] = cloud['tag']  # tag text
            # True for a positive review, False for a negative one
            item['posi'] = cloud['posi']
            item['tag_count'] = cloud['count']  # tag count
            item['crawls_time'] = crawls_time  # crawl time
            item['weight'] = cloud['weight']
            item['id'] = cloud['id']
            yield item