Exemple #1
0
    def parse_intr(self, tree, xpath):
        dom = tree.xpath(xpath)
        introduce = {}
        temporary = ''
        for item in dom:
            item = item.strip()
            if item == '':
                continue
            elif item.find(':') > 0:
                item = item.split(':', 1)
                if item[1] == '':
                    temporary = extract_title(item[0])
                else:
                    introduce[extract_title(item[0])] = extract_text(item[1])
            else:
                if temporary != '':
                    introduce[temporary] = extract_text(item)
                    temporary = ''
                else:
                    continue

        if introduce != '':
            return introduce
        else:
            return ''
Exemple #2
0
    def crawl(self):
        # wareId = '1229271'
        # wareId = '1391817787'
        # priorcategory = ["家居家装","清洁用品","衣物清洁"]
        # presentcategory = ['1','2','3']
        # ids = uuid.uuid1()


        wareId = self.key
        ids =  self.data.get('uuid')
        category_data = extract_category(self)

        url = 'http://m.360buy.com/product/guige/%s.html'%(str(wareId))
        html_stream = ProcessData.get_web_data(url)
        tree = etree.HTML(html_stream.text)
        xpath = "//table[@class='Ptable']/tr/td/text()"
        dom = tree.xpath(xpath)
        specifications = {}
        temporary = ''
        i = 0
        for item in dom:
            item = item.strip()
            if item == '':
                continue
            if i%2 ==0:
                specifications[item] = ''
                temporary = extract_title(item)
            else:
                specifications[temporary] = extract_text(item)

            i += 1

        data = {
            'ecnorms':specifications
        }
        # specifications = json.dumps(specifications, ensure_ascii=False)
        introduce = IntroduceCrawler.crawl(wareId,ids)
        ecbrands = introduce[u'品牌'] if introduce.get(u'品牌') else ''
   #     ecnames = introduce[u'商品名称'].replace('\'',' ') if introduce.get(u'商品名称') else ''
        ecnames = introduce[u'商品名称'] if introduce.get(u'商品名称') else ''
        crawl_data = {
            'id': ids,
            'source': self.data.get('source'),
            'source_id': wareId,
            'summary': specifications,
            'introduce': introduce,
            'name': ecnames,
            'brand': ecbrands
        }
        crawl_data.update(category_data)
        model = EcDetailModel(crawl_data)
        export(model)
Exemple #3
0
    def crawl(wareId,ids ):
        import sys
        reload(sys)
        sys.setdefaultencoding("utf-8")

        url = 'http://item.jd.com/%s.html'%(str(wareId))
        html_stream = ProcessData.get_web_data(url)
        if html_stream=={}:
            return {}
        html_stream.encoding = 'gb2312'
        tree = etree.HTML(html_stream.text)
        xpath = "//div[@id='product-detail-1']/ul[@class='detail-list']/li//text()"
        dom = tree.xpath(xpath)
        introduce = {}
        temporary = ''
        for item in dom:
            item = item.strip()
            if item == '':
                continue
            elif item.find(':') >0:
                item = item.split(':',1)
                if item[1] == '':
                    temporary = extract_title(item[0])
                else:
                    introduce[extract_title(item[0])] = extract_text(item[1])
            else:
                if temporary != '':
                    introduce[temporary] = extract_text(item)
                    temporary = ''
                else:
                    continue

        if introduce != '':
            return introduce
        else:
            return ''
Exemple #4
0
 def parse_summary(self, tree, xpath):
     dom = tree.xpath(xpath)
     specifications = {}
     temporary = ''
     i = 0
     for item in dom:
         item = item.strip()
         if item == '':
             continue
         if i % 2 == 0:
             specifications[item] = ''
             temporary = extract_title(item)
         else:
             specifications[temporary] = extract_text(item)
         i += 1
     return specifications