Example #1
    def parse3(self, response):
        goods_data = response.meta["goods_data"]
        com_data = response.meta["com_data"]
        data = response.text
        data = data.replace('\\', '')
        try:
            goods_data["detail"] = data[:-3].split('":"')[1]
        except:
            pass
        if goods_data["detail"]:
            try:
                doc = pyquery.PyQuery(goods_data["detail"])
                for i in doc('*').items():
                    if i.attr('background'):
                        i.remove()
            except:
                pass
            try:
                for i in doc('img').items():
                    src = i.attr('src')
                    if not src:
                        i.remove()
                        continue
                    try:
                        if '?' in src:
                            src = src.split('?')[0]
                        if '"' in src:
                            src = src.replace('"', '').replace('\\', '')
                    except:
                        pass
                    upyun_pic = image_push(response.url, src)
                    i.attr('src', upyun_pic)
            except:
                pass

            try:
                for i in doc('a').items():
                    if i.attr('href') and 'detail.1688.com' in i.attr('href'):
                        i.attr('href', '')
                for i in doc('map').items():
                    i.remove()
            except:
                pass
            try:
                for i in doc('a').items():
                    if i.attr('href') and 'alicdn' in i.attr('href'):
                        i.remove()
            except:
                pass
            goods_data["detail"] = doc.outer_html()
        try:
            yield scrapy.Request(url=goods_data["com_url"],
                                 meta={
                                     "goods_data": goods_data,
                                     "com_data": com_data
                                 },
                                 callback=self.parse_company,
                                 dont_filter=True)
        except:
            pass
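
parse3 above unwraps the escaped detail HTML from the JSON-like response, cleans it with pyquery, and only then requests the company page. Below is a minimal, self-contained sketch of that cleanup step; the clean_detail_html name and the upload hook (standing in for image_push) are illustrative and not part of the original spider.

    # -*- coding: utf-8 -*-
    import pyquery

    def clean_detail_html(detail_html, upload=lambda src: src):
        """Sanitise a product-detail HTML fragment roughly the way parse3 above does."""
        doc = pyquery.PyQuery(detail_html)
        # drop any element that carries an inline background image
        for node in doc('*').items():
            if node.attr('background'):
                node.remove()
        # rewrite every <img> through the upload hook; drop images without a src
        for img in doc('img').items():
            src = img.attr('src')
            if not src:
                img.remove()
                continue
            src = src.split('?')[0].replace('"', '').replace('\\', '')
            img.attr('src', upload(src))
        # neutralise links back to detail.1688.com and remove image maps
        for a in doc('a').items():
            if a.attr('href') and 'detail.1688.com' in a.attr('href'):
                a.attr('href', '')
        for m in doc('map').items():
            m.remove()
        return doc.outer_html()

    print(clean_detail_html('<div><img src="http://example.com/a.jpg?x=1"></div>'))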
Example #2
    def parse(self, response):
        print(response.url)
        title = ""
        price = ""
        offer_num = ""
        send_time = ""
        send_money = ""
        com_name = ""
        buy_sell_num = ""
        com_addr = ""
        auth = ""
        com_url = ""
        mobile = ""
        telephone = ""
        seller = ""
        attrs_kv = []
        detail = ""
        thumb_1 = ""
        thumb_2 = ""
        thumb = ""
        cate_name_1 = ""
        cate_name_2 = ""
        cate_name_3 = ""
        min_price = max_price = 0
        price_unit = ''
        content = data = ''
        if response.xpath('//h1[@class="proTitle"]/text()'):
            try:
                try:
                    title = response.xpath(
                        '//h1[@class="proTitle"]/text()').extract()[0]
                except:
                    pass
                try:
                    price = response.xpath(
                        '//div[@class="topPriceRig"]/text()').extract()[1]
                except:
                    pass
                if not price:
                    try:
                        price = response.xpath(
                            '//div[@class="topPriceRig"]/text()').extract()[0]
                        mprice = price.replace('\r', '').replace(
                            '\n', '').replace('\t', '').replace(' ',
                                                                '').split('-')
                        min_price = mprice[0].strip().replace(u'¥', '')
                        max_price = mprice[1].strip().replace(u'¥', '')
                    except:
                        pass
                if not price:
                    try:
                        price = response.xpath(
                            '//div[@class="topPriceRig telBra"]/text()'
                        ).extract()[0]
                    except:
                        pass
                try:
                    price = price.replace('\r', '').replace('\n', '').replace(
                        '\t', '').replace(' ', '')
                except:
                    pass
                try:
                    if u'¥' in price:
                        price = price.replace(u'¥', '')
                except:
                    pass
                try:
                    offer_num = response.xpath(
                        '//span[@class="supply-numb"]/text()').extract()[0]
                except:
                    pass
                try:
                    for i in response.xpath('//div[@class="item-row-w"]'):
                        row = i.xpath('string(.)')
                        if u'发货期限' in row[0].extract():
                            send_time = i.xpath('text()').extract()[1]
                    send_time = send_time.replace('\r', '').replace(
                        '\n', '').replace('\t', '').replace(' ', '')
                except:
                    pass
                # try:
                #     send_money = response.xpath('//span[@class="i-txt"]/text()')
                # except:
                #     pass
                try:
                    buy_sell_num = response.xpath(
                        '//li[@class="line-btm"]/div/a/text()').extract()[0]
                except:
                    pass
                try:
                    com_name = response.xpath(
                        '//div[@class="comply-name"]/p/a/text()').extract()[0]
                    for i in response.xpath(
                            '//div[@class="item-mmt-txt"]/ul/li'):
                        row = i.xpath('string(.)')
                        if u'所在地区' in row[0].extract():
                            com_addr = i.xpath('div/p/text()').extract()[0]
                        if u'认证信息' in row[0].extract():
                            try:
                                auth = i.xpath('div/a/text()').extract()[0]
                            except:
                                auth = i.xpath('div/text()').extract()[0]
                    com_url = response.xpath(
                        '//p[@class="cName"]/a/@href').extract()[0]
                except:
                    pass
                try:
                    mobile = response.xpath(
                        '//em[@class="c-red"]/text()').extract()[0][1:]
                    telephone = response.xpath(
                        '//div[@class="p tel1"]/em/text()').extract()[0]
                    telephone = telephone[1:].split(' ')[0]
                    if not seller:
                        seller = response.xpath(
                            '//div[@class="p name"]/em/text()').extract(
                            )[0][1:]
                except:
                    pass
                try:
                    for i in response.xpath(
                            '//div[@class="d-vopy  parameter "]/ul/li'):
                        key = i.xpath('span/text()').extract()[0].replace(
                            '\r', '').replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ',
                                                                      '')[:-1]
                        value = i.xpath('p/text()').extract()[0].replace(
                            '\r', '').replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                        str = key + '|' + value
                        attrs_kv.append(str)
                except:
                    pass
                    # try:
                    #     detail = json.loads(data[1:-1])["html"]
                    # except:
                    #     pass
                try:
                    thumb = response.xpath(
                        '//ul[@id="thumblist"]/li[1]/div/a/@rel').extract()[0]
                    thumb = re.findall(r"largeimage: '(.*?)'", thumb)[0]
                    thumb_1 = response.xpath(
                        '//ul[@id="thumblist"]/li[2]/div/a/@rel').extract()[0]
                    thumb_1 = re.findall(r"largeimage: '(.*?)'", thumb_1)[0]
                    thumb_2 = response.xpath(
                        '//ul[@id="thumblist"]/li[3]/div/a/@rel').extract()[0]
                    thumb_2 = re.findall(r"largeimage: '(.*?)'", thumb_2)[0]
                except:
                    pass
                try:
                    json_data = re.findall(r'"supCatClass":(.*?),"supcatId"',
                                           response.text)[0]
                    json_data = json.loads(json_data)
                    cate_name_1 = json_data[0]["catName"]
                    cate_name_2 = json_data[1]["catName"]
                    cate_name_3 = json_data[2]["catName"]
                except:
                    pass
            except:
                pass
            # the other page-layout variant

        ss = response.xpath('//script/text()').extract()
        update_time = ''
        keys = []
        for i in ss:
            text = i
            for j in text.split('var'):
                keys.append(j.strip())
        for i in keys:
            i = i.replace('null',
                          'None').replace('false',
                                          'False').replace('true', 'True')
            if i:
                try:
                    exec i in locals()
                except Exception as e:
                    pass
        try:
            com_username = company_username.decode('utf-8')
        except:
            com_username = ''
        try:
            keywords = productWord
        except:
            try:
                keywords = searchVal
            except:
                try:
                    keywords = urllib.unquote(keywordencode).decode('gbk')
                except:
                    keywords = ''
        try:
            keywords = keywords.decode('utf-8')
        except:
            pass
        try:
            update_time = supplyInfoJson['pubDate'].split(' ')[0]
        except:
            update_time = (datetime.datetime.now() -
                           datetime.timedelta(30)).strftime('%Y-%m-%d')
        try:
            brand = supplyInfoJson['brandName']
        except:
            brand = ''
        try:
            brand = brand.decode('utf-8')
        except:
            pass
        try:
            businAttList = supplyInfoJson['businAttList']
        except:
            businAttList = []
        from_area = ''
        if businAttList:
            for i in businAttList:
                if i['attname'] == '产地':
                    from_area = i['attvalue']
                if not brand:
                    if i['attname'] == '品牌':
                        brand = i['attvalue']
        try:
            from_area = from_area.decode('utf-8')
        except:
            pass

        try:
            seller = companyContactor
        except:
            try:
                seller = contactor
            except:
                pass
        try:
            fax = companyJson['fax']
        except:
            fax = ''
        to_area = qq = ww = wechat = ''
        try:
            detail = supplyInfoJson['introduce']
            detail = detail.decode("utf-8")
        except:
            pass
        if u'质量保证,欢迎咨询洽谈' in detail:
            my_doc = pyquery.PyQuery(response.text)
            my_doc = my_doc("#introduce")
            detail = my_doc.outer_html()
        if detail:
            try:
                doc = pyquery.PyQuery(detail)
            except:
                pass
            print "start up upyun detail"
            for i in doc('img').items():
                try:
                    if i.attr('data-ke-src'):
                        i.attr('data-ke-src', '')
                except:
                    pass
                src = i.attr('src')
                try:
                    if 'hc360' not in src or 'no_pic' in src or 'nopic' in src:
                        i.remove()
                        continue
                except:
                    pass
                try:
                    if thumb and 'no_pic' in thumb:
                        thumb = src
                    if thumb and 'nopic' in thumb:
                        thumb = src
                except:
                    pass
                # hl = hashlib.md5()
                # hl.update(src.encode(encoding='utf-8'))
                # src_md5 = hl.hexdigest()  # MD5-hashed file name
                # # extract the image file extension
                # b = src.split(".")
                # tail = b[-1]
                # full_name = src_md5 + "." + tail
                # new_src = urlparse.urljoin(response.url,src)
                # pic_byte = self.get_pic_byte(new_src, 10)
                # if not pic_byte:
                #     i.remove()
                #     continue
                # upyun_pic = self.my_up_upyun("/" + full_name, pic_byte, 10)
                try:
                    upyun_pic = image_push(response.url, src)
                except:
                    upyun_pic = ''
                if upyun_pic and 'hc360' in upyun_pic:
                    i.remove()
                    continue
                i.attr('src', upyun_pic)
            try:
                for i in doc('a').items():
                    # if 'b2b.hc360.com/supplyself/' in i.attr('href'):
                    #     i.replace_with(pyquery.PyQuery(i.text()))
                    if i.attr('href') and 'hc360' in i.attr('href'):
                        # i.replace_with(pyquery.PyQuery(i.text()))
                        i.remove()
            except:
                pass

            try:
                for i in doc('img').items():
                    if i.attr('src'):
                        src = i.attr('src')
                        if 'hc360' in src or '//' == src:
                            i.remove()
                        if i.attr('data-ke-src'):
                            i.remove_attr('data-ke-src')
                        if i.attr('data-mce-src'):
                            i.remove_attr('data-mce-src')
                        if i.attr('data-cke-saved-src'):
                            i.remove_attr('data-cke-saved-src')
            except:
                pass
            try:
                for i in doc('*').items():
                    if i.attr('src') and 'hc360' in i.attr('src'):
                        i.attr('src', '')
            except:
                pass
            detail = doc.outer_html()
            try:
                detail = detail.replace('<div style="overflow:hidden;">',
                                        '<div>')
            except:
                pass
        try:
            min_amount = int(
                response.xpath('//tr[@class="item-cur-tab"]/td/text()').
                extract()[0].split('-')[0].strip())
        except:
            min_amount = 1
        try:
            price = re.search(r'\d+\.?\d+', price).group()
        except:
            price = 0
        if not min_price:
            min_price = price
        if not max_price:
            max_price = price
        if offer_num:
            try:
                res = re.search(r'(\d+)(.+)', offer_num.replace(' ',
                                                                '')).groups()
                offer_num = res[0]
                if len(res) > 1:
                    price_unit = res[1]
            except:
                pass
        print "start up upyun thumb"
        if thumb:
            # hl = hashlib.md5()
            # hl.update(thumb.encode(encoding='utf-8'))
            # src_md5 = hl.hexdigest()  # MD5-hashed file name
            # # extract the image file extension
            # b = thumb.split(".")
            # tail = b[-1]
            # full_name = src_md5 + "." + tail
            # new_src = urlparse.urljoin(response.url, thumb)
            # pic_byte = self.get_pic_byte(new_src , 10)
            # thumb = self.my_up_upyun("/" + full_name, pic_byte , 10)
            thumb = image_push(response.url, thumb)
        if 'hc360' in thumb:
            thumb = ''
        if thumb_1:
            # hl = hashlib.md5()
            # hl.update(thumb_1.encode(encoding='utf-8'))
            # src_md5 = hl.hexdigest()  # MD5-hashed file name
            # # extract the image file extension
            # b = thumb_1.split(".")
            # tail = b[-1]
            # full_name = src_md5 + "." + tail
            # new_src = urlparse.urljoin(response.url, thumb_1)
            # pic_byte = self.get_pic_byte(new_src, 10)
            # thumb_1 = self.my_up_upyun("/" + full_name, pic_byte, 10)
            thumb_1 = image_push(response.url, thumb_1)
            if 'hc360' in thumb_1:
                thumb_1 = ''
        if thumb_2:
            # hl = hashlib.md5()
            # hl.update(thumb_2.encode(encoding='utf-8'))
            # src_md5 = hl.hexdigest()  # MD5-hashed file name
            # # extract the image file extension
            # b = thumb_2.split(".")
            # tail = b[-1]
            # full_name = src_md5 + "." + tail
            # new_src = urlparse.urljoin(response.url, thumb_2)
            # pic_byte = self.get_pic_byte(new_src, 10)
            # thumb_2 = self.my_up_upyun("/" + full_name, pic_byte, 10)
            thumb_2 = image_push(response.url, thumb_2)
            if 'hc360' in thumb_2:
                thumb_2 = ''

        goods_data = {
            'source_url': response.url,
            'title': title,
            'price': price,
            'offer_num': offer_num,
            'send_time': send_time,
            'send_money': send_money,
            'com_name': com_name,
            'com_addr': com_addr,
            'auth': auth,
            'com_url': com_url,
            'mobile': mobile,
            'telephone': telephone,
            'seller': seller,
            'attrs_kv': attrs_kv,
            'detail': detail,
            'thumb_1': thumb_1,
            'thumb_2': thumb_2,
            'thumb': thumb,
            'cate_name_1': cate_name_1,
            'cate_name_2': cate_name_2,
            'cate_name_3': cate_name_3,
            'update_time': datetime.datetime.now().strftime('%Y-%m-%d'),
            'com_username': com_username,
            'keywords': keywords,
            'min_amount': min_amount,
            'min_price': min_price,
            'max_price': max_price,
            'price_unit': price_unit,
            'brand': brand,
            'to_area': to_area,
            'from_area': from_area,
            'qq': qq,
            'ww': ww,
            'fax': fax,
            'wechat': wechat,
        }

        # get the company URL and check whether the company has already been crawled
        com_url = ""
        try:
            com_url = response.xpath(
                '//p[@class="cName"]/a/@href').extract()[0]
        except:
            pass
        if not com_url:
            try:
                com_url = response.xpath(
                    '//div[@class="goods-tit goods-tit-blue"]/a/@href'
                ).extract()[0]
            except:
                pass
        # extract the company keyword (the b2b.hc360.com subdomain)
        reg = 'http://(.*?).b2b.hc360.com'
        com_word = re.findall(reg, com_url)[0]
        print "start test com"
        test_com_url = 'http://spiderhub.gongchang.com/write_to_online/data_show_onerow?secret=gc7232275&dataset=hc360_company&hkey=http://' + com_word + '.wx.hc360.com/shop/show.html'
        response = requests.get(test_com_url)
        # print(response.text)
        response = json.loads(response.text)
        # status False: the company has not been crawled yet; True: it already has
        print(com_url, response["status"])
        if response["status"] != True:
            # crawl the company info, put it into the Item's com_data and hand it to the mongo pipeline together with goods_data
            url_1 = "http://detail.b2b.hc360.com/detail/turbine/template/moblie,vmoblie,getcontact_us.html?username=" + com_word
            try:
                yield scrapy.Request(url=url_1,
                                     meta={
                                         "goods_data": goods_data,
                                         "com_word": com_word
                                     },
                                     callback=self.parse_company)
            except:
                pass
        else:
            if goods_data["detail"]:
                Item = HuicongGoodsFenbuItem()
                Item["goods_data"] = goods_data
                Item["com_data"] = ""
                yield Item
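
A large part of parse above is price normalisation: whitespace and the ¥ sign are stripped, and ranged prices such as "¥1.50 - ¥2.80" are split into a minimum and a maximum. A compact sketch of that normalisation, with an illustrative helper name, could look like this:

    # -*- coding: utf-8 -*-
    import re

    def normalise_price(raw):
        """Return (price, min_price, max_price) from a raw price string."""
        cleaned = re.sub(r'[\r\n\t ]', '', raw or u'').replace(u'¥', '')
        if '-' in cleaned:
            low, high = cleaned.split('-', 1)
            return cleaned, low, high
        return cleaned, cleaned, cleaned

    print(normalise_price(u'  ¥1.50 - ¥2.80\n'))  # min 1.50, max 2.80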
Example #3
    def parse_company2(self, response):
        goods_data = response.meta["goods_data"]
        com_word = response.meta["com_word"]
        com_data = response.meta["com_data"]
        print("yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy")
        print(response.url)
        content_2 = response.text
        try:
            content_2 = json.loads(content_2)
        except:
            content_2 = {}
        basic_info = content_2.get('basicInfo', {})
        comdesc = basic_info.get('companyIntroduce', '')
        imageUrl = basic_info.get('imageUrl', [])
        com_pic_upyun = ""
        com_pic = ""
        if imageUrl:
            com_pic = imageUrl[0].get('companyPicUrl', '')
            if com_pic:
                # com_pic_upyun = async_image_push.image_push(x[0], com_pic)
                # hl = hashlib.md5()
                # hl.update(com_pic.encode(encoding='utf-8'))
                # src_md5 = hl.hexdigest()  # MD5-hashed file name
                # # extract the image file extension
                # b = com_pic.split(".")
                # tail = b[-1]
                # full_name = src_md5 + "." + tail
                # new_src = urlparse.urljoin(response.url, com_pic)
                # pic_byte = self.get_pic_byte(new_src, 10)
                # com_pic_upyun = self.my_up_upyun("/" + full_name, pic_byte, 10)
                com_pic_upyun = image_push(response.url, com_pic)
        if 'hc360' in com_pic_upyun:
            com_pic_upyun = ''
        regcapital = regyear = main_industry = busmode = ceo = ''
        detail_info = content_2.get('detailInfo', {})
        if detail_info:
            if not com_data["address"]:
                com_data["address"] = detail_info.get('address', '')
            regcapital = detail_info.get('capital', '')
            if not com_data["contact"]:
                com_data["contact"] = detail_info.get('contactPeople', '')
            regyear = detail_info.get('createDate', '')
            if not com_data["conn_peopel_sex"]:
                com_data["conn_peopel_sex"] = detail_info.get('gender', '')
            main_industry = detail_info.get('industry', '')
            if not com_data["product"]:
                com_data["product"] = detail_info.get('majorProducts', '')
            busmode = detail_info.get('pattern', '')
            phone_info = detail_info.get('phone', [])
            if phone_info:
                for i in phone_info:
                    if i['name'] == u'传真' and not com_data["fax"]:
                        com_data["fax"] = i.get('value', '')
                    if i['name'] == u'手机' and not com_data["mobile"]:
                        com_data["mobile"] = i.get('value', '')
                    if i['name'] == u'电话1' and not com_data["tel"]:
                        com_data["tel"] = i.get('value', '')
            if not com_data["conn_peopel_position"]:
                com_data["conn_peopel_position"] = detail_info.get(
                    'position', '')
            ceo = detail_info.get('representative', '')
        com_data["comdesc"] = comdesc
        com_data["com_pic"] = com_pic
        com_data["com_pic_upyun"] = com_pic_upyun
        com_data["regcapital"] = regcapital
        com_data["regyear"] = regyear
        com_data["main_industry"] = main_industry
        com_data["busmode"] = busmode
        com_data["ceo"] = ceo

        try:
            yield scrapy.Request(
                url='https://js.hc360.com/b2b/%s/company.html' % (com_word, ),
                meta={
                    "goods_data": goods_data,
                    "com_word": com_word,
                    "com_data": com_data
                },
                callback=self.parse_company3)
        except:
            pass
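
parse_company2 above fills com_data only where fields are still empty, pulling values out of the company-profile JSON with .get() defaults so missing keys never raise. A minimal sketch of that merge pattern, assuming the same detailInfo keys (the helper name is illustrative):

    import json

    def merge_company_profile(com_data, raw_json):
        """Fill empty com_data fields from the profile JSON without overwriting existing values."""
        try:
            profile = json.loads(raw_json)
        except ValueError:
            profile = {}
        detail = profile.get('detailInfo', {}) or {}
        for field, key in (('address', 'address'),
                           ('contact', 'contactPeople'),
                           ('product', 'majorProducts')):
            if not com_data.get(field):
                com_data[field] = detail.get(key, '')
        return com_data

    print(merge_company_profile({'address': '', 'contact': 'Li', 'product': ''},
                                '{"detailInfo": {"address": "Beijing"}}'))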
Example #4
    def parse3(self, response):
        goods_data = response.meta["goods_data"]
        com_data = response.meta["com_data"]
        data = response.text
        data = data.replace('\\', '')
        try:
            goods_data["detail"] = data[:-3].split('":"')[1]
        except:
            pass
        if goods_data["detail"]:
            try:
                doc = pyquery.PyQuery(goods_data["detail"])
                for i in doc('*').items():
                    if i.attr('background'):
                        i.remove()
            except:
                pass
            try:
                for i in doc('img').items():
                    src = i.attr('src')
                    if not src:
                        i.remove()
                        continue
                    try:
                        if '?' in src:
                            src = src.split('?')[0]
                        if '"' in src:
                            src = src.replace('"', '').replace('\\', '')
                    except:
                        pass
                    # hl = hashlib.md5()
                    # hl.update(src.encode(encoding='utf-8'))
                    # src_md5 = hl.hexdigest()  # MD5-hashed file name
                    # # extract the image file extension
                    # b = src.split(".")
                    # tail = b[-1]
                    # full_name = src_md5 + "." + tail
                    # pic_byte = ""
                    # new_src = src
                    # pic_byte = self.get_pic_byte(new_src, 10)
                    # if not pic_byte:
                    #     i.remove()
                    #     continue
                    # upyun_pic = self.my_up_upyun("/" + full_name, pic_byte, 10)
                    upyun_pic = image_push(response.url, src)
                    i.attr('src', upyun_pic)
            except:
                pass

            try:
                for i in doc('a').items():
                    if i.attr('href') and 'detail.1688.com' in i.attr('href'):
                        i.attr('href', '')
                for i in doc('map').items():
                    i.remove()
            except:
                pass
            try:
                for i in doc('a').items():
                    if i.attr('href') and 'alicdn' in i.attr('href'):
                        i.remove()
            except:
                pass
            goods_data["detail"] = doc.outer_html()
        try:
            yield scrapy.Request(url=goods_data["com_url"],
                                 meta={
                                     "goods_data": goods_data,
                                     "com_data": com_data
                                 },
                                 callback=self.parse_company)
        except:
            pass
Example #5
    def parse2(self, response):
        goods_data = response.meta["goods_data"]
        detail_url = response.meta["detail_url"]
        try:
            goods_data["seller"] = response.xpath(
                '//a[@class="membername"]/text()').extract()[0]
        except:
            pass
        try:
            for i in response.xpath('//div[@class="contcat-desc"]/dl'):
                row = i.xpath('string(.)')
                row = row[0].extract().replace('\r', '').replace(
                    '\n', '').replace('\t',
                                      '').replace(' ',
                                                  '').replace(u'\xa0', '')
                a, b = row.split(u":")
                if u'电话' == a:
                    goods_data["telephone"] = b
                if u'移动电话' == a and u'登录' not in b:
                    goods_data["mobile"] = b
                if u'传真' == a:
                    goods_data["fax"] = b
                if u'地址' == a:
                    goods_data["com_addr"] = b
        except:
            pass

        if goods_data["thumb"]:
            try:
                # hl = hashlib.md5()
                # hl.update(goods_data["thumb"].encode(encoding='utf-8'))
                # src_md5 = hl.hexdigest()  # MD5-hashed file name
                # # extract the image file extension
                # b = goods_data["thumb"].split(".")
                # tail = b[-1]
                # full_name = src_md5 + "." + tail
                # new_src = urlparse.urljoin(response.url, goods_data["thumb"])
                # pic_byte = self.get_pic_byte(new_src , 10)
                # goods_data["thumb"] = self.my_up_upyun("/" + full_name, pic_byte , 10)
                goods_data["thumb"] = image_push(response.url,
                                                 goods_data["thumb"])
            except:
                pass
        if goods_data["thumb_1"]:
            try:
                # hl = hashlib.md5()
                # hl.update(goods_data["thumb_1"].encode(encoding='utf-8'))
                # src_md5 = hl.hexdigest()  # MD5-hashed file name
                # # extract the image file extension
                # b = goods_data["thumb_1"].split(".")
                # tail = b[-1]
                # full_name = src_md5 + "." + tail
                # new_src = urlparse.urljoin(response.url, goods_data["thumb_1"])
                # pic_byte = self.get_pic_byte(new_src , 10)
                # goods_data["thumb_1"] = self.my_up_upyun("/" + full_name, pic_byte , 10)
                goods_data["thumb_1"] = image_push(response.url,
                                                   goods_data["thumb_1"])
            except:
                pass
        if goods_data["thumb_2"]:
            try:
                # hl = hashlib.md5()
                # hl.update(goods_data["thumb_2"].encode(encoding='utf-8'))
                # src_md5 = hl.hexdigest()  # MD5-hashed file name
                # # extract the image file extension
                # b = goods_data["thumb_2"].split(".")
                # tail = b[-1]
                # full_name = src_md5 + "." + tail
                # new_src = urlparse.urljoin(response.url, goods_data["thumb_2"])
                # pic_byte = self.get_pic_byte(new_src , 10)
                # goods_data["thumb_2"] = self.my_up_upyun("/" + full_name, pic_byte , 10)
                goods_data["thumb_2"] = image_push(response.url,
                                                   goods_data["thumb_2"])
            except:
                pass
        if 'alicdn' in goods_data["thumb"]:
            goods_data["thumb"] = ''
        if 'alicdn' in goods_data["thumb_1"]:
            goods_data["thumb_1"] = ''
        if 'alicdn' in goods_data["thumb_2"]:
            goods_data["thumb_2"] = ''

        com_address = goods_data["com_addr"]
        com_product = ''
        com_comname = goods_data["com_name"]
        com_com_auth = ''
        com_contact = goods_data["seller"]
        com_conn_peopel_sex = ''
        com_fax = goods_data["fax"]
        com_mobile = goods_data["mobile"]
        com_tel = goods_data["telephone"]
        com_conn_peopel_position = ''
        com_source_url = goods_data["com_url"]
        com_comname_short = goods_data["com_name"]
        com_comtype = ''
        com_com_addr1 = ''
        com_ceo = ''
        com_provinces_and_cities = ''
        com_regyear = ''
        com_regcapital = ''
        com_employ = ''
        com_main_industry = ''
        com_main_addr = ''
        com_user_auth = ''
        com_new_login = ''
        com_wechat = ''
        com_comdesc = ''
        com_com_pic = ''
        com_com_pic_upyun = ''
        com_buy_goods = ''
        com_rdnum = ''
        com_busmode = ''
        com_period = ''
        com_survey = ''
        com_regist = ''
        com_com_status = ''
        com_bank_type = ''
        com_bank_num = ''
        com_bank_people = ''
        com_brand_name = ''
        com_customer = ''
        com_annulsale = ''
        com_annulexport = ''
        com_annulimport = ''
        com_business = ''
        com_com_area = ''
        com_monthly_production = ''
        com_OEM = ''
        com_zip = ''
        com_com_tel = ''
        com_email = ''
        com_website = ''
        com_aministration_area = ''
        com_com_addr2 = ''
        com_qc = ''
        com_com_location = ''
        com_com_reg_addr = ''
        com_business_num = ''
        com_tax_num = ''
        com_management_system = ''
        com_conn_peopel_department = ''

        com_data = {
            'address': com_address,
            'product': com_product,
            'comname': com_comname,
            'com_auth': com_com_auth,
            'contact': com_contact,
            'conn_peopel_sex': com_conn_peopel_sex,
            'fax': com_fax,
            'mobile': com_mobile,
            'tel': com_tel,
            'conn_peopel_position': com_conn_peopel_position,
            'source_url': com_source_url,
            'comname_short': com_comname_short,
            'comtype': com_comtype,
            'com_addr1': com_com_addr1,
            'ceo': com_ceo,
            'provinces_and_cities': com_provinces_and_cities,
            'regyear': com_regyear,
            'regcapital': com_regcapital,
            'employ': com_employ,
            'main_industry': com_main_industry,
            'main_addr': com_main_addr,
            'user_auth': com_user_auth,
            'new_login': com_new_login,
            'wechat': com_wechat,
            'comdesc': com_comdesc,
            'com_pic': com_com_pic,
            'com_pic_upyun': com_com_pic_upyun,
            'buy_goods': com_buy_goods,
            'rdnum': com_rdnum,
            'busmode': com_busmode,
            'period': com_period,
            'survey': com_survey,
            'regist': com_regist,
            'com_status': com_com_status,
            'bank_type': com_bank_type,
            'bank_num': com_bank_num,
            'bank_people': com_bank_people,
            'brand_name': com_brand_name,
            'customer': com_customer,
            'annulsale': com_annulsale,
            'annulexport': com_annulexport,
            'annulimport': com_annulimport,
            'business': com_business,
            'com_area': com_com_area,
            'monthly_production': com_monthly_production,
            'OEM': com_OEM,
            'zip': com_zip,
            'com_tel': com_com_tel,
            'email': com_email,
            'website': com_website,
            'aministration_area': com_aministration_area,
            'com_addr2': com_com_addr2,
            'qc': com_qc,
            'com_location': com_com_location,
            'com_reg_addr': com_com_reg_addr,
            'business_num': com_business_num,
            'tax_num': com_tax_num,
            'management_system': com_management_system,
            'conn_peopel_department': com_conn_peopel_department,
        }

        if detail_url:
            try:
                yield scrapy.Request(url=detail_url,
                                     meta={
                                         "goods_data": goods_data,
                                         "com_data": com_data
                                     },
                                     callback=self.parse3)
            except:
                pass
        else:
            if goods_data["detail"]:
                try:
                    doc = pyquery.PyQuery(goods_data["detail"])
                    for i in doc('*').items():
                        if i.attr('background'):
                            i.remove()
                except:
                    pass
                try:
                    for i in doc('img').items():
                        src = i.attr('src')
                        if not src:
                            i.remove()
                            continue
                        try:
                            if '?' in src:
                                src = src.split('?')[0]
                        except:
                            pass
                        # # hl = hashlib.md5()
                        # # hl.update(src.encode(encoding='utf-8'))
                        # # src_md5 = hl.hexdigest()  # MD5-hashed file name
                        # # # extract the image file extension
                        # # b = src.split(".")
                        # # tail = b[-1]
                        # # full_name = src_md5 + "." + tail
                        # # pic_byte = ""
                        # # new_src = src
                        # # pic_byte = self.get_pic_byte(new_src, 10)
                        # # if not pic_byte:
                        # #     i.remove()
                        # #     continue
                        # upyun_pic = self.my_up_upyun("/" + full_name, pic_byte, 10)
                        upyun_pic = image_push(response.url, src)
                        i.attr('src', upyun_pic)
                except:
                    pass
                try:
                    for i in doc('a').items():
                        if i.attr('href') and 'detail.1688.com' in i.attr('href'):
                            i.attr('href', '')
                    for i in doc('map').items():
                        i.remove()
                except:
                    pass
                goods_data["detail"] = doc.outer_html()
                if 'alicdn' in goods_data["detail"]:
                    goods_data["detail"] = ''
            try:
                yield scrapy.Request(url=goods_data["com_url"],
                                     meta={
                                         "goods_data": goods_data,
                                         "com_data": com_data
                                     },
                                     callback=self.parse_company)
            except:
                pass
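
parse2 above reads the seller's contact block by taking the text of each dl row, stripping whitespace, splitting at the colon and mapping the Chinese label onto a goods_data field. Here is a standalone sketch of that row parsing; the helper name is illustrative, and handling both full-width and ASCII colons is an assumption about the page markup.

    # -*- coding: utf-8 -*-
    def parse_contact_rows(rows):
        """Map '<label>:<value>' contact rows onto goods_data-style keys."""
        field_map = {u'电话': 'telephone', u'移动电话': 'mobile',
                     u'传真': 'fax', u'地址': 'com_addr'}
        out = {}
        for row in rows:
            row = row.replace(u'\xa0', u'').strip()
            sep = u'：' if u'：' in row else u':'
            if sep not in row:
                continue
            label, value = row.split(sep, 1)
            key = field_map.get(label.strip())
            if key and value:
                out[key] = value.strip()
        return out

    print(parse_contact_rows([u'电话：0571-12345678', u'地址：浙江省杭州市']))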
Example #6
    def parse_company2(self, response):
        goods_data = response.meta["goods_data"]
        com_word = response.meta["com_word"]
        com_data = response.meta["com_data"]
        print(response.url)
        content_2 = response.text
        try:
            content_2 = json.loads(content_2)
        except:
            content_2 = {}
        basic_info = content_2.get('basicInfo', {})
        comdesc = basic_info.get('companyIntroduce', '')
        imageUrl = basic_info.get('imageUrl', [])
        com_pic_upyun = ""
        com_pic = ""
        if imageUrl:
            com_pic = imageUrl[0].get('companyPicUrl', '')
            if com_pic:
                com_pic_upyun = image_push(response.url, com_pic)
        if 'hc360' in com_pic_upyun:
            com_pic_upyun = ''
        regcapital = regyear = main_industry = busmode = ceo = ''
        detail_info = content_2.get('detailInfo', {})
        if detail_info:
            if not com_data["address"]:
                com_data["address"] = detail_info.get('address', '')
            regcapital = detail_info.get('capital', '')
            if not com_data["contact"]:
                com_data["contact"] = detail_info.get('contactPeople', '')
            regyear = detail_info.get('createDate', '')
            if not com_data["conn_peopel_sex"]:
                com_data["conn_peopel_sex"] = detail_info.get('gender', '')
            main_industry = detail_info.get('industry', '')
            if not com_data["product"]:
                com_data["product"] = detail_info.get('majorProducts', '')
            busmode = detail_info.get('pattern', '')
            phone_info = detail_info.get('phone', [])
            if phone_info:
                for i in phone_info:
                    if i['name'] == u'传真' and not com_data["fax"]:
                        com_data["fax"] = i.get('value', '')
                    if i['name'] == u'手机' and not com_data["mobile"]:
                        com_data["mobile"] = i.get('value', '')
                    if i['name'] == u'电话1' and not com_data["tel"]:
                        com_data["tel"] = i.get('value', '')
            if not com_data["conn_peopel_position"]:
                com_data["conn_peopel_position"] = detail_info.get(
                    'position', '')
            ceo = detail_info.get('representative', '')
        com_data["comdesc"] = comdesc
        com_data["com_pic"] = com_pic
        com_data["com_pic_upyun"] = com_pic_upyun
        com_data["regcapital"] = regcapital
        com_data["regyear"] = regyear
        com_data["main_industry"] = main_industry
        com_data["busmode"] = busmode
        com_data["ceo"] = ceo

        try:
            yield scrapy.Request(
                url='https://js.hc360.com/b2b/%s/company.html' % (com_word, ),
                meta={
                    "goods_data": goods_data,
                    "com_word": com_word,
                    "com_data": com_data
                },
                callback=self.parse_company3)
        except:
            pass
Example #7
    def parse(self, response):
        print(response.url)
        title = ""
        price = ""
        offer_num = ""
        send_time = ""
        send_money = ""
        com_name = ""
        buy_sell_num = ""
        com_addr = ""
        auth = ""
        com_url = ""
        mobile = ""
        telephone = ""
        seller = ""
        attrs_kv = []
        detail = ""
        thumb_1 = ""
        thumb_2 = ""
        thumb = ""
        cate_name_1 = ""
        cate_name_2 = ""
        cate_name_3 = ""
        min_price = max_price = 0
        price_unit = ''
        content = data = ''
        if response.xpath('//h1[@class="proTitle"]/text()'):
            try:
                try:
                    title = response.xpath(
                        '//h1[@class="proTitle"]/text()').extract()[0]
                except:
                    pass
                try:
                    price = response.xpath(
                        '//div[@class="topPriceRig"]/text()').extract()[1]
                except:
                    pass
                if not price:
                    try:
                        price = response.xpath(
                            '//div[@class="topPriceRig"]/text()').extract()[0]
                        mprice = price.replace('\r', '').replace(
                            '\n', '').replace('\t', '').replace(' ',
                                                                '').split('-')
                        min_price = mprice[0].strip().replace(u'¥', '')
                        max_price = mprice[1].strip().replace(u'¥', '')
                    except:
                        pass
                if not price:
                    try:
                        price = response.xpath(
                            '//div[@class="topPriceRig telBra"]/text()'
                        ).extract()[0]
                    except:
                        pass
                try:
                    price = price.replace('\r', '').replace('\n', '').replace(
                        '\t', '').replace(' ', '')
                except:
                    pass
                try:
                    if u'¥' in price:
                        price = price.replace(u'¥', '')
                except:
                    pass
                try:
                    offer_num = response.xpath(
                        '//span[@class="supply-numb"]/text()').extract()[0]
                except:
                    pass
                try:
                    for i in response.xpath('//div[@class="item-row-w"]'):
                        row = i.xpath('string(.)')
                        if u'发货期限' in row[0].extract():
                            send_time = i.xpath('text()').extract()[1]
                    send_time = send_time.replace('\r', '').replace(
                        '\n', '').replace('\t', '').replace(' ', '')
                except:
                    pass
                try:
                    buy_sell_num = response.xpath(
                        '//li[@class="line-btm"]/div/a/text()').extract()[0]
                except:
                    pass
                try:
                    com_name = response.xpath(
                        '//div[@class="comply-name"]/p/a/text()').extract()[0]
                    for i in response.xpath(
                            '//div[@class="item-mmt-txt"]/ul/li'):
                        row = i.xpath('string(.)')
                        if u'所在地区' in row[0].extract():
                            com_addr = i.xpath('div/p/text()').extract()[0]
                        if u'认证信息' in row[0].extract():
                            try:
                                auth = i.xpath('div/a/text()').extract()[0]
                            except:
                                auth = i.xpath('div/text()').extract()[0]
                    com_url = response.xpath(
                        '//p[@class="cName"]/a/@href').extract()[0]
                except:
                    pass
                try:
                    mobile = response.xpath(
                        '//em[@class="c-red"]/text()').extract()[0][1:]
                    telephone = response.xpath(
                        '//div[@class="p tel1"]/em/text()').extract()[0]
                    telephone = telephone[1:].split(' ')[0]
                    if not seller:
                        seller = response.xpath(
                            '//div[@class="p name"]/em/text()').extract(
                            )[0][1:]
                except:
                    pass
                try:
                    for i in response.xpath(
                            '//div[@class="d-vopy  parameter "]/ul/li'):
                        key = i.xpath('span/text()').extract()[0].replace(
                            '\r', '').replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ',
                                                                      '')[:-1]
                        value = i.xpath('p/text()').extract()[0].replace(
                            '\r', '').replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                        str = key + '|' + value
                        attrs_kv.append(str)
                except:
                    pass
                try:
                    thumb = response.xpath(
                        '//ul[@id="thumblist"]/li[1]/div/a/@rel').extract()[0]
                    thumb = re.findall(r"largeimage: '(.*?)'", thumb)[0]
                    thumb_1 = response.xpath(
                        '//ul[@id="thumblist"]/li[2]/div/a/@rel').extract()[0]
                    thumb_1 = re.findall(r"largeimage: '(.*?)'", thumb_1)[0]
                    thumb_2 = response.xpath(
                        '//ul[@id="thumblist"]/li[3]/div/a/@rel').extract()[0]
                    thumb_2 = re.findall(r"largeimage: '(.*?)'", thumb_2)[0]
                except:
                    pass
                try:
                    json_data = re.findall(r'"supCatClass":(.*?),"supcatId"',
                                           response.text)[0]
                    json_data = json.loads(json_data)
                    cate_name_1 = json_data[0]["catName"]
                    cate_name_2 = json_data[1]["catName"]
                    cate_name_3 = json_data[2]["catName"]
                except:
                    pass
            except:
                pass

        ss = response.xpath('//script/text()').extract()
        update_time = ''
        keys = []
        for i in ss:
            text = i
            for j in text.split('var'):
                keys.append(j.strip())
        for i in keys:
            i = i.replace('null',
                          'None').replace('false',
                                          'False').replace('true', 'True')
            if i:
                try:
                    exec i in locals()
                except Exception as e:
                    pass
        try:
            com_username = company_username.decode('utf-8')
        except:
            com_username = ''
        try:
            keywords = productWord
        except:
            try:
                keywords = searchVal
            except:
                try:
                    keywords = urllib.unquote(keywordencode).decode('gbk')
                except:
                    keywords = ''
        try:
            keywords = keywords.decode('utf-8')
        except:
            pass
        try:
            update_time = supplyInfoJson['pubDate'].split(' ')[0]
        except:
            update_time = (datetime.datetime.now() -
                           datetime.timedelta(30)).strftime('%Y-%m-%d')
        try:
            brand = supplyInfoJson['brandName']
        except:
            brand = ''
        try:
            brand = brand.decode('utf-8')
        except:
            pass
        try:
            businAttList = supplyInfoJson['businAttList']
        except:
            businAttList = []
        from_area = ''
        if businAttList:
            for i in businAttList:
                if i['attname'] == '产地':
                    from_area = i['attvalue']
                if not brand:
                    if i['attname'] == '品牌':
                        brand = i['attvalue']
        try:
            from_area = from_area.decode('utf-8')
        except:
            pass

        try:
            seller = companyContactor
        except:
            try:
                seller = contactor
            except:
                pass
        try:
            fax = companyJson['fax']
        except:
            fax = ''
        to_area = qq = ww = wechat = ''
        try:
            detail = supplyInfoJson['introduce']
            detail = detail.decode("utf-8")
        except:
            pass
        if u'质量保证,欢迎咨询洽谈' in detail or not detail:
            my_doc = pyquery.PyQuery(response.text)
            my_doc = my_doc("#introduce")
            detail = my_doc.outer_html()
        if detail:
            try:
                doc = pyquery.PyQuery(detail)
            except:
                pass
            print "start up upyun detail"
            try:
                for i in doc('a').items():
                    if i.attr('href') and 'hc360' in i.attr('href'):
                        i.remove()
            except:
                pass
            try:
                for i in doc('img').items():
                    try:
                        if i.attr('data-ke-src'):
                            i.attr('data-ke-src', '')
                    except:
                        pass
                    src = i.attr('src')
                    try:
                        if 'hc360' not in src or 'no_pic' in src or 'nopic' in src:
                            i.remove()
                            continue
                    except:
                        pass
                    try:
                        if thumb and 'no_pic' in thumb:
                            thumb = src
                        if thumb and 'nopic' in thumb:
                            thumb = src
                    except:
                        pass
                    upyun_pic = ''
                    try:
                        upyun_pic = image_push(response.url, src)
                    except:
                        pass
                    if 'hc360' in upyun_pic:
                        i.remove()
                        continue
                    i.attr('src', upyun_pic)
            except:
                pass

            try:
                for i in doc('img').items():
                    if i.attr('src'):
                        src = i.attr('src')
                        if 'hc360' in src or '//' == src:
                            i.remove()
                        if i.attr('data-ke-src'):
                            i.remove_attr('data-ke-src')
                        if i.attr('data-mce-src'):
                            i.remove_attr('data-mce-src')
                        if i.attr('data-cke-saved-src'):
                            i.remove_attr('data-cke-saved-src')
            except:
                pass
            try:
                for i in doc('*').items():
                    if i.attr('src') and 'hc360' in i.attr('src'):
                        i.attr('src', '')
                    if i.attr('data-tfs-url'):
                        i.attr('data-tfs-url', '')
                    if i.attr('data-url'):
                        i.attr('data-url', '')
            except:
                pass
            detail = doc.outer_html()
            try:
                detail = detail.replace('<div style="overflow:hidden;">',
                                        '<div>')
            except:
                pass
        if detail and u'正在加载' in detail:
            detail = ''
        try:
            min_amount = int(
                response.xpath('//tr[@class="item-cur-tab"]/td/text()').
                extract()[0].split('-')[0].strip())
        except:
            min_amount = 1
        try:
            price = re.search(r'\d+\.?\d+', price).group()
        except:
            price = 0
        if not min_price:
            min_price = price
        if not max_price:
            max_price = price
        if offer_num:
            try:
                res = re.search(r'(\d+)(.+)', offer_num.replace(' ',
                                                                '')).groups()
                offer_num = res[0]
                if len(res) > 1:
                    price_unit = res[1]
            except:
                pass
        print "start up upyun thumb"
        if thumb:
            thumb = image_push(response.url, thumb)
        if 'hc360' in thumb:
            thumb = ''
        if thumb_1:
            thumb_1 = image_push(response.url, thumb_1)
            if 'hc360' in thumb_1:
                thumb_1 = ''
        if thumb_2:
            thumb_2 = image_push(response.url, thumb_2)
            if 'hc360' in thumb_2:
                thumb_2 = ''

        goods_data = {
            'source_url': response.url,
            'title': title,
            'price': price,
            'offer_num': offer_num,
            'send_time': send_time,
            'send_money': send_money,
            'com_name': com_name,
            'com_addr': com_addr,
            'auth': auth,
            'com_url': com_url,
            'mobile': mobile,
            'telephone': telephone,
            'seller': seller,
            'attrs_kv': attrs_kv,
            'detail': detail,
            'thumb_1': thumb_1,
            'thumb_2': thumb_2,
            'thumb': thumb,
            'cate_name_1': cate_name_1,
            'cate_name_2': cate_name_2,
            'cate_name_3': cate_name_3,
            'update_time': datetime.datetime.now().strftime('%Y-%m-%d'),
            'com_username': com_username,
            'keywords': keywords,
            'min_amount': min_amount,
            'min_price': min_price,
            'max_price': max_price,
            'price_unit': price_unit,
            'brand': brand,
            'to_area': to_area,
            'from_area': from_area,
            'qq': qq,
            'ww': ww,
            'fax': fax,
            'wechat': wechat,
        }

        # get the company URL and check whether the company has already been crawled
        com_url = ""
        try:
            com_url = response.xpath(
                '//p[@class="cName"]/a/@href').extract()[0]
        except:
            pass
        if not com_url:
            try:
                com_url = response.xpath(
                    '//div[@class="goods-tit goods-tit-blue"]/a/@href'
                ).extract()[0]
            except:
                pass
        # extract the company keyword (the b2b.hc360.com subdomain)
        reg = 'http://(.*?).b2b.hc360.com'
        com_word = re.findall(reg, com_url)[0]
        print(" ")
        test_com_url = 'http://' + com_word + '.wx.hc360.com/shop/show.html'
        conn = pymysql.connect(host='192.168.14.90',
                               port=3306,
                               user='******',
                               passwd='123456',
                               db='hc360',
                               charset='utf8')
        cursor = conn.cursor()
        cursor.execute(
            "select * from com_tmp where url = '{}'".format(test_com_url))
        conn.commit()
        result = cursor.fetchone()
        if not result:
            # the company has not been crawled yet
            try:
                cursor.execute(
                    "insert into com_tmp (url) values ('{}')".format(
                        test_com_url))
                conn.commit()
            except:
                pass
            cursor.close()
            conn.close()
            # crawl the company info, put it into the Item's com_data and hand it to the mongo pipeline together with goods_data
            url_1 = "http://detail.b2b.hc360.com/detail/turbine/template/moblie,vmoblie,getcontact_us.html?username=" + com_word
            try:
                yield scrapy.Request(url=url_1,
                                     meta={
                                         "goods_data": goods_data,
                                         "com_word": com_word
                                     },
                                     callback=self.parse_company)
            except:
                pass
        else:
            cursor.close()
            conn.close()
            # the company has already been crawled
            if goods_data["detail"]:
                Item = HuicongGoodsFenbuItem()
                Item["goods_data"] = goods_data
                Item["com_data"] = ""
                yield Item
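
The dedup step in parse above checks a com_tmp MySQL table for the company URL and inserts the URL when it is new, so each shop is only crawled once. Below is a sketch of the same check-then-insert step using parameterised queries instead of string formatting; the table and column names come from the code above, while the connection details and the helper name are placeholders.

    import pymysql

    def company_already_seen(conn, url):
        """Return True if url is already recorded in com_tmp, otherwise record it and return False."""
        cursor = conn.cursor()
        try:
            cursor.execute("select 1 from com_tmp where url = %s", (url,))
            if cursor.fetchone():
                return True
            cursor.execute("insert into com_tmp (url) values (%s)", (url,))
            conn.commit()
            return False
        finally:
            cursor.close()

    # usage (placeholder credentials):
    # conn = pymysql.connect(host='127.0.0.1', port=3306, user='spider',
    #                        passwd='secret', db='hc360', charset='utf8')
    # if not company_already_seen(conn, 'http://example.wx.hc360.com/shop/show.html'):
    #     pass  # crawl the company pages here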