Beispiel #1
0
 def get_huangye(self, html):
     """Scrape intro, header text, title and keywords from a parsed page.

     Every field is extracted on a best-effort basis: a failed lookup is
     reported on stdout and the remaining fields are still attempted.
     Extracted values are stored in ``self.resultItem``.

     Args:
         html: parsed document exposing ``xpath()`` (presumably an lxml
             element -- confirm against the caller).

     Returns:
         Always ``2``.
     """
     status = 2
     try:
         company_intro = html.xpath(
             '//*[@id="site_content"]/div[1]/div/div[1]/div/div[2]/div[2]/span'
         )[0].text
         if company_intro is None or len(str(company_intro)) < 5:
             # Nothing usable at the strict path: retry with the looser
             # one, which yields a list of text nodes rather than a string.
             company_intro = html.xpath(
                 '//*[@id="site_content"]/div[1]/div/div/div/div[2]/div[2]/span/text()'
             )
         self.resultItem['company_intro'] = process_str(company_intro)
     except Exception as e:
         print(str(self.company) + str(e) + ",company_intro")
     try:
         company_hidder = html.xpath(
             '//*[@id="site_content"]/div[1]/div/div[1]/div/div[2]/p'
         )[0].text
         # Compare the text *length* (the original compared the string
         # itself against 5), mirroring the company_intro check above.
         if company_hidder is None or len(str(company_hidder)) < 5:
             company_hidder = html.xpath(
                 '//*[@id="site_content"]/div[1]/div/div/div/div[2]/p'
             )[0].text
         self.resultItem['company_hidder'] = process_str(company_hidder)
         # The title shares this try block, so it is only stored when the
         # header text was extracted successfully.
         self.resultItem['title'] = process_str(
             html.xpath('/html/head/title')[0].text)
     except Exception as e:
         # Bind the exception here; the bare ``except:`` left ``e``
         # undefined and made the handler itself raise.
         print(str(self.company) + str(e) + ",company_hidder")
     try:
         self.resultItem['classifications'] = process_str(
             html.xpath('//meta[@name="keywords"]/@content'))
     except Exception as e:
         print(str(self.company) + str(e) + ",classifications")
     return status
Beispiel #2
0
    def parse_company_info(self, html1):
        """Parse a company page and fill ``self.resultItem`` with its details.

        Two fields are read directly via XPath (best effort, failures are
        printed); the others are delegated to the ``get_*`` helper methods
        of this object.

        Args:
            html1: raw markup handed to ``etree.HTML``.

        Returns:
            ``None`` when the markup cannot be parsed, otherwise ``1``.
        """
        # Company information
        try:
            html = etree.HTML(html1)
        except Exception:
            # A bare ``except:`` would also swallow KeyboardInterrupt.
            return
        try:
            self.resultItem['shop_long'] = html.xpath(
                '//a[@href="https://cxt.1688.com/"]/text()')
        except Exception as e:
            # str(e) instead of e.message: the attribute is gone in Python 3.
            print(str(self.company) + str(e))

        # Contact person's name
        self.resultItem['contact_name'] = self.get_contact_name(html)

        # Landline telephone number
        self.resultItem['telphone'] = self.get_telphone(html)

        # Mobile phone number
        self.resultItem['phone'] = self.get_phone(html)

        # Years in business
        try:
            self.resultItem['bus_year'] = html.xpath(
                '//*[@id="site_content"]/div/div/div/div[2]/div/div[2]/div/div[1]/div/div/h1/a[2]/text()'
            )
        except Exception as e:
            print(str(self.company) + str(e))

        # Credit level
        self.resultItem['credit_level'] = process_str(
            self.get_credit_level(html))

        # Company profile
        self.resultItem['company_intro'] = process_str(
            self.get_company_intro(html))

        # Brands the company deals in
        self.resultItem['brand'] = process_str(self.get_company_brand(html))

        # Wangwang account
        self.resultItem['wang_wang'] = process_str(self.get_wangwang(html))

        # Registered capital
        self.resultItem['registered_capital'] = process_str(
            self.get_original_money(html))

        self.get_trade_credit_record(html)
        self.get_buyer_service_ablity(html)
        return 1
Beispiel #3
0
    def company_baseinfo(self, html1):
        """Read the first search hit (``offer1``) and validate its company name.

        The company name found on the page is normalised (``stringQ2B``,
        lower-cased, stripped, parentheses removed) and compared with
        ``self.company``. On a match the basic fields are stored in
        ``self.resultItem``; the fields after the name are best effort.

        Args:
            html1: raw markup handed to ``etree.HTML``.

        Returns:
            ``None`` if the markup cannot be parsed, ``1`` if the company
            name on the page differs from ``self.company``, ``3`` if the
            name cannot be read at all, otherwise the company URL taken
            from the link's ``href``.
        """
        try:
            html = etree.HTML(html1)
        except Exception:
            return
        url = ''
        # Company name
        try:
            company_name = html.xpath(
                '//*[@id="offer1"]/div[1]/div[2]/div[1]/a[1]')
            name_link = company_name[0]
            new_company = stringQ2B(
                name_link.get('title')).lower().strip().replace(
                    ')', '').replace('(', '')
            # Plain inequality replaces cmp(), which Python 3 removed.
            if new_company != self.company:
                print(self.company + ",公司名称不相等")
                return 1
            self.resultItem['company'] = process_str(name_link.get('title'))
            url = name_link.get("href")
            self.resultItem['url'] = url
        except Exception:
            print(str(self.company) + str('公司名称不存在'))
            return 3

        # Main line of business
        try:
            main_c = html.xpath(
                '//*[@id="offer1"]/div[1]/div[2]/div[3]/div[1]/div[1]/a/*')
            self.resultItem['main_d'] = ','.join(
                [node.text for node in main_c])
        except Exception as e:
            # str(e) instead of e.message: the attribute is gone in Python 3.
            print(str(self.company) + str(e))

        # Company address
        try:
            self.resultItem['address'] = html.xpath(
                '//*[@id="offer1"]/div[1]/div[2]/div[3]/div[1]/div[2]/a'
            )[0].text
        except Exception as e:
            print(str(self.company) + str(e))

        # Number of employees
        try:
            self.resultItem['company_persons'] = html.xpath(
                '//*[@id="offer1"]/div[1]/div[2]/div[3]/div[1]/div[3]/a'
            )[0].get('title')
        except Exception as e:
            print(str(self.company) + str(e))

        # Trade type
        try:
            self.resultItem['model'] = html.xpath(
                '//*[@id="offer1"]/div[1]/div[2]/div[3]/div[2]/div[1]/b'
            )[0].text
        except Exception as e:
            print(str(self.company) + str(e))

        return url
Beispiel #4
0
 def get_yellow_page_info(self, html1):
     """Parse a yellow-page document and store its fields in ``self.resultItem``.

     Extracts the company intro, the header paragraph (which is also passed
     to ``self.get_contract_name``), the page title and the keywords meta
     tag. Each group is best effort; failures of the last two groups are
     logged through ``self.logger``.

     Args:
         html1: raw markup handed to ``etree.HTML``.

     Returns:
         ``None`` when no document tree could be built, otherwise ``2``.
     """
     try:
         html = etree.HTML(html1)
     except Exception:
         return
     if html is None:
         # NOTE(review): the parser appears able to return None instead of
         # raising for markup without parsable content -- confirm against
         # the lxml version in use. Without a tree, the fallback lookup in
         # the handler below would raise an uncaught AttributeError.
         return
     status = 2
     try:
         company_intro = html.xpath(
             '//*[@id="site_content"]/div[1]/div/div[1]/div/div[2]/div[2]/span'
         )[0].text
     except Exception:
         # Strict path matched nothing: fall back to the looser path,
         # which yields a list of text nodes rather than a single string.
         company_intro = html.xpath(
             '//*[@id="site_content"]/div[1]/div/div/div/div[2]/div[2]/span/text()'
         )
     if company_intro is None or len(str(company_intro)) < 5:
         print('company_intro is has no scrape')
     else:
         self.resultItem['company_intro'] = company_intro
     try:
         company_hidder = html.xpath(
             '//*[@id="site_content"]/div[1]/div/div[1]/div/div[2]/p'
         )[0].text
         # Compare the text *length* (the original compared the string
         # itself against 5), mirroring the company_intro check above.
         if company_hidder is None or len(str(company_hidder)) < 5:
             company_hidder = html.xpath(
                 '//*[@id="site_content"]/div[1]/div/div/div/div[2]/p'
             )[0].text
         self.resultItem['company_hidder'] = process_str(company_hidder)
         self.get_contract_name(self.resultItem['company_hidder'])
         self.resultItem['title'] = process_str(
             html.xpath('/html/head/title')[0].text)
     except Exception as e:
         # Bind the exception here; the bare ``except:`` left ``e``
         # undefined and made the handler itself raise.
         self.logger.error(
             str(self.company) + str(e) + ",company_hidder")
     try:
         self.resultItem['classifications'] = process_str("".join(
             html.xpath('//meta[@name="keywords"]/@content')))
     except Exception as e:
         self.logger.error(
             str(self.company) + str(e) + ",classifications")
     return status
Beispiel #5
0
 def get_goods_title(self, html):
     """Collect product titles from the offer list into ``self.resultItem``.

     Titles are read from the ``title`` attribute of the offer links,
     prefixed with their position (``"<index>:<title>"``) and joined with
     ``"@@"`` under the ``goods_title`` key. At most 21 titles (indices
     0 through 20) are kept. Any failure is printed and leaves
     ``goods_title`` unset.

     Args:
         html: parsed document exposing ``xpath()``.

     Returns:
         None.
     """
     max_titles = 21  # the original cut-off let indices 0..20 through
     try:
         titles = html.xpath(
             "//*[@class='offer-list-row']/li/div[3]/a/@title")
         entries = []  # renamed from ``list`` to stop shadowing the builtin
         for index, title in enumerate(titles):
             if index >= max_titles:
                 break
             entries.append(str(index) + ":" + process_str(title))
         self.resultItem['goods_title'] = "@@".join(entries)
     except Exception as e:
         # The label used to read ",contact_name" (copy-paste slip), which
         # pointed at the wrong field when reading the output.
         print(str(self.company) + str(e) + ",goods_title")
Beispiel #6
0
    def get_offer_info(self, html, i):
        """Extract the fields of the ``offer<i>`` search hit into a new dict.

        Every field is read independently and on a best-effort basis: when
        a lookup fails a message is printed and the key is simply absent
        from the result.

        Args:
            html: parsed document exposing ``xpath()``.
            i: integer index of the hit; selects the element whose id is
                ``offer<i>``.

        Returns:
            dict that may contain ``company``, ``url``, ``main_d``,
            ``address``, ``shop_name``, ``company_persons`` and ``model``,
            plus up to three extra entries whose keys are label texts read
            from the page itself.
        """
        resultItem = {}
        # All lookups hang off the same offer container. Building the prefix
        # once also guarantees every query targets the requested offer: the
        # original reused ``i`` as an inner loop variable, which silently
        # redirected all later lookups to a different offer.
        base = '//*[@id="offer%d"]/div[1]/div[2]' % i

        try:
            company_name = html.xpath(base + '/div[1]/a[1]')
            new_company = stringQ2B(
                company_name[0].get('title')).lower().strip().replace(
                    ')', '').replace('(', '')
            resultItem['company'] = new_company
            resultItem['url'] = company_name[0].get("href")
        except Exception:
            print(str(resultItem.get('company')) + str('公司名称不存在'))

        # Main line of business
        try:
            main_c = html.xpath(base + '/div[3]/div[1]/div[1]/a/*')
            main_d = [node.text for node in main_c]
            resultItem['main_d'] = process_str(','.join(main_d))
        except Exception:
            print(str(resultItem.get('company')) + str(':公司主营解析错误'))

        # Company address
        try:
            resultItem['address'] = process_str(
                html.xpath(base + '/div[3]/div[1]/div[2]/a')[0].text)
        except Exception:
            print(str(resultItem.get('company')) + str(':公司地址解析错误'))

        # Shop name
        try:
            resultItem['shop_name'] = process_str(
                html.xpath(base + '/div[2]/span')[0].text)
        except Exception:
            print(str(resultItem.get('company')) + str(':店铺名称解析错误'))

        # Number of employees
        try:
            resultItem['company_persons'] = process_str(
                html.xpath(base + '/div[3]/div[1]/div[3]/a')[0].get('title'))
        except Exception:
            print(str(resultItem.get('company')) + str('公司人数解析错误'))

        # Trade type
        try:
            resultItem['model'] = html.xpath(
                base + '/div[3]/div[2]/div[1]/b')[0].text
        except Exception:
            print(str(resultItem.get('company')) + str('贸易类型解析错误'))

        # Rows 2-4 of the detail column hold label/value pairs; the label
        # text found on the page becomes the dictionary key.
        for row in range(2, 5):
            try:
                value = html.xpath(
                    base + '/div[3]/div[2]/div[%d]/a' % row)[0].text
                label = html.xpath(
                    base + '/div[3]/div[2]/div[%d]/span' % row)[0].text
                resultItem[label] = value
            except Exception:
                # A missing row is expected; skip it without reporting.
                continue
        return resultItem