Esempio n. 1
0
    def get_info(self, data):
        """Parse the mortgage ('dcdy') table rows from *data*.

        Args:
            data: lxml element tree of the page containing ``table_dcdy``.

        Returns:
            dict mapping the row index to a dict of that row's fields,
            including nested person/goods detail info fetched per row.
        """
        info = {}
        tr_list = data.xpath(".//table[@id='table_dcdy']//tr[@name = 'dcdy']")
        for i, singledata in enumerate(tr_list):
            td_list = singledata.xpath("./td")
            # Skip header/separator rows that carry no <td> cells.
            if not td_list:
                continue

            temp = {}
            temp["code"] = deal_html_code.remove_symbol(
                td_list[1].xpath("string(.)"))

            dates = deal_html_code.remove_symbol(td_list[2].xpath("string(.)"))
            temp["dates"] = deal_html_code.change_date_style(dates)
            temp["dept"] = deal_html_code.remove_symbol(
                td_list[3].xpath("string(.)"))
            temp["amount"] = deal_html_code.remove_symbol(
                td_list[4].xpath("string(.)"))
            temp["status"] = deal_html_code.remove_symbol(
                td_list[5].xpath("string(.)"))

            # The detail-page key is embedded in the link's onclick handler.
            onclick = td_list[6].xpath("./a/@onclick")[0]
            # Renamed from `tuple`, which shadowed the builtin.
            key_parts = deal_html_code.match_key_content(str(onclick))
            xh = key_parts[0]
            detail_url = self._url.format(self._pripid, xh)
            person_info, goods_info = self.get_detail_info(detail_url, temp)
            temp["person_info"] = person_info
            temp["goods_info"] = goods_info
            info[i] = temp
        return info
Esempio n. 2
0
 def get_info(self, data):
     """Parse the judicial freeze ('sfxz') table rows from *data*.

     Args:
         data: lxml element tree of the page containing ``table_sfxz``.

     Returns:
         dict mapping the row index to a dict of that row's fields.
     """
     info = {}
     tr_list = data.xpath(".//table[@id='table_sfxz']//tr[@name = 'sfxz']")
     for i, singledata in enumerate(tr_list):
         td_list = singledata.xpath("./td")
         # Skip header/separator rows that carry no <td> cells.
         if not td_list:
             continue
         temp = {}
         # NOTE(review): key spelled "exceutor" (sic) -- kept byte-identical
         # because downstream consumers presumably read this key; confirm
         # before renaming.
         temp["exceutor"] = deal_html_code.remove_symbol(
             td_list[1].xpath("string(.)"))
         temp["stock_amount"] = deal_html_code.remove_symbol(
             td_list[2].xpath("string(.)"))
         temp["court"] = deal_html_code.remove_symbol(
             td_list[3].xpath("string(.)"))
         notice_no = deal_html_code.remove_symbol(
             td_list[4].xpath("string(.)"))
         temp["notice_no"] = notice_no
         # Both fields deliberately share the same source cell.
         temp["enforce_no"] = notice_no
         temp["status"] = deal_html_code.remove_symbol(
             td_list[5].xpath("string(.)"))
         # Detail-page key is embedded in the link's onclick handler.
         # Local renamed from misspelled `onclik`.
         onclick = td_list[6].xpath("./a/@onclick")[0]
         # Renamed from `tuple`, which shadowed the builtin.
         key_parts = deal_html_code.match_key_content(str(onclick))
         xh = key_parts[0]
         lx = key_parts[1]
         detail_url = self._url.format(self._pripid, lx, xh)
         # NOTE(review): `get_deatail_info` looks misspelled but is defined
         # elsewhere in the class, so the call is kept as-is. It is passed
         # `info` rather than `temp` -- confirm that is intended.
         self.get_deatail_info(detail_url, info)
         info[i] = temp
     return info
Esempio n. 3
0
	def get_info(self, data):
		"""Parse the administrative licence ('xzxk') table rows from *data*.

		Args:
			data: lxml element tree containing ``tr[@name='xzxk']`` rows.

		Returns:
			dict mapping the row index to a dict of that row's fields.
		"""
		tr_list = data.xpath(".//tr[@name = 'xzxk']")
		info = {}
		for i, singledata in enumerate(tr_list):
			td_list = singledata.xpath("./td")
			# Skip header/separator rows with no <td> cells (matches the
			# guard used by the sibling parsers).
			if not td_list:
				continue
			temp = {}
			temp["name"] = ''
			temp["code"] = deal_html_code.remove_symbol(td_list[1].xpath("string(.)"))
			temp["filename"] = deal_html_code.remove_symbol(td_list[2].xpath("string(.)"))
			start_date = deal_html_code.remove_symbol(td_list[3].xpath("string(.)"))
			temp["start_date"] = deal_html_code.change_chinese_date(start_date)
			end_date = deal_html_code.remove_symbol(td_list[4].xpath("string(.)"))
			temp["end_date"] = deal_html_code.change_chinese_date(end_date)
			temp["gov_dept"] = deal_html_code.remove_symbol(td_list[5].xpath("string(.)"))
			temp["content"] = deal_html_code.remove_symbol(td_list[6].xpath("string(.)"))
			temp["status"] = deal_html_code.remove_symbol(td_list[7].xpath("string(.)"))
			onclick = td_list[8].xpath("./a/@onclick")
			if not onclick:
				# Row has no detail link (log message: "this record has no
				# detail information").
				logging.info("该条信息无详情信息!")
			else:
				# Renamed from `tuple`, which shadowed the builtin.
				key_parts = deal_html_code.match_key_content(str(onclick[0]))
				pripid = key_parts[0]
				xh = key_parts[1]
				lx = key_parts[2]
				detail_url = self._url.format(pripid, xh, lx)
				self.get_detail_info(detail_url)
			info[i] = temp
		return info
Esempio n. 4
0
	def deal_single_page(self, info, data, start):
		"""Extract trademark ('sbxx') entries from one result page into *info*.

		Results are paginated, and every record is stored under a unique
		integer key, so ``enumerate(..., start)`` continues the numbering
		from previous pages instead of restarting at 0. (The page itself
		carries no usable sequence number, so the caller-supplied offset
		defines each record's key.)

		Args:
			info: dict accumulating {index: record}; mutated in place.
			data: lxml element tree of the current page.
			start: index offset for this page's first record.
		"""
		sbxx_detail = data.xpath("//li[@name = 'sbxx']//a/@onclick")
		for i, value in enumerate(sbxx_detail, start):
			# Renamed from `tuple`, which shadowed the builtin.
			key_parts = deal_html_code.match_key_content(str(value))
			reg_num = key_parts[0]
			picname = key_parts[1]
			detail = self._detail_url.format(reg_num, picname)
			temp = self.get_single_info(detail)
			# `host` is presumably a module-level constant defined elsewhere
			# in the file -- TODO confirm.
			temp["ia_img"] = host + picname
			info[i] = temp
Esempio n. 5
0
    def deal_single_info(self, td_list, i, info):
        """Extract one investor/shareholder row into ``info[i]``.

        Rows come in two shapes: a short form carrying only name/type
        cells, and a long form that additionally carries licence columns
        and an optional detail link.

        Args:
            td_list: the <td> cells of the row.
            i: key under which the parsed record is stored.
            info: dict accumulating records; mutated in place.
        """
        json_data = {
            "name": "",
            "types": "",
            "license_code": "",
            "license_type": "",
            "true_amount": "",
            "reg_amount": "",
            "ra_ways": "",
            "ra_date": "0000-00-00",
            "ta_ways": "",
            "ta_date": "0000-00-00"
        }
        # Short form ("investor" rows): only name and type fields.
        # BUGFIX: the original condition was `<= 2`, but this branch reads
        # td_list[1] and td_list[2], which need at least 3 cells -- the
        # branch could never succeed without an IndexError. `<= 3` matches
        # the cells it actually reads.
        if len(td_list) <= 3:
            json_data["name"] = deal_html_code.remove_symbol(
                td_list[1].xpath("string(.)"))
            json_data["types"] = deal_html_code.remove_symbol(
                td_list[2].xpath("string(.)"))
            json_data["license_code"] = ''
            json_data["license_type"] = ''
        else:
            json_data["name"] = deal_html_code.remove_symbol(
                td_list[1].xpath("string(.)"))
            json_data["types"] = deal_html_code.remove_symbol(
                td_list[2].xpath("string(.)"))
            json_data["license_type"] = deal_html_code.remove_symbol(
                td_list[3].xpath("string(.)"))
            # NOTE(review): license_code reads td_list[3], the same cell as
            # license_type -- looks like a copy-paste slip, but the correct
            # column cannot be confirmed from here, so behavior is kept.
            json_data["license_code"] = deal_html_code.remove_symbol(
                td_list[3].xpath("string(.)"))
            detail = td_list[4].xpath("./a")
            # An empty result means the "detail" cell has no real link
            # (log message: "this record has no detail").
            if not detail:
                logging.info("该条信息无详情!")
            else:
                showRyxx = td_list[4].xpath("./a/@onclick")[0]
                key = deal_html_code.match_key_content(str(showRyxx))
                xh = key[0]
                pripid = key[1]
                isck = key[2]
                detail_url = self.url.format(xh, pripid, isck)
                self.deal_detail_info(detail_url)
        info[i] = json_data
Esempio n. 6
0
 def get_info(self):
     """Fetch and parse the administrative penalty ('xzcf') table.

     Returns:
         dict mapping the row index to a dict of that row's fields;
         empty when the request does not return HTTP 200.
     """
     url = self._url.format(self._pripid)
     headers = config.headers
     result, status_code = Send_Request().send_requests(url, headers)
     info = {}
     if status_code == 200:
         data = etree.HTML(result,
                           parser=etree.HTMLParser(encoding='utf-8'))
         tr_list = data.xpath(
             "//table[@id = 'table_xzcf']//tr[@name = 'xzcf']")
         for i, singledata in enumerate(tr_list):
             temp = {}
             td_list = singledata.xpath("./td")
             # Skip header/separator rows with no <td> cells (matches the
             # guard used by the sibling parsers).
             if not td_list:
                 continue
             temp["number"] = deal_html_code.remove_symbol(
                 td_list[1].xpath("string(.)"))
             temp["types"] = deal_html_code.remove_symbol(
                 td_list[2].xpath("string(.)"))
             temp["content"] = deal_html_code.remove_symbol(
                 td_list[3].xpath("string(.)"))
             temp["gov_dept"] = deal_html_code.remove_symbol(
                 td_list[4].xpath("string(.)"))
             date = deal_html_code.remove_symbol(
                 td_list[5].xpath("string(.)"))
             temp["date"] = deal_html_code.change_chinese_date(date)
             pub_date = deal_html_code.remove_symbol(
                 td_list[6].xpath("string(.)"))
             temp["pub_date"] = deal_html_code.change_chinese_date(pub_date)
             # Optional PDF link in the row's 8th cell.
             # BUGFIX: the original tested `len(tr_list) > 7` and read the
             # fixed row `tr_list[7]` (not the current row's cell), selected
             # <a> elements instead of the @onclick attribute, extracted only
             # when the list was EMPTY (then indexed it -> dead branch), and
             # left `pdfurl` unbound -> NameError on every such row.
             pdfurl = ''
             if len(td_list) > 7:
                 onclick = td_list[7].xpath("./a/@onclick")
                 if len(onclick) != 0:
                     # Renamed from `tuple`, which shadowed the builtin.
                     key_parts = deal_html_code.match_key_content(
                         str(onclick[0]))
                     pripid = key_parts[0]
                     xh = key_parts[1]
                     # NOTE(review): `detail_url` is not defined in this
                     # method -- presumably a module-level template defined
                     # elsewhere in the file; confirm.
                     pdfurl = detail_url.format(pripid, xh)
             # NOTE(review): key spelled "pdfutl" (sic); kept byte-identical
             # for downstream readers.
             temp["pdfutl"] = pdfurl
             temp["name"] = ''
             # BUGFIX: the original assigned `info[i] = temp` OUTSIDE the
             # for loop, keeping only the last row; store every row.
             info[i] = temp
     return info