Beispiel #1
0
def ysdm(kw):
    """Submit a captcha image to the Yunsu (云速打码) recognition service.

    Service home page: http://ysdm.net/

    Args:
        kw: Mapping whose ``"body"`` entry holds the raw image bytes.

    Returns:
        ``False`` when opening ``kw["body"]`` as an image raises, the value
        of the ``"Result"`` field of the service's JSON reply on success, or
        ``""`` when the reply cannot be parsed or has no such field.
    """
    url = "http://api.ysdm.net/create.json"

    # The image is opened purely as a sanity check on the payload; the
    # decoded object itself is not needed afterwards.
    try:
        Image.open(BytesIO(kw["body"]))
    except Exception as e:
        print(e)
        return False

    # NOTE(review): account data and soft key are hard-coded here; consider
    # moving them to configuration.
    paramDict = {
        "username": "******",
        "password": "******",
        "typeid": 5000,
        "timeout": 90,
        "softid": 1,
        "softkey": "b40ffbee5c1cf4e38028c197eb2fc751",
    }

    res = http_upload_image(url, kw["body"], paramDict)

    # Only parsing/lookup failures mean "no answer"; anything else
    # (e.g. KeyboardInterrupt) must not be swallowed.
    try:
        return json.loads(res)["Result"]
    except (ValueError, TypeError, LookupError):
        return ""
Beispiel #2
0
def get_info(corp, proxyinfo=""):
    """Collect business-registration records for one company.

    The home page is requested first (per the original notes, to register
    cookie information). Captchas are then fetched and decoded until the
    check endpoint answers with the success marker, and finally every link
    on the result list is handed to ``format_html``.

    Args:
        corp: Company name to search for.
        proxyinfo: Proxy address as ``ip:port``; when empty no proxy is set.

    Returns:
        ``[corp, base_info, status, proxyinfo]``. ``base_info`` is the list
        of truthy values returned by ``format_html``. ``status`` is 0 on
        success, 1 when the proxy/site failed, too many errors occurred or
        the run was aborted, and 3 when the site reports no result.
    """
    socket.setdefaulttimeout(10)
    # Status codes: 0 ok, 1 proxy dead or site unreachable, 2 IP banned,
    # 3 company does not exist.
    status = 0
    base_info = []

    http = HttpWrap()
    # Proxy mapping format: {"http|https": "ip:port"}
    if proxyinfo:
        http.set_proxy({"http": proxyinfo})

    # Without a usable home page there is nothing to continue with.
    res = http.request(url_home, method="GET")
    if res.code != 200:
        if res.code > 200:
            ille_proxy_ip.add(proxyinfo)
        return [corp, base_info, 1, proxyinfo]

    # Captcha loop: left only by ``break`` (captcha accepted) or ``return``.
    err_type = 0  # number of errors seen so far
    while True:
        if datamodel.g_exit:
            return [corp, base_info, 1, proxyinfo]
        try:
            if err_type > 10:
                return [corp, base_info, 1, proxyinfo]
            url = url_code % time.time()
            res = http.request(url, method="GET")
            if res.code == 200:
                # Fetch the captcha image bytes.
                try:
                    im = res.read()
                except Exception:
                    time.sleep(1)
                    continue
                code = http_upload_image(img_decode_url, im)

                try:
                    code = json.loads(code)
                except Exception:
                    # Count the failure so that a broken decoder reply
                    # cannot keep this loop spinning forever.
                    err_type += 1
                    continue

                print(code)
                if not code:
                    err_type += 1
                    continue
                data = {"checkNo": request.quote(code)}
                # Rebuild the request headers for the check call.
                # NOTE(review): "Accetp" looks like a typo for "Accept";
                # left unchanged because it alters what is sent to the site.
                http.reset_headers()
                http.set_header("Accetp", "application/json, text/javascript, */*; q=0.08")
                http.set_header("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
                http.set_header("Referer", url_home)
                http.set_header("X-Requested-With", "XMLHttpRequest")
                # Referer is set a second time with the search-list page.
                http.set_header("Referer", "http://www.ahcredit.gov.cn/searchList.jsp")
                res = http.request(url_check, "POST", data)
                if res.code == 200:
                    html = http.read(res)
                    jdata = json.loads(html)
                    if jdata == "{success:true}":
                        break
                else:
                    err_type += 1
                    time.sleep(5)
            else:
                if res.code == 403:
                    time.sleep(20)
                err_type += 1
        except Exception:
            traceback.print_exc()
            # Unexpected failures count as errors too, so the retry bound
            # above also applies to them.
            err_type += 1
        time.sleep(1)

    # Result list page.
    try:
        data = {"checkNo": code, "entName": corp}

        res = http.request(url_list, "POST", data)
        if res.code == -1:
            return [corp, base_info, 1, proxyinfo]
        html = http.read(res)

        if "无查询结果" in html:
            return [corp, base_info, 3, proxyinfo]

        try:
            context = etree.HTML(html)
        except Exception:
            print(html)
            return [corp, base_info, 1, proxyinfo]
        url_nodes = context.xpath('//div[@class="list"]//a')
        if not url_nodes:
            return [corp, base_info, 1, proxyinfo]

        detail_failed = False
        for url_node in url_nodes:
            try:
                url = "%s%s" % (host, url_node.get("href"))
                _base_info = format_html(url)
                if _base_info:
                    base_info.append(_base_info)
            except Exception:
                # A failing detail page is skipped; nothing is appended,
                # because no record was produced for this link.
                traceback.print_exc()
                detail_failed = True
        # If detail pages failed and nothing at all was collected, report a
        # failure instead of an empty success.
        if detail_failed and not base_info:
            return [corp, base_info, 1, proxyinfo]
    except Exception:
        traceback.print_exc()
        return [corp, base_info, 1, proxyinfo]
    return [corp, base_info, status, proxyinfo]
Beispiel #3
0
def get_info(corp,proxyinfo=''):
	"""Collect business-registration records for one company.

	After the home page has been requested, captchas are fetched and decoded
	until the check endpoint returns a result without a ``TIPS`` message.
	The returned ``INFO`` markup is then walked ``<dt>`` by ``<dt>``.

	Args:
		corp: Company name to search for.
		proxyinfo: Proxy address as ``ip:port``; when empty no proxy is set.

	Returns:
		``[corp, base_info, status, proxyinfo]``. ``base_info`` is a list of
		records. ``status`` is 0 on success, 1 when the proxy/site failed,
		too many errors occurred or the run was aborted, 2 when the reply's
		``TIPS`` text mentions ``IP``, and 3 when the name is shorter than
		four characters or the site reports no matching result.
	"""
	if len(corp) < 4:
		return [corp,[],3,proxyinfo]
	socket.setdefaulttimeout(10)
	# Status codes: 0 ok, 1 proxy dead or site unreachable, 2 IP banned,
	# 3 company does not exist.
	status = 0
	base_info = []

	http = HttpWrap()
	# Proxy mapping format: {"http|https": "ip:port"}
	if proxyinfo:
		http.set_proxy({'http':proxyinfo})
	# Without a usable home page there is nothing to continue with.
	res = http.request(url_home,method='GET')
	if res.code != 200:
		return [corp,base_info,1,proxyinfo]

	# Captcha loop: left only by ``break`` (result received) or ``return``.
	html = ""
	err_type = 0  # number of errors seen so far
	while True:
		if datamodel.g_exit:
			return [corp,base_info,1,proxyinfo]
		try:
			url = url_code % time.time()
			res = http.request(url,method='GET')
			if res.code != 200:
				return [corp,base_info,1,proxyinfo]
			# Fetch the captcha image bytes.
			try:
				im = res.read()
			except Exception:
				continue
			code = http_upload_image(img_decode_url,im)

			data = {'name':corp,'verifyCode':code}
			# Rebuild the request headers for the check call.
			# NOTE(review): 'Accetp' looks like a typo for 'Accept'; left
			# unchanged because it alters what is sent to the site.
			http.reset_headers()
			http.set_header('Accetp','application/json, text/javascript, */*; q=0.01')
			http.set_header('Content-Type','application/x-www-form-urlencoded; charset=UTF-8')
			http.set_header('Referer',url_home)
			http.set_header('X-Requested-With','XMLHttpRequest')
			res = http.request(url_check,"POST",data)
			if res.code == 200:
				html = http.read(res)
				jdata = json.loads(html)
				if jdata[0]['TIPS'] and 'IP' in jdata[0]['TIPS']:
					ille_proxy_ip.add(proxyinfo)
					return [corp,base_info,2,proxyinfo]
				if "没有符合查询条件的结果" in jdata[0]['COUNT']:
					return [corp,base_info,3,proxyinfo]
				if not jdata[0]['TIPS']:
					html = jdata[0]['INFO']
					break
			else:
				err_type += 1
		except Exception:
			traceback.print_exc()
			# Unexpected failures count as errors too; otherwise a reply
			# that always fails to parse would keep this loop alive forever.
			err_type += 1
		# Checked outside the try so that the exception path reaches it.
		if err_type > 10:
			return [corp,base_info,1,proxyinfo]
		time.sleep(1)

	if not html:
		return [corp,base_info,1,proxyinfo]
	try:
		context = etree.HTML(html)
		dt_nodes = context.xpath('//dt')
		dd_nodes = context.xpath('//dd')
		for i, dt_node in enumerate(dt_nodes):
			if dt_node.text:
				# <dt> carries the name as text; the <dd> at the same
				# position is serialized and parsed by get_iile_info.
				text = etree.tostring(dd_nodes[i],encoding='utf-8').decode()
				base = get_iile_info(text)
				base['name'] = dt_node.text
				base_info.append(base)
			else:
				# <dt> without text: the arguments are taken from the
				# onclick attribute of its <a> child.
				link_info = dt_node.find('a').get('onclick').strip()[12:-2].replace("'",'').split(',')
				# Query for the basic-information record.
				data = {'id':link_info[2].strip(),'org':link_info[1].strip(),'seq_id':link_info[3].strip(),'specificQuery':'basicInfo'}
				base = format_html(data)
				if base:
					base_info.append(base)
	except Exception:
		traceback.print_exc()
		# Records collected before the failure are still returned below.
		if not base_info:
			return [corp,base_info,1,proxyinfo]
	return [corp,base_info,status,proxyinfo]
Beispiel #4
0
def get_info(corp,proxyinfo=''):
	"""Collect business-registration records for one company.

	After the home page has been requested, captchas are fetched and decoded
	until the search endpoint returns a result page. Every link inside
	``div.content`` is then turned into a ``baseInfo.json`` URL and handed to
	``format_html``.

	Args:
		corp: Company name to search for.
		proxyinfo: Proxy address as ``ip:port``; when empty no proxy is set.

	Returns:
		``[corp, base_info, status, proxyinfo]``. ``base_info`` is the list
		of truthy values returned by ``format_html``. ``status`` is 0 on
		success, 1 when the proxy/site failed, too many errors occurred or
		the run was aborted, and 3 when the name is shorter than four
		characters or the site reports zero records.
	"""
	if len(corp) < 4:
		return [corp,[],3,proxyinfo]
	socket.setdefaulttimeout(10)
	# Status codes: 0 ok, 1 proxy dead or site unreachable, 2 IP banned,
	# 3 company does not exist.
	status = 0
	base_info = []

	http = HttpWrap()
	# Proxy mapping format: {"http|https": "ip:port"}
	if proxyinfo:
		http.set_proxy({'http':proxyinfo})
	# Without a usable home page there is nothing to continue with.
	res = http.request(url_home,method='GET')
	if res.code != 200:
		if res.code > 200:
			ille_proxy_ip.add(proxyinfo)
		return [corp,base_info,1,proxyinfo]

	# Captcha loop: left only by ``break`` (result page) or ``return``.
	html = ""
	err_type = 0  # number of errors seen so far
	while True:
		if datamodel.g_exit:
			return [corp,base_info,1,proxyinfo]
		try:
			if err_type > 10:
				return [corp,base_info,1,proxyinfo]

			url = url_code % int(time.time())
			res = http.request(url,method='GET')
			if res.code == 200:
				# Fetch the captcha image bytes.
				try:
					im = res.read()
				except Exception:
					time.sleep(1)
					continue

				code = http_upload_image(img_decode_url,im)
				if not code:
					err_type += 1
					continue
				data = {'searchContent':corp,'vcode':code}
				# Rebuild the request headers for the search call.
				# NOTE(review): 'Accetp' looks like a typo for 'Accept'; left
				# unchanged because it alters what is sent to the site.
				http.reset_headers()
				http.set_header('Accetp','application/json, text/javascript, */*; q=0.08')
				http.set_header('Content-Type','application/x-www-form-urlencoded; charset=UTF-8')
				http.set_header('Referer',url_home)
				http.set_header('X-Requested-With','XMLHttpRequest')
				res = http.request(url_check,"POST",data)
				if res.code == 200:
					html = http.read(res)
					# No record-count message on the page: try again with a
					# fresh captcha.
					if '您查询的信息多于' not in html:
						continue
					if '您查询的信息多于 0 条记录' in html:
						return [corp,base_info,3,proxyinfo]
					break
				err_type += 1
				time.sleep(5)
			else:
				if res.code == 403:
					time.sleep(20)
				err_type += 1

		except Exception:
			err_type += 1
		time.sleep(1)
		if err_type > 10:
			return [corp,base_info,1,proxyinfo]

	try:
		context = etree.HTML(html)
		nodes = context.xpath('//div[@class="content"]//a')
		for node in nodes:
			# The entity id is whatever follows the first '=' in the href.
			entid = node.get('href').split('=')[1]
			url = "http://tjcredit.gov.cn/platform/saic/baseInfo.json?entId=%s&departmentId=scjgw&infoClassId=dj" % entid
			_base_info = format_html(url)
			if _base_info:
				base_info.append(_base_info)
	except Exception:
		traceback.print_exc()
		# Print the last URL that was being worked on to aid debugging.
		print(url)
		return [corp,base_info,1,proxyinfo]

	return [corp,base_info,status,proxyinfo]
Beispiel #5
0
def get_info(corp,proxyinfo=''):
	"""Collect business-registration records for one company.

	After the home page has been requested, captchas are fetched and decoded
	until the search endpoint accepts one. For every ``div.item > a`` hit the
	list endpoint is queried, a numeric ``type`` is read from the reply's
	``ng-init`` attribute, and the resulting detail URL is handed to
	``format_html``.

	Args:
		corp: Company name to search for.
		proxyinfo: Proxy address as ``ip:port``; when empty no proxy is set.

	Returns:
		``[corp, base_info, status, proxyinfo]``. ``base_info`` is the list
		of truthy values returned by ``format_html``. ``status`` is 0 on
		success, 1 when the proxy/site failed, too many errors occurred, the
		run was aborted or extraction raised, and 3 when the name is shorter
		than four characters or the site reports no result.
	"""
	if len(corp) < 4:
		return [corp,[],3,proxyinfo]
	socket.setdefaulttimeout(10)
	# Status codes: 0 ok, 1 proxy dead or site unreachable, 2 IP banned,
	# 3 company does not exist.
	status = 0
	base_info = []

	http = HttpWrap()
	# Proxy mapping format: {"http|https": "ip:port"}
	if proxyinfo:
		http.set_proxy({'http':proxyinfo})
	# Without a usable home page there is nothing to continue with.
	res = http.request(url_home,method='GET')
	if res.code != 200:
		if res.code > 200:
			ille_proxy_ip.add(proxyinfo)
		return [corp,base_info,1,proxyinfo]

	# Captcha loop: left only by ``break`` (result page) or ``return``.
	html = ""
	err_type = 0  # number of errors seen so far
	while True:
		if datamodel.g_exit:
			return [corp,base_info,1,proxyinfo]
		try:
			if err_type > 10:
				return [corp,base_info,1,proxyinfo]

			url = url_code % int(time.time())
			res = http.request(url,method='GET')
			if res.code == 200:
				# Fetch the captcha image bytes.
				try:
					im = res.read()
				except Exception:
					time.sleep(1)
					continue

				code = http_upload_image(img_decode_url,im)
				if not code:
					err_type += 1
					continue
				data = {'key':corp,'code':code}
				# Rebuild the request headers for the search call.
				# NOTE(review): 'Accetp' looks like a typo for 'Accept'; left
				# unchanged because it alters what is sent to the site.
				http.reset_headers()
				http.set_header('Accetp','application/json, text/javascript, */*; q=0.08')
				http.set_header('Content-Type','application/x-www-form-urlencoded; charset=UTF-8')
				http.set_header('Referer',url_home)
				http.set_header('X-Requested-With','XMLHttpRequest')
				res = http.request(url_check,"POST",data)
				if res.code == 200:
					html = http.read(res)
					# Captcha rejected: try again with a fresh one.
					if '验证码不正确' in html:
						continue
					if '您搜索的条件无查询结果' in html:
						return [corp,base_info,3,proxyinfo]
					break
				err_type += 1
				time.sleep(5)
			else:
				if res.code == 403:
					time.sleep(20)
				err_type += 1

		except Exception:
			err_type += 1
		time.sleep(1)
		if err_type > 10:
			return [corp,base_info,1,proxyinfo]

	# Extract the records.
	try:
		context = etree.HTML(html)
		nodes = context.xpath('//div[@class="item"]/a')
		for node in nodes:
			entId = node.get('data-entid')
			opid = node.get('data-id')
			entType = node.get('data-type')
			name = node.text.strip()

			data = {'entId':entId,'id':opid,'type':entType,'name':name}
			page_res = http.request(url_list,'POST',data)
			page = http.read(page_res)
			page_txt = etree.HTML(page)
			# Raw string: same pattern as before, without the invalid "\d"
			# escape in a normal string literal.
			data_type = re.findall(r"type='(\d+)'",page_txt.get('ng-init'))[0]
			url = url_info % (entId,http.urlencode(opid),data_type)
			_base_info = format_html(url)
			if _base_info:
				base_info.append(_base_info)
	except Exception:
		return [corp,base_info,1,proxyinfo]
	return [corp,base_info,status,proxyinfo]
Beispiel #6
0
def get_info(corp,proxyinfo=''):
	"""Collect the business-registration record of one company.

	After the home page has been requested, captchas are fetched and decoded
	until the check endpoint returns a result without a ``TIPS`` message.
	The first link of the returned ``INFO`` markup is used to query the
	basic-information and shareholder endpoints. If that fails at any point,
	a fallback fills the record from the markup with regular expressions and
	sets ``reg_status`` to ``'已注销'``.

	Args:
		corp: Company name to search for.
		proxyinfo: Proxy address as ``ip:port``; when empty no proxy is set.

	Returns:
		``[corp, base_info, status, proxyinfo]``. ``base_info`` is a dict of
		collected fields. ``status`` is 0 on success, 1 when the proxy/site
		failed, too many errors occurred or no ``reg_no`` could be found,
		2 when the reply's ``TIPS`` text mentions ``IP``, and 3 when the
		site reports no matching result.
	"""
	socket.setdefaulttimeout(10)
	# Status codes: 0 ok, 1 proxy dead or site unreachable, 2 IP banned,
	# 3 company does not exist.
	status = 0
	base_info = {}
	# Shareholder rows collected from the second query.
	boss_info = []
	# Maps the field codes of the JSON reply to output keys.
	title_base = {
		'C2':'name','C1':'reg_no','C3':'type','C4':'reg_date','C5':'faren',
		'C6':'reg_capital','C7':'addr','C8':'biz_scope','C9':'open_date',
		'C10':'close_date','C11':'reg_authority','C12':'audit_date',
		'C13':'reg_status',
	}

	http = HttpWrap()
	# Proxy mapping format: {"http|https": "ip:port"}
	if proxyinfo:
		http.set_proxy({'http':proxyinfo})
	# Without a usable home page there is nothing to continue with.
	res = http.request(url_home,method='GET')
	if res.code != 200:
		if res.code > 200:
			ille_proxy_ip.add(proxyinfo)
		return [corp,base_info,1,proxyinfo]

	# Captcha loop: left only by ``break`` (result received) or ``return``.
	html = ""
	err_type = 0  # number of errors seen so far
	while True:
		try:
			res = http.request(url_code,method='GET')
			if res.code == 200:
				# Fetch the captcha image bytes.
				try:
					im = res.read()
				except Exception:
					continue
				code = http_upload_image(img_decode_url,im)

				data = {'name':corp,'verifyCode':code}
				# Rebuild the request headers for the check call.
				# NOTE(review): 'Accetp' looks like a typo for 'Accept'; left
				# unchanged because it alters what is sent to the site.
				http.reset_headers()
				http.set_header('Accetp','application/json, text/javascript, */*; q=0.01')
				http.set_header('Content-Type','application/x-www-form-urlencoded; charset=UTF-8')
				http.set_header('Referer',url_home)
				http.set_header('X-Requested-With','XMLHttpRequest')
				res = http.request(url_check,"POST",data)
				if res.code == 200:
					html = http.read(res)
					jdata = json.loads(html)
					if jdata[0]['TIPS'] and 'IP' in jdata[0]['TIPS']:
						ille_proxy_ip.add(proxyinfo)
						return [corp,base_info,2,proxyinfo]
					if "没有符合查询条件的结果" in jdata[0]['COUNT']:
						return [corp,base_info,3,proxyinfo]
					if not jdata[0]['TIPS']:
						html = jdata[0]['INFO']
						break
				else:
					err_type += 1
			else:
				err_type += 1
		except Exception:
			traceback.print_exc()
			# Unexpected failures count as errors too; otherwise a reply
			# that always fails to parse would keep this loop alive forever.
			err_type += 1
		# Checked outside the try so that the exception path reaches it.
		if err_type > 10:
			return [corp,base_info,1,proxyinfo]
		time.sleep(1)

	if not html:
		return [corp,base_info,1,proxyinfo]
	try:
		context = etree.HTML(html)
		nodes = context.xpath("//a")
		# The query arguments are taken from the onclick attribute of the
		# first link; a missing link or attribute raises and leads to the
		# fallback below.
		link_info = nodes[0].attrib['onclick'].strip()[12:-2].replace("'",'').split(',')
		corp_id = link_info[2].strip()
		corp_org = link_info[1].strip()
		corp_seq_id = link_info[3].strip()

		# Replace the session headers wholesale for the JSON endpoints.
		http.headers={'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)','Accept-Encoding': 'gzip, deflate','Accept-Language': 'zh-CN'}
		http.headers['Accept']='application/json, text/javascript, */*; q=0.01'
		http.headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8'
		http.headers['Referer'] = 'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_pspc/pspc_queryCorpInfor_gsRelease.jsp'

		# Basic information.
		url = "http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true"
		data = {'id':corp_id,'org':corp_org,'seq_id':corp_seq_id,'specificQuery':'basicInfo'}
		base_info['gov_url'] = json.dumps(data)
		res = http.request(url,'POST',data)
		# Connection error: report failure.
		if res.code == -1:
			print(res.code)
			return [corp,base_info,1,proxyinfo]
		info = res.read().decode()
		data = json.loads(info)[0]
		base_info['corp_id'] = corp_id
		base_info['corp_org'] = corp_org
		base_info['corp_seq_id'] = corp_seq_id
		for k,v in data.items():
			if k in title_base:
				base_info[title_base[k]] = v

		# Shareholder information (same endpoint, different query).
		data = {'CORP_ID':corp_id,'CORP_ORG':corp_org,'CORP_SEQ_ID':corp_seq_id,'pageNo':1,'pageSize':5,'showRecordLine':1,'specificQuery':'investmentInfor'}
		res = http.request(url,'POST',data)
		# Shareholders are optional: on failure return what we have.
		if res.code != 200:
			return [corp,base_info,status,proxyinfo]
		info = res.read().decode()
		try:
			data = json.loads(info)
			for row in data['items']:
				boss_info.append([row['C1'],row['C2']])
			if boss_info:
				base_info['shareholders'] = json.dumps(boss_info)
		except Exception:
			traceback.print_exc()
	except Exception:
		# Fallback: fill the record from the markup itself and mark it with
		# reg_status '已注销'.
		try:
			base_info['name'] = corp
			base_info['reg_status'] = '已注销'
			# Raw strings: same patterns as before, without invalid "\<" /
			# "\>" escapes in normal string literals.
			pe = {'reg_no':r'注册号:\<span\>(.*?)\<',
				'faren':r'法定代表人:\<span\>(.*?)\<|投资人:\<span\>(.*?)\<|经营者:\<span\>(.*?)\<',
				'reg_authority':r'登记机关:\<span\>(.*?)\<',
				'cancell_date':r'注销日期:\<span\>(.*?)\<|吊销日期:\<span\>(.*?)\<'}
			for k,v in pe.items():
				rs = re.findall(v,html)
				if rs:
					base_info[k] = rs[0]
					# Patterns with alternatives yield one tuple per match;
					# take the group of the alternative that matched.
					if isinstance(rs[0],(list,tuple)):
						if rs[0][0]:
							base_info[k] = rs[0][0]
						elif rs[0][1]:
							base_info[k] = rs[0][1]
						elif len(rs[0]) > 2:
							base_info[k] = rs[0][2]
		except Exception:
			# Best effort: whatever was extracted so far is kept.
			pass
		if 'reg_no' not in base_info:
			return [corp,base_info,1,proxyinfo]
	return [corp,base_info,status,proxyinfo]