def curl(mod, act, params):
    """POST `params` as JSON to the internal API and pretty-print the reply."""
    http = HttpWrap()
    http.set_header('Content-Type', 'application/json')
    body = json.dumps(params)
    url = "http://192.168.10.126:6000/%s/%s" % (mod, act)
    res = http.request(url, 'POST', body)
    res = json.loads(http.read(res))
    pprint(res)
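
# A minimal usage sketch (not in the original source): curl() builds the URL
# from its first two arguments and POSTs the third as a JSON body. The module
# name, action name, and payload below are hypothetical.
def _example_curl_usage():
    curl('user', 'list', {'page': 1})  # POSTs to http://192.168.10.126:6000/user/list
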
def get_stat_data(name, info):
    """Collect the statistics described by the config file and push them to the API."""
    # Results are submitted by URL.
    http = HttpWrap()
    http.set_header('Content-type', 'application/json')
    url = "http://192.168.10.126:1985/api/set"
    for itm in info['history_from']:
        source = itm['source'].split('.')
        if source[1] == 'sphinx':
            host_info = get_host_by_data(itm['source'])
            if not host_info:
                return [-1, "key error: %s not in sysconfig." % itm['source']]
            sp = sphinx(host_info['host'], host_info['port'])
            expression = itm['expression']
            expression['index'] = source[2]
            total_found = 0
            while True:
                # Page through the index until every match has been fetched.
                if total_found > 0:
                    if expression['pageSize'] * expression['page'] >= total_found:
                        break
                    expression['page'] += 1
                sp.initQuery(itm['expression'])
                rs = sp.RunQueries()
                if rs and rs[0]['status'] == 0:
                    total_found = rs[0]['total_found']
                    _items = {}
                    for row in rs[0]['matches']:
                        key = "%s%s" % (itm['key_prefix'], row['attrs'][itm['key']])
                        _items[key] = [row['attrs'][itm['value']], utils.timestamp(0, 'd')]
                    if _items:
                        data = json.dumps({'gkey': name, 'data': _items})
                        _rs = http.request(url, "POST", data)
                        print(http.read(_rs))
                else:
                    print(sp._error)
                    break
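
# A sketch of the `info` structure get_stat_data() expects, inferred from the
# field accesses above; every concrete value below is hypothetical.
EXAMPLE_STAT_INFO = {
    'history_from': [{
        'source': 'db.sphinx.goods_index',            # "<x>.sphinx.<index>" selects the sphinx branch
        'expression': {'page': 1, 'pageSize': 1000},  # paging state advanced by the loop
        'key_prefix': 'goods_',                       # prepended to each match's key attribute
        'key': 'goods_id',                            # attribute used as the item key
        'value': 'sale_count',                        # attribute stored as the item value
    }],
}
# get_stat_data('daily_sales', EXAMPLE_STAT_INFO) would page through the index
# and POST each batch to the /api/set endpoint.
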
def get_info(corp, proxyinfo=""): socket.setdefaulttimeout(10) """采集函数 参数说明: corp 公司名称 proxyinfo 代理ip (格式为 ip:port) 为空时使用本机ip 返回值说明: status 状态码 base_info 采集到的工商信息 """ # 状态码 0 正常,1代理ip失效或者网站无法访问 2 ip被封 3公司不存在 status = 0 # 基本信息 base_info = [] # 股东信息 boss_info = [] # pdb.set_trace() # http模拟器 http = HttpWrap() # 设定代理ip格式 {"代理类型http|https":"ip:port"} if proxyinfo: http.set_proxy({"http": proxyinfo}) res = http.request(url_home, method="GET") # 访问主页面用于注册cookie信息,如果无法访问则直接返回失败 if res.code != 200: # print(res.code) if res.code > 200: ille_proxy_ip.add(proxyinfo) return [corp, base_info, 1, proxyinfo] """验证过程,循环验证直到成功""" # 成功标识 flag = 0 html = "" cu_time = int(time.time()) # 出错次数 err_type = 0 while flag == 0: if datamodel.g_exit: return [corp, base_info, 1, proxyinfo] try: if err_type > 10: return [corp, base_info, 1, proxyinfo] rand_time = time.strftime("%a %b %d %Y %H:%M:%S GMT 0800") url = url_code % time.time() res = http.request(url, method="GET") data = {} # print('step...1') if res.code == 200: # 保存验证码 try: im = res.read() except: im = "" time.sleep(1) continue code = http_upload_image(img_decode_url, im) try: code = json.loads(code) except Exception as e: # traceback.print_exc() continue print(code) # 手工输入验证码 # code = raw_input('input the code:').decode('gbk').encode('utf-8') if not code: err_type += 1 continue data = {"checkNo": request.quote(code)} # 重新设置头 http.reset_headers() http.set_header("Accetp", "application/json, text/javascript, */*; q=0.08") http.set_header("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8") http.set_header("Referer", url_home) http.set_header("X-Requested-With", "XMLHttpRequest") http.set_header("Referer", "http://www.ahcredit.gov.cn/searchList.jsp") res = http.request(url_check, "POST", data) # print('step...3') if res.code == 200: html = http.read(res) jdata = json.loads(html) # print(jdata) if jdata == "{success:true}": break else: err_type += 1 # print(res.code) time.sleep(5) # return [corp,base_info,1,proxyinfo] else: # print(res.code) if res.code == 403: time.sleep(20) err_type += 1 except Exception as e: traceback.print_exc() time.sleep(1) # pdb.set_trace() # 列表页 try: data = {"checkNo": code, "entName": corp} res = http.request(url_list, "POST", data) if res.code == -1: # print('get html :',res.code) return [corp, base_info, 1, proxyinfo] # pdb.set_trace() html = http.read(res) if "无查询结果" in html: # print('您搜索的条件无查询结果') return [corp, base_info, 3, proxyinfo] try: context = etree.HTML(html) except: print(html) return [corp, base_info, 1, proxyinfo] url_nodes = context.xpath('//div[@class="list"]//a') if not url_nodes: return [corp, base_info, 1, proxyinfo] for url_node in url_nodes: try: url = "%s%s" % (host, url_node.get("href")) _base_info = format_html(url) if _base_info: base_info.append(_base_info) except Exception as e: traceback.print_exc() if "reg_no" not in base_info: base_info.append(_base_info) except Exception as e: traceback.print_exc() return [corp, base_info, 1, proxyinfo] return [corp, base_info, status, proxyinfo]
def get_info(corp, proxyinfo=''):
    """Scraper.

    Args:
        corp: company name.
        proxyinfo: proxy IP as "ip:port"; the local IP is used when empty.

    Returns:
        [corp, base_info, status, proxyinfo]; status is the status code and
        base_info holds the collected business-registration records.
    """
    if len(corp) < 4:
        return [corp, [], 3, proxyinfo]
    socket.setdefaulttimeout(10)
    # Status codes: 0 OK, 1 proxy dead or site unreachable, 2 IP banned,
    # 3 company does not exist.
    status = 0
    base_info = []
    http = HttpWrap()
    # Proxy format: {"http|https": "ip:port"}
    if proxyinfo:
        http.set_proxy({'http': proxyinfo})
    # Hit the home page first to register the session cookie.
    res = http.request(url_home, method='GET')
    if res.code != 200:
        return [corp, base_info, 1, proxyinfo]

    # Captcha verification: loop until it succeeds.
    html = ""
    err_type = 0  # consecutive-error counter
    while True:
        if datamodel.g_exit:
            return [corp, base_info, 1, proxyinfo]
        try:
            url = url_code % time.time()
            res = http.request(url, method='GET')
            if res.code == 200:
                # Fetch the captcha image.
                try:
                    im = res.read()
                except Exception:
                    continue
                # Send the image to the decoding service.
                code = http_upload_image(img_decode_url, im)
                # Manual alternative:
                # code = raw_input('input the code:').decode('gbk').encode('utf-8')
                data = {'name': corp, 'verifyCode': code}
                # Reset the request headers for the check call.
                http.reset_headers()
                http.set_header('Accept', 'application/json, text/javascript, */*; q=0.01')
                http.set_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')
                http.set_header('Referer', url_home)
                http.set_header('X-Requested-With', 'XMLHttpRequest')
                res = http.request(url_check, "POST", data)
                if res.code == 200:
                    html = http.read(res)
                    jdata = json.loads(html)
                    if jdata[0]['TIPS'] and 'IP' in jdata[0]['TIPS']:
                        # The site flagged this IP as banned.
                        ille_proxy_ip.add(proxyinfo)
                        return [corp, base_info, 2, proxyinfo]
                    if "没有符合查询条件的结果" in jdata[0]['COUNT']:
                        return [corp, base_info, 3, proxyinfo]
                    if not jdata[0]['TIPS']:
                        # No error tip: INFO carries the result-list HTML.
                        html = jdata[0]['INFO']
                        break
                    err_type += 1
            else:
                return [corp, base_info, 1, proxyinfo]
            err_type += 1
            if err_type > 10:
                return [corp, base_info, 1, proxyinfo]
        except Exception:
            traceback.print_exc()
            time.sleep(1)

    # List page: pull the detail-page parameters out of the returned HTML.
    if not html:
        return [corp, base_info, 1, proxyinfo]
    try:
        context = etree.HTML(html)
        dt_nodes = context.xpath('//dt')
        dd_nodes = context.xpath('//dd')
        for i in range(0, len(dt_nodes)):
            if dt_nodes[i].text:
                # Deregistered companies carry their data inline in the <dd>.
                comname = dt_nodes[i].text
                text = etree.tostring(dd_nodes[i], encoding='utf-8').decode()
                base = get_iile_info(text)
                base['name'] = comname
                base_info.append(base)
            else:
                # Live companies link to a detail page via an onclick handler;
                # decompose it into the request parameters.
                link_info = dt_nodes[i].find('a').get('onclick').strip()[12:-2].replace("'", '').split(',')
                # Basic info query.
                data = {'id': link_info[2].strip(), 'org': link_info[1].strip(),
                        'seq_id': link_info[3].strip(), 'specificQuery': 'basicInfo'}
                base = format_html(data)
                if base:
                    base_info.append(base)
    except Exception:
        traceback.print_exc()
    if not base_info:
        return [corp, base_info, 1, proxyinfo]
    return [corp, base_info, status, proxyinfo]
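
# How the onclick slicing above decomposes, shown on a hypothetical handler
# string of the shape the [12:-2] slice implies (a 12-character function-name
# prefix and a trailing "')"); field order follows the dict built above.
_onclick = "openEntInfo('/detail.jsp','org1','42','7','320000000000001','/ecip')"
_parts = _onclick.strip()[12:-2].replace("'", '').split(',')
# _parts -> ['/detail.jsp', 'org1', '42', '7', '320000000000001', '/ecip']
#            path           org     id    seq_id  reg_no           context path
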
def get_info(corp, proxyinfo=''):
    """Scraper.

    Args:
        corp: company name.
        proxyinfo: proxy IP as "ip:port"; the local IP is used when empty.

    Returns:
        [corp, base_info, status, proxyinfo]; status is the status code and
        base_info holds the collected business-registration records.
    """
    if len(corp) < 4:
        return [corp, [], 3, proxyinfo]
    socket.setdefaulttimeout(10)
    # Status codes: 0 OK, 1 proxy dead or site unreachable, 2 IP banned,
    # 3 company does not exist.
    status = 0
    base_info = []
    http = HttpWrap()
    # Proxy format: {"http|https": "ip:port"}
    if proxyinfo:
        http.set_proxy({'http': proxyinfo})
    # Hit the home page first to register the session cookie.
    res = http.request(url_home, method='GET')
    if res.code != 200:
        if res.code > 200:
            ille_proxy_ip.add(proxyinfo)
        return [corp, base_info, 1, proxyinfo]

    # Captcha verification: loop until it succeeds.
    html = ""
    err_type = 0  # consecutive-error counter
    while True:
        if datamodel.g_exit:
            return [corp, base_info, 1, proxyinfo]
        try:
            if err_type > 10:
                return [corp, base_info, 1, proxyinfo]
            url = url_code % int(time.time())
            res = http.request(url, method='GET')
            if res.code == 200:
                # Fetch the captcha image.
                try:
                    im = res.read()
                except Exception:
                    time.sleep(1)
                    continue
                # Send the image to the decoding service.
                code = http_upload_image(img_decode_url, im)
                # Manual alternative:
                # code = raw_input('input the code:').decode('gbk').encode('utf-8')
                if not code:
                    err_type += 1
                    continue
                data = {'searchContent': corp, 'vcode': code}
                # Reset the request headers for the check call.
                http.reset_headers()
                http.set_header('Accept', 'application/json, text/javascript, */*; q=0.08')
                http.set_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')
                http.set_header('Referer', url_home)
                http.set_header('X-Requested-With', 'XMLHttpRequest')
                res = http.request(url_check, "POST", data)
                if res.code == 200:
                    html = http.read(res)
                    if '您查询的信息多于' not in html:
                        # Captcha rejected; fetch a fresh one.
                        continue
                    if '您查询的信息多于 0 条记录' in html:
                        return [corp, base_info, 3, proxyinfo]
                    break
                else:
                    err_type += 1
                    time.sleep(5)
            else:
                if res.code == 403:
                    # Rate limited; back off before retrying.
                    time.sleep(20)
                err_type += 1
        except Exception:
            err_type += 1
            time.sleep(1)
            if err_type > 10:
                return [corp, base_info, 1, proxyinfo]

    # Pull the entity ids out of the result list and fetch each record.
    try:
        context = etree.HTML(html)
        nodes = context.xpath('//div[@class="content"]//a')
        for node in nodes:
            entid = node.get('href').split('=')[1]
            url = ("http://tjcredit.gov.cn/platform/saic/baseInfo.json"
                   "?entId=%s&departmentId=scjgw&infoClassId=dj" % entid)
            _base_info = format_html(url)
            if _base_info:
                base_info.append(_base_info)
    except Exception:
        traceback.print_exc()
        return [corp, base_info, 1, proxyinfo]
    return [corp, base_info, status, proxyinfo]
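
# The Tianjin result links carry the entity id as the href query value; a
# hypothetical href showing what the split('=')[1] above extracts:
_href = '/platform/saic/view.json?entId=abc123'
_entid = _href.split('=')[1]  # -> 'abc123'
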
def get_info(corp, proxyinfo=''):
    """Scraper.

    Args:
        corp: company name.
        proxyinfo: proxy IP as "ip:port"; the local IP is used when empty.

    Returns:
        [corp, base_info, status, proxyinfo]; status is the status code and
        base_info holds the collected business-registration records.
    """
    if len(corp) < 4:
        return [corp, [], 3, proxyinfo]
    socket.setdefaulttimeout(10)
    # Status codes: 0 OK, 1 proxy dead or site unreachable, 2 IP banned,
    # 3 company does not exist.
    status = 0
    base_info = []
    http = HttpWrap()
    # Proxy format: {"http|https": "ip:port"}
    if proxyinfo:
        http.set_proxy({'http': proxyinfo})
    # Hit the home page first to register the session cookie.
    res = http.request(url_home, method='GET')
    if res.code != 200:
        if res.code > 200:
            ille_proxy_ip.add(proxyinfo)
        return [corp, base_info, 1, proxyinfo]

    # Captcha verification: loop until it succeeds.
    html = ""
    err_type = 0  # consecutive-error counter
    while True:
        if datamodel.g_exit:
            return [corp, base_info, 1, proxyinfo]
        try:
            if err_type > 10:
                return [corp, base_info, 1, proxyinfo]
            url = url_code % int(time.time())
            res = http.request(url, method='GET')
            if res.code == 200:
                # Fetch the captcha image.
                try:
                    im = res.read()
                except Exception:
                    time.sleep(1)
                    continue
                # Send the image to the decoding service.
                code = http_upload_image(img_decode_url, im)
                # Manual alternative:
                # code = raw_input('input the code:').decode('gbk').encode('utf-8')
                if not code:
                    err_type += 1
                    continue
                data = {'key': corp, 'code': code}
                # Reset the request headers for the check call.
                http.reset_headers()
                http.set_header('Accept', 'application/json, text/javascript, */*; q=0.08')
                http.set_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')
                http.set_header('Referer', url_home)
                http.set_header('X-Requested-With', 'XMLHttpRequest')
                res = http.request(url_check, "POST", data)
                if res.code == 200:
                    html = http.read(res)
                    if '验证码不正确' in html:
                        # Wrong captcha; fetch a fresh one.
                        continue
                    if '您搜索的条件无查询结果' in html:
                        return [corp, base_info, 3, proxyinfo]
                    break
                else:
                    err_type += 1
                    time.sleep(5)
            else:
                if res.code == 403:
                    # Rate limited; back off before retrying.
                    time.sleep(20)
                err_type += 1
        except Exception:
            err_type += 1
            time.sleep(1)
            if err_type > 10:
                return [corp, base_info, 1, proxyinfo]

    # Extract the data: each result link carries the ids needed to build the
    # detail-page request.
    try:
        context = etree.HTML(html)
        nodes = context.xpath('//div[@class="item"]/a')
        for node in nodes:
            entId, opid, entType = (node.get('data-entid'), node.get('data-id'),
                                    node.get('data-type'))
            name = node.text.strip()
            data = {'entId': entId, 'id': opid, 'type': entType, 'name': name}
            page_res = http.request(url_list, 'POST', data)
            page = http.read(page_res)
            page_txt = etree.HTML(page)
            data_type = re.findall(r"type='(\d+)'", page_txt.get('ng-init'))[0]
            url = url_info % (entId, http.urlencode(opid), data_type)
            _base_info = format_html(url)
            if _base_info:
                base_info.append(_base_info)
        if base_info:
            return [corp, base_info, status, proxyinfo]
    except Exception:
        return [corp, base_info, 1, proxyinfo]
    return [corp, base_info, status, proxyinfo]
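
# The detail-page type is scraped out of an Angular ng-init attribute with a
# regex; a hypothetical attribute value of the expected shape (the module
# already relies on re):
_ng_init = "entId='123';type='1';name='demo'"
_data_type = re.findall(r"type='(\d+)'", _ng_init)[0]  # -> '1'
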
def get_info(corp, proxyinfo=''):
    """Scraper.

    Args:
        corp: company name.
        proxyinfo: proxy IP as "ip:port"; the local IP is used when empty.

    Returns:
        [corp, base_info, status, proxyinfo]; status is the status code and
        base_info holds the collected business-registration record.
    """
    socket.setdefaulttimeout(10)
    # Status codes: 0 OK, 1 proxy dead or site unreachable, 2 IP banned,
    # 3 company does not exist.
    status = 0
    base_info = {}  # basic registration info
    boss_info = []  # shareholder info
    # Maps the site's JSON field names onto our own.
    title_base = {'C2': 'name', 'C1': 'reg_no', 'C3': 'type', 'C4': 'reg_date',
                  'C5': 'faren', 'C6': 'reg_capital', 'C7': 'addr', 'C8': 'biz_scope',
                  'C9': 'open_date', 'C10': 'close_date', 'C11': 'reg_authority',
                  'C12': 'audit_date', 'C13': 'reg_status'}
    http = HttpWrap()
    # Proxy format: {"http|https": "ip:port"}
    if proxyinfo:
        http.set_proxy({'http': proxyinfo})
    # Hit the home page first to register the session cookie.
    res = http.request(url_home, method='GET')
    if res.code != 200:
        if res.code > 200:
            ille_proxy_ip.add(proxyinfo)
        return [corp, base_info, 1, proxyinfo]

    # Captcha verification: loop until it succeeds.
    html = ""
    err_type = 0  # consecutive-error counter
    while True:
        try:
            res = http.request(url_code, method='GET')
            if res.code == 200:
                # Fetch the captcha image.
                try:
                    im = res.read()
                except Exception:
                    continue
                # Send the image to the decoding service.
                code = http_upload_image(img_decode_url, im)
                # Manual alternative:
                # code = raw_input('input the code:').decode('gbk').encode('utf-8')
                data = {'name': corp, 'verifyCode': code}
                # Reset the request headers for the check call.
                http.reset_headers()
                http.set_header('Accept', 'application/json, text/javascript, */*; q=0.01')
                http.set_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')
                http.set_header('Referer', url_home)
                http.set_header('X-Requested-With', 'XMLHttpRequest')
                res = http.request(url_check, "POST", data)
                if res.code == 200:
                    html = http.read(res)
                    jdata = json.loads(html)
                    if jdata[0]['TIPS'] and 'IP' in jdata[0]['TIPS']:
                        # The site flagged this IP as banned.
                        ille_proxy_ip.add(proxyinfo)
                        return [corp, base_info, 2, proxyinfo]
                    if "没有符合查询条件的结果" in jdata[0]['COUNT']:
                        return [corp, base_info, 3, proxyinfo]
                    if not jdata[0]['TIPS']:
                        # No error tip: INFO carries the result-list HTML.
                        html = jdata[0]['INFO']
                        break
                    err_type += 1
            else:
                err_type += 1
            if err_type > 10:
                return [corp, base_info, 1, proxyinfo]
        except Exception:
            traceback.print_exc()
            time.sleep(1)

    # List page: pull the detail-page request parameters out of the first link.
    if not html:
        return [corp, base_info, 1, proxyinfo]
    try:
        context = etree.HTML(html)
        nodes = context.xpath("//a")
        link_info = nodes[0].attrib['onclick'].strip()[12:-2].replace("'", '').split(',')
        http.headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'zh-CN'}
        http.headers['Accept'] = 'application/json, text/javascript, */*; q=0.01'
        http.headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8'
        http.headers['Referer'] = 'http://www.jsgsj.gov.cn:58888/ecipplatform/inner_pspc/pspc_queryCorpInfor_gsRelease.jsp'
        # Basic info.
        url = "http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true"
        data = {'id': link_info[2].strip(), 'org': link_info[1].strip(),
                'seq_id': link_info[3].strip(), 'specificQuery': 'basicInfo'}
        base_info['gov_url'] = json.dumps(data)
        res = http.request(url, 'POST', data)
        # Connection error: treat as failure.
        if res.code == -1:
            return [corp, base_info, 1, proxyinfo]
        info = res.read().decode()
        data = json.loads(info)[0]
        base_info['corp_id'] = link_info[2].strip()
        base_info['corp_org'] = link_info[1].strip()
        base_info['corp_seq_id'] = link_info[3].strip()
        for k, v in data.items():
            if k in title_base:
                base_info[title_base[k]] = v
        # Shareholder info.
        url = 'http://www.jsgsj.gov.cn:58888/ecipplatform/ciServlet.json?ciEnter=true'
        data = {'CORP_ID': link_info[2].strip(), 'CORP_ORG': link_info[1].strip(),
                'CORP_SEQ_ID': link_info[3].strip(), 'pageNo': 1, 'pageSize': 5,
                'showRecordLine': 1, 'specificQuery': 'investmentInfor'}
        res = http.request(url, 'POST', data)
        # Shareholder lookup failed: drop it and return what we already have.
        if res.code != 200:
            return [corp, base_info, status, proxyinfo]
        info = res.read().decode()
        try:
            data = json.loads(info)
            for row in data['items']:
                boss_info.append([row['C1'], row['C2']])
            if boss_info:
                base_info['shareholders'] = json.dumps(boss_info)
        except Exception:
            traceback.print_exc()
    except Exception:
        # No result link: the company is probably deregistered and the site
        # renders its remaining details inline, so fall back to regex extraction.
        try:
            base_info['name'] = corp
            base_info['reg_status'] = '已注销'
            pe = {'reg_no': r'注册号:<span>(.*?)<',
                  'faren': r'法定代表人:<span>(.*?)<|投资人:<span>(.*?)<|经营者:<span>(.*?)<',
                  'reg_authority': r'登记机关:<span>(.*?)<',
                  'cancell_date': r'注销日期:<span>(.*?)<|吊销日期:<span>(.*?)<'}
            for k, v in pe.items():
                rs = re.findall(v, html)
                if rs:
                    base_info[k] = rs[0]
                    # Alternated patterns yield tuples; keep the first non-empty group.
                    if type(rs[0]) in [list, tuple]:
                        if rs[0][0]:
                            base_info[k] = rs[0][0]
                        elif rs[0][1]:
                            base_info[k] = rs[0][1]
                        elif len(rs[0]) > 2:
                            base_info[k] = rs[0][2]
        except Exception:
            pass
    if 'reg_no' not in base_info:
        return [corp, base_info, 1, proxyinfo]
    return [corp, base_info, status, proxyinfo]
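
# A sketch of what the deregistered-company fallback above extracts, run on a
# hypothetical HTML fragment of the shape its patterns expect (note the
# full-width colons, matching the site's markup):
def _example_cancelled_fallback():
    frag = '注册号:<span>320000000000001</span> 注销日期:<span>2015-01-01</span>'
    print(re.findall(r'注册号:<span>(.*?)<', frag))                        # ['320000000000001']
    print(re.findall(r'注销日期:<span>(.*?)<|吊销日期:<span>(.*?)<', frag))  # [('2015-01-01', '')]
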