def name(url):
    """Gather the paginated records shown on the detail page at ``url``.

    The page is requested with ``config.headers_detail``.  On HTTP 200 the
    first ``<dl>`` under ``div.viewBox`` is serialised and cut into fragments
    on a fixed ``<dt>`` marker; the fragments are handed to
    ``deal_single_info`` together with ``info`` and a starting offset.  The
    page count is read from the "共N页" text, and pages 2..N are fetched
    through ``share_url`` and processed the same way.

    Args:
        url: Address of the first page.  ``deal_html_code.match_entid`` and
            ``deal_html_code.match_cid`` are applied to it to build the URLs
            of the following pages.

    Returns:
        tuple: ``(info, flag)``.  ``info`` is the dict handed to
        ``deal_single_info`` after being passed through
        ``deal_html_code.remove_repeat``.  ``flag`` is ``1`` when the first
        request returned HTTP 200 and ``100000004`` otherwise.
    """
    marker = '<dt style="color:#333;margin-bottom:10px;"/>'
    request_headers = config.headers_detail

    def split_entries(html):
        # Serialise the first <dl> under div.viewBox and cut it on the <dt>
        # marker; the piece in front of the first marker is discarded.
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
        block = tree.xpath("//div[@class='viewBox']//dl")[0]
        pieces = etree.tostring(block).split(marker)
        del pieces[0]
        return pieces

    content, status_code = Send_Request().send_request(url, request_headers)
    info = {}
    if status_code != 200:
        flag = 100000004
    else:
        flag = 1
        entries = split_entries(content)
        if not entries:
            # Log text reads "no shareholder and capital contribution info".
            logging.info("无股东及出资信息")
        else:
            page_counts = re.compile(".*共(.*?)页.*").findall(content)
            # Only an unambiguous single match is trusted as the page count.
            totalpage = int(page_counts[0]) if len(page_counts) == 1 else 0
            deal_single_info(entries, info, 0)
            if totalpage != 1:
                entid = deal_html_code.match_entid(url)
                cid = deal_html_code.match_cid(url)
                for page in xrange(2, totalpage + 1):
                    page_url = share_url.format(entid, cid, page)
                    content, status_code = Send_Request().send_request(
                        page_url, request_headers)
                    if status_code != 200:
                        # A page that cannot be fetched is skipped.
                        continue
                    entries = split_entries(content)
                    if entries:
                        deal_single_info(entries, info, page * 5 + 1)
    info = deal_html_code.remove_repeat(info)
    return info, flag
def name(url):
    """Collect the paginated records listed on the detail page at ``url``.

    The first page is requested with ``config.headers_detail``.  When the
    response contains the marker text "企业名称", the first ``<dl>`` under
    ``div.viewBox`` is serialised and cut into fragments on a fixed ``<dd>``
    separator; the fragments are passed to ``deal_single_info`` together with
    ``info`` and a starting offset.  The page count is read from the "共N页"
    text, and pages 2..N are fetched through ``out_invest_url``.

    Args:
        url: Address of the first page.  ``deal_html_code.match_entid`` and
            ``deal_html_code.match_cid`` are applied to it to build the URLs
            of the following pages.

    Returns:
        tuple: ``(info, flag)``.  ``flag`` is ``1`` when the first request
        returned HTTP 200 and the marker text was present, otherwise
        ``100000004``.  On success ``info`` is the dict filled through
        ``deal_single_info`` and passed through
        ``deal_html_code.remove_repeat``; on failure it is an empty dict.

    Raises:
        IndexError: If a fetched page has no ``<dl>`` under ``div.viewBox``.
    """
    separator = ('<dd style="border-bottom:1px solid #AE0000;'
                 'padding-bottom:10px;">')
    headers = config.headers_detail
    # Bound before any branching so every exit path can return it.
    info = {}

    def split_entries(html, dl_xpath):
        # Serialise the first matching <dl> and cut it on the <dd> separator.
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
        block = tree.xpath(dl_xpath)[0]
        pieces = etree.tostring(block).split(separator)
        # Drop the trailing piece by position; list.remove() would delete the
        # first *equal* element, which need not be the last one.
        del pieces[-1]
        return pieces

    content, status_code = Send_Request().send_request(url, headers)
    if status_code != 200:
        return info, 100000004

    entries = split_entries(content, "//div[@class='viewBox']//dl")
    if "企业名称" not in content:
        return info, 100000004

    # Plain literal, the same string type as the marker test above.
    page_counts = re.findall(re.compile(".*共(.*?)页.*"), content)
    # Only an unambiguous single match is trusted as the page count.
    totalpage = int(page_counts[0]) if len(page_counts) == 1 else 0

    deal_single_info(entries, info, 0)
    if totalpage > 1:
        entid = deal_html_code.match_entid(url)
        cid = deal_html_code.match_cid(url)
        for k in xrange(2, totalpage + 1):
            # Build the URL per page; str.format ignores the extra argument
            # if the template has no slot for the page number.
            href = out_invest_url.format(entid, cid, k)
            content, status_code = Send_Request().send_request(href, headers)
            if status_code != 200:
                # A page that cannot be fetched is skipped.
                continue
            entries = split_entries(content, "//div[@class='viewBox']/dl")
            deal_single_info(entries, info, k * 5 + 1)

    info = deal_html_code.remove_repeat(info)
    return info, 1
def name(url):
    """Collect the paginated records listed on the detail page at ``url``.

    The first page is requested with ``config.headers_detail``.  When the
    response contains the marker text "企业名称", the first ``<dl>`` directly
    under ``div.viewBox`` is passed to ``deal_single_info`` together with
    ``info`` and a starting offset.  The page count is read from the "共N页"
    text, and pages 2..N are fetched through ``out_invest_url`` and processed
    the same way.

    Args:
        url: Address of the first page.  ``deal_html_code.match_entid`` and
            ``deal_html_code.match_cid`` are applied to it to build the URLs
            of the following pages.

    Returns:
        tuple: ``(info, flag)``.  ``flag`` is ``1`` when the first request
        returned HTTP 200 and the marker text was present, otherwise
        ``100000004``.  On success ``info`` is the dict filled through
        ``deal_single_info`` and passed through
        ``deal_html_code.remove_repeat``; on failure it is an empty dict.

    Raises:
        IndexError: If a fetched page has no ``<dl>`` directly under
            ``div.viewBox``.
    """
    dl_xpath = "//div[@class='viewBox']/dl"
    headers = config.headers_detail
    # Bound before any branching: previously a non-200 first response reached
    # ``return info, flag`` with ``info`` never assigned.
    info = {}

    def first_dl(html):
        # Parse the page and return the first matching <dl> element.
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
        return tree.xpath(dl_xpath)[0]

    content, status_code = Send_Request().send_request(url, headers)
    if status_code != 200:
        return info, 100000004

    dl = first_dl(content)
    if "企业名称" not in content:
        return info, 100000004

    page_counts = re.findall(re.compile(".*共(.*?)页.*"), content)
    # Only an unambiguous single match is trusted as the page count.
    totalpage = int(page_counts[0]) if len(page_counts) == 1 else 0

    # The first page is processed with offset 0 whatever the page count is.
    deal_single_info(dl, info, 0)
    if totalpage > 1:
        entid = deal_html_code.match_entid(url)
        cid = deal_html_code.match_cid(url)
        for k in xrange(2, totalpage + 1):
            href = out_invest_url.format(entid, cid, k)
            content, status_code = Send_Request().send_request(href, headers)
            if status_code != 200:
                # A page that cannot be fetched is skipped.
                continue
            deal_single_info(first_dl(content), info, (k - 1) * 5 + 1)

    info = deal_html_code.remove_repeat(info)
    return info, 1