def parse_html(self, links):
    """Download every announcement in ``links`` and pass it to ``parse_text``.

    Each link is appended to the scheme and host extracted from
    ``self.url`` and fetched through ``ktgg.request_dis``, which yields a
    ``(text, html)`` pair.  The post time and title are cut out of ``html``
    with regular expressions, every substring in ``self.tihuan`` is removed
    from ``text``, and the finished record is handed to ``self.parse_text``
    together with the open database handles.

    The database handles are released in a ``finally`` clause so they are
    not leaked when a page is malformed or ``parse_text`` fails.

    Args:
        links: Iterable of URL paths relative to the host of ``self.url``.

    Raises:
        IndexError: If ``self.url`` contains no ``//host/`` part, or a page
            lacks the post-time or title marker.
    """
    # Open the database connection.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = 'http://' + re.findall('//(.*?)/', self.url)[0] + link
            text, html = ktgg.request_dis(url)
            if text == '':
                continue

            # Extract the record fields from the raw HTML.
            posttime = re.findall('发布时间(.*?)<', html)[0]
            d = {
                'posttime': posttime.replace(':', '').strip(),
                'court': '长沙市望城区人民法院',
                'source': self.url,
                'url': url,
                'title': re.findall("'b_title'>(.*?)<", html)[0],
                'province': '湖南省',
            }

            # Strip every configured substring from the body text.
            for unwanted in self.tihuan:
                text = text.replace(unwanted, '')

            # Never store an empty body: fall back to the title.
            d['body'] = d['title'] if text == '' else text
            self.parse_text(d, db, cursor)
    finally:
        # Close the database connection, even if a page failed.
        ktgg.clo_mysql(db, cursor)
def parse_html(self, links):
    """Download every announcement in ``links`` and pass it to ``parse_text``.

    Each link is appended to ``self.url`` and fetched with ``requests.get``
    (decoded as gb18030).  The title and post time are read from the parsed
    tree via XPath and handed to ``self.parse_text`` together with the tree
    and the open database handles.  The loop pauses one second after each
    page.

    A request that fails with ``Timeout`` or ``ConnectionError`` is retried
    until it succeeds; a one-second pause separates the attempts so an
    unreachable host is not hit in a tight loop.  The database handles are
    released in a ``finally`` clause so they are not leaked on error.

    Args:
        links: Iterable of URL suffixes appended to ``self.url``.

    Raises:
        IndexError: If a page lacks the expected title or post-time node.
    """
    # Open the database connection.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = self.url + link

            # Keep requesting until the page is retrieved.
            while True:
                try:
                    res = requests.get(
                        url, headers=self.headers, timeout=3.05)
                    res.encoding = 'gb18030'
                    html = res.text
                except (Timeout, ConnectionError):
                    time.sleep(1)  # back off before the next attempt
                    continue
                break

            # Extract the fields shared by every record.
            text = etree.HTML(html)
            posttime_nodes = text.xpath('//p[@align="center"][3]/text()')
            d = {
                # NOTE(review): '人名' looks like a typo for '人民'; left
                # unchanged because the value is stored data -- confirm.
                'court': '资兴区人名法院',
                'source': self.url,
                'url': url,
                'title': text.xpath('//font/b/text()')[0],
                'posttime': posttime_nodes[0].split(':')[-1],
                'province': '湖南省',
            }
            self.parse_text(text, d, db, cursor)
            time.sleep(1)
    finally:
        # Close the database connection, even if a page failed.
        ktgg.clo_mysql(db, cursor)
def parse_html(self, links):
    """Download every announcement in ``links`` and pass it to ``parse_text``.

    Each link is appended to the court site's host and fetched through
    ``ktgg.request_dis``, which yields a ``(text, html)`` pair; a ``text``
    equal to ``0`` causes the link to be skipped.  The post time and title
    are cut out of ``html`` with regular expressions, an empty ``text`` is
    replaced by the title, and the text, the parsed HTML tree, the record
    and the open database handles are handed to ``self.parse_text``.

    The database handles are released in a ``finally`` clause so they are
    not leaked when a page is malformed or ``parse_text`` fails.

    Args:
        links: Iterable of URL paths relative to the court site's host.

    Raises:
        IndexError: If a page lacks the post-time or title marker.
    """
    # Open the database connection.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = 'http://bhqfy.chinacourt.gov.cn' + link
            text, html = ktgg.request_dis(url)
            if text == 0:
                continue

            # Extract the record fields from the raw HTML.
            posttime = re.findall('发布时间(.*?)<', html)[0]
            d = {
                'posttime': posttime.replace(':', '').strip(),
                'court': '北湖人民法院',
                'source': self.url,
                'url': url,
                'title': re.findall("'b_title'>(.*?)<", html)[0],
                'province': '湖南省',
            }

            # Use the title when the page has no body text.
            if text == '':
                text = d['title']

            tree = etree.HTML(html)
            self.parse_text(text, tree, d, db, cursor)
    finally:
        # Close the database connection, even if a page failed.
        ktgg.clo_mysql(db, cursor)
def parse_html(self, links):
    """Download every announcement in ``links`` and pass it to ``parse_text``.

    Each link is appended to the court site's host, printed, and fetched
    with ``requests.get`` (decoded as gb18030).  The title and post time
    are read from the parsed tree via XPath; a page on which that lookup
    raises ``IndexError`` is skipped.  The record is handed to
    ``self.parse_text`` together with the open database handles.

    A request that fails with ``Timeout`` or ``ConnectionError`` is retried
    until it succeeds; a one-second pause separates the attempts so an
    unreachable host is not hit in a tight loop.  The database handles are
    released in a ``finally`` clause so they are not leaked on error.

    Args:
        links: Iterable of URL paths relative to the court site's host.
    """
    # Imported locally: nothing else in this method relies on ``time``.
    import time

    # Open the database connection.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = 'http://czyxfy.chinacourt.gov.cn' + link
            print(url)

            # Keep requesting until the page is retrieved.
            while True:
                try:
                    res = requests.get(
                        url, headers=self.headers, timeout=3.05)
                    res.encoding = 'gb18030'
                    html = res.text
                except (Timeout, ConnectionError):
                    time.sleep(1)  # back off before the next attempt
                    continue
                break

            # Extract the fields shared by every record.
            text = etree.HTML(html)
            try:
                posttime_nodes = text.xpath('//p[@align="center"]/text()')
                d = {
                    # NOTE(review): '人名' looks like a typo for '人民';
                    # left unchanged because it is stored data -- confirm.
                    'court': '永兴人名法院',
                    'source': self.url,
                    'url': url,
                    'title': text.xpath(
                        '//p[@align="center"]//b/text()')[0],
                    'posttime': posttime_nodes[0].split(':')[1],
                    'province': '湖南省',
                }
            except IndexError:
                # The page does not have the expected layout; skip it.
                continue
            self.parse_text(d, db, cursor)
    finally:
        # Close the database connection, even if a page failed.
        ktgg.clo_mysql(db, cursor)
def parse_html(self, links):
    """Download every announcement in ``links`` and pass it to ``parse_text``.

    Each link is appended to the court site's host and fetched through
    ``ktgg.request_dis``, which yields a ``(text, html)`` pair; a ``text``
    equal to ``0`` causes the link to be skipped.  The post time and title
    are cut out of ``html`` with regular expressions, an empty ``text`` is
    replaced by the title, and the text, the record and the open database
    handles are handed to ``self.parse_text``.

    Two known pages are excluded by title: per the original author's note,
    one is not a hearing announcement and the other is laid out as a table.

    The database handles are released in a ``finally`` clause so they are
    not leaked when a page is malformed or ``parse_text`` fails.

    Args:
        links: Iterable of URL paths relative to the court site's host.

    Raises:
        IndexError: If a page lacks the post-time or title marker.
    """
    # Title fragments identifying the two pages that must not be stored.
    excluded_titles = ('保护当事人的诉讼权利', '2012年8月1日至8月31日')

    # Open the database connection.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = 'http://zzxfy.chinacourt.gov.cn' + link
            text, html = ktgg.request_dis(url)
            if text == 0:
                continue

            # Extract the record fields from the raw HTML.
            posttime = re.findall('发布时间(.*?)<', html)[0]
            d = {
                'posttime': posttime.replace(':', '').strip(),
                'court': '湖南省渌口区人民法院',
                'source': self.url,
                'url': url,
                'title': re.findall("'b_title'>(.*?)<", html)[0],
                'province': '湖南省',
            }

            if any(fragment in d['title'] for fragment in excluded_titles):
                continue

            # Use the title when the page has no body text.
            if text == '':
                text = d['title']

            self.parse_text(text, d, db, cursor)
    finally:
        # Close the database connection, even if a page failed.
        ktgg.clo_mysql(db, cursor)
def parse_html(self, links):
    """Download every announcement in ``links`` and pass it to ``parse_text``.

    Each link is appended to the court site's host and fetched through
    ``ktgg.request_dis``, which yields a ``(text, html)`` pair; a ``text``
    equal to ``0`` causes the link to be skipped.  The post time and title
    are cut out of ``html`` with regular expressions; when the regex yields
    an empty title, the title is read from the ``b_title`` div's ``span``
    via XPath instead.  The text, the record and the open database handles
    are handed to ``self.parse_text``.

    The database handles are released in a ``finally`` clause so they are
    not leaked when a page is malformed or ``parse_text`` fails.

    Args:
        links: Iterable of URL paths relative to the court site's host.

    Raises:
        IndexError: If a page lacks the post-time marker, or the title
            cannot be found by either the regex or the XPath lookup.
    """
    # Open the database connection.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = 'http://hnyzfy.chinacourt.gov.cn' + link
            text, html = ktgg.request_dis(url)
            if text == 0:
                continue

            # Extract the record fields from the raw HTML.
            posttime = re.findall('发布时间(.*?)<', html)[0]
            d = {
                'posttime': posttime.replace(':', '').strip(),
                'court': '宜章人民法院',
                'source': self.url,
                'url': url,
                'title': re.findall("'b_title'>(.*?)<", html)[0],
            }
            if d['title'] == '':
                # The regex matched an empty string; look inside the span.
                tree = etree.HTML(html)
                d['title'] = tree.xpath(
                    '//div[@class="b_title"]/span/text()')[0]
            d['province'] = '湖南省'

            self.parse_text(text, d, db, cursor)
    finally:
        # Close the database connection, even if a page failed.
        ktgg.clo_mysql(db, cursor)
def parse_html(self, links):
    """Download every announcement in ``links`` and pass it to ``parse_text``.

    Each link is appended to the court site's host and fetched with
    ``requests.get`` (decoded as gb18030).  Pages without a
    ``span.detail_content`` element are skipped.  The title and post time
    are read from the parsed tree via XPath.  The body is collected as a
    list of strings -- the first direct text node of the content span, then
    the text of its ``p`` children, then of its ``font`` children -- with
    non-breaking spaces removed; if nothing is collected the title is used.
    The list, the record and the open database handles are handed to
    ``self.parse_text``, and the loop pauses one second after each page.

    A request that fails with ``Timeout`` or ``ConnectionError`` is retried
    until it succeeds; a one-second pause separates the attempts so an
    unreachable host is not hit in a tight loop.  The database handles are
    released in a ``finally`` clause so they are not leaked on error.

    Args:
        links: Iterable of URL paths relative to the court site's host.

    Raises:
        IndexError: If a page lacks the expected title or post-time node.
    """
    # Open the database connection.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = 'http://zyqfy.chinacourt.gov.cn' + link

            # Keep requesting until the page is retrieved.
            while True:
                try:
                    res = requests.get(
                        url, headers=self.headers, timeout=3.05)
                    res.encoding = 'gb18030'
                    html = res.text
                except (Timeout, ConnectionError):
                    time.sleep(1)  # back off before the next attempt
                    continue
                break

            # Locate the element holding the announcement body.
            text = etree.HTML(html)
            content = text.xpath('//span[@class="detail_content"]')
            if not content:
                continue
            body = content[0]

            # Extract the fields shared by every record.
            posttime_nodes = text.xpath('//p[@align="center"]/text()')
            d = {
                # NOTE(review): '人名' looks like a typo for '人民'; left
                # unchanged because the value is stored data -- confirm.
                'court': '资阳区人名法院',
                'source': self.url,
                'url': url,
                'title': text.xpath('//p[@align="center"]//b/text()')[0],
                'posttime': posttime_nodes[0].split(':')[1],
                'province': '湖南省',
            }

            # Collect the body fragments, dropping non-breaking spaces.
            info = []
            direct_text = body.xpath('./text()')
            if direct_text:
                info.append(direct_text[0].replace('\xa0', ''))
            for path in ('./p/text()', './font/text()'):
                info.extend(
                    piece.replace('\xa0', '') for piece in body.xpath(path))
            if not info:
                info = [d['title']]

            self.parse_text(info, d, db, cursor)
            time.sleep(1)
    finally:
        # Close the database connection, even if a page failed.
        ktgg.clo_mysql(db, cursor)
def parse_html(self, links):
    """Download every announcement in ``links`` and pass it to ``parse_text``.

    Each link is appended to the court site's host and fetched through
    ``ktgg.request_dis``, which yields a ``(text, html)`` pair; an empty
    ``text`` causes the link to be skipped.  The post time and title are
    cut out of ``html`` with regular expressions, and the text, the record
    and the open database handles are handed to ``self.parse_text``.

    The database handles are released in a ``finally`` clause so they are
    not leaked when a page is malformed or ``parse_text`` fails.

    Args:
        links: Iterable of URL paths relative to the court site's host.

    Raises:
        IndexError: If a page lacks the post-time or title marker.
    """
    # Open the database connection.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = 'http://sfqfy.chinacourt.gov.cn' + link
            text, html = ktgg.request_dis(url)
            if text == '':
                continue

            # Extract the record fields from the raw HTML.
            d = {
                'posttime': re.findall('发布时间:(.*?)<', html)[0].strip(),
                'court': '石峰区人民法院',
                'source': self.url,
                'url': url,
                'title': re.findall("'b_title'>(.*?)<", html)[0],
                'province': '湖南省',
            }
            self.parse_text(text, d, db, cursor)
    finally:
        # Close the database connection, even if a page failed.
        ktgg.clo_mysql(db, cursor)