Example #1
0
    def parse_html(self,links):
        """Fetch every linked page, extract its fields and pass them to ``parse_text``.

        The host is taken from ``self.url`` and each entry of ``links`` is
        appended to it. Pages whose text comes back as an empty string are
        skipped. Every string in ``self.tihuan`` is removed from the page
        text; if nothing is left, the title is stored as the body instead.

        The database handle is released in a ``finally`` clause, so it is
        closed even when a page raises part-way through the batch.

        Args:
            links: Iterable of URL path strings, one per detail page.

        Raises:
            IndexError: If a regular expression finds no match in
                ``self.url`` or in the page HTML.
        """
        # Connect to the database.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                d = {}
                url = 'http://' + re.findall('//(.*?)/', self.url)[0] + link
                text, html = ktgg.request_dis(url)
                # NOTE(review): an empty text presumably means the request
                # yielded nothing usable -- confirm against ktgg.request_dis.
                if text == '':
                    continue
                # Extract some information.
                d['posttime'] = re.findall('发布时间(.*?)<', html)[0].replace(':', '').strip()
                d['court'] = '长沙市望城区人民法院'
                d['source'] = self.url
                d['url'] = url
                d['title'] = re.findall("'b_title'>(.*?)<", html)[0]
                d['province'] = '湖南省'
                # Strip the unwanted fragments; a distinct loop name avoids
                # shadowing the outer loop variable.
                for fragment in self.tihuan:
                    text = text.replace(fragment, '')
                d['body'] = text
                # Guard against an empty body: fall back to the title.
                if text == '':
                    d['body'] = d['title']
                self.parse_text(d, db, cursor)
        finally:
            # Close the database, even if an iteration raised.
            ktgg.clo_mysql(db, cursor)
Example #2
0
    def parse_html(self,links):
        """Fetch every linked page, extract its fields and pass them to ``parse_text``.

        Each entry of ``links`` is appended to the fixed host
        ``http://bhqfy.chinacourt.gov.cn``. Pages for which the request
        helper returns ``0`` as text are skipped. When the text is an empty
        string the title is used as the text instead. The HTML is parsed
        with ``etree.HTML`` and handed to ``parse_text`` with the text.

        The database handle is released in a ``finally`` clause, so it is
        closed even when a page raises part-way through the batch.

        Args:
            links: Iterable of URL path strings, one per detail page.

        Raises:
            IndexError: If a regular expression finds no match in the page
                HTML.
        """
        # Connect to the database.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                d = {}
                url = 'http://bhqfy.chinacourt.gov.cn' + link
                text, html = ktgg.request_dis(url)
                # NOTE(review): 0 is presumably the helper's failure marker
                # -- confirm against ktgg.request_dis.
                if text == 0:
                    continue

                # Extract some information.
                d['posttime'] = re.findall('发布时间(.*?)<', html)[0].replace(':', '').strip()
                d['court'] = '北湖人民法院'
                d['source'] = self.url
                d['url'] = url
                d['title'] = re.findall("'b_title'>(.*?)<", html)[0]
                d['province'] = '湖南省'
                # No text on the page: use the title as the text.
                if text == '':
                    text = d['title']
                # Keep the parsed tree under its own name rather than
                # rebinding the raw HTML string.
                tree = etree.HTML(html)
                self.parse_text(text, tree, d, db, cursor)
        finally:
            # Close the database, even if an iteration raised.
            ktgg.clo_mysql(db, cursor)
Example #3
0
    def parse_html(self, links):
        """Fetch every linked page, extract its fields and pass them to ``parse_text``.

        Each entry of ``links`` is appended to the fixed host
        ``http://zzxfy.chinacourt.gov.cn``. Pages for which the request
        helper returns ``0`` as text are skipped, as are two specific
        announcements identified by a phrase in their title. When the text
        is an empty string the title is used as the text instead.

        The database handle is released in a ``finally`` clause, so it is
        closed even when a page raises part-way through the batch.

        Args:
            links: Iterable of URL path strings, one per detail page.

        Raises:
            IndexError: If a regular expression finds no match in the page
                HTML.
        """
        # Connect to the database.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                d = {}
                url = 'http://zzxfy.chinacourt.gov.cn' + link
                text, html = ktgg.request_dis(url)
                # NOTE(review): 0 is presumably the helper's failure marker
                # -- confirm against ktgg.request_dis.
                if text == 0:
                    continue

                # Extract some information.
                d['posttime'] = re.findall('发布时间(.*?)<',
                                           html)[0].replace(':', '').strip()
                d['court'] = '湖南省渌口区人民法院'
                d['source'] = self.url
                d['url'] = url
                d['title'] = re.findall("'b_title'>(.*?)<", html)[0]
                d['province'] = '湖南省'
                # If there is no text, use the title in its place.
                if text == '':
                    text = d['title']
                # Special case: drop these two items (one is not a court
                # hearing announcement, the other has table-form content).
                if '保护当事人的诉讼权利' in d['title']:
                    continue
                if '2012年8月1日至8月31日' in d['title']:
                    continue
                self.parse_text(text, d, db, cursor)
        finally:
            # Close the database, even if an iteration raised.
            ktgg.clo_mysql(db, cursor)
Example #4
0
    def parse_html(self, links):
        """Fetch every linked page, extract its fields and pass them to ``parse_text``.

        Each entry of ``links`` is appended to the fixed host
        ``http://hnyzfy.chinacourt.gov.cn``. Pages for which the request
        helper returns ``0`` as text are skipped. If the regex-extracted
        title is empty, the title is read from the ``<span>`` inside the
        ``b_title`` div instead.

        The database handle is released in a ``finally`` clause, so it is
        closed even when a page raises part-way through the batch.

        Args:
            links: Iterable of URL path strings, one per detail page.

        Raises:
            IndexError: If a regular expression or the fallback XPath query
                finds no match in the page HTML.
        """
        # Connect to the database.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                d = {}
                url = 'http://hnyzfy.chinacourt.gov.cn' + link
                text, html = ktgg.request_dis(url)
                # NOTE(review): 0 is presumably the helper's failure marker
                # -- confirm against ktgg.request_dis.
                if text == 0:
                    continue

                # Extract some information.
                d['posttime'] = re.findall('发布时间(.*?)<',
                                           html)[0].replace(':', '').strip()
                d['court'] = '宜章人民法院'
                d['source'] = self.url
                d['url'] = url
                d['title'] = re.findall("'b_title'>(.*?)<", html)[0]
                # The regex captured nothing directly after the tag: take the
                # title from the nested <span> instead.
                if d['title'] == '':
                    tree = etree.HTML(html)
                    d['title'] = tree.xpath('//div[@class="b_title"]/span/text()')[0]
                d['province'] = '湖南省'
                self.parse_text(text, d, db, cursor)
        finally:
            # Close the database, even if an iteration raised.
            ktgg.clo_mysql(db, cursor)
Example #5
0
    def parse_html(self, links):
        """Fetch every linked page, extract its fields and pass them to ``parse_text``.

        Each entry of ``links`` is appended to the fixed host
        ``http://sfqfy.chinacourt.gov.cn``. Pages whose text comes back as
        an empty string are skipped.

        The database handle is released in a ``finally`` clause, so it is
        closed even when a page raises part-way through the batch.

        Args:
            links: Iterable of URL path strings, one per detail page.

        Raises:
            IndexError: If a regular expression finds no match in the page
                HTML.
        """
        # Connect to the database.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                d = {}
                url = 'http://sfqfy.chinacourt.gov.cn' + link
                text, html = ktgg.request_dis(url)
                # NOTE(review): an empty text presumably means the request
                # yielded nothing usable -- confirm against ktgg.request_dis.
                if text == '':
                    continue
                # Extract some information.
                d['posttime'] = re.findall('发布时间:(.*?)<', html)[0].strip()
                d['court'] = '石峰区人民法院'
                d['source'] = self.url
                d['url'] = url
                d['title'] = re.findall("'b_title'>(.*?)<", html)[0]
                d['province'] = '湖南省'
                self.parse_text(text, d, db, cursor)
        finally:
            # Close the database, even if an iteration raised.
            ktgg.clo_mysql(db, cursor)