Example #1
0
    def parse_text(self, d, db, cursor):
        """Parse one announcement page and store every hearing found in it.

        Two layouts are handled. A page whose title contains '排期开庭' lists
        several hearings in a single body; each hearing is split out and
        stored as its own row. Any other page is treated as a single hearing.

        Args:
            d: Record dict. 'title', 'body' and 'url' are read. For the
                single-hearing layout the extracted fields are written into
                ``d`` itself; for the multi-hearing layout each hearing is
                written into its own copy and ``d`` is left untouched.
            db: Passed through unchanged to ``ktgg.ins_mysql``.
            cursor: Passed through unchanged to ``ktgg.ins_mysql``.
        """
        if '排期开庭' in d['title']:
            hearings = re.findall(
                r'(\d{1,4}[年].*?[日上下号])(.*?)\d{1,2}、{1,2}', d['body'])
            for when, detail in hearings:
                # Work on a copy so the parties matched for one hearing cannot
                # leak into the next hearing when no pattern matches there.
                record = d.copy()
                record['body'] = when + detail
                record['sorttime'] = when
                record['anyou'] = ktgg.set_anyou(detail)
                self._fill_case_fields(record, detail)
                record['md5'] = ktgg.get_md5(record['body'], record['url'])
                ktgg.ins_mysql(record, 'ktgg_kt_wuhan', db, cursor)
        else:
            anyou = ktgg.set_anyou()
            d['anyou'] = ktgg.search_anyou(anyou, d['body'])
            # A body without a recognisable date gets an empty hearing time
            # instead of raising IndexError.
            dates = re.findall(r'\d{1,4}[年月].*?[日上下号]', d['body'])
            d['sorttime'] = dates[0] if dates else ''
            self._fill_case_fields(d, d['body'])
            d['md5'] = ktgg.get_md5(d['body'], d['url'])
            ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)

    def _fill_case_fields(self, record, text):
        """Fill 'caseNo', 'courtNum' and the party fields of record from text.

        'caseNo' and 'courtNum' are always set (empty string when nothing is
        found). 'plaintiff' and 'pname' are only set when one of the
        ``self.pname_p`` templates matches; ``record['anyou']`` must already
        be set because it is substituted into each template.
        """
        # Case number: first bracketed fragment ending in '号'.
        case_numbers = re.findall(r'[\[【((].*?号', text)
        record['caseNo'] = case_numbers[0] if case_numbers else ''

        # Courtroom: text between '我院' and '开庭审理', without '公开'.
        rooms = re.findall('我院(.*?)开庭审理', text)
        record['courtNum'] = rooms[0].replace('公开', '') if rooms else ''

        # The first template that matches wins. Each match is indexed as
        # (plaintiff, defendant), so the templates are expected to capture
        # two groups.
        for template in self.pname_p:
            matches = re.findall(template % record['anyou'], text)
            if matches:
                plaintiff, defendant = matches[0][0], matches[0][1]
                # NOTE(review): the last two replace() calls are identical as
                # written; one of them may originally have targeted a
                # full-width comma - confirm against the real file encoding.
                record['plaintiff'] = (
                    plaintiff.replace('原告人', '').replace('原告', '')
                    .replace(',', '').replace(',', ''))
                record['pname'] = (
                    defendant.replace('被告人', '').replace('被告', ''))
                break
Example #2
0
 def parse_text(self, d, db, cursor):
     """Extract hearing fields from ``d['body']`` and store the record.

     Args:
         d: Record dict. 'body' is read; 'anyou' and 'sorttime' are always
             written, 'courtNum', 'plaintiff' and 'pname' only when a
             pattern matches.
         db: Passed through unchanged to ``ktgg.ins_mysql``.
         cursor: Passed through unchanged to ``ktgg.ins_mysql``.
     """
     # Case cause.
     d['anyou'] = ktgg.set_anyou(d['body'])

     # Hearing date: first 'YYYY年M月D日' in the body, empty when absent.
     dates = re.findall(r'(\d{4}年\d{1,2}月\d{1,2})日', d['body'])
     d['sorttime'] = dates[0] + '日' if dates else ''

     # Courtroom: the first matching pattern wins; the match is cut off
     # after its first '庭'.
     for room_pattern in self.courtNum:
         rooms = re.findall(room_pattern, d['body'])
         if rooms:
             d['courtNum'] = rooms[0].split('庭')[0] + '庭'
             break

     # Plaintiff and defendant: each template receives the case cause.
     for template in self.pname_p:
         matches = re.findall(template % d['anyou'], d['body'])
         if not matches:
             continue
         first = matches[0]
         # re.findall yields a tuple only for patterns with several groups.
         # Checking the type (rather than len(first) == 2) keeps a plain
         # two-character string match from being split into two "parties".
         if isinstance(first, tuple) and len(first) == 2:
             d['plaintiff'] = first[0].replace('原告人', '').replace(
                 '原告', '').replace(' ', '')
             d['pname'] = first[1].replace('被告人', '').replace(
                 '被告', '').replace(' ', '')
         else:
             d['plaintiff'] = ''
             d['pname'] = first.replace('被告人', '').replace(
                 '被告', '').replace(' ', '')
         break

     ktgg.ins_mysql(d, 'ktgg_ceshi', db, cursor)
Example #3
0
    def parse_text(self, text, d, db, cursor):
        """Extract hearing fields from text and the title, then store the record.

        Args:
            text: Announcement text; stored in ``d['body']`` with all
                whitespace removed.
            d: Record dict. 'title' and 'url' are read; the extracted fields
                are written into it.
            db: Passed through unchanged to ``ktgg.ins_mysql``.
            cursor: Passed through unchanged to ``ktgg.ins_mysql``.

        Nothing is stored when the title names a defendant but
        ``ktgg.search_anyou`` reports a start position of 0 for it.
        """
        d['body'] = re.sub(r'\s', '', text)

        # Courtroom: text after '法院' up to and including the first '庭'.
        rooms = re.findall('法院(.*?庭)', d['body'])
        if rooms:
            d['courtNum'] = rooms[0]

        # Hearing date.
        dates = re.findall(r'\d{1,4}[年月].*?[日号]', d['body'])
        if dates:
            d['sorttime'] = dates[0]

        # Judge: text between '审判员' and the next '书' or '代'.
        judges = re.findall('审判员(.*?)[书代]', d['body'])
        if judges:
            d['judge'] = judges[0].replace(':', '')

        # Defendant and case cause come from the title, after '被告'.
        titled = re.findall('被告(.*)', d['title'])
        if titled:
            party = titled[0]
            anyou = ktgg.set_anyou()
            start, end = ktgg.search_anyou(anyou, party)
            if start == 0:
                return
            d['anyou'] = party[start:end]
            # The cause is literal text, so escape it before using it as a
            # regex; it is a slice of party, so the pattern always matches.
            defendant = re.findall(
                '(.*?)%s' % re.escape(d['anyou']), party)[0]
            # Remove role labels only. The capture starts right after '被告',
            # so a leading '人' is the tail of '被告人'; any other '人'
            # belongs to the name itself and must be kept.
            defendant = defendant.replace('被告人', '').replace('被告', '')
            if defendant.startswith('人'):
                defendant = defendant[1:]
            d['pname'] = defendant

        d['md5'] = ktgg.get_md5(d['body'], d['url'])
        ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
Example #4
0
 def parse_text(self, text, d, db, cursor):
     """Store one record per table row of the announcement page.

     Args:
         text: Parsed page exposing ``xpath``; every ``tr`` under the
             'detail_content' span except the first one is processed.
         d: Template record; each row works on its own copy.
         db: Passed through unchanged to ``ktgg.ins_mysql``.
         cursor: Passed through unchanged to ``ktgg.ins_mysql``.
     """
     rows = text.xpath('//span[@class="detail_content"]//tr')[1:]
     for row in rows:
         record = d.copy()

         # Whole row text with line breaks removed.
         row_text = row.xpath('string(.)')
         record['body'] = row_text.replace('\r', '').replace('\n', '')

         # Case number, judge and courtroom: cells 2, 5 and 6.
         record['caseNo'] = row.xpath('./td[2]/span/text()')[0]
         record['judge'] = row.xpath('./td[5]/span/text()')[0]
         record['courtNum'] = row.xpath('./td[6]/span/text()')[0]

         # Hearing time: cell 7, keeping only the part before the first space.
         when = row.xpath('./td[7]/span/text()')[0]
         record['sorttime'] = when.split(' ')[0]

         # Parties and case cause: cell 3, parsed by the first matching
         # pattern in self.party.
         raw_party = row.xpath('./td[3]/span/text()')[0]
         for pattern in self.party:
             hits = re.findall(pattern, raw_party)
             if not hits:
                 continue
             parsed = hits[0]
             anyou = ktgg.set_anyou()
             if isinstance(parsed, str):
                 # Single capture: defendant followed by the case cause.
                 start, end = ktgg.search_anyou(anyou, parsed)
                 if start == 0:
                     # NOTE(review): this ends the whole method, so the
                     # remaining rows are not stored either - confirm that
                     # is intended.
                     return
                 record['anyou'] = parsed[start:end]
                 defendant = re.findall(
                     '(.*?)%s' % record['anyou'], parsed)[0]
                 record['pname'] = defendant.replace(
                     '被告人', '').replace('被告', '')
             elif isinstance(parsed, tuple):
                 # Two captures: plaintiff, then defendant plus case cause.
                 start, end = ktgg.search_anyou(anyou, parsed[1])
                 if start == 0:
                     return
                 record['anyou'] = parsed[1][start:end]
                 record['plaintiff'] = parsed[0].replace(
                     '原告人', '').replace('原告', '')
                 defendant = re.findall(
                     '(.*?)%s' % record['anyou'], parsed[1])[0]
                 record['pname'] = defendant.replace(
                     '被告人', '').replace('被告', '')
             break

         record['md5'] = ktgg.get_md5(record['body'], record['url'])
         ktgg.ins_mysql(record, 'ktgg_kt_wuhan', db, cursor)
Example #5
0
    def parse_text(self, text, d, db, cursor):
        """Extract hearing fields from text and store the record.

        Args:
            text: Announcement text; every string in ``self.tihuan`` is
                removed from it before parsing.
            d: Record dict; 'url' is read and the extracted fields are
                written into it.
            db: Passed through unchanged to ``ktgg.ins_mysql``.
            cursor: Passed through unchanged to ``ktgg.ins_mysql``.
        """
        cleaned = text
        for unwanted in self.tihuan:
            cleaned = cleaned.replace(unwanted, '')
        d['body'] = cleaned

        # Hearing date.
        dates = re.findall(r'\d{1,4}[年月].*?[日号]', cleaned)
        if dates:
            d['sorttime'] = dates[0]

        # Courtroom: prefer '在...庭', fall back to '第...庭'.
        rooms = (re.findall('在(.{2,7}庭)', cleaned)
                 or re.findall('第.{1,4}庭', cleaned))
        if rooms:
            d['courtNum'] = rooms[0]

        # Parties and case cause: the first matching pattern wins.
        for pattern in self.party:
            hits = re.findall(pattern, d['body'])
            if not hits:
                continue
            parsed = hits[0]
            anyou = ktgg.set_anyou()
            if isinstance(parsed, str):
                start, end = ktgg.search_anyou(anyou, parsed)
                if start == 0:
                    # Logged only; unlike the tuple case below, processing
                    # carries on with the slice parsed[0:end].
                    ktgg.write_txt('anyou', cleaned)
                d['anyou'] = parsed[start:end]
                defendant = re.findall('(.*?)%s' % d['anyou'], parsed)[0]
                d['pname'] = defendant.replace('人', '')
            elif isinstance(parsed, tuple):
                start, end = ktgg.search_anyou(anyou, parsed[1])
                if start == 0:
                    return
                d['anyou'] = parsed[1][start:end]
                d['plaintiff'] = parsed[0].replace('人', '')
                defendant = re.findall('(.*?)%s' % d['anyou'], parsed[1])[0]
                d['pname'] = defendant.replace('人', '')
            break

        d['md5'] = ktgg.get_md5(d['body'], d['url'])
        ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
Example #6
0
 def parse_text(self, d, db, cursor):
     """Extract hearing fields from the title, store the record, then pause.

     Args:
         d: Record dict; 'title' and 'url' are read, the title is also
             stored as 'body', and the extracted fields are written into it.
         db: Passed through unchanged to ``ktgg.ins_mysql``.
         cursor: Passed through unchanged to ``ktgg.ins_mysql``.

     When ``ktgg.search_anyou`` reports a start position of 0 the method
     returns at once: nothing is stored and the pause is skipped.
     """
     title = d['title']
     # The title doubles as the record body.
     d['body'] = title

     # Hearing date.
     dates = re.findall(r'\d{1,4}[年月].*?[日号]', title)
     if dates:
         d['sorttime'] = dates[0]

     # Courtroom.
     rooms = re.findall('第.{1,4}庭', title)
     if rooms:
         d['courtNum'] = rooms[0]

     # Case cause, defendant and plaintiff: the first matching pattern wins.
     for pattern in self.party:
         hits = re.findall(pattern, d['title'])
         if not hits:
             continue
         parsed = hits[0]
         anyou = ktgg.set_anyou()
         if isinstance(parsed, str):
             start, end = ktgg.search_anyou(anyou, parsed)
             if start == 0:
                 return
             d['anyou'] = parsed[start:end]
             defendant = re.findall('(.*?)%s' % d['anyou'], parsed)[0]
             d['pname'] = defendant.replace('被告人', '').replace('被告', '')
         elif isinstance(parsed, tuple):
             start, end = ktgg.search_anyou(anyou, parsed[1])
             if start == 0:
                 return
             d['anyou'] = parsed[1][start:end]
             d['plaintiff'] = parsed[0].replace('原告人', '').replace(
                 '原告', '')
             defendant = re.findall('(.*?)%s' % d['anyou'], parsed[1])[0]
             d['pname'] = defendant.replace('被告人', '').replace('被告', '')
         break

     d['md5'] = ktgg.get_md5(d['body'], d['url'])
     ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
     time.sleep(0.5)
Example #7
0
 def parse_text(self, text, d, db, cursor):
     """Store one record per non-empty line of text.

     Args:
         text: Announcement text holding one hearing per line.
         d: Template record; each line works on its own copy. 'url' is read.
         db: Passed through unchanged to ``ktgg.ins_mysql``.
         cursor: Passed through unchanged to ``ktgg.ins_mysql``.
     """
     for line in text.split('\n'):
         if not line:
             continue
         record = d.copy()
         # The stored body drops non-breaking spaces and carriage returns;
         # the patterns below still run on the raw line.
         record['body'] = line.replace('\xa0', '').replace('\r', '')

         # Hearing date.
         dates = re.findall(r'\d{1,4}[年月].*?[日号]', line)
         if dates:
             record['sorttime'] = dates[0]

         # Courtroom.
         rooms = re.findall('在(.*?庭)', line)
         if rooms:
             record['courtNum'] = rooms[0]

         # Case cause, plaintiff and defendant. Each match is indexed as
         # (plaintiff, defendant-with-cause). The search stops at the first
         # pattern that also yields a defendant; a pattern that matches
         # without yielding one still leaves its 'plaintiff' and 'anyou'
         # in the record.
         for pattern in self.party:
             matches = re.findall(pattern, line)
             if not matches:
                 continue
             match = matches[0]
             record['plaintiff'] = match[0].replace('原告', '').replace(
                 '人', '')
             anyou = ktgg.set_anyou()
             start, end = ktgg.search_anyou(anyou, match[1])
             record['anyou'] = match[1][start:end]
             defendants = re.findall('(.*?)%s' % record['anyou'], match[1])
             if defendants:
                 record['pname'] = defendants[0].replace('被告', '').replace(
                     '人', '')
                 break

         # Computed once, just before the insert.
         record['md5'] = ktgg.get_md5(record['body'], record['url'])
         ktgg.ins_mysql(record, 'ktgg_kt_wuhan', db, cursor)
Example #8
0
    def parse_text(self, text, d, db, cursor):
        """Split text into lines and store one record per parsed hearing.

        Each line is split on whitespace and empty tokens are dropped. A line
        with six or more tokens is parsed on its own (case 1). Shorter
        non-empty lines are buffered in ``f`` and parsed two at a time as one
        combined token list (case 2).

        Args:
            text: Announcement text.
            d: Template record; each line works on its own copy. 'url' is
                read when the md5 is computed.
            db: Passed through unchanged to ``ktgg.ins_mysql``.
            cursor: Passed through unchanged to ``ktgg.ins_mysql``.
        """
        # Split the text into lines.
        infos = re.split('\n', text)
        # Buffer of short lines waiting to be paired up (case 2).
        f = []
        for info in infos:
            d_info = d.copy()
            info = re.split('\s', info)
            info = list(filter(None, info))
            # Case 1: the whole hearing sits on one line.
            # `start` doubles as the "case cause found" flag for this line;
            # it stays 0 until search_anyou returns a non-zero start.
            start = 0
            if len(info) >= 6:
                d_info['sorttime'] = ''
                d_info['caseNo'] = ''
                d_info['body'] = ''.join(info)
                for i in info:
                    # Case number, case cause, defendant and plaintiff:
                    # taken from the first token containing '号'.
                    if ('号' in i) and (d_info['caseNo'] == ''):
                        d_info['caseNo'] = i
                        # The party text is the token after the case number.
                        # NOTE(review): info.index(i) returns the first equal
                        # token, and index + 1 raises IndexError when the
                        # case number is the last token.
                        index = info.index(i)
                        party = info[index + 1]
                        # Case cause.
                        anyou = ktgg.set_anyou()
                        start, end = ktgg.search_anyou(anyou, party)
                        if start == 0:
                            break
                        d_info['anyou'] = party[start:end]
                        # Plaintiff and defendant.
                        if '诉' in party:
                            p = re.split('诉', party)
                            d_info['plaintiff'] = p[0]
                            # NOTE(review): this stores the list returned by
                            # findall, while the else branch below stores its
                            # first element - confirm which is intended.
                            d_info['pname'] = re.findall(
                                '(.*?)%s' % d_info['anyou'], p[1])
                        else:
                            d_info['pname'] = re.findall(
                                '(.*?)%s' % d_info['anyou'], party)[0]
                    # Hearing date and courtroom: first token holding a
                    # YYYY-MM-DD date; the courtroom is the token before it.
                    if d_info['sorttime'] == '':
                        sorttime = re.findall('\d{4}-\d{2}-\d{2}', i)
                        if sorttime:
                            d_info['sorttime'] = sorttime[0]
                            # NOTE(review): when the date is the first token,
                            # index - 1 wraps around to the last token.
                            index = info.index(i)
                            d_info['courtNum'] = info[index - 1]
                # No usable case cause on this line: skip it.
                if start == 0:
                    continue
                d_info['md5'] = ktgg.get_md5(d_info['body'], d_info['url'])
                ktgg.ins_mysql(d_info, 'ktgg_kt_wuhan', db, cursor)

            # Case 2: the hearing is spread over two short lines.
            elif 0 < len(info):
                f.append(info)
                if len(f) == 2:
                    # Both halves are buffered: parse them as one token list.
                    info = f[0] + f[1]
                    d_info['body'] = ''.join(info)
                    # Hearing date.
                    sorttime = re.findall('\d{4}年.*?日', d_info['body'])
                    if sorttime:
                        d_info['sorttime'] = sorttime[0]
                    # Courtroom.
                    courtNum = re.findall('第.{2,6}庭|回龙法庭', d_info['body'])
                    if courtNum:
                        d_info['courtNum'] = courtNum[0]
                    # Case number.
                    caseNo = re.findall('[((民].*?号', d_info['body'])
                    if caseNo:
                        d_info['caseNo'] = caseNo[0]

                    for i in info:
                        if '诉' in i:
                            # Case cause.
                            anyou = ktgg.set_anyou()
                            start, end = ktgg.search_anyou(anyou, i)
                            if start == 0:
                                break
                            d_info['anyou'] = i[start:end]
                            # Plaintiff and defendant.
                            if '诉' in i:
                                p = re.split('诉', i)
                                if '号' in p[0]:
                                    # NOTE(review): split() returns a list,
                                    # so 'plaintiff' becomes a list here but
                                    # a string in the else branch - confirm.
                                    d_info['plaintiff'] = p[0].split('号')
                                else:
                                    d_info['plaintiff'] = p[0]
                                # NOTE(review): stores the findall list, as
                                # in case 1.
                                d_info['pname'] = re.findall(
                                    '(.*?)%s' % d_info['anyou'], p[1])
                            else:
                                # NOTE(review): never runs, because the
                                # enclosing `if` already requires '诉' in i;
                                # `party` is also not assigned anywhere in
                                # this case-2 branch.
                                d_info['pname'] = re.findall(
                                    '(.*?)%s' % d_info['anyou'], party)[0]

                    # The pair has been consumed; start a fresh buffer.
                    f = []
                # start is still 0 when only one half is buffered, or when no
                # token of the pair gave a case cause: nothing is stored.
                if start == 0:
                    continue
                d_info['md5'] = ktgg.get_md5(d_info['body'], d_info['url'])
                ktgg.ins_mysql(d_info, 'ktgg_kt_wuhan', db, cursor)
Example #9
0
    def parse_text(self, text, html, d, db, cursor):
        """Store the hearings of one page, table-style or free-text.

        A page whose title contains '开庭公告' is read row by row from the
        table in ``html``; any other page is parsed from ``text`` as a single
        hearing.

        Args:
            text: Plain text of the page (free-text layout only).
            html: Parsed page exposing ``xpath`` (table layout only).
            d: Record dict; 'title', 'url' and, for the free-text layout,
                'posttime' are read. Table rows work on copies.
            db: Passed through unchanged to ``ktgg.ins_mysql``.
            cursor: Passed through unchanged to ``ktgg.ins_mysql``.

        In both layouts the method returns as soon as no party pattern
        matches or ``ktgg.search_anyou`` reports a start position of 0;
        in the table layout that also drops the remaining rows.
        """
        if '开庭公告' in d['title']:
            # Every table row except the first one.
            for row in html.xpath('//tbody/tr')[1:]:
                record = d.copy()
                row_text = row.xpath('string(.)')
                record['body'] = row_text.replace('\r', '').replace('\n', '')

                # Rows with four cells keep time, courtroom and parties one
                # cell further right than the other rows.
                if len(row.xpath('./td')) == 4:
                    time_cell, room_cell, party_cell = (
                        './td[4]', './td[3]', './td[2]')
                else:
                    time_cell, room_cell, party_cell = (
                        './td[3]', './td[2]', './td[1]')
                # Hearing time: the part before the first space.
                when = row.xpath(time_cell)[0].xpath('string(.)')
                record['sorttime'] = when.split(' ')[0]
                # Courtroom.
                record['courtNum'] = row.xpath(room_cell)[0].xpath('string(.)')
                raw_party = row.xpath(party_cell)[0].xpath('string(.)')

                # Defendant, plaintiff and case cause: first matching pattern.
                for pattern in self.party:
                    hits = re.findall(pattern, raw_party)
                    if not hits:
                        continue
                    parsed = hits[0]
                    anyou = ktgg.set_anyou()
                    if isinstance(parsed, str):
                        start, end = ktgg.search_anyou(anyou, parsed)
                        if start == 0:
                            return
                        record['anyou'] = parsed[start:end]
                        defendant = re.findall(
                            '(.*?)%s' % record['anyou'], parsed)[0]
                        record['pname'] = defendant.replace(
                            '被告人', '').replace('被告', '')
                    elif isinstance(parsed, tuple):
                        start, end = ktgg.search_anyou(anyou, parsed[1])
                        if start == 0:
                            return
                        record['anyou'] = parsed[1][start:end]
                        record['plaintiff'] = parsed[0].replace(
                            '原告人', '').replace('原告', '')
                        defendant = re.findall(
                            '(.*?)%s' % record['anyou'], parsed[1])[0]
                        record['pname'] = defendant.replace(
                            '被告人', '').replace('被告', '')
                    break
                else:
                    # No pattern matched this row.
                    return

                record['md5'] = ktgg.get_md5(record['body'], record['url'])
                ktgg.ins_mysql(record, 'ktgg_kt_wuhan', db, cursor)
            return

        d['body'] = text
        # Hearing date: month and day from the title, year from 'posttime'.
        dates = re.findall(r'\d{1,2}月.*?日', d['title'])
        if dates:
            d['sorttime'] = d['posttime'].split('-')[0] + '年' + dates[0]
        # Courtroom.
        rooms = re.findall('在(.{2,5}庭)', d['body'])
        if rooms:
            d['courtNum'] = rooms[0]

        # Defendant, plaintiff and case cause: first matching pattern.
        for pattern in self.party:
            hits = re.findall(pattern, d['body'])
            if not hits:
                continue
            parsed = hits[0]
            anyou = ktgg.set_anyou()
            if isinstance(parsed, str):
                start, end = ktgg.search_anyou(anyou, parsed)
                if start == 0:
                    return
                d['anyou'] = parsed[start:end]
                defendant = re.findall('(.*?)%s' % d['anyou'], parsed)[0]
                d['pname'] = defendant.replace('被告人', '').replace(
                    '被告', '').replace('审', '').replace('理', '')
            elif isinstance(parsed, tuple):
                start, end = ktgg.search_anyou(anyou, parsed[1])
                if start == 0:
                    return
                d['anyou'] = parsed[1][start:end]
                d['plaintiff'] = parsed[0].replace('原告人', '').replace(
                    '原告', '').replace('审', '').replace('理', '')
                defendant = re.findall('(.*?)%s' % d['anyou'], parsed[1])[0]
                d['pname'] = defendant.replace('被告人', '').replace(
                    '被告', '')
            break
        else:
            # No pattern matched the body.
            return

        d['md5'] = ktgg.get_md5(d['body'], d['url'])
        ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
Example #10
0
    def parse_text(self, text, d, db, cursor):
        """Store one record per announcement paragraph in text.

        Args:
            text: List of paragraph strings. Paragraphs of 30 characters or
                fewer, and paragraphs without a '被告...' clause, are skipped.
            d: Record dict reused for every paragraph; the extracted fields
                are reset at the start of each one. 'url' is read.
            db: Passed through unchanged to ``ktgg.ins_mysql``.
            cursor: Passed through unchanged to ``ktgg.ins_mysql``.
        """
        # A first paragraph ending in '审理' is continued in the second one.
        # The length check keeps an empty or one-element list from raising
        # IndexError.
        # NOTE(review): after the merge only the combined paragraph is kept;
        # any paragraphs after the second are dropped - confirm that is
        # intended.
        if len(text) >= 2 and text[0].endswith('审理'):
            text = [text[0] + text[1]]

        for info in text:
            if len(info) <= 30:
                continue
            d['body'] = info
            d['sorttime'] = ''
            d['courtNum'] = ''
            d['anyou'] = ''
            d['pname'] = ''
            d['plaintiff'] = ''

            # Hearing date.
            dates = re.findall(r'\d{2,4}[年月].*?[日号]', info)
            if dates:
                d['sorttime'] = dates[0]

            # Courtrooms: every '第...庭', comma-joined.
            rooms = re.findall('第.*?庭', info)
            if rooms:
                d['courtNum'] = ','.join(rooms)

            # Clauses that hold a defendant together with a case cause.
            clauses = re.findall('被告.*?[罪案,。]', info)
            if not clauses:
                continue

            # Case cause per clause: the longest known cause found in it,
            # or '' when none is found.
            anyou = ktgg.set_anyou()
            causes = []
            for clause in clauses:
                candidates = [cause for cause in anyou if cause in clause]
                causes.append(max(candidates, key=len) if candidates else '')
            d['anyou'] = ','.join(causes)

            # Defendant per clause: text between '被告' and the cause, with
            # every string in self.p removed.
            defendants = []
            for clause, cause in zip(clauses, causes):
                if not cause:
                    continue
                # The cause is literal text, so escape it for the regex.
                found = re.findall('被告(.*?)%s' % re.escape(cause), clause)
                if not found:
                    continue
                name = found[0]
                for unwanted in self.p:
                    name = name.replace(unwanted, '')
                defendants.append(name)
            d['pname'] = ','.join(defendants)

            # Plaintiff: text between '原告' and the next '诉' or '与'.
            plaintiffs = re.findall('原告(.*?)[诉与]', info)
            if plaintiffs:
                d['plaintiff'] = plaintiffs[0]

            d['md5'] = ktgg.get_md5(info, d['url'])
            ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
Example #11
0
    def parse_text(self, text, d, db, cursor):
        """Store one record per hearing line in text.

        Each line is split on whitespace into tokens. Lines that are empty,
        contain '案由', or have no token containing a known case cause are
        skipped.

        Args:
            text: Announcement text holding one hearing per line.
            d: Record dict reused for every line; every extracted field is
                rewritten for each stored line. 'url' is read.
            db: Passed through unchanged to ``ktgg.ins_mysql``.
            cursor: Passed through unchanged to ``ktgg.ins_mysql``.
        """

        def strip_label(piece, label):
            # Remove the role label, spaces and colons from a party name.
            return piece.replace(label, '').replace(' ', '').replace(':', '')

        for info in re.split('\n', text):
            if '案由' in info or not info:
                continue

            # Tokens come from the raw line, before any clean-up.
            tokens = re.split(r'\s', info)

            # Body: the line with every string in self.tihuan removed.
            for unwanted in self.tihuan:
                info = info.replace(unwanted, '')
            d['body'] = info

            # Case number.
            case_numbers = re.findall('[民(].*?号', info)
            d['caseNo'] = case_numbers[0] if case_numbers else ''

            # Case cause: the first whole token containing a known cause.
            anyou = ktgg.set_anyou()
            cause_token = next(
                (tok for tok in tokens
                 if any(cause in tok for cause in anyou)), None)
            if cause_token is None:
                continue
            d['anyou'] = cause_token

            # Courtroom: the first token containing '第...庭'.
            d['courtNum'] = ''
            for tok in tokens:
                rooms = re.findall('第.*?庭', tok)
                if rooms:
                    d['courtNum'] = rooms[0]
                    break

            # Hearing time: the last token containing one of the markers in
            # self.t, reduced by the first matching pattern in self.re.
            raw_time = next(
                (tok for tok in reversed(tokens)
                 if tok and any(marker in tok for marker in self.t)), '')
            d['sorttime'] = ''
            for pattern in self.re:
                found = re.findall(pattern, raw_time)
                if found:
                    d['sorttime'] = found[0]
                    break

            # Rewrite 'Y.M.D', 'M.D', 'Y/M/D' and 'M/D' with 年/月/日. Both
            # separators work on the extracted date, never on the raw token,
            # so trailing text in the token cannot end up in the date.
            for sep in ('.', '/'):
                if sep in d['sorttime']:
                    parts = d['sorttime'].split(sep)
                    if len(parts) == 2:
                        d['sorttime'] = parts[0] + '月' + parts[1] + '日'
                    else:
                        d['sorttime'] = (parts[0] + '年' + parts[1] + '月'
                                         + parts[2] + '日')
                    break

            # Parties: the first non-empty token after the case-cause token,
            # or '' when the cause is the last non-empty token.
            after_cause = tokens.index(cause_token) + 1
            yuan_bei = next(
                (tok for tok in tokens[after_cause:] if tok != ''), '')

            d['party'] = ''
            d['plaintiff'] = ''
            d['pname'] = ''
            # NOTE(review): the second separator repeats the first as
            # written, so it can never be selected; a full-width semicolon
            # may have been intended - confirm against the real file.
            for sep in (';', ';', '诉'):
                if sep in yuan_bei:
                    pieces = yuan_bei.split(sep)
                    d['plaintiff'] = strip_label(pieces[0], '原告')
                    d['pname'] = strip_label(pieces[1], '被告')
                    break
            else:
                # No separator: keep the undivided party text.
                d['party'] = yuan_bei

            d['md5'] = ktgg.get_md5(info, d['url'])
            ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)