def parse_text(self, d, db, cursor):
    """Extract hearing details from ``d['body']`` and store them.

    Two page layouts are handled:

    * when the title contains ``'排期开庭'`` the body is cut into segments
      (a date followed by text, up to a number followed by ``、``) and
      every segment is stored as its own record;
    * otherwise the whole body is treated as one notice.

    Args:
        d: Record dict. ``title``, ``body`` and ``url`` are read. In the
            single-notice layout the extracted fields are written into
            ``d`` itself; in the multi-hearing layout each hearing is
            written into its own copy of ``d``.
        db: Passed through unchanged to ``ktgg.ins_mysql``.
        cursor: Passed through unchanged to ``ktgg.ins_mysql``.
    """
    # Both ASCII and full-width brackets/commas are accepted.
    # NOTE(review): the original listed the same bracket and the same
    # comma twice, which suggests the full-width forms were intended.
    case_no_re = r'[\[【((].*?号'
    court_re = r'我院(.*?)开庭审理'

    def fill_case_fields(record, segment):
        """Fill caseNo, courtNum, plaintiff and pname from *segment*."""
        case_numbers = re.findall(case_no_re, segment)
        record['caseNo'] = case_numbers[0] if case_numbers else ''

        courts = re.findall(court_re, segment)
        record['courtNum'] = courts[0].replace('公开', '') if courts else ''

        # Each template in self.pname_p is %-formatted with the case
        # cause; the first template that matches wins.
        for template in self.pname_p:
            found = re.findall(template % record['anyou'], segment)
            if found:
                record['plaintiff'] = (found[0][0]
                                       .replace('原告人', '')
                                       .replace('原告', '')
                                       .replace(',', '')
                                       .replace(',', ''))
                record['pname'] = (found[0][1]
                                   .replace('被告人', '')
                                   .replace('被告', ''))
                break

    def store(record):
        """Compute the record's md5 and hand it to ``ktgg.ins_mysql``."""
        record['md5'] = ktgg.get_md5(record['body'], record['url'])
        ktgg.ins_mysql(record, 'ktgg_kt_wuhan', db, cursor)

    if '排期开庭' in d['title']:
        hearings = re.findall(
            r'(\d{1,4}[年].*?[日上下号])(.*?)\d{1,2}、{1,2}', d['body'])
        for when, detail in hearings:
            # Work on a copy so that plaintiff/pname found for one hearing
            # cannot leak into a later hearing where no template matches.
            record = d.copy()
            record['body'] = when + detail
            record['sorttime'] = when
            record['anyou'] = ktgg.set_anyou(detail)
            fill_case_fields(record, detail)
            store(record)
    else:
        anyou = ktgg.set_anyou()
        d['anyou'] = ktgg.search_anyou(anyou, d['body'])
        # A notice without a recognisable date is stored with an empty
        # time instead of raising IndexError.
        dates = re.findall(r'\d{1,4}[年月].*?[日上下号]', d['body'])
        d['sorttime'] = dates[0] if dates else ''
        fill_case_fields(d, d['body'])
        store(d)
def parse_text(self, d, db, cursor):
    """Extract case cause, date, courtroom and parties from ``d['body']``.

    The extracted values are written into ``d`` and the record is then
    passed to ``ktgg.ins_mysql`` (table ``'ktgg_ceshi'``).

    Args:
        d: Record dict; ``body`` is read, ``anyou``, ``sorttime`` and,
            when found, ``courtNum``, ``plaintiff`` and ``pname`` are
            written.
        db: Passed through unchanged to ``ktgg.ins_mysql``.
        cursor: Passed through unchanged to ``ktgg.ins_mysql``.
    """
    body = d['body']

    # Case cause.
    d['anyou'] = ktgg.set_anyou(body)

    # Hearing date; empty when the body holds no full yyyy年m月d日 date.
    dates = re.findall(r'(\d{4}年\d{1,2}月\d{1,2})日', body)
    d['sorttime'] = dates[0] + '日' if dates else ''

    # Courtroom: the first pattern that matches wins; the text is cut at
    # the first '庭'.
    for pattern in self.courtNum:
        rooms = re.findall(pattern, body)
        if rooms:
            d['courtNum'] = rooms[0].split('庭')[0] + '庭'
            break

    # Plaintiff and defendant.
    for template in self.pname_p:
        found = re.findall(template % d['anyou'], body)
        if not found:
            continue
        first = found[0]
        # findall yields a tuple for multi-group patterns and a plain str
        # otherwise. Checking only len(first) == 2 would also be true for
        # a two-character string and split it into single characters.
        if isinstance(first, tuple) and len(first) == 2:
            d['plaintiff'] = (first[0].replace('原告人', '')
                              .replace('原告', '').replace(' ', ''))
            d['pname'] = (first[1].replace('被告人', '')
                          .replace('被告', '').replace(' ', ''))
        else:
            d['plaintiff'] = ''
            d['pname'] = (first.replace('被告人', '')
                          .replace('被告', '').replace(' ', ''))
        break

    ktgg.ins_mysql(d, 'ktgg_ceshi', db, cursor)
def parse_text(self, text, d, db, cursor):
    """Extract hearing fields from *text* and the title, then store ``d``.

    Courtroom, date and judge are taken from the whitespace-stripped
    text; defendant and case cause are taken from ``d['title']``.

    Args:
        text: Raw notice text.
        d: Record dict; ``title`` and ``url`` are read, extracted fields
            are written.
        db: Passed through unchanged to ``ktgg.ins_mysql``.
        cursor: Passed through unchanged to ``ktgg.ins_mysql``.

    Returns:
        None. Nothing is stored when the title names a defendant but
        ``ktgg.search_anyou`` reports a start of 0.
    """
    body = re.sub(r'\s', '', text)
    d['body'] = body

    # Courtroom.
    rooms = re.findall(r'法院(.*?庭)', body)
    if rooms:
        d['courtNum'] = rooms[0]

    # Hearing time.
    dates = re.findall(r'\d{1,4}[年月].*?[日号]', body)
    if dates:
        d['sorttime'] = dates[0]

    # Judge; both ASCII and full-width colons are removed.
    judges = re.findall(r'审判员(.*?)[书代]', body)
    if judges:
        d['judge'] = judges[0].replace(':', '').replace(':', '')

    # Defendant and case cause, taken from the title.
    # NOTE(review): assumes the store below runs even when the title has
    # no '被告' - confirm against the original indentation.
    after_defendant = re.findall(r'被告(.*)', d['title'])
    if after_defendant:
        party = after_defendant[0]
        anyou = ktgg.set_anyou()
        start, end = ktgg.search_anyou(anyou, party)
        if start == 0:
            return
        d['anyou'] = party[start:end]
        # The cause is literal text, so escape it before building a regex.
        name = re.findall('(.*?)%s' % re.escape(d['anyou']), party)[0]
        # The capture starts right after '被告', so '被告人X' leaves a
        # leading '人'. Drop only that suffix of the label instead of every
        # '人' in the name (which would damage names containing it).
        name = name.replace('被告人', '被告')
        if name.startswith('人'):
            name = name[1:]
        d['pname'] = name

    d['md5'] = ktgg.get_md5(d['body'], d['url'])
    ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
def parse_text(self, text, d, db, cursor):
    """Store one record per table row of a hearing-schedule page.

    Args:
        text: Parsed document exposing ``xpath`` (rows are selected with
            ``//span[@class="detail_content"]//tr``; the first row is
            skipped).
        d: Template record; every row gets its own copy.
        db: Passed through unchanged to ``ktgg.ins_mysql``.
        cursor: Passed through unchanged to ``ktgg.ins_mysql``.
    """

    def fill_parties(record, cell):
        """Fill anyou/plaintiff/pname from *cell*.

        Returns False when a pattern matched but no case cause was
        located (``search_anyou`` start of 0), True otherwise.
        """
        for pattern in self.party:
            found = re.findall(pattern, cell)
            if not found:
                continue
            match = found[0]
            anyou = ktgg.set_anyou()
            if isinstance(match, str):
                start, end = ktgg.search_anyou(anyou, match)
                if start == 0:
                    return False
                record['anyou'] = match[start:end]
                before = re.findall(
                    '(.*?)%s' % re.escape(record['anyou']), match)[0]
                record['pname'] = (before.replace('被告人', '')
                                   .replace('被告', ''))
            elif isinstance(match, tuple):
                start, end = ktgg.search_anyou(anyou, match[1])
                if start == 0:
                    return False
                record['anyou'] = match[1][start:end]
                record['plaintiff'] = (match[0].replace('原告人', '')
                                       .replace('原告', ''))
                before = re.findall(
                    '(.*?)%s' % re.escape(record['anyou']), match[1])[0]
                record['pname'] = (before.replace('被告人', '')
                                   .replace('被告', ''))
            return True
        # No pattern matched: the row is still stored, without parties.
        return True

    rows = text.xpath('//span[@class="detail_content"]//tr')[1:]
    for row in rows:
        record = d.copy()
        record['body'] = (row.xpath('string(.)')
                          .replace('\r', '').replace('\n', ''))
        record['caseNo'] = row.xpath('./td[2]/span/text()')[0]
        record['judge'] = row.xpath('./td[5]/span/text()')[0]
        record['courtNum'] = row.xpath('./td[6]/span/text()')[0]
        # Keep only the part before the first space.
        record['sorttime'] = (
            row.xpath('./td[7]/span/text()')[0].split(' ')[0])

        party_cell = row.xpath('./td[3]/span/text()')[0]
        if not fill_parties(record, party_cell):
            # Skip just this row; returning here would silently discard
            # every remaining row of the table.
            continue

        record['md5'] = ktgg.get_md5(record['body'], record['url'])
        ktgg.ins_mysql(record, 'ktgg_kt_wuhan', db, cursor)
def parse_text(self, text, d, db, cursor):
    """Extract hearing fields from a single notice and store ``d``.

    Args:
        text: Notice text; every fragment listed in ``self.tihuan`` is
            removed before parsing.
        d: Record dict; ``url`` is read, extracted fields are written.
        db: Passed through unchanged to ``ktgg.ins_mysql``.
        cursor: Passed through unchanged to ``ktgg.ins_mysql``.

    Returns:
        None. Nothing is stored when a two-group party pattern matches
        but ``ktgg.search_anyou`` reports a start of 0.
    """
    for fragment in self.tihuan:
        text = text.replace(fragment, '')
    d['body'] = text

    # Hearing time.
    dates = re.findall(r'\d{1,4}[年月].*?[日号]', text)
    if dates:
        d['sorttime'] = dates[0]

    # Courtroom: prefer '在…庭', fall back to '第…庭'.
    rooms = (re.findall(r'在(.{2,7}庭)', text)
             or re.findall(r'第.{1,4}庭', text))
    if rooms:
        d['courtNum'] = rooms[0]

    # Plaintiff, defendant and case cause; the first matching pattern wins.
    for pattern in self.party:
        found = re.findall(pattern, d['body'])
        if not found:
            continue
        party = found[0]
        anyou = ktgg.set_anyou()
        if isinstance(party, str):
            start, end = ktgg.search_anyou(anyou, party)
            if start == 0:
                # Logged, but parsing goes on as in the original.
                # NOTE(review): the tuple branch returns here instead -
                # confirm that continuing is intended.
                ktgg.write_txt('anyou', text)
            d['anyou'] = party[start:end]
            # NOTE(review): replace('人', '') removes every '人', including
            # those inside a name; kept because the patterns in self.party
            # are not visible here.
            d['pname'] = re.findall(
                '(.*?)%s' % re.escape(d['anyou']), party)[0].replace('人', '')
        elif isinstance(party, tuple):
            start, end = ktgg.search_anyou(anyou, party[1])
            if start == 0:
                return
            d['anyou'] = party[1][start:end]
            d['plaintiff'] = party[0].replace('人', '')
            d['pname'] = re.findall(
                '(.*?)%s' % re.escape(d['anyou']),
                party[1])[0].replace('人', '')
        break

    d['md5'] = ktgg.get_md5(d['body'], d['url'])
    ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
def parse_text(self, d, db, cursor):
    """Extract hearing fields from ``d['title']`` and store ``d``.

    The title doubles as the record body. After a successful store the
    method sleeps for half a second.

    Args:
        d: Record dict; ``title`` and ``url`` are read, extracted fields
            are written.
        db: Passed through unchanged to ``ktgg.ins_mysql``.
        cursor: Passed through unchanged to ``ktgg.ins_mysql``.

    Returns:
        None. Nothing is stored (and no sleep happens) when a party
        pattern matches but ``ktgg.search_anyou`` reports a start of 0.
    """
    title = d['title']
    d['body'] = title

    # Hearing time.
    dates = re.findall(r'\d{1,4}[年月].*?[日号]', title)
    if dates:
        d['sorttime'] = dates[0]

    # Courtroom.
    rooms = re.findall(r'第.{1,4}庭', title)
    if rooms:
        d['courtNum'] = rooms[0]

    # Case cause, defendant and plaintiff; the first matching pattern wins.
    for pattern in self.party:
        found = re.findall(pattern, title)
        if not found:
            continue
        party = found[0]
        anyou = ktgg.set_anyou()
        if isinstance(party, str):
            start, end = ktgg.search_anyou(anyou, party)
            if start == 0:
                return
            d['anyou'] = party[start:end]
            # The cause is literal text, so escape it before building a regex.
            before = re.findall(
                '(.*?)%s' % re.escape(d['anyou']), party)[0]
            d['pname'] = before.replace('被告人', '').replace('被告', '')
        elif isinstance(party, tuple):
            start, end = ktgg.search_anyou(anyou, party[1])
            if start == 0:
                return
            d['anyou'] = party[1][start:end]
            d['plaintiff'] = party[0].replace('原告人', '').replace('原告', '')
            before = re.findall(
                '(.*?)%s' % re.escape(d['anyou']), party[1])[0]
            d['pname'] = before.replace('被告人', '').replace('被告', '')
        break

    d['md5'] = ktgg.get_md5(d['body'], d['url'])
    ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
    time.sleep(0.5)
def parse_text(self, text, d, db, cursor):
    """Store one record per non-empty line of *text*.

    Args:
        text: Notice text with one hearing per line.
        d: Template record; every line gets its own copy.
        db: Passed through unchanged to ``ktgg.ins_mysql``.
        cursor: Passed through unchanged to ``ktgg.ins_mysql``.
    """
    for line in text.split('\n'):
        if not line:
            continue
        record = d.copy()
        record['body'] = line.replace('\xa0', '').replace('\r', '')

        # Hearing time.
        dates = re.findall(r'\d{1,4}[年月].*?[日号]', line)
        if dates:
            record['sorttime'] = dates[0]

        # Courtroom.
        rooms = re.findall(r'在(.*?庭)', line)
        if rooms:
            record['courtNum'] = rooms[0]

        # Case cause, plaintiff and defendant; the first matching pattern
        # wins. Index 0 of the match is the plaintiff part, index 1 the
        # defendant part.
        for pattern in self.party:
            found = re.findall(pattern, line)
            if not found:
                continue
            match = found[0]
            record['plaintiff'] = (match[0].replace('原告', '')
                                   .replace('人', ''))
            anyou = ktgg.set_anyou()
            start, end = ktgg.search_anyou(anyou, match[1])
            record['anyou'] = match[1][start:end]
            # The cause is literal text, so escape it before building a regex.
            before = re.findall(
                '(.*?)%s' % re.escape(record['anyou']), match[1])
            if before:
                record['pname'] = (before[0].replace('被告', '')
                                   .replace('人', ''))
            break
        else:
            # NOTE(review): the collapsed source reads
            # "break else: continue"; it is taken as a for/else, i.e.
            # lines where no party pattern matches are not stored.
            # Confirm against the original indentation.
            continue

        # The md5 is computed once here; the original also computed it
        # before the party loop and overwrote it with the same value.
        record['md5'] = ktgg.get_md5(record['body'], record['url'])
        ktgg.ins_mysql(record, 'ktgg_kt_wuhan', db, cursor)
def parse_text(self, text, d, db, cursor):
    """Store one record per hearing found in whitespace-separated lines.

    Two line layouts are handled:

    * a line with six or more fields is a complete hearing;
    * shorter non-empty lines are collected in pairs, and each pair is
      joined and parsed as one hearing.

    A hearing is stored only when ``ktgg.search_anyou`` locates a case
    cause (non-zero start).

    Args:
        text: Notice text, one or two lines per hearing.
        d: Template record; every line gets its own copy.
        db: Passed through unchanged to ``ktgg.ins_mysql``.
        cursor: Passed through unchanged to ``ktgg.ins_mysql``.
    """

    def defendant_before(cause, fragment):
        """Return the text of *fragment* preceding *cause*, or ''."""
        found = re.findall('(.*?)%s' % re.escape(cause), fragment)
        return found[0] if found else ''

    def fill_parties(record, party, strip_case_no):
        """Fill plaintiff/pname from a '<plaintiff>诉<defendant><cause>' field."""
        if '诉' in party:
            pieces = party.split('诉')
            plaintiff = pieces[0]
            if strip_case_no and '号' in plaintiff:
                # The case number may be glued in front of the plaintiff;
                # keep only what follows the last '号'.
                plaintiff = plaintiff.split('号')[-1]
            record['plaintiff'] = plaintiff
            # Store a string: the original assigned the whole findall()
            # list here.
            record['pname'] = defendant_before(record['anyou'], pieces[1])
        else:
            record['pname'] = defendant_before(record['anyou'], party)

    def parse_full_row(record, fields):
        """Parse the six-or-more-field layout; False when unusable."""
        record['sorttime'] = ''
        record['caseNo'] = ''
        record['body'] = ''.join(fields)
        start = 0
        for pos, field in enumerate(fields):
            # Case number, then case cause and parties from the next field.
            if '号' in field and record['caseNo'] == '':
                record['caseNo'] = field
                if pos + 1 >= len(fields):
                    return False
                party = fields[pos + 1]
                anyou = ktgg.set_anyou()
                start, end = ktgg.search_anyou(anyou, party)
                if start == 0:
                    return False
                record['anyou'] = party[start:end]
                fill_parties(record, party, strip_case_no=False)
            # Hearing date; the courtroom is the field just before it.
            if record['sorttime'] == '':
                dates = re.findall(r'\d{4}-\d{2}-\d{2}', field)
                if dates:
                    record['sorttime'] = dates[0]
                    if pos > 0:
                        record['courtNum'] = fields[pos - 1]
        return start != 0

    def parse_joined_rows(record, fields):
        """Parse two short lines joined together; False when unusable."""
        body = ''.join(fields)
        record['body'] = body

        dates = re.findall(r'\d{4}年.*?日', body)
        if dates:
            record['sorttime'] = dates[0]

        rooms = re.findall(r'第.{2,6}庭|回龙法庭', body)
        if rooms:
            record['courtNum'] = rooms[0]

        # ASCII and full-width opening brackets are both accepted.
        case_numbers = re.findall(r'[((民].*?号', body)
        if case_numbers:
            record['caseNo'] = case_numbers[0]

        start = 0
        for field in fields:
            if '诉' not in field:
                continue
            anyou = ktgg.set_anyou()
            start, end = ktgg.search_anyou(anyou, field)
            if start == 0:
                break
            record['anyou'] = field[start:end]
            fill_parties(record, field, strip_case_no=True)
        return start != 0

    pending = []  # short lines waiting for their second half
    for line in text.split('\n'):
        record = d.copy()
        fields = [part for part in re.split(r'\s', line) if part]

        if len(fields) >= 6:
            usable = parse_full_row(record, fields)
        elif fields:
            pending.append(fields)
            if len(pending) < 2:
                continue
            fields = pending[0] + pending[1]
            pending = []
            usable = parse_joined_rows(record, fields)
        else:
            continue

        if not usable:
            continue
        record['md5'] = ktgg.get_md5(record['body'], record['url'])
        ktgg.ins_mysql(record, 'ktgg_kt_wuhan', db, cursor)
def parse_text(self, text, html, d, db, cursor):
    """Store hearing records from a table page or from a text notice.

    When the title contains ``'开庭公告'`` every row of ``//tbody/tr``
    (except the first) becomes one record; otherwise *text* is parsed as
    a single notice.

    Args:
        text: Plain text of the notice (used by the single-notice layout).
        html: Parsed document exposing ``xpath`` (used by the table layout).
        d: Record dict / template; ``title``, ``url`` and, for the
            single-notice layout, ``posttime`` are read.
        db: Passed through unchanged to ``ktgg.ins_mysql``.
        cursor: Passed through unchanged to ``ktgg.ins_mysql``.
    """

    def strip_all(value, fragments):
        """Remove every occurrence of each fragment from *value*."""
        for fragment in fragments:
            value = value.replace(fragment, '')
        return value

    def fill_parties(record, source, extra):
        """Fill anyou/plaintiff/pname of *record* from *source*.

        *extra* lists additional fragments removed from the lone
        defendant and from the plaintiff. Returns False when no pattern
        matches or no case cause is located.
        """
        for pattern in self.party:
            found = re.findall(pattern, source)
            if not found:
                continue
            match = found[0]
            anyou = ktgg.set_anyou()
            if isinstance(match, str):
                start, end = ktgg.search_anyou(anyou, match)
                if start == 0:
                    return False
                record['anyou'] = match[start:end]
                before = re.findall(
                    '(.*?)%s' % re.escape(record['anyou']), match)[0]
                record['pname'] = strip_all(
                    before, ('被告人', '被告') + extra)
            elif isinstance(match, tuple):
                start, end = ktgg.search_anyou(anyou, match[1])
                if start == 0:
                    return False
                record['anyou'] = match[1][start:end]
                record['plaintiff'] = strip_all(
                    match[0], ('原告人', '原告') + extra)
                before = re.findall(
                    '(.*?)%s' % re.escape(record['anyou']), match[1])[0]
                record['pname'] = strip_all(before, ('被告人', '被告'))
            return True
        return False

    if '开庭公告' in d['title']:
        for row in html.xpath('//tbody/tr')[1:]:
            record = d.copy()
            record['body'] = (row.xpath('string(.)')
                              .replace('\r', '').replace('\n', ''))
            # Four-column rows carry one leading column more than the
            # others, so every index shifts by one.
            shift = 1 if len(row.xpath('./td')) == 4 else 0
            record['sorttime'] = (row.xpath('./td[%d]' % (3 + shift))[0]
                                  .xpath('string(.)').split(' ')[0])
            record['courtNum'] = (row.xpath('./td[%d]' % (2 + shift))[0]
                                  .xpath('string(.)'))
            party_cell = (row.xpath('./td[%d]' % (1 + shift))[0]
                          .xpath('string(.)'))

            if not fill_parties(record, party_cell, ()):
                # Skip only this row; returning here would discard every
                # remaining row of the table.
                continue
            record['md5'] = ktgg.get_md5(record['body'], record['url'])
            ktgg.ins_mysql(record, 'ktgg_kt_wuhan', db, cursor)
    else:
        d['body'] = text

        # The title carries month and day only; the year is taken from
        # the text before the first '-' of posttime.
        dates = re.findall(r'\d{1,2}月.*?日', d['title'])
        if dates:
            d['sorttime'] = d['posttime'].split('-')[0] + '年' + dates[0]

        # Courtroom.
        rooms = re.findall(r'在(.{2,5}庭)', d['body'])
        if rooms:
            d['courtNum'] = rooms[0]

        # NOTE(review): '审' and '理' are removed as single characters,
        # which also alters names containing them - confirm intent.
        if not fill_parties(d, d['body'], ('审', '理')):
            return
        d['md5'] = ktgg.get_md5(d['body'], d['url'])
        ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
def parse_text(self, text, d, db, cursor):
    """Store one record per sufficiently long fragment in *text*.

    Args:
        text: Sequence of text fragments. When the first fragment ends
            with ``'审理'`` it is joined with the second one and only
            that joined fragment is parsed.
        d: Record dict, reused (and reset) for every fragment.
        db: Passed through unchanged to ``ktgg.ins_mysql``.
        cursor: Passed through unchanged to ``ktgg.ins_mysql``.
    """
    if not text:
        return
    if text[0][-2:] == '审理' and len(text) > 1:
        # NOTE(review): fragments after the second one are dropped here,
        # as in the original - confirm that this is intended.
        text = [text[0] + text[1]]

    for fragment in text:
        # Fragments of 30 characters or fewer are ignored.
        if len(fragment) <= 30:
            continue

        d['body'] = fragment
        d['sorttime'] = ''
        d['courtNum'] = ''
        d['anyou'] = ''
        d['pname'] = ''
        d['plaintiff'] = ''

        # Hearing time.
        dates = re.findall(r'\d{2,4}[年月].*?[日号]', fragment)
        if dates:
            d['sorttime'] = dates[0]

        # Courtroom(s).
        rooms = re.findall(r'第.*?庭', fragment)
        if rooms:
            d['courtNum'] = ','.join(rooms)

        # Clauses that hold a defendant and a case cause; ASCII and
        # full-width commas both end a clause.
        clauses = re.findall(r'被告.*?[罪案,,。]', fragment)
        if not clauses:
            continue

        # Case cause per clause: the longest known cause contained in it,
        # or '' when none is.
        known_causes = ktgg.set_anyou()
        causes = []
        for clause in clauses:
            hits = [cause for cause in known_causes if cause in clause]
            causes.append(max(hits, key=len) if hits else '')
        d['anyou'] = ','.join(causes)

        # Defendant per clause that has a cause.
        defendants = []
        for clause, cause in zip(clauses, causes):
            if cause == '':
                continue
            # The cause is literal text, so escape it before building a regex.
            name = re.findall('被告(.*?)%s' % re.escape(cause), clause)[0]
            for fragment_to_drop in self.p:
                name = name.replace(fragment_to_drop, '')
            defendants.append(name)
        d['pname'] = ','.join(defendants)

        # Plaintiff.
        plaintiffs = re.findall(r'原告(.*?)[诉与]', fragment)
        if plaintiffs:
            d['plaintiff'] = plaintiffs[0]

        d['md5'] = ktgg.get_md5(fragment, d['url'])
        ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
def parse_text(self, text, d, db, cursor):
    """Store one record per data line of a whitespace-separated listing.

    Lines that are empty, contain ``'案由'``, or contain no known case
    cause are skipped.

    Args:
        text: Listing text, one hearing per line.
        d: Record dict, reused for every line; all extracted fields are
            reset per line.
        db: Passed through unchanged to ``ktgg.ins_mysql``.
        cursor: Passed through unchanged to ``ktgg.ins_mysql``.
    """

    def clean(value, label):
        """Drop *label*, spaces and ASCII/full-width colons from *value*."""
        return (value.replace(label, '').replace(' ', '')
                .replace(':', '').replace(':', ''))

    for line in text.split('\n'):
        if '案由' in line or not line:
            continue

        # Fields are split before the replacements below are applied.
        fields = re.split(r'\s', line)

        for fragment in self.tihuan:
            line = line.replace(fragment, '')
        d['body'] = line

        # Case number.
        case_numbers = re.findall(r'[民((].*?号', line)
        d['caseNo'] = case_numbers[0] if case_numbers else ''

        # Case cause: the first field that contains a known cause.
        # NOTE(review): the collapsed source reads
        # "if b == 1: break else: continue"; it is taken as a for/else,
        # i.e. lines without a cause are skipped.
        known_causes = ktgg.set_anyou()
        cause_field = next(
            (field for field in fields
             if any(cause in field for cause in known_causes)),
            None)
        if cause_field is None:
            continue
        d['anyou'] = cause_field

        # Courtroom: the first field that contains '第…庭'.
        d['courtNum'] = ''
        for field in fields:
            rooms = re.findall(r'第.*?庭', field)
            if rooms:
                d['courtNum'] = rooms[0]
                break

        # Hearing time: the last field containing any marker of self.t.
        time_field = ''
        for field in reversed(fields):
            if any(marker in field for marker in self.t):
                time_field = field
                break

        # Normalise the time with the first matching pattern of self.re.
        d['sorttime'] = ''
        for pattern in self.re:
            found = re.findall(pattern, time_field)
            if found:
                d['sorttime'] = found[0]
                break

        # Rewrite 'y.m.d', 'm.d', 'y/m/d' and 'm/d' as Chinese dates. The
        # extracted date is split for both separators; the original split
        # the raw field in the '/' case.
        separator = next(
            (sep for sep in ('.', '/') if sep in d['sorttime']), None)
        if separator is not None:
            parts = d['sorttime'].split(separator)
            if len(parts) == 2:
                d['sorttime'] = parts[0] + '月' + parts[1] + '日'
            else:
                d['sorttime'] = (parts[0] + '年' + parts[1] + '月'
                                 + parts[2] + '日')

        # Parties: the first non-empty field after the case-cause field,
        # or '' when the cause is the last field.
        pos = fields.index(cause_field) + 1
        while pos < len(fields) and fields[pos] == '':
            pos += 1
        parties = fields[pos] if pos < len(fields) else ''

        d['party'] = ''
        d['plaintiff'] = ''
        d['pname'] = ''
        # NOTE(review): the original tested the same semicolon twice; the
        # second test is taken to have meant the full-width one.
        for separator in (';', ';', '诉'):
            if separator in parties:
                pieces = parties.split(separator)
                d['plaintiff'] = clean(pieces[0], '原告')
                d['pname'] = clean(pieces[1], '被告')
                break
        else:
            d['party'] = parties

        d['md5'] = ktgg.get_md5(line, d['url'])
        ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)