def mining(items):
    """Build a judgment-document record from a raw ``items`` mapping.

    Values present in ``items`` are copied across. ``type``, ``trial_round``
    and ``content_type`` fall back to the extractor helpers when the mapping
    has no truthy value for them. When ``items['articles']`` yields a
    non-empty sequence of paragraphs, the joined text is parsed with
    ``WenshuBase`` and the derived fields (litigants, agents, court officers,
    reasons, verdict, claim, ...) are added to the record.

    NOTE(review): the original text was whitespace-collapsed; this layout
    assumes the ``instrument_id`` assignment and the ``return`` come after
    the ``if articles`` body -- confirm against the upstream file.

    :param items: mapping-like row exposing ``.get``.
    :return: dict describing the document.
    """
    raw_articles = items.get('articles')
    # NOTE(review): eval() executes whatever expression is stored in
    # 'articles'. If this value can come from an untrusted source, consider
    # ast.literal_eval or json.loads instead.
    articles = eval(raw_articles) if raw_articles else []
    article = '\n'.join(articles)
    title = items.get('title')

    obj = {
        # Fields that must always be present.
        'case_no': items.get('case_no', ''),
        'publish_date': items.get('publish_date'),
        'court_name': items.get('court_name', ''),
        'source': items.get('source'),
        'title': items.get('title', ''),
        'update_time': update_time(),
        'org_url': items.get('org_url'),
        # Fields that may be absent; derive them from the title if so.
        'type': items.get('type') or type_extract(title),
        'trial_round': items.get('trial_round') or trial_round_extract(title),
    }

    content_type = items.get('content_type')
    if content_type:
        obj['content_type'] = content_type_extract(content_type=content_type)
    else:
        obj['content_type'] = None

    reason = items.get('reason')
    trial_date = items.get('trial_date')

    if articles:
        ws = WenshuBase(article)
        role_text = '\n'.join(ws.role_paragraph)
        litigants, agents = litigants_agent_extract(role_text)
        court_text = '\n'.join(ws.court_paragraph)
        court_officers = court_extract(court_text)
        litigation_request, claim = claim_extract(ws.claims_paragraphs)
        if not trial_date:
            trial_date = trial_date_extract(ws.court_paragraph)
        court_level = court_level_extract(obj.get('court_name'))

        # The dict literal is evaluated top to bottom, matching the order of
        # the original one-assignment-per-key sequence.
        obj.update({
            'litigants': litigants,
            'agents': agents,
            'court_officers': court_officers,
            'court_level': court_level,
            'content': '<br>'.join(articles),
            # The raw value from ``items`` wins over the extracted one here.
            'content_type': content_type or content_type_extract(
                verdict=ws.verdict_paragraph, title=obj.get('title')),
            'reasons': reason_extract(ws.reason_description,
                                      obj.get('title'),
                                      obj.get('type'),
                                      reason),
            'verdict': ws.verdict,
            'trial_date': trial_date,
            'litigation_request': litigation_request,
            'claim': claim,
        })

    title_digest = get_md5(obj.get('title'))
    case_digest = get_md5(pbracket(obj.get('case_no')))
    obj['instrument_id'] = title_digest + case_digest
    return obj
def process_ws(items):
    """Parse one Heilongjiang court row and write the resulting document.

    The row is split by ``heilongjiang_list`` (metadata) and
    ``heilongjiang_article`` (paragraph sequence). As used below, element 4
    of the paragraph sequence is the case number and the document body
    starts at element 5. The body is parsed with ``WenshuBase`` and the
    extractor helpers, then the record is written with ``ines`` to the
    ``local_doc`` endpoint and, when ``is_exists`` reports a matching
    ``case_no`` under ``total_doc``, to ``total_doc`` as well.

    :param items: raw row accepted by the ``heilongjiang_*`` helpers.
    :return: None; the function is used for its write side effects.
    """
    listing = heilongjiang_list(items)
    paragraphs = heilongjiang_article(items)
    title = listing.get('title')
    court_name = listing.get('court_name')
    body = paragraphs[5:]

    ws = WenshuBase('\n'.join(body))
    litigants, agents = litigants_agent_extract('\n'.join(ws.role_paragraph))
    court_officers = court_extract('\n'.join(ws.court_paragraph))
    # Named trial_type (not ``type``) so the builtin is not shadowed.
    trial_type = type_extract(title)
    reasons = reason_extract(reason_description=ws.reason_description,
                             title=title,
                             trial_type=trial_type)
    trial_date = trial_date_extract(''.join(ws.court_paragraph))
    court_level = court_level_extract(court_name)
    trial_round = trial_round_extract(title)
    content_type = content_type_extract(ws.verdict_paragraph, title)
    case_no = pbracket(paragraphs[4])

    # Claims are only extracted for first-instance judgments that have at
    # least one reason with level-2 code 104 or 105. The extraction does not
    # depend on which reason matched, so it runs once instead of once per
    # matching reason.
    claim = ''
    if content_type == '判决书' and trial_round == '一审':
        if any(r['reason_code_level2'] in (104, 105) for r in reasons):
            claim = claim_extract(ws.claims_paragraphs)

    obj = {
        'case_no': case_no,
        'reasons': reasons,
        'source': '黑龙江市高级人民法院',
        'type': trial_type,
        'title': pbracket(title),
        'content': '<br>'.join(body),
        'agents': agents,
        'update_time': update_time(),
        'litigants': litigants,
        'content_type': content_type,
        'trial_round': trial_round,
        'court_level': court_level,
        'verdict': ws.verdict,
        'trial_date': trial_date,
        'court_officers': court_officers,
        'court_name': court_name,
        'claim': claim,
        'operator': 'leifeng',
        'instrument_id': get_md5(title) + get_md5(case_no),
    }

    # Single definition of the endpoints instead of three repeated literals.
    local_doc = 'http://10.1.1.28:9200/judge_doc/local_doc'
    total_doc = 'http://10.1.1.28:9200/judge_doc/total_doc'

    ines(id=obj['instrument_id'], path=local_doc, data=obj)
    # total_doc is only written when a document with this case_no is
    # reported as existing there.
    if is_exists(url=total_doc, field='case_no', value=obj['case_no']):
        ines(id=obj['instrument_id'], path=total_doc, data=obj)
def process_ws(items):
    """Build a document record for one row whose source is '裁判文书网'.

    List-level metadata from ``cpws_list`` is always copied into the record.
    When ``cpws_article`` returns the sentinel '无全文' the record is marked
    ``has_content = False``. When it returns any other non-empty text, the
    text is parsed with ``WenshuBase`` and the extracted fields are added.
    Extraction is best-effort: a failure is logged and the record built so
    far is still returned.

    :param items: raw row; must support ``items['detail_response']`` when
        full text is available.
    :return: dict describing the document (possibly only the list-level
        fields).
    """
    import logging

    article = cpws_article(items)
    cpws_lists = cpws_list(items)
    obj = {
        'instrument_id': cpws_lists.get('instrument_id'),
        'court_name': cpws_lists.get('court_name'),
        'type': cpws_lists.get('trial_type'),
        'trial_round': cpws_lists.get('trial_round'),
        'trial_date': cpws_lists.get('trial_date'),
        'title': cpws_lists.get('title'),
        'case_no': cpws_lists.get('case_no'),
        'source': '裁判文书网',
        'update_time': update_time(),
        'operator': 'leifeng',
    }

    if article == '无全文':
        obj['has_content'] = False
        return obj
    if not article:
        return obj

    try:
        ws = WenshuBase(article)
        litigants, agents = litigants_agent_extract(
            '\n'.join(ws.role_paragraph))
        court_officers = court_extract('\n'.join(ws.court_paragraph))
        claim = claim_extract(ws.claims_paragraphs)

        obj['litigants'] = litigants
        # Fixed: a trailing comma here used to store the 1-tuple (agents,).
        obj['agents'] = agents
        obj['court_officers'] = court_officers
        obj['court_level'] = court_level_extract(cpws_lists.get('court_name'))

        detail = items['detail_response']
        # NOTE(review): eval() executes whatever expression the response
        # holds. If the payload is JSON or a plain literal, json.loads or
        # ast.literal_eval would be safer -- confirm the payload format
        # before switching.
        obj['publish_date'] = eval(detail)['PubDate'] if '{' in detail else ''

        obj['content'] = ws.article
        obj['content_type'] = content_type_extract(
            verdict=ws.verdict_paragraph, title=cpws_lists.get('title'))
        obj['reasons'] = reason_extract(ws.reason_description,
                                        cpws_lists.get('title'),
                                        cpws_lists.get('trial_type'))
        obj['verdict'] = ws.verdict
        obj['claim'] = claim
    except Exception:
        # Still best-effort (the partially filled record is returned), but
        # the failure is no longer silent and KeyboardInterrupt / SystemExit
        # are no longer swallowed.
        logging.getLogger(__name__).exception(
            'process_ws: field extraction failed for instrument_id=%s',
            obj.get('instrument_id'))
    return obj
ines(id=obj['instrument_id'], path='http://10.1.1.28:9200/judge_doc/total_doc', data=obj) if __name__ == '__main__': mb = MysqlBase(connector) for i in range(1): for items in mb._execute( "select * from judge_doc_qinghai limit 1".format()): print(items['id']) try: l = qinghai_list(items) a = qinghai_article(items) ws = WenshuBase('\n'.join(a[3:])) litigants, agents = litigants_agent_extract('\n'.join( ws.role_paragraph)) court_officers = court_extract('\n'.join(ws.court_paragraph)) type = type_extract(l.get('title')) reasons = reason_extract( reason_description=ws.reason_description, title=l.get('title'), trial_type=type) trial_date = trial_date_extract(''.join(ws.court_paragraph)) court_level = court_level_extract(l.get('court_name')) trial_round = trial_round_extract(l.get('title')) content_type = content_type_extract(ws.verdict_paragraph, l.get('title')) case_no = l.get('case_no') claim = ''
def suqing(article):
    """Return the claim text extracted from ``article`` as a single line.

    ``<br>`` markers are turned into newlines before the text is parsed with
    ``WenshuBase``. The pieces returned by ``claim_extract`` are concatenated
    and all newlines are removed from the result.

    :param article: document text using ``<br>`` as the paragraph separator.
    :return: the concatenated claim text without newlines.
    """
    plain_text = article.replace('<br>', '\n')
    parsed = WenshuBase(plain_text)
    pieces = claim_extract(parsed.claims_paragraphs)
    joined = ''.join(pieces)
    return joined.replace('\n', '')
def process_ws(items):
    """Parse one Shanghai court row and write the resulting document.

    Metadata comes from the ``shanghai_*`` helpers. The article text is
    parsed with ``WenshuBase`` and the extractor helpers, then the record is
    written with ``ines`` to the ``local_doc`` endpoint and, when
    ``is_exists`` reports a matching ``case_no`` under ``total_doc``, to
    ``total_doc`` as well.

    :param items: raw row accepted by the ``shanghai_*`` helpers.
    :return: None; the function is used for its write side effects.
    """
    shls = shanghai_list(items)
    trial_type = shanghai_trial_type(items)
    court_name = shanghai_court_name(items)
    article = shanghai_aricle(items)
    title = shls['title']

    ws = WenshuBase(article)
    litigants, agents = litigants_agent_extract('\n'.join(ws.role_paragraph))
    court_officers = court_extract('\n'.join(ws.court_paragraph))
    content_type = content_type_extract(ws.verdict_paragraph, title)
    reasons = reason_extract(reason_description=ws.reason_description,
                             title=title,
                             trial_type=trial_type)
    court_level = court_level_extract(court_name)

    # Claims are only extracted for first-instance judgments that have at
    # least one reason with level-2 code 104 or 105. The extraction does not
    # depend on which reason matched, so it runs once instead of once per
    # matching reason.
    claim = ''
    if content_type == '判决书' and shls['trial_round'] == '一审':
        if any(r['reason_code_level2'] in (104, 105) for r in reasons):
            claim = claim_extract(ws.claims_paragraphs)

    # Computed once; it is both stored and hashed into the instrument id.
    case_no = pbracket(shls['case_no'])

    obj = {
        'case_no': case_no,
        'reasons': reasons,
        'source': '上海市高级人民法院',
        'type': trial_type,
        'title': pbracket(title),
        # Literal newline -> <br>; no regular expression is needed.
        'content': article.replace('\n', '<br>'),
        'agents': agents,
        'update_time': update_time(),
        'litigants': litigants,
        'content_type': content_type,
        'trial_round': shls['trial_round'],
        'court_level': court_level,
        'verdict': ws.verdict,
        'trial_date': shls['trial_date'],
        'court_officers': court_officers,
        'court_name': court_name,
        'claim': claim,
        'operator': 'leifeng',
        'instrument_id': get_md5(title) + get_md5(case_no),
    }

    # Single definition of the endpoints instead of three repeated literals.
    local_doc = 'http://10.1.1.28:9200/judge_doc/local_doc'
    total_doc = 'http://10.1.1.28:9200/judge_doc/total_doc'

    ines(id=obj['instrument_id'], path=local_doc, data=obj)
    # total_doc is only written when a document with this case_no is
    # reported as existing there.
    if is_exists(url=total_doc, field='case_no', value=obj['case_no']):
        ines(id=obj['instrument_id'], path=total_doc, data=obj)