def _parse_articles(raw):
    """Turn the stored ``articles`` value into a list of paragraphs.

    Args:
        raw: Value of the ``articles`` field. Expected to be the textual
            form of a Python list literal; an already-parsed list/tuple
            and UTF-8 bytes are accepted as well. Any falsy value yields
            an empty list.

    Returns:
        The parsed paragraphs (whatever the literal evaluates to), or
        ``[]`` when ``raw`` is falsy.

    Raises:
        ValueError, SyntaxError: If the text is not a valid Python literal.
    """
    import ast  # function-scope import keeps this block self-contained

    if not raw:
        return []
    if isinstance(raw, (list, tuple)):
        return list(raw)
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    # SECURITY: this field originally went through eval(), which executes
    # arbitrary expressions found in the input. literal_eval only accepts
    # plain literals (lists, strings, numbers, ...), so a crafted value can
    # no longer run code; it raises instead.
    return ast.literal_eval(raw)


def mining(items):
    """Build a judgment-document record from one raw row.

    Args:
        items: Mapping-like row, read only through ``.get``. Keys consulted:
            ``articles``, ``title``, ``case_no``, ``publish_date``,
            ``court_name``, ``source``, ``org_url``, ``type``,
            ``trial_round``, ``content_type``, ``reason``, ``trial_date``.

    Returns:
        dict: Always contains ``case_no``, ``publish_date``, ``court_name``,
        ``source``, ``title``, ``update_time``, ``org_url``, ``type``,
        ``trial_round``, ``content_type`` and ``instrument_id``. When the row
        has a non-empty ``articles`` value it additionally contains
        ``litigants``, ``agents``, ``court_officers``, ``court_level``,
        ``content``, ``reasons``, ``verdict``, ``trial_date``,
        ``litigation_request`` and ``claim``.

    Raises:
        ValueError, SyntaxError: If ``articles`` is not a valid literal.
    """
    articles = _parse_articles(items.get('articles'))
    article = '\n'.join(articles)
    title = items.get('title')
    content_type = items.get('content_type')
    reason = items.get('reason')
    trial_date = items.get('trial_date')

    obj = {}
    # Fields every record must carry.
    obj['case_no'] = items.get('case_no', '')
    obj['publish_date'] = items.get('publish_date')
    obj['court_name'] = items.get('court_name', '')
    obj['source'] = items.get('source')
    obj['title'] = items.get('title', '')
    obj['update_time'] = update_time()
    obj['org_url'] = items.get('org_url')

    # Fields that may be missing from the row; fall back to extracting them
    # from the title.
    obj['type'] = items.get('type') or type_extract(title)
    obj['trial_round'] = items.get('trial_round') or trial_round_extract(
        title)
    if content_type:
        obj['content_type'] = content_type_extract(content_type=content_type)
    else:
        obj['content_type'] = None

    if articles:
        ws = WenshuBase(article)
        litigants, agents = litigants_agent_extract(
            '\n'.join(ws.role_paragraph))
        court_officers = court_extract('\n'.join(ws.court_paragraph))
        litigation_request, claim = claim_extract(ws.claims_paragraphs)
        if not trial_date:
            trial_date = trial_date_extract(ws.court_paragraph)
        court_level = court_level_extract(obj.get('court_name'))

        obj['litigants'] = litigants
        obj['agents'] = agents
        obj['court_officers'] = court_officers
        obj['court_level'] = court_level
        obj['content'] = '<br>'.join(articles)
        # NOTE(review): when the row supplies content_type, the raw value
        # replaces the one produced by content_type_extract(content_type=...)
        # above. Kept as in the original; confirm this is intended.
        if content_type:
            obj['content_type'] = content_type
        else:
            obj['content_type'] = content_type_extract(
                verdict=ws.verdict_paragraph, title=obj.get('title'))
        obj['reasons'] = reason_extract(ws.reason_description,
                                        obj.get('title'), obj.get('type'),
                                        reason)
        obj['verdict'] = ws.verdict
        obj['trial_date'] = trial_date
        obj['litigation_request'] = litigation_request
        obj['claim'] = claim

    # NOTE(review): the id is computed for every record, with or without
    # articles; confirm it was not meant to be limited to rows with articles.
    obj['instrument_id'] = get_md5(obj.get('title')) + get_md5(
        pbracket(obj.get('case_no')))
    return obj
def process_ws(items):
    """Parse one Heilongjiang judgment row and write the result to the store.

    The row is split by ``heilongjiang_list`` (metadata: ``title``,
    ``court_name``) and ``heilongjiang_article`` (a sequence whose element 4
    is used as the case number and whose elements from index 5 onward are
    used as the document body). The assembled record is always written to
    ``judge_doc/local_doc``; it is written to ``judge_doc/total_doc`` only
    when ``is_exists`` reports a document with the same ``case_no`` there.

    Args:
        items: Raw row handed to ``heilongjiang_list`` and
            ``heilongjiang_article``.

    Returns:
        None. The function is used for its writes via ``ines``.
    """
    # Both target paths share this prefix (presumably an Elasticsearch
    # endpoint, given port 9200 -- confirm).
    base_url = 'http://10.1.1.28:9200/judge_doc'
    local_path = base_url + '/local_doc'
    total_path = base_url + '/total_doc'

    listing = heilongjiang_list(items)
    paragraphs = heilongjiang_article(items)
    title = listing.get('title')
    court_name = listing.get('court_name')
    body = paragraphs[5:]

    ws = WenshuBase('\n'.join(body))
    litigants, agents = litigants_agent_extract('\n'.join(ws.role_paragraph))
    court_officers = court_extract('\n'.join(ws.court_paragraph))
    # Named trial_type rather than `type` so the builtin is not shadowed.
    trial_type = type_extract(title)
    reasons = reason_extract(reason_description=ws.reason_description,
                             title=title,
                             trial_type=trial_type)
    trial_date = trial_date_extract(''.join(ws.court_paragraph))
    court_level = court_level_extract(court_name)
    trial_round = trial_round_extract(title)
    content_type = content_type_extract(ws.verdict_paragraph, title)
    case_no = paragraphs[4]

    # Claims are extracted only for first-instance judgments that have at
    # least one reason whose level-2 code is 104 or 105. One matching reason
    # is enough, so the extraction runs at most once instead of once per
    # matching reason.
    claim = ''
    if content_type == '判决书' and trial_round == '一审':
        if any(r['reason_code_level2'] in (104, 105) for r in reasons):
            # NOTE(review): `mining` in this file unpacks claim_extract()
            # into (litigation_request, claim); here the whole return value
            # is stored under 'claim'. Left unchanged -- confirm which shape
            # is intended.
            claim = claim_extract(ws.claims_paragraphs)

    instrument_id = get_md5(title) + get_md5(pbracket(case_no))
    obj = {
        'case_no': pbracket(case_no),
        'reasons': reasons,
        'source': '黑龙江市高级人民法院',
        'type': trial_type,
        'title': pbracket(title),
        'content': '<br>'.join(body),
        'agents': agents,
        'update_time': update_time(),
        'litigants': litigants,
        'content_type': content_type,
        'trial_round': trial_round,
        'court_level': court_level,
        'verdict': ws.verdict,
        'trial_date': trial_date,
        'court_officers': court_officers,
        'court_name': court_name,
        'claim': claim,
        'operator': 'leifeng',
        'instrument_id': instrument_id,
    }

    ines(id=instrument_id, path=local_path, data=obj)
    # Only refresh the aggregate index when it already holds this case number.
    if is_exists(url=total_path, field='case_no', value=obj['case_no']):
        ines(id=instrument_id, path=total_path, data=obj)
"select * from judge_doc_qinghai limit 1".format()): print(items['id']) try: l = qinghai_list(items) a = qinghai_article(items) ws = WenshuBase('\n'.join(a[3:])) litigants, agents = litigants_agent_extract('\n'.join( ws.role_paragraph)) court_officers = court_extract('\n'.join(ws.court_paragraph)) type = type_extract(l.get('title')) reasons = reason_extract( reason_description=ws.reason_description, title=l.get('title'), trial_type=type) trial_date = trial_date_extract(''.join(ws.court_paragraph)) court_level = court_level_extract(l.get('court_name')) trial_round = trial_round_extract(l.get('title')) content_type = content_type_extract(ws.verdict_paragraph, l.get('title')) case_no = l.get('case_no') claim = '' if content_type == '判决书' and trial_round == '一审': for reason in reasons: if reason['reason_code_level2'] == 104 or reason[ 'reason_code_level2'] == 105: claim = claim_extract(ws.claims_paragraphs) obj = { 'case_no': pbracket(l.get('case_no')), 'reasons':