def process_ws(items):
    """Mine one crawled judgment document and index it into Elasticsearch.

    The record produced by ``mining`` is always written to
    ``judge_doc/local_doc``.  It is written to ``judge_doc/total_doc`` as
    well when ``is_exists`` reports a hit for its ``case_no`` in that index.

    Any exception raised while mining or indexing is caught: a failure
    record describing the error is printed and stored under
    ``fail_record/fail_record`` instead of the exception propagating.

    :param items: mapping with the raw crawled fields.  ``title`` and
        ``case_no`` are read here for the failure id; everything else is
        consumed by ``mining``.
    """
    local_doc = 'http://10.1.1.28:9200/judge_doc/local_doc'
    total_doc = 'http://10.1.1.28:9200/judge_doc/total_doc'
    fail_doc = 'http://10.1.1.28:9200/fail_record/fail_record'

    try:
        mined = mining(items)
        doc_id = mined['instrument_id']
        ines(id=doc_id, path=local_doc, data=mined)
        already_known = is_exists(url=total_doc,
                                  field='case_no',
                                  value=mined['case_no'])
        if already_known:
            ines(id=doc_id, path=total_doc, data=mined)
    except Exception as err:
        # Same id recipe as a successful record (title hash + case_no hash),
        # but computed from the raw input because mining may have failed.
        title_hash = get_md5(items.get('title'))
        case_hash = get_md5(pbracket(items.get('case_no', '')))
        record_id = title_hash + case_hash
        failure = {
            "_reason_": str(err),
            "data_size": len(items),
            "crawl_time": update_time(),
            "processed": False,
            "hostname": "worker1.yscredit.com",
            "data": items,
            "create_time": update_time(),
            "ip": "null",
            "_id_": record_id,
            "topic": "裁判文书"
        }
        print(failure)
        ines(id=record_id, path=fail_doc, data=failure)
def mining(items):
    """Build the structured judgment record from the raw crawled fields.

    Fields the crawler supplies are copied over; ``type`` and
    ``trial_round`` fall back to extraction from the title when the crawler
    did not provide them.  When the document has article paragraphs, the
    text is parsed with ``WenshuBase`` and the derived fields (litigants,
    agents, court officers, court level, content, reasons, verdict, trial
    date, litigation request, claim) are added.

    :param items: mapping with the raw crawled fields.  ``articles`` is
        expected to be a string that evaluates to a list of paragraphs.
    :return: dict describing the document, including ``instrument_id``.
    """
    raw_articles = items.get('articles')
    # SECURITY NOTE(review): eval() executes whatever expression the crawled
    # ``articles`` string contains.  If that string can ever come from an
    # untrusted page, switch to ast.literal_eval after confirming the stored
    # format is a plain list literal.
    paragraphs = eval(raw_articles) if raw_articles else []
    full_text = '\n'.join(paragraphs)
    title = items.get('title')

    # Mandatory fields.
    record = {
        'case_no': items.get('case_no', ''),
        'publish_date': items.get('publish_date'),
        'court_name': items.get('court_name', ''),
        'source': items.get('source'),
        'title': items.get('title', ''),
        'update_time': update_time(),
        'org_url': items.get('org_url'),
    }

    # Fields that may be absent: extract them ourselves from the title.
    record['type'] = items.get('type') or type_extract(title)
    record['trial_round'] = (items.get('trial_round')
                             or trial_round_extract(title))

    content_type = items.get('content_type')
    if content_type:
        record['content_type'] = content_type_extract(
            content_type=content_type)
    else:
        record['content_type'] = None

    reason = items.get('reason')
    trial_date = items.get('trial_date')

    if paragraphs:
        ws = WenshuBase(full_text)
        litigants, agents = litigants_agent_extract(
            '\n'.join(ws.role_paragraph))
        court_officers = court_extract('\n'.join(ws.court_paragraph))
        litigation_request, claim = claim_extract(ws.claims_paragraphs)
        trial_date = trial_date or trial_date_extract(ws.court_paragraph)
        court_level = court_level_extract(record.get('court_name'))

        # NOTE(review): when the crawler supplied a content_type, the raw
        # value written here replaces the content_type_extract() result
        # stored above - confirm that is intended.
        record.update({
            'litigants': litigants,
            'agents': agents,
            'court_officers': court_officers,
            'court_level': court_level,
            'content': '<br>'.join(paragraphs),
            'content_type': content_type if content_type
            else content_type_extract(verdict=ws.verdict_paragraph,
                                      title=record.get('title')),
            'reasons': reason_extract(ws.reason_description,
                                      record.get('title'),
                                      record.get('type'),
                                      reason),
            'verdict': ws.verdict,
            'trial_date': trial_date,
            'litigation_request': litigation_request,
            'claim': claim,
        })

    # NOTE(review): assumed to sit outside the ``if`` above; the id depends
    # only on title and case_no - confirm against the original layout.
    title_hash = get_md5(record.get('title'))
    case_hash = get_md5(pbracket(record.get('case_no')))
    record['instrument_id'] = title_hash + case_hash
    return record
def process_ws(items):
    """Parse one Heilongjiang court judgment and index it into Elasticsearch.

    ``heilongjiang_list`` supplies the listing fields (``title``,
    ``court_name``) and ``heilongjiang_article`` the article lines: element
    4 is used as the case number and elements from index 5 onward as the
    body text.  The body is parsed with ``WenshuBase`` and the extracted
    fields are assembled into one record.

    The record is always written to ``judge_doc/local_doc`` and is written
    to ``judge_doc/total_doc`` as well when ``is_exists`` reports a hit for
    its ``case_no`` in that index.

    :param items: raw crawled item, passed to the ``heilongjiang_*`` parsers.
    :raises IndexError: if the article has fewer than five lines (no case
        number at index 4).
    """
    local_doc = 'http://10.1.1.28:9200/judge_doc/local_doc'
    total_doc = 'http://10.1.1.28:9200/judge_doc/total_doc'

    listing = heilongjiang_list(items)
    lines = heilongjiang_article(items)
    title = listing.get('title')
    court_name = listing.get('court_name')
    body_lines = lines[5:]

    ws = WenshuBase('\n'.join(body_lines))
    litigants, agents = litigants_agent_extract('\n'.join(ws.role_paragraph))
    court_officers = court_extract('\n'.join(ws.court_paragraph))
    trial_type = type_extract(title)
    reasons = reason_extract(reason_description=ws.reason_description,
                             title=title,
                             trial_type=trial_type)
    trial_date = trial_date_extract(''.join(ws.court_paragraph))
    court_level = court_level_extract(court_name)
    trial_round = trial_round_extract(title)
    content_type = content_type_extract(ws.verdict_paragraph, title)
    case_no = lines[4]

    # Claims are only extracted for first-instance judgments whose reasons
    # include level-2 code 104 or 105.  The extraction input does not depend
    # on the matching reason, so run it once rather than once per match.
    claim = ''
    if content_type == '判决书' and trial_round == '一审':
        if any(r['reason_code_level2'] in (104, 105) for r in reasons):
            claim = claim_extract(ws.claims_paragraphs)

    obj = {
        'case_no': pbracket(case_no),
        'reasons': reasons,
        'source': '黑龙江市高级人民法院',
        'type': trial_type,
        'title': pbracket(title),
        'content': '<br>'.join(body_lines),
        'agents': agents,
        'update_time': update_time(),
        'litigants': litigants,
        'content_type': content_type,
        'trial_round': trial_round,
        'court_level': court_level,
        'verdict': ws.verdict,
        'trial_date': trial_date,
        'court_officers': court_officers,
        'court_name': court_name,
        'claim': claim,
        'operator': 'leifeng',
        'instrument_id': get_md5(title) + get_md5(pbracket(case_no)),
    }

    ines(id=obj['instrument_id'], path=local_doc, data=obj)
    if is_exists(url=total_doc, field='case_no', value=obj['case_no']):
        ines(id=obj['instrument_id'], path=total_doc, data=obj)
def process_ws(items):
    """Mine, tag and index one crawled judgment document.

    The record from ``mining`` is passed through ``tag`` and always written
    to ``<es_path>/judge_doc/local_doc``.  It is written to
    ``<es_path>/judge_doc/total_doc`` as well when either its ``source`` is
    ``'裁判文书网'`` or ``is_exists`` reports a hit for its ``case_no`` in
    that index.

    Any exception is caught and turned into a failure record that is
    printed and stored under ``<es_path>/fail_record/fail_record``.

    :param items: mapping with the raw crawled fields.
    """
    try:
        obj = mining(items)
        obj = tag(obj)
        local_doc = '{}/judge_doc/local_doc'.format(es_path)
        total_doc = '{}/judge_doc/total_doc'.format(es_path)

        ines(id=obj['instrument_id'], path=local_doc, data=obj)
        # Short-circuit keeps the original behaviour: the existence lookup
        # is only issued for sources other than '裁判文书网'.
        if obj['source'] == '裁判文书网' or is_exists(
                url=total_doc, field='case_no', value=obj['case_no']):
            ines(id=obj['instrument_id'], path=total_doc, data=obj)
    except Exception as e:
        # A missing title must not crash the handler itself, otherwise the
        # failure record is lost; fall back to an empty string.
        title = str(items.get('title') or '')
        record_id = get_md5(title + str(update_time()))
        failure = {
            "_reason_": str(e),
            "data_size": len(items),
            "crawl_time": update_time(),
            "processed": False,
            "hostname": "worker1.yscredit.com",
            "data": items,
            "create_time": update_time(),
            "ip": "null",
            "_id_": record_id,
            "topic": "裁判文书"
        }
        print(failure)
        ines(id=record_id,
             path='{}/fail_record/fail_record'.format(es_path),
             data=failure)
trial_round, 'court_level': court_level, 'verdict': ws.verdict, 'trial_date': trial_date, 'court_officers': court_officers, 'court_name': l.get('court_name'), 'claim': claim, 'operator': 'leifeng', 'instrument_id': get_md5(l.get('title')) + get_md5(pbracket(l.get('case_no'))) } ines(id=obj['instrument_id'], path='http://10.1.1.28:9200/judge_doc/local_doc', data=obj) if is_exists(url='http://10.1.1.28:9200/judge_doc/total_doc', field='case_no', value=obj['case_no']): ines(id=obj['instrument_id'], path='http://10.1.1.28:9200/judge_doc/total_doc', data=obj) except e: print(e)
ines(id=obj['instrument_id'], path='{}/judge_doc/total_doc'.format(es_path), data=obj) else: ines(id=obj['instrument_id'], path='{}/judge_doc/total_doc'.format(es_path), data=obj) except Exception as e: id = get_md5(items.get('title') + str(update_time())) obj = { "_reason_": str(e), "data_size": len(items), "crawl_time": update_time(), "processed": False, "hostname": "worker1.yscredit.com", "data": items, "create_time": update_time(), "ip": "null", "_id_": id, "topic": "裁判文书" } print(obj) ines(id=id, path='{}/fail_record/fail_record'.format(es_path), data=obj) if __name__ == '__main__': id = get_md5(str(update_time()))
def process_ws(items):
    """Parse one Shanghai court judgment and index it into Elasticsearch.

    Listing fields (``title``, ``case_no``, ``trial_round``,
    ``trial_date``), trial type, court name and article text come from the
    ``shanghai_*`` parsers.  The article is parsed with ``WenshuBase`` and
    the extracted fields are assembled into one record.

    The record is always written to ``judge_doc/local_doc`` and is written
    to ``judge_doc/total_doc`` as well when ``is_exists`` reports a hit for
    its ``case_no`` in that index.

    :param items: raw crawled item, passed to the ``shanghai_*`` parsers.
    """
    local_doc = 'http://10.1.1.28:9200/judge_doc/local_doc'
    total_doc = 'http://10.1.1.28:9200/judge_doc/total_doc'

    shls = shanghai_list(items)
    trial_type = shanghai_trial_type(items)
    court_name = shanghai_court_name(items)
    article = shanghai_aricle(items)
    title = shls['title']

    ws = WenshuBase(article)
    litigants, agents = litigants_agent_extract('\n'.join(ws.role_paragraph))
    court_officers = court_extract('\n'.join(ws.court_paragraph))
    content_type = content_type_extract(ws.verdict_paragraph, title)
    reasons = reason_extract(reason_description=ws.reason_description,
                             title=title,
                             trial_type=trial_type)
    court_level = court_level_extract(court_name)

    # Claims are only extracted for first-instance judgments whose reasons
    # include level-2 code 104 or 105.  The extraction input does not depend
    # on the matching reason, so run it once rather than once per match.
    claim = ''
    if content_type == '判决书' and shls['trial_round'] == '一审':
        if any(r['reason_code_level2'] in (104, 105) for r in reasons):
            claim = claim_extract(ws.claims_paragraphs)

    obj = {
        'case_no': pbracket(shls['case_no']),
        'reasons': reasons,
        'source': '上海市高级人民法院',
        'type': trial_type,
        'title': pbracket(title),
        # Literal newline replacement; no regular expression needed.
        'content': article.replace('\n', '<br>'),
        'agents': agents,
        'update_time': update_time(),
        'litigants': litigants,
        'content_type': content_type,
        'trial_round': shls['trial_round'],
        'court_level': court_level,
        'verdict': ws.verdict,
        'trial_date': shls['trial_date'],
        'court_officers': court_officers,
        'court_name': court_name,
        'claim': claim,
        'operator': 'leifeng',
        'instrument_id': get_md5(title) + get_md5(pbracket(shls['case_no'])),
    }

    ines(id=obj['instrument_id'], path=local_doc, data=obj)
    if is_exists(url=total_doc, field='case_no', value=obj['case_no']):
        ines(id=obj['instrument_id'], path=total_doc, data=obj)
content('p').remove() announcer, ann_date = content('span').text().split(' ') content('span').remove() c = content.text() i = c.find(':') defendant_origin = c[:i] defendants = c[:i].strip().split('、') defendants.append(defendant) ann_content = c[i + 1:].strip() dd = re.findall('(\d+)年(\d+)月(\d+)日', ann_date)[0] for d in defendants: yield { 'ann_type': ann_type, 'announcer': announcer, 'defendant': d, 'defendant_origin': defendant_origin, 'ann_date': date(year=int(dd[0]), month=int(dd[1]), day=int(dd[2])).isoformat(), 'ann_content': ann_content, 'ann_html': article } mb = MysqlBase(connecter) for item in mb._execute("select * from sh_sdgg where is_process = 0"): article = items['detail'] for e in extract_fygg(article): id = get_md5(e['ann_type']) + get_md5(e['defendant']) + get_md5(e['ann_date']) e['id'] = id ines(id=id, path='http://10.1.1.28:9200/court_announcement/court_announcement', data=e)
if content_type == '判决书' and trial_round == '一审': for reason in reasons: if reason['reason_code_level2'] == 104 or reason['reason_code_level2'] == 105: claim = claim_extract(ws.claims_paragraphs) obj = { 'case_no': pbracket(l.get('case_no')), 'reasons': reasons, 'source': '吉林高级人民法院', 'type': type, 'title': pbracket(l.get('title')), 'content': '<br>'.join(a), 'agents': agents, 'update_time': update_time(), 'litigants': litigants, 'content_type': content_type, 'trial_round': trial_round, 'court_level': court_level, 'verdict': ws.verdict, 'trial_date': trial_date, 'court_officers': court_officers, 'court_name': l.get('court_name'), 'claim': claim, 'operator': 'leifeng', 'instrument_id': get_md5(l.get('title')) + get_md5(pbracket(l.get('case_no'))) } ines(id=obj['instrument_id'], path='http://10.1.1.28:9200/judge_doc/local_doc', data=obj) if is_exists(url='http://10.1.1.28:9200/judge_doc/total_doc', field='case_no', value=obj['case_no']): ines(id=obj['instrument_id'], path='http://10.1.1.28:9200/judge_doc/total_doc', data=obj) except e: print(e)