def process_ws(items):
    """Mine one crawled judgment item and index the result.

    The mined record is written to the ``local_doc`` index, and also to
    the ``total_doc`` index when ``is_exists`` returns a truthy value for
    the record's ``case_no``.  If any ``Exception`` is raised while mining
    or indexing, a failure record is built from the raw item, printed, and
    written to the ``fail_record`` index instead.

    Args:
        items: Raw crawled item.  It is handed to ``mining`` and, on
            failure, read through ``.get('title')`` / ``.get('case_no')``
            and ``len()``.
    """
    local_doc_url = 'http://10.1.1.28:9200/judge_doc/local_doc'
    total_doc_url = 'http://10.1.1.28:9200/judge_doc/total_doc'
    fail_record_url = 'http://10.1.1.28:9200/fail_record/fail_record'

    try:
        record = mining(items)
        ines(id=record['instrument_id'], path=local_doc_url, data=record)
        already_known = is_exists(url=total_doc_url,
                                  field='case_no',
                                  value=record['case_no'])
        if already_known:
            ines(id=record['instrument_id'], path=total_doc_url, data=record)
    except Exception as exc:
        # The failure id is the title digest followed by the digest of the
        # pbracket-processed case number (empty string when absent).
        title_digest = get_md5(items.get('title'))
        case_digest = get_md5(pbracket(items.get('case_no', '')))
        failure_id = title_digest + case_digest

        failure = {
            "_reason_": str(exc),
            "data_size": len(items),
            "crawl_time": update_time(),
            "processed": False,
            "hostname": "worker1.yscredit.com",
            "data": items,
            "create_time": update_time(),
            "ip": "null",
            "_id_": failure_id,
            "topic": "裁判文书"
        }
        print(failure)
        ines(id=failure_id, path=fail_record_url, data=failure)
def process_ws(items):
    """Build a judgment record from a Heilongjiang court item and index it.

    The list-level fields come from ``heilongjiang_list`` and the article
    from ``heilongjiang_article``.  Element 4 of the article is used as
    the case number and the elements from index 5 onward as the document
    body.  The assembled record is written to the ``local_doc`` index, and
    also to ``total_doc`` when ``is_exists`` returns a truthy value for the
    record's ``case_no``.

    Args:
        items: Raw crawled item, passed unchanged to the two
            ``heilongjiang_*`` extractors.
    """
    local_doc_url = 'http://10.1.1.28:9200/judge_doc/local_doc'
    total_doc_url = 'http://10.1.1.28:9200/judge_doc/total_doc'

    listing = heilongjiang_list(items)
    article = heilongjiang_article(items)
    body_lines = article[5:]
    title = listing.get('title')
    court_name = listing.get('court_name')

    ws = WenshuBase('\n'.join(body_lines))
    litigants, agents = litigants_agent_extract('\n'.join(ws.role_paragraph))
    court_officers = court_extract('\n'.join(ws.court_paragraph))
    # Named trial_type rather than ``type`` so the builtin is not shadowed.
    trial_type = type_extract(title)
    reasons = reason_extract(reason_description=ws.reason_description,
                             title=title,
                             trial_type=trial_type)
    trial_date = trial_date_extract(''.join(ws.court_paragraph))
    court_level = court_level_extract(court_name)
    trial_round = trial_round_extract(title)
    content_type = content_type_extract(ws.verdict_paragraph, title)
    case_no = article[4]

    # The claim is only extracted for first-instance judgments that have at
    # least one reason with level-2 code 104 or 105.  The extraction input
    # does not depend on the reason, so it runs once rather than once per
    # matching reason.
    claim = ''
    if content_type == '判决书' and trial_round == '一审':
        if any(reason['reason_code_level2'] in (104, 105)
               for reason in reasons):
            claim = claim_extract(ws.claims_paragraphs)

    clean_case_no = pbracket(case_no)
    obj = {
        'case_no': clean_case_no,
        'reasons': reasons,
        # NOTE(review): Heilongjiang is a province, yet this literal says
        # 市 (city).  It is written to the index as-is, so confirm with the
        # data owners before changing it.
        'source': '黑龙江市高级人民法院',
        'type': trial_type,
        'title': pbracket(title),
        'content': '<br>'.join(body_lines),
        'agents': agents,
        'update_time': update_time(),
        'litigants': litigants,
        'content_type': content_type,
        'trial_round': trial_round,
        'court_level': court_level,
        'verdict': ws.verdict,
        'trial_date': trial_date,
        'court_officers': court_officers,
        'court_name': court_name,
        'claim': claim,
        'operator': 'leifeng',
        'instrument_id': get_md5(title) + get_md5(clean_case_no)
    }

    ines(id=obj['instrument_id'], path=local_doc_url, data=obj)
    if is_exists(url=total_doc_url, field='case_no', value=obj['case_no']):
        ines(id=obj['instrument_id'], path=total_doc_url, data=obj)
def process_ws(items):
    """Mine, tag and index one crawled judgment item.

    The mined record is passed through ``tag`` and written to the
    ``local_doc`` index under ``es_path``.  It is also written to
    ``total_doc`` when its ``source`` is ``'裁判文书网'``, or, for any other
    source, when ``is_exists`` returns a truthy value for its ``case_no``.

    If any ``Exception`` is raised while mining, tagging or indexing, a
    failure record is built from the raw item, printed, and written to the
    ``fail_record`` index instead.

    Args:
        items: Raw crawled item.  It is handed to ``mining`` and, on
            failure, read through ``.get('title')`` and ``len()``.
    """
    local_doc_url = '{}/judge_doc/local_doc'.format(es_path)
    total_doc_url = '{}/judge_doc/total_doc'.format(es_path)
    fail_record_url = '{}/fail_record/fail_record'.format(es_path)

    try:
        record = tag(mining(items))
        doc_id = record['instrument_id']
        ines(id=doc_id, path=local_doc_url, data=record)

        # NOTE(review): records from 裁判文书网 skip the existence check and
        # are always written to total_doc; confirm this is the intended rule.
        if record['source'] == '裁判文书网' or is_exists(
                url=total_doc_url, field='case_no', value=record['case_no']):
            ines(id=doc_id, path=total_doc_url, data=record)
    except Exception as exc:
        # The failing item may have no title.  Fall back to '' so that
        # building the id cannot raise TypeError and lose the failure record.
        title = items.get('title') or ''
        failure_id = get_md5(title + str(update_time()))

        failure = {
            "_reason_": str(exc),
            "data_size": len(items),
            "crawl_time": update_time(),
            "processed": False,
            "hostname": "worker1.yscredit.com",
            "data": items,
            "create_time": update_time(),
            "ip": "null",
            "_id_": failure_id,
            "topic": "裁判文书"
        }
        print(failure)
        ines(id=failure_id, path=fail_record_url, data=failure)
trial_round, 'court_level': court_level, 'verdict': ws.verdict, 'trial_date': trial_date, 'court_officers': court_officers, 'court_name': l.get('court_name'), 'claim': claim, 'operator': 'leifeng', 'instrument_id': get_md5(l.get('title')) + get_md5(pbracket(l.get('case_no'))) } ines(id=obj['instrument_id'], path='http://10.1.1.28:9200/judge_doc/local_doc', data=obj) if is_exists(url='http://10.1.1.28:9200/judge_doc/total_doc', field='case_no', value=obj['case_no']): ines(id=obj['instrument_id'], path='http://10.1.1.28:9200/judge_doc/total_doc', data=obj) except e: print(e)
def process_ws(items):
    """Build a judgment record from a Shanghai court item and index it.

    List-level fields, trial type, court name and article text are pulled
    from ``items`` by the ``shanghai_*`` extractors.  The assembled record
    is written to the ``local_doc`` index, and also to ``total_doc`` when
    ``is_exists`` returns a truthy value for the record's ``case_no``.

    Args:
        items: Raw crawled item, passed unchanged to the ``shanghai_*``
            extractors.
    """
    local_doc_url = 'http://10.1.1.28:9200/judge_doc/local_doc'
    total_doc_url = 'http://10.1.1.28:9200/judge_doc/total_doc'

    shls = shanghai_list(items)
    trial_type = shanghai_trial_type(items)
    court_name = shanghai_court_name(items)
    article = shanghai_aricle(items)

    ws = WenshuBase(article)
    litigants, agents = litigants_agent_extract('\n'.join(ws.role_paragraph))
    court_officers = court_extract('\n'.join(ws.court_paragraph))
    title = shls['title']
    content_type = content_type_extract(ws.verdict_paragraph, title)
    reasons = reason_extract(reason_description=ws.reason_description,
                             title=title,
                             trial_type=trial_type)
    court_level = court_level_extract(court_name)
    trial_round = shls['trial_round']

    # The claim is only extracted for first-instance judgments that have at
    # least one reason with level-2 code 104 or 105.  The extraction input
    # does not depend on the reason, so it runs once rather than once per
    # matching reason.
    claim = ''
    if content_type == '判决书' and trial_round == '一审':
        if any(reason['reason_code_level2'] in (104, 105)
               for reason in reasons):
            claim = claim_extract(ws.claims_paragraphs)

    clean_case_no = pbracket(shls['case_no'])
    obj = {
        'case_no': clean_case_no,
        'reasons': reasons,
        'source': '上海市高级人民法院',
        'type': trial_type,
        'title': pbracket(title),
        # Stored content uses <br> in place of newlines.
        'content': article.replace('\n', '<br>'),
        'agents': agents,
        'update_time': update_time(),
        'litigants': litigants,
        'content_type': content_type,
        'trial_round': trial_round,
        'court_level': court_level,
        'verdict': ws.verdict,
        'trial_date': shls['trial_date'],
        'court_officers': court_officers,
        'court_name': court_name,
        'claim': claim,
        'operator': 'leifeng',
        'instrument_id': get_md5(title) + get_md5(clean_case_no)
    }

    ines(id=obj['instrument_id'], path=local_doc_url, data=obj)
    if is_exists(url=total_doc_url, field='case_no', value=obj['case_no']):
        ines(id=obj['instrument_id'], path=total_doc_url, data=obj)