Ejemplo n.º 1
0
    def fuzzy_analyse(self, old_data):
        """Analyse ``old_data.content_all`` and return the guessed fields.

        The text is split into four sections (header, parties part, content,
        case signature) by ``Analyse.split_to_four_parts``; each section is
        then handed to the matching ``Analyse.guess_*`` helper.

        Returns a seven-element list:
            0. ``(header, part, content, case_sign)`` -- the four sections,
               with the header already cleaned of its field labels;
            1. ``(plaintiff, plaintiff_lawyers)`` -- ';'-joined strings;
            2. ``(defendant, defendant_lawyers)`` -- ';'-joined strings;
            3. the result of ``guess_case_sign``;
            4. the result of ``guess_header_types``;
            5. the result of ``_replace_data`` on the parties part;
            6. the result of ``guess_end_date`` on the case signature.
        """
        analyser = Analyse()
        analyser.text = old_data.content_all

        _header, part, content, case_sign = analyser.split_to_four_parts()
        header_lines = _header.split('\n')
        if len(header_lines) < 4:
            # Header section is too short: use the first six lines of the
            # whole text instead.
            _header = "\n".join(analyser.text_in_lines[0:6])
        # Strip the field labels out of the header.
        _header = re.sub(u'日期:|法院:|案号:', '', _header)

        clients_attr, lawyers_attr = analyser.guess_clients_lawyers(
            part.split('\n'))
        case_sign_key = analyser.guess_case_sign(case_sign.split('\n'))
        head_key = analyser.guess_header_types(_header.split('\n'))

        # Drop duplicate entries for both sides, in place.
        for attrs in (clients_attr, lawyers_attr):
            for side in (u'原告', u'被告'):
                attrs[side] = list(set(attrs[side]))

        end_time = analyser.guess_end_date(case_sign)
        replace_data = analyser._replace_data(part)

        # Client entries are formatted with three fields, lawyer entries with
        # two. Joining an empty list yields '', so no emptiness check is
        # needed.
        client_fmt = u"%s:%s:%s"
        lawyer_fmt = u"%s:%s"
        plaintiff = ';'.join(
            [client_fmt % entry for entry in clients_attr[u'原告']])
        defendant = ';'.join(
            [client_fmt % entry for entry in clients_attr[u'被告']])
        plaintiff_lawyers = ';'.join(
            [lawyer_fmt % entry for entry in lawyers_attr[u'原告']])
        defendant_lawyers = ';'.join(
            [lawyer_fmt % entry for entry in lawyers_attr[u'被告']])

        sections = (_header, part, content, case_sign)
        return [
            sections,
            (plaintiff, plaintiff_lawyers),
            (defendant, defendant_lawyers),
            case_sign_key,
            head_key,
            replace_data,
            end_time,
        ]
Ejemplo n.º 2
0
    def to_ot_rawdata_judgement_court_gov_cn_old(self, old, todat):
        """Parse ``old`` into an ``ot_rawdata_judgement_court_gov_cn_old`` row and insert it.

        The judgment table is extracted from ``old.source_data``, converted to
        plain text, split into four sections by ``Analyse`` and the guessed
        fields are copied onto a new record, which is then inserted.

        If the table cannot be found in ``old.source_data`` the page is
        downloaded again through the proxies in ``PROXY``; an update is issued
        with the refreshed page and the method returns without parsing.

        Args:
            old: source record; ``url``, ``source_data`` and ``id`` are read.
            todat: unused by this method.

        Returns:
            None.

        Raises:
            Exception: no proxy produced a successful response.
            IndexError: the re-downloaded page still has no judgment table.
        """
        new = ot_rawdata_judgement_court_gov_cn_old()
        new.url = old.url
        new.referer = old.url

        analy = Analyse()
        try:
            raw_html = XPath(old.source_data).execute(
                '//*[@id="ws"]/table')[0].to_html()
        except IndexError:
            print('[Error] Analyse: url = %s' % new.url)

            # Download the page again, trying each proxy until one succeeds.
            # ``r`` starts as None so an empty PROXY list ends in the intended
            # "Get faild" error rather than a NameError.
            r = None
            for item in PROXY:
                try:
                    # NOTE(review): the proxy URL has no '//' after 'http:';
                    # confirm the format of the PROXY entries.
                    r = requests.get(
                        new.url, proxies={'http': 'http:%s:59274' % item},
                        timeout=30)
                except requests.RequestException as exc:
                    # A failing proxy must not stop the remaining ones from
                    # being tried.
                    print('[Error] Proxy %s failed: %s' % (item, exc))
                    continue
                if r.ok:
                    break
            if r is None or not r.ok:
                raise Exception('Get faild url = %s' % old.url)

            to = old.__class__()
            to.id = old.id
            to.source_data = r.text
            # Evaluated only for validation: raises IndexError when the
            # refreshed page still lacks the judgment table.
            XPath(to.source_data).execute(
                '//*[@id="ws"]/table')[0].to_html()
            # NOTE(review): ``editor=new`` is passed although ``to`` carries
            # the refreshed source_data -- confirm whether ``to`` was meant.
            point = insert_database(
                'Judgment', tablename=to.__class__, editor=new)
            point.update()
            return
        text = html_to_text(HTML_PARSER.unescape(raw_html))
        try:
            # Remove the leftover tail of the DOCTYPE declaration.
            text = re.sub('//W3C//DTD HTML 4.0 Transitional//EN\'>', '', text)
        except TypeError:
            # Best effort: a non-string ``text`` is passed through untouched.
            pass
        analy.text = text
        new.content_all = analy.text

        _header, part, content, case_sign = analy.split_to_four_parts()

        new.clients_attr, new.lawyers_attr = analy.guess_clients_lawyers(
            part.split('\n'))

        end_date = analy.guess_end_date(case_sign)
        new.end_date = end_date

        case_sign_key = analy.guess_case_sign(case_sign.split('\n'))
        head_key = analy.guess_header_types(_header.split('\n'))

        new.content = part + content

        new.case_sign = case_sign
        new.case_number = head_key['case_number']
        new.department = head_key['department']
        new.type = head_key['type']
        new.title = head_key['title']
        new.case_type = head_key['case_type']

        # Keep an already-set procedure; otherwise guess it from the case
        # number.
        new.procedure = new.procedure or analy.guess_procedure(new.case_number)

        new.replace_data = json.dumps(analy._replace_data(part))

        # Court staff found in the signature section, one field per role.
        new.chief_judge = ",".join(case_sign_key[u'审判长'])
        new.acting_judges = ",".join(case_sign_key[u'代理审判员'])
        new.judge = ",".join(case_sign_key[u'审判员'])
        new.clerk = ",".join(list(set(case_sign_key[u'书记员'])))

        new.input_time = arrow.now().timestamp

        # Disabled filter, kept for reference:
        # if (not new.chief_judge and not new.judge and not new.acting_judges.strip()) or \
        #   (u'事务所' not in new.plaintiff_lawyers and u'事务所' not in new.defendant_lawyers):
        #    return

        new.parent_id = old.id
        print('Runing String <ot_rawdata_judgement_court_gov_cn_old> parent_id = %s , url = %s' % (old.id, old.url))

        point = insert_database(
            'Judgment', tablename=ot_rawdata_judgement_court_gov_cn_old, editor=new)
        point.insert()