Beispiel #1
0
    def get_region(self, content, isaddress=False):
        """Get the administrative region for ``content``.

        Lookup order:

        1. If ``isaddress`` is true, delegate to
           ``self.get_region_from_address`` and return its result.
        2. Look the whole (UTF-8 encoded) text up in ``self.region_city_map``.
        3. Segment the text with ``jieba`` (full mode). The first segment found
           in ``self.city_set`` wins and is translated through
           ``self.region_city_map``. Other segments that have an entry in
           ``self.region_city_map`` are counted by mapped value.
        4. Return the most frequently mapped value, or ``''`` if nothing
           matched.

        :param content: text to inspect.
        :param isaddress: treat ``content`` as an address when true.
        :return: the resolved region, or ``''`` when none is found.
        """
        if isaddress:
            return self.get_region_from_address(content)

        content_str = toolsutil.utf8_encode(content)
        city = self.region_city_map.get(content_str, None)
        if city:
            return city

        region_freq = dict()
        for seg in jieba.cut(content, cut_all=True):
            seg = toolsutil.utf8_encode(str(seg))
            if seg in self.city_set:
                # A direct hit in city_set short-circuits the frequency vote.
                return self.region_city_map.get(seg)
            region = self.region_city_map.get(seg, '')
            if region != '':
                region_freq[region] = region_freq.get(region, 0) + 1

        if not region_freq:
            return ''
        # max() returns the first key with the highest count, which is the
        # same element a stable descending sort would put first. Unlike the
        # old cmp-based sorted() call it also works on Python 3.
        return max(region_freq, key=region_freq.get)
Beispiel #2
0
    def get_province(self, content, isaddress=False):
        """Get the province for ``content``.

        Lookup order:

        1. If ``isaddress`` is true, delegate to
           ``self.get_province_from_address`` and return its result.
        2. Look the whole (UTF-8 encoded) text up in ``self.province_kv``.
        3. Segment the text with ``jieba`` (full mode). The first segment that
           is itself a member of ``self.provinces`` is returned as-is. Other
           segments that have an entry in ``self.province_kv`` are counted by
           mapped value.
        4. Return the most frequently mapped value, or ``''`` if nothing
           matched.

        :param content: text to inspect.
        :param isaddress: treat ``content`` as an address when true.
        :return: the resolved province, or ``''`` when none is found.
        """
        if isaddress:
            return self.get_province_from_address(content)

        content_str = toolsutil.utf8_encode(content)
        province = self.province_kv.get(content_str, None)
        if province:
            return province

        province_freq = dict()
        for seg in jieba.cut(content, cut_all=True):
            seg = toolsutil.utf8_encode(str(seg))
            if seg in self.provinces:
                # A literal province name short-circuits the frequency vote.
                return seg
            mapped = self.province_kv.get(seg, '')
            if mapped != '':
                province_freq[mapped] = province_freq.get(mapped, 0) + 1

        if not province_freq:
            return ''
        # max() returns the first key with the highest count, which is the
        # same element a stable descending sort would put first. Unlike the
        # old cmp-based sorted() call it also works on Python 3.
        return max(province_freq, key=province_freq.get)
Beispiel #3
0
 def load_stopword(self, conf):
     """Load stopwords from ``conf``, ordered longest first.

     Each non-blank line of the file is one term. Terms are UTF-8 encoded
     through ``toolsutil.utf8_encode``, de-duplicated, and sorted by the
     length of the encoded term in descending order.

     :param conf: path of the stopword file, one term per line.
     :return: list of terms, longest first.
     """
     priority_kv = dict()
     # The context manager closes the file even if encoding a line fails.
     with open(conf) as stopword_file:
         for line in stopword_file:
             line = line.strip()
             if len(line) <= 0:
                 continue
             term = toolsutil.utf8_encode(line)
             priority_kv[term] = len(term)
     # key= replaces the Python-2-only cmp callback; ordering is unchanged.
     ordered = sorted(priority_kv, key=priority_kv.get, reverse=True)
     # The second utf8_encode call is kept from the original.
     # NOTE(review): the terms are already encoded here, so it is presumably
     # redundant - confirm before removing.
     return [toolsutil.utf8_encode(term) for term in ordered]
Beispiel #4
0
 def norm_litigant(self, litigants):
     """Normalize litigant names.

     For every entry of ``litigants``:

     * entries containing '代理', '委托' or '代表' are dropped;
     * leading stopwords from ``self.stopwords`` are stripped repeatedly
       until none matches;
     * a trailing ``(...)`` group and a leading ``(...)`` group are removed.

     :param litigants: iterable of litigant strings.
     :return: list of normalized names, in input order.
     """
     norm_litigants = list()
     for litigant in litigants:
         # NOTE(review): as written these replaces are no-ops (ASCII paren
         # replaced by itself). Presumably they once mapped full-width
         # parentheses to ASCII - confirm against the upstream source.
         litigant = litigant.replace('(', '(').replace(')', ')')
         litigant = toolsutil.utf8_encode(litigant)
         if '代理' in litigant or '委托' in litigant or '代表' in litigant:
             continue

         stripped = True
         while stripped:
             stripped = False
             for word in self.stopwords:
                 # Skip empty stopwords: ''.startswith('') is always true and
                 # would make this loop spin forever.
                 if word and litigant.startswith(word):
                     # Remove the matched prefix only. str.replace would also
                     # delete later occurrences inside the name itself.
                     litigant = litigant[len(word):]
                     stripped = True

         if litigant.endswith(')'):
             idx = litigant.rfind('(')
             if idx != -1:
                 litigant = litigant[:idx]
         if litigant.startswith('('):
             idx = litigant.find(')')
             if idx != -1:
                 litigant = litigant[idx + 1:]
         norm_litigants.append(litigant)
     return norm_litigants
    def predict(self, content):
        """Predict the industry for ``content``.

        The text is stripped, UTF-8 encoded, segmented with ``jieba`` and
        joined with spaces, then passed as a one-element batch to
        ``self.classifier.predict_proba(texts, 1)``.

        :param content: text to classify.
        :return: the label of the first candidate returned for the text, or
            ``''`` when the text is empty or no candidate is returned.
        """
        industry = ''
        content = toolsutil.utf8_encode(content.strip())
        if len(content) <= 0:
            return industry

        company_seg = ' '.join(jieba.cut(content))
        # The original also had a "flush when 100000 texts are buffered"
        # branch. The buffer was a fresh local holding exactly one item, so
        # that branch could never run and has been removed.
        labels = self.classifier.predict_proba([company_seg], 1)
        for candidates in labels:
            if len(candidates) <= 0:
                continue
            # Each candidate is a (label, probability) pair; only the label
            # of the first one is used.
            label, prob = candidates[0]
            industry = label

        return industry
Beispiel #6
0
    def _load_kv(self, file):
        """Load a two-column, tab-separated mapping file.

        Lines that do not split into exactly two tab-separated fields are
        skipped. Both fields are stripped and UTF-8 encoded. Every first-column
        value is also registered with ``self.province_index.enter``, and
        ``self.province_index.fix()`` is called once after the whole file has
        been read.

        :param file: path of the mapping file.
        :return: ``(values, mapping)`` - the set of second-column values and a
            dict from first-column value to second-column value.
        """
        my_kv = dict()
        my_set = set()
        # The context manager closes the file even if a line fails to process.
        with open(file) as kv_file:
            for line in kv_file:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                city = toolsutil.utf8_encode(fields[0].strip())
                province = toolsutil.utf8_encode(fields[1].strip())
                self.province_index.enter(city)
                my_kv[city] = province
                my_set.add(province)

        # NOTE(review): fix() presumably finalizes the index after all keys
        # have been entered - confirm against the index implementation.
        self.province_index.fix()

        return my_set, my_kv
Beispiel #7
0
 def __load_char(self, conf):
     """Load regular expressions from ``conf``, one pattern per line.

     Blank lines are skipped. Each remaining line is stripped, UTF-8 encoded
     and compiled with ``re.compile``. The compiled patterns are collected in
     a set, so the order of the returned list is unspecified.

     :param conf: path of the pattern file.
     :return: list of compiled pattern objects.
     """
     causes_regex = set()
     # The context manager closes the file even if a pattern fails to compile.
     with open(conf) as pattern_file:
         for line in pattern_file:
             cause = line.strip()
             if len(cause) <= 0:
                 continue
             cause = toolsutil.utf8_encode(cause)
             causes_regex.add(re.compile(cause))
     return list(causes_regex)
Beispiel #8
0
 def get_case_causes(self, content):
     """Extract case causes from ``content``, longest match first.

     Every pattern in ``self.cause_regex_list`` is applied to the UTF-8
     encoded text via ``toolsutil.re_findone``. Truthy results are collected
     with their length, passed through ``self.__dedup``, and the survivors
     are returned ordered by length in descending order.

     :param content: text to search.
     :return: list of matched causes kept by ``__dedup``, longest first.
     """
     content = toolsutil.utf8_encode(content)
     cause_kv = dict()
     for cause_regex in self.cause_regex_list:
         ret = toolsutil.re_findone(cause_regex, content)
         if ret:
             cause_kv[ret] = len(ret)
     # NOTE(review): __dedup is not visible here. It is only relied on to
     # return a container that supports membership tests.
     causes = self.__dedup(cause_kv)
     # key= replaces the Python-2-only cmp callback; ordering is unchanged.
     ordered = sorted(cause_kv, key=cause_kv.get, reverse=True)
     return [cause for cause in ordered if cause in causes]
Beispiel #9
0
    def _norm_date(self, date_values):
        '''规划时间'''
        if date_values == "" or date_values == None or date_values == "null":
            return ""

        date_values = toolsutil.utf8_encode(date_values)

        for replace_str in self.replace_list:
            date_values = date_values.replace(replace_str, '')
        for replace_space in self.replace_space_list:
            date_values = date_values.replace(replace_space, ' ')

        date_values = date_values.strip().replace('.', '-').replace('/', '-').replace('.', '-')
        date_values = date_values.replace("年", "-").replace("月", "-").replace("点", ":").replace("时", ":").replace("分", ":").replace(":", ":")

        return date_values
Beispiel #10
0
    def _norm_date(self, date_values):
        '''规划时间'''
        if date_values == "" or date_values == None or date_values == "null":
            return ""

        date_values = toolsutil.utf8_encode(date_values)

        for replace_str in self.replace_list:
            date_values = date_values.replace(replace_str, '')
        for replace_space in self.replace_space_list:
            date_values = date_values.replace(replace_space, ' ')

        date_values = date_values.strip().replace('.', '-').replace(
            '/', '-').replace('.', '-')
        date_values = date_values.replace("年", "-").replace("月", "-").replace(
            "点", ":").replace("时", ":").replace("分", ":").replace(":", ":")

        return date_values