Example #1
    def load_train_data(self):
        jieba_parse(u"你好")  # warm up jieba: the first call loads its dictionary

        cpath = self.cpath

        data_list = [
            {
                "path": [cpath('documents_with_segments'), ],
                "var": "documents_with_segments", "lazy": True,
            },
            {
                "path": [cpath("idf"), ],
                "var": "idf_cache", "lazy": False,
            },
            {
                "path": [cpath("entropy"), ],
                "var": "entropy_cache", "lazy": False,
            },
            {
                "path": [self.model.dbpath, cpath('documents_with_features'), ],
                "var": "documents_with_features", "lazy": True,
            },
            {
                "path": cpath('tags_tree'),
                "var": "tags_tree", "lazy": False,
            },
            {
                "path": cpath('text_engine'),
                "var": "text_similarity_engine", "lazy": False,
            },
        ]

        if not os.path.exists(cpath('idf')):
            self.model.pull_data()

        for idx1, data1 in enumerate(data_list):
            print("\n" * 5)
            print("_" * 150)
            print("[" * 3 + "#" * 144 + "]" * 3)
            print("[Step %i] checking pre-generated %s" % (idx1 + 1, data1['path']))

            if not isinstance(data1['path'], list):
                data1['path'] = [data1['path']]

            should_read_attr = any(not os.path.exists(path1) for path1 in data1['path'])

            if (not data1['lazy']) or should_read_attr:
                # Touching the attribute triggers the lazy property, which
                # loads its cache file or regenerates it when missing.
                getattr(self, data1['var'])
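
Each `var` above is read with a bare `getattr`, so the attributes are presumably lazily cached properties that load a pickle on first access and rebuild it when the cache file is missing. A minimal sketch of that pattern, assuming hypothetical names (`TrainData`, `idf_cache`, `compute_idf` are illustrations, not this library's API):

import os
import pickle
from functools import cached_property  # Python 3.8+

class TrainData(object):
    def cpath(self, name):
        return os.path.join("cache", name + ".pkl")

    @cached_property
    def idf_cache(self):
        # Load the pickle when it exists; otherwise rebuild and cache it.
        path = self.cpath("idf")
        if os.path.exists(path):
            with open(path, "rb") as f:
                return pickle.load(f)
        data = self.compute_idf()
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as f:
            pickle.dump(data, f)
        return data

    def compute_idf(self):
        # Stand-in for the real, expensive training step.
        return {u"你好": 1.0}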
Example #2
    def get_region_lines(self, source1_region):
        """ Get unique region-code lines from the current `code1` up to the root. """
        regiones = jieba_parse(source1_region)
        codes_list = [self.region_with_parents(i1)
                      for i1 in [self.region_encode(r1) for r1 in regiones]
                      if i1 is not None]
        codes_list = sorted(set(codes_list), key=len)

        enders = set([])
        region_tree = dict()
        for codes1 in codes_list:
            current_region_tree = region_tree
            for idx1, code1 in enumerate(codes1):
                is_ender = (idx1 + 1) == len(codes1)
                if is_ender:
                    enders.add(code1)
                    break
                code2 = codes1[idx1+1]
                enders.discard(code1)
                current_region_tree[code1] = current_region_tree.get(code1, dict())
                current_region_tree[code1][code2] = current_region_tree[code1].get(code2, dict())
                current_region_tree = current_region_tree[code1]

        codes_list = [codes1 for codes1 in codes_list if codes1[-1] in enders]

        return [
            [{"id": code1, "name": self.code_to_name__dict[code1]}
             for code1 in codes1 if code1 in self.code_to_name__dict]
            for codes1 in codes_list
        ]
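
The `enders` bookkeeping above prunes every chain that is a prefix of a longer chain: a code is dropped from `enders` as soon as it shows up as an intermediate step. A self-contained sketch of just that pruning rule, with plain integers standing in for region codes (toy data, not library output):

def keep_longest_chains(codes_list):
    # Keep only chains whose last code never appears as an intermediate
    # code of a longer chain -- the same rule get_region_lines applies.
    chains = sorted(set(map(tuple, codes_list)), key=len)
    enders = set()
    for codes in chains:
        for idx, code in enumerate(codes):
            if idx + 1 == len(codes):
                enders.add(code)
                break
            enders.discard(code)
    return [list(c) for c in chains if c[-1] in enders]

print(keep_longest_chains([[1], [1, 2], [1, 2, 3], [4]]))
# -> [[4], [1, 2, 3]]: the prefixes [1] and [1, 2] are dropped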
Example #3
 def documents_with_segments(self):
     """ 纯分词 """
     return ParallelData.process(
         self.model,
         'dict',
         cache_filename=self.model.pickle_path('documents_with_segments'),
         item_func=lambda item1: Counter(jieba_parse(item1.item_content)),
     )
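
Each document is reduced to a bag-of-words Counter over its segments, computed in parallel and cached under the given pickle path. The per-item step in isolation, using the stock jieba API (`jieba.lcut`; `jieba_parse` above is presumably a thin wrapper around something like it):

from collections import Counter
import jieba

def segment_counts(text):
    # One document -> token frequency Counter, as item_func does above.
    return Counter(jieba.lcut(text))

print(segment_counts(u"你好 世界 你好"))
# e.g. Counter({u'你好': 2, u' ': 2, u'世界': 1}) -- whitespace tokens included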
Example #4
        def load__region_unit_data():
            data = list(self.region_unit_data__func())
            assert len(data) > 0
            assert isinstance(data[0], dict)
            assert "id"          in data[0]
            assert "name"        in data[0]

            feature_to_unit_ids__dict = defaultdict(list)
            id_to_name__dict = dict()
            for line1 in process_notifier(data):
                id_to_name__dict[line1['id']] = line1['name']
                features = jieba_parse(line1['name'])
                # TODO: strip special characters such as "-"
                source1_region = ru_regexp.separate_regiones(line1['name'])[0]

                for kv in itertools.chain(*self.get_region_lines(source1_region)):
                    features.extend(kv.values())
                for feature1 in set(features):
                    feature_to_unit_ids__dict[feature1].append(line1['id'])
            return [id_to_name__dict, dict(feature_to_unit_ids__dict)]
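
The function builds two lookups: an id-to-name map and an inverted index from every name feature to the unit ids whose names contain it. The same structure on toy data (`build_indexes` and the rows are illustrative, not part of the library):

from collections import defaultdict

def build_indexes(rows, tokenize):
    # rows: [{"id": ..., "name": ...}]; tokenize: name -> feature list
    id_to_name = {}
    feature_to_ids = defaultdict(list)
    for row in rows:
        id_to_name[row["id"]] = row["name"]
        for feature in set(tokenize(row["name"])):
            feature_to_ids[feature].append(row["id"])
    return id_to_name, dict(feature_to_ids)

rows = [{"id": 1, "name": "alpha beta"}, {"id": 2, "name": "beta gamma"}]
id_to_name, index = build_indexes(rows, str.split)
print(index["beta"])  # -> [1, 2]: "beta" occurs in both unit names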
Example #5
    def get_units_sorted(self, source1_unit):
        """
        1. Mainly tolerates unit addresses that deviate slightly from the real ones.
        2. If nothing in the database matches closely, fall back to the raw input by default.
        """
        candidate_unit_ids = [unit_id
                              for feature1 in jieba_parse(source1_unit)
                              for unit_id in self.feature_to_unit_ids__dict.get(feature1, [])
                              if unit_id]
        sorted_unit_ids = Counter(candidate_unit_ids).most_common()

        data = [{'id': unit_id,
                 'name': self.region_unit_id_to_name__dict[unit_id],
                 'rate': freq}
                for unit_id, freq in sorted_unit_ids]

        if data:
            source2 = data[0]['name']
            source2 = ru_regexp.separate_regiones(source2)[1]
            rate = String.calculate_text_similarity(source1_unit, source2)['similarity_rate']
            if rate < 0.8:
                # Best stored match is too dissimilar: surface the raw input first.
                data.insert(0, {'id': None, 'name': source1_unit, 'rate': None})

        return data
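
The ranking is a plain vote: each segmented feature of the query nominates the unit ids it indexes, and `Counter.most_common()` orders units by how many features they matched. The voting step in isolation (toy index, not library data):

from collections import Counter

feature_to_unit_ids = {"alpha": [1, 2], "beta": [2]}
features = ["alpha", "beta"]  # pretend jieba_parse output
candidates = [uid for f in features
              for uid in feature_to_unit_ids.get(f, [])]
print(Counter(candidates).most_common())
# -> [(2, 2), (1, 1)]: unit 2 matched two features, unit 1 only one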
Example #6
 def tags_model__extract_features(cls, item, column=None):
     # Feature extraction for the tags model: the item's raw jieba segments.
     return jieba_parse(item.item_content)