def load_train_data(self):
    jieba_parse(u"你好")  # warm up jieba's dictionary before anything else
    cpath = self.cpath
    # Each entry names a cached attribute (`var`), the cache files backing it
    # (`path`), and whether it may be built lazily on first access (`lazy`).
    data_list = [
        {"path": [cpath('documents_with_segments')],
         "var": "documents_with_segments", "lazy": True},
        {"path": [cpath("idf")],
         "var": "idf_cache", "lazy": False},
        {"path": [cpath("entropy")],
         "var": "entropy_cache", "lazy": False},
        {"path": [self.model.dbpath, cpath('documents_with_features')],
         "var": "documents_with_features", "lazy": True},
        {"path": cpath('tags_tree'),
         "var": "tags_tree", "lazy": False},
        {"path": cpath('text_engine'),
         "var": "text_similarity_engine", "lazy": False},
    ]

    if not os.path.exists(cpath('idf')):
        self.model.pull_data()

    for idx1, data1 in enumerate(data_list):
        print "\n" * 5
        print "_" * 150
        print "[" * 3 + "#" * 144 + "]" * 3
        print "[Step %i] checking pre-generated %s" % (idx1 + 1, data1['path'])

        # Normalize `path` so single paths and path lists share one code path.
        if not isinstance(data1['path'], list):
            data1['path'] = [data1['path']]

        should_read_attr = False
        for path1 in data1['path']:
            if not os.path.exists(path1):
                should_read_attr = True

        # Reading the attribute triggers its builder when the cache is missing
        # or the data is marked as eager.
        if (not data1['lazy']) or should_read_attr:
            getattr(self, data1['var'])
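# A minimal sketch (hypothetical class, not the real model) of why the bare
# `getattr(self, data1['var'])` above is enough to (re)build missing data:
# each `var` is assumed to be a cached-property-style attribute whose first
# read computes the value and persists its cache file.
class _LazyAttrDemo(object):
    @property
    def idf_cache(self):
        if not hasattr(self, '_idf_cache'):
            self._idf_cache = {u"word": 1.0}  # stand-in for the real IDF build
        return self._idf_cache

demo = _LazyAttrDemo()
getattr(demo, 'idf_cache')  # first touch builds; later touches hit the cache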
def get_region_lines(self, source1_region):
    """ Get the unique region lines from each matched `code1` up to the root. """
    regiones = jieba_parse(source1_region)
    # Encode every token into a region code, then expand each code into its
    # full parent chain.
    codes_list = [self.region_with_parents(i1)
                  for i1 in [self.region_encode(r1) for r1 in regiones]
                  if i1 is not None]
    # Deduplicate and visit shorter chains first, so longer chains can
    # supersede them below.
    codes_list = sorted(set(codes_list), key=lambda li: len(li))

    enders = set()
    region_tree = dict()
    for codes1 in codes_list:
        current_region_tree = region_tree
        for idx1, code1 in enumerate(codes1):
            is_ender = (idx1 + 1) == len(codes1)
            if is_ender:
                enders.add(code1)
                break
            code2 = codes1[idx1 + 1]
            # A code that a longer chain passes through is no longer a leaf.
            if code1 in enders:
                enders.remove(code1)
            current_region_tree[code1] = current_region_tree.get(code1, dict())
            current_region_tree[code1][code2] = current_region_tree[code1].get(code2, dict())
            current_region_tree = current_region_tree[code1]

    # Keep only maximal chains: those whose last code is still a leaf.
    codes_list = [codes1 for codes1 in codes_list if codes1[-1] in enders]
    return [
        [{"id": code1, "name": self.code_to_name__dict[code1]}
         for code1 in codes1 if code1 in self.code_to_name__dict]
        for codes1 in codes_list
    ]
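# A self-contained sketch of the pruning above, using made-up region codes.
# The `region_tree` bookkeeping is omitted because only `enders` feeds the
# final filter: a path that is a prefix of a longer path loses its last code
# from `enders` when the longer chain walks through it.
def _keep_maximal_paths(codes_list):
    codes_list = sorted(set(codes_list), key=len)
    enders = set()
    for codes1 in codes_list:
        for idx1, code1 in enumerate(codes1):
            if idx1 + 1 == len(codes1):
                enders.add(code1)
                break
            if code1 in enders:
                enders.remove(code1)
    return [codes1 for codes1 in codes_list if codes1[-1] in enders]

# _keep_maximal_paths([(1,), (1, 2), (1, 2, 3), (7,)]) keeps (7,) and (1, 2, 3).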
def documents_with_segments(self):
    """ Pure word segmentation. """
    return ParallelData.process(
        self.model, 'dict',
        cache_filename=self.model.pickle_path('documents_with_segments'),
        item_func=lambda item1: Counter(jieba_parse(item1.item_content)),
    )
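# For reference, each cached item is just a bag of words. Assuming jieba_parse
# returns a plain token list (it is a project-local wrapper, so this is an
# assumption):
from collections import Counter
tokens = [u"北京", u"大学", u"北京"]  # stand-in for jieba_parse(item1.item_content)
assert Counter(tokens) == Counter({u"北京": 2, u"大学": 1})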
def load__region_unit_data():
    # `self` is used without being a parameter, so this is presumably a
    # closure defined inside a method.
    data = list(self.region_unit_data__func())
    assert len(data) > 0
    assert isinstance(data[0], dict)
    assert "id" in data[0]
    assert "name" in data[0]

    feature_to_unit_ids__dict = defaultdict(list)
    id_to_name__dict = dict()
    for line1 in process_notifier(data):
        id_to_name__dict[line1['id']] = line1['name']
        features = jieba_parse(line1['name'])  # TODO strip special characters such as "-"
        source1_region = ru_regexp.separate_regiones(line1['name'])[0]
        # Enrich the feature set with every region id/name along the matched lines.
        for kv in itertools.chain(*self.get_region_lines(source1_region)):
            features.extend(kv.values())
        for feature1 in set(features):
            feature_to_unit_ids__dict[feature1].append(line1['id'])
    return [id_to_name__dict, dict(feature_to_unit_ids__dict)]
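# A minimal sketch (fake rows, hand-picked features) of the inverted index
# built above: every feature votes for all unit ids whose name produced it.
from collections import defaultdict

rows = [{"id": 1, "name": u"中山大学"}, {"id": 2, "name": u"中山医院"}]
index = defaultdict(list)
for row in rows:
    for feature in set([u"中山", row["name"]]):  # stand-in for jieba_parse + region lines
        index[feature].append(row["id"])
# index[u"中山"] == [1, 2]: a shared token points at both units.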
def get_units_sorted(self, source1_unit):
    """
    1. Mainly to tolerate unit addresses that differ slightly from the real ones.
    2. If nothing in the database matches at all, fall back to the parsed input itself.
    """
    candidate_unit_ids = [unit_id
                          for feature1 in jieba_parse(source1_unit)
                          for unit_id in self.feature_to_unit_ids__dict.get(feature1, [])
                          if unit_id]
    # Each matched feature is one vote; rank candidate units by vote count.
    sorted_unit_ids = Counter(candidate_unit_ids).most_common()
    data = [{'id': unit_id,
             'name': self.region_unit_id_to_name__dict[unit_id],
             'rate': freq,
             } for unit_id, freq in sorted_unit_ids]

    if data:
        source2 = data[0]['name']
        source2 = ru_regexp.separate_regiones(source2)[1]
        rate = String.calculate_text_similarity(source1_unit, source2)['similarity_rate']
        # If even the best candidate is too dissimilar, surface the raw input first.
        if rate < 0.8:
            data.insert(0, {'id': None, 'name': source1_unit, 'rate': None})
    return data
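# How the voting above behaves on the toy index from the previous sketch:
# each matched feature casts one vote per unit id, and most_common() ranks
# units by vote count.
from collections import Counter

votes = [1, 2, 1]  # unit ids pulled in by matched features
assert Counter(votes).most_common() == [(1, 2), (2, 1)]  # unit 1 wins with 2 votes
# When the winner's name is still < 0.8 similar to the query, the raw query is
# prepended with id=None so callers can tell "no confident match" apart.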
def tags_model__extract_features(cls, item, column=None):
    # Feature extraction for the tags model is plain segmentation of the content.
    return jieba_parse(item.item_content)