def init__after(self, record):
    """Post-init hook: normalize the item text and precompute character stats."""
    # Convert every full-width character to its half-width equivalent.
    self.item_content = UnicodeUtils.stringQ2B(self.item_content)

    # Common extract-data part: character-frequency statistics over the
    # normalized text (threshold callback keeps 75% of the length).
    freq_info = StringUtils.frequence_chars_info(
        self.item_content, lambda total: total * 0.75)
    self.uniq_chars__len = freq_info['uniq_chars__len']
    self.sorted_freq_chars = freq_info['sorted_freq_chars']
    self.sqrt_chars__len = int(round(math.sqrt(len(self.item_content))))
def init__after(self, record):
    """Hook run after init: fold full-width chars, then derive char metrics."""
    # Full-width -> half-width normalization of the item text.
    content = UnicodeUtils.stringQ2B(self.item_content)
    self.item_content = content

    # Common extract-data part: frequency info over the normalized text.
    stats = StringUtils.frequence_chars_info(content, lambda n1: n1 * 0.75)
    self.uniq_chars__len = stats['uniq_chars__len']
    self.sorted_freq_chars = stats['sorted_freq_chars']
    self.sqrt_chars__len = int(round(math.sqrt(len(content))))
def import_from_file(file1):
    # NOTE: import_from_file does not support `depth` yet.
    # NOTE(review): the body references `self`, `line_split` and `item_split`
    # which are not parameters — this looks like a closure defined inside a
    # method, capturing them from the enclosing scope; confirm at call site.
    # Each line of the file describes one root-to-leaf path; nodes on a line
    # are split by `item_split`, lines by `line_split`.
    for line in UnicodeUtils.read(file1).strip().split(line_split):
        line = line.strip()
        # Lazily create the dict hanging off the tree's root node.
        if TMCTree.root_node not in self:
            self[TMCTree.root_node] = dict()
        current_dict = self[TMCTree.root_node]
        parent_node = TMCTree.root_node
        # Walk the path, inserting each node under its predecessor and
        # recording name->nodes and child-name->parent indexes as we go.
        for current_node in line.split(item_split):
            current_node = Node({"name": current_node})
            self.name_to_nodes[current_node.name].add(current_node)
            if current_node not in current_dict:
                current_dict[current_node] = dict()
            self.child_name_to_parent_relation_dict[current_node.name].add(parent_node)
            # Descend one level for the next node on this line.
            current_dict = current_dict[current_node]
            parent_node = current_node
def import_from_file(file1):
    # NOTE: import_from_file does not support `depth` yet.
    # NOTE(review): `self`, `line_split` and `item_split` are used but not
    # defined here — presumably captured from an enclosing method's scope
    # (this reads like a nested helper); verify against the surrounding code.
    # Each input line is one root-to-leaf path of node names.
    for line in UnicodeUtils.read(file1).strip().split(line_split):
        line = line.strip()
        # Ensure the root's child dict exists before inserting.
        if TMCTree.root_node not in self:
            self[TMCTree.root_node] = dict()
        current_dict = self[TMCTree.root_node]
        parent_node = TMCTree.root_node
        # Insert every node of the path, maintaining the two lookup indexes.
        for current_node in line.split(item_split):
            current_node = Node({"name": current_node})
            self.name_to_nodes[current_node.name].add(current_node)
            if current_node not in current_dict:
                current_dict[current_node] = dict()
            self.child_name_to_parent_relation_dict[
                current_node.name].add(parent_node)
            # Step down to this node's own child dict for the next item.
            current_dict = current_dict[current_node]
            parent_node = current_node
def __init__(self, source): if "\n" not in source: source = UnicodeUtils.read(source) # 有换行 表示已经读进来了 self.result = dict() current_kp = current_features = None for num, line in enumerate(source.split("\n")): line = line.strip() line = re.sub("'", "\"", line) line = re.sub("u\"", "\"", line) try: current_kp, current_features = self.parse(line, current_kp, current_features) except: print "[num]", num + 1, "[line]", line raise Exception("parse error ...") if current_kp and current_features: self.result[current_kp] = current_features
def load_data_from_input(self, input1):
    """Load a name -> score mapping from *input1* and return it as a dict.

    *input1* may be a dict (used directly) or a file path whose content is
    either JSON or "name,score" CSV lines.  The returned defaultdict falls
    back to the average of the loaded scores for unknown keys (0.0 when no
    data was loaded), so lookups never raise KeyError.
    """
    def wrap(data):
        # Default for missing keys is the mean score; guard the empty
        # mapping, which previously raised ZeroDivisionError.
        avg = sum(data.values()) / float(len(data)) if data else 0.0
        return defaultdict(lambda: avg, data)

    if isinstance(input1, dict):
        return wrap(input1)
    if not os.path.exists(input1):
        # Missing file: behave as "no data" — every lookup yields 0.0.
        return defaultdict(float)

    content = UnicodeUtils.read(input1).strip()
    try:
        data = json.loads(content)
    except ValueError:
        # Not JSON (narrowed from a bare `except:`): fall back to parsing
        # "name,score" CSV lines.
        data = dict()
        for line in content.split("\n"):
            result = line.split(',')
            data[result[0]] = float(result[1].strip())
    return wrap(data)
def inspect(self): for k1, c1 in self: print UnicodeUtils.rjust(k1, self.max_strlen), ":", c1, "\n"
def stop_words_set(self):
    """Collect the stripped stop words from all configured stop-word files."""
    words = set()
    for stop_file in self.classify.stop_words_files:
        for raw_word in UnicodeUtils.read(stop_file).split("\n"):
            words.add(raw_word.strip())
    return words
def stop_words_set(self):
    """Return the union of stripped stop words across every stop-word file."""
    return {
        word.strip()
        for path in self.classify.stop_words_files
        for word in UnicodeUtils.read(path).split("\n")
    }