コード例 #1
0
ファイル: base.py プロジェクト: 17zuoye/detdup
    def init__after(self, record):
        """Normalize ``item_content`` and cache character statistics on self.

        ``record`` is accepted but not used by this method.

        Sets ``item_content`` (normalized), ``uniq_chars__len``,
        ``sorted_freq_chars`` and ``sqrt_chars__len``.
        """
        # Convert all full-width characters to their half-width forms.
        self.item_content = UnicodeUtils.stringQ2B(self.item_content)
        content = self.item_content

        # Shared extraction step: character-frequency information. The
        # callable (length * 0.75) is handed to the helper as-is; how it is
        # applied is defined by StringUtils.frequence_chars_info.
        char_stats = StringUtils.frequence_chars_info(
            content, lambda len1: len1 * 0.75)
        self.uniq_chars__len = char_stats['uniq_chars__len']
        self.sorted_freq_chars = char_stats['sorted_freq_chars']

        # Square root of the content length, rounded to the nearest integer.
        length_root = math.sqrt(len(content))
        self.sqrt_chars__len = int(round(length_root))
コード例 #2
0
ファイル: base.py プロジェクト: yonglehou/detdup
    def init__after(self, record):
        """Normalize ``item_content`` and derive per-item character features.

        ``record`` is part of the signature but is not read here.

        Attributes written: ``item_content``, ``uniq_chars__len``,
        ``sorted_freq_chars``, ``sqrt_chars__len``.
        """
        # Full-width characters are all converted to half-width.
        self.item_content = UnicodeUtils.stringQ2B(self.item_content)
        text = self.item_content

        # Common data-extraction part: the helper receives the text and a
        # callable that scales a length by 0.75 (interpreted by the helper).
        freq_info = StringUtils.frequence_chars_info(
            text, lambda len1: len1 * 0.75)
        self.uniq_chars__len = freq_info['uniq_chars__len']
        self.sorted_freq_chars = freq_info['sorted_freq_chars']

        # Rounded integer square root of the text length.
        self.sqrt_chars__len = int(round(math.sqrt(len(text))))
コード例 #3
0
ファイル: tree.py プロジェクト: 17zuoye/textmulclassify
        def import_from_file(file1):
            """Load node paths from ``file1`` into the enclosing tree (``self``).

            The file content is split into lines on ``line_split``; each line
            is split on ``item_split`` into node names forming one path below
            ``TMCTree.root_node``. Nested dicts are created on demand, and
            every node is registered in ``self.name_to_nodes`` and
            ``self.child_name_to_parent_relation_dict``.
            """
            # NOTE: import_from_file does not support depth yet.
            root = TMCTree.root_node
            text = UnicodeUtils.read(file1).strip()
            for raw_line in text.split(line_split):
                path_line = raw_line.strip()
                if root not in self:
                    self[root] = dict()
                # Each path restarts from the root level.
                subtree = self[root]
                parent = root
                for name in path_line.split(item_split):
                    node = Node({"name": name})
                    self.name_to_nodes[node.name].add(node)
                    if node not in subtree:
                        subtree[node] = dict()

                    self.child_name_to_parent_relation_dict[node.name].add(parent)
                    # Descend one level; this node is the next one's parent.
                    subtree = subtree[node]
                    parent = node
コード例 #4
0
ファイル: tree.py プロジェクト: mvj3/textmulclassify
        def import_from_file(file1):
            """Populate the enclosing tree (``self``) from the paths in ``file1``.

            Content is read with ``UnicodeUtils.read``, split into lines on
            ``line_split``, and each line into node names on ``item_split``.
            Every name becomes a ``Node`` nested one level below the previous
            one, starting under ``TMCTree.root_node``. The name-to-nodes and
            child-name-to-parent indexes on ``self`` are updated as well.
            """
            # NOTE: import_from_file does not support depth yet.
            root_node = TMCTree.root_node
            content = UnicodeUtils.read(file1).strip()
            for entry in content.split(line_split):
                entry = entry.strip()
                if root_node not in self:
                    self[root_node] = dict()
                # Walk down from the root for every line.
                level = self[root_node]
                parent_node = root_node
                for node_name in entry.split(item_split):
                    child = Node({"name": node_name})
                    self.name_to_nodes[child.name].add(child)
                    if child not in level:
                        level[child] = dict()

                    parents = self.child_name_to_parent_relation_dict[child.name]
                    parents.add(parent_node)
                    level = level[child]
                    parent_node = child
コード例 #5
0
    def __init__(self, source):
        """Parse ``source`` line by line and collect the results.

        ``source`` is either the text itself or a path to it: a string
        without any newline is treated as a path and read through
        ``UnicodeUtils.read``.

        Each stripped line has single quotes replaced by double quotes and
        every ``u"`` replaced by ``"`` (presumably to drop Python 2 unicode
        literal prefixes -- confirm against the input format). It is then
        passed to ``self.parse`` with the current key and features. Whenever
        both are truthy after a line, they are stored in ``self.result``.

        Raises:
            Exception: with message ``"parse error ..."`` when ``self.parse``
                fails on a line; the 1-based line number and the line text
                are printed first.
        """
        if "\n" not in source:
            # A newline means the content has already been read in;
            # without one, the argument is treated as a path.
            source = UnicodeUtils.read(source)

        self.result = dict()
        current_kp = current_features = None

        for num, line in enumerate(source.split("\n"), 1):
            line = line.strip()
            # Literal replacements; no regular expression is needed.
            line = line.replace("'", "\"").replace("u\"", "\"")

            try:
                current_kp, current_features = self.parse(
                    line, current_kp, current_features)
            except Exception:
                # Only real errors are reported as parse errors, so that
                # KeyboardInterrupt / SystemExit propagate unchanged.
                print("[num] %s [line] %s" % (num, line))
                raise Exception("parse error ...")

            if current_kp and current_features:
                self.result[current_kp] = current_features
コード例 #6
0
    def load_data_from_input(self, input1):
        """Load a key -> number mapping from a dict or from a file.

        ``input1`` is either a dict, which is used directly, or a path to a
        file holding a JSON object or, failing that, ``key,value`` lines.

        Returns a ``defaultdict`` with the loaded entries; missing keys
        default to the arithmetic mean of the loaded values. When the path
        does not exist or nothing was loaded, an empty
        ``defaultdict(float)`` is returned instead.
        """
        def wrap(data):
            # An empty mapping has no mean: use the same default as for a
            # missing file rather than dividing by zero.
            if not data:
                return defaultdict(float)
            avg = sum(data.values()) / float(len(data))
            return defaultdict(lambda: avg, data)

        if isinstance(input1, dict):
            return wrap(input1)

        if not os.path.exists(input1):
            return defaultdict(float)

        content = UnicodeUtils.read(input1).strip()
        try:
            data = json.loads(content)
        except ValueError:
            # Not valid JSON: fall back to "key,value" lines.
            data = dict()
            for line in content.split("\n"):
                if not line.strip():
                    # Blank lines carry no entry.
                    continue
                result = line.split(',')
                data[result[0]] = float(result[1].strip())

        return wrap(data)
コード例 #7
0
    def load_data_from_input(self, input1):
        """Load a key -> number mapping from a dict or from a file.

        ``input1`` is either a dict, which is used directly, or a path to a
        file holding a JSON object or, failing that, ``key,value`` lines.

        Returns a ``defaultdict`` with the loaded entries; missing keys
        default to the arithmetic mean of the loaded values. When the path
        does not exist or nothing was loaded, an empty
        ``defaultdict(float)`` is returned instead.
        """
        def wrap(data):
            # An empty mapping has no mean: use the same default as for a
            # missing file rather than dividing by zero.
            if not data:
                return defaultdict(float)
            avg = sum(data.values()) / float(len(data))
            return defaultdict(lambda: avg, data)

        if isinstance(input1, dict):
            return wrap(input1)

        if not os.path.exists(input1):
            return defaultdict(float)

        content = UnicodeUtils.read(input1).strip()
        try:
            data = json.loads(content)
        except ValueError:
            # Not valid JSON: fall back to "key,value" lines.
            data = dict()
            for line in content.split("\n"):
                if not line.strip():
                    # Blank lines carry no entry.
                    continue
                result = line.split(',')
                data[result[0]] = float(result[1].strip())

        return wrap(data)
コード例 #8
0
ファイル: model.py プロジェクト: mvj3/textmulclassify
 def inspect(self):
     for k1, c1 in self:
         print UnicodeUtils.rjust(k1, self.max_strlen), ":", c1, "\n"
コード例 #9
0
ファイル: model.py プロジェクト: 17zuoye/textmulclassify
 def inspect(self):
     for k1, c1 in self:
         print UnicodeUtils.rjust(k1, self.max_strlen), ":", c1, "\n"
コード例 #10
0
 def stop_words_set(self):
     """Return the set of stripped lines from all stop-word files.

     Every file in ``self.classify.stop_words_files`` is read with
     ``UnicodeUtils.read`` and split on newlines; each piece is stripped of
     surrounding whitespace before it is added to the set.
     """
     words = set()
     for path in self.classify.stop_words_files:
         for raw_word in UnicodeUtils.read(path).split("\n"):
             words.add(raw_word.strip())
     return words
コード例 #11
0
 def stop_words_set(self):
     """Collect the stop words listed in ``self.classify.stop_words_files``.

     Each file is read via ``UnicodeUtils.read`` and split on "\\n"; the
     whitespace-stripped entries of all files are merged into one set.
     """
     collected = set()
     for stop_file in self.classify.stop_words_files:
         entries = UnicodeUtils.read(stop_file).split("\n")
         collected.update(entry.strip() for entry in entries)
     return collected