Code example #1
    def region_unit_data(self):
        """
        features can be of any type, including unicode and int.

        OPTIMIZE: add more features.
        """
        def load__region_unit_data():
            data = list(self.region_unit_data__func())
            assert len(data) > 0
            assert isinstance(data[0], dict)
            assert "id"          in data[0]
            assert "name"        in data[0]

            feature_to_unit_ids__dict = defaultdict(list)
            id_to_name__dict = dict()
            for line1 in process_notifier(data):
                id_to_name__dict[line1['id']] = line1['name']
                features = jieba_parse(line1['name'])
                # TODO: remove special characters, e.g. "-"
                source1_region = ru_regexp.separate_regiones(line1['name'])[0]

                for kv in itertools.chain(*self.get_region_lines(source1_region)):
                    features.extend(kv.values())
                for feature1 in set(features):
                    feature_to_unit_ids__dict[feature1].append(line1['id'])
            return [id_to_name__dict, dict(feature_to_unit_ids__dict)]
        return cpickle_cache(self.cache_dir + '/region_unit_data.cPickle', load__region_unit_data)
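Every example on this page follows the same pattern: an expensive loader function is wrapped in cpickle_cache(path, func), which apparently returns the unpickled object stored at path if the cache file already exists, and otherwise calls func, pickles its return value to path, and returns it. A minimal sketch of that assumed behavior (the real implementation in model_cache may differ):

import os
import cPickle


def cpickle_cache(cache_path, load_func):
    # If the pickle file already exists, reuse the cached object ...
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return cPickle.load(f)
    # ... otherwise compute it once, persist it, and return it.
    obj = load_func()
    with open(cache_path, 'wb') as f:
        cPickle.dump(obj, f, cPickle.HIGHEST_PROTOCOL)
    return obj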
Code example #2
File: entropy.py  Project: mvj3/textmulclassify
    def process(cls, d1, cache_dir):
        """ d1 is {"feature1":count1, "feature2":count2, ... } """

        def func():
            # 1. fetch all features
            uniq_keys = set([])
            for item_id1, item1 in process_notifier(d1):
                uniq_keys.update(item1.iterkeys())
            uniq_keys = list(uniq_keys)

            # 2. feature1 => {doc1: count1, doc2: count2, ...}
            value_cache = defaultdict(dict)
            for item_id1, item1 in process_notifier(d1):
                for k1, c1 in item1.iteritems():
                    value_cache[k1][item_id1] = c1

            # 3. calculate each feature's entropy
            entropy_cache = dict()
            total_len = len(d1)
            for k1 in process_notifier(uniq_keys):
                exist_values = value_cache[k1].values()
                # pad with zeros for the documents that do not contain this feature
                total_values = exist_values + [0] * (total_len - len(exist_values))

                entropy_cache[k1] = scipy_entropy(total_values)

            return entropy_cache

        return cpickle_cache(cache_dir + '/entropy.cPickle', func)
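In step 3 above, scipy_entropy is assumed to be scipy.stats.entropy, which normalizes the per-document counts into a distribution: a feature spread evenly across documents gets a high entropy, while a feature concentrated in a few documents gets a low one. A quick illustration under that assumption:

from scipy.stats import entropy as scipy_entropy

# A feature seen in 2 of 5 documents, padded with zeros for the other 3.
print scipy_entropy([3, 1, 0, 0, 0])   # ~0.56: concentrated, low entropy
print scipy_entropy([1, 1, 1, 1, 1])   # ~1.61 (ln 5): evenly spread, high entropy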
Code example #3
    def nested_region_data(self):
        def load__nested_region_dict():
            # data format
            # [ {"name":"浙江", "code":31, "parent_code":1}, ... ]
            data = list(self.nested_region_data__func())
            assert len(data) > 0
            assert isinstance(data[0], dict)
            assert "name"        in data[0]
            assert "code"        in data[0]
            assert "parent_code" in data[0]

            print "load name_to_codes__dict ..."
            name_to_codes__dict = defaultdict(list)
            for d1 in process_notifier(data):
                name_to_codes__dict[ru_regexp.strip_regexp.sub("", d1['name'])].append(d1['code'])
            name_to_codes__dict = dict(name_to_codes__dict)

            print "load code_to_name__dict ..."
            code_to_name__dict = { d1['code'] : d1['name'] for d1 in process_notifier(data) }

            print "load codes_relations ..."
            codes_relations = { d1['code'] : d1['parent_code'] for d1 in process_notifier(data) }

            return [name_to_codes__dict, code_to_name__dict, codes_relations]

        name_to_codes__dict, code_to_name__dict, codes_relations = \
                cpickle_cache(self.cache_dir + '/nested_region_dict.cPickle', load__nested_region_dict)
        return [name_to_codes__dict, code_to_name__dict, codes_relations]
Code example #4
File: classify.py  Project: mvj3/textmulclassify
    def tags_tree(self):
        """ load_features_with_weight """

        # modify tags_tree
        def func():
            if 'manual_kps' in self.opts:
                # Manually labeled data, e.g. 吕文星's short rule combinations for
                # middle/high school knowledge points.
                data = {
                    kp1: Counter(features)
                    for kp1, features in ReadManualKps.process(
                        self.opts['manual_kps']).iteritems()
                }
            else:
                # Machine-trained
                data = self.model.tags_tree.load__tag_to_words_count__dict(
                    self.model, self.documents_with_features)

            print "load_words_with_weight ..."
            self.model.tags_tree.load_features_with_weight(data)
            self.model.tags_tree.classify = None
            return self.model.tags_tree

        o1 = cpickle_cache(self.model.pickle_path('tags_tree'), func)
        o1.classify = self  # Fix cPickle.UnpickleableError
        return o1
Code example #5
File: model.py  Project: 17zuoye/textmulclassify
    def all_item_ids(cls):
        def filtered_item_ids_by_has_tags():
            print "[load from original data]\n"
            cls.pull_data()
            # Filter out items that have no tags
            return [item_id1 for item_id1, item1 in process_notifier(cls)
                    if cls.tags_model__extract_tags(item1)]

        print "[load tags_items_ids]\n"
        return cpickle_cache(cls.pickle_path('tags_items_ids'), filtered_item_ids_by_has_tags)
Code example #6
File: model.py  Project: 17zuoye/textmulclassify
    def test_item_ids(cls):
        """ After the ModelCache data is ready, and before training and evaluation, test_item_ids should be selected first. """
        def func():
            print "[select test item ids]\n"
            all_item_ids = cls.all_item_ids()
            random.shuffle(all_item_ids)
            if cls.classify.max_train_data_count == 0:
                return []  # compact
            return all_item_ids[-cls.classify.max_train_data_count:]
        ids = cpickle_cache(cls.pickle_path('test_item_ids'), func)
        return set(ids)
Code example #7
File: model.py  Project: mvj3/textmulclassify
    def test_item_ids(cls):
        """ 在ModelCache数据准备好之后,在训练和评估之前,应该先选出test_item_ids. """
        def func():
            print "[select test item ids]\n"
            all_item_ids = cls.all_item_ids()
            random.shuffle(all_item_ids)
            if cls.classify.max_train_data_count == 0:
                return []  # compact
            return all_item_ids[-cls.classify.max_train_data_count:]

        ids = cpickle_cache(cls.pickle_path('test_item_ids'), func)
        return set(ids)
Code example #8
    def setup(self):
        self.debug = False

        """ 主要数据结构为 total_tag_to_features__dict , 计算item与tags之间得相似度排序。 """
        self.total_tag_to_features__dict = cpickle_cache(self.classify.cpath('text_engine'),
                                                         self.cache__total_tag_to_features__dict)

        # remove features in stop_unicode_set
        for tag1, features_dict1 in self.total_tag_to_features__dict.iteritems():
            self.filter_by_stop_list(features_dict1)

        self.debug = True
Code example #9
    def setup(self):
        self.debug = False
        """ 主要数据结构为 total_tag_to_features__dict , 计算item与tags之间得相似度排序。 """
        self.total_tag_to_features__dict = cpickle_cache(
            self.classify.cpath('text_engine'),
            self.cache__total_tag_to_features__dict)

        # remove features in stop_unicode_set
        for tag1, features_dict1 in self.total_tag_to_features__dict.iteritems():
            self.filter_by_stop_list(features_dict1)

        self.debug = True
Code example #10
File: model.py  Project: mvj3/textmulclassify
    def all_item_ids(cls):
        def filtered_item_ids_by_has_tags():
            print "[load from original data]\n"
            cls.pull_data()
                # Filter out items that have no tags
            return [
                item_id1 for item_id1, item1 in process_notifier(cls)
                if cls.tags_model__extract_tags(item1)
            ]

        print "[load tags_items_ids]\n"
        return cpickle_cache(cls.pickle_path('tags_items_ids'),
                             filtered_item_ids_by_has_tags)
Code example #11
File: parallel.py  Project: 17zuoye/model_cache
    def cache__cpu(cpu_offset):
        fq = FileQueue(self.scope_count, self.chunk_size, self.process_count, cpu_offset,
                       lambda chunk1: PickleFile(chunk1, io_prefix, cpu_prefix))
        while_step = 0
        while fq.has_todo():
            while_step += 1
            pn("[%s cache__cpu:%s] todo_list := %s, while_step := %s" % (self.cache_basename, cpu_offset, fq.todo_list, while_step))
            for f1 in fq.todo_list:
                if not f1.is_exists('io'):
                    continue
                if f1.is_exists('cpu'):
                    f1.done = True
                    continue
                try:
                    io_items = cpickle_cache(f1.io_name(), lambda: not_exist)
                    cpu_items = [[i1[0], self.item_func(i1[1])] for i1 in io_items]
                    cpickle_cache(f1.cpu_name(), lambda: cpu_items)
                    f1.done = True
                except:  # the IO process has not finished writing this file yet
                    print "Maybe IO error happened ..."
                    continue
            time.sleep(1)
Code example #12
    def idf_cache(self):
        def func():
            feature_in_doc_to_count = defaultdict(int)
            for item_id1, features in process_notifier(self.documents):
                for feature1 in features.iterkeys():
                    feature_in_doc_to_count[feature1] += 1

            idf_result = IdfResult()
            all_num = float(len(self.documents))

            for feature1, count1 in feature_in_doc_to_count.iteritems():
                idf_result[feature1] = math.log(all_num / count1)

            return idf_result

        return cpickle_cache(self.cache_dir + '/idf.cPickle', func)
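The loop above computes the standard inverse document frequency, idf(feature) = log(N / df), where N is the number of documents and df is the number of documents containing the feature. A worked example with hypothetical counts:

import math

all_num = 1000.0   # total number of documents (hypothetical)
count1 = 10        # documents containing the feature (hypothetical)
print math.log(all_num / count1)   # ~4.61: the rarer the feature, the larger its weight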
Code example #13
File: classify.py  Project: 17zuoye/textmulclassify
    def tags_tree(self):
        """ load_features_with_weight """
        # modify tags_tree
        def func():
            if 'manual_kps' in self.opts:
                # Manually labeled data, e.g. 吕文星's short rule combinations for
                # middle/high school knowledge points.
                data = {kp1: Counter(features) for kp1, features in ReadManualKps.process(self.opts['manual_kps']).iteritems()}
            else:
                # Machine-trained
                data = self.model.tags_tree.load__tag_to_words_count__dict(self.model, self.documents_with_features)

            print "load_words_with_weight ..."
            self.model.tags_tree.load_features_with_weight(data)
            self.model.tags_tree.classify = None
            return self.model.tags_tree

        o1 = cpickle_cache(self.model.pickle_path('tags_tree'), func)
        o1.classify = self  # Fix cPickle.UnpickleableError
        return o1
Code example #14
File: parallel.py  Project: 17zuoye/model_cache
    def persistent(filename, current_items):
        cpickle_cache(filename, lambda: current_items)
        return []
Code example #15
File: parallel.py  Project: 17zuoye/model_cache
    def recache(self):
        # Compatible with the shelve module, which generates three files with the "dat", "dir", "bak" suffixes
        io_prefix = self.cache_basename + '.io.'
        io_regexp = io_prefix + '[0-9]*'
        cpu_prefix = self.cache_basename + '.cpu.'
        cpu_regexp = cpu_prefix + '[0-9]*'

        os.system("cd %s" % os.path.dirname(self.cache_basename))

        # A.1. Cache IO
        def cache__io():
            self.datasource.reconnect_after_fork()

            pn("[%s cache__io] begin total ..." % self.cache_basename)

            def persistent(filename, current_items):
                cpickle_cache(filename, lambda: current_items)
                return []

            # A.1.1 If everything is already cached, do nothing
            if (len(self.datasource) / self.chunk_size) + 1 == len(glob.glob(io_regexp)):
                pn("[%s cache__io] end total ..." % self.cache_basename)
                return False

            # A.1.2 Otherwise, reprocess everything from scratch
            current_items = []
            idx = 0
            for k1, v1 in self.datasource:
                current_items.append([k1, v1])
                if len(current_items) >= self.chunk_size:
                    cache_path = io_prefix + unicode(idx)
                    os.system("rm -f %s" % cache_path)
                    current_items = persistent(cache_path, current_items)
                    idx += self.chunk_size
            if current_items:
                persistent(io_prefix + unicode(idx), current_items)
            pn("[%s cache__io] end total ..." % self.cache_basename)
        multiprocessing.Process(target=cache__io).start()

        # A.2. Cache CPU results on top of the IO cache
        def cache__cpu(cpu_offset):
            fq = FileQueue(self.scope_count, self.chunk_size, self.process_count, cpu_offset,
                           lambda chunk1: PickleFile(chunk1, io_prefix, cpu_prefix))
            while_step = 0
            while fq.has_todo():
                while_step += 1
                pn("[%s cache__cpu:%s] todo_list := %s, while_step := %s" % (self.cache_basename, cpu_offset, fq.todo_list, while_step))
                for f1 in fq.todo_list:
                    if not f1.is_exists('io'):
                        continue
                    if f1.is_exists('cpu'):
                        f1.done = True
                        continue
                    try:
                        io_items = cpickle_cache(f1.io_name(), lambda: not_exist)
                        cpu_items = [[i1[0], self.item_func(i1[1])] for i1 in io_items]
                        cpickle_cache(f1.cpu_name(), lambda: cpu_items)
                        f1.done = True
                    except:  # the IO process has not finished writing this file yet
                        print "Maybe IO error happened ..."
                        continue
                time.sleep(1)
        for cpu_offset in xrange(self.process_count):
            multiprocessing.Process(target=cache__cpu, args=(cpu_offset,)).start()

        # B. Merge everything produced by the previous steps
        # Check whether extraction from the original data is finished.
        acm = ActiveChildrenManagement()
        while acm.still():
            time.sleep(acm.seconds)

        def write(tmp_items):
            if self.output_lambda:
                self.output_lambda([i1[1] for i1 in tmp_items])
            else:
                for item_id, item1 in process_notifier(tmp_items):
                    self.result[item_id] = item1
                self.result.sync()
            return []

        print "\n" * 5, "begin merge ..."
        tmp_items = []
        fs = sorted(glob.glob(cpu_regexp), key=lambda f1: int(f1.split("/")[-1].split(".")[-1]))
        for f1 in fs:
            chunk = cpickle_cache(f1, lambda: not_exist)
            tmp_items.extend(chunk)
            if len(tmp_items) >= self.merge_size:
                tmp_items = write(tmp_items)
        tmp_items = write(tmp_items)

        # update cache result len
        self.result_len = self.output_len_lambda()