Example #1
0
    def __init__(self, features_dir, detdup_data_model):
        """
        Initialise the detector state.

        features_dir      -- directory used to build the path of the
                             ``process.log`` file handed to BufferLogger.
        detdup_data_model -- data model object, kept as ``self.model``.
        """
        # Caller-supplied inputs.
        self.features_dir = features_dir
        self.model = detdup_data_model

        # Feature plugins (a DefaultFeatures instance is always present)
        # and the lookup table, which starts empty.
        self.features = [DefaultFeatures()]
        self.features_map = {}

        # Index 0 picks 'memory'; 'disk' is the other listed option.
        storage_choices = ('memory', 'disk')
        self.storage_type = storage_choices[0]

        # Logging switches and the buffered logger itself.
        self.is_logger = True
        self.is_inspect_detail = False
        log_path = os.path.join(self.features_dir, 'process.log')
        self.buffer_logger = BufferLogger(log_path)

        # Result container and progress counters.
        self.result = ItemsGroupAndIndexes()
        self.count = 0
        self.candidate_dup_count = None
Example #2
0
class DetDupCore(object):
    """
    Detect duplicated items, use decision tree.

    Usage:
    -----------
    """

    similarity_rate = 0.90

    def __init__(self, features_dir, detdup_data_model):
        self.features_dir          = features_dir

        self.model                 = detdup_data_model

        self.features              = [DefaultFeatures()]
        self.features_map          = dict()

        self.storage_type          = ['memory', 'disk'][0]

        self.is_logger             = True
        self.is_inspect_detail     = False
        self.buffer_logger         = BufferLogger(os.path.join(self.features_dir, 'process.log'))

        self.result                = ItemsGroupAndIndexes()
        self.count                 = 0

        self.candidate_dup_count   = None

    def select_feature(self, item1):
        f1 = item1.typename
        if not isinstance(f1, str) and not isinstance(f1, unicode): f1 = f1()
        return self.features_map[f1].insert_item(item1)

    def feeded(self):
        for feature1 in self.features:
            # 这个Feature是否有效
            if not feature1.link_to_detdup:
                continue
            # 之前已经导出数据库啦?!
            if os.path.exists(feature1.sqlite3db_path()):
                return True
        return False

    def load_features_from_db(self):
        for feature1 in self.features: feature1.load_features_tree()

    def dump_features_from_memory(self):
        for feature1 in self.features: feature1.dump_features_tree()

    def feed_items(self, obj, persist=True):
        """ Feed items to features """
        # 1. insert it into memory
        [self.select_feature(item1).feed_item() for item1 in process_notifier(obj)]
        # 2. backup into files fully!
        if persist:
            self.dump_features_from_memory()
        return self

    def plug_features(self, features1):
        """
        1. Plug features, and bind typename to classify items
        2. init features tree, memory or disk
        """
        if not isinstance(features1, list): features1 = [features1]
        self.features.extend(features1)
        for f1 in self.features:
            f1.link_to_detdup = self
            f1.build_features_tree()

        for f1 in self.features:
            self.features_map[f1.typename] = f1
        return self

    time_sql = 0
    time_calculate_text_similarity = 0
    time_fetch_content = 0

    def detect_duplicated_items(self, item1):
        feature1 = self.select_feature(item1)
        speed   = Speed()

        t1 = datetime.now()
        item_ids = feature1.fetch_matched_item_ids()
        t2 = datetime.now(); self.time_sql += (t2 - t1).total_seconds();

        # 4. 看看题目相似度
        # 相似度得大于 95%
        new_ids = list()
        for item_id1 in item_ids:
            # 2. 排除自己
            if item_id1 == unicode(item1.item_id): continue

            if item_id1 not in self.model:
                # 删除不一致数据, 以在self.model里为准
                feature1.delete_item_ids([item_id1])
                continue

            t11 = datetime.now()
            content1 = self.model[item_id1].item_content
            t12 = datetime.now(); self.time_fetch_content += (t12 - t11).total_seconds();

            t11 = datetime.now()
            res1 = String.calculate_text_similarity(item1.item_content,
                            content1,
                            inspect=True,
                            skip_special_chars=True,
                            similar_rate_baseline=self.similarity_rate)
            t12 = datetime.now(); self.time_calculate_text_similarity += (t12 - t11).total_seconds();

            if res1['similarity_rate'] > self.similarity_rate:
                new_ids.append(item_id1)
                self.buffer_logger.append(res1['info'])
                self.buffer_logger.inspect()
        print "字符串相似度 [前]", (len(item_ids) - 1), "个,[后]", len(new_ids), "个"

        item_ids = new_ids

        # 如果要排除已处理过为排重的
        speed.tick().inspect()

        print "self.time_sql", self.time_sql
        print "self.time_calculate_text_similarity", self.time_calculate_text_similarity
        print "self.time_fetch_content", self.time_fetch_content

        return item_ids

    def detect_duplicated_items_verbose(self, item_id1, verbose=False):
        self.count += 1
        print "\n"*5, "从", self.candidate_dup_count, "个候选题目中 排重第", self.count, "个题目。", item_id1

        # 如果结果已经计算出来
        if self.result.exists(item_id1):
            return self.result.find(item_id1)

        self.buffer_logger.append("-"*80)
        self.buffer_logger.append("要处理的记录")

        item1 = self.model[item_id1]
        if verbose: item1.inspect()

        self.buffer_logger.append("")
        item_ids = self.detect_duplicated_items(item1)
        self.buffer_logger.append("疑似和", item1.item_id, "重复的条目有", len(item_ids), "个")
        for item_id1 in item_ids:
            if verbose: self.model[item_id1].inspect()
        self.buffer_logger.append("")

        # 输出日志
        if (len(item_ids) > 0) and self.is_logger:
            self.buffer_logger.inspect()
        else:
            self.buffer_logger.clear()

        item_ids.append(unicode(item1.item_id))

        # 有重复结果,就存储一下
        if len(item_ids) > 1:
            self.result.add([i1 for i1 in item_ids])

        return item_ids