Ejemplo n.º 1
0
    def generate_IDF_dic(self, col_name=None):
        """Build a per-character IDF-style weight table for one column.

        The sheet is loaded through ``ToolsBox.read_excel`` when
        ``self.input_sheet`` is empty.  Each row's value in ``col_name`` is
        passed through ``ToolsBox.clearStr`` and every character of the result
        is counted; each character's weight is ``log(total_rows / count)``.

        Args:
            col_name: Key of the column to analyse; falls back to
                ``self.col_name`` when ``None``.

        Returns:
            dict: character -> weight, ordered from the highest weight to the
            lowest (ties keep first-seen order).
        """
        if not self.input_sheet:
            self.input_sheet = ToolsBox.read_excel(self.file_name,
                                                   self.sheet_name)
        if col_name is None:
            col_name = self.col_name
        total_row = len(self.input_sheet)

        # Count characters.  The cleaned string is used locally and never
        # written back, so the rows in self.input_sheet stay untouched without
        # having to deep-copy every row.
        # NOTE(review): this counts every occurrence, not one per row, so a
        # character repeated inside a single value can push the count above
        # total_row and make its weight negative - confirm this is intended.
        count_dict = {}
        for item in self.input_sheet:
            for char in ToolsBox.clearStr(item[col_name]):
                count_dict[char] = count_dict.get(char, 0) + 1

        # Turn the counts into IDF values.
        idf_dict = {
            char: math.log(total_row / occurrences)
            for char, occurrences in count_dict.items()
        }

        # Sort by weight, largest first.
        return dict(
            sorted(idf_dict.items(), key=lambda pair: pair[1], reverse=True))
Ejemplo n.º 2
0
    def deduplicate(self,
                    output_path="C:\\Users\\15007\\Desktop\\去重结果.xlsx"):
        """Merge near-duplicate community names and save the reduced list.

        Rows of ``self.input_sheet`` (loaded through ``ToolsBox.read_excel``
        when empty) are processed in order.  The name in ``self.col_name`` is
        compared, via ``self.get_Comprehensive_similar``, with every alias of
        every record kept so far.  Among the aliases whose similarity reaches
        ``self.valve`` the best record is chosen: higher similarity wins, and
        on an exact tie a candidate alias longer than the current best
        record's primary name wins.  A matched row's name is appended to the
        ``'alias'`` field (``;``-separated) of the kept record; an unmatched
        row is copied into the kept list with its own name as first alias.

        Side effect: every kept record that was ever selected as best match
        carries a ``'similar'`` key holding the last similarity stored on it,
        and that key is part of what is handed to ``ToolsBox.saveExcel``.

        Args:
            output_path: Destination passed to ``ToolsBox.saveExcel``.
                Defaults to the path that used to be hard-coded here.
        """
        if not self.input_sheet:
            self.input_sheet = ToolsBox.read_excel(self.file_name,
                                                   self.sheet_name)
        col = self.col_name
        reduce_list = []  # records kept after de-duplication

        for count, row in enumerate(self.input_sheet, start=1):
            current_name = row[col]
            print("第%d个小区:%s(已更新%d个小区)" %
                  (count, current_name, len(reduce_list)))

            best = {}  # most similar kept record found for this row
            for item in reduce_list:
                for name in item['alias'].split(";"):
                    similar = self.get_Comprehensive_similar(
                        current_name, name)
                    if similar >= self.valve:
                        if not best:
                            # First alias that reaches the threshold.
                            best = item
                            best['similar'] = similar
                            print("@@@@@@@@@@@@@@@@@@@%s与%s相似度为%f" %
                                  (current_name, best[col], best['similar']))
                        elif best['similar'] < similar:
                            # Strictly better match replaces the current one.
                            print(
                                ">>>>>>>>>>>>>%s与%s原有相似度为%f,现在与%s相似度为%f" %
                                (current_name, best[col], best['similar'],
                                 name, similar))
                            best = item
                            best['similar'] = similar
                            print(
                                ">>>>>>>>>>>>>>>>>>>>>>>>%s现在最相似小区为%s,相似度为%f"
                                % (current_name, best[col], best['similar']))
                        elif best['similar'] == similar:
                            # Tie: prefer the longer alias, measured against
                            # the current best record's primary name.
                            if len(name) > len(best[col]):
                                print(
                                    "===========%s与%s原有相似度为%f,现在与%s相似度为%f"
                                    % (current_name, best[col],
                                       best['similar'], name, similar))
                                best = item
                                best['similar'] = similar
                                print(
                                    "==========================%s现在最相似小区为%s,相似度为%f"
                                    % (current_name, best[col],
                                       best['similar']))
                            elif len(name) == len(best[col]):
                                # Same similarity and same length: keep the
                                # current best, only report the ambiguity.
                                print(
                                    "???????????????%s与%s原有相似度为%f,现在与%s相似度为%f"
                                    % (current_name, best[col],
                                       best['similar'], name, similar))
                                ToolsBox.printDic(best)

            if best:
                # A similar community exists: record this name as an alias of
                # the first kept record that carries the best match's name.
                for entry in reduce_list:
                    if entry[col] == best[col]:
                        entry['alias'] = entry['alias'] + ";" + current_name
                        break
            else:
                # No match: this is a new community.  Copy the row only now,
                # so the input sheet is never modified and rows that merely
                # become aliases are not copied at all.
                record = copy.deepcopy(row)
                record['alias'] = record[col]
                reduce_list.append(record)

        ToolsBox.saveExcel(output_path, reduce_list)