예제 #1
0
    def matchid(self, data):
        """Resolve the community id for one record.

        Candidates are looked up in ``self.comm_arr`` first; ``self.road_arr``
        is consulted only when the community lookup returns nothing. A single
        candidate's id (its third field) is used directly, several candidates
        are delegated to ``handle_match_mul``, and a record with no candidate
        at all is printed for inspection.

        :param data: the record to match; forwarded unchanged to
            ``get_id_from_arr`` and ``handle_match_mul``.
        :return: the resolved id, or 0 when nothing matched or when a
            duplicate-key (1062) database error interrupted the lookup.
        :raises MySQLdb.Error: any database error other than 1062.
        """
        # Default to the "no match" value so the final return is always bound,
        # even when the except branch below is taken.
        comm_id = 0
        getid = self.get_id_from_arr(data, self.comm_arr)
        try:
            if len(getid) == 1:  # exactly one community matched
                comm_id = getid[0][2]
            elif len(getid) > 1:  # several communities matched: disambiguate
                comm_id = self.handle_match_mul(data, getid)
            else:  # no community matched: fall back to matching by road
                getroad = self.get_id_from_arr(data, self.road_arr)
                if len(getroad) == 1:  # exactly one road matched
                    comm_id = getroad[0][2]
                elif len(getroad) > 1:  # several roads matched: disambiguate
                    comm_id = self.handle_match_mul(data, getroad)
                else:
                    # Neither lookup produced a candidate: report the record
                    # and keep the default id of 0.
                    print("---------未匹配成功---------")
                    ToolsBox.printDic(data)
        except MySQLdb.Error as e:
            # Only a duplicate-entry error (1062) is tolerated; anything else
            # is a real failure and must not be hidden from the caller.
            if not (e.args and e.args[0] == 1062):
                raise
            print(str(e) + " already have")

        return comm_id
예제 #2
0
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())

            each_data['from'] = "Danxia"

            each_data = self.pipe(each_data)  # 2016.6.4增加一个专门的数据处理

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data)

        return page_datas
        # return each_data
if __name__ == "__main__":
    # Manual smoke run: download one page, parse it, and dump every record.
    downloader = Downloader.Downloader()
    parser = DanxiaPage()

    page_url = 'https://danxia.com/house/all/PG2'
    request_headers = {
        "Referer": "https://danxia.com/house/all",
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }

    # download() yields (content, code); only the content is used here.
    html_cont, _code = downloader.download(page_url, headers=request_headers)

    # page_parse() yields (urls, datas); only the parsed records are printed.
    _urls, records = parser.page_parse(html_cont)

    separator = '=' * 50
    for record in records:
        print(separator)
        ToolsBox.printDic(record)
예제 #3
0
    def handle_match_mul(self, data, getid):
        """
        Choose one id when ``getid`` holds more than one matched candidate.

        Each candidate is indexed as (start position of the matched keyword,
        matched keyword, id). Selection rules:

        * the candidate whose keyword starts earliest wins;
        * among candidates sharing that start position, the longest keyword
          wins;
        * if start position and keyword length both tie but the ids differ,
          the case is printed so it can be judged manually.

        ``getid`` is sorted in place by start position.

        Returns the winning id, or, when the tie could not be broken, the
        number of tied candidates.
        """
        # Order candidates by where their keyword starts.
        getid.sort(key=lambda entry: entry[0])

        leaders = [getid[0]]  # current best candidate(s)
        ambiguous = False  # True while several distinct ids are tied
        for candidate in getid[1:]:
            # Anything starting later than the leader cannot win; because the
            # list is sorted, nothing after it can either.
            if candidate[0] > leaders[0][0]:
                break

            candidate_len = len(candidate[1])
            leader_len = len(leaders[0][1])
            if candidate_len > leader_len:
                # A strictly longer keyword replaces every earlier leader and
                # clears any tie recorded so far.
                leaders = [candidate]
                ambiguous = False
            elif candidate_len == leader_len and candidate[2] != leaders[0][2]:
                # Same start, same length, different id: needs manual review.
                # (An identical id is simply ignored.)
                leaders.append(candidate)
                ambiguous = True

        if not ambiguous:
            return leaders[0][2]

        print('*********匹配多个小区id**********')
        ToolsBox.printDic(data)
        print('||||||||||||||||匹配多个小区|||||||||||||||||')
        ToolsBox.printDic(leaders)
        return len(leaders)
예제 #4
0
    def deduplicate(self):
        """Merge near-duplicate names from the input sheet and save the result.

        The sheet is loaded through ``ToolsBox.read_excel`` when
        ``self.input_sheet`` is empty. Rows are handled in sheet order: the
        ``self.col_name`` value of each row is scored with
        ``get_Comprehensive_similar`` against every ';'-separated alias of
        every row kept so far. Scores below ``self.valve`` are ignored. The
        best kept row is the one with the highest score; on an equal score a
        row is preferred when the matching alias is longer than the current
        best row's name, and an equal length is only reported.

        A row with a best match has its name appended to that kept row's
        ``'alias'``; otherwise a deep copy of the row is added to the kept
        rows with ``'alias'`` set to its own name. Chosen kept rows also get
        a ``'similar'`` key holding the last winning score. The kept rows are
        finally written with ``ToolsBox.saveExcel`` to a fixed path.
        """
        if not self.input_sheet:
            self.input_sheet = ToolsBox.read_excel(self.file_name,
                                                   self.sheet_name)

        kept_rows = []  # de-duplicated rows collected so far
        for position in range(len(self.input_sheet)):
            record = copy.deepcopy(self.input_sheet[position])
            kept_count = len(kept_rows)
            current_name = record[self.col_name]
            print("第%d个小区:%s(已更新%d个小区)" %
                  (position + 1, current_name, kept_count))

            best = {}  # kept row most similar to the current record, if any
            for item in kept_rows:
                for alias in item['alias'].split(";"):
                    score = self.get_Comprehensive_similar(current_name, alias)
                    if not score >= self.valve:
                        continue

                    if not best:
                        # First alias to reach the threshold.
                        best = item
                        best['similar'] = score
                        print("@@@@@@@@@@@@@@@@@@@%s与%s相似度为%f" %
                              (current_name, best[self.col_name],
                               best['similar']))
                        continue

                    if best['similar'] < score:
                        # Strictly better score: switch to this kept row.
                        print(">>>>>>>>>>>>>%s与%s原有相似度为%f,现在与%s相似度为%f" %
                              (current_name, best[self.col_name],
                               best['similar'], alias, score))
                        best = item
                        best['similar'] = score
                        print(">>>>>>>>>>>>>>>>>>>>>>>>%s现在最相似小区为%s,相似度为%f" %
                              (current_name, best[self.col_name],
                               best['similar']))
                    elif best['similar'] == score:
                        # Equal score: compare the alias length with the
                        # length of the current best row's name.
                        alias_len = len(alias)
                        best_len = len(best[self.col_name])
                        if alias_len > best_len:
                            print("===========%s与%s原有相似度为%f,现在与%s相似度为%f" %
                                  (current_name, best[self.col_name],
                                   best['similar'], alias, score))
                            best = item
                            best['similar'] = score
                            print(
                                "==========================%s现在最相似小区为%s,相似度为%f"
                                % (current_name, best[self.col_name],
                                   best['similar']))
                        elif alias_len == best_len:
                            # Full tie: report it, keep the current best.
                            print("???????????????%s与%s原有相似度为%f,现在与%s相似度为%f" %
                                  (current_name, best[self.col_name],
                                   best['similar'], alias, score))
                            ToolsBox.printDic(best)

            if best:
                # Record the current name as an alias of the first kept row
                # whose name equals the best match's name.
                for kept in kept_rows:
                    if kept[self.col_name] == best[self.col_name]:
                        kept['alias'] = kept['alias'] + ";" + current_name
                        break
            else:
                # Nothing similar enough: this is a new entry of its own.
                record['alias'] = current_name
                kept_rows.append(record)

        ToolsBox.saveExcel("C:\\Users\\15007\\Desktop\\去重结果.xlsx", kept_rows)