def matchid(self, data):
    """Resolve the community id for one record.

    The record is first matched against ``self.comm_arr``; only when that
    yields no hit at all is it matched against ``self.road_arr``.  A single
    hit supplies the id directly (third element of the hit), several hits
    are handed to ``self.handle_match_mul``.

    Args:
        data: The record to match; passed through unchanged to
            ``self.get_id_from_arr`` and ``self.handle_match_mul``.

    Returns:
        The matched id, whatever ``self.handle_match_mul`` returns for
        multiple hits, or ``0`` when nothing matched or a duplicate-key
        database error (code 1062) was reported.

    Raises:
        MySQLdb.Error: Any database error other than code 1062.
    """
    getid = self.get_id_from_arr(data, self.comm_arr)

    # Default to the "unmatched" value so the final return is always bound,
    # even when the duplicate-key handler below swallows an error.
    comm_id = 0
    try:
        if len(getid) == 1:
            # Exactly one community matched.
            comm_id = getid[0][2]
        elif len(getid) > 1:
            # Several communities matched: let the tie-break logic decide.
            comm_id = self.handle_match_mul(data, getid)
        else:
            # No community matched: fall back to matching by road.
            getroad = self.get_id_from_arr(data, self.road_arr)
            if len(getroad) == 1:
                comm_id = getroad[0][2]
            elif len(getroad) > 1:
                comm_id = self.handle_match_mul(data, getroad)
            else:
                # Neither community nor road matched; leave the id empty.
                print("---------未匹配成功---------")
                ToolsBox.printDic(data)
                comm_id = 0
    except MySQLdb.Error as e:
        # 1062 is reported and tolerated; anything else must not be hidden.
        if not (e.args and e.args[0] == 1062):
            raise
        print("aready have")
    return comm_id
each_data['total_price'] = ToolsBox.strToInt(price.get_text()) each_data['from'] = "Danxia" each_data = self.pipe(each_data) # 2016.6.4增加一个专门的数据处理 if each_data: page_datas.append(each_data) else: if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data) return page_datas # return each_data if __name__ == "__main__": downloader = Downloader.Downloader() parser = DanxiaPage() url = 'https://danxia.com/house/all/PG2' headers = { "Referer": "https://danxia.com/house/all", 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', } html_cont,code = downloader.download(url,headers=headers) # soup = parser.get_soup(html_cont) # datas = parser.parse_datas(soup) # print(datas) urls,datas = parser.page_parse(html_cont) for data in datas: print('='*50) ToolsBox.printDic(data)
def handle_match_mul(self, data, getid):
    """Pick one id out of several keyword hits.

    Each hit is indexed as ``hit[0]`` (start position of the matched
    keyword), ``hit[1]`` (the matched keyword) and ``hit[2]`` (the id).
    The rules are:

    1. The hit whose keyword starts earliest wins.
    2. Among hits sharing that earliest start, the longest keyword wins.
    3. If start and keyword length are both tied but the ids differ, the
       match is ambiguous and is printed for manual review.

    Args:
        data: The record being matched; only printed in the ambiguous case.
        getid: Sequence of hits.  It is not modified.

    Returns:
        The winning id when the rules single one out.  In the ambiguous
        case the number of distinct-id candidates is returned instead of an
        id.  ``0`` when ``getid`` is empty.
    """
    if not getid:
        return 0

    # Work on a sorted copy so the caller's list keeps its order.
    ordered = sorted(getid, key=lambda hit: hit[0])

    candidates = [ordered[0]]
    ambiguous = False  # True while several different ids are still tied
    for hit in ordered[1:]:
        if hit[0] > candidates[0][0]:
            # Sorted by start position: every remaining hit starts later.
            break
        hit_len = len(hit[1])
        best_len = len(candidates[0][1])
        if hit_len > best_len:
            # A longer keyword beats everything collected so far.
            candidates = [hit]
            ambiguous = False
        elif hit_len == best_len:
            # Same start and same length: only a not-yet-seen id makes the
            # match ambiguous; repeated ids are ignored.
            if all(hit[2] != kept[2] for kept in candidates):
                candidates.append(hit)
                ambiguous = True

    if ambiguous:
        print('*********匹配多个小区id**********')
        ToolsBox.printDic(data)
        print('||||||||||||||||匹配多个小区|||||||||||||||||')
        ToolsBox.printDic(candidates)
        return len(candidates)
    return candidates[0][2]
def deduplicate(self, output_path="C:\\Users\\15007\\Desktop\\去重结果.xlsx"):
    """Collapse similar community records and save the reduced list.

    Records are taken from ``self.input_sheet`` (loaded through
    ``ToolsBox.read_excel`` from ``self.file_name`` / ``self.sheet_name``
    when it is empty).  Each record's ``self.col_name`` value is compared,
    via ``self.get_Comprehensive_similar``, with every ``;``-separated alias
    of the records kept so far.  If some alias reaches ``self.valve`` the
    record's name is appended to the aliases of the best-matching kept
    record; otherwise a deep copy of the record is kept as a new entry
    whose ``'alias'`` is its own name.

    Args:
        output_path: Where the reduced list is written with
            ``ToolsBox.saveExcel``.  Defaults to the path that used to be
            hard-coded here.

    Returns:
        None.  The result is only written to ``output_path``.
    """
    if not self.input_sheet:
        self.input_sheet = ToolsBox.read_excel(self.file_name, self.sheet_name)

    name_col = self.col_name
    reduce_list = []  # records that survived de-duplication
    for count, source_row in enumerate(self.input_sheet, start=1):
        # Copy so that adding 'alias' never touches the input sheet.
        record = copy.deepcopy(source_row)
        print("第%d个小区:%s(已更新%d个小区)" %
              (count, record[name_col], len(reduce_list)))

        # Best match found so far for this record; stays {} when none.
        # NOTE: it aliases an entry of reduce_list, so the 'similar' score
        # written below stays on that entry and is part of what is passed
        # to ToolsBox.saveExcel.
        best = {}
        for item in reduce_list:
            for name in item['alias'].split(";"):
                similar = self.get_Comprehensive_similar(
                    record[name_col], name)
                if similar >= self.valve:
                    if not best:
                        # First alias that reaches the threshold.
                        best = item
                        best['similar'] = similar
                        print("@@@@@@@@@@@@@@@@@@@%s与%s相似度为%f" %
                              (record[name_col], best[name_col],
                               best['similar']))
                    elif best['similar'] < similar:
                        # Strictly better score: switch to this entry.
                        print(
                            ">>>>>>>>>>>>>%s与%s原有相似度为%f,现在与%s相似度为%f"
                            % (record[name_col], best[name_col],
                               best['similar'], name, similar))
                        best = item
                        best['similar'] = similar
                        print(
                            ">>>>>>>>>>>>>>>>>>>>>>>>%s现在最相似小区为%s,相似度为%f"
                            % (record[name_col], best[name_col],
                               best['similar']))
                    elif best['similar'] == similar:
                        # Equal score: the alias length is compared with the
                        # length of the current best entry's main name.
                        if len(name) > len(best[name_col]):
                            print(
                                "===========%s与%s原有相似度为%f,现在与%s相似度为%f"
                                % (record[name_col], best[name_col],
                                   best['similar'], name, similar))
                            best = item
                            best['similar'] = similar
                            print(
                                "==========================%s现在最相似小区为%s,相似度为%f"
                                % (record[name_col], best[name_col],
                                   best['similar']))
                        elif len(name) == len(best[name_col]):
                            # Full tie: keep the current best, just report.
                            print(
                                "???????????????%s与%s原有相似度为%f,现在与%s相似度为%f"
                                % (record[name_col], best[name_col],
                                   best['similar'], name, similar))
                            ToolsBox.printDic(best)

        if best:
            # A similar entry exists: record this name as one more alias of
            # the first kept entry carrying the best match's name.
            for kept in reduce_list:
                if kept[name_col] == best[name_col]:
                    kept['alias'] = kept['alias'] + ";" + record[name_col]
                    break
        else:
            # Nothing similar yet: this is a new community.
            record['alias'] = record[name_col]
            reduce_list.append(record)

    ToolsBox.saveExcel(output_path, reduce_list)