def get_Comprehensive_similar(self, in_str1, in_str2):
    str1 = ToolsBox.clearStr(in_str1).upper()
    str2 = ToolsBox.clearStr(in_str2).upper()
    # fuzz.ratio returns 0-100, so scale it into [0, 1].
    fuzzSimilar = fuzz.ratio(str1, str2) / 100
    selfSimilar = self.get_similar(str1, str2)
    # Weighted blend of fuzzywuzzy's score and the custom similarity.
    return fuzzSimilar * self.fuzzPercentage + selfSimilar * (
        1 - self.fuzzPercentage)
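
# A minimal standalone sketch of the weighting above. The 0.7 default for
# fuzz_percentage is an assumed value standing in for self.fuzzPercentage,
# not taken from the source; both scores are expected to lie in [0, 1].
def blend_similarity(fuzz_score, self_score, fuzz_percentage=0.7):
    return fuzz_score * fuzz_percentage + self_score * (1 - fuzz_percentage)

# blend_similarity(0.67, 0.5) -> 0.7 * 0.67 + 0.3 * 0.5 = 0.619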
def generate_IDF_dic(self, col_name=None):
    if not self.input_sheet:
        self.input_sheet = ToolsBox.read_excel(self.file_name, self.sheet_name)
    if col_name is None:
        col_name = self.col_name
    total_row = len(self.input_sheet)
    count_dict = {}
    # Count character occurrences. Note: a character repeated within one row
    # is counted each time, so this is corpus-wide term frequency rather than
    # strict document frequency.
    for item in self.input_sheet:
        # Deep-copy so cleaning the string does not overwrite the source row.
        new_item = copy.deepcopy(item)
        new_item[col_name] = ToolsBox.clearStr(new_item[col_name])
        for char in new_item[col_name]:
            count_dict[char] = count_dict[char] + 1 if char in count_dict else 1
    # Convert raw counts to IDF values.
    for k, v in count_dict.items():
        count_dict[k] = math.log(total_row / v)
    # Sort by IDF, rarest characters first.
    count_dict = dict(
        sorted(count_dict.items(), key=lambda x: x[1], reverse=True))
    return count_dict
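
# A minimal standalone sketch of the same IDF computation, assuming a plain
# list of already-cleaned community names (no Excel/ToolsBox dependency):
import math

def idf_from_names(names):
    counts = {}
    for name in names:
        for char in name:
            counts[char] = counts.get(char, 0) + 1
    return {c: math.log(len(names) / n) for c, n in counts.items()}

# idf_from_names(['湖滨一里', '湖滨二里']) gives '湖', '滨', '里' an IDF of 0
# and '一', '二' an IDF of log(2), so rarer characters weigh more in a match.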
def parse_datas(self, soup):
    page_datas = []
    titles = soup.select("h1 > a")
    infos = soup.select("p.house_info")
    hots = soup.select("p.house_hot")
    areas = soup.select("div.the_area span")
    prices = soup.select("div.the_price span")
    splitby = re.compile(r']|,|\s')
    for title, info, hot, area, price in zip(titles, infos, hots, areas, prices):
        each_data = {'advantage': '', 'builded_year': 0,
                     'spatial_arrangement': '', 'floor_index': 0,
                     'total_floor': 0}
        each_data['title'] = title.get_text()
        each_data['details_url'] = 'http://xm.maitian.cn' + title.get('href')
        try:
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())
        except Exception:
            with open('logtest.txt', 'a+') as fout:
                fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
                fout.write('Maitian failed to parse total_price; raw data: ' + price.get_text())
                traceback.print_exc(file=fout)
            print(traceback.format_exc())
        try:
            each_data['block'] = info.get_text().strip()
            each_data['community_name'] = splitby.split(each_data['block'])[-1].strip()
            each_data['block'] = each_data['block'].replace(each_data['community_name'], '')
        except Exception:
            with open('logtest.txt', 'a+') as fout:
                fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
                fout.write('Parse failed for: %s \n' % info.get_text())
                traceback.print_exc(file=fout)
            print(traceback.format_exc())
        # Maitian packs layout, selling points and floor into one field,
        # separated by '|'.
        temp = ToolsBox.clearStr(hot.text).split('|')
        for item in temp:
            d1 = self.parse_item(item)
            each_data = self.add_advantage(d1, each_data)
        # Parse the area field.
        each_data = dict(each_data, **self.parse_item(area.get_text()))
        each_data['from'] = "MT"
        each_data = self.pipe(each_data)
        if each_data:
            page_datas.append(each_data)
        elif ToolsBox.ShowInvalideData(each_data):
            page_datas.append(each_data)
    return page_datas
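
# add_advantage() is defined elsewhere in the class. Based on the fallback
# logic that used to live inline in these parsers (dict(each_data, **d1) plus
# comma-joining the 'advantage' field), a plausible reconstruction is the
# sketch below; it is an assumption, not the original implementation.
def add_advantage_sketch(d1, each_data):
    """Merge a parsed fragment into the record, concatenating 'advantage'
    values instead of overwriting them."""
    if d1:
        if 'advantage' in d1 and each_data.get('advantage'):
            each_data['advantage'] += ',' + d1.pop('advantage')
        each_data = dict(each_data, **d1)
    return each_data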
def get_comm_arr_fromMysql(self):
    sql = ("SELECT id,com_name,alias_com_name FROM `t_base_community` "
           "WHERE city_name='厦门市'")
    arr = self.get_list_fromMysql(sql)
    comm_arr = {}
    for item in arr:
        if item['alias_com_name']:
            comms = item['alias_com_name'].split(';')
            for comm in comms:
                comm = ToolsBox.clearStr(comm)
                if comm not in comm_arr:
                    comm_arr[comm] = item['id']
        if item['com_name']:
            comm = ToolsBox.clearStr(item['com_name'])
            if comm not in comm_arr:
                comm_arr[comm] = item['id']
    # Returns a {community_name: community_id} dict, e.g.
    # {'城南阳翟教师楼': '04b90367549011ebb98a98039b073fcc',
    #  '国联大厦': '04bc8a7f549011ebb98a98039b073fcc', ...}
    return comm_arr
def parse_test(self, soup):
    adds = soup.select('.property-content-info-comm-address')
    for add in adds:
        add_list = []
        for string in add.strings:
            add_list.append(ToolsBox.clearStr(string.strip()))
        print(add_list)
        print("*" * 50)
def parse_datas(self, soup):
    page_datas = []
    details = soup.select(".houseInfo")
    comms = soup.select(".positionInfo a")
    prices = soup.select(".totalPrice")
    titles = soup.select("div.title a.CLICKDATA")
    for title, detail, price, comm in zip(titles, details, prices, comms):
        each_data = dict(builded_year=0, spatial_arrangement='',
                         floor_index=0, total_floor=0,
                         details_url=title.get('href'), advantage='')
        each_data['title'] = title.get_text().strip()
        houseInfos = re.split(r'\s*[|,\s]\s*', ToolsBox.clearStr(detail.get_text()))
        each_data['community_name'] = comm.get_text().strip()
        # Pause for inspection when a community name looks abnormally long.
        if len(each_data['community_name']) >= 20:
            input(each_data['community_name'] + ':' + str(len(each_data['community_name'])))
        for item in houseInfos:
            d1 = self.parse_item(item)
            each_data = self.add_advantage(d1, each_data)
        each_data['total_price'] = ToolsBox.strToInt(price.get_text())
        each_data['from'] = "Beike"
        each_data = self.pipe(each_data)
        if each_data:
            page_datas.append(each_data)
        elif ToolsBox.ShowInvalideData(each_data):
            page_datas.append(each_data)
    # If nothing parsed, return the result count shown on the page instead.
    if not page_datas:
        item_num = soup.select(".fl span")
        if item_num:
            page_datas = item_num[0].get_text().strip()
    return page_datas
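
# A quick illustration of the houseInfos split above; the sample string is
# invented but follows Beike's "field | field | field" layout:
import re
sample = '2室1厅 | 75.4平米 | 南 | 高楼层(共32层) | 2010年建'
print(re.split(r'\s*[|,\s]\s*', sample))
# -> ['2室1厅', '75.4平米', '南', '高楼层(共32层)', '2010年建']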
def parse_datas(self, soup):
    page_datas = []
    titles = soup.select(".shop_list > dl h4 a")
    houses = soup.select("p.tel_shop")
    comms = soup.select(".shop_list > dl dd p.add_shop a")
    comm_addresses = soup.select(".shop_list > dl dd p.add_shop span")
    prices = soup.select(".price_right .red b")
    for title, comm, comm_address, house, price in zip(
            titles, comms, comm_addresses, houses, prices):
        each_data = dict(builded_year=0, spatial_arrangement='',
                         floor_index=0, total_floor=0, advantage='')
        each_data['title'] = title.get('title')
        each_data['details_url'] = "https://xm.esf.fang.com" + title.get('href')
        # Only the bare text nodes of p.tel_shop carry the house fields.
        for item in house.children:
            if isinstance(item, bs4.element.NavigableString):
                d1 = self.parse_item(ToolsBox.clearStr(item))
                each_data = self.add_advantage(d1, each_data)
        each_data['community_name'] = comm.get('title').strip()
        each_data['community_address'] = comm_address.get_text().strip()
        each_data['comm_url'] = comm.get('href').strip()
        each_data['total_price'] = ToolsBox.strToInt(price.get_text())
        each_data['from'] = "Soufan"
        each_data = self.pipe(each_data)
        if each_data:
            page_datas.append(each_data)
        elif ToolsBox.ShowInvalideData(each_data):
            page_datas.append(each_data)
    return page_datas
def parse_datas(self, soup):
    totalfind = soup.select("span.tit em")
    if 0 == ToolsBox.strToInt(totalfind[1].get_text()):
        return '0'
    page_datas = []
    communities = soup.select("h3 > a")
    adds = soup.select('.li-info>address')
    dates = soup.select('p.date')
    prices = soup.select('p>strong')
    forsales = soup.select('p.bot-tag>span>a')
    for community, add, date, price, forsale in zip(
            communities, adds, dates, prices, forsales):
        each_data = dict()
        each_data['community_name'] = community.get('title')
        each_data['community_url'] = community.get('href')
        # The address arrives as "[region-block] street address".
        add1 = ToolsBox.clearStr(add.get_text())
        addlist = add1.split(']')
        if len(addlist) > 1:
            regionlist = addlist[0].replace('[', '').split('-')
            if len(regionlist) > 1:
                each_data['region'], each_data['block'] = regionlist
            else:
                each_data['region'] = regionlist[0]
            each_data['address'] = addlist[1]
        else:
            each_data['address'] = add1
        each_data['builded_year'] = ToolsBox.strToInt(date.get_text())
        each_data['forsale_num'] = ToolsBox.strToInt(forsale.get_text())
        each_data['price'] = ToolsBox.strToInt(price.get_text())
        each_data['from'] = "AJK"
        if each_data:
            page_datas.append(each_data)
        elif ToolsBox.ShowInvalideData(each_data):
            page_datas.append(each_data)
    return page_datas
def match_list2comm(self):
    # Fetch the distinct community names from the listing records.
    ListRecords = ML.get_list_fromMysql(
        "SELECT distinct community_name FROM `ods_hse_detail`")
    for item in ListRecords:
        item['clear_name'] = ToolsBox.clearStr(item['community_name'])
    result = []
    # Pass 1: for each master community name, find its best-matching
    # listing name.
    for key, value in self.comm_arr.items():
        name_dic = dict()
        name_dic['comm_name'] = key
        name_dic['comm_id'] = value
        name_dic['vol'] = 0
        name_dic['match_list_comm_name'] = ''
        name_dic['match_all'] = ''  # every listing name with similarity >= 0.8
        for item in ListRecords:
            vol = MyVote.cmntVol(key, item['clear_name'])
            if vol > name_dic['vol']:
                name_dic['vol'] = vol
                name_dic['match_list_comm_name'] = item['community_name']
            if vol >= 0.8:
                name_dic['match_all'] += item['community_name'] + '(' + '%f' % vol + ');'
        result.append(name_dic)
    # Pass 2: for each listing name, find its best master community match.
    for item in ListRecords:
        item['matchid'] = '0'
        item['match_vol'] = 0
        for key, value in self.comm_arr.items():
            vol = MyVote.cmntVol(key, item['clear_name'])
            if vol > item['match_vol']:
                item['match_vol'] = vol
                item['matchid'] = value
                item['match_comm_name'] = key
    ToolsBox.saveExcel('match.xlsx', result, "Sheet1")
    ToolsBox.saveExcel('match.xlsx', ListRecords, "Sheet2")
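
# A minimal standalone sketch of the best-match loop used in both passes
# above, with a plain Jaccard-over-characters lambda standing in for
# MyVote.cmntVol (the sample names are invented):
def best_match(name, candidates, similar):
    """Return (best_candidate, best_score) for one name."""
    best, best_score = None, 0.0
    for cand in candidates:
        score = similar(name, cand)
        if score > best_score:
            best, best_score = cand, score
    return best, best_score

# best_match('湖滨一里', ['湖滨一里小区', '莲花新村'],
#            lambda a, b: len(set(a) & set(b)) / len(set(a) | set(b)))
# -> ('湖滨一里小区', 0.666...)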
def pipe(self, datadic):
    # Validity checks; also fold the region, block and community address
    # into the title.
    for key in datadic:
        datadic[key] = ToolsBox.clearStr(datadic[key])
    title_temp = ''
    if 'region' in datadic:
        if self.excep(datadic['region'].strip()):
            return False
        title_temp += ' r:' + datadic['region'].strip()
    if 'block' in datadic:
        if self.excep(datadic['block'].strip()):
            return False
        title_temp += ' b:' + datadic['block'].strip()
    if 'community_address' in datadic:
        datadic['community_address'] = datadic['community_address'].strip()
        title_temp += ' a:' + datadic['community_address']
    if 'title' in datadic:
        title2 = title_temp.strip() + ' ' + datadic['title']
    else:
        title2 = title_temp.strip()
    if len(title2) > 50:
        title2 = title2[:50]
    datadic['title'] = title2.strip()
    if ('community_name' not in datadic) or len(datadic['community_name']) < 2:
        return False
    datadic['community_id'] = self.MI.matchid(datadic)
    if ('total_floor' in datadic) and ('total_price' in datadic) and ('area' in datadic):
        if datadic['total_price'] is None or datadic['area'] is None or datadic['area'] == 0:
            return False
        datadic['price'] = round(
            float(datadic['total_price'] * 10000 / datadic['area']), 2)
        if datadic['price'] < 1500 or datadic['price'] > 300000:
            return False
        if datadic['total_floor'] > 60:
            datadic['total_floor'] = 35  # cap implausibly tall buildings at 35 floors
        if datadic['total_price'] == 0:
            return False  # 2016.9.13: drop zero-price records
        if 'builded_year' in datadic:
            if datadic['builded_year'] < 1900:
                datadic['builded_year'] = 0
        if datadic['area'] > 20000:
            return False  # oversize areas are usually typos and useless as price references
        if 'price' not in datadic:
            return False  # 2016.8.1: parsing sometimes skips price; drop such records
        # 2017.4.14: details_url can exceed the column width; truncate it.
        if len(datadic['details_url']) > 250:
            datadic['details_url'] = datadic['details_url'][:249]
        if 'advantage' in datadic:
            if len(datadic['advantage']) > 20:
                datadic['advantage'] = datadic['advantage'][:20]
        return datadic
    else:
        # Records without total_floor may still be villas (别墅) sold whole.
        if not ('total_floor' in datadic) and ('total_price' in datadic) and \
                ('area' in datadic) and ('community_name' in datadic):
            if u"别墅" in datadic['title']:
                if datadic['total_price'] is None or datadic['area'] is None \
                        or datadic['area'] == 0:
                    return False
                datadic['price'] = round(
                    float(datadic['total_price'] * 10000 / datadic['area']), 2)
                datadic['total_floor'] = 4
                datadic['floor_index'] = 1
                datadic['spatial_arrangement'] = (
                    datadic['spatial_arrangement'] + u"别墅"
                    if 'spatial_arrangement' in datadic else u"别墅")
                return datadic
        return False
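
# A worked example of the unit-price rule in pipe() (numbers invented):
# a 300万 flat of 89㎡ converts from total price in 万元 (10,000 RMB) to
# RMB per square metre and passes the 1500-300000 sanity band.
total_price, area = 300, 89
price = round(float(total_price * 10000 / area), 2)
assert abs(price - 33707.87) < 0.01 and 1500 <= price <= 300000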
def parse_item(self, string):
    # 2016.8.17: given one string, use regexes to decide whether it is an
    # area, layout, unit price, floor, build year or selling point, then
    # return the parsed key-value pair(s).
    try:
        string = string.decode('utf8').strip()  # Python 2 leftover: decode bytes
    except Exception:
        string = string.strip()
    parse_dict = {}
    r1_1 = r'(\d+)平方米'
    r1_2 = r'(\d+\.?\d*)平米'  # Xiamen house areas are floats
    r1_3 = r'(\d+\.?\d*)㎡'    # 2016.9.13: Maitian
    r1_4 = r'(\d+\.?\d*)m²'   # 2017.3.8: AJK
    r1_5 = r'(\d+\.?\d*)�O'   # 2018.8.3: Soufan; this garbled sequence is ㎡
    r1_6 = r'(\d+\.?\d*)平'    # 2019.9.9: Leju
    r2_1 = r'\d+室'
    r2_2 = r'\d+房'
    r3_1 = r'(\d+)元/'
    r3_2 = r'(\d+)万'
    r4 = r'\d+层'
    r4_1 = r'((?P<floor>[高中低])楼?层)?.?共?(?P<total>\d+)层'
    r5_1 = r'(\d{4})年'
    r5_2 = r'年.*(\d{4})'
    area_patterns = (r1_1, r1_2, r1_3, r1_4, r1_5, r1_6)
    known_patterns = area_patterns + (r2_1, r2_2, r3_1, r3_2, r4, r5_1, r5_2)
    if any(re.search(p, string) for p in known_patterns):
        # Area: r1_1 captures an integer; the rest are floats rounded to int.
        m = re.search(r1_1, string)
        if m:
            parse_dict['area'] = int(m.group(1))
        else:
            for p in area_patterns[1:]:
                m = re.search(p, string)
                if m:
                    parse_dict['area'] = int(round(float(m.group(1)), 0))
                    break
        # Floor: forms like "高楼层(共32层)"; strip it out once parsed so the
        # remaining text can still match the layout patterns.
        if re.search(r4_1, string):
            parse_dict['floor_index'], parse_dict['total_floor'] = \
                self.parse_floor(string, r4_1)
            string = ToolsBox.clearStr(re.sub(r4_1, "", string))
        # Layout such as "3室2厅" / "3房2厅".
        if re.search(r2_1, string) or re.search(r2_2, string):
            parse_dict['spatial_arrangement'] = string.strip()
        # Build year.
        m = re.search(r5_1, string) or re.search(r5_2, string)
        if m:
            parse_dict['builded_year'] = int(m.group(1))
    elif string in ('|', '｜', ''):
        pass  # separators and empty strings carry no data
    else:
        parse_dict['advantage'] = string.strip()  # everything else is a selling point
    return parse_dict
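
# A quick check of the area regex family above on invented samples. The
# combined pattern below merges r1_1/r1_3/r1_6 into one alternation for the
# demo only; parse_item itself tries each pattern separately:
import re
for s in ('89.5㎡', '120平方米', '75平'):
    m = re.search(r'(\d+\.?\d*)(?:㎡|平方米|平)', s)
    print(s, '->', int(round(float(m.group(1)), 0)))
# 89.5㎡ -> 90, 120平方米 -> 120, 75平 -> 75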
def parse_datas(self, soup):
    page_datas = []
    # 2019/9/9: selectors follow Leju's redesigned listing page.
    titles = soup.select("div.title_in")
    d_urls = soup.select("div.title_in > a")
    adds = soup.select("div.address")
    infos = soup.select("div.house_info")
    prices = soup.select("div.price > span")
    for title, add, d_url, info, price in zip(titles, adds, d_urls, infos, prices):
        each_data = dict(builded_year=0, spatial_arrangement='',
                         floor_index=0, total_floor=0, title=title.get('title'))
        comms = add.select('span')
        each_data['community_name'] = ToolsBox.clearStr(comms[0].get_text())
        for comm in comms:
            comm = ToolsBox.clearStr(comm.get_text())
            if '-' != comm:
                if '-' in comm:
                    c_item = comm.split('-')
                    each_data['region'] = c_item[0]
                    each_data['block'] = c_item[1]
                if '年' in comm:
                    out = self.parse_item(comm)
                    each_data = self.add_advantage(out, each_data)
        h_info = info.select('span')
        for item in h_info:
            item = ToolsBox.clearStr(item.get_text())
            each_data = self.add_advantage(self.parse_item(item), each_data)
        each_data['details_url'] = 'https:' + d_url.get('href')
        each_data['total_price'] = ToolsBox.strToInt(price.get_text())
        each_data['from'] = "lejv"
        each_data = self.pipe(each_data)
        if each_data:
            page_datas.append(each_data)
        elif ToolsBox.ShowInvalideData(each_data):
            page_datas.append(each_data)
    return page_datas
def parse_page(self, soup):
    page_datas = []
    details_urls = soup.select(".property>a")
    titles = soup.select("h3.property-content-title-name")
    houses = soup.select("div.property-content-detail>section")
    comms = soup.select('.property-content-info-comm-name')
    adds = soup.select('.property-content-info-comm-address')
    prices = soup.select('.property-price-total-num')
    for details_url, title, details, comm, price, add in zip(
            details_urls, titles, houses, comms, prices, adds):
        each_data = dict(advantage='', builded_year=0,
                         spatial_arrangement='', floor_index=0, total_floor=0)
        each_data['title'] = title.get_text()
        each_data['details_url'] = details_url.get('href')
        houses_info = details.select(".property-content-info")
        detail = houses_info[0].select("p")
        for string in detail:
            d1 = self.parse_item(ToolsBox.clearStr(string.get_text().strip()))
            each_data = self.add_advantage(d1, each_data)
        each_data['community_name'] = comm.get_text().strip()
        add_list = []
        for string in add.strings:
            add_list.append(ToolsBox.clearStr(string.strip()))
        try:
            each_data['region'], each_data['block'], \
                each_data['community_address'] = add_list
        except Exception:
            with open('logtest.txt', 'a+') as fout:
                fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
                fout.write('AJK failed to parse region/block/address; raw data:')
                traceback.print_exc(file=fout)
            print(traceback.format_exc())
        each_data['total_price'] = ToolsBox.strToInt(price.get_text())
        each_data['from'] = "AJK"
        each_data = self.pipe(each_data)  # 2016.6.4: dedicated validation step
        if each_data:
            page_datas.append(each_data)
        elif ToolsBox.ShowInvalideData(each_data):
            page_datas.append(each_data)
    return page_datas
def parse_datas(self, soup):
    page_datas = []
    details = soup.select(".size")
    prices = soup.select(".num")
    titles = soup.select("div.ershoufang-list .title a")
    regions = soup.select("span.area a")
    lists = soup.select(".ershoufang-list")
    for title, detail, list1, price, region in zip(titles, details, lists,
                                                   prices, regions):
        each_data = {
            'builded_year': 0,
            'spatial_arrangement': '',
            'floor_index': 0,
            'total_floor': 0,
            'advantage': '',
            'title': title.get('title'),
            'details_url': 'http:' + title.get('href')
        }
        for item in detail.stripped_strings:
            d1 = self.parse_item(item)
            each_data = self.add_advantage(d1, each_data)
        each_data['total_price'] = ToolsBox.strToInt(price.get_text())
        address = list1.select("dd.address")
        if len(address) > 0:
            if len(address[0].select("a.address-eara")) > 0:
                each_data['region'] = ToolsBox.clearStr(
                    address[0].select("a.address-eara")[0].get_text())
            if len(address[0].select("span.address-eara")) > 0:
                each_data['community_name'] = ToolsBox.clearStr(
                    address[0].select("span.address-eara")[0].get_text())
        each_data['from'] = "ganji"
        each_data = self.pipe(each_data)
        if each_data:
            page_datas.append(each_data)
        elif ToolsBox.ShowInvalideData(each_data):
            page_datas.append(each_data)
    return page_datas
def parse_datas(self, soup):
    page_datas = []
    titles = soup.select("h2.title > a")
    prices = soup.select('p.sum > b')
    houses = soup.select('.list-info')
    for title, price, house in zip(titles, prices, houses):
        each_data = {'advantage': '', 'builded_year': 0,
                     'spatial_arrangement': '', 'floor_index': 0,
                     'total_floor': 0,
                     'title': title.get_text(),
                     'details_url': title.get('href'),
                     'total_price': ToolsBox.strToInt(price.get_text())}
        details = house.select('p.baseinfo')
        spans = details[0].select('span')
        for span in spans:
            # Python 2 leftover: parse_item decodes these bytes back to str.
            string = ToolsBox.clearStr(span.get_text()).encode('utf8')
            d1 = self.parse_item(string)
            each_data = self.add_advantage(d1, each_data)
        comms = details[1].select('a')
        each_data['community_name'] = comms[0].get_text()
        if comms[0].get('href') is None:
            each_data['comm_url'] = ''
        else:
            each_data['comm_url'] = 'http://xm.58.com' + comms[0].get('href')
        each_data['from'] = "58"
        try:
            if len(comms) >= 2:
                each_data['region'] = comms[1].get_text().strip()
        except Exception as e:
            print(e)  # this record carries no region for its community
        try:
            if len(comms) >= 3:
                each_data['community_address'] = comms[2].get_text().strip()
        except Exception as e:
            print(e)  # this record carries no community address
        each_data = self.pipe(each_data)
        if each_data:
            # Sometimes the community name is all digits for unknown reasons;
            # screen those records out.
            match_comm = re.findall(r'^\d+$', each_data['community_name'])
            if len(match_comm) > 0:
                print('//////////////// all-digit community name! ////////////////')
                ToolsBox.priList(each_data)
                print(soup)
            else:
                page_datas.append(each_data)
        elif ToolsBox.ShowInvalideData(each_data):
            page_datas.append(each_data)
    return page_datas