            d1 = self.parse_item(p.get_text().strip())
            each_data = self.add_advantage(d1, each_data)
        each_data = self.pipe(each_data)
        if each_data:
            page_datas.append(each_data)
        elif ToolsBox.ShowInvalideData(each_data):
            # record failed the pipeline but is still worth keeping
            page_datas.append(each_data)
    return page_datas


if __name__ == "__main__":
    # Smoke test: fetch one listing page from 917.com and dump the parsed URLs.
    downloader = Downloader.Downloader()
    parser = www917Page()
    url = 'https://www.917.com/sell/pn10/'
    headers = {
        "Host": "www.917.com",
        "Referer": "http://www.917.com/",
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/7.0)',
    }
    html_cont, code = downloader.download(url, headers=headers)
    urls, datas = parser.page_parse(html_cont)
    ToolsBox.priList(urls)
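# A minimal sketch of the add_advantage contract the parsers above rely on:
# merge the single-field dict produced by parse_item into the listing record,
# and fold anything without a dedicated column into the free-text 'advantage'
# field. Hypothetical illustration only; the real helper lives on the shared
# parser class and may differ.
def add_advantage_sketch(d1, each_data):
    for key, value in d1.items():
        if key in each_data:
            each_data[key] = value                # known column: fill it in
        else:
            each_data['advantage'] += str(value)  # no column: keep as selling-point text
    return each_data

# Example: add_advantage_sketch({'builded_year': 1998}, record) fills
# record['builded_year'], while an unrecognised item lands in record['advantage'].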
        each_data['from'] = "lejv"
        each_data = self.pipe(each_data)
        if each_data:
            page_datas.append(each_data)
        elif ToolsBox.ShowInvalideData(each_data):
            page_datas.append(each_data)
    return page_datas


if __name__ == "__main__":
    # Smoke test: fetch one listing page from Leju and dump the parsed records.
    downloader = Downloader.Downloader()
    parser = LejvPage()
    url = 'https://xm.esf.leju.com/house'
    headers = {
        "Host": "xm.esf.leju.com",
        "Referer": "http://xm.esf.leju.com/house/",
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/7.0)',
    }
    html_cont, code = downloader.download(url, headers=headers)
    urls, datas = parser.page_parse(html_cont)
    ToolsBox.priList(datas)
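# A minimal sketch of the pipe() contract assumed above: run a record through a
# chain of cleaning/validation steps and return a falsy value when it fails,
# which is why failed records get routed to ToolsBox.ShowInvalideData. The step
# functions here are hypothetical names, not the project's actual stages.
def pipe_sketch(each_data, steps):
    for step in steps:
        each_data = step(each_data)
        if not each_data:
            return None  # signal an invalid record to the caller
    return each_data

# Example: pipe_sketch(record, [normalise_price, require_community]), where each
# hypothetical stage returns the (possibly cleaned) record or a falsy value.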
def parse_datas(self, soup):
    page_datas = []
    titles = soup.select("h2.title > a")
    prices = soup.select('p.sum > b')
    houses = soup.select('.list-info')
    for title, price, house in zip(titles, prices, houses):
        each_data = {
            'advantage': '',
            'builded_year': 0,
            'spatial_arrangement': '',
            'floor_index': 0,
            'total_floor': 0,
            'title': title.get_text(),
            'details_url': title.get('href'),
            'total_price': ToolsBox.strToInt(price.get_text()),
        }
        # First baseinfo block: one span per attribute (year, layout, floor, ...).
        details = house.select('p.baseinfo')
        spans = details[0].select('span')
        for span in spans:
            string = ToolsBox.clearStr(span.get_text())
            d1 = self.parse_item(string)
            each_data = self.add_advantage(d1, each_data)
        # Second baseinfo block: community link, district and address.
        comms = details[1].select('a')
        each_data['community_name'] = comms[0].get_text()
        if comms[0].get('href') is None:
            each_data['comm_url'] = ''
        else:
            each_data['comm_url'] = 'http://xm.58.com' + comms[0].get('href')
        each_data['from'] = "58"
        try:
            if len(comms) >= 2:
                each_data['region'] = comms[1].get_text().strip()
        except Exception as e:
            # this record has no district for the community
            print(e)
        try:
            if len(comms) >= 3:
                each_data['community_address'] = comms[2].get_text().strip()
        except Exception as e:
            # this record has no community address
            print(e)
        each_data = self.pipe(each_data)
        if each_data:
            # For some reason the community name is occasionally all digits;
            # log those records and skip them.
            if re.fullmatch(r'\d+', each_data['community_name']):
                print('///// got an all-digit community name! /////')
                ToolsBox.priList(each_data)
                print(soup)
            else:
                page_datas.append(each_data)
        elif ToolsBox.ShowInvalideData(each_data):
            page_datas.append(each_data)
    return page_datas
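# For context, a plausible reading of ToolsBox.strToInt as used for
# 'total_price' above: pull the first run of digits out of a price string such
# as '85万' and fall back to 0 when none is found. An assumption for
# illustration, not the real ToolsBox code.
import re

def str_to_int_sketch(text):
    match = re.search(r'\d+', text)
    return int(match.group()) if match else 0

# str_to_int_sketch('85万') -> 85; str_to_int_sketch('面议') -> 0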