def parse_news(self, response):
    """Parse a news (dongtai) page: on the first page enqueue the remaining
    pagination pages, then emit the news items found on this page.

    Fix: the fallback branch used a bare ``except:``, which also swallows
    SystemExit/KeyboardInterrupt; narrowed to ``except Exception``.
    """
    # First-page URLs contain no '_'; paginated URLs do, so the pagination
    # links are only scheduled once.
    if '_' not in response.url:
        pages = set(find(response, '//div[@class="page"]//a/@href', False))
        for page in pages:
            yield Request(response.meta['house_link'].strip('/') + page,
                          callback=self.parse_news,
                          meta={'house_id': response.meta['house_id']})
    story_list = response.xpath('//li[@class="storyList"]')
    if not story_list:
        self.logger.warning('no story %s', response.url)
        return
    news = []
    for story in story_list:
        try:
            title = find(story, './h2/a/text()')
            link = find(story, './h2/a/@href')
            news_content = {'news_title': title, 'news_link': link}
        except Exception:
            # Best-effort fallback: some stories have no headline link and
            # carry plain text paragraphs instead.
            news_content = ' '.join(find(story, './p/text()', False))
        news.append({
            'update_at': find(story, './div/text()'),
            'news_content': news_content
        })
    yield {
        'house_id': response.meta['house_id'],
        'table': self.name,
        'item': ('news', news)
    }
def parse_news(self, response):
    """Emit the news items listed in the hidden container of the page.

    Fix: the image extraction used ``story.xpath('.//img')[0]`` inside a
    bare ``except: pass``; replaced with an explicit emptiness check so no
    exceptions are silently swallowed.
    """
    story_list = response.xpath('//div[@id="all_hidden"]/div')
    if not story_list:
        self.logger.warning('no story %s', response.url)
        return
    news = []
    for story in story_list:
        link = find(story, './@link')
        _news = {
            'news_content': {
                'news_link': link
            },
            # The date is encoded as the second-to-last URL path segment.
            'update_at': link.split('/')[-2]
        }
        # Stories without an image simply omit title/img_link.
        imgs = story.xpath('.//img')
        if imgs:
            img = imgs[0]
            _news['news_content'].update({
                'news_title': find(img, './@alt'),
                'img_link': find(img, './@src')
            })
        news.append(_news)
    yield {
        'house_id': response.meta['house_id'],
        'table': self.name,
        'item': ('news', news)
    }
def parse_news(self, response):
    """Emit news items for one house; on the index page (URL not ending in a
    digit) also schedule the remaining pagination pages, skipping the first
    four navigation links."""
    if not response.url.rstrip('/')[-1].isdigit():
        page_links = find(
            response, '//div[@class="module-pagination"]/a/@href', False)[4:]
        for page_link in page_links:
            yield Request(page_link, callback=self.parse_news,
                          meta={'house_id': response.meta['house_id']})
    blocks = response.xpath('//div[@class="new-review-title"]')
    if not blocks:
        self.logger.warning('no story %s', response.url)
        return
    news = []
    for block in blocks:
        news.append({
            'update_at': find(block, './div[2]/text()'),
            'news_content': {
                'news_title': find(block, './div[1]/a/text()'),
                'news_link': find(block, './div[1]/a/@href')
            }
        })
    yield {
        'house_id': response.meta['house_id'],
        'table': self.name,
        'item': ('news', news)
    }
def parse_pic_link(self, response):
    """Walk the photo-category list and schedule per-category picture pages,
    skipping the floor-plan ('户型') category and fetching at most 3 pages.

    TODO: only the first 6 pictures per page are taken; fetching more would
    be overwritten by the mongo pipeline's set-item behaviour.
    """
    categories = response.xpath(sf.PICS)
    if not categories:
        self.logger.warning('pictures unreachable %s', response.url)
        return
    host = response.url.split('/photo')[0]
    for category in categories:
        total = find(category, './em/text()')
        label = find(category, './span/text()')
        if label == '户型':
            continue
        cat_id = find(category, './@href').split('list_')[-1].split('_')[0]
        # 6 thumbnails per page; clamp to the first 3 pages.
        page_count = min(int(int(total) / 6) + 1, 4)
        for page_no in range(1, page_count):
            target = self.picture_url.format(
                host, response.meta['house_id'], cat_id, page_no)
            yield Request(url=target, callback=self.parse_pic, meta={
                'label': label,
                'house_id': response.meta['house_id'],
            })
def parse_xiangce(self, response):
    """Collect album pictures for one house.

    TODO: pages with more than 8 images are not fully captured.
    """
    groups = response.xpath(q.PICS)
    if not groups:
        self.logger.error('pic not found %s', response.url)
        return
    # The leading group may be the floor-plan album; emit it separately
    # under the '<name>_room' table before the regular album.
    if find(groups[0], './@id') == '_apartment':
        room_album = []
        for entry in groups[0].xpath('.//ul/li'):
            room_album.append({
                'room_label': find(entry, './div[2]/a/text()'),
                'room_url': find(entry, q.IMG)
            })
        yield {
            'new_data': True,
            'house_id': response.meta['house_id'],
            'table': self.name + '_room',
            'room_album': room_album
        }
    # Flatten every group's images into one album list.
    album = []
    for group in groups:
        group_title = find(group, q.TITLE)
        for src in find(group, q.IMG, False):
            album.append({'picture_title': group_title, 'picture_url': src})
    pictures = {
        'house_id': response.meta['house_id'],
        'table': self.name,
        'item': ('album', album)
    }
    yield pictures
def parse_xiangce_link(self, response):
    """Follow each album-category link, carrying the category label (the
    link text before the '(count)' suffix) in meta."""
    links = response.xpath(shjd_xp.PICTURE_URLS)
    if not links:
        self.logger.warning('picture unreachable! %s', response.url)
        return
    for link in links:
        label = find(link, './text()').split('(')[0]
        yield Request(url=find(link, './@href'),
                      callback=self.parse_xiangce,
                      meta={
                          'house_id': response.meta['house_id'],
                          'label': label
                      })
def parse_room(self, response):
    """Parse one floor-plan (huxing) detail page into a ``*_room`` record.

    Every section is optional: missing sections are logged and skipped, and
    the record is yielded with whatever fields could be scraped.
    """
    # Parse floor-plan data; the house id is embedded in the URL path
    # (last segment before the first '-').
    room = {
        'new_data': True,
        'house_id': response.url.split('/')[-1].split('-')[0],
        'table': self.name + '_room'
    }
    pics = response.xpath(ajk_xp.ROOM_TYPE_PICS)
    if not pics:
        self.logger.warning('room pictures is empty. %s', response.url)
    else:
        # Lazy-loaded images keep the real URL in @imglazyload-src.
        room['room_album'] = [{
            'picture_title': find(item, './@data-title'),
            'picture_url': find(item, './img/@imglazyload-src')
        } for item in pics]
    # First comma-separated token of the page title is the room type.
    # NOTE(review): assumes find() never returns None here — confirm.
    room['room_type'] = find(response, ajk_xp.ROOM_TITLES).split(',')[0]
    labels = find(response, ajk_xp.ROOM_LABELS, False)
    if not labels:
        self.logger.warning('room labels is empty %s', response.url)
    else:
        # First label is the sale status; the rest are feature tags.
        room['room_sale_status'] = labels[0]
        room['room_labels'] = [label for label in labels[1:]]
    price = response.xpath(ajk_xp.ROOM_PRICE)
    if not price:
        self.logger.warning('room price is empty %s', response.url)
    else:
        for item in price:
            name = find(item, './/strong/text()')
            # room_price_dict maps on-page labels to our field names;
            # unknown labels are logged for dictionary maintenance.
            if name not in room_price_dict:
                self.logger.warning('key %s unknown %s', name, response.url)
                continue
            room[room_price_dict[name]] = find(item, './span/text()')
    room_details = response.xpath(ajk_xp.ROOM_DETAILS)
    if not room_details:
        self.logger.warning('room details is empty %s', response.url)
    else:
        for item in room_details:
            name = find(item, './strong/text()')
            if name not in room_details_dict:
                self.logger.warning('key %s unknown %s', name, response.url)
                continue
            room[room_details_dict[name]] = find(item, './span/text()')
    room_description = find(response, ajk_xp.ROOM_DESCRIPTION, False)
    if not room_description:
        self.logger.warning('room description is empty %s', response.url)
    else:
        room['room_description'] = ' '.join(room_description)
    yield room
def parse_xiangce(self, response):
    """Yield one album item containing every picture on the page, labelled
    with the category forwarded via meta."""
    album = []
    for img in response.xpath(shjd_xp.PICTURES):
        album.append({
            'picture_url': find(img, './@src'),
            'picture_label': response.meta['label'],
            'picture_description': find(img, './@data-name')
        })
    yield {
        'house_id': response.meta['house_id'],
        'table': self.name,
        'item': ('album', album)
    }
def base(self, house, item, d):
    """Copy one label/value cell pair into ``house``.

    ``d`` selects the table side ('l' or 'r'); the on-page label is mapped
    to our field name through ``base_info_dict``.  The sales-phone row and
    unknown labels are ignored (the latter with a warning).
    """
    raw_key = find(item, f'./td[@class="label-{d}"]/text()')
    key = raw_key.strip(':') if raw_key else raw_key
    value = (find(item, f'./td[@class="text-{d}"]/text()')
             or find(item, f'./td[@class="text-full"]/text()'))
    if not key or key == '售楼电话':
        return
    if key not in base_info_dict:
        self.logger.warning('unknown key %s', key)
        return
    house[base_info_dict[key]] = value
def parse_house_link(self, response):
    """For every house link on a listing page, schedule the four detail
    pages: parameters, floor plans, album and news."""
    links = find(response, ajk_xp.HOUSE_LINKS, False)
    if not links:
        self.logger.error('cannot find house link of %s', response.url)
        return
    for link in links:
        house_id = link.rstrip('/').split('/')[-1].split('.')[0]
        city_abbr = link.split('.')[0].split('/')[-1]
        base_url = link.split(house_id)[0] + f'canshu-{house_id}.html'
        # basic parameters
        yield Request(base_url, callback=self.parse_house, meta={
            'house_id': house_id,
            'city': CITY[city_abbr]
        })
        # floor plans
        yield Request(base_url.replace('canshu', 'huxing'),
                      callback=self.parse_room_count)
        # pictures
        yield Request(base_url.replace('canshu', 'xiangce'),
                      callback=self.parse_pic,
                      meta={'house_id': house_id})
        # news
        yield Request(base_url.replace('canshu', 'officialnews'),
                      callback=self.parse_news,
                      meta={'house_id': house_id})
def parse(self, response):
    """Derive the listing page count from the total house count (20 per
    page) and schedule every page, passing the per-house detail URL
    templates along in meta."""
    total = find(response, shjd_xp.TOTAL_COUNT)
    if not total:
        self.logger.error('total count not clear! %s', response.url)
        return
    page_count = int(ceil(int(total) / 20))
    list_url = response.url + 'p{}/'
    tpl_xq = response.url + '{}/xiangqing.html'
    tpl_hx = response.url + '{}/huxing/'
    tpl_xc = response.url + '{}/xiangce/'
    tpl_dt = response.url + '{}/dongtai/'
    for page_no in range(1, page_count + 1):
        yield Request(list_url.format(page_no),
                      callback=self.parse_house_link,
                      meta={
                          'xq': tpl_xq,
                          'hx': tpl_hx,
                          'xc': tpl_xc,
                          'dt': tpl_dt
                      })
def parse_room_url(self, response):
    """Schedule a detail request for every floor-plan link on the page."""
    room_links = find(response, ajk_xp.ROOM_URLS, False)
    if not room_links:
        self.logger.warning('room urls is empty. %s', response.url)
        return
    for room_link in room_links:
        yield Request(room_link, callback=self.parse_room)
def parse_huxing(self, response):
    """Parse one floor-plan page into a ``*_room`` record.

    Fix: values scraped from the info table are now stored on the ``room``
    record via the ``room_details_dict`` label→field mapping.  Previously
    the scraped value was assigned INTO ``self.room_details_dict[key]``,
    which corrupted the shared mapping dict and never reached the record
    (compare the analogous mapping logic in ``parse_room``).
    """
    room = {
        'new_data': True,
        'house_id': response.meta['house_id'],
        'table': self.name + '_room',
    }
    room_pics = find(response, shjd_xp.ROOM_PICS, False)
    if not room_pics:
        self.logger.warning('room pictures unreachable! %s', response.url)
    else:
        room['room_album'] = [{'picture_url': img} for img in room_pics]
    room['room_type'] = find(response, shjd_xp.ROOM_TYPE)
    room['room_sale_status'] = find(response, shjd_xp.SALE_STATUS)
    price = ''.join(find(response, shjd_xp.ROOM_PRICE, False))
    room['reference_price'] = price
    room_info = response.xpath(shjd_xp.ROOM_INFO)
    if not room_info:
        self.logger.warning('room info unreachable! %s', response.url)
    else:
        for item in room_info:
            key = find(item, './label/text()')
            # Map the on-page label to our field name; unknown labels are
            # logged instead of silently mutating the mapping dict.
            if key not in self.room_details_dict:
                self.logger.warning('key %s unknown %s', key, response.url)
                continue
            room[self.room_details_dict[key]] = find(item, './text()')
    room_des = response.xpath(shjd_xp.ROOM_DESCRIPTION)
    if not room_des:
        self.logger.warning('room description empty! %s', response.url)
    else:
        room['room_description'] = find(room_des, '../div/text()')
    yield room
def parse_house(self, response):
    """Parse the basic-parameter (canshu) page into one house record.

    Fix: the '区域位置'/'参考单价' branch used ``value.lstrip(name)`` and
    ``rstrip('[价格走势]')``.  ``str.lstrip``/``rstrip`` treat their argument
    as a CHARACTER SET, not a literal prefix/suffix, so they could strip
    arbitrary extra characters from the value.  Replaced with exact
    prefix/suffix removal.
    """
    house = {
        'new_data': True,
        'sale_status': find(response, ajk_xp.SALE_STATUS),
        'house_id': response.meta['house_id'],
        'city': response.meta['city'],
        'table': self.name
    }
    # Parameter table: map each labelled row into a house field.
    for item in response.xpath(ajk_xp.ITEMS):
        name = find(item, ajk_xp.NAME)
        if not name or name in ['楼盘图片', '售楼处电话']:
            continue
        if name not in base_info_dict:
            self.logger.warning('name %s unknown %s', name, response.url)
            continue
        if name in ['楼盘名称', '开发商', '物业公司']:
            value = find(item, './/a/text()') \
                or find(item, './div[2]/text()')
        elif name in ['楼盘特点', '楼盘户型']:
            value = find(item, './/a/text()', False)
            if name == '楼盘户型':
                value = house_type_split(value)
        elif name in ['区域位置', '参考单价']:
            value = ''.join(find(item, './/text()', False)).strip()
            # Remove the literal label prefix and the literal
            # '[价格走势]' (price-trend link) suffix, then tidy whitespace.
            if value.startswith(name):
                value = value[len(name):]
            if value.endswith('[价格走势]'):
                value = value[:-len('[价格走势]')]
            value = value.strip()
        else:
            value = find(item, './div[contains(@class, "des")]/text()')
        house[base_info_dict[name]] = value
    yield house
def parse_xiangqing(self, response):
    """Parse a house detail (xiangqing) page into one house record.

    Fix: the transport-info fallback used a bare ``except:``; narrowed to
    ``except Exception`` so SystemExit/KeyboardInterrupt are not swallowed.
    """
    house = {
        'new_data': True,
        'table': self.name,
        'house_id': response.meta['house_id'],
        'city': CITY[response.meta['city']],
        'building_name': find(response, q.NAME),
        'alias_name': find(response, q.ALIAS),
        'description': ''.join(find(response, q.DESCRIPTION, False))
    }
    # The four labelled sections share the same <li><span>/<p> row shape.
    for div_id in ['basics', 'saleIntro', 'building', 'property']:
        xp = f'//div[@id="{div_id}"]/div[2]/ul/li'
        for item in response.xpath(xp):
            name = find(item, './span/text()')
            if not name:
                continue
            if name not in base_info_dict:
                self.logger.warning('name %s not in dict %s',
                                    name, response.url)
                continue
            house[base_info_dict[name]] = find(item, './p/text()')
    # Prefer the expanded transport/other info; fall back to the short form
    # when the expanded lookup fails.
    try:
        other_info = ' '.join(find(response, q.OTHER_INFO_MORE, False))
    except Exception:
        other_info = ' '.join(find(response, q.OTHER_INFO, False))
    house.update({'transportation': other_info})
    yield house
def parse(self, response):
    """Read the page-count indicator and enqueue every listing page."""
    raw_pages = find(response, sf.PAGE_COUNT)
    if not raw_pages:
        self.logger.error('cannot find pages of %s', response.url)
        return
    last_page = int(raw_pages.strip('/'))
    base_url = response.url.rstrip('/')
    for page_no in range(1, last_page + 1):
        yield Request(base_url + f'/b9{page_no}/',
                      callback=self.parse_house_link)
def parse_huxing_link(self, response):
    """Follow each floor-plan URL on the page, forwarding the house id."""
    room_links = find(response, shjd_xp.ROOM, False)
    if not room_links:
        self.logger.warning('room urls unreachable! %s', response.url)
        return
    house_id = response.meta['house_id']
    for room_link in room_links:
        yield Request(room_link, callback=self.parse_huxing,
                      meta={'house_id': house_id})
def parse_room_count(self, response):
    """Derive the number of floor-plan pages (8 plans per page) and visit
    each one."""
    count = find(response, ajk_xp.ROOM_COUNT)
    if not count:
        self.logger.warning('room count is empty. %s', response.url)
        return
    # count//8 + 1 pages in total; range end is exclusive, hence +2.
    for page_no in range(1, int(count) // 8 + 2):
        page_url = str(response.url).replace('.html', f'/s?p={page_no}')
        yield Request(page_url, callback=self.parse_room_url)
def parse_xiangqing(self, response):
    """Parse a house detail page into one house record.

    Missing sections are logged and skipped; the record is yielded with
    whatever fields could be scraped.
    """
    # City abbreviation is the URL subdomain (last token before the first
    # '.', after stripping the scheme).
    city = response.url.split('.')[0].split('/')[-1]
    house = {
        'new_data': True,
        'city': CITY[city],
        'table': self.name,
        'house_id': response.meta['house_id'],
        'alias_name': find(response, shjd_xp.OTHER_NAME),
    }
    labels = find(response, shjd_xp.LABELS, False)
    if not labels:
        self.logger.warning('empty labels! %s', response.url)
    else:
        house['labels'] = labels
    base_items = response.xpath(shjd_xp.INFO)
    if not base_items:
        self.logger.warning('base info is empty! %s', response.url)
    else:
        # Each table row holds a left ('l') and a right ('r') label/value
        # pair; self.base maps and stores each into `house`.
        for item in base_items:
            self.base(house, item, 'l')
            self.base(house, item, 'r')
    licenses = response.xpath(shjd_xp.LICENSE)
    if not licenses:
        self.logger.warning('license info is empty! %s', response.url)
    else:
        house['license'] = [{
            'license_number': find(item, './td[1]/text()'),
            'license_start_at': find(item, './td[2]/span/text()'),
            'bind_building': find(item, './td[3]/text()'),
        } for item in licenses]
    price = response.xpath(shjd_xp.PRICE)
    if not price:
        self.logger.warning('price unreachable! %s', response.url)
    else:
        house['price_history'] = [
            {
                'release_time': find(item, './td[1]/span/text()'),
                # 'highest_price': find(item, './td[2]/text()'),
                # 'avg_price': find(item, './td[3]/span/text()'),
                # 'lowest_price': find(item, './td[4]/text()'),
                'price_details': find(item, './td[last()]/text()')
            } for item in price
        ]
        # NOTE(review): assumes rows are newest-first, so [0] is the
        # current price — confirm against the live page.
        house['price'] = house['price_history'][0]['price_details']
    house['description'] = find(response, shjd_xp.DESCRIPTION)
    yield house
def parse_news(self, response):
    """Emit the news items found on the page.

    TODO: only the first page is fetched; no pagination is followed.
    """
    blocks = response.xpath('//div[@class="bd"]')
    if not blocks:
        self.logger.warning('no story %s', response.url)
        return
    news = []
    for block in blocks:
        news.append({
            'update_at': find(block, './div/span/text()'),
            'news_content': {
                'news_link': find(block, './/h3/a/@href'),
                'news_title': find(block, './/h3/a/text()')
            }
        })
    yield {
        'house_id': response.meta['house_id'],
        'table': self.name,
        'item': ('news', news)
    }
def parse(self, response):
    """Compute the number of listing pages (50 entries per page) from the
    total count and schedule every page."""
    total = find(response, ajk_xp.TOTAL_PAGES)
    if not total:
        self.logger.error('cannot find pages of %s', response.url)
        return
    last_page = int(total) // 50 + 1
    prefix = response.url.rstrip('/')
    for page_no in range(1, last_page + 1):
        yield Request(prefix + f'/loupan/all/p{page_no}/',
                      callback=self.parse_house_link)
def parse_pictorial(self, response):
    """Emit the pictorial items on the page (pictorials differ from the
    regular album); the last two entries are dropped."""
    entries = response.xpath(ajk_xp.PIC_ITEMS)
    if not entries:
        self.logger.warning('pictorial is empty %s', response.url)
        return
    pictorial = [{
        'picture_url': find(entry, './/img/@data-src'),
        'picture_title': find(entry, './/h3/text()'),
        'picture_description': find(entry, './/p/text()')
    } for entry in entries]
    yield {
        'house_id': response.meta['house_id'],
        'table': self.name,
        'item': ('pictorial', pictorial[:-2])
    }
def parse_pic(self, response):
    """Parse picture metadata embedded in a JS literal on the page and emit
    one album item; pictorial ('画报') categories are crawled separately.

    Fix: ``json.loads`` failures were caught with a bare ``except:``;
    narrowed to ``except ValueError`` (``json.JSONDecodeError`` is its
    subclass), the only exception ``json.loads`` raises for malformed str
    input.
    """
    labels = find(response, ajk_xp.PIC_HEADER, False)
    if '画报' in labels:
        labels.remove('画报')
        urls = find(response, ajk_xp.PICTORIAL, False)
        for url in urls:
            yield Request(url.split('?')[0], callback=self.parse_pictorial,
                          meta={'house_id': response.meta['house_id']})
    html = ''.join(response.text.replace('\n', '').split(' '))
    data = self.ptn_pic_loc.findall(html)
    if not data:
        self.logger.error('picture info not found %s', response.url)
        return
    # The matched blob is a JS literal, not JSON: quote the bare keys,
    # normalise quotes/whitespace, and close the trailing bracket.
    data = data[0].replace('big', '"big"').replace('small', '"small"')\
        .replace('image_id', '"image_id"').replace(' ', ' ')\
        .replace('image_des', '"image_des"').replace('\'', '"') + ']'
    try:
        data = json.loads(data)
    except ValueError:
        self.logger.error('json loads error %s', response.url)
        return
    yield {
        'house_id': response.meta['house_id'],
        'table': self.name,
        'item': ('album', [{
            'picture_label': label,
            'picture_url': url,
            'picture_description': des
        } for label, pic in zip(labels, data)
            for url, des in zip(pic['big'], pic['image_des'])])
    }
def parse_house_link(self, response):
    """Pair the house links on a listing page with the house ids embedded in
    the page script, then schedule the four per-house detail pages."""
    links = find(response, sf.HOUSE_LINK, False)
    if not links:
        self.logger.error('cannot find house_link of %s', response.url)
        return
    id_groups = self.ptn_house_id.findall(response.text)
    if not id_groups:
        self.logger.error('house ids not found! %s', response.url)
        return
    city = response.url.split('.fang.')[0].split('.')[-1]
    ids = id_groups[0].split(',')
    for link, house_id in zip(links, ids):
        link = link.split('/?')[0]
        # basic parameters
        yield Request(link + f'house/{house_id}/housedetail.htm',
                      callback=self.parse_house,
                      meta={
                          'house_id': house_id,
                          'city': city
                      })
        # floor plans
        yield Request(self.huxing_url.format(link, house_id),
                      callback=self.parse_room,
                      meta={'house_id': house_id})
        # pictures
        yield Request(link + f'photo/{house_id}.htm',
                      callback=self.parse_pic_link,
                      meta={'house_id': house_id})
        # news
        yield Request(link + f'house/{house_id}/dongtai.htm',
                      callback=self.parse_news,
                      meta={
                          'house_id': house_id,
                          'house_link': link
                      })
def parse_house_link(self, response):
    """Expand each house id into its four detail requests (details, floor
    plans, album, news) using the URL templates carried in meta."""
    ids = find(response, shjd_xp.HOUSE_IDS, False)
    if not ids:
        self.logger.error('house ids not clear! %s', response.url)
        return
    # (meta template key, handler) pairs, in the original scheduling order.
    targets = (
        ('xq', self.parse_xiangqing),
        ('hx', self.parse_huxing_link),
        ('xc', self.parse_xiangce_link),
        ('dt', self.parse_news),
    )
    for house_id in ids:
        for tpl_key, handler in targets:
            yield Request(url=response.meta[tpl_key].format(house_id),
                          callback=handler,
                          meta={'house_id': house_id})
def parse_house(self, response):
    """Parse a house detail page into one record: header fields, the
    parameter table, pre-sale licenses and the price history.
    """
    house = {
        # if false then push the data to array
        'new_data': True,
        'house_id': response.meta['house_id'],
        'city': CITY[response.meta['city']],
        'table': self.name,
        'building_name': find(response, sf.NAME),
        'alias_name': find(response, sf.ALIAS),
        'feature': find(response, sf.LABELS, False),
        'price': find(response, sf.PRICE),
        'description': find(response, sf.DESCRIPTION)
    }
    # Parameter table
    for item in response.xpath(sf.INFO):
        name = find(item, './div[1]/text() | ./span/text()')
        if not name:
            continue
        name = name.strip(':')
        # Rows handled elsewhere (features, license, phone) are skipped.
        if name in ['项目特色', '楼盘特色', '预售许可证', '咨询电话']:
            continue
        if name not in self.kw_dict:
            self.logger.warning('name %s unknown %s', name, response.url)
            continue
        name = self.kw_dict[name]
        # kw_dict values are either the target field name (str) or a
        # (field_name, value_xpath) pair for rows with a different layout.
        if isinstance(name, str):
            value = find(item, './div[2]/text()')
        else:
            name, value = name[0], find(item, name[1])
        house[name] = value
    history = response.xpath(sf.HISTORY)
    if not history:
        self.logger.warning('history unreachable! %s', response.url)
    else:
        # Two tables: licenses then prices; one table: prices only.
        # (Both skip the header row via position()>1.)
        if len(history) > 1:
            licenses = history[0].xpath('.//tr[position()>1]')
            house['license'] = [{
                'license_number': find(item, './td[1]/text()'),
                'license_start_at': find(item, './td[2]/text()'),
                'bind_building': find(item, './td[3]/text()'),
            } for item in licenses]
            price = history[1].xpath('.//tr[position()>1]')
        else:
            price = history[0].xpath('.//tr[position()>1]')
        house['price_history'] = [{
            'release_time': find(item, './td[1]/text()'),
            'price_details': find(item, './td[last()]/text()')
        } for item in price]
    yield house