def parse(self, response):
    """Layout page: store the parsed floor-plan data on the item, then
    continue the crawl at the building's photo album ("xiangce") page.
    """
    request_meta = response.meta
    request_meta['item']['house_layout'] = self.parse_layout(response)
    album_url = request_meta['root_url'] + 'xiangce/'
    yield from ParseUtil.start_request(album_url, ImagesParser().parse,
                                       request_meta)
def start_requests(self):
    """Entry point: seed the crawl with the first configured start URL.

    Improvements: removed dead commented-out code; meta built as a
    single dict literal.

    NOTE(review): meta['root_url'] is hard-coded to one specific estate
    page while the request itself goes to self.start_urls[0] — confirm
    this mismatch is intentional (it looks like a leftover from
    debugging a single listing). Behavior is preserved here.
    """
    item = HousemarkettrackerItem()
    meta = {
        'item': item,
        'root_url': 'https://cd.fang.lianjia.com/loupan/p_sdwdgcabfld/',
    }
    yield from ParseUtil.start_request(self.start_urls[0], self.parse, meta)
def parse(self, response):
    """Listing page: emit one detail-page ("xiangqing") request per
    building anchor found on the page, carrying a fresh item in meta.
    """
    anchors = response.xpath('/html/body/div[4]/ul[2]/li/a')
    for building in anchors:
        root_url = ("https://cd.fang.lianjia.com"
                    + building.xpath('@href').extract_first())
        item = HousemarkettrackerItem()
        item['house_name'] = building.xpath('@title').extract_first()
        item['home_page'] = root_url
        meta = {'item': item, 'root_url': root_url}
        print('start: ------' + root_url)
        yield from ParseUtil.start_request(root_url + 'xiangqing/',
                                           DetailParser().parse, meta)
def parse(self, response):
    """Detail ("xiangqing") page: assemble a HouseDetail from the five
    sub-parsers and chain to the building's home page (fetched through
    the Lua/Splash request helper).

    Fixes: `meta = response.meta` was executed twice, and the item was
    re-assigned into meta even though it is the very object already
    stored there; both redundancies removed — behavior unchanged.
    """
    meta = response.meta
    item = meta['item']
    basic_dict = self.parse_basic_info(response)
    plan_dict = self.parse_planning_info(response)
    facility_dict = self.parse_ancillary_facility(response)
    pre_sales_list = self.parse_pre_sales(response)
    opening_info_list = self.parse_sales_info(response)
    item['house_detail'] = HouseDetail(basic_dict, opening_info_list,
                                       plan_dict, pre_sales_list,
                                       facility_dict).__dict__
    yield from ParseUtil.start_request_with_lua(
        meta['root_url'], HouseHomePageParser().parse, meta)
def parse(self, res):
    """News ("dongtai") page: accumulate the page's news entries on the
    item and follow pagination (20 entries per page); yield the finished
    item once the last page has been consumed.
    """
    collected = []
    for entry_div in res.xpath('//div[@class="dongtai-one for-dtpic"]'):
        collected.append({
            'tag': entry_div.xpath(
                'a/span[@class="a-tag"]/text()').extract_first(),
            'title': entry_div.xpath(
                'a/span[@class="a-title"]/text()').extract_first(),
            'time': entry_div.xpath(
                'a/span[@class="a-time"]/text()').extract_first(),
            'content': entry_div.xpath(
                'child::*//div[@class="a-word"]/a/text()').extract_first(),
            'link': entry_div.xpath(
                'child::*//div[@class="a-word"]/a/@href').extract_first(),
        })
    meta = res.meta
    item = meta['item']
    if item.get('house_news') is None:
        item['house_news'] = collected
    else:
        item['house_news'] += collected
    page_box = res.xpath('//div[@class="page-box"]')
    current_page_str = page_box.xpath('@data-current').extract_first()
    # No pager on the page means there is nothing further to follow.
    if current_page_str is None:
        yield item
        return
    page_no = int(current_page_str)
    total_count = int(page_box.xpath('@data-total-count').extract_first())
    last_page = ceil(total_count / 20.0)
    if page_no < last_page:
        next_url = meta['root_url'] + 'dongtai/pg' + str(page_no + 1)
        yield from ParseUtil.start_request(next_url, NewsParser().parse, meta)
    else:
        yield item
def parse(self, response):
    """Photo-album ("xiangce") page: collect full-size image URLs grouped
    by album title, store them on the item, then chain to the comments
    ("pinglun") page.

    Bug fix: the old title regex r'(.+?)(\d*)' let the lazy `.+?` stop
    after a SINGLE character — `\d*` happily matches the empty string,
    so the overall match succeeded immediately and every album title was
    truncated to its first character. Anchoring the trailing digit run
    to the end of the string keeps the whole title while still stripping
    a trailing photo count.
    """
    meta = response.meta
    item = meta['item']
    image_dict = {}
    for album_div in response.xpath('//div[@class="tab-group"]'):
        title = album_div.xpath('h4/a/text()').extract_first()
        # Strip only a trailing run of digits (the album's photo count).
        title = re.search(r'(.+?)\d*$', title).group(1)
        image_list = []
        for image_li in album_div.xpath('ul/li'):
            image_url = image_li.xpath('a/img/@src').extract_first()
            # Swap the thumbnail size token for the 1000px-wide rendition.
            image_list.append(re.sub(r'235x178', '1000x', image_url))
        image_dict[title] = image_list
    item['house_images'] = image_dict
    comments_url = meta['root_url'] + 'pinglun/'
    yield from ParseUtil.start_request(comments_url, CommentParser().parse,
                                       meta)
def parse_comments(self, res):
    """Comments ("pinglun") page: scrape every comment on the page,
    append them to item['house_comment']['comments'], then either follow
    pagination (20 comments per page) or move on to the news section.

    Bug fix: the user-avatar lookup used the absolute XPath
    '//img/@src', which searches the WHOLE document and therefore
    returned the page's first image for every single comment; it is now
    relative ('.//img/@src') so it stays inside the per-comment user
    div. Also: regexes are raw strings, and the next-page URL is only
    built when it is actually needed.
    """
    comments_in_page = []
    for li in res.xpath('//li[@data-role="commentitem"]'):
        entry = {}
        # --- user block ---
        user = li.xpath('div[@class="l_userpic"]')
        entry['user_image'] = user.xpath('.//img/@src').extract_first()
        user_line = user.xpath('div[@class="info"]//text()').extract()
        entry['user_name'] = self.normalize_space(user_line[0])
        if len(user_line) > 1:  # visitors have no "user life" line
            entry['user_life'] = self.normalize_space(user_line[1])
        # --- comment block ---
        comment = li.xpath('div[@class="r_comment"]')
        entry['tag'] = comment.xpath(
            'span[@class="tag"]/text()').extract_first()
        star = comment.xpath(
            'child::*//div[@class="star_info"]/@style').extract_first()
        # The inline style carries a percentage width; map it to a 0-5 score.
        entry['star'] = 5 * int(re.match(r'.+?(\d+)%', star).group(1)) / 100
        for specific_score in comment.xpath(
                'child::*/div[@class="num"]/span/text()').extract():
            parts = specific_score.split(':')
            entry[parts[0]] = parts[1]
        entry['words'] = li.xpath(
            'child::*//div[@class="words"]/text()').extract_first()
        entry['time'] = li.xpath(
            'child::*//div[@class="time"]/text()').extract_first()
        entry['like'] = li.xpath(
            'child::*//div[@class="like"]/span/text()').extract_first()
        comments_in_page.append(entry)
    meta = res.meta
    item = meta['item']
    item['house_comment']['comments'] += comments_in_page
    current_page_str = res.xpath(
        '//div[@class="page-box"]/@data-current').extract_first()
    # No pager at all: comments are done, hand over to the news parser.
    if current_page_str is None:
        yield from self.start_parse_news(meta)
        return
    current_page = int(current_page_str)
    total_pages = ceil(int(res.xpath(
        '//div[@class="page-box"]/@data-total-count').extract_first()) / 20.0)
    if current_page < total_pages:
        next_page_url = meta['root_url'] + 'pinglun/pg' + str(current_page + 1)
        yield from ParseUtil.start_request(
            next_page_url, CommentParser().parse_comments, meta)
    else:
        yield from self.start_parse_news(meta)
def start_parse_news(self, meta):
    """Kick off scraping of the building's news ("dongtai") section."""
    yield from ParseUtil.start_request(meta['root_url'] + 'dongtai/',
                                       NewsParser().parse, meta)