def parse(self, response):
    """Parse the city-selection page and schedule one request per city.

    Each city link yields a :class:`Request` for :meth:`parse_city`
    carrying the city name in ``meta``. The collected city names are
    stored on ``self.cities`` once the generator is exhausted.
    """
    collected = []
    for anchor in response.xpath('//*[@class="Z_city_list Z_select_body"]/a'):
        city_url = from_xpath(anchor, './@href', xt.urljoin, source=response)
        city_name = from_xpath(anchor, './/text()', xt.string_join)
        collected.append(city_name)
        # dont_filter: city pages may share URLs with other requests.
        yield Request(city_url, self.parse_city,
                      meta={"city": city_name}, dont_filter=True)
    self.cities = collected
def parse_city(self, response):
    """Parse a city page and schedule a request for every district.

    The district links live in the sibling container of the anchor
    labelled "区域" (district). Each request carries a combined
    ``house_place`` ("<city>-<district>") and the city name in ``meta``.
    """
    city = response.meta['city']
    district_links = response.xpath(
        '//a[text()="区域"]/following-sibling::div//a')
    for link in district_links:
        district_url = from_xpath(link, './@href', xt.urljoin, source=response)
        district = from_xpath(link, './text()', xt.string_join)
        yield Request(
            district_url,
            self.parse_area,
            meta={
                'house_place': "-".join((city, district)),
                'city': city,
            },
            dont_filter=True,
        )
def parse(self, response):
    """Debug/demo callback: exercise the various ``from_xpath`` call forms.

    Prints the result of several extraction styles (bare, with an
    extractor such as ``xt.extract``/``xt.string_join``/``xt.urljoin``,
    and the project-specific ``xt.analysis_article``), then runs a
    compound list-spec extraction over every ``one_entity`` element.

    NOTE(review): the exact semantics of the list-spec form (root xpath
    followed by per-field ``[xpath, extractor, kwargs, filter]`` entries)
    are defined by the project's ``from_xpath`` helper — confirm there.
    """
    print(from_xpath(response, '//title/text()'))
    print(from_xpath(response, '//title/text()', xt.extract))
    print(
        from_xpath(response, '//*[@class="one_entity"][1]//text()', xt.string_join))
    print(
        from_xpath(response, '//*[@class="one_entity"][1]//h2/a/@href', xt.urljoin))
    print(
        from_xpath(response, '//*[@class="one_entity"][1]', xt.analysis_article))
    # Compound extraction: for each entity, join the h2 text and keep only
    # hrefs that do NOT already start with 'https' (see the lambda filter).
    alc, new, content = from_xpath(response, [
        '//*[@class="one_entity"]',
        ['.//h2//text()', xt.string_join],
        [
            './/h2/a/@href', xt.urljoin, {},
            lambda url: not url.startswith('https')
        ],
    ])
    for el1, el2 in content:
        print(el1, el2)
    # Pagination is deliberately disabled: `and False` makes the branch
    # unreachable — kept by the author to avoid crawling the next page.
    if new and False:  # there is no mean for False, just don't want to get next page
        yield next_page_request(response, 'page=(\d+)')
def parse_area(self, response):
    """Parse an area listing page and yield one item per unseen house.

    Prices are anti-scraped as a digit sprite: the page embeds all digits
    in one background image and selects each digit with a CSS
    ``background-position`` offset. We decode the sprite once per page
    (``zr.get_price``) and map each offset back to a digit via the
    module-level ``price_list`` lookup table.

    Follows the "下一页" (next page) link only while the current page
    produced at least one new (not-yet-stored) house.

    Fixes vs. original: raw strings for the regex patterns (the old
    ``'url\\('``/``'\\s'`` forms emit invalid-escape warnings on modern
    CPython), the position regex is compiled once outside the loops, and
    ``map(lambda x: str(x), …)`` is simplified to ``map(str, …)``.
    All produced values are unchanged.
    """
    # Locate the price sprite URL in the inline style block.
    # NOTE(review): re.search may return None if the style is missing,
    # which raises AttributeError — same behavior as the original.
    img_url = re.search(r'url\((.*?)\)', response.text).group(1)
    house_price_list = zr.get_price(
        requests.get(response.urljoin(img_url)).content)

    # Hoisted: this pattern is applied for every digit of every house.
    position_re = re.compile(r'position:\s*-(.*?)px')

    new = 0
    city = response.meta["city"]
    house_place = response.meta["house_place"]
    houses = response.xpath(
        '//div[@class="Z_list"]/div[@class="Z_list-box"]'
        '/div[@class="item"][(./div[@class="info-box"])]')
    for house in houses:
        url = from_xpath(house, './div[@class="info-box"]//h5/a/@href',
                         xt.urljoin, source=response)
        # Deduplicate against storage; presumably `url` uniquely
        # identifies a house — confirm against the collection schema.
        if self.mongodb.count({"url": url}):
            continue
        new += 1
        item = {}
        item['url'] = url
        item['city'] = city
        item["house_place"] = house_place
        item["house_name"] = from_xpath(
            house, './div[@class="info-box"]//h5/a/text()')
        # First '|' splits area from floor; floor may itself contain '|'.
        item["house_area"], \
            item["house_floor"] = \
            from_xpath(house, './div[@class="info-box"]/div[@class="desc"]/div[1]/text()').split('|', maxsplit=1)
        item["distance_from_subway"] = \
            from_xpath(house, './div[@class="info-box"]/div[@class="desc"]/div[@class="location"]/text()', xt.string_join)
        item["download_time"] = time.time()
        # One style string per rendered price digit, in display order.
        prices_style = \
            from_xpath(house, './div[@class="info-box"]/div[@class="price"]/span[@class="num"]/@style', xt.extract)
        price = []
        for price_style in prices_style:
            position_num = position_re.search(price_style).group(1)
            # Map the pixel offset to its digit via the sprite lookup.
            price.append(house_price_list[price_list.index(
                int(float(position_num)))])
        item["house_price"] = int(''.join(map(str, price)))
        yield item
    self.log(f"new item: {new}", level=logging.INFO)
    # Stop paginating once a whole page yielded nothing new.
    if new:
        next_page = from_xpath(response, '//a[text()="下一页"]/@href', xt.urljoin)
        if isinstance(next_page, str) and next_page.startswith("http"):
            yield response.request.replace(url=next_page)
def parse_detail(self, response):
    """Yield a minimal record for a detail page: its URL and <title> text."""
    title = from_xpath(response, "//title/text()")
    yield {
        "url": response.url,
        "title": title,
    }