Beispiel #1
0
 def parse(self, response):
     """Yield one ``parse_city`` request per link in the city selector.

     Every ``<a>`` directly under the element whose ``class`` attribute is
     exactly ``"Z_city_list Z_select_body"`` produces a request that carries
     the link's joined text in ``meta["city"]``.

     The collected city names are stored on ``self.cities`` only after the
     loop has finished, i.e. once this generator has been fully consumed.
     """
     collected = []
     for anchor in response.xpath('//*[@class="Z_city_list Z_select_body"]/a'):
         city_url = from_xpath(anchor, './@href', xt.urljoin, source=response)
         city_name = from_xpath(anchor, './/text()', xt.string_join)
         collected.append(city_name)
         request_meta = {"city": city_name}
         # dont_filter=True is forwarded as-is (in Scrapy this bypasses the
         # duplicate-request filter).
         yield Request(city_url,
                       self.parse_city,
                       meta=request_meta,
                       dont_filter=True)
     self.cities = collected
Beispiel #2
0
 def parse_city(self, response):
     """Yield one ``parse_area`` request per area link of a city page.

     The city name is read from ``response.meta['city']``.  Area links are
     the ``<a>`` elements found inside the ``<div>`` siblings that follow the
     anchor whose text is "区域".  Each request carries ``house_place``
     (``"<city>-<area>"``) and ``city`` in its meta.
     """
     city = response.meta['city']
     area_links = response.xpath(
         '//a[text()="区域"]/following-sibling::div//a')
     for link in area_links:
         area_url = from_xpath(link, './@href', xt.urljoin, source=response)
         area_name = from_xpath(link, './text()', xt.string_join)
         request_meta = {
             'house_place': "-".join([city, area_name]),
             'city': city,
         }
         yield Request(area_url,
                       self.parse_area,
                       meta=request_meta,
                       dont_filter=True)
Beispiel #3
0
    def parse(self, response):
        """Print the results of several ``from_xpath`` call forms.

        The first five calls query ``response`` with a single XPath and an
        optional extractor from ``xt`` (none, ``extract``, ``string_join``,
        ``urljoin``, ``analysis_article``) and print whatever comes back.

        The last call passes a nested list: a parent XPath followed by two
        child specs, the second of which also supplies an empty dict and a
        predicate on the URL.  The call is unpacked as a 3-tuple; only the
        second element (used as a truthy "new" flag) and the third (iterated
        as pairs) are used here.  NOTE(review): the exact meaning of the
        tuple elements is defined by ``from_xpath`` -- confirm there.

        This is a generator because of the ``yield`` below, although that
        branch is intentionally disabled.
        """
        print(from_xpath(response, '//title/text()'))
        print(from_xpath(response, '//title/text()', xt.extract))
        print(
            from_xpath(response, '//*[@class="one_entity"][1]//text()',
                       xt.string_join))
        print(
            from_xpath(response, '//*[@class="one_entity"][1]//h2/a/@href',
                       xt.urljoin))
        print(
            from_xpath(response, '//*[@class="one_entity"][1]',
                       xt.analysis_article))

        _, new, content = from_xpath(response, [
            '//*[@class="one_entity"]',
            ['.//h2//text()', xt.string_join],
            [
                './/h2/a/@href', xt.urljoin, {},
                lambda url: not url.startswith('https')
            ],
        ])

        for el1, el2 in content:
            print(el1, el2)

        # `and False` deliberately disables pagination for this demo; drop it
        # to follow the next page whenever `new` is truthy.
        if new and False:
            # Raw string so that `\d` reaches the regex engine unchanged
            # instead of being an invalid string escape.
            yield next_page_request(response, r'page=(\d+)')
Beispiel #4
0
    def parse_area(self, response):
        """Yield house items from one area listing page, then the next page.

        Steps, in order:

        1. Find the first ``url(...)`` occurrence in the page text, download
           that resource and hand its bytes to ``zr.get_price``.
        2. For every listing ``div.item`` that contains an ``info-box``,
           skip it if ``self.mongodb.count`` reports its URL as already
           stored; otherwise build and yield an item dict.
        3. Log how many new items were produced and, if there was at least
           one, re-issue the current request against the "下一页" link.

        ``city`` and ``house_place`` are read from ``response.meta``.

        Raises:
            AttributeError: if the page has no ``url(...)`` match or a price
                ``style`` attribute has no ``position: -<n>px`` part.
            ValueError: if the area/floor text contains no ``|``, if a
                position is missing from ``price_list``, or if no price
                digits were found for a listing.
        """
        # Raw strings keep `\(`, `\)` and `\s` as regex escapes rather than
        # invalid string escapes.
        img_url = re.search(r'url\((.*?)\)', response.text).group(1)
        # NOTE(review): this is a blocking HTTP call with no timeout made
        # inside the callback -- confirm that is acceptable for this spider.
        house_price_list = zr.get_price(
            requests.get(response.urljoin(img_url)).content)

        # Compiled once; it is applied to every price <span> of every house.
        position_re = re.compile(r'position:\s*-(.*?)px')

        new = 0
        city = response.meta["city"]
        house_place = response.meta["house_place"]
        houses = response.xpath(
            '//div[@class="Z_list"]/div[@class="Z_list-box"]'
            '/div[@class="item"][(./div[@class="info-box"])]')

        for house in houses:
            url = from_xpath(house,
                             './div[@class="info-box"]//h5/a/@href',
                             xt.urljoin,
                             source=response)
            # Already stored: skip without counting it as new.
            if self.mongodb.count({"url": url}):
                continue
            new += 1
            item = {}
            item['url'] = url
            item['city'] = city
            item["house_place"] = house_place
            item["house_name"] = from_xpath(
                house, './div[@class="info-box"]//h5/a/text()')
            # The first desc line is expected to look like "<area>|<floor>".
            item["house_area"], \
            item["house_floor"] = \
                from_xpath(house, './div[@class="info-box"]/div[@class="desc"]/div[1]/text()').split('|', maxsplit=1)
            item["distance_from_subway"] = \
                from_xpath(house, './div[@class="info-box"]/div[@class="desc"]/div[@class="location"]/text()',
                           xt.string_join)
            item["download_time"] = time.time()

            # Each price digit is a <span> whose style carries a negative
            # pixel offset; the offset is mapped to an index into the values
            # returned by zr.get_price.
            prices_style = \
                from_xpath(house, './div[@class="info-box"]/div[@class="price"]/span[@class="num"]/@style',
                           xt.extract)
            price = []
            for price_style in prices_style:
                position_num = position_re.search(price_style).group(1)
                # NOTE(review): `price_list` is not defined in this method;
                # presumably a module-level table of offsets -- confirm.
                price.append(house_price_list[price_list.index(
                    int(float(position_num)))])
            item["house_price"] = int(''.join(map(str, price)))

            yield item

        self.log(f"new item: {new}", level=logging.INFO)
        # Only paginate while the current page still produced unseen houses.
        if new:
            next_page = from_xpath(response, '//a[text()="下一页"]/@href',
                                   xt.urljoin)
            if isinstance(next_page, str) and next_page.startswith("http"):
                yield response.request.replace(url=next_page)
Beispiel #5
0
 def parse_detail(self, response):
     """Yield a single item holding the page URL and its title lookup.

     ``title`` is whatever ``from_xpath`` returns for ``//title/text()``
     when called without an extractor.
     """
     page_url = response.url
     page_title = from_xpath(response, "//title/text()")
     yield {"url": page_url, "title": page_title}