コード例 #1
0
ファイル: fang.py プロジェクト: chocoai/estate
    def parse_item_newhouse(self, response):
        """Yield one PropertyItem per listing block in the new-house list.

        District and sub-district names are taken from ``response.meta``
        (keys ``dist_name`` and ``subdist_name``); the remaining fields are
        scraped from each ``div.clearfix`` under ``#newhouse_loupai_list``.
        """
        print("parse new house <%s>" % response.url)
        # Removes whatever self.spc_reg matches from every extracted text node.
        strip_spaces = MapCompose(lambda x: self.spc_reg.sub("", x))
        for div in response.xpath(
                '//div[@id="newhouse_loupai_list"]//div[@class="clearfix"]'):
            loader = ItemLoader(item=PropertyItem(), selector=div)
            loader.default_output_processor = TakeFirst()
            # extract() returns a list, which urlparse() cannot handle.  Take
            # the first href only; with the "" default urljoin() resolves to
            # the listing page URL when the block has no link.
            href = div.xpath('(.//a)[1]/@href').extract_first(default="")
            url = urljoin(response.url, urlparse(href).path)
            loader.add_xpath("title", '(.//a)[2]//text()',
                             strip_spaces, Join())
            loader.add_xpath("price", './/div[@class="nhouse_price"]//text()',
                             strip_spaces, Join())
            loader.add_xpath("address", './/div[@class="address"]//text()',
                             strip_spaces, Join("-"))
            loader.add_value("dist_name", response.meta.get("dist_name"))
            loader.add_value("subdist_name", response.meta.get("subdist_name"))
            loader.add_value("url", url)

            # housekeeping
            loader.add_value("source", response.url)
            loader.add_value("project", self.settings.get("BOT_NAME"))
            loader.add_value("spider", self.name)
            loader.add_value("server", socket.gethostname())
            loader.add_value("date", datetime.datetime.utcnow())

            yield loader.load_item()
コード例 #2
0
ファイル: SecondHouseSpider.py プロジェクト: chocoai/estate
    def parse_58(self, response):
        """Build one PropertyItem from the response and yield it.

        Title, price, address and agent phone come from XPath queries; the
        id and housekeeping fields are filled in by ``self._load_ids`` and
        ``self._load_keephouse``.
        """
        self.logger.info("process 58 url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        # Joins all matched text nodes, then removes every whitespace run.
        join_nodes = Join()
        squash = MapCompose(lambda x: "".join(x.split()))
        address_xpath = (
            '(//span[@class="c_000 mr_10"][1]/a[1])[1]/text()|(//span[@class="c_000 mr_10"][1]/a[2])[1]/text()'
            '|//span[@class="c_000 mr_10"]/text()'
        )

        loader.add_xpath("title", '//div[@class="house-title"]/h1[@class="c_333 f20"]/text()')
        loader.add_value("url", response.url)
        loader.add_xpath("price", '//p[@class="house-basic-item1"]/span[@class="price"]/text()')
        loader.add_xpath("address", address_xpath, join_nodes, squash)
        # Disabled extractors, kept for reference:
        # loader.add_xpath("district", '(//span[@class="c_000 mr_10"][1]/a[1])[2]/text()',
        #                  MapCompose(lambda x: x.strip()))
        # loader.add_xpath("subdistrict", '(//span[@class="c_000 mr_10"][1]/a[2])[2]/text()',
        #                  MapCompose(lambda x: x.strip()))
        # loader.add_xpath("agent_name", '//a[@class="c_000 agent-name-txt"]/text()', MapCompose(lambda x: x.strip()))
        # loader.add_xpath("agent_company", '//p[@class="agent-belong"]/text()')
        loader.add_xpath("agent_phone", '//p[@class="phone-num"]/text()')
        # loader.add_value("category_id_secondhouse", self.category_id_secondhouse)
        # loader.add_value("station_name", "58")

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #3
0
ファイル: SecondHouseSpider.py プロジェクト: chocoai/estate
    def parse_centanet(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address, agent name/company and the
        ``recent_activation`` counter; ids and housekeeping fields are added
        by ``self._load_ids`` and ``self._load_keephouse``.
        """
        self.logger.info("process centanet url")

        def compact_address(text):
            # Trim, then drop every internal whitespace run.
            return "".join(text.strip().split())

        def compact_company(text):
            # Drop ASCII colons and all whitespace.
            return "".join(text.replace(":", "").split())

        def to_int(text):
            return int(text)

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath("title", '//h5[@class="f18"]/text()')
        loader.add_value("url", response.url)
        loader.add_xpath("price", '//div[@class="roombase-price "]/span[@class="cRed"]/text()')
        loader.add_xpath("address", '(//li/div[@class="txt_r f666"])[6]/text()',
                         MapCompose(compact_address))
        # Disabled extractors, kept for reference:
        # loader.add_xpath("district", '//div[@class="fl breadcrumbs-area f000 "]/a[3]/text()', MapCompose(lambda x: x.strip()))
        # loader.add_xpath("subdistrict", '//div[@class="fl breadcrumbs-area f000 "]/a[4]/text()', MapCompose(lambda x: x.strip()))
        loader.add_xpath("agent_name", '//a[@class="f000 f18"]/b/text()')
        loader.add_xpath("agent_company", '(//p[@class="f333"])[1]/text()',
                         Join(), MapCompose(compact_company))
        # The regex keeps only digit runs before they are converted to int.
        loader.add_xpath("recent_activation",
                         '//p[@class="f333"]/span[@class="f666"][1]/text()',
                         MapCompose(to_int), re=r"\d+")
        # loader.add_value("category_id_secondhouse", self.category_id_secondhouse)
        # loader.add_value("station_name", "中原地产")

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #4
0
ファイル: fang.py プロジェクト: chocoai/estate
    def scrape_content_secondhouse(self, response):
        """Build one PropertyItem from the response and yield it.

        District and sub-district names are read from ``response.meta``
        (keys ``dist_name`` / ``subdist_name``); everything else is scraped
        from the page or derived from the spider and host.
        """
        self.logger.critical("scrape item from <%s>", response.url)

        loader = ItemLoader(item=PropertyItem(), response=response)
        loader.default_output_processor = TakeFirst()

        # Agent details.
        loader.add_xpath("agent_name", '//a[@id="agantesfxq_C04_02"]/text()')
        loader.add_xpath(
            "agent_company",
            '(//div[@class="tjcont-list-cline2"]/span)[2]/text()')
        loader.add_xpath(
            "agent_phone",
            '//div[contains(@class,"tjcont-list-cline3")]/span/text()')

        # Property details.
        strip_spaces = MapCompose(lambda x: self.spc_reg.sub("", x))
        loader.add_xpath("title", '(//div[@id="lpname"]/div)[1]/text()',
                         strip_spaces, Join('-'))
        loader.add_xpath("price", '//*[text() ="万"]//text()', Join())
        loader.add_xpath("address", '//a[@id="agantesfxq_C03_05"]/text()')

        meta = response.meta
        loader.add_value("dist_name", meta.get("dist_name"))
        loader.add_value("subdist_name", meta.get("subdist_name"))
        loader.add_value("url", response.url)

        # Housekeeping fields, added in a fixed order.
        housekeeping = (
            ("source", response.request.url),
            ("project", self.settings.get("BOT_NAME")),
            ("spider", self.name),
            ("server", socket.gethostname()),
            ("date", datetime.datetime.utcnow()),
        )
        for field, value in housekeeping:
            loader.add_value(field, value)

        yield loader.load_item()
コード例 #5
0
ファイル: fangdd.py プロジェクト: chocoai/estate
    def parse(self, response):
        """Yield a PropertyItem per listing ``<li>``, then record the page.

        The district and sub-district names are read from the first two text
        nodes under ``div._23XzT`` and attached to every item.  Once all
        items have been yielded, ``self._upd_retrived(response.url, 1)`` is
        called.
        """

        def crumb_text(position):
            # extract_first() returns None when the node is missing; calling
            # .strip() on that used to abort the whole callback.  Returning
            # None instead leaves the field unset on the items.
            text = response.xpath(
                '(//div[@class="_23XzT"]//text())[%d]' % position
            ).extract_first()
            if text is None:
                return None
            return text.strip().replace("\"", "")

        district = crumb_text(1)
        subdistrict = crumb_text(2)

        strip_spaces = MapCompose(lambda x: self.spc_reg.sub("", x))
        # Keep only the path of the href and resolve it against the page URL.
        to_absolute = MapCompose(
            lambda x: urljoin(response.url, urlparse(x).path))

        for div in response.xpath('//ul[@class=""]/li'):
            loader = ItemLoader(item=PropertyItem(), selector=div)
            loader.default_output_processor = TakeFirst()
            loader.add_xpath("title", '(.//a)[1]//text()', strip_spaces)
            loader.add_xpath("url", "(.//a)[1]//@href", to_absolute)
            loader.add_xpath("price", './/span[text() = "万"]/..//text()',
                             Join())
            loader.add_xpath("address", './/span[@class="_13KXy"]//text()',
                             strip_spaces, Join('-'))
            loader.add_value("dist_name", district)
            loader.add_value("subdist_name", subdistrict)

            # housekeeping
            loader.add_value("source", response.url)
            loader.add_value("project", self.settings.get("BOT_NAME"))
            loader.add_value("spider", self.name)
            loader.add_value("server", socket.gethostname())
            loader.add_value(
                "date", datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

            yield loader.load_item()
        self._upd_retrived(response.url, 1)
コード例 #6
0
ファイル: SecondHouseSpider.py プロジェクト: chocoai/estate
    def parse_ganji(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address and agent details; ids and
        housekeeping fields are added by ``self._load_ids`` and
        ``self._load_keephouse``.
        """
        self.logger.info("process ganji url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        address_xpath = '(//li[@class="er-item f-fl"][1]/span[@class][2]/text()|//li[@class="er-item f-fl"][1]/span[@class][2]/a[@class="xiaoqu card-blue"])[2]/text()'
        # Joins the matched nodes, then removes every whitespace run.
        squash = MapCompose(lambda x: "".join(x.split()))
        trim = MapCompose(lambda x: x.strip())

        loader.add_xpath("title", '//p[@class="card-title"]/i/text()')
        loader.add_value("url", response.url)
        loader.add_xpath("price", '//div[@class="price-wrap"]/span[1]/text()')
        loader.add_xpath("address", address_xpath, Join(), squash)
        # Disabled extractors, kept for reference:
        # loader.add_xpath("district", '//span[@class="content"]/a[@class="blue"][1]/text()',
        #                  MapCompose(lambda x: x.strip()))
        # loader.add_xpath("subdistrict", '//span[@class="content"]/a[@class="blue"][2]/text()',
        #                  MapCompose(lambda x: x.strip()))
        loader.add_xpath("agent_name", '//p[@class="name"][1]/text()', trim)
        loader.add_xpath("agent_company", '//div[@class="user_other"]/span/text()')
        loader.add_xpath("agent_phone", '//div[@id="full_phone_show"]/@data-phone')
        # loader.add_value("category_id_secondhouse", self.category_id_secondhouse)
        # loader.add_value("station_name", "赶集网")

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #7
0
ファイル: ganji.py プロジェクト: chocoai/estate
    def parse_item(self, response):
        """Yield a PropertyItem for each ``div`` whose id contains ``puid-``.

        The selected district / sub-district labels are read once from the
        page and attached to every item.  After the loop,
        ``self._upd_retrived(response.url, 1)`` is called.
        """
        dist_name = response.xpath(
            "(//a[text()='不限'])[1]//ancestor::ul//li[@class='item current']//text()"
        ).extract_first()
        subdist_name = response.xpath(
            "(//a[text()='不限'])[2]//ancestor::div//a[@class='subway-item current']//text()"
        ).extract_first()

        strip_spaces = MapCompose(lambda x: self.spc_reg.sub("", x))
        # Keep only the path of the href and resolve it against the page URL.
        to_absolute = MapCompose(
            lambda x: urljoin(response.url, urlparse(x).path))

        for node in response.xpath("//div[contains(@id,'puid-')]"):
            loader = ItemLoader(item=PropertyItem(), selector=node)
            loader.default_output_processor = TakeFirst()

            loader.add_xpath("title", "(.//a)[2]/text()", strip_spaces)
            loader.add_xpath("url", "(.//a)[2]/@href", to_absolute)
            loader.add_xpath("price", ".//div[@class='price']//text()", Join())
            loader.add_xpath("address", ".//span[@class='area']//text()",
                             strip_spaces, Join())
            loader.add_value("dist_name", dist_name)
            loader.add_value("subdist_name", subdist_name)

            # Housekeeping fields, added in a fixed order.
            housekeeping = (
                ("source", response.url),
                ("project", self.settings.get("BOT_NAME")),
                ("spider", self.name),
                ("server", socket.gethostname()),
                ("date", datetime.datetime.utcnow()),
            )
            for field, value in housekeeping:
                loader.add_value(field, value)

            yield loader.load_item()

        self._upd_retrived(response.url, 1)
コード例 #8
0
ファイル: NewHouseSpider.py プロジェクト: chocoai/estate
    def parse_ganji(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address and agent details; ids and
        housekeeping fields are added by ``self._load_ids`` and
        ``self._load_keephouse``.
        """
        self.logger.info("process ganji url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath("title", '//p[@class="card-title"]/i/text()')
        loader.add_value("url", response.url)
        loader.add_xpath("price", '//span[@class="price"]/text()')
        loader.add_xpath(
            "address",
            '(//li[@class="er-item f-fl"])[2]/span[@class="content"]/a/text()',
            Join(), MapCompose(lambda x: "".join(x.split())))
        loader.add_xpath("agent_name", '//p[@class="name"]/text()',
                         MapCompose(lambda x: x.strip()))
        # The attribute test was misspelled as @clas, which never matches,
        # so agent_company was always left empty.
        loader.add_xpath("agent_company", '//span[@class="company"]/text()')
        # The regex keeps only the digit runs, which Join() then
        # concatenates with its default separator.
        loader.add_xpath("agent_phone",
                         '//a[@class="phone_num js_person_phone"]/text()',
                         Join(),
                         re="(\\d+)")

        # ids
        self._load_ids(loader, response)
        # housekeeping
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #9
0
ファイル: fang.py プロジェクト: chocoai/estate
    def parse(self, response):
        """Yield a PropertyItem for each child ``div`` of ``div.house-listBox``.

        District and sub-district names are both derived from the text of
        ``p.f7b.mb_15``, split on "-": the first part is the district, the
        first whitespace-delimited token of the second part is the
        sub-district.
        """
        self.logger.info("start parese url %s" % response.url)

        def district_of(text):
            return text.split("-")[0].strip()

        def subdistrict_of(text):
            # The previous x.split("-")[1].split()[0] raised IndexError when
            # the text had no "-" or nothing after it (e.g. a row without
            # the location paragraph), aborting the whole page.  Returning
            # None makes MapCompose drop the value, so the field stays unset.
            parts = text.split("-")
            if len(parts) < 2:
                return None
            tokens = parts[1].split()
            return tokens[0] if tokens else None

        strip_spaces = MapCompose(lambda x: self.spc_reg.sub("", x))
        # Keep only the path of the href and resolve it against the page URL.
        to_absolute = MapCompose(
            lambda x: urljoin(response.url, urlparse(x).path))

        for div in response.xpath('//div[@class="house-listBox"]/div'):
            loader = ItemLoader(item=PropertyItem(), selector=div)
            loader.default_output_processor = TakeFirst()
            loader.add_xpath("title", '(.//a)[2]/text()', strip_spaces)
            loader.add_xpath("url", "(.//a)[2]/@href", to_absolute)
            loader.add_xpath("price", './/p[@class="price-nub cRed"]/text()',
                             Join())
            loader.add_xpath("address", './/a[@class="f000 mr_10"]//text()',
                             strip_spaces, Join())

            loader.add_xpath("dist_name", './/p[@class="f7b mb_15"]/text()',
                             Join(), MapCompose(district_of))
            loader.add_xpath("subdist_name", './/p[@class="f7b mb_15"]/text()',
                             Join(), MapCompose(subdistrict_of))

            # housekeeping
            loader.add_value("source", response.url)
            loader.add_value("project", self.settings.get("BOT_NAME"))
            loader.add_value("spider", self.name)
            loader.add_value("server", socket.gethostname())
            loader.add_value(
                "date", datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

            yield loader.load_item()
コード例 #10
0
ファイル: SecondHouseSpider.py プロジェクト: chocoai/estate
    def parse_fangdd(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address and agent name; ids and housekeeping
        fields are added by ``self._load_ids`` and ``self._load_keephouse``.
        """
        self.logger.info("process fangdd url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        trim = MapCompose(lambda x: x.strip())
        loader.add_xpath("title", '//h1[@class="_3940o"]/text()', trim)
        loader.add_value("url", response.url)

        # Remaining fields are taken verbatim from the matched text nodes.
        plain_fields = (
            ("price", '//span[@class="_1A2vc _1nbqO"]/text()'),
            ("address", '//span[@class="V1q7v"]/span[@class="HtyCL"]/text()'),
            ("agent_name", '//span[@class="_2M6sV"]/text()'),
        )
        for field, query in plain_fields:
            loader.add_xpath(field, query)

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #11
0
ファイル: NewHouseSpider.py プロジェクト: chocoai/estate
    def parse_centanet(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address and agent name; ids and housekeeping
        fields are added by ``self._load_ids`` and ``self._load_keephouse``.
        """
        self.logger.info("process centanet url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        # Removes every whitespace run from the title text.
        squash = MapCompose(lambda x: "".join(x.split()))
        loader.add_xpath("title", '//h5[@class="mr25 f16 "]/a/text()', squash)
        loader.add_value("url", response.url)

        plain_fields = (
            ("price", '//span[@class="nhpice"]/b/text()'),
            ("address", '(//p[@class="txt_r"])[1]/text()'),
            ("agent_name", '//span[@class="f000 f18 mr6"]/text()'),
        )
        for field, query in plain_fields:
            loader.add_xpath(field, query)
        # Disabled, kept for reference:
        # loader.add_value("category_id_shop", self.category_id_shop)
        # loader.add_value("station_name", "中原地产")

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #12
0
ファイル: NewHouseSpider.py プロジェクト: chocoai/estate
    def parse_fang(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address and agent name/company; ids and
        housekeeping fields are added by ``self._load_ids`` and
        ``self._load_keephouse``.
        """
        self.logger.info("process fang url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath("title", '//div[@class="tit"]/h1/strong/text()')
        loader.add_value("url", response.url)

        # Remaining fields are taken verbatim from the matched text nodes.
        plain_fields = (
            ("price", '//span[@class="prib cn_ff"]/text()'),
            ("address", '//div[@class="inf_left fl"]/span/text()'),
            ("agent_name", '//dt[@class="wai"]/a/text()'),
            ("agent_company", '//li[@class="tf cl_333"]/a/text()'),
        )
        for field, query in plain_fields:
            loader.add_xpath(field, query)
        # Disabled, kept for reference:
        # loader.add_value("category_id_shop", self.category_id_shop)
        # loader.add_value("station_name", "房天下")
        # category_name
        # station_name

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #13
0
ファイル: ShopSpider.py プロジェクト: chocoai/estate
    def parse_fang(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address and agent name/company/phone; ids and
        housekeeping fields are added by ``self._load_ids`` and
        ``self._load_keephouse``.
        """
        self.logger.info("process fang url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        # join_nodes merges the matched text nodes; squash then removes
        # every whitespace run from the result.
        join_nodes = Join()
        squash = MapCompose(lambda x: "".join(x.split()))

        loader.add_xpath("title", '//div[@class="title"]/h1/text()',
                         join_nodes, squash)
        loader.add_value("url", response.url)
        loader.add_xpath("price", '//span[@class="red20b"]/text()')
        loader.add_xpath("address", '(//div[@class="wrap"]//dl/dt)[3]/text()',
                         join_nodes, squash)
        loader.add_xpath("agent_name", '//span[@id="agentname"]/text()')
        loader.add_xpath("agent_company", '//dd[@class="black"]/a/text()',
                         join_nodes, squash)
        loader.add_xpath(
            "agent_phone",
            '//div[@class="phone_top"]//label[@id="mobilecode"]/text()',
            squash)
        # Disabled, kept for reference:
        # loader.add_value("category_id_shop", self.category_id_shop)
        # loader.add_value("station_name", "房天下")

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #14
0
ファイル: fang.py プロジェクト: chocoai/estate
    def parse(self, response):
        """Schedule a detail request for each listing on the page.

        For each ``dl`` whose id contains ``list_D`` a Request is yielded to
        ``self.scrape_content_secondhouse``, carrying the district and
        sub-district names read from the page in ``meta``.  After the loop,
        ``self._upd_retrived(response.url, 1)`` is called.
        """

        district = response.xpath(
            '(//a[@id = "list_105"]/../a)[1]/text()').extract_first()
        subdistrict = response.xpath(
            '(//a[@id = "list_105"]/../a)[2]/text()').extract_first()

        for div in response.xpath('//dl[contains(@id,"list_D")]'):
            href = div.xpath("(.//a)[1]//@href").extract_first()
            if not href:
                # Without an href, urljoin() falls back to the listing URL,
                # which would then be requested with the detail callback.
                continue
            # Keep only the path of the href and resolve it against the page.
            url = urljoin(response.url, urlparse(href).path)

            yield Request(url,
                          callback=self.scrape_content_secondhouse,
                          meta={
                              "dist_name": district,
                              "subdist_name": subdistrict
                          })
        self._upd_retrived(response.url, 1)
コード例 #15
0
ファイル: SecondHouseSpider.py プロジェクト: chocoai/estate
    def parse_qfang(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address and agent name; ids and housekeeping
        fields are added by ``self._load_ids`` and ``self._load_keephouse``.
        """
        self.logger.info("process qfang url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath("title", '//h2[@class="house-title fl"]/text()')
        loader.add_value("url", response.url)

        # Remaining fields are taken verbatim from the matched text nodes.
        plain_fields = (
            ("price", '//p[@class="head-info-price  fl"]/span/text()'),
            ("address", '//p[@class="corresponding-con"]/a/text()'),
            ("agent_name", '//p[@class="name fl"]/a/text()'),
        )
        for field, query in plain_fields:
            loader.add_xpath(field, query)
        # Disabled extractors, kept for reference:
        # loader.add_xpath("district", '//div[@class="r-b-a fl clearfix"]/p/a[1]/text()',
        #                  MapCompose(lambda x: x.strip()))
        # loader.add_xpath("subdistrict", '//div[@class="r-b-a fl clearfix"]/p/a[1]/text()',
        #                  MapCompose(lambda x: x.strip()))
        # loader.add_value("category_id_secondhouse", self.category_id_secondhouse)
        # loader.add_value("station_name", "Q房网")

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)
        yield loader.load_item()
コード例 #16
0
ファイル: ShopSpider.py プロジェクト: chocoai/estate
    def parse_anjuke(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address and agent name/company; ids and
        housekeeping fields are added by ``self._load_ids`` and
        ``self._load_keephouse``.
        """
        self.logger.info("process anjuke url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        # join_nodes merges the matched text nodes; squash then removes
        # every whitespace run from the result.
        join_nodes = Join()
        squash = MapCompose(lambda x: "".join(x.split()))

        loader.add_xpath("title", '//div[@class="wrapper"]/h1/text()',
                         join_nodes, squash)
        loader.add_value("url", response.url)
        loader.add_xpath("price", '//span[@class="price-tag"]/em/text()')
        loader.add_xpath("address", '//span[@class="desc addresscommu"]/text()',
                         join_nodes, squash)
        loader.add_xpath("agent_name", '//div[@class="bro-info clearfix"]/h5/text()')
        loader.add_xpath("agent_company", '//p[@class="comp_info"]/a/text()',
                         join_nodes, squash)
        # Disabled, kept for reference:
        # loader.add_value("category_id_secondhouse", self.category_id_secondhouse)
        # loader.add_value("station_name", "安居客")

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #17
0
ファイル: ShopSpider.py プロジェクト: chocoai/estate
    def parse_58(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address and agent name/phone; ids and
        housekeeping fields are added by ``self._load_ids`` and
        ``self._load_keephouse``.
        """
        self.logger.info("process 58 url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        # join_nodes merges the matched text nodes; squash removes every
        # whitespace run; trim only strips the ends.
        join_nodes = Join()
        squash = MapCompose(lambda x: "".join(x.split()))
        trim = MapCompose(lambda x: x.strip())

        loader.add_xpath("title", '//div[@class="house-title"]/h1/text()',
                         join_nodes, squash)
        loader.add_value("url", response.url)
        loader.add_xpath("price", '//span[@class="house_basic_title_money_num"]/text()')
        loader.add_xpath(
            "address",
            '//span[@class="house_basic_title_content_item3 xxdz-des"]/text()',
            join_nodes, squash)
        loader.add_xpath("agent_name", '//span[@class="f14 c_333 jjrsay"]/text()', trim)
        # Disabled, kept for reference:
        # loader.add_xpath("agent_company", '//span[@class="f14 c_333 jjrsay"]/text()')
        loader.add_xpath("agent_phone", '//p[@class="phone-num"]/text()', squash)
        # loader.add_value("category_id_secondhouse", self.category_id_secondhouse)
        # loader.add_value("station_name", "58同城")

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #18
0
ファイル: NewHouseSpider.py プロジェクト: chocoai/estate
    def parse_fangdd(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address and agent name; ids and housekeeping
        fields are added by ``self._load_ids`` and ``self._load_keephouse``.
        """
        self.logger.info("process fangdd url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        # squash removes every whitespace run; trim only strips the ends.
        squash = MapCompose(lambda x: "".join(x.split()))
        trim = MapCompose(lambda x: x.strip())

        loader.add_xpath("title", '//h1[@class="_3sWIj"]/text()', squash)
        loader.add_value("url", response.url)
        loader.add_xpath("price", '//div[@class="C1hVk"]/text()', squash)
        loader.add_xpath("address",
                         '//div[@class="_2mmF- _3YJ15 undefined"]/text()',
                         Join(), squash)
        loader.add_xpath("agent_name", '//span[@class="zMrme"]/text()', trim)
        # Disabled, kept for reference:
        # loader.add_value("category_id_secondhouse", self.category_id_secondhouse)
        # loader.add_value("station_name", "房多多")

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #19
0
ファイル: NewHouseSpider.py プロジェクト: chocoai/estate
    def parse_qfang(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address and agent name; ids and housekeeping
        fields are added by ``self._load_ids`` and ``self._load_keephouse``.
        """
        self.logger.info("process qfang url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        # trim strips the ends; squash removes every whitespace run.
        trim = MapCompose(lambda x: x.strip())
        squash = MapCompose(lambda x: "".join(x.split()))

        loader.add_xpath("title", '//h2[@class="house-title fl"]/text()', trim)
        loader.add_value("url", response.url)
        loader.add_xpath(
            "price", '//p[@class="newhs-average-price fl"]/span/text()')
        loader.add_xpath(
            "address", '//p[@class="project-address clearfix"]/em/text()',
            Join(), squash)
        loader.add_xpath("agent_name", '(//p[@class="name"]/span)[1]/text()')

        # Disabled, kept for reference:
        # loader.add_value("category_id_secondhouse", self.category_id_secondhouse)
        # loader.add_value("station_name", "Q房网")

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #20
0
ファイル: SecondHouseSpider.py プロジェクト: chocoai/estate
    def parse_fang(self, response):
        """Build one PropertyItem from the response and yield it.

        Scrapes title, price, address and agent name/company/phone; ids and
        housekeeping fields are added by ``self._load_ids`` and
        ``self._load_keephouse``.
        """
        self.logger.info("process fang url")

        loader = ItemLoader(item=PropertyItem(), selector=response)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath("title", '//div[@id="lpname"]/div[1]/text()')
        loader.add_value("url", response.url)

        # Remaining fields are taken verbatim from the matched text nodes.
        plain_fields = (
            ("price", '//div[@class="trl-item price_esf  sty1"]/i/text()'),
            ("address", '//div[@class="rcont"]/a[@id="agantesfxq_C03_05"]/text()'),
            ("agent_name", '//span[@class="zf_jjname"]/a/text()'),
            ("agent_company", '//div[@class="tjcont-list-cline2"]/span[2]/text()'),
            ("agent_phone", '//div[@class="tjcont-list-cline3 font16"]/span/text()'),
        )
        for field, query in plain_fields:
            loader.add_xpath(field, query)
        # Disabled extractors, kept for reference:
        # loader.add_xpath("district", '//div[@id="address"]/a[1]/text()',
        #                  MapCompose(lambda x: x.strip()))
        # loader.add_xpath("subdistrict", '//div[@id="address"]/a[2]/text()',
        #                  MapCompose(lambda x: x.strip()))
        # loader.add_value("category_id_secondhouse", self.category_id_secondhouse)

        # ids, then housekeeping
        self._load_ids(loader, response)
        self._load_keephouse(loader, response)

        yield loader.load_item()
コード例 #21
0
    def parse_item(self, response):
        """Yield an AgentItem and then a PropertyItem from one page.

        Both items share the agent name/phone/company queries and the
        sub-district query.  When the agent item has no ``subdist_name`` the
        raw response text is written to ``failed_html/html_<id>.html`` for
        later inspection, where ``<id>`` is the ``id`` query parameter of
        the URL.
        """

        # agency table
        l = ItemLoader(item=AgentItem(), response=response)
        l.default_output_processor = TakeFirst()
        l.add_xpath("name", '//div[@class="sthys3"]/text()', re=r":(\w+)")
        l.add_xpath("telephone", '//div[@class="sttelct2 sttelct"]/text()',
                    MapCompose(lambda x: "".join(x.split())))
        # Ensure the key exists on the item even if nothing is scraped.
        l.item.setdefault("company", None)
        l.add_xpath("company", '//li[@class="st14 stb starial"]//text()')
        l.add_xpath("address",
                    '//div[@class="xflilist"]/div[3]//text()',
                    re=r':(\w+)')
        l.add_xpath("register_date",
                    '//div[@class="jbfx"]/text()',
                    re=r'登记日期:([\d/]+)')

        l.add_value("city_name", self.city_name)
        l.add_value("dist_name", self.dist_name)
        l.add_value("category_name", self.category_name)
        l.add_value("station_name", self.station_name)
        l.add_xpath("subdist_name",
                    '(//div[@class="xx_xq_l200"])[2]/text()',
                    re='区域:(?:昆山)?(\\w+)')

        # housekeeping
        l.add_value("source", response.url)
        l.add_value("project", self.settings.get("BOT_NAME"))
        l.add_value("spider", self.name)
        l.add_value("server", socket.gethostname())
        l.add_value("dt", datetime.datetime.utcnow())
        item = l.load_item()

        if not item.get("subdist_name"):
            self.logger.critical(
                "subdsitrict name is not scrape, save response as a file")
            # parse_qs() omits absent keys, so .get() may return None; fall
            # back to a fixed name instead of raising TypeError.
            ids = parse_qs(urlparse(response.url).query).get("id")
            page_id = ids[0] if ids else "unknown"
            # The context manager closes the file even if write() raises.
            with open("failed_html/html_%s.html" % page_id,
                      'w',
                      encoding='utf8') as f:
                f.write(response.text)
            # return Request(url=response.url)

        yield item

        # properties table
        l = ItemLoader(item=PropertyItem(), response=response)
        l.default_output_processor = TakeFirst()
        l.add_xpath('title', '//div[@class="xxview_title"]/text()')
        l.add_value("url", response.url)
        l.add_xpath(
            "price", '//div[@class="xx_xq_l200"]/span[@class="st22 '
            'sthuangs stb starial"]/text()')
        l.add_xpath("address",
                    '//div[@class="wydzleft"]/text()',
                    MapCompose(lambda x: x.strip()),
                    re=r'物业地址:([^\x01-\x1f]+)')
        l.add_xpath("agent_name",
                    '//div[@class="sthys3"]/text()',
                    re=r":(\w+)")
        # Ensure the key exists on the item even if nothing is scraped.
        l.item.setdefault("agent_company", None)
        l.add_xpath("agent_company", '//li[@class="st14 stb starial"]//text()')
        l.add_xpath('agent_phone', '//div[@class="sttelct2 sttelct"]/text()',
                    MapCompose(lambda x: "".join(x.split())))
        l.add_xpath("recent_activation",
                    '//div[@class="fyfbtime"]/text()',
                    re='查看人次:(\\d+)')

        l.add_value("city_name", self.city_name)
        l.add_value("dist_name", self.dist_name)
        l.add_value('station_name', self.station_name)
        l.add_value("category_name", self.category_name)
        l.add_xpath("subdist_name",
                    '(//div[@class="xx_xq_l200"])[2]/text()',
                    re='区域:(?:昆山)?(\\w+)')

        # housekeeping
        l.add_value("source", response.request.url)
        l.add_value("project", self.settings.get("BOT_NAME"))
        l.add_value("spider", self.name)
        l.add_value("server", socket.gethostname())
        l.add_value("dt", datetime.datetime.utcnow())
        yield l.load_item()