def parse_item_newhouse(self, response):
    """Yield one ``PropertyItem`` per entry of a new-house listing page.

    Every ``div.clearfix`` under ``#newhouse_loupai_list`` is loaded into
    its own item.  District and sub-district names are read from
    ``response.meta``; the remaining fields are housekeeping values.
    """
    self.logger.info("parse new house <%s>", response.url)

    def strip_spaces(text):
        return self.spc_reg.sub("", text)

    rows = response.xpath(
        '//div[@id="newhouse_loupai_list"]//div[@class="clearfix"]')
    for div in rows:
        l = ItemLoader(item=PropertyItem(), selector=div)
        l.default_output_processor = TakeFirst()

        # extract() returns a list, which urlparse() cannot handle; take the
        # first href only and tolerate entries that have no link at all.
        href = div.xpath('(.//a)[1]/@href').extract_first()
        url = urljoin(response.url, urlparse(href).path) if href else None

        l.add_xpath("title", '(.//a)[2]//text()',
                    MapCompose(strip_spaces), Join())
        l.add_xpath("price", './/div[@class="nhouse_price"]//text()',
                    MapCompose(strip_spaces), Join())
        l.add_xpath("address", './/div[@class="address"]//text()',
                    MapCompose(strip_spaces), Join("-"))
        l.add_value("dist_name", response.meta.get("dist_name"))
        l.add_value("subdist_name", response.meta.get("subdist_name"))
        if url is not None:
            l.add_value("url", url)

        # housekeeping
        l.add_value("source", response.url)
        l.add_value("project", self.settings.get("BOT_NAME"))
        l.add_value("spider", self.name)
        l.add_value("server", socket.gethostname())
        l.add_value("date", datetime.datetime.utcnow())
        yield l.load_item()
def parse_58(self, response):
    """Load a ``PropertyItem`` from a 58 page response and yield it.

    Title, price, address and agent phone are read through XPath; the id
    and housekeeping fields are filled in by ``self._load_ids`` and
    ``self._load_keephouse``.
    """
    self.logger.info("process 58 url")

    def squash(text):
        # Drop every whitespace character from the joined text.
        return "".join(text.split())

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath(
        "title", '//div[@class="house-title"]/h1[@class="c_333 f20"]/text()')
    loader.add_value("url", response.url)
    loader.add_xpath(
        "price",
        '//p[@class="house-basic-item1"]/span[@class="price"]/text()')

    address_xpath = (
        '(//span[@class="c_000 mr_10"][1]/a[1])[1]/text()'
        '|(//span[@class="c_000 mr_10"][1]/a[2])[1]/text()'
        '|//span[@class="c_000 mr_10"]/text()'
    )
    loader.add_xpath("address", address_xpath, Join(), MapCompose(squash))

    # Disabled extractors, kept for reference:
    # loader.add_xpath("district", '(//span[@class="c_000 mr_10"][1]/a[1])[2]/text()',
    #                  MapCompose(lambda x: x.strip()))
    # loader.add_xpath("subdistrict", '(//span[@class="c_000 mr_10"][1]/a[2])[2]/text()',
    #                  MapCompose(lambda x: x.strip()))
    # loader.add_xpath("agent_name", '//a[@class="c_000 agent-name-txt"]/text()',
    #                  MapCompose(lambda x: x.strip()))
    # loader.add_xpath("agent_company", '//p[@class="agent-belong"]/text()')
    loader.add_xpath("agent_phone", '//p[@class="phone-num"]/text()')
    # Disabled: category_id_secondhouse / station_name assignment.

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def parse_centanet(self, response):
    """Load a ``PropertyItem`` from a centanet page response and yield it.

    Besides the listing fields, the agent name, agent company and an
    integer ``recent_activation`` value are extracted.  Id and housekeeping
    fields come from ``self._load_ids`` / ``self._load_keephouse``.
    """
    self.logger.info("process centanet url")

    def squash(text):
        # Trim, then remove all inner whitespace.
        return "".join(text.strip().split())

    def company_name(text):
        # Remove ASCII colons and all whitespace.
        return "".join(text.replace(":", "").split())

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("title", '//h5[@class="f18"]/text()')
    loader.add_value("url", response.url)
    loader.add_xpath(
        "price", '//div[@class="roombase-price "]/span[@class="cRed"]/text()')
    loader.add_xpath("address", '(//li/div[@class="txt_r f666"])[6]/text()',
                     MapCompose(squash))
    # Disabled extractors, kept for reference:
    # loader.add_xpath("district", '//div[@class="fl breadcrumbs-area f000 "]/a[3]/text()',
    #                  MapCompose(lambda x: x.strip()))
    # loader.add_xpath("subdistrict", '//div[@class="fl breadcrumbs-area f000 "]/a[4]/text()',
    #                  MapCompose(lambda x: x.strip()))
    loader.add_xpath("agent_name", '//a[@class="f000 f18"]/b/text()')
    loader.add_xpath("agent_company", '(//p[@class="f333"])[1]/text()',
                     Join(), MapCompose(company_name))
    # The regex keeps only the digit runs; each one is then cast to int.
    loader.add_xpath("recent_activation",
                     '//p[@class="f333"]/span[@class="f666"][1]/text()',
                     MapCompose(int), re=r"\d+")
    # Disabled: category_id_secondhouse / station_name assignment.

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def scrape_content_secondhouse(self, response):
    """Yield a single ``PropertyItem`` scraped from a detail page.

    Agent and listing fields are read through XPath, district names come
    from ``response.meta``, and the housekeeping fields record where and
    when the item was produced.
    """
    # NOTE(review): logged at CRITICAL although the message looks routine.
    self.logger.critical("scrape item from <%s>", response.url)

    def strip_spaces(text):
        return self.spc_reg.sub("", text)

    loader = ItemLoader(item=PropertyItem(), response=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("agent_name", '//a[@id="agantesfxq_C04_02"]/text()')
    loader.add_xpath("agent_company",
                     '(//div[@class="tjcont-list-cline2"]/span)[2]/text()')
    loader.add_xpath(
        "agent_phone",
        '//div[contains(@class,"tjcont-list-cline3")]/span/text()')
    loader.add_xpath("title", '(//div[@id="lpname"]/div)[1]/text()',
                     MapCompose(strip_spaces), Join('-'))
    loader.add_xpath("price", '//*[text() ="万"]//text()', Join())
    loader.add_xpath("address", '//a[@id="agantesfxq_C03_05"]/text()')

    meta = response.meta
    loader.add_value("dist_name", meta.get("dist_name"))
    loader.add_value("subdist_name", meta.get("subdist_name"))
    loader.add_value("url", response.url)

    # housekeeping
    loader.add_value("source", response.request.url)
    loader.add_value("project", self.settings.get("BOT_NAME"))
    loader.add_value("spider", self.name)
    loader.add_value("server", socket.gethostname())
    loader.add_value("date", datetime.datetime.utcnow())
    yield loader.load_item()
def parse(self, response):
    """Yield one ``PropertyItem`` per ``<li>`` of a listing page.

    The district and sub-district labels are read once from the first two
    text nodes under ``div._23XzT`` and attached to every item.  After the
    rows are processed the page URL is reported to ``self._upd_retrived``.
    """
    def breadcrumb(position):
        # default="" keeps a missing node from raising AttributeError on
        # .strip(); an empty value is then simply not stored on the item.
        text = response.xpath(
            '(//div[@class="_23XzT"]//text())[%d]' % position
        ).extract_first(default="")
        return text.strip().replace('"', "")

    def strip_spaces(text):
        return self.spc_reg.sub("", text)

    def absolute_path(href):
        # Keep only the path part of the link, resolved against the page.
        return urljoin(response.url, urlparse(href).path)

    district = breadcrumb(1)
    subdistrict = breadcrumb(2)

    for div in response.xpath('//ul[@class=""]/li'):
        l = ItemLoader(item=PropertyItem(), selector=div)
        l.default_output_processor = TakeFirst()
        l.add_xpath("title", '(.//a)[1]//text()', MapCompose(strip_spaces))
        l.add_xpath("url", "(.//a)[1]//@href", MapCompose(absolute_path))
        l.add_xpath("price", './/span[text() = "万"]/..//text()', Join())
        l.add_xpath("address", './/span[@class="_13KXy"]//text()',
                    MapCompose(strip_spaces), Join('-'))
        l.add_value("dist_name", district)
        l.add_value("subdist_name", subdistrict)

        # housekeeping
        l.add_value("source", response.url)
        l.add_value("project", self.settings.get("BOT_NAME"))
        l.add_value("spider", self.name)
        l.add_value("server", socket.gethostname())
        l.add_value("date",
                    datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
        yield l.load_item()

    # NOTE(review): assumed to run once per page, after the loop - confirm.
    self._upd_retrived(response.url, 1)
def parse_ganji(self, response):
    """Load a ``PropertyItem`` from a ganji page response and yield it.

    Listing and agent fields are read through XPath; the agent phone is
    taken from the ``data-phone`` attribute.  Id and housekeeping fields
    come from ``self._load_ids`` / ``self._load_keephouse``.
    """
    self.logger.info("process ganji url")

    def squash(text):
        # Drop every whitespace character from the joined text.
        return "".join(text.split())

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("title", '//p[@class="card-title"]/i/text()')
    loader.add_value("url", response.url)
    loader.add_xpath("price", '//div[@class="price-wrap"]/span[1]/text()')

    address_xpath = (
        '(//li[@class="er-item f-fl"][1]/span[@class][2]/text()'
        '|//li[@class="er-item f-fl"][1]/span[@class][2]'
        '/a[@class="xiaoqu card-blue"])[2]/text()'
    )
    loader.add_xpath("address", address_xpath, Join(), MapCompose(squash))
    # Disabled extractors, kept for reference:
    # loader.add_xpath("district", '//span[@class="content"]/a[@class="blue"][1]/text()',
    #                  MapCompose(lambda x: x.strip()))
    # loader.add_xpath("subdistrict", '//span[@class="content"]/a[@class="blue"][2]/text()',
    #                  MapCompose(lambda x: x.strip()))
    loader.add_xpath("agent_name", '//p[@class="name"][1]/text()',
                     MapCompose(str.strip))
    loader.add_xpath("agent_company",
                     '//div[@class="user_other"]/span/text()')
    loader.add_xpath("agent_phone",
                     '//div[@id="full_phone_show"]/@data-phone')
    # Disabled: category_id_secondhouse / station_name assignment.

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def parse_item(self, response):
    """Yield one ``PropertyItem`` per ``div`` whose id contains ``puid-``.

    The currently selected district and sub-district labels are read once
    from the page and attached to every item.  After the rows are
    processed the page URL is reported to ``self._upd_retrived``.
    """
    district_xpath = (
        "(//a[text()='不限'])[1]//ancestor::ul"
        "//li[@class='item current']//text()"
    )
    subdistrict_xpath = (
        "(//a[text()='不限'])[2]//ancestor::div"
        "//a[@class='subway-item current']//text()"
    )
    district = response.xpath(district_xpath).extract_first()
    subdistrict = response.xpath(subdistrict_xpath).extract_first()

    def strip_spaces(text):
        return self.spc_reg.sub("", text)

    def absolute_path(href):
        # Keep only the path part of the link, resolved against the page.
        return urljoin(response.url, urlparse(href).path)

    for row in response.xpath("//div[contains(@id,'puid-')]"):
        loader = ItemLoader(item=PropertyItem(), selector=row)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath("title", "(.//a)[2]/text()",
                         MapCompose(strip_spaces))
        loader.add_xpath("url", "(.//a)[2]/@href",
                         MapCompose(absolute_path))
        loader.add_xpath("price", ".//div[@class='price']//text()", Join())
        loader.add_xpath("address", ".//span[@class='area']//text()",
                         MapCompose(strip_spaces), Join())
        loader.add_value("dist_name", district)
        loader.add_value("subdist_name", subdistrict)

        # housekeeping
        loader.add_value("source", response.url)
        loader.add_value("project", self.settings.get("BOT_NAME"))
        loader.add_value("spider", self.name)
        loader.add_value("server", socket.gethostname())
        loader.add_value("date", datetime.datetime.utcnow())
        yield loader.load_item()

    # NOTE(review): assumed to run once per page, after the loop - confirm.
    self._upd_retrived(response.url, 1)
def parse_ganji(self, response):
    """Load a ``PropertyItem`` from a ganji page response and yield it.

    Listing and agent fields are read through XPath; the agent phone keeps
    only the digit groups of the matched text.  Id and housekeeping fields
    come from ``self._load_ids`` / ``self._load_keephouse``.
    """
    self.logger.info("process ganji url")

    def squash(text):
        # Drop every whitespace character from the joined text.
        return "".join(text.split())

    l = ItemLoader(item=PropertyItem(), selector=response)
    l.default_output_processor = TakeFirst()
    l.add_xpath("title", '//p[@class="card-title"]/i/text()')
    l.add_value("url", response.url)
    l.add_xpath("price", '//span[@class="price"]/text()')
    l.add_xpath(
        "address",
        '(//li[@class="er-item f-fl"])[2]/span[@class="content"]/a/text()',
        Join(), MapCompose(squash))
    l.add_xpath("agent_name", '//p[@class="name"]/text()',
                MapCompose(lambda x: x.strip()))
    # Was '@clas="company"': a misspelled attribute name that can never
    # match, so agent_company was silently left empty.
    l.add_xpath("agent_company", '//span[@class="company"]/text()')
    l.add_xpath("agent_phone",
                '//a[@class="phone_num js_person_phone"]/text()',
                Join(), re="(\\d+)")

    # ids
    self._load_ids(l, response)
    # housekeeping
    self._load_keephouse(l, response)
    yield l.load_item()
def parse(self, response):
    """Yield one ``PropertyItem`` per child ``div`` of ``div.house-listBox``.

    District and sub-district are both derived from the same
    ``p.f7b.mb_15`` text, which the code splits on "-".  Rows where that
    text is missing or has no "-" no longer abort the whole page; the
    affected field is left unset instead.
    """
    self.logger.info("start parese url %s", response.url)

    def strip_spaces(text):
        return self.spc_reg.sub("", text)

    def absolute_path(href):
        # Keep only the path part of the link, resolved against the page.
        return urljoin(response.url, urlparse(href).path)

    def district_of(text):
        return text.split("-")[0].strip()

    def subdistrict_of(text):
        # Returning None makes MapCompose drop the value instead of letting
        # an IndexError escape from the processor.
        segments = text.split("-")
        if len(segments) < 2:
            return None
        words = segments[1].split()
        return words[0] if words else None

    region_xpath = './/p[@class="f7b mb_15"]/text()'

    for div in response.xpath('//div[@class="house-listBox"]/div'):
        l = ItemLoader(item=PropertyItem(), selector=div)
        l.default_output_processor = TakeFirst()
        l.add_xpath("title", '(.//a)[2]/text()', MapCompose(strip_spaces))
        l.add_xpath("url", "(.//a)[2]/@href", MapCompose(absolute_path))
        l.add_xpath("price", './/p[@class="price-nub cRed"]/text()', Join())
        l.add_xpath("address", './/a[@class="f000 mr_10"]//text()',
                    MapCompose(strip_spaces), Join())
        l.add_xpath("dist_name", region_xpath,
                    Join(), MapCompose(district_of))
        l.add_xpath("subdist_name", region_xpath,
                    Join(), MapCompose(subdistrict_of))

        # housekeeping
        l.add_value("source", response.url)
        l.add_value("project", self.settings.get("BOT_NAME"))
        l.add_value("spider", self.name)
        l.add_value("server", socket.gethostname())
        l.add_value("date",
                    datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
        yield l.load_item()
def parse_fangdd(self, response):
    """Load a ``PropertyItem`` from a fangdd page response and yield it.

    Title, price, address and agent name are read through XPath.  Id and
    housekeeping fields come from ``self._load_ids`` /
    ``self._load_keephouse``.
    """
    self.logger.info("process fangdd url")

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("title", '//h1[@class="_3940o"]/text()',
                     MapCompose(str.strip))
    loader.add_value("url", response.url)
    loader.add_xpath("price", '//span[@class="_1A2vc _1nbqO"]/text()')
    loader.add_xpath("address",
                     '//span[@class="V1q7v"]/span[@class="HtyCL"]/text()')
    loader.add_xpath("agent_name", '//span[@class="_2M6sV"]/text()')

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def parse_centanet(self, response):
    """Load a ``PropertyItem`` from a centanet page response and yield it.

    Title (with all whitespace removed), price, address and agent name
    are read through XPath.  Id and housekeeping fields come from
    ``self._load_ids`` / ``self._load_keephouse``.
    """
    self.logger.info("process centanet url")

    def squash(text):
        # Drop every whitespace character.
        return "".join(text.split())

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("title", '//h5[@class="mr25 f16 "]/a/text()',
                     MapCompose(squash))
    loader.add_value("url", response.url)
    loader.add_xpath("price", '//span[@class="nhpice"]/b/text()')
    loader.add_xpath("address", '(//p[@class="txt_r"])[1]/text()')
    loader.add_xpath("agent_name", '//span[@class="f000 f18 mr6"]/text()')
    # Disabled: category_id_shop / station_name assignment.

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def parse_fang(self, response):
    """Load a ``PropertyItem`` from a fang page response and yield it.

    Title, price, address, agent name and agent company are read through
    plain XPath lookups without extra processors.  Id and housekeeping
    fields come from ``self._load_ids`` / ``self._load_keephouse``.
    """
    self.logger.info("process fang url")

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("title", '//div[@class="tit"]/h1/strong/text()')
    loader.add_value("url", response.url)
    loader.add_xpath("price", '//span[@class="prib cn_ff"]/text()')
    loader.add_xpath("address", '//div[@class="inf_left fl"]/span/text()')
    loader.add_xpath("agent_name", '//dt[@class="wai"]/a/text()')
    loader.add_xpath("agent_company", '//li[@class="tf cl_333"]/a/text()')
    # Disabled: category_id_shop / station_name assignment.

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def parse_fang(self, response):
    """Load a ``PropertyItem`` from a fang page response and yield it.

    Title, address and agent company are joined and stripped of all
    whitespace; the agent phone is stripped of whitespace per text node.
    Id and housekeeping fields come from ``self._load_ids`` /
    ``self._load_keephouse``.
    """
    self.logger.info("process fang url")

    def squash(text):
        # Drop every whitespace character.
        return "".join(text.split())

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("title", '//div[@class="title"]/h1/text()',
                     Join(), MapCompose(squash))
    loader.add_value("url", response.url)
    loader.add_xpath("price", '//span[@class="red20b"]/text()')
    loader.add_xpath("address", '(//div[@class="wrap"]//dl/dt)[3]/text()',
                     Join(), MapCompose(squash))
    loader.add_xpath("agent_name", '//span[@id="agentname"]/text()')
    loader.add_xpath("agent_company", '//dd[@class="black"]/a/text()',
                     Join(), MapCompose(squash))
    loader.add_xpath(
        "agent_phone",
        '//div[@class="phone_top"]//label[@id="mobilecode"]/text()',
        MapCompose(squash))
    # Disabled: category_id_shop / station_name assignment.

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def parse(self, response):
    """Schedule a detail-page request for every listing row on the page.

    The district and sub-district labels are read from the first two links
    next to ``a#list_105`` and forwarded to
    ``self.scrape_content_secondhouse`` through the request ``meta``.
    Rows without a link are skipped.  After the rows are processed the
    page URL is reported to ``self._upd_retrived``.
    """
    district = response.xpath(
        '(//a[@id = "list_105"]/../a)[1]/text()').extract_first()
    subdistrict = response.xpath(
        '(//a[@id = "list_105"]/../a)[2]/text()').extract_first()

    for div in response.xpath('//dl[contains(@id,"list_D")]'):
        href = div.xpath("(.//a)[1]//@href").extract_first()
        if not href:
            # urljoin() raises on a missing href; skip rows without a link.
            continue
        url = urljoin(response.url, urlparse(href).path)
        yield Request(url,
                      callback=self.scrape_content_secondhouse,
                      meta={
                          "dist_name": district,
                          "subdist_name": subdistrict
                      })

    # NOTE(review): assumed to run once per page, after the loop - confirm.
    self._upd_retrived(response.url, 1)
def parse_qfang(self, response):
    """Load a ``PropertyItem`` from a qfang page response and yield it.

    Title, price, address and agent name are read through plain XPath
    lookups.  Id and housekeeping fields come from ``self._load_ids`` /
    ``self._load_keephouse``.
    """
    self.logger.info("process qfang url")

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("title", '//h2[@class="house-title fl"]/text()')
    loader.add_value("url", response.url)
    loader.add_xpath("price",
                     '//p[@class="head-info-price fl"]/span/text()')
    loader.add_xpath("address", '//p[@class="corresponding-con"]/a/text()')
    # Disabled extractors, kept for reference:
    # loader.add_xpath("district", '//div[@class="r-b-a fl clearfix"]/p/a[1]/text()',
    #                  MapCompose(lambda x: x.strip()))
    # loader.add_xpath("subdistrict", '//div[@class="r-b-a fl clearfix"]/p/a[1]/text()',
    #                  MapCompose(lambda x: x.strip()))
    loader.add_xpath("agent_name", '//p[@class="name fl"]/a/text()')
    # Disabled: category_id_secondhouse / station_name assignment.

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def parse_anjuke(self, response):
    """Load a ``PropertyItem`` from an anjuke page response and yield it.

    Title, address and agent company are joined and stripped of all
    whitespace; price and agent name are taken as-is.  Id and
    housekeeping fields come from ``self._load_ids`` /
    ``self._load_keephouse``.
    """
    self.logger.info("process anjuke url")

    def squash(text):
        # Drop every whitespace character.
        return "".join(text.split())

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("title", '//div[@class="wrapper"]/h1/text()',
                     Join(), MapCompose(squash))
    loader.add_value("url", response.url)
    loader.add_xpath("price", '//span[@class="price-tag"]/em/text()')
    loader.add_xpath("address", '//span[@class="desc addresscommu"]/text()',
                     Join(), MapCompose(squash))
    loader.add_xpath("agent_name",
                     '//div[@class="bro-info clearfix"]/h5/text()')
    loader.add_xpath("agent_company", '//p[@class="comp_info"]/a/text()',
                     Join(), MapCompose(squash))
    # Disabled: category_id_secondhouse / station_name assignment.

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def parse_58(self, response):
    """Load a ``PropertyItem`` from a 58 page response and yield it.

    Title and address are joined and stripped of all whitespace, the
    agent name is trimmed, and the agent phone has its whitespace
    removed.  Id and housekeeping fields come from ``self._load_ids`` /
    ``self._load_keephouse``.
    """
    self.logger.info("process 58 url")

    def squash(text):
        # Drop every whitespace character.
        return "".join(text.split())

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("title", '//div[@class="house-title"]/h1/text()',
                     Join(), MapCompose(squash))
    loader.add_value("url", response.url)
    loader.add_xpath(
        "price", '//span[@class="house_basic_title_money_num"]/text()')
    loader.add_xpath(
        "address",
        '//span[@class="house_basic_title_content_item3 xxdz-des"]/text()',
        Join(), MapCompose(squash))
    loader.add_xpath("agent_name", '//span[@class="f14 c_333 jjrsay"]/text()',
                     MapCompose(str.strip))
    # Disabled extractor, kept for reference:
    # loader.add_xpath("agent_company", '//span[@class="f14 c_333 jjrsay"]/text()')
    loader.add_xpath("agent_phone", '//p[@class="phone-num"]/text()',
                     MapCompose(squash))
    # Disabled: category_id_secondhouse / station_name assignment.

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def parse_fangdd(self, response):
    """Load a ``PropertyItem`` from a fangdd page response and yield it.

    Title and price have their whitespace removed per text node, the
    address is joined first and then stripped of whitespace, and the agent
    name is trimmed.  Id and housekeeping fields come from
    ``self._load_ids`` / ``self._load_keephouse``.
    """
    self.logger.info("process fangdd url")

    def squash(text):
        # Drop every whitespace character.
        return "".join(text.split())

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("title", '//h1[@class="_3sWIj"]/text()',
                     MapCompose(squash))
    loader.add_value("url", response.url)
    loader.add_xpath("price", '//div[@class="C1hVk"]/text()',
                     MapCompose(squash))
    loader.add_xpath("address",
                     '//div[@class="_2mmF- _3YJ15 undefined"]/text()',
                     Join(), MapCompose(squash))
    loader.add_xpath("agent_name", '//span[@class="zMrme"]/text()',
                     MapCompose(str.strip))
    # Disabled: category_id_secondhouse / station_name assignment.

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def parse_qfang(self, response):
    """Load a ``PropertyItem`` from a qfang page response and yield it.

    The title is trimmed, the address is joined and stripped of all
    whitespace, and price and agent name are taken as-is.  Id and
    housekeeping fields come from ``self._load_ids`` /
    ``self._load_keephouse``.
    """
    self.logger.info("process qfang url")

    def squash(text):
        # Drop every whitespace character.
        return "".join(text.split())

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("title", '//h2[@class="house-title fl"]/text()',
                     MapCompose(str.strip))
    loader.add_value("url", response.url)
    loader.add_xpath("price",
                     '//p[@class="newhs-average-price fl"]/span/text()')
    loader.add_xpath("address",
                     '//p[@class="project-address clearfix"]/em/text()',
                     Join(), MapCompose(squash))
    loader.add_xpath("agent_name", '(//p[@class="name"]/span)[1]/text()')
    # Disabled: category_id_secondhouse / station_name assignment.

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def parse_fang(self, response):
    """Load a ``PropertyItem`` from a fang page response and yield it.

    Listing fields plus agent name, company and phone are read through
    plain XPath lookups.  Id and housekeeping fields come from
    ``self._load_ids`` / ``self._load_keephouse``.
    """
    self.logger.info("process fang url")

    loader = ItemLoader(item=PropertyItem(), selector=response)
    loader.default_output_processor = TakeFirst()

    loader.add_xpath("title", '//div[@id="lpname"]/div[1]/text()')
    loader.add_value("url", response.url)
    loader.add_xpath("price",
                     '//div[@class="trl-item price_esf sty1"]/i/text()')
    loader.add_xpath(
        "address",
        '//div[@class="rcont"]/a[@id="agantesfxq_C03_05"]/text()')
    # Disabled extractors, kept for reference:
    # loader.add_xpath("district", '//div[@id="address"]/a[1]/text()',
    #                  MapCompose(lambda x: x.strip()))
    # loader.add_xpath("subdistrict", '//div[@id="address"]/a[2]/text()',
    #                  MapCompose(lambda x: x.strip()))
    loader.add_xpath("agent_name", '//span[@class="zf_jjname"]/a/text()')
    loader.add_xpath("agent_company",
                     '//div[@class="tjcont-list-cline2"]/span[2]/text()')
    loader.add_xpath(
        "agent_phone",
        '//div[@class="tjcont-list-cline3 font16"]/span/text()')
    # Disabled: category_id_secondhouse assignment.

    # ids
    self._load_ids(loader, response)
    # housekeeping
    self._load_keephouse(loader, response)
    yield loader.load_item()
def parse_item(self, response):
    """Yield an ``AgentItem`` and then a ``PropertyItem`` for one page.

    Both items are loaded from the same response.  When the agent item has
    no ``subdist_name`` the raw HTML is written under ``failed_html/`` so
    the page can be inspected later; the item is yielded regardless.
    """
    import os  # local import: only needed for the debug dump below

    def squash(text):
        # Drop every whitespace character.
        return "".join(text.split())

    # agency table
    l = ItemLoader(item=AgentItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_xpath("name", '//div[@class="sthys3"]/text()', re=r":(\w+)")
    l.add_xpath("telephone", '//div[@class="sttelct2 sttelct"]/text()',
                MapCompose(squash))
    # Pre-seed the field so it exists on the item even if nothing matches.
    l.item.setdefault("company", None)
    l.add_xpath("company", '//li[@class="st14 stb starial"]//text()')
    l.add_xpath("address", '//div[@class="xflilist"]/div[3]//text()',
                re=r':(\w+)')
    l.add_xpath("register_date", '//div[@class="jbfx"]/text()',
                re=r'登记日期:([\d/]+)')
    l.add_value("city_name", self.city_name)
    l.add_value("dist_name", self.dist_name)
    l.add_value("category_name", self.category_name)
    l.add_value("station_name", self.station_name)
    l.add_xpath("subdist_name", '(//div[@class="xx_xq_l200"])[2]/text()',
                re='区域:(?:昆山)?(\\w+)')

    # housekeeping
    l.add_value("source", response.url)
    l.add_value("project", self.settings.get("BOT_NAME"))
    l.add_value("spider", self.name)
    l.add_value("server", socket.gethostname())
    l.add_value("dt", datetime.datetime.utcnow())

    item = l.load_item()
    if not item.get("subdist_name"):
        self.logger.critical(
            "subdsitrict name is not scrape, save response as a file")
        # A URL without an "id" query parameter used to raise TypeError
        # here; fall back to a fixed label instead.
        ids = parse_qs(urlparse(response.url).query).get("id")
        page_id = ids[0] if ids else "unknown"
        os.makedirs("failed_html", exist_ok=True)
        # The context manager closes the file even if the write fails.
        with open("failed_html/html_%s.html" % page_id, 'w',
                  encoding='utf8') as f:
            f.write(response.text)
        # return Request(url=response.url)
    yield item

    # properties table
    l = ItemLoader(item=PropertyItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_xpath('title', '//div[@class="xxview_title"]/text()')
    l.add_value("url", response.url)
    l.add_xpath(
        "price", '//div[@class="xx_xq_l200"]/span[@class="st22 '
        'sthuangs stb starial"]/text()')
    l.add_xpath("address", '//div[@class="wydzleft"]/text()',
                MapCompose(lambda x: x.strip()),
                re=r'物业地址:([^\x01-\x1f]+)')
    l.add_xpath("agent_name", '//div[@class="sthys3"]/text()',
                re=r":(\w+)")
    # Pre-seed the field so it exists on the item even if nothing matches.
    l.item.setdefault("agent_company", None)
    l.add_xpath("agent_company", '//li[@class="st14 stb starial"]//text()')
    l.add_xpath('agent_phone', '//div[@class="sttelct2 sttelct"]/text()',
                MapCompose(squash))
    l.add_xpath("recent_activation", '//div[@class="fyfbtime"]/text()',
                re='查看人次:(\\d+)')
    l.add_value("city_name", self.city_name)
    l.add_value("dist_name", self.dist_name)
    l.add_value('station_name', self.station_name)
    l.add_value("category_name", self.category_name)
    l.add_xpath("subdist_name", '(//div[@class="xx_xq_l200"])[2]/text()',
                re='区域:(?:昆山)?(\\w+)')

    # housekeeping
    l.add_value("source", response.request.url)
    l.add_value("project", self.settings.get("BOT_NAME"))
    l.add_value("spider", self.name)
    l.add_value("server", socket.gethostname())
    l.add_value("dt", datetime.datetime.utcnow())
    yield l.load_item()