def parse_homes(self, response):
    """Parse a listings table, yielding one loaded HouseItem per row.

    Each ``<tr class="inre-clm">`` row carries the sale date, an address
    split across several text/anchor nodes, the price and property type.
    The second address text node is optional.
    """
    rows = response.xpath('//tr[contains(@class,"inre-clm")]')
    for row in rows:
        date = row.xpath(".//td/text()")[0].extract()
        address_number = row.xpath(
            './/div[contains(@class,"address")]/text()')[0].extract()
        address_area_1 = row.xpath(
            './/div[contains(@class,"address")]/a/text()')[0].extract()
        try:
            address_area_2 = row.xpath(
                './/div[contains(@class,"address")]/text()')[1].extract()
        except IndexError:
            # Was a bare `except:` — only a missing second text node is
            # expected here; anything else should propagate.
            address_area_2 = ""

        price = row.xpath(".//td/text()")[-2].extract()
        property_type = row.xpath(".//td/text()")[-1].extract()

        # `loader` replaces the original single-letter name `l`.
        loader = ItemLoader(item=HouseItem(), selector=row)
        loader.add_value('date', date)
        loader.add_value('address_number', address_number)
        loader.add_value('address_area_1', address_area_1)
        loader.add_value('address_area_2', address_area_2)
        loader.add_value('price', price)
        loader.add_value('property_type', property_type)

        yield loader.load_item()
Example #2
0
 def county(self,response):
     """Parse a city's district-ranking page and schedule follow-ups.

     Reads province/city info from ``response.meta`` (set by the caller),
     extracts each district's name and links, then requests that
     district's second-hand-home (oldhome) and new-development (newhome)
     listing pages.
     """
     item=response.meta
     # Province, city and link inherited from the previous callback via meta
     province=item['province']
     city=item['city']
     city_href=item['city_href']
     url=response.url
     item = HouseItem()
     # Store the inherited values on this callback's fresh item
     item['province'] = province
     item['city'] = city
     item['city_href'] = city_href
     body=response.body
     soup=BeautifulSoup(body,"lxml")
     tbody=soup.find_all('tbody',class_=None)
     # The first requested link sometimes returns incomplete content;
     # in that case re-fetch the page directly with requests.
     if tbody==[]:
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
         }
         urls=requests.get(url,headers=headers).text
         soup=BeautifulSoup(urls,'lxml')
         tbody = soup.find_all('tbody', class_=None)
     # Contains all district/county info for one city
     tr=tbody[-1].find_all('tr',class_=None)
     # Extract each district's info
     for t in tr[1:]:
         t=str(t).replace('\n','').replace('\r','').replace('\t','')
         # print(t)
         # The anchor's attributes appear in either order; try one regex
         # and fall back to the other on IndexError.
         try:
             county_info=re.findall('<a class="c_blue" href="(.*?)" title="(.*?)">(.*?)</a>',t)[0]
             # District name
             item['county']=county_info[-1]
             # Link to the district's detail page
             county_href=city_href+county_info[0]
             # Link to the district's community listing
             oldhome_href=county_href[:-1].replace('/market/dist','/ha/list/salesort.html?dist=')
             item['county_href']=county_href
             item['oldhome_href']=oldhome_href
         except:
             county_info = re.findall('<a title="(.*?)" href="(.*?)" class="c_blue">(.*?)</a>', t)[0]
             item['county']=county_info[-1]
             county_href=city_href+county_info[1]
             # Link to the district's second-hand-home listing
             oldhome_href=county_href[:-1].replace('/market/dist','/ha/list/salesort.html?dist=')
             item['county_href']=county_href
             item['oldhome_href']=oldhome_href
         # Follow to the second-hand-home info
         yield scrapy.Request(url=oldhome_href, callback=self.oldhome, meta=item, dont_filter=True)
         # Link to the district's new-development listing
         newhome_href=county_href.replace('/market/dist','/ha/ds')
         item['cpage']=1
         item['newhome_href'] = newhome_href
         # District listing first-page URL (intermediate value for pagination)
         item['newhome_fweb']=newhome_href
         # Follow to the new-development info
         yield scrapy.Request(url=newhome_href, callback=self.newhome, meta=item, dont_filter=True)
Example #3
0
 def parse(self, response):
     """Yield a populated HouseItem for every listing <li> on the page."""
     for listing in response.xpath('//li[@class="clear LOGCLICKDATA"]'):
         # The info line is '|'-separated: [?, type, size, orientation, fitment, ...]
         info_parts = listing.xpath(
             './div[1]/div[2]/div/text()').extract()[0].split('|')
         house = HouseItem()
         house['title'] = listing.xpath('./div[1]/div[1]/a/text()').extract()[0]
         house['address'] = listing.xpath(
             './div[1]/div[2]/div/a/text()').extract()[0]
         house['type'] = info_parts[1].strip()
         house['size'] = info_parts[2].strip()
         house['orientation'] = info_parts[3].strip()
         house['fitment'] = info_parts[4].strip()
         house['price'] = listing.xpath(
             './div[1]/div[6]/div[1]/span/text()').extract()[0]
         yield house
Example #4
0
    def parse(self, response):
        """Parse the province/city index page and request district pages.

        Handles regular provinces first, then the municipalities (which
        have no parent province — the municipality name is used for both
        ``province`` and ``city``).  One shared mutable ``item`` is
        threaded through ``meta`` to the ``county`` callback.
        """
        item = HouseItem()
        sel = scrapy.Selector(response)
        webs_info = sel.xpath(".//div[@class='col_detail']/table[@class='table_city']")
        # Province names (a list of 27 provinces)
        # Shandong's class is 's_province s_plast ordinary_province ' with a
        # trailing space; the others have none, so a contains() match is needed.
        province_info = webs_info.xpath(".//span[contains(@class,'s_province s_plast ordinary_province')]/text()").extract()
        # province_info.insert(19,"山东")
        web_info = webs_info.xpath(".//span[@class='wraplist']")
        # Process each province separately
        for i in range(len(web_info)):
            # Province name
            item['province'] = province_info[i]
            w = web_info[i]
            # All city info belonging to this province
            wraps = w.xpath(".//span[@class='wrap']")
            # Process each city separately
            for wrap in wraps:
                # City name
                item['city'] = wrap.xpath(".//span[@class='m_d_zx']/a/text()").extract()[0]
                # City link
                city_href = wrap.xpath(".//span[@class='m_d_zx']/a/@href").extract()[0]
                item['city_href'] = city_href
                # Link to the city's district price-ranking page
                city_county=city_href+'/market/rankforsale.html'
                # Follow to the district detail callback
                yield scrapy.Request(url=city_county, callback=self.county, meta=item, dont_filter=True)

        # Municipalities directly under the central government (no parent province)
        zhixiashi_list = webs_info.xpath(".//td[@class='right_city']/span[@class='m_d_zx']/a/text()").extract()
        # Municipality link list
        zhixianshi_href_list = webs_info.xpath(".//td[@class='right_city']/span[@class='m_d_zx']/a/@href").extract()
        # All district info for the municipalities
        zxs_county = webs_info.xpath(".//td[@class='right_city']/span[@class='m_d_city mb5']")
        for i in range(len(zxs_county)):
            item['province'] = zhixiashi_list[i]
            # Municipality name (doubles as the city name)
            item['city'] = zhixiashi_list[i]
            # Municipality link
            city_href = zhixianshi_href_list[i]
            item['city_href'] = city_href
            # Link to the municipality's district price-ranking page
            city_county = city_href + '/market/rankforsale.html'
            # Follow to the district detail callback
            yield scrapy.Request(url=city_county, callback=self.county, meta=item, dont_filter=True)
Example #5
0
    def parse_item(self, response):
        """Yield a HouseItem per entry in the house-list container."""
        logger.info(response.url)
        listings = response.xpath(
            '//ul[@class="houselist-mod houselist-mod-new"]/li')

        for entry in listings:
            # Exactly five detail spans are expected per listing.
            room_type, capacity, house_type, house_time, detail = entry.xpath(
                './/div[@class="details-item"]/span/text()').extract()
            name, infos = [part.strip() for part in detail.split('\xa0\xa0')]
            pieces = infos.split('-')
            pieces.reverse()
            # getFistItem presumably consumes from the reversed list — the
            # order of these three calls matters (TODO confirm).
            area = self.getFistItem(pieces)
            region = self.getFistItem(pieces)
            street = self.getFistItem(pieces)
            yield HouseItem({
                'title':
                entry.xpath('.//div[@class="house-title"]/a/@title').get(),
                'labels':
                entry.xpath(
                    './/div[@class="tags-bottom"]/span/text()').extract(),
                'price':
                entry.xpath('.//span[@class="price-det"]/strong/text()').get(),
                'mean_price':
                entry.xpath('.//span[@class="unit-price"]/text()').get(),
                'time':
                house_time,
                'area':
                area,
                'region':
                region,
                'street':
                street,
                'name':
                name,
                'room_type':
                room_type,
                'house_type':
                house_type,
                'capacity':
                capacity,
            })
Example #6
0
    def parse_item(self, response):
        """Collect one HouseItem per residence block and return them all."""
        sel = Selector(response)
        base_url = get_base_url(response)
        collected = []
        for block in sel.xpath('//div[@class="resblock-desc-wrapper"]'):
            name = block.xpath(
                'div[@class="resblock-name"]/a/text()').extract()
            address = block.xpath(
                'div[@class="resblock-location"]/a/text()').extract()
            price = block.xpath(
                'div[@class="resblock-price"]/div[@class="main-price"]/span/text()'
            ).extract()
            href = block.xpath(
                'div[@class="resblock-name"]/a/@href').extract()
            # Rebuild an absolute URL from the href's second path segment.
            detail_url = base_url + '/' + ''.join(href).split('/')[2]

            item = HouseItem()
            item['house_name'] = name
            item['house_address'] = address
            # price arrives as [amount, unit]; join amount with the
            # whitespace-stripped unit text.
            item['house_price'] = price[0] + price[1].strip()
            item['house_url'] = detail_url
            collected.append(item)
        return collected
Example #7
0
    def parse_newhouse(self, response):
        """Parse a new-development detail page and yield a HouseItem.

        Extracts the development name, address, tag labels, average price,
        available layouts, project address and opening time.
        """
        nlcd_name = response.xpath("//div[@class='tit']/h1//text()").get()
        # The address is split across the 2nd and 3rd <li>; default each
        # piece to '' so a missing node no longer raises TypeError on
        # concatenation (the original crashed when either .get() was None).
        new_house_address = \
            (response.xpath("//div[@class='br_left']//li[2]//text()").get() or '') + \
            (response.xpath("//div[@class='br_left']//li[3]//text()").get() or '')
        residence = response.xpath(
            "//div[@class='biaoqian1']/a[1]/text()").get()
        new_disk = response.xpath(
            "//div[@class='biaoqian1']/a[2]/text()").get()
        avg_money = response.xpath(
            "//div[@class='inf_left fl ']//text()").getall()
        # Strip all whitespace and join the price fragments.  Raw strings
        # (r'\s') avoid the invalid-escape-sequence warning the originals
        # triggered.
        avg_money = ''.join(re.sub(r'\s', '', avg) for avg in avg_money)
        housewear = response.xpath("//div[@class='fl zlhx']//text()").getall()
        housewear = [re.sub(r'\s', '', ware) for ware in housewear]
        housewear = ','.join(i for i in housewear if len(i) > 0)
        project_addr = response.xpath(
            "//div[@class='information_li']//span/text()").get()
        open_time = response.xpath(
            "//div[@class='inf_left fl']//a[@class='kaipan']/text()").get()
        item = HouseItem(nlcd_name=nlcd_name,
                         new_house_address=new_house_address,
                         residence=residence,
                         new_disk=new_disk,
                         avg_money=avg_money,
                         housewear=housewear,
                         project_addr=project_addr,
                         open_time=open_time)
        print("tong", nlcd_name)
        yield item

    def parse_tail(self, response):
        """Extract the first area-filter entry.

        NOTE(review): this was originally defined *inside* parse_newhouse
        (after the yield), making it an unused local function; it is now a
        proper sibling method.  Its body still looks unfinished — it builds
        item1 but never returns or yields it; confirm intended use.
        """
        item1 = {}
        item1["area"] = response.xpath(
            "//div[@class='screen_al']//ul[contains(@class,'choose_screen ')]/li/a/text()"
        ).get()
Example #8
0
 def parse(self, response):
     """Record the requested URL and its HTTP status via an ItemLoader."""
     loader = ItemLoader(item=HouseItem(), response=response)
     loader.add_value('link', [response.url])
     loader.add_value('code', [response.status])
     return loader.load_item()
Example #9
0
    def newhome(self,response):
        """Parse one page of new-development listings for a district.

        Yields a HouseItem per community that has a price, then requests
        the next page until the total page count (from '共N页') is reached.
        Pagination state travels through ``meta`` (cpage / newhome_fweb).
        """
        sel=scrapy.Selector(response)
        # Values passed down from the previous callback via meta
        item=response.meta
        province=item['province']
        city=item['city']
        city_href=item['city_href']
        county=item['county']
        # Intermediate variable (not included in the yielded item)
        cpage=item['cpage']         # current page number
        county_href=item['county_href']
        newhome_href = item['newhome_href']
        # Intermediate variable (not included in the yielded item)
        newhome_fweb=item['newhome_fweb']    # first-page URL (used to build pagination links)
        url=response.url
        item = HouseItem()
        # Store the values on this callback's fresh item
        item['province'] = province
        item['city'] = city
        item['city_href'] = city_href
        item['county'] = county
        item['county_href'] = county_href
        item['newhome_href']=newhome_href
        item['building']='新楼盘'
        item['date_before']=self.date_before
        item['ProgramStarttime']=self.ProgramStarttime
        boxs=sel.xpath(".//div[@id='content']/div[@class='halistbox']")
        # When re-fetched with lxml's HTML(), the nodes have no extract(),
        # so the two branches below cannot be merged
        if boxs==[]:   # the response is sometimes incomplete; re-fetch directly
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
            }
            urls=requests.get(url,headers=headers).text
            html = HTML(urls)
            # Section containing every community on this page
            boxs = html.xpath(".//div[@id='content']/div[@class='halistbox']")[0]
            # List of community entries
            box=boxs.xpath(".//div[@class='halist clearfix']")
            # Each community
            for b in box:
                # Community name
                item['house']=b.xpath(".//div[@class='title mb5 clearfix']/h4[@class='tit fl mr']/a/text()")[0]
                # text looks like ['均价:', '元/㎡', '(2017-06-12)'] or []
                text=b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/*/text()")
                if text:
                    try:  # price type
                        item['price_type']=text[0][:-1]
                    except:
                        item['price_type']=None
                    try:   # price publication date
                        item['time']=text[2][1:-1]
                    except:
                        item['time']=None
                # price_info looks like ['25,000'] or []
                price_info=b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/span/*/text()")
                if price_info:   # price
                    item['price']=price_info[0]
                    yield item
            # Total page count, parsed from '共N页'
            try:
                pages=boxs.xpath(".//div[@class='page1 mb5 clearfix']/span[@class='page_p']/text()")[0]
                page=int(re.findall("共(.*?)页",pages)[0])
            except:
                page=None
        else:
            # List of community entries
            box=boxs.xpath(".//div[@class='halist clearfix']")
            # Each community
            for b in box:
                # Community name
                item['house']=b.xpath(".//div[@class='title mb5 clearfix']/h4[@class='tit fl mr']/a/text()").extract()[0]
                # text looks like ['均价:', '元/㎡', '(2017-06-12)'] or []
                text=b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/*/text()").extract()
                if text:
                    try:
                        # price type (average or starting price)
                        item['price_type']=text[0][:-1]
                    except:
                        item['price_type']=None
                    try:  # price update date
                        item['time']=text[2][1:-1]
                    except:
                        item['time']=None
                # price_info looks like ['25,000'] or []
                price_info=b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/span/*/text()").extract()
                if price_info:
                    item['price']=price_info[0]
                    yield item
            # Total page count, parsed from '共N页'
            try:
                pages=boxs.xpath(".//div[@class='page1 mb5 clearfix']/span/text()").extract()[0]
                page=int(re.findall("共(.*?)页",pages)[0])
            except:
                page=None

        # Pagination
        if page:
            if cpage<page:
                # Build the next-page URL from the district's first-page URL
                newhome_href=newhome_fweb[:-1]+"-pg"+str(cpage+1)+"/"
                item['cpage']=cpage+1
                item['newhome_fweb']=newhome_fweb
                item['newhome_href']=newhome_href
                yield scrapy.Request(url=newhome_href, callback=self.newhome, meta=item, dont_filter=True)
Example #10
0
 def oldhome(self, response):
     """Parse a district's second-hand-home ranking table.

     Yields one HouseItem per community, with last month's average unit
     price and the month-over-month change direction ('下降'/'上升') and
     magnitude.
     """
     # if response.status==
     sel=scrapy.Selector(response)
     # Values passed down from the previous callback via meta
     item=response.meta
     province=item['province']
     city=item['city']
     city_href=item['city_href']
     county=item['county']
     county_href=item['county_href']
     oldhome_href=item['oldhome_href']
     url=response.url
     item = HouseItem()
     # Store the values on this callback's fresh item
     item['province'] = province
     item['city'] = city
     item['city_href'] = city_href
     item['county'] = county
     item['county_href'] = county_href
     item['oldhome_href']=oldhome_href
     item['date_before']=self.date_before
     item['building'] = '二手房'
     item['ProgramStarttime']=self.ProgramStarttime
     # Section of the page containing the community table
     detail_table=sel.xpath(".//div[@class='l-c']/div[@class='gary-detail pdd-5']/table[@class='ha_detail_table mt']")
     # The first requested link sometimes returns incomplete content.
     # When re-fetched with lxml's HTML(), the nodes have no extract(),
     # so the two branches below cannot be merged
     if detail_table==[]:
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
         }
         urls=requests.get(url,headers=headers).text
         html=HTML(urls)
         # Section of the page containing the community table
         detail_table = html.xpath(".//div[@class='l-c']/div[@class='gary-detail pdd-5']/table[@class='ha_detail_table mt']")[0]
         # List of all communities in this district
         detail=detail_table.xpath(".//tr[@height='25px;']")
         # Extract each community's info
         for d in detail:
             # Community name
             item['house']=d.xpath(".//a[@class='c_blue']/text()")[0]
             # Last month's price (average unit price)
             item['price']=d.xpath(".//td[4]/span/text()")[0]
             # Month-over-month change info
             rate=d.xpath(".//td[5]/span/text()")[0]
             if '--' not in rate:
                 if rate[0]=='-':
                     item['rate_m_unit']='下降'
                     item['rate_m'] = rate[1:]
                 elif rate[0]=='+':
                     item['rate_m_unit']='上升'
                     item['rate_m'] = rate[1:]
                 else:
                     item['rate_m_unit']=None
                     item['rate_m'] = rate
             else:
                 item['rate_m_unit'] = None
                 item['rate_m'] = rate
             yield item
     else:
         # List of all communities in this district
         detail=detail_table.xpath(".//tr[@height='25px;']")
         # Extract each community's info
         for d in detail:
             # Community name
             item['house'] = d.xpath(".//a[@class='c_blue']/text()").extract()[0]
             # Last month's price (average unit price)
             item['price'] = d.xpath(".//td[4]/span/text()").extract()[0]
             # Month-over-month change info
             rate = d.xpath(".//td[5]/span/text()").extract()[0]
             if '--' not in rate:
                 if rate[0] == '-':
                     item['rate_m_unit'] = '下降'
                     item['rate_m'] = rate[1:]
                 elif rate[0] == '+':
                     item['rate_m_unit'] = '上升'
                     item['rate_m'] = rate[1:]
                 else:
                     item['rate_m_unit'] = None
                     item['rate_m'] = rate
             else:
                 item['rate_m_unit'] = None
                 item['rate_m'] = rate
             yield item