def parse_homes(self, response):
    """Parse a sold-homes table: yield one HouseItem per <tr class="inre-clm"> row.

    Each row carries date, a two/three-part address, price and property type.
    """
    rows = response.xpath('//tr[contains(@class,"inre-clm")]')
    for row in rows:
        date = row.xpath(".//td/text()")[0].extract()
        address_number = row.xpath(
            './/div[contains(@class,"address")]/text()')[0].extract()
        address_area_1 = row.xpath(
            './/div[contains(@class,"address")]/a/text()')[0].extract()
        # The second address text node is optional; narrowed from a bare
        # except: only a missing node (IndexError) is an expected failure.
        try:
            address_area_2 = row.xpath(
                './/div[contains(@class,"address")]/text()')[1].extract()
        except IndexError:
            address_area_2 = ""
        price = row.xpath(".//td/text()")[-2].extract()
        property_type = row.xpath(".//td/text()")[-1].extract()

        l = ItemLoader(item=HouseItem(), selector=row)
        l.add_value('date', date)
        l.add_value('address_number', address_number)
        l.add_value('address_area_1', address_area_1)
        l.add_value('address_area_2', address_area_2)
        l.add_value('price', price)
        l.add_value('property_type', property_type)
        yield l.load_item()
def county(self, response):
    """Parse a city's district/county ranking page.

    Inherits province/city values from the previous callback via
    ``response.meta``, extracts each county row, and yields follow-up
    requests for that county's second-hand ("oldhome") and new-development
    ("newhome") listing pages.
    """
    # Province, city and its link inherited from the previous callback.
    prev = response.meta
    province = prev['province']
    city = prev['city']
    city_href = prev['city_href']
    url = response.url

    item = HouseItem()  # container forwarded to the next callbacks
    item['province'] = province
    item['city'] = city
    item['city_href'] = city_href

    body = response.body
    soup = BeautifulSoup(body, "lxml")
    tbody = soup.find_all('tbody', class_=None)
    # The first link sometimes returns an incomplete page; re-fetch it
    # directly with requests as a fallback.
    if tbody == []:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        urls = requests.get(url, headers=headers).text
        soup = BeautifulSoup(urls, 'lxml')
        tbody = soup.find_all('tbody', class_=None)

    # The last <tbody> holds every county row of this city.
    tr = tbody[-1].find_all('tr', class_=None)
    for t in tr[1:]:
        t = str(t).replace('\n', '').replace('\r', '').replace('\t', '')
        # The county anchor appears with two attribute orders; try the first
        # pattern and fall back to the second when it matches nothing.
        # Narrowed from a bare except: findall(...)[0] on no match raises
        # IndexError, which is the only expected failure here.
        try:
            county_info = re.findall('<a class="c_blue" href="(.*?)" title="(.*?)">(.*?)</a>', t)[0]
            item['county'] = county_info[-1]          # county name
            county_href = city_href + county_info[0]  # county detail link
            oldhome_href = county_href[:-1].replace('/market/dist', '/ha/list/salesort.html?dist=')
            item['county_href'] = county_href
            item['oldhome_href'] = oldhome_href
        except IndexError:
            county_info = re.findall('<a title="(.*?)" href="(.*?)" class="c_blue">(.*?)</a>', t)[0]
            item['county'] = county_info[-1]
            county_href = city_href + county_info[1]
            # Second-hand listing link for the county.
            oldhome_href = county_href[:-1].replace('/market/dist', '/ha/list/salesort.html?dist=')
            item['county_href'] = county_href
            item['oldhome_href'] = oldhome_href
        # Hand off to the second-hand home parser.
        yield scrapy.Request(url=oldhome_href, callback=self.oldhome, meta=item, dont_filter=True)
        # New-development listing link for the county.
        newhome_href = county_href.replace('/market/dist', '/ha/ds')
        item['cpage'] = 1
        item['newhome_href'] = newhome_href
        # First-page URL kept as an intermediate value for pagination.
        item['newhome_fweb'] = newhome_href
        # Hand off to the new-development parser.
        yield scrapy.Request(url=newhome_href, callback=self.newhome, meta=item, dont_filter=True)
def parse(self, response):
    """Yield one HouseItem per listing row on a results page."""
    for entry in response.xpath('//li[@class="clear LOGCLICKDATA"]'):
        item = HouseItem()
        item['title'] = entry.xpath('./div[1]/div[1]/a/text()').extract()[0]
        item['address'] = entry.xpath('./div[1]/div[2]/div/a/text()').extract()[0]
        # The info line is '|'-separated; index 0 (text before the first '|')
        # is intentionally skipped.
        info = entry.xpath('./div[1]/div[2]/div/text()').extract()[0].split('|')
        item['type'] = info[1].strip()
        item['size'] = info[2].strip()
        item['orientation'] = info[3].strip()
        item['fitment'] = info[4].strip()
        item['price'] = entry.xpath('./div[1]/div[6]/div[1]/span/text()').extract()[0]
        yield item
def parse(self, response):
    """Parse the national city index page.

    Walks the province/city table and yields one request per city (and per
    municipality) targeting its district price-ranking page, handled by
    ``self.county``; province/city context travels along in ``meta``.
    """
    item = HouseItem()
    sel = scrapy.Selector(response)
    webs_info = sel.xpath(".//div[@class='col_detail']/table[@class='table_city']")
    # Province names (a list of 27 provinces).
    # NOTE: Shandong's class has a trailing space ('s_province s_plast
    # ordinary_province '), unlike the others, so contains() is required
    # instead of an exact class match.
    province_info = webs_info.xpath(".//span[contains(@class,'s_province s_plast ordinary_province')]/text()").extract()
    web_info = webs_info.xpath(".//span[@class='wraplist']")
    # One <span class="wraplist"> per province, in the same order as the names.
    for i, province_block in enumerate(web_info):
        item['province'] = province_info[i]
        # All cities belonging to this province.
        wraps = province_block.xpath(".//span[@class='wrap']")
        for wrap in wraps:
            # City name and link.
            item['city'] = wrap.xpath(".//span[@class='m_d_zx']/a/text()").extract()[0]
            city_href = wrap.xpath(".//span[@class='m_d_zx']/a/@href").extract()[0]
            item['city_href'] = city_href
            # The city's district price-ranking page.
            city_county = city_href + '/market/rankforsale.html'
            yield scrapy.Request(url=city_county, callback=self.county, meta=item, dont_filter=True)
    # Municipalities have no enclosing province; their name fills both fields.
    zhixiashi_list = webs_info.xpath(".//td[@class='right_city']/span[@class='m_d_zx']/a/text()").extract()
    zhixianshi_href_list = webs_info.xpath(".//td[@class='right_city']/span[@class='m_d_zx']/a/@href").extract()
    # District blocks of the municipalities (governs the iteration count).
    zxs_county = webs_info.xpath(".//td[@class='right_city']/span[@class='m_d_city mb5']")
    for i in range(len(zxs_county)):
        item['province'] = zhixiashi_list[i]
        item['city'] = zhixiashi_list[i]
        city_href = zhixianshi_href_list[i]
        item['city_href'] = city_href
        # The municipality's district price-ranking page.
        city_county = city_href + '/market/rankforsale.html'
        yield scrapy.Request(url=city_county, callback=self.county, meta=item, dont_filter=True)
def parse_item(self, response):
    """Yield a HouseItem for each listing card in the result list."""
    cards = response.xpath(
        '//ul[@class="houselist-mod houselist-mod-new"]/li')
    logger.info(response.url)
    for card in cards:
        room_type, capacity, house_type, house_time, detail = card.xpath(
            './/div[@class="details-item"]/span/text()').extract()
        # detail is "<name>\xa0\xa0<location>"; split on the double nbsp.
        name, infos = [piece.strip() for piece in detail.split('\xa0\xa0')]
        location_parts = infos.split('-')
        location_parts.reverse()
        # Three successive getFistItem calls on the reversed location list
        # produce area, region and street in that order.
        area = self.getFistItem(location_parts)
        region = self.getFistItem(location_parts)
        street = self.getFistItem(location_parts)
        yield HouseItem({
            'title': card.xpath('.//div[@class="house-title"]/a/@title').get(),
            'labels': card.xpath(
                './/div[@class="tags-bottom"]/span/text()').extract(),
            'price': card.xpath('.//span[@class="price-det"]/strong/text()').get(),
            'mean_price': card.xpath('.//span[@class="unit-price"]/text()').get(),
            'time': house_time,
            'area': area,
            'region': region,
            'street': street,
            'name': name,
            'room_type': room_type,
            'house_type': house_type,
            'capacity': capacity,
        })
def parse_item(self, response):
    """Collect HouseItems from every estate block on a results page."""
    collected = []
    sel = Selector(response)
    base_url = get_base_url(response)
    for block in sel.xpath('//div[@class="resblock-desc-wrapper"]'):
        entry = HouseItem()
        name_parts = block.xpath('div[@class="resblock-name"]/a/text()').extract()
        address_parts = block.xpath('div[@class="resblock-location"]/a/text()').extract()
        price_parts = block.xpath(
            'div[@class="resblock-price"]/div[@class="main-price"]/span/text()'
        ).extract()
        href_parts = block.xpath('div[@class="resblock-name"]/a/@href').extract()
        # Keep only the estate-id segment of the relative link.
        detail_url = base_url + '/' + ''.join(href_parts).split('/')[2]
        entry['house_name'] = name_parts
        entry['house_address'] = address_parts
        # Numeric part plus its (stripped) unit suffix.
        entry['house_price'] = price_parts[0] + price_parts[1].strip()
        entry['house_url'] = detail_url
        collected.append(entry)
    return collected
def parse_newhouse(self, response):
    """Parse a new-development detail page into a single HouseItem.

    Extracts the estate name, address, tags, average price, layout info,
    project address and opening time from the page.
    """
    nlcd_name = response.xpath("//div[@class='tit']/h1//text()").get()
    new_house_address = response.xpath("//div[@class='br_left']//li[2]//text()").get() + \
        response.xpath("//div[@class='br_left']//li[3]//text()").get()
    residence = response.xpath(
        "//div[@class='biaoqian1']/a[1]/text()").get()
    new_disk = response.xpath(
        "//div[@class='biaoqian1']/a[2]/text()").get()
    avg_money = response.xpath(
        "//div[@class='inf_left fl ']//text()").getall()
    # r'\s' raw strings: the original '\s' is an invalid escape sequence
    # (SyntaxWarning on modern Python); runtime behavior is unchanged.
    avg_money = ''.join(re.sub(r'\s', '', avg) for avg in avg_money)
    housewear = response.xpath("//div[@class='fl zlhx']//text()").getall()
    housewear = [re.sub(r'\s', '', ware) for ware in housewear]
    housewear = ','.join(i for i in housewear if len(i) > 0)
    project_addr = response.xpath(
        "//div[@class='information_li']//span/text()").get()
    open_time = response.xpath(
        "//div[@class='inf_left fl']//a[@class='kaipan']/text()").get()
    item = HouseItem(nlcd_name=nlcd_name,
                     new_house_address=new_house_address,
                     residence=residence,
                     new_disk=new_disk,
                     avg_money=avg_money,
                     housewear=housewear,
                     project_addr=project_addr,
                     open_time=open_time)
    print("tong", nlcd_name)  # debug trace kept from the original
    yield item

def parse_tail(self, response):
    """Extract the district name from the filter bar (no yield/return:
    appears to be a partial implementation)."""
    item1 = {}
    item1["area"] = response.xpath(
        "//div[@class='screen_al']//ul[contains(@class,'choose_screen ')]/li/a/text()"
    ).get()
def parse(self, response):
    """Record the fetched page's URL and HTTP status code as a HouseItem."""
    loader = ItemLoader(item=HouseItem(), response=response)
    loader.add_value('link', [response.url])
    loader.add_value('code', [response.status])
    return loader.load_item()
def newhome(self, response):
    """Parse one page of a county's new-development listings.

    Yields one HouseItem per estate (name, price type, price, update time)
    and, while further pages remain, a request for the next page back into
    this callback. Context (province/city/county, pagination state) arrives
    via ``response.meta``.
    """
    sel = scrapy.Selector(response)
    # Values forwarded from self.county / earlier newhome pages.
    meta = response.meta
    province = meta['province']
    city = meta['city']
    city_href = meta['city_href']
    county = meta['county']
    # Intermediate values (not emitted with the item): current page number
    # and the first-page URL used to build pagination links.
    cpage = meta['cpage']
    county_href = meta['county_href']
    newhome_href = meta['newhome_href']
    newhome_fweb = meta['newhome_fweb']
    url = response.url

    item = HouseItem()  # container populated in this callback
    item['province'] = province
    item['city'] = city
    item['city_href'] = city_href
    item['county'] = county
    item['county_href'] = county_href
    item['newhome_href'] = newhome_href
    item['building'] = '新楼盘'
    item['date_before'] = self.date_before
    item['ProgramStarttime'] = self.ProgramStarttime

    boxs = sel.xpath(".//div[@id='content']/div[@class='halistbox']")
    # The response is sometimes incomplete; re-fetch directly as a fallback.
    # lxml HTML() nodes have no .extract(), so the two branches cannot share
    # the parsing code below.
    if boxs == []:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        urls = requests.get(url, headers=headers).text
        html = HTML(urls)
        # The section holding all estate blocks on this page.
        boxs = html.xpath(".//div[@id='content']/div[@class='halistbox']")[0]
        box = boxs.xpath(".//div[@class='halist clearfix']")
        for b in box:
            # Estate name.
            item['house'] = b.xpath(".//div[@class='title mb5 clearfix']/h4[@class='tit fl mr']/a/text()")[0]
            # text looks like ['均价:', '元/㎡', '(2017-06-12)'] or [].
            text = b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/*/text()")
            if text:
                # Narrowed from bare excepts: a short list (IndexError) is
                # the only expected failure.
                try:
                    item['price_type'] = text[0][:-1]  # drop trailing ':'
                except IndexError:
                    item['price_type'] = None
                try:
                    item['time'] = text[2][1:-1]       # strip parentheses
                except IndexError:
                    item['time'] = None
            # price_info looks like ['25,000'] or [].
            price_info = b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/span/*/text()")
            if price_info:
                item['price'] = price_info[0]
            yield item
        # Total page count, e.g. "共12页".
        try:
            pages = boxs.xpath(".//div[@class='page1 mb5 clearfix']/span[@class='page_p']/text()")[0]
            page = int(re.findall("共(.*?)页", pages)[0])
        except (IndexError, ValueError):
            page = None
    else:
        box = boxs.xpath(".//div[@class='halist clearfix']")
        for b in box:
            # Estate name.
            item['house'] = b.xpath(".//div[@class='title mb5 clearfix']/h4[@class='tit fl mr']/a/text()").extract()[0]
            # text looks like ['均价:', '元/㎡', '(2017-06-12)'] or [].
            text = b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/*/text()").extract()
            if text:
                try:
                    item['price_type'] = text[0][:-1]  # price type (average/starting)
                except IndexError:
                    item['price_type'] = None
                try:
                    item['time'] = text[2][1:-1]       # price update time
                except IndexError:
                    item['time'] = None
            # price_info looks like ['25,000'] or [].
            price_info = b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/span/*/text()").extract()
            if price_info:
                item['price'] = price_info[0]
            yield item
        # Total page count, e.g. "共12页".
        try:
            pages = boxs.xpath(".//div[@class='page1 mb5 clearfix']/span/text()").extract()[0]
            page = int(re.findall("共(.*?)页", pages)[0])
        except (IndexError, ValueError):
            page = None

    # Pagination: build the next page's URL from the first-page URL.
    if page:
        if cpage < page:
            newhome_href = newhome_fweb[:-1] + "-pg" + str(cpage + 1) + "/"
            item['cpage'] = cpage + 1
            item['newhome_fweb'] = newhome_fweb
            item['newhome_href'] = newhome_href
            yield scrapy.Request(url=newhome_href, callback=self.newhome, meta=item, dont_filter=True)
def oldhome(self, response):
    """Parse a county's second-hand estate ranking page.

    Yields one HouseItem per estate, with last month's average unit price
    and the month-over-month change split into direction and magnitude.
    Context (province/city/county links) arrives via ``response.meta``.
    """
    sel = scrapy.Selector(response)
    # Values forwarded from self.county.
    meta = response.meta
    province = meta['province']
    city = meta['city']
    city_href = meta['city_href']
    county = meta['county']
    county_href = meta['county_href']
    oldhome_href = meta['oldhome_href']
    url = response.url

    item = HouseItem()  # container populated in this callback
    item['province'] = province
    item['city'] = city
    item['city_href'] = city_href
    item['county'] = county
    item['county_href'] = county_href
    item['oldhome_href'] = oldhome_href
    item['date_before'] = self.date_before
    item['building'] = '二手房'
    item['ProgramStarttime'] = self.ProgramStarttime

    # Table section holding the estate rows.
    detail_table = sel.xpath(".//div[@class='l-c']/div[@class='gary-detail pdd-5']/table[@class='ha_detail_table mt']")
    # The first link sometimes returns an incomplete page; re-fetch directly.
    # lxml HTML() nodes have no .extract(), so the branches stay separate.
    if detail_table == []:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        urls = requests.get(url, headers=headers).text
        html = HTML(urls)
        detail_table = html.xpath(".//div[@class='l-c']/div[@class='gary-detail pdd-5']/table[@class='ha_detail_table mt']")[0]
        # All estate rows of this county.
        detail = detail_table.xpath(".//tr[@height='25px;']")
        for d in detail:
            item['house'] = d.xpath(".//a[@class='c_blue']/text()")[0]  # estate name
            item['price'] = d.xpath(".//td[4]/span/text()")[0]          # last month's avg unit price
            rate = d.xpath(".//td[5]/span/text()")[0]                   # month-over-month change
            self._set_rate(item, rate)
            yield item
    else:
        # All estate rows of this county.
        detail = detail_table.xpath(".//tr[@height='25px;']")
        for d in detail:
            item['house'] = d.xpath(".//a[@class='c_blue']/text()").extract()[0]
            item['price'] = d.xpath(".//td[4]/span/text()").extract()[0]
            rate = d.xpath(".//td[5]/span/text()").extract()[0]
            self._set_rate(item, rate)
            yield item

def _set_rate(self, item, rate):
    """Split a month-over-month change string into direction and magnitude.

    '-1.2' -> ('下降', '1.2'); '+0.8' -> ('上升', '0.8'); '--' or an
    unsigned value -> (None, unchanged string). Shared by both branches of
    ``oldhome`` (previously duplicated inline).
    """
    if '--' not in rate:
        if rate[0] == '-':
            item['rate_m_unit'] = '下降'
            item['rate_m'] = rate[1:]
        elif rate[0] == '+':
            item['rate_m_unit'] = '上升'
            item['rate_m'] = rate[1:]
        else:
            item['rate_m_unit'] = None
            item['rate_m'] = rate
    else:
        item['rate_m_unit'] = None
        item['rate_m'] = rate