def parseBlock(self, response):
    oneOut = items.BlockItem()
    try:
        oneOut['name'] = util.ExtractString(response, self.xpath['name'])
        oneOut['block'] = util.ExtractString(response, self.xpath['block'])
        oneOut['price'] = util.ExtractNumber(response, self.xpath['price'])
        oneOut['sellCounter'] = util.ExtractNumber(response, self.xpath['sellCounter'])
        oneOut['traded'] = util.ExtractNumber(response, self.xpath['traded'])
        oneOut['lookCounter'] = util.ExtractNumber(response, self.xpath['lookCounter'])
    except Exception as e:
        print(e)
    oneOut['crawlDate'] = util.today()
    return oneOut
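# --- Reference sketch (not part of the spider) -------------------------------
# The util helpers called throughout (ExtractString, ExtractNumber,
# String2Number, today, todayString) are defined elsewhere in the project and
# are not shown here. The functions below are only a guess at the behaviour the
# spiders appear to rely on; the real util.py may differ. The underscore names
# are hypothetical and are not used by the code above.
import datetime
import re


def _extract_string(selector, xpath):
    # Join every text node matched by the xpath and strip surrounding whitespace.
    return ''.join(selector.xpath(xpath).extract()).strip()


def _string_to_number(text):
    # Pull the first numeric token out of text such as '500万' or '82,000元/平米';
    # return nan when nothing numeric is found.
    match = re.search(r'[-+]?\d*\.?\d+', text.replace(',', ''))
    return float(match.group()) if match else float('nan')


def _extract_number(selector, xpath):
    return _string_to_number(_extract_string(selector, xpath))


def _today():
    # Midnight of the crawl day, used for the crawlDate fields.
    now = datetime.datetime.now()
    return datetime.datetime(now.year, now.month, now.day)


def _today_string():
    # Date suffix used when composing _id values, e.g. '20190824'.
    return datetime.date.today().strftime('%Y%m%d')
# ------------------------------------------------------------------------------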
def parse(self, response):
    self.received.add(response.url)

    # districts = self.parseDistricts(response)
    # realOut = set(districts) - self.received
    # for one in realOut:
    #     yield Request(one, meta={'step': 0})
    #
    # subDistricts = self.parseSubDistricts(response)
    # realOut = set(subDistricts) - self.received
    # for one in realOut:
    #     yield Request(one, meta={'step': 1, 'url': one})

    blockName = None
    if 'block' not in response.meta:
        # summary information for this residential block
        block = self.parseBlock(response)
        blockName = block['name']
        # yield block

        # all sibling listing pages for this block
        nextPage = self.nextPage(response, self.head, None)
        realOut = set(nextPage) - self.received
        for one in realOut:  # one request per result page
            yield Request(one, meta={'step': 2, 'block': block['name']})
    else:
        blockName = response.meta['block']

    ones = response.xpath(self.xpath['lists'])
    for one in ones:  # one entry per listing on the current page
        try:
            url = util.ExtractString(one, './/div[1]/div[1]/a/@href')
            housecode = util.ExtractString(one, './/div[1]/div[1]/a/@data-housecode')
            yield Request(url, meta={'step': 3, 'block': blockName, 'hc': housecode})
        except Exception as e:
            print(e)

    if 'step' in response.meta and response.meta['step'] == 3:
        oneOut = self.parseOne(response, blockName, response.meta['hc'])
        if len(oneOut['_id']):
            yield oneOut
def parseOne(self, one, index):
    oneOut = items.TopListItem()
    try:
        oneOut['index'] = index
        oneOut['name'] = util.ExtractString(one, './/ul/li[3]/a/@title')
        oneOut['href'] = util.ExtractString(one, './/ul/li[3]/a/@href')
        if len(oneOut['href']):
            index = oneOut['href'].find('id=')
            if index != -1:
                oneOut['_id'] = oneOut['href'][index + 3:] + '_zb'
        oneOut['kind'] = util.ExtractString(one, './/ul/li[4]/text()')
        oneOut['glgzhs'] = util.ExtractNumber(one, './/ul/li[5]/text()')
        oneOut['gshydl'] = util.ExtractString(one, './/ul/li[6]/text()')
        oneOut['czzs'] = util.ExtractNumber(one, './/ul/li[7]/text()')
        oneOut['aldzs'] = util.ExtractNumber(one, './/ul/li[8]/span/text()')
    except Exception as e:
        print(e)
        logging.warning("parseOne Exception %s" % (str(e)))
    return oneOut
def nextPagePlusOne(self, response, url):
    np = []
    nextPageText = ''.join(response.xpath(self.xpath['nextPageText']).extract()).strip()
    if nextPageText == '下一页':  # a "next page" link is present
        for one in response.xpath(self.xpath['nextPage']).extract():
            np.append(url + one)
    else:
        p = response.xpath(self.xpath['allPage'])
        # the framework already de-duplicates URLs, so no need to do it here
        for one in p:
            # the original URL carries a trailing '/', which otherwise breaks
            # matching, so append one here as well
            np.append(url + urllib.parse.quote(util.ExtractString(one, './/@href')) + '/')
    return np
def parse(self, response):
    self.received.add(response.url)
    print("receive data... " + response.url)

    one = items.DetailItem()
    # the id is embedded in the URL as a query parameter, e.g. ...?id=1234&...
    index = response.url.find('id=')
    if index != -1:
        index2 = response.url.find('&', index + 3)
        if index2 != -1:
            one['_id'] = response.url[index + 3:index2]
        else:
            one['_id'] = response.url[index + 3:]

    one['logo'] = util.ExtractString(response, '//*[@id="logo"]/@src')
    one['detail'] = util.ExtractString(
        response, '//*[@id="detail_page"]/div/div[2]/div[2]/div[5]/div[4]/text()'
    ).strip()

    array = response.xpath(
        '//*[@id="detail_page"]/div/div[2]/div[2]/div[5]/div[2]/ul/li')
    tmpArray = []
    for tmp in array:
        t = util.ExtractString(tmp, './/img/@src')
        tmpArray.append(t)
    one['imageList'] = tmpArray

    yield one
def parseOne(self, one, district, subDistrict):
    # sample of the item this method produces:
    # {'_id': '',
    #  'area': '',
    #  'attention': '',
    #  'community': '',
    #  'crawlDate': datetime.datetime(2019, 8, 24, 0, 0),
    #  'district': '朝阳',
    #  'level': ')',
    #  'src': 'lj',
    #  'subDistrict': '通州北苑',
    #  'title': '',
    #  'totalPrice': nan,
    #  'unitPrice': nan}
    oneOut = items.HouseItem()
    oneOut['src'] = self.src
    oneOut['district'] = district
    oneOut['subDistrict'] = subDistrict
    oneOut['title'] = ''.join(
        one.xpath('.//div[1]/div[1]/a/text()').extract()).strip()
    oneOut['_id'] = ''.join(
        one.xpath('.//div[1]/div[1]/a/@data-housecode').extract()).strip()
    try:
        unitPrice = util.String2Number(''.join(
            one.xpath('.//div[1]/div[6]/div[2]/span/text()').extract()).strip())
        if not np.isnan(unitPrice):
            oneOut['unitPrice'] = unitPrice
            oneOut['totalPrice'] = util.String2Number(''.join(
                one.xpath('.//div[1]/div[6]/div[1]/span/text()').extract()).strip())
        else:
            # some pages shift the price columns by one div, e.g.
            # https://sh.lianjia.com/ershoufang/changning/pg96/
            oneOut['unitPrice'] = util.String2Number(''.join(
                one.xpath('.//div[1]/div[7]/div[2]/span/text()').extract()).strip())
            oneOut['totalPrice'] = util.String2Number(''.join(
                one.xpath('.//div[1]/div[7]/div[1]/span/text()').extract()).strip())

        oneOut['community'] = ''.join(
            one.xpath('.//div[1]/div[2]/div/a/text()').extract())

        houseInfo = ''.join(
            one.xpath('.//div[1]/div[2]/div/text()').extract())
        houseInfo = houseInfo.split('|')
        if len(houseInfo) > 1:
            oneOut['houseType'] = houseInfo[1].strip()
        if len(houseInfo) > 2:
            oneOut['square'] = util.String2Number(houseInfo[2].strip())

        # '/html/body/div[4]/div[1]/ul/li[1]/div[1]/div[3]/div/a'
        oneOut['area'] = util.ExtractString(
            one, './/div[1]/div[3]/div/a/text()')

        positionInfo = ''.join(
            one.xpath('.//div[1]/div[3]/div/text()').extract())
        positionInfo = positionInfo.split(')')
        if len(positionInfo) > 0:
            oneOut['level'] = positionInfo[0].strip() + ')'
        if len(positionInfo) > 1:
            oneOut['structure'] = positionInfo[1].strip()

        followInfo = ''.join(
            one.xpath('.//div[1]/div[4]/text()').extract())
        followInfo = followInfo.split('/')
        if len(followInfo) > 0:
            oneOut['attention'] = followInfo[0].strip()
        if len(followInfo) > 1:
            oneOut['follow'] = followInfo[1].strip()
        if len(followInfo) > 2:
            oneOut['release'] = followInfo[2].strip()

        oneOut['crawlDate'] = util.today()
    except Exception as e:
        print(e)
        logging.warning("parseOne Exception %s" % (str(e)))
    return oneOut
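# --- Reference sketch (not part of the spider) -------------------------------
# items.HouseItem is defined in the project's items.py and is not shown here.
# Judging from the fields populated above and the sample dict in the comment,
# its definition is assumed to look roughly like the class below (field names
# only; the real item may carry more fields).
import scrapy


class HouseItemSketch(scrapy.Item):
    _id = scrapy.Field()          # listing code (data-housecode), the primary key
    src = scrapy.Field()          # data source tag, e.g. 'lj'
    district = scrapy.Field()
    subDistrict = scrapy.Field()
    community = scrapy.Field()
    title = scrapy.Field()
    houseType = scrapy.Field()    # layout, e.g. '2室1厅'
    square = scrapy.Field()       # floor area
    area = scrapy.Field()
    level = scrapy.Field()
    structure = scrapy.Field()
    unitPrice = scrapy.Field()
    totalPrice = scrapy.Field()
    attention = scrapy.Field()
    follow = scrapy.Field()
    release = scrapy.Field()
    crawlDate = scrapy.Field()
# ------------------------------------------------------------------------------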
def parseOne(self, one, block, housecode):
    oneOut = items.HouseItem2()
    oneOut['src'] = self.src
    try:
        # /html/body/div[3]/div/div/div[1]/h1
        oneOut['title'] = util.ExtractString(one, '/html/body/div[3]/div/div/div[1]/h1/text()')
        # _id is the Lianjia house code plus the crawl date
        oneOut['_id'] = housecode  # util.ExtractString(one, '/html/body/div[5]/div[2]/div[6]/div[4]/span[2]')
        # houseID keeps the raw Lianjia house code
        oneOut['houseID'] = oneOut['_id']
        oneOut['_id'] += '_' + util.todayString()
        oneOut['unitPrice'] = util.ExtractNumber(one, '/html/body/div[5]/div[2]/div[4]/div[1]/div[1]/span')
        oneOut['totalPrice'] = util.ExtractNumber(one, '/html/body/div[5]/div[2]/div[4]/span[1]')
        if len(block):
            oneOut['community'] = block
        else:
            oneOut['community'] = util.ExtractString(one, '/html/body/div[5]/div[2]/div[6]/div[1]/a[1]/text()')
        oneOut['houseType'] = util.ExtractString(one, '/html/body/div[5]/div[2]/div[5]/div[1]/div[1]/text()')
        oneOut['square'] = util.ExtractNumber(one, '/html/body/div[5]/div[2]/div[5]/div[3]/div[1]')
        oneOut['level'] = util.ExtractString(one, '/html/body/div[5]/div[2]/div[5]/div[1]/div[2]/text()')
        oneOut['structure'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[1]/div[2]/ul/li[1]/text()')
        oneOut['thb'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[1]/div[2]/ul/li[10]/text()')
        oneOut['lx'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[1]/div[2]/ul/li[6]/text()')
        oneOut['heating'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[1]/div[2]/ul/li[11]/text()')
        oneOut['property'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[1]/div[2]/ul/li[13]/text()')
        oneOut['attention'] = util.ExtractNumber(one, '//*[@id="favCount"]')
        oneOut['follow'] = util.ExtractNumber(one, '//*[@id="cartCount"]')
        oneOut['release'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[1]/span[2]/text()')
        oneOut['lastTrade'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[3]/span[2]/text()')
        oneOut['years'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[5]/span[2]/text()')
        oneOut['mortgage'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[7]/span[2]/text()').strip()
        # /html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[6]/span[2]
        oneOut['ownership'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[2]/span[2]/text()')
        oneOut['use'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[4]/span[2]/text()')
        oneOut['propertyRight'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[6]/span[2]/text()')
        oneOut['book'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[8]/span[2]/text()')
        oneOut['crawlDate'] = util.today()
    except Exception as e:
        print(e)
        logging.warning("parseOne Exception %s" % (str(e)))
    return oneOut