def parse_item(self, response):
    """Parse a Douban group topic page into a DoubanItem.

    Extracts url/title/author/content/images/create time from the topic
    page, infers the lease type (整租/合租) from the title or body text,
    normalizes the update time (handling year rollover), and tags the
    item with the city of the originating list URL.
    """
    selector = Selector(response)
    item = DoubanItem()
    item["url"] = response.url
    item["title"] = response.meta.get('title')
    item["author"] = selector.xpath('//h3/span[1]/a/text()').extract_first()
    item["content"] = selector.xpath(
        '//div[@class="topic-richtext"]').extract_first()
    item["image"] = selector.xpath(
        '//div[@class="topic-richtext"]//img/@src').extract()
    item["create_time"] = selector.xpath(
        '//h3/span[2]/text()').extract_first()
    item["text"] = selector.xpath('//div[@class="topic-richtext"]').xpath(
        "normalize-space(.)").extract_first()

    # Infer lease type: title first, then fall back to body text.
    # Guard against None (extract_first / meta.get can both return None;
    # the original raised TypeError on `"整租" in None`).
    lease = None
    for candidate in (item["title"], item["text"]):
        if not candidate:
            continue
        if "整租" in candidate:
            lease = "整租"
            break
        if "合租" in candidate:
            lease = "合租"
            break
    item["lease"] = lease

    # create_time looks like "YYYY-MM-..." and meta 'update_time' like
    # "MM-DD...". If the creation month is later than the update month,
    # the update happened after a year rollover, so bump the year.
    # BUGFIX: the original compared month *strings* lexicographically
    # ("9" > "10" is True for strings) — compare as integers.
    create_parts = item["create_time"].split('-')
    create_year = int(create_parts[0])
    create_month = int(create_parts[1])
    update_month = int(response.meta.get('update_time').split('-')[0])
    year = create_year + 1 if create_month > update_month else create_year
    item['update_time'], item['update_timestamp'] = time_standard(
        str(year) + '-' + response.meta.get('update_time'))

    item["replay_num"] = response.meta.get('replay_num')

    # Map the originating list URL back to its city name.
    for city_name, url_list in city_url.items():
        if response.meta.get('list_url') in url_list:
            item['city'] = city_name
            break
    yield item
def parse_item(self, response):
    """Parse a Beike (贝壳) rental detail page into a ScrapyJojozuItem.

    Scrapes title, lease/house type, images, pricing rows, amenities and
    description; geocodes the address via ``self.a``; normalizes the
    update time. The body is serialized with ``self.lock`` (presumably
    because the geocoder helper is not thread-safe — confirm against the
    helper). On error a traceback is printed and the partially filled
    item is still yielded, matching the original best-effort behavior.
    """
    # BUGFIX: create the item and take the lock *before* the try block.
    # In the original, if acquire() (or item construction) raised, the
    # finally clause released an unheld lock and the trailing
    # `yield item` hit an unbound name.
    item = ScrapyJojozuItem()
    self.lock.acquire()
    try:
        item["title"] = response.xpath(
            '//p[@class="content__title"]/text()').extract_first()
        item['lease'] = response.xpath(
            '//ul[@class="content__aside__list"]/li[1]/text()'
        ).extract_first(default="")
        item['type'] = response.xpath(
            '//ul[@class="content__aside__list"]/li[2]/text()'
        ).extract_first().split(' ')[0]
        image_list = response.xpath(
            '//ul[@class="content__article__slide--small content__article__slide_dot"]//img/@src'
        ).extract()
        # Force http so downstream image handling is uniform.
        item["image"] = [i.replace("https", "http") for i in image_list]
        item['payment_method'] = response.xpath(
            '//ul[@class="table_row"]//li[1]/text()').extract_first()
        item['cost'] = int(
            response.xpath(
                '//ul[@class="table_row"]//li[2]/text()').extract_first())
        item['cash_pledge'] = response.xpath(
            '//ul[@class="table_row"]//li[3]/text()').extract_first()
        # Title looks like "district·community ..." — the part after the
        # last '·' (before the first space) is the geocodable address.
        address = item["title"].split(' ')[0].split('·')[-1]
        # NOTE: city should really be passed in via meta from the list
        # page (original comment: "city 需根据列表页传meta进来"); for now
        # it is inferred from the URL path.
        path = urlparse(response.url)[2]
        if 'SZ' in path:
            item['city'] = '深圳'
        elif 'GZ' in path:
            item['city'] = '广州'
        elif 'SH' in path:
            item['city'] = '上海'
        elif 'BJ' in path:
            item['city'] = '北京'
        location = self.a.get_geocoder(address, item['city'])
        item['area'] = self.a.get_area(location, item['city'])
        item['location'] = self.a.get_place(location, item['city'])
        item['had_agent'] = 1
        item['service_charge'] = response.xpath(
            '//ul[@class="table_row"]//li[4]/text()').extract_first()
        item['agent_cost'] = response.xpath(
            '//ul[@class="table_row"]//li[5]/text()').extract_first()
        item['support'] = [
            i.strip() for i in response.xpath(
                '//ul[@class="content__article__info2"]/li[@class="fl oneline "]/text()'
            ).extract() if i.strip()
        ]
        item['description'] = ''.join(
            response.xpath(
                '//div[@class="content__article__info"]/ul[1]//text()'
            ).extract()).replace(" ", "").replace(" ", "").replace("\n\n", "\n")
        # Raw string for the date regex (original relied on "\d" not
        # being a recognized string escape).
        item['update_time'], item['update_timestamp'] = time_standard(
            response.xpath('//div[@class="content__subtitle"]').re(
                r"\d+-\d+-\d+")[0])
        item['url'] = response.url
        item['source'] = "贝壳"
    except Exception:
        traceback.print_exc()
    finally:
        self.lock.release()
    yield item
def item_parse(self, response):
    """Build a ScrapyJojozuItem from a Fangtianxia (房天下) rental page."""
    print("inter item_parse")
    item = ScrapyJojozuItem()

    # Title: strip whitespace junk from the raw heading text.
    raw_title = response.xpath('//div[@class="title"]/text()').extract_first()
    for junk in (" ", " ", "\n", "\r"):
        raw_title = raw_title.replace(junk, "")
    item["title"] = raw_title

    # The two "tt" cells hold lease mode (租赁方式) and layout (户型).
    tt_cells = response.xpath('//div[@class="tt"]/text()').extract()
    item["lease"] = tt_cells[0]
    item["type"] = tt_cells[1]

    # Image URLs (ArrayList); protocol-relative ones get an http: prefix.
    images = []
    for src in response.xpath(
            '//div[@class="cont-sty1 clearfix"]//img/@src').extract():
        images.append(src if 'http:' in src else 'http:' + src)
    item["image"] = images

    # Payment method (付款方式) and deposit (押金) come from the same
    # stripped price-row text; monthly rent (月租金) from its <i> child.
    price_row = response.xpath(
        '//div[@class="trl-item sty1"]/text()').extract_first()
    price_row = price_row.replace('元/月', "").replace('(', "").replace(')', "")
    item["payment_method"] = price_row
    item["cost"] = int(response.xpath(
        '//div[@class="trl-item sty1"]/i/text()').extract_first())
    item["cash_pledge"] = price_row

    # Region (区域) is the first link in the rcont block.
    rcont_links = response.xpath('//div[@class="rcont"]/a/text()')
    item["area"] = rcont_links.extract()[0]

    item["had_agent"] = 1          # listed via an agent
    item["service_charge"] = "服务费未知"
    item["agent_cost"] = "中介费未知"

    # Nearest subway station (最近地铁站): prefer the "线…站" fragment,
    # otherwise fall back to the first link text.
    stations = rcont_links.re('线(.*?)站')
    item["location"] = stations[0] if stations else rcont_links.extract_first()

    # Facilities (设施) are embedded in an inline JS variable.
    item["support"] = re.search("var peitao = '(.*?)';",
                                response.text).group(1)

    item["description"] = response.xpath(
        '//li[@class="font14 fyld"]/div[@class="fyms_con floatl gray3"]'
    ).xpath('string(.)').extract_first()

    # Update time and timestamp (更新时间以及更新时间戳).
    stamp_text = response.xpath(
        '//div[@class="gray9 fybh-zf"]/span[2]/text()').extract_first()
    item['update_time'], item['update_timestamp'] = time_standard(
        stamp_text.replace("更新时间", "").replace(" ", ""))

    item["url"] = response.url
    item["source"] = "房天下"      # source channel (来源渠道)

    # City is encoded in the listing URL's subdomain fragment.
    for marker, city in (("sz.zu", '深圳'), ("gz.zu", '广州'),
                         ("sh.zu", '上海'), ("bj.zu", '北京')):
        if marker in response.url:
            item['city'] = city
            break
    yield item
def parse_item(self, response):
    """Parse an Anjuke (安居客) rental detail page into a ScrapyJojozuItem.

    Anjuke obfuscates digits with a per-page web font: the page embeds a
    base64 ttf/woff whose cmap maps HTML entities to glyph names such as
    "glyph00003". This method decodes that font, builds an
    {entity: digit} table, rewrites the HTML with real digits, and only
    then scrapes the fields with BeautifulSoup. The whole body runs
    under self.lock — presumably because the on-disk font files
    ('anjuke.ttf'/'anjuke.xml') are shared mutable state; confirm.
    On any error a traceback is printed and the (possibly partial or
    unbound) item is yielded.
    """
    # 302 responses carry no usable page content — bail out early.
    if response.status == 302:
        return
    try:
        print('开始加锁')
        self.lock.acquire()
        # Base64-embedded @font-face source, different on every request.
        font_src = re.search("src:url\('(.*?)'\)", response.text).group(1)
        font_face = font_src.split("base64,")
        # Font file generation: each page returns a different font file,
        # so it must be regenerated continuously.
        if 'ttf' in font_face[0] or 'woff' in font_face[0]:
            b = base64.b64decode(font_face[1])
            with open('anjuke.ttf', 'wb') as f:
                f.write(b)
            font = TTFont('anjuke.ttf')
            font.saveXML('anjuke.xml')
        # If there is a cmap we can get the unicode codes used to replace
        # digits, then swap them back in the HTML via string replacement.
        cmap = font['cmap'].getBestCmap()
        mapdict = {}
        for i in cmap:
            pat = re.compile(r'(\d+)')
            # Glyph names appear to be off by one from the real digit
            # (hence the -1) — TODO confirm against a live font dump.
            values = int(re.search(pat, cmap[i])[1]) - 1
            keys = hex(i)
            # Rebuild the HTML numeric character reference, e.g. "&#xe123;".
            new_keys = '&#x' + keys[2:] + ';'
            mapdict[new_keys] = values
        print(mapdict)
        # Rewrite every obfuscated entity in the raw HTML with its digit.
        right_html = response.text
        for k, v in mapdict.items():
            right_html = right_html.replace(k, str(v))
        soup = BeautifulSoup(right_html, 'lxml')
        item = ScrapyJojozuItem()
        item['title'] = soup.find('h3', attrs={
            'class': 'house-title'
        }).text.replace('\n', "")
        # Second info-tag span holds the layout/type.
        item['type'] = \
            soup.find('div', attrs={'class': 'title-basic-info'}).find_all('span', attrs={'class': 'info-tag'})[
                1].get_text().replace('\n', "").replace(' ', "")
        item['lease'] = soup.find('div', attrs={
            'class': 'title-basic-info'
        }).find('li', attrs={
            'class': 'rent'
        }).text
        # Gallery images are lazy-loaded, so the URL lives in data-src.
        item['image'] = [
            i.get('data-src') for i in soup.find(
                'div', id="room_pic_wrap", attrs={
                    'class': 'switch_list'
                }).find_all('img')
        ]
        item['payment_method'] = soup.find('li', attrs={
            'class': 'full-line'
        }).find('span', attrs={
            'class': 'type'
        }).text
        # Price text ends with a 3-char unit suffix that is sliced off.
        item['cost'] = int(
            soup.find('li', attrs={
                'class': 'full-line'
            }).find('span', attrs={
                'class': 'price'
            }).text[:-3])
        # item['cash_pledge'] = soup.find('li', attrs={'class':'full-line'}).find('span',attrs={'class':'price'}).text
        item['cash_pledge'] = item['payment_method']
        try:
            # First target-bearing link is the district; missing on some
            # pages, in which case the URL is logged and area left unset.
            item['area'] = soup.find_all('a',
                                         class_='link',
                                         attrs={'target': True})[0].text
        except:
            print(response.url)
        item['had_agent'] = 1
        item['service_charge'] = "服务费未知"
        item['agent_cost'] = "中介费未知"
        # Remaining target-bearing links form the location breadcrumb.
        item['location'] = ",".join([
            i.text for i in soup.find_all(
                'a', class_='link', attrs={'target': True})[1:]
        ])
        # Amenities: only the "has" (available) peitao items.
        item['support'] = [
            i.find('div').text for i in soup.find_all(
                'li', attrs={'class': re.compile('peitao-item(.*)has')})
        ]
        item['description'] = soup.find('div', attrs={
            'class': 'auto-general'
        }).text
        item['update_time'], item['update_timestamp'] = time_standard(
            soup.find('div', attrs={
                'class': "right-info"
            }).find('b').text)
        item['url'] = response.url
        item['source'] = "安居客"
        # City inferred from the URL's network location (subdomain).
        if 'sz' in urlparse(response.url)[1]:
            item['city'] = '深圳'
        elif 'gz' in urlparse(response.url)[1]:
            item['city'] = '广州'
        elif 'sh' in urlparse(response.url)[1]:
            item['city'] = '上海'
        elif 'bj' in urlparse(response.url)[1]:
            item['city'] = '北京'
    except Exception as e:
        traceback.print_exc()
    finally:
        print("释放锁")
        self.lock.release()
    yield item