コード例 #1
0
ファイル: test.py プロジェクト: anantag/craigslist-scrapper
 def parse(self, response):
     """Print the anchor text and link of every ``<span class="pl">`` element.

     Wraps ``response`` in an ``HtmlXPathSelector``, selects each
     ``span`` whose class is ``pl`` and prints the extracted ``a/text()``
     and ``a/@href`` lists for it. Nothing is returned.
     """
     hxs = HtmlXPathSelector(response)
     # HtmlXPathSelector exposes ``select()``; it has no ``selector()`` method.
     rows = hxs.select("//span[@class='pl']")
     # A distinct loop name keeps the result list from being rebound mid-loop.
     for row in rows:
         title = row.select("a/text()").extract()
         link = row.select("a/@href").extract()
         # Same output as the Python 2 statement ``print title,link`` but
         # also valid syntax on Python 3.
         print("%s %s" % (title, link))
コード例 #2
0
ファイル: Spider.py プロジェクト: jiangyuan168/mycode
	def parse_item(self,response):
		"""Build one ``MovieItem`` from a movie detail page and yield it.

		Text fields (name, director, writer, roles, description) are read with
		XPath queries against ``response``; language, release date and runtime
		are pulled out of the raw ``#info`` markup with regular expressions.
		Fields that cannot be found are set to the empty string.
		"""
		def _clean(text):
			# Strip surrounding whitespace, turn ',' and ':' into ';' and
			# backslash-escape single and double quotes.
			return text.strip().replace(',',';').replace('\'','\\\'').replace('\"','\\\"').replace(':',';')

		s=HtmlXPathSelector(response)
		# HtmlXPathSelector exposes ``select()``; it has no ``selector()`` method.
		movie_name = s.select('//*[@id="content"]/h1/span[1]/text()').extract()
		movie_director = s.select('//*[@id="info"]/span[1]/span[2]/a/text()').extract()
		movie_writer = s.select('//*[@id="info"]/span[2]/span[2]/a/text()').extract()

		# The summary text was previously assigned to a misspelled name, which
		# left ``movie_description`` permanently empty.
		movie_description = []
		for movie_description_path in s.select('//*[@id="link-report"]'):
			movie_description = movie_description_path.select('.//*[@property="v:summary"]/text()').extract()

		movie_roles = []
		for movie_roles_path in s.select('//*[@id="info"]/span[3]/span[2]'):
			movie_roles = movie_roles_path.select('.//*[@rel="v:starring"]/text()').extract()

		movie_detail = s.select('//*[@id="info"]').extract()

		item=MovieItem()
		item['movie_name'] = _clean(''.join(movie_name))
		item['movie_director'] = _clean(movie_director[0]) if movie_director else ''
		item['movie_description'] = _clean(movie_description[0]) if movie_description else ''
		item['movie_writer'] = _clean(';'.join(movie_writer))
		item['movie_roles'] = _clean(';'.join(movie_roles))

		movie_detail_str = ''.join(movie_detail).strip()
		# Raw literals keep the regex escapes intact; the pattern text is
		# unchanged. NOTE: ``.decode`` on a str literal is Python 2 only.
		movie_language_str = r'.*语言:</span> (.+?)<br><span.*'.decode("utf8")
		movie_date_str = r'.*上映日期:</span> <span property="v:initialReleaseDate" content="(\S+?)">(\S+?)</span>.*'.decode("utf8")
		movie_long_str = r'.*片长:</span> <span property="v:runtime" content="(\d+).*'.decode("utf8")
		pattern_language = re.compile(movie_language_str,re.S)
		pattern_date = re.compile(movie_date_str,re.S)
		# Previously ``pattern_date`` was compiled twice and ``pattern_long``
		# was never defined, so the runtime lookup raised NameError.
		pattern_long = re.compile(movie_long_str,re.S)

		movie_language = pattern_language.search(movie_detail_str)
		movie_date = pattern_date.search(movie_detail_str)
		movie_long = pattern_long.search(movie_detail_str)

		item['movie_language'] = _clean(movie_language.group(1)) if movie_language else ""
		# group(1) is the ``content`` attribute value, not the displayed text.
		item['movie_date'] = _clean(movie_date.group(1)) if movie_date else ""
		item['movie_long'] = movie_long.group(1) if movie_long else ""
		yield item