Esempio n. 1
0
def parse(site, website_timstamp, response):
	"""Parse a blog list page into a list of ArticleListItem objects.

	One item is built per ``div.entryContainer`` node. The title/link, head
	image, "author | created" span and description are looked up
	document-wide and matched to entries by position, so the page is
	expected to contain exactly one of each per entry.

	Args:
		site: Value stored under each item's ``'site'`` key.
		website_timstamp: Timestamp compared with each entry's creation
			time (as converted by ``Date.str_to_timestamp``). Parsing
			stops at the first entry whose time is not newer than this.
			The misspelled name is kept so existing callers still work.
		response: HTML markup of the list page, passed to ``etree.HTML``.

	Returns:
		The items collected so far, in page order. The entry that triggers
		the early stop is included in the returned list.

	Raises:
		IndexError: If a field query matched fewer nodes than there are
			entries, or the author/time text contains no ``'|'``.
		AttributeError: If a matched span or description has no text.
		KeyError: If a link has no ``href`` or an image has no ``src``.
	"""
	page = etree.HTML(response)
	entries = page.xpath('//div[@class="entryContainer"]')

	# Every field query starts with '//', i.e. it is an absolute path that
	# searches the whole document no matter which entry it is called on.
	# Evaluate each one once here instead of once per entry; the results are
	# then indexed by entry position exactly as before.
	links = page.xpath('//div[@id="entry-title"]//h1[@class="entryTitle"]//a')
	images = page.xpath('//div[@class="entryMeta"]//img')
	meta_spans = page.xpath('//div[@class="entryMeta"]//span')
	descriptions = page.xpath('//div[@class="blog_description"]')

	articles = []

	for i, _entry in enumerate(entries):
		print(i)
		_item = ArticleListItem()

		link = links[i]
		title = link.text
		url = link.attrib['href']
		headImg = images[i].attrib['src']
		# The span text has the form "<author> | <created>".
		author_time = meta_spans[i].text.split('|')
		author = author_time[0].strip()
		created = author_time[1].strip()
		abstract = descriptions[i].text.strip()

		_item['title'] = title
		_item['author'] = author
		_item['headImg'] = headImg
		_item['abstract'] = abstract
		_item['url'] = url
		_item['site'] = site
		_item['isContentDownload'] = False
		_item['created'] = created

		articles.append(_item)
		cur = Date.str_to_timestamp(created)

		# Compare the post time of the entry just scraped with the site's
		# latest post time recorded by the previous crawl; once an entry is
		# not newer, stop scanning the page.
		# NOTE(review): this presumes entries are listed newest first, and
		# the entry that triggers the stop is still returned -- confirm that
		# callers expect both.
		if website_timstamp >= cur:
			print(articles)
			print('=================================cur %s website_timestamp %s' % (
				Date.timestamp_to_str(cur), Date.timestamp_to_str(website_timstamp)))
			return articles

	print('========================================')
	print(articles)
	print('========================================')
	return articles
Esempio n. 2
0
File: db.py Progetto: luxudong/hello
	def get_site_timestamp(self, site):
		"""Return the 'updated' time of the first record stored for a site URL.

		Args:
			site: URL matched against the ``"url"`` field of the collection.

		Returns:
			``Date.str_to_timestamp`` applied to the ``'updated'`` field of
			the first matching record, or ``None`` when nothing matches.
		"""
		matches = self.collection.find({"url": site})
		for record in matches:
			# Only the first match is used; any further ones are ignored.
			return Date.str_to_timestamp(record['updated'])
		return None