Example #1
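	# Fetch the source's feed JSON, store each unseen entry as a RawData row
	# tagged 'new', and hand everything still tagged 'new' to FacebookAgent.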
	def load_feed(self, store_data=True):
		working_url = self._get_working_url('feed')
		#print 'loading feed from: %s' % working_url
		# try:
		f = urllib2.urlopen(working_url)
		#print f.info()
		data = f.read()
		# print data
		# try to decode the json string
		json_obj = json.loads(data)
		# print '\n\n--------------------\n%s' % json_obj['data'][0]
		
		if store_data:
			# store the data in the raw_data model
			new_tag = DataTag.objects.get(name='new')
			new_datas = []

			unprocessed_datas = RawData.objects.filter(source=self.source_node, tags__in=[new_tag])
			if unprocessed_datas:
				new_datas.extend(unprocessed_datas)

			for data in json_obj['data']:
				fblinktype = data.get('type',None)
				new_data = RawData()
				title = data.get('name',data['id'])
				created_at = data.get('created_time',str(datetime.datetime.now()))

				if fblinktype:
					if fblinktype == "link" and data.get('description',None):
						title = data.get('description')
					elif data.get('message',None):
						title = data.get('message')

				if len(title) > 100:
					title = title[:100] + "..."

				new_data.title = title
				new_data.data_id = data['id']
				new_data.source = self.data_src
				new_data.data = json.dumps(data)
				new_data.link = data.get('link',None)

				# try and parse the date
				try:
					dt = parser.parse(created_at)
				except ValueError:
					dt = datetime.datetime.now()

				new_data.occurred_at = dt
				
				# make sure that the raw data does not exist
				if not new_data.exists():
					new_data.save()
					new_data.tags.add(new_tag)
					new_data.save()
					new_datas.append(new_data)

			if new_datas:
				fba = FacebookAgent()
				fba.search(raw_data_set=new_datas)
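
Example #1 (and Example #3 below) call new_data.exists() before saving, which is not a stock Django model method, so RawData presumably defines its own duplicate check. A minimal sketch of such a helper, assuming the dedup key is the data_id/source pair these snippets set on every row (the implementation itself is hypothetical, not shown in the examples):

	# hypothetical method on the RawData model
	def exists(self):
		# treat the row as a duplicate if another RawData already stores the
		# same external id from the same source
		return RawData.objects.filter(data_id=self.data_id, source=self.source).exists()
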
Example #2
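	# Scrape article links from the configured page via CSS selectors, fetch each
	# article's content, store unseen articles as RawData rows tagged 'new', and
	# pass them to BasicAgent.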
	def load(self, store_data=True):
		if not self.article_css_selector or not self.article_link_selector:
			# print "No CSS selector information supplied, cannot load"
			return None

		article_links = SiteLinkLoader.get_elements_by_css(self.url, self.article_link_selector)
		new_tag = DataTag.objects.get(name='new')
		# get all the data objects already tagged as new
		new_datas = []
		unprocessed_datas = RawData.objects.filter(source=self.source_node, tags__in=[new_tag])
		if unprocessed_datas:
			new_datas.extend(unprocessed_datas)

		if article_links:
			for article_link in article_links:
				article_title = strip_tags(article_link.html())
				article_url = article_link.href
				if article_url.startswith('/'):
					article_url = article_url[1:] # get rid of the starting slash
				# relative links get the site hostname prepended; links that are
				# already absolute are fetched as-is
				if not article_url.lower().startswith('http://') and not article_url.lower().startswith(self.hostname):
					article_url = 'http://%s/%s' % (self.hostname, article_url)

				article_content = ""
				article_pieces = SiteLinkLoader.get_elements_by_css(article_url, self.article_css_selector)
				for article_piece in article_pieces:
					article_content += strip_tags(article_piece.html())

				# create a new raw data if none exists
				similar_raw_datas = RawData.objects.filter(title=article_title)
				if not similar_raw_datas:
					# print " + Saving article: %s" % article_title
					new_data = RawData()
					new_data.title = article_title
					new_data.data = article_content
					new_data.link = article_url
					new_data.source = self.source_node
					new_data.data_id = article_url
					if store_data:
						# save it
						new_data.save()
						new_data.tags.add(new_tag)
						new_data.save()
					new_datas.append(new_data)
			if new_datas:
				ba = BasicAgent()
				ba.search(raw_data_set=new_datas)
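
Example #2 relies on SiteLinkLoader.get_elements_by_css(url, selector), which is not shown here. Example #3 below does the equivalent work inline with urllib and leaf (fetch the page, parse it, run a CSS selector over it), so a hypothetical sketch of that helper along the same lines could be:

	import urllib
	import leaf

	class SiteLinkLoader(object):
		@staticmethod
		def get_elements_by_css(url, css_selector):
			# hypothetical helper: download the page and return the elements that
			# match the CSS selector, the same pattern Example #3 uses inline
			html = urllib.urlopen(url).read()
			doc = leaf.parse(html)
			return doc(css_selector)
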
Example #3
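	# Walk every configured Google Reader data source, pull unread items from each
	# subscribed feed, extract article text via a CSS selector, and store unseen
	# items as RawData tagged 'new' (optionally running GoogleReaderAgent on them).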
	def load(self, store_data=True, date_limit=None, run_agent=False):
		for data_src in self.data_sources:
			print "Loading data from: %s" % data_src

			# init variables from the data source
			url = data_src.src_id
			source_node = data_src
			parameters = data_src.get_parameters()
			username = parameters.get('username','*****@*****.**')
			psw = parameters.get('password','choirpassword')
			article_css_selector = parameters.get('article-css-selector','')
			fetch_limit = parameters.get('fetch-limit',None)

			auth = ClientAuthMethod(username,psw)
			
			reader = GoogleReader(auth)
			# initialized before the subscription check so the return at the bottom
			# of the loop body is defined even when the subscription list cannot be built
			new_tag = DataTag.objects.get(name='new')
			new_datas = []
			fetch_count = 0

			if reader.buildSubscriptionList():
				feeds = reader.getSubscriptionList()

				# loop through the subscribed feeds and store any items we do not
				# already have RawData for
				for feed in feeds:
					if not fetch_limit:
						fetch_limit = feed.unread
					read_items = []
					print "Reading " + feed.title + " (%s unread)" % feed.unread
					print "===================================================="
					print
					print "Loading items"
					print
					feed.loadItems()
					print "Loaded %s items" % (len(feed.items),)
					print
					index = 0
					for item in feed.items:
						# make sure it doesn't already exist
						title = item.title
						url = item.url
						index+=1

						# when we are near the end of the loaded items and still under
						# the fetch limit, ask the reader for another page of items
						if index + 1 >= len(feed.items) and fetch_count < fetch_limit:
							print "Loading more items...."
							print
							feed.loadMoreItems()

						f = urllib.urlopen(url)
						html = f.read()
						doc = leaf.parse(html)
						elements = doc(article_css_selector)
						for element in elements:
							# print
							article_html = element.html()
							new_data = RawData()
							new_data.title = title
							new_data.source = source_node
							new_data.data = strip_tags(article_html)
							new_data.data_id = item.id
							new_data.link = item.url

							try:
								new_data.occurred_at = datetime.datetime.fromtimestamp(feed.lastUpdated)
							except ValueError:
								# print "Error, could not parse timestamp: %s" % feed.lastUpdated
								new_data.occurred_at = datetime.datetime.now()

							# patching in date limit thing Parris wanted --------------------------
							# if date_limit is None:
							#	date_limit = datetime.date.today() - datetime.timedelta(weeks=1)
							#
							# if new_data.occurred_at < date_limit:
							# 	# we should skip this item .... it is too old
							# 	continue
							#
							# end patch -----------------------------------------------------------
							# Abandoning this idea for now ... I think it's best to patch the map
							# view and not mess with this for now

							# if it is not new... save it
							if not new_data.exists():
								print " + Saving article: %s" % new_data.title
								new_data.save()
								new_data.tags.add(new_tag)
								new_datas.append(new_data)
								fetch_count +=1

							read_items.append(item)


					# print "All done.\n %s items fetched, our limit is %s. There are %s items in the feed. We stopped at index %s" % (fetch_count, fetch_limit, len(feed.items), index)

			if new_datas and run_agent:
				gra = GoogleReaderAgent()
				gra.search(raw_data_set=new_datas)
			# note that returning here, inside the for loop, means only the first
			# configured data source is processed per call
			return new_datas
		return None
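
Example #3 reads like a method on a loader class whose constructor sets self.data_sources; that class is not shown. A hypothetical driver, assuming a GoogleReaderLoader wrapper with that shape (the class name and its constructor are assumptions, not part of the example):

	# hypothetical usage; GoogleReaderLoader is an assumed wrapper class
	loader = GoogleReaderLoader()
	new_items = loader.load(store_data=True, run_agent=True)
	if new_items:
		print "stored %s new raw data items" % len(new_items)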