import logging
import pickle
import urllib

import feedparser
from google.appengine.api import taskqueue, users
from django.core.urlresolvers import reverse
from readability.readability import Document  # typo fix: was "redability"

from models import Feed, ReadyData  # assumed app-local models module


def fetch_pages(request):
    """Fetch the given pages, extract their readable content and store it."""
    user_email = request.POST.get('user_email', None)
    if not user_email:
        # Validate before constructing the User: users.User() raises when
        # the email is missing, so a post-construction None check is dead code.
        logging.error('User not found: %s', user_email)
        raise TypeError('User not found')
    user = users.User(email=user_email)

    page_urls_p = request.POST.get('page_urls', None)
    if not page_urls_p:
        logging.error('Page urls not given')
        raise TypeError('Page urls not given')
    page_urls = pickle.loads(str(page_urls_p))
    logging.debug('fetching pages: %s;;%s', user_email, page_urls)

    pages = []
    for url in page_urls:
        # Read at most 10 KiB per page to keep the fetched payload bounded.
        html = urllib.urlopen(url).read(10240)
        doc = Document(html)
        pages.append(doc.summary().encode('ascii', 'ignore'))

    rd = ReadyData(owner=user, data_type='page')
    rd.content = '<hr>'.join(pages)
    rd.merged = len(pages)
    rd.put()
    logging.debug('ReadyData for fetched pages created: %s', page_urls)

    # Schedule a task to send the fetched data.
    params = {'ready_data_key': rd.key()}
    taskqueue.add(url=reverse('fetcher-send'), params=params)
    logging.debug('task created')
    return True
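# Illustrative only: a minimal sketch of how a caller might enqueue
# fetch_pages as a task. The 'fetcher-pages' URL name is an assumption
# (it is not defined in this module); the page URLs must be pickled to
# survive the POST round-trip, matching the pickle.loads() call above.
def enqueue_fetch_pages(user_email, page_urls):
    params = {
        'user_email': user_email,
        'page_urls': pickle.dumps([str(u) for u in page_urls]),
    }
    taskqueue.add(url=reverse('fetcher-pages'), params=params)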
def fetch_feeds(request):
    """Fetch a feed and merge its entries into the owner's ReadyData."""
    feed_key = request.POST.get('feed_key', None)
    if feed_key is None:
        logging.error('missing parameter')
        raise TypeError('missing parameter')
    feed = Feed.get(feed_key)
    if feed is None:
        logging.error('Feed object not found: %s', feed_key)
        raise TypeError('Feed object not found')

    parser = feedparser.parse(feed.url)
    # Check that the feed parsed cleanly. feedparser reports problems on
    # the parse result, not on our Feed entity (the original checked the
    # wrong object).
    if parser.bozo:
        feed.is_valid = False
        logging.warn('Invalid feed: %s;;%s', feed_key, feed.url)
        feed.put()
        return False

    # Set the feed title if it does not exist yet.
    if not feed.title:
        feed.title = parser.feed.title

    rd = ReadyData.gql("WHERE data_type = :1 AND owner = :2 LIMIT 1",
                       'feed', feed.owner).get()
    if rd is None:
        rd = ReadyData(owner=feed.owner, data_type='feed')
        rd.content = ''
        rd.merged = 0  # guard against an unset default before += below

    for e in parser['entries']:
        # TODO - check the date
        article = '<h1>%(title)s</h1>' % e
        # Not every entry carries a 'content' list, so fall back to empty.
        for content in e.get('content', []):
            article += content['value']
        rd.content += article
        rd.merged += 1

    rd.put()
    feed.put()

    # Schedule a task to send the merged data.
    params = {'ready_data_key': rd.key()}
    taskqueue.add(url=reverse('fetcher-send'), params=params)
    return True
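# Illustrative only: the Feed and ReadyData models imported above are not
# defined in this module. A minimal models.py sketch, with every field type
# inferred from how this code uses the entities (an assumption, not the
# actual schema):
#
#     from google.appengine.ext import db
#
#     class Feed(db.Model):
#         owner = db.UserProperty()
#         url = db.LinkProperty()
#         title = db.StringProperty()
#         is_valid = db.BooleanProperty(default=True)
#
#     class ReadyData(db.Model):
#         owner = db.UserProperty()
#         data_type = db.StringProperty()  # 'page' or 'feed'
#         content = db.TextProperty()
#         merged = db.IntegerProperty(default=0)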