def test_crawl_error():
    # broken feed
    my_opener = urllib2.build_opener(TestHTTPHandler)
    urllib2.install_opener(my_opener)
    feeds = ['http://brokenrss.com/rss']
    generator = crawl(feeds, 2)
    with raises(CrawlError):
        next(iter(generator))
    # unreachable url
    feeds = ['http://not-exists.com/rss']
    generator = crawl(feeds, 2)
    with raises(CrawlError):
        next(iter(generator))

def add_feed(category_id):
    cursor = Cursor(category_id)
    url = request.form['url']
    try:
        f = urllib.request.urlopen(url)
        document = f.read()
        f.close()
    except Exception:
        r = jsonify(
            error='unreachable-url',
            message='Cannot connect to given url'
        )
        r.status_code = 400
        return r
    try:
        feed_links = autodiscovery(document, url)
    except FeedUrlNotFoundError:
        r = jsonify(
            error='unreachable-feed-url',
            message='Cannot find feed url'
        )
        r.status_code = 400
        return r
    feed_url = feed_links[0].url
    feed_url, feed, hints = next(iter(crawl([feed_url], 1)))
    with stage:
        sub = cursor.subscribe(feed)
        stage.subscriptions = cursor.subscriptionlist
        stage.feeds[sub.feed_id] = feed
    return feeds(category_id)

def crawl_category(self):
    running = True
    while running:
        priority, arguments = self.crawling_queue.get()
        if priority == 0:
            # control message: 'terminate' shuts the worker down
            if arguments == 'terminate':
                running = False
            self.crawling_queue.task_done()
        elif priority == 1:
            # crawl request: (cursor, feed_id); a falsy feed_id means
            # every subscription under the cursor
            cursor, feed_id = arguments
            if not feed_id:
                urls = dict((sub.feed_uri, sub.feed_id)
                            for sub in cursor.recursive_subscriptions)
            else:
                urls = dict((sub.feed_uri, sub.feed_id)
                            for sub in cursor.recursive_subscriptions
                            if sub.feed_id == feed_id)
            iterator = iter(crawl(urls, self.worker_num))
            while True:
                try:
                    feed_url, feed_data, crawler_hints = next(iterator)
                    with stage:
                        stage.feeds[urls[feed_url]] = feed_data
                except CrawlError:
                    continue
                except StopIteration:
                    break
            self.crawling_queue.task_done()

def crawl_category():
    running = True
    while running:
        priority, arguments = crawling_queue.get()
        if priority == 0:
            # control message: 'terminate' shuts the worker down
            if arguments == 'terminate':
                running = False
            crawling_queue.task_done()
        elif priority == 1:
            cursor, feed_id = arguments
            if not feed_id:
                urls = dict((sub.feed_uri, sub.feed_id)
                            for sub in cursor.recursive_subscriptions)
            else:
                urls = dict((sub.feed_uri, sub.feed_id)
                            for sub in cursor.recursive_subscriptions
                            if sub.feed_id == feed_id)
            iterator = iter(crawl(urls, app.config['CRAWLER_THREAD']))
            while True:
                try:
                    feed_url, feed_data, crawler_hints = next(iterator)
                    with get_stage() as stage:
                        stage.feeds[urls[feed_url]] = feed_data
                except CrawlError:
                    continue
                except StopIteration:
                    break
            crawling_queue.task_done()

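# A minimal sketch of how a producer might drive the worker above,
# assuming `crawling_queue` is a standard PriorityQueue and `cursor` is a
# Cursor instance from the surrounding application code (the wiring shown
# here is an assumption, not the application's actual startup code).
from queue import PriorityQueue  # Queue.PriorityQueue on Python 2

crawling_queue = PriorityQueue()

# Priority 1: crawl every subscription under `cursor` (a falsy feed_id
# means "all feeds").
crawling_queue.put((1, (cursor, None)))

# Priority 0 outranks pending priority-1 items in a PriorityQueue, so
# 'terminate' is an immediate shutdown rather than "finish, then stop".
crawling_queue.put((0, 'terminate'))

crawling_queue.join()  # blocks until task_done() has run for every item
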
def test_crawler():
    my_opener = urllib2.build_opener(TestHTTPHandler)
    urllib2.install_opener(my_opener)
    feeds = ['http://vio.atomtest.com/feed/atom',
             'http://rsstest.com/rss.xml',
             'http://favicontest.com/atom.xml']
    generator = crawl(feeds, 4)
    for result in generator:
        feed_data = result.feed
        if feed_data.title.value == 'Atom Test':
            entries = feed_data.entries
            assert entries[0].title.value == 'xml base test'
            assert entries[1].title.value == 'Title One'
            assert result.hints is None
            assert result.icon_url == 'http://vio.atomtest.com/favicon.ico'
        elif feed_data.title.value == 'Vio Blog':
            entries = feed_data.entries
            assert entries[0].title.value == 'test one'
            source = feed_data.entries[0].source
            assert source.title.value == 'Source Test'
            assert result.icon_url == 'http://rsstest.com/images/favicon.ico'
            assert result.hints == {
                'ttl': '10',
                'lastBuildDate': datetime.datetime(2002, 9, 7, 0, 0, 1,
                                                   tzinfo=utc)
            }
        elif feed_data.title.value == 'Favicon Test':
            assert result.icon_url == 'http://favicontest.com/favicon.ico'

def test_sort_entries():
    my_opener = urllib2.build_opener(TestHTTPHandler)
    urllib2.install_opener(my_opener)
    feeds = ['http://reversedentries.com/feed/atom']
    crawler = iter(crawl(feeds, 4))
    url, feed, hints = next(crawler)
    assert feed.entries[0].updated_at > feed.entries[1].updated_at

def add_feed(category_id):
    cursor = Cursor(category_id)
    url = request.form['url']
    try:
        rq = urllib.request.Request(url)
        rq.add_header('User-Agent',
                      '{0}/{1}'.format(version.__package__, version.VERSION))
        f = urllib.request.urlopen(rq)
        document = f.read()
        f.close()
    except Exception:
        r = jsonify(
            error='unreachable-url',
            message='Cannot connect to given url'
        )
        r.status_code = 400
        return r
    try:
        feed_links = autodiscovery(document, url)
    except FeedUrlNotFoundError:
        r = jsonify(
            error='unreachable-feed-url',
            message='Cannot find feed url'
        )
        r.status_code = 400
        return r
    feed_url = feed_links[0].url
    feed_url, feed, hints = next(iter(crawl([feed_url], 1)))
    with stage:
        sub = cursor.subscribe(feed)
        stage.subscriptions = cursor.subscriptionlist
        stage.feeds[sub.feed_id] = feed
    return feeds(category_id)

def add_feed(category_id):
    stage = get_stage()
    cursor = Cursor(category_id)
    url = request.form['url']
    try:
        f = urllib2.urlopen(url)
        document = f.read()
        f.close()
    except Exception:
        r = jsonify(error='unreachable-url',
                    message='Cannot connect to given url')
        r.status_code = 400
        return r
    try:
        feed_links = autodiscovery(document, url)
    except FeedUrlNotFoundError:
        r = jsonify(error='unreachable-feed-url',
                    message='Cannot find feed url')
        r.status_code = 400
        return r
    feed_url = feed_links[0].url
    feed_url, feed, hints = next(iter(crawl([feed_url], 1)))
    with stage:
        sub = cursor.subscribe(feed)
        stage.subscriptions = cursor.subscriptionlist
        stage.feeds[sub.feed_id] = feed
    return feeds(category_id)

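# A hedged sketch of exercising the view above with Flask's test client;
# the route path is hypothetical, since the application's URL map is not
# shown in these snippets.
with app.test_client() as client:
    response = client.post('/categories/0/feeds/',  # assumed route
                           data={'url': 'http://vio.atomtest.com/'})
    # 200 on success; 400 with an 'error' key when the url is unreachable
    # or no feed link can be discovered.
    assert response.status_code in (200, 400)
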
def crawl_command(args):
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    urllist = [subscription.feed_uri
               for subscription in opml.recursive_subscriptions]
    threads_count = args.threads if args.threads is not None else cpu_count()
    generator = crawl(urllist, threads_count)
    try:
        for feed_url, feed_data, crawler_hints in generator:
            if args.verbose:
                print('{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                ))
            with stage:
                feed_id = hashlib.sha1(feed_url).hexdigest()
                stage.feeds[feed_id] = feed_data
    except CrawlError as e:
        print(e, file=sys.stderr)

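# The tests below take an fx_opener fixture instead of installing the
# opener inline; a minimal pytest sketch of that fixture, assuming it
# simply mirrors the manual build_opener()/install_opener() setup used in
# the earlier tests (the real fixture may differ).
from pytest import fixture

@fixture
def fx_opener(request):
    opener = urllib2.build_opener(TestHTTPHandler)
    urllib2.install_opener(opener)
    # restore the default opener after the test
    request.addfinalizer(lambda: urllib2.install_opener(
        urllib2.build_opener()))
    return opener
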
def test_sort_entries(fx_opener):
    feeds = ['http://reversedentries.com/feed/atom']
    crawler = iter(crawl(feeds, 4))
    result = next(crawler)
    url, feed, hints = result
    assert url == result.url
    assert feed is result.feed
    assert hints == result.hints
    assert feed.entries[0].updated_at > feed.entries[1].updated_at

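# The test above unpacks a result as a 3-tuple and reads the same values
# back as attributes; a namedtuple along these lines would satisfy both
# styles. This is a sketch of the assumed shape, not the library's actual
# definition (results carrying the icon_url seen in other tests here would
# need a fourth field, and would then no longer unpack as 3-tuples).
import collections

CrawlResult = collections.namedtuple('CrawlResult', ['url', 'feed', 'hints'])
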
def test_crawl_error(fx_opener):
    # broken feed
    feeds = ['http://brokenrss.com/rss']
    generator = crawl(feeds, 2)
    with raises(CrawlError):
        try:
            next(iter(generator))
        except CrawlError as e:
            assert e.feed_uri == feeds[0]
            raise
    # unreachable url
    feeds = ['http://not-exists.com/rss']
    generator = crawl(feeds, 2)
    with raises(CrawlError):
        try:
            next(iter(generator))
        except CrawlError as e:
            assert e.feed_uri == feeds[0]
            raise

def fx_non_exist_opml(fx_test_stage):
    feed_urls = ['http://feedone.com/feed/atom/']
    generator = crawl(feed_urls, 1)
    for result in generator:
        feed_url = result[0]
        feed_data = result[1]
        feed_id = get_hash(feed_url)
        with fx_test_stage as stage:
            stage.feeds[feed_id] = feed_data
    with fx_test_stage as stage:
        stage.subscriptions = read(SubscriptionList, opml_with_non_exist_feed)

def test_crawler():
    my_opener = urllib2.build_opener(TestHTTPHandler)
    urllib2.install_opener(my_opener)
    feeds = ['http://vio.atomtest.com/feed/atom',
             'http://rsstest.com/rss.xml']
    generator = crawl(feeds, 4)
    for result in generator:
        feed_data = result[1]
        if feed_data.title.value == 'Atom Test':
            entries = feed_data.entries
            assert entries[0].title.value == 'xml base test'
            assert entries[1].title.value == 'Title One'
        elif feed_data.title.value == 'Vio Blog':
            entries = feed_data.entries
            assert entries[0].title.value == 'test one'
            source = feed_data.entries[0].source
            assert source.title.value == 'Source Test'

def crawl_command(args):
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    feed_id = args.feed_id
    if feed_id:
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in opml.recursive_subscriptions
                        if sub.feed_id == feed_id)
        if not feed_map:
            print('There is no such feed:', feed_id, file=sys.stderr)
            return
    else:
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in opml.recursive_subscriptions)
        if not feed_map:
            print('No feeds to crawl', file=sys.stderr)
            return
    threads_count = args.threads if args.threads is not None else cpu_count()
    iterator = iter(crawl(feed_map.keys(), threads_count))
    while 1:
        try:
            feed_url, feed_data, crawler_hints = next(iterator)
            if args.verbose:
                print('{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                ))
            with stage:
                feed_id = feed_map[feed_url]
                stage.feeds[feed_id] = feed_data
        except (CrawlError, SchemaError) as e:
            if isinstance(e, CrawlError):
                print('Something went wrong with', e.feed_uri,
                      file=sys.stderr)
                if args.verbose:
                    traceback.print_exc()
            else:
                print(e, file=sys.stderr)
        except StopIteration:
            break

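# crawl_command() only reads .repository, .session_id, .feed_id, .threads,
# and .verbose off its argument object; a minimal argparse wiring
# consistent with those attribute names (the option spellings and defaults
# are assumptions).
import argparse

parser = argparse.ArgumentParser(description='crawl subscribed feeds')
parser.add_argument('repository', help='repository URL, passed to from_url()')
parser.add_argument('--session-id', dest='session_id', default=None)
parser.add_argument('--feed-id', dest='feed_id', default=None,
                    help='crawl only the feed with this id')
parser.add_argument('-n', '--threads', type=int, default=None,
                    help='worker threads (defaults to cpu_count())')
parser.add_argument('-v', '--verbose', action='store_true')

crawl_command(parser.parse_args())
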
def xmls(request, fx_test_stage):
    stage = fx_test_stage
    subscriptions = SubscriptionList()
    categoryone = Category(label='categoryone', _title='categoryone')
    categorytwo = Category(label='categorytwo', _title='categorytwo')
    categorythree = Category(label='categorythree', _title='categorythree')
    subscriptions.add(categoryone)
    subscriptions.add(categorythree)
    categoryone.add(categorytwo)
    pair = {
        'http://feedone.com/feed/atom/': categoryone,
        'http://feedtwo.com/feed/atom/': categorytwo,
        'http://feedthree.com/feed/atom/': subscriptions,
        'http://feedfour.com/feed/atom/': categorythree
    }
    generator = crawl(pair.keys(), 4)
    with stage:
        for feed_url, feed, hints in generator:
            sub = pair[feed_url].subscribe(feed)
            stage.feeds[sub.feed_id] = feed
        stage.subscriptions = subscriptions

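# For orientation, the subscription tree built by the fixture above, with
# each feed subscribed at the node shown:
#
#   subscriptions (root)          <- http://feedthree.com/feed/atom/
#   ├── categoryone               <- http://feedone.com/feed/atom/
#   │   └── categorytwo           <- http://feedtwo.com/feed/atom/
#   └── categorythree             <- http://feedfour.com/feed/atom/
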
def test_crawler(fx_opener):
    feeds = [
        'http://vio.atomtest.com/feed/atom',
        'http://rsstest.com/rss.xml',
        'http://favicontest.com/atom.xml',
        'http://nofavicontest.com/atom.xml'
    ]
    generator = crawl(feeds, 4)
    for result in generator:
        feed_data = result.feed
        if feed_data.title.value == 'Atom Test':
            entries = feed_data.entries
            assert entries[0].title.value == 'xml base test'
            assert entries[1].title.value == 'Title One'
            assert result.hints is None
            assert result.icon_url == 'http://vio.atomtest.com/favicon.ico'
        elif feed_data.title.value == 'Vio Blog':
            entries = feed_data.entries
            assert entries[0].title.value == 'test one'
            source = feed_data.entries[0].source
            assert source.title.value == 'Source Test'
            assert result.icon_url == 'http://rsstest.com/images/favicon.ico'
        elif feed_data.title.value == 'Favicon Test':
            assert result.icon_url == 'http://favicontest.com/favicon.ico'
        elif feed_data.title.value == 'No Favicon Test':
            assert result.icon_url is None

def crawlWithSender_(self, sender):
    try:
        subs = self.subscriptions.recursive_subscriptions
        logger.debug('len(subs) = %d', len(subs))
        feeds_dict = {s.feed_uri: s for s in subs}
        cpu_count = NSProcessInfo.processInfo().activeProcessorCount()
        logger.debug('len(feeds_dict) = %d', len(feeds_dict))
        for result in crawl(feeds_dict, cpu_count):
            assert isinstance(result, CrawlResult)
            logger.info('Crawled %d entries from %s',
                        len(result.feed.entries), result.url)
            sub = feeds_dict[result.url]
            with self.stage:
                if result.icon_url:
                    sub.icon_uri = result.icon_url
                    self.stage.subscriptions = self.subscriptions
                self.stage.feeds[sub.feed_id] = result.feed
        logger.info('Finished crawling %d feeds', len(feeds_dict))
    except Exception as e:
        logger.exception(e)
    finally:
        self.pyobjc_performSelectorOnMainThread_withObject_(
            'stopRefreshWithSender:', sender
        )
