def get_app(ws_pages_uri, ws_jobs_uri):
    db_uri = get_db_uri()
    items_uri = "{}/items".format(db_uri)
    jobs_uri = "{}/jobs".format(db_uri)
    job_storage = MongoTailStorage(jobs_uri, cache=True)
    item_storage = MongoTailStorage(items_uri)
    context = {
        'crawler_process': None,
        'job_storage': job_storage,
        'item_storage': item_storage,
    }
    app = tornado.web.Application([
        (ws_pages_uri, PagesDataRpcWebsocketHandler, context),
        (ws_jobs_uri, JobsDataRpcWebsocketHandler, context),
    ])
    return app
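A hedged usage sketch of get_app, assuming it is called from the same module (so get_db_uri, tornado.web, MongoTailStorage and the two RPC websocket handlers are already importable). The URL patterns and port below are placeholders chosen for illustration, not values taken from Arachnado's configuration.

import tornado.ioloop

if __name__ == "__main__":
    # Placeholder websocket endpoints for pages and jobs data.
    app = get_app(r"/ws-pages-data", r"/ws-jobs-data")
    app.listen(8888)  # placeholder port
    tornado.ioloop.IOLoop.current().start()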
class Pages(object):
    """ Pages (scraped items) object exposed via JSON RPC """
    handler_id = None
    callback = None

    def __init__(self, handler, item_storage, **kwargs):
        self.handler = handler
        self.storage = MongoTailStorage(item_storage.mongo_uri,
                                        item_storage.cache_flag)

    def subscribe(self, last_id=0, query=None, fields=None, fetch_delay=None):
        if fetch_delay:
            self.storage.fetch_delay = fetch_delay
        self.storage.subscribe('tailed', self._publish, last_id=last_id,
                               query=query, fields=fields)

    def _on_close(self):
        self.storage.unsubscribe('tailed')

    def unsubscribe(self):
        self.storage.unsubscribe('tailed')

    def _publish(self, data):
        if self.callback:
            _callback = self.callback
        else:
            _callback = self.handler.write_event
        if self.storage.tailing:
            _callback(data)
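A hedged sketch of driving Pages directly, outside the JSON-RPC layer. It assumes an existing MongoTailStorage instance named item_storage and a running Tornado IOLoop; DummyHandler, the job id and the field list are stand-ins invented for illustration, not part of the code above.

class DummyHandler(object):
    def write_event(self, data):
        # In Arachnado the handler pushes events over a websocket;
        # here we just print the tailed item.
        print("item:", data)

pages = Pages(DummyHandler(), item_storage)
# Tail only items belonging to one (hypothetical) job, polling every 2 seconds.
pages.subscribe(query={'_job_id': 'abc123'}, fields=['url'], fetch_delay=2)
# ... later, stop tailing:
pages.unsubscribe()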
def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
    from arachnado.handlers import get_application
    from arachnado.crawler_process import ArachnadoCrawlerProcess
    from arachnado.site_checker import get_site_checker_crawler
    from arachnado.storages.mongo import MongoStorage
    from arachnado.storages.mongotail import MongoTailStorage
    from arachnado.domain_crawlers import DomainCrawlers
    from arachnado.cron import Cron

    settings = {
        'LOG_LEVEL': loglevel,
    }

    # mongo export options
    storage_opts = opts['arachnado.storage']
    assert storage_opts['enabled'], "Storage can't be turned off"
    items_uri = _getval(storage_opts, 'items_uri_env', 'items_uri')
    jobs_uri = _getval(storage_opts, 'jobs_uri_env', 'jobs_uri')
    sites_uri = _getval(storage_opts, 'sites_uri_env', 'sites_uri')

    scrapy_opts = opts['arachnado.scrapy']
    settings.update({k: v for k, v in scrapy_opts.items() if k.isupper()})
    settings.update({
        'MONGO_EXPORT_ENABLED': storage_opts['enabled'],
        'MONGO_EXPORT_JOBS_URI': jobs_uri,
        'MONGO_EXPORT_ITEMS_URI': items_uri,
    })

    job_storage = MongoTailStorage(jobs_uri, cache=True)
    job_storage.ensure_index("urls")
    site_storage = MongoStorage(sites_uri, cache=True)
    item_storage = MongoTailStorage(items_uri)
    item_storage.ensure_index("url")
    item_storage.ensure_index("_job_id")

    crawler_process = ArachnadoCrawlerProcess(settings)

    site_checker_crawler = get_site_checker_crawler(site_storage)
    crawler_process.crawl(site_checker_crawler)

    spider_packages = scrapy_opts['spider_packages']
    default_spider_name = scrapy_opts['default_spider_name']
    domain_crawlers = DomainCrawlers(
        crawler_process=crawler_process,
        spider_packages=_parse_spider_packages(spider_packages),
        default_spider_name=default_spider_name,
        settings=settings,
    )
    domain_crawlers.resume(job_storage)

    cron = Cron(domain_crawlers, site_storage)
    cron.start()

    app = get_application(crawler_process, domain_crawlers, site_storage,
                          item_storage, job_storage, opts)
    app.listen(int(port), host)
    logger.info("Arachnado v%s is started on %s:%s" % (__version__, host, port))

    if start_manhole:
        from arachnado import manhole
        manhole.start(manhole_port, manhole_host, {'cp': crawler_process})
        logger.info("Manhole server is started on %s:%s" % (manhole_host,
                                                            manhole_port))

    crawler_process.start(stop_after_crawl=False)
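A hedged sketch of the minimal opts structure that main reads above. The section and key names mirror the lookups in the function; the Mongo URIs, environment-variable names, spider settings and the call arguments are placeholders, not Arachnado defaults.

opts = {
    'arachnado.storage': {
        'enabled': True,
        # Placeholder Mongo URIs in db/collection form.
        'items_uri': 'mongodb://localhost:27017/arachnado/items',
        'jobs_uri': 'mongodb://localhost:27017/arachnado/jobs',
        'sites_uri': 'mongodb://localhost:27017/arachnado/sites',
        # *_uri_env keys name environment variables that can override each URI
        # (placeholder variable names).
        'items_uri_env': 'ARACHNADO_ITEMS_URI',
        'jobs_uri_env': 'ARACHNADO_JOBS_URI',
        'sites_uri_env': 'ARACHNADO_SITES_URI',
    },
    'arachnado.scrapy': {
        'spider_packages': '',
        'default_spider_name': 'generic',
        # UPPERCASE keys are copied into the Scrapy settings dict.
        'LOG_LEVEL': 'INFO',
    },
}

main(port=8888, host='0.0.0.0', start_manhole=False,
     manhole_port=6023, manhole_host='127.0.0.1',
     loglevel='INFO', opts=opts)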