Example #1
import tornado.web

from arachnado.storages.mongotail import MongoTailStorage


def get_app(ws_pages_uri, ws_jobs_uri):
    # get_db_uri and the two RPC websocket handler classes are
    # defined elsewhere in arachnado.
    db_uri = get_db_uri()
    items_uri = "{}/items".format(db_uri)
    jobs_uri = "{}/jobs".format(db_uri)
    # Tailable storages for jobs and items (job results are cached).
    job_storage = MongoTailStorage(jobs_uri, cache=True)
    item_storage = MongoTailStorage(items_uri)
    # Shared context passed to both websocket handlers.
    context = {
        'crawler_process': None,
        'job_storage': job_storage,
        'item_storage': item_storage,
    }
    app = tornado.web.Application([
        (ws_pages_uri, PagesDataRpcWebsocketHandler, context),
        (ws_jobs_uri, JobsDataRpcWebsocketHandler, context),
    ])
    return app
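
For completeness, a minimal sketch of serving the returned application with Tornado's IOLoop; the routes and port below are assumptions for illustration, not part of the original example:

import tornado.ioloop

# Hypothetical routes and port, chosen only for this sketch.
app = get_app(ws_pages_uri=r"/ws-pages-data", ws_jobs_uri=r"/ws-jobs-data")
app.listen(8888)
tornado.ioloop.IOLoop.current().start()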
Example #2
from arachnado.storages.mongotail import MongoTailStorage


class Pages(object):
    """ Pages (scraped items) object exposed via JSON RPC """
    # Optional overrides; both default to None.
    handler_id = None
    callback = None

    def __init__(self, handler, item_storage, **kwargs):
        self.handler = handler
        # Create a private storage instance so each Pages object
        # gets its own tailing state.
        self.storage = MongoTailStorage(item_storage.mongo_uri,
                                        item_storage.cache_flag)

    def subscribe(self, last_id=0, query=None, fields=None, fetch_delay=None):
        if fetch_delay:
            self.storage.fetch_delay = fetch_delay
        # Publish every tailed document to the subscriber.
        self.storage.subscribe('tailed',
                               self._publish,
                               last_id=last_id,
                               query=query,
                               fields=fields)

    def _on_close(self):
        # Stop tailing when the connection closes.
        self.storage.unsubscribe('tailed')

    def unsubscribe(self):
        self.storage.unsubscribe('tailed')

    def _publish(self, data):
        # Prefer an explicit callback; fall back to writing the
        # event to the websocket handler.
        if self.callback:
            _callback = self.callback
        else:
            _callback = self.handler.write_event
        if self.storage.tailing:
            _callback(data)
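
A minimal usage sketch, assuming hypothetical objects: my_handler must expose a write_event method, and my_item_storage the mongo_uri and cache_flag attributes used above. The query and fields values are illustrative only:

# Hypothetical objects; names here are not from arachnado.
pages = Pages(handler=my_handler, item_storage=my_item_storage)
# Stream matching items, sending only their URLs.
pages.subscribe(query={'status': 200}, fields=['url'])
# ... later, e.g. when the client disconnects:
pages.unsubscribe()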
Example #3
def main(port, host, start_manhole, manhole_port, manhole_host, loglevel,
         opts):
    # logger, __version__, _getval and _parse_spider_packages are
    # module-level names defined elsewhere in this file.
    from arachnado.handlers import get_application
    from arachnado.crawler_process import ArachnadoCrawlerProcess
    from arachnado.site_checker import get_site_checker_crawler
    from arachnado.storages.mongo import MongoStorage
    from arachnado.storages.mongotail import MongoTailStorage
    from arachnado.domain_crawlers import DomainCrawlers
    from arachnado.cron import Cron

    settings = {
        'LOG_LEVEL': loglevel,
    }

    # mongo export options
    storage_opts = opts['arachnado.storage']
    assert storage_opts['enabled'], "Storage can't be turned off"

    items_uri = _getval(storage_opts, 'items_uri_env', 'items_uri')
    jobs_uri = _getval(storage_opts, 'jobs_uri_env', 'jobs_uri')
    sites_uri = _getval(storage_opts, 'sites_uri_env', 'sites_uri')

    scrapy_opts = opts['arachnado.scrapy']
    settings.update({k: v for k, v in scrapy_opts.items() if k.isupper()})

    settings.update({
        'MONGO_EXPORT_ENABLED': storage_opts['enabled'],
        'MONGO_EXPORT_JOBS_URI': jobs_uri,
        'MONGO_EXPORT_ITEMS_URI': items_uri,
    })

    # Tailable job/item storages plus a plain site storage; the
    # indexes cover fields these collections are queried on.
    job_storage = MongoTailStorage(jobs_uri, cache=True)
    job_storage.ensure_index("urls")
    site_storage = MongoStorage(sites_uri, cache=True)
    item_storage = MongoTailStorage(items_uri)
    item_storage.ensure_index("url")
    item_storage.ensure_index("_job_id")

    crawler_process = ArachnadoCrawlerProcess(settings)

    site_checker_crawler = get_site_checker_crawler(site_storage)
    crawler_process.crawl(site_checker_crawler)

    spider_packages = scrapy_opts['spider_packages']
    default_spider_name = scrapy_opts['default_spider_name']
    domain_crawlers = DomainCrawlers(
        crawler_process=crawler_process,
        spider_packages=_parse_spider_packages(spider_packages),
        default_spider_name=default_spider_name,
        settings=settings)
    domain_crawlers.resume(job_storage)

    cron = Cron(domain_crawlers, site_storage)
    cron.start()

    app = get_application(crawler_process, domain_crawlers, site_storage,
                          item_storage, job_storage, opts)
    app.listen(int(port), host)
    logger.info("Arachnado v%s is started on %s:%s" %
                (__version__, host, port))

    if start_manhole:
        from arachnado import manhole
        manhole.start(manhole_port, manhole_host, {'cp': crawler_process})
        logger.info("Manhole server is started on %s:%s" %
                    (manhole_host, manhole_port))

    crawler_process.start(stop_after_crawl=False)
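
The _getval and _parse_spider_packages helpers are referenced above but not shown. A plausible reconstruction, assuming _getval prefers an environment variable named in the options and _parse_spider_packages splits a whitespace-separated string; both are sketches, not the project's verbatim helpers:

import os

def _getval(opts, env_key, key):
    # Prefer the environment variable named by opts[env_key],
    # if it is set; otherwise fall back to the literal option.
    env_name = opts.get(env_key)
    if env_name and os.environ.get(env_name):
        return os.environ[env_name]
    return opts[key]

def _parse_spider_packages(spider_packages):
    # "pkg1.spiders pkg2.spiders" -> ["pkg1.spiders", "pkg2.spiders"]
    return [name for name in spider_packages.split() if name]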