Example 1
def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
    from arachnado.handlers import get_application
    from arachnado.crawler_process import ArachnadoCrawlerProcess
    from arachnado import manhole

    settings = {'LOG_LEVEL': loglevel}
    crawler_process = ArachnadoCrawlerProcess(settings)

    # Tornado application serving the Arachnado web UI and API
    app = get_application(crawler_process, opts)
    app.listen(int(port), host)

    if start_manhole:
        # optional debugging console with the crawler process exposed as 'cp'
        manhole.start(manhole_port, manhole_host, {'cp': crawler_process})

    # run the crawls; keep the process alive even when no crawl is active
    crawler_process.start(stop_after_crawl=False)
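The excerpt above only defines the entry point; nothing in it calls main(). A minimal invocation sketch follows, with purely illustrative argument values (in the real project these are presumably filled in from command-line options and the configuration file):

# Hypothetical invocation of main() -- values are illustrative only,
# not Arachnado's actual defaults:
if __name__ == '__main__':
    main(
        port=8888,
        host='0.0.0.0',
        start_manhole=False,
        manhole_port=6023,
        manhole_host='127.0.0.1',
        loglevel='DEBUG',
        opts={},  # the real entry point passes parsed configuration here
    )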
Example 2
def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
    from arachnado.handlers import get_application
    from arachnado.crawler_process import ArachnadoCrawlerProcess
    from arachnado import manhole

    settings = {'LOG_LEVEL': loglevel}
    crawler_process = ArachnadoCrawlerProcess(settings)

    app = get_application(crawler_process, opts)
    app.listen(int(port), host)
    logger.info("Arachnado v%s is started on %s:%s" % (__version__, host, port))

    if start_manhole:
        manhole.start(manhole_port, manhole_host, {'cp': crawler_process})
        logger.info("Manhole server is started on %s:%s" % (
            manhole_host, manhole_port))

    crawler_process.start(stop_after_crawl=False)
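Unlike the first example, this version logs startup messages through logger and reports __version__; neither name is defined inside the excerpt. The sketch below shows the module-level setup the snippet assumes; the exact logger name is a guess, not taken from the excerpt:

import logging

from arachnado import __version__   # package version string reported at startup

# assumed module-level logger used by the example
logger = logging.getLogger('arachnado')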
Example 3
def main(port, host, start_manhole, manhole_port, manhole_host, loglevel,
         opts):
    from arachnado.handlers import get_application
    from arachnado.crawler_process import ArachnadoCrawlerProcess
    from arachnado import manhole

    settings = {'LOG_LEVEL': loglevel}
    crawler_process = ArachnadoCrawlerProcess(settings)

    app = get_application(crawler_process, opts)
    app.listen(int(port), host)
    logger.info("Arachnado v%s is started on %s:%s" %
                (__version__, host, port))

    if start_manhole:
        manhole.start(manhole_port, manhole_host, {'cp': crawler_process})
        logger.info("Manhole server is started on %s:%s" %
                    (manhole_host, manhole_port))

    crawler_process.start(stop_after_crawl=False)
Example 4
def main(port, host, start_manhole, manhole_port, manhole_host, loglevel,
         opts):
    from arachnado.handlers import get_application
    from arachnado.crawler_process import ArachnadoCrawlerProcess
    from arachnado.site_checker import get_site_checker_crawler
    from arachnado.storages.mongo import MongoStorage
    from arachnado.storages.mongotail import MongoTailStorage
    from arachnado.domain_crawlers import DomainCrawlers
    from arachnado.cron import Cron

    settings = {
        'LOG_LEVEL': loglevel,
    }

    # MongoDB export options
    storage_opts = opts['arachnado.storage']
    assert storage_opts['enabled'], "Storage can't be turned off"

    # _getval is a module-level helper (not shown in this excerpt)
    items_uri = _getval(storage_opts, 'items_uri_env', 'items_uri')
    jobs_uri = _getval(storage_opts, 'jobs_uri_env', 'jobs_uri')
    sites_uri = _getval(storage_opts, 'sites_uri_env', 'sites_uri')

    scrapy_opts = opts['arachnado.scrapy']
    # UPPERCASE options are copied straight into the Scrapy settings
    settings.update({k: v for k, v in scrapy_opts.items() if k.isupper()})

    settings.update({
        'MONGO_EXPORT_ENABLED': storage_opts['enabled'],
        'MONGO_EXPORT_JOBS_URI': jobs_uri,
        'MONGO_EXPORT_ITEMS_URI': items_uri,
    })

    # MongoDB-backed storages for jobs, sites and scraped items
    job_storage = MongoTailStorage(jobs_uri, cache=True)
    job_storage.ensure_index("urls")
    site_storage = MongoStorage(sites_uri, cache=True)
    item_storage = MongoTailStorage(items_uri)
    item_storage.ensure_index("url")
    item_storage.ensure_index("_job_id")

    crawler_process = ArachnadoCrawlerProcess(settings)

    # schedule the crawler that checks the sites stored in site_storage
    site_checker_crawler = get_site_checker_crawler(site_storage)
    crawler_process.crawl(site_checker_crawler)

    spider_packages = scrapy_opts['spider_packages']
    default_spider_name = scrapy_opts['default_spider_name']
    # manager for domain crawl jobs, using spiders from the configured packages
    # (_parse_spider_packages is a module-level helper not shown here)
    domain_crawlers = DomainCrawlers(
        crawler_process=crawler_process,
        spider_packages=_parse_spider_packages(spider_packages),
        default_spider_name=default_spider_name,
        settings=settings)
    # resume crawl jobs recorded in job storage
    domain_crawlers.resume(job_storage)

    # scheduler for periodic (cron-style) crawls
    cron = Cron(domain_crawlers, site_storage)
    cron.start()

    app = get_application(crawler_process, domain_crawlers, site_storage,
                          item_storage, job_storage, opts)
    app.listen(int(port), host)
    logger.info("Arachnado v%s is started on %s:%s" %
                (__version__, host, port))

    if start_manhole:
        from arachnado import manhole
        manhole.start(manhole_port, manhole_host, {'cp': crawler_process})
        logger.info("Manhole server is started on %s:%s" %
                    (manhole_host, manhole_port))

    crawler_process.start(stop_after_crawl=False)
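This variant reads its configuration from a nested opts mapping. The sketch below shows the shape implied by the keys the code accesses; the values are placeholders, not the project's actual defaults:

# Shape of `opts` as implied by the keys read above (placeholder values only):
opts = {
    'arachnado.storage': {
        'enabled': True,
        'items_uri': 'mongodb://localhost:27017/arachnado.items',
        'items_uri_env': '',   # consulted by _getval (helper not shown)
        'jobs_uri': 'mongodb://localhost:27017/arachnado.jobs',
        'jobs_uri_env': '',
        'sites_uri': 'mongodb://localhost:27017/arachnado.sites',
        'sites_uri_env': '',
    },
    'arachnado.scrapy': {
        'spider_packages': '',              # passed to _parse_spider_packages
        'default_spider_name': 'generic',   # placeholder spider name
        # any UPPERCASE keys here are copied into the Scrapy settings
    },
}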