Example #1
def run():
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl(mabelleSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
Example #2
import os

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging

from utils.mover import move_nas

os.chdir('/root/spiderItem/pyusa/pyusa')

configure_logging()
runner = CrawlerRunner(get_project_settings())


@defer.inlineCallbacks
def crawl():
    yield runner.crawl('update_company_list')
    yield runner.crawl('update_company_docs')
    reactor.stop()


crawl()
reactor.run()

move_nas('/data/lq/usa/reports', '/data/usa')
move_nas('/data/lq/usa/announcements', '/data/usa_announcements')
Example #3
def loop_crawl():
    runner = CrawlerRunner(get_project_settings())
    crawl(runner)
    reactor.run()
Example #4
 def __init__(self, session):
     self.session = session
     self.review_urls = []
     self.profile_urls = []
     settings = self.__configure_project()
     self.runner = CrawlerRunner(settings=settings)
Example #5
 def test_crawler_runner_accepts_None(self):
     runner = CrawlerRunner()
     self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
Example #6
                item['store'] = final_product.store
                item['category'] = final_product.category
                item['animal'] = final_product.animal
                item['date'] = final_product.date
                item['date_str'] = final_product.date_str

                yield item

        next_page = response.css(
            'a.fa-chevron-right::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)


configure_logging()
RUNNER = CrawlerRunner()


@defer.inlineCallbacks
def crawl():
    """ Execute the spiders sequentially """
    yield RUNNER.crawl(TiendapetDogFoodSpider)
    yield RUNNER.crawl(TiendapetDogMedSpider)
    reactor.stop()


crawl()
reactor.run()  # the script will block here until the last crawl call is finished

Example #7
        except:
            return parties_voted

        cdu_csu = parties_voted[cdu_cdu_pos - 1:cdu_cdu_pos + 2]
        cdu_csu_string = ''.join(cdu_csu)
        parties_voted_fixed = parties_voted[0:cdu_cdu_pos - 1] \
                              + [cdu_csu_string] \
                              + parties_voted[cdu_cdu_pos + 2:len(parties_voted)]

        return parties_voted_fixed

    def clean_umlaute(self, input):
        replacers = {'ä': 'ae', 'ö': 'oe',
                     'ü': 'ue', 'ß': 'ss',
                     'Ä': 'AE', 'Ö': 'OE',
                     'Ü': 'UE', '–': '-'
                     }
        for key, value in replacers.items():
            input = input.replace(key, value)

        return input


process = CrawlerRunner()
crawler = process.crawl(UmfrageerbegnisseSpider)

crawler.addBoth(lambda _: reactor.stop())
reactor.run() # the script will block here until the crawling is finished

Example #8
 def _runner(self):
     return CrawlerRunner({'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION'})
Example #9
import crochet
import json
import os

from scraping.webscrapy.spiders.amazonscraper import AmazonspiderSpider
from scraping.webscrapy.spiders.playstationscraper import PlaystationspiderSpider
from scraping.webscrapy.spiders.nintendoscraper import NintendospiderSpider
from scraping.webscrapy.spiders.metacriticscraper import MetacriticspiderSpider
from scraping.webscrapy.spiders.howlongtobeatscraper import HowLongToBeatspiderSpider
from flask import Flask, render_template, jsonify, request, redirect, url_for
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.signalmanager import dispatcher
from scrapy.utils.project import get_project_settings

final_data = {}
crochet.setup()
output_data = []
crawl_runner = CrawlerRunner({
    'USER_AGENT':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 OPR/75.0.3969.259'
})

actual_path = os.path.dirname(os.path.abspath(__file__))
games_path = actual_path + "/game_list.json"
scrape_result_path = actual_path + "/scrape_result.json"
outputfile_amazon = actual_path + "/outputfile_amazon.json"
outputfile_howlongtobeat = actual_path + "/outputfile_howlongtobeat.json"
outputfile_playstation = actual_path + "/outputfile_playstation.json"
outputfile_metacritic = actual_path + "/outputfile_metacritic.json"

f = open(games_path)
games_data = json.load(f)


def doScraping(game):
Example #10
 def test_async_def_asyncio_parse(self):
     runner = CrawlerRunner({"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor"})
     runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
     with LogCapture() as log:
         yield runner.join()
     self.assertIn("Got response 200", str(log))
Example #11
def main():
    username = input('Username: ')
    password = getpass.getpass('Password: ')
    rule = {
        'keyword': input('Keyword: '),
        'ori': int(input('Try to download original images? (1 original / 0 thumbnail): ') or "0"),
        'star': float(input('Minimum star rating (default 0, max 5, up to two decimal places): ') or "0"),
        'fav': int(input('Minimum number of favorites (default 0): ') or "0"),
        'start_page': int(input('Page to start downloading from (default 1): ') or "1"),
        'end_page': int(input('Page to stop downloading at (default 5): ') or "5"),
        'doujinshi': "off" if (input('Include doujinshi? (1 yes / 0 no, default yes): ') or "1") == "0" else "on",
        'manga': "off" if (input('Include manga? (1 yes / 0 no, default yes): ') or "1") == "0" else "on",
        'artist_cg': "off" if (input('Include artist_cg? (1 yes / 0 no, default yes): ') or "1") == "0" else "on",
        'game_cg': "off" if (input('Include game_cg? (1 yes / 0 no, default yes): ') or "1") == "0" else "on",
        'western': "off" if (input('Include western? (1 yes / 0 no, default no): ') or "0") == "0" else "on",
        'non_h': "off" if (input('Include non_h? (1 yes / 0 no, default no): ') or "0") == "0" else "on",
        'image_set': "off" if (input('Include image_set? (1 yes / 0 no, default yes): ') or "1") == "0" else "on",
        'cosplay': "off" if (input('Include cosplay? (1 yes / 0 no, default yes): ') or "1") == "0" else "on",
        'asian_porn': "off" if (input('Include asian_porn? (1 yes / 0 no, default no): ') or "0") == "0" else "on",
        'misc': "off" if (input('Include misc? (1 yes / 0 no, default no): ') or "0") == "0" else "on",
    }

    settings = get_project_settings()
    #    disable the scrapy log
    #     configure_logging(settings)

    runner = CrawlerRunner(settings)
    runner.crawl(ExHentaiSpider,
                 user={
                     'username': username,
                     'password': password,
                 },
                 rule=rule)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #12
def main():
    configure_logging()
    runner = CrawlerRunner()
    task = LoopingCall(lambda: runner.crawl(CovidGeoInfoSpider))
    task.start(60 * 15, now=True)
    reactor.run()
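Example #12 schedules the same crawl every 15 minutes with Twisted's LoopingCall; nothing in the snippet ever stops the reactor. A minimal sketch of one way to bound the loop, assuming the same CovidGeoInfoSpider from the project above (its import path is project-specific) and an illustrative one-hour deadline:

from twisted.internet import reactor
from twisted.internet.task import LoopingCall
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


def main_with_deadline():
    configure_logging()
    runner = CrawlerRunner()
    # Re-run the spider every 15 minutes, starting immediately.
    task = LoopingCall(lambda: runner.crawl(CovidGeoInfoSpider))
    task.start(60 * 15, now=True)
    # Illustrative: stop scheduling new crawls and shut the reactor down
    # after one hour (any crawl still in flight would be interrupted).
    reactor.callLater(60 * 60, task.stop)
    reactor.callLater(60 * 60, reactor.stop)
    reactor.run()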
Example #13
def crawl(url):
    runner = CrawlerRunner()
    d = runner.crawl(PhilomathSpider, start_urls=[url])
    d.addBoth(lambda _: reactor.stop())
    reactor.run(installSignalHandlers=0)
Example #14
def start(base_url, max_urls_to_scrap=50):
    '''More settings can be added here to change the spider behaviour
    https://docs.scrapy.org/en/latest/topics/settings.html'''
    process = CrawlerRunner(settings)

    return process.crawl(LinkSpider, start_urls=[base_url], max_to_scrap=max_urls_to_scrap)
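The docstring in Example #14 notes that more settings can be passed to change the spider's behaviour. A hedged sketch of what that could look like, using a plain settings dict (the values are illustrative, not taken from the original project; LinkSpider and max_to_scrap come from the snippet above):

from scrapy.crawler import CrawlerRunner


def start_with_settings(base_url, max_urls_to_scrap=50):
    # Illustrative settings; see https://docs.scrapy.org/en/latest/topics/settings.html
    settings = {
        'DOWNLOAD_DELAY': 1,        # pause between requests
        'CONCURRENT_REQUESTS': 8,   # limit parallelism
        'ROBOTSTXT_OBEY': True,     # respect robots.txt
    }
    process = CrawlerRunner(settings)
    return process.crawl(LinkSpider, start_urls=[base_url], max_to_scrap=max_urls_to_scrap)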
Example #15
def crawl(reactor):
    runner = CrawlerRunner()
    d = runner.crawl(GeocachingSpider.GeocachingSpider)
    d.addCallback(getResult)
    d.addCallback(crawl2, runner)
    return d
Example #16
 def test_crawler_runner_accepts_dict(self):
     runner = CrawlerRunner({'foo': 'bar'})
     self.assertEqual(runner.settings['foo'], 'bar')
     self.assertEqual(runner.settings['RETRY_ENABLED'],
                      default_settings.RETRY_ENABLED)
     self.assertIsInstance(runner.settings, Settings)
Example #17
def run_generic_spider(user_id=None,
                       spider_id=None,
                       datamodel=None,
                       run_spider_config=None,
                       test_limit=None):
    """
	just launch run_generic_spider() from any handler in controller
	"""

    print
    log_scrap.info("--- run_generic_spider / spider_id : %s ", spider_id)

    # !!! spider is launched from main.py level !!!
    # all relative routes referring to this...
    log_scrap.info("--- run_generic_spider / os.getcwd() : %s ", os.getcwd())

    ### flattening run_spider_config : from nested to flat dict
    log_scrap.info(
        "--- run_generic_spider / 'flattenSpiderConfig()' on 'run_spider_config' --> 'spider_config_flat' ..."
    )
    spider_config_flat = flattenSpiderConfig(run_spider_config)

    ### settings for crawler
    # cf : https://hackernoon.com/how-to-crawl-the-web-politely-with-scrapy-15fbe489573d
    # global settings for scrapy processes (see above)
    log_scrap.info("--- run_generic_spider / BOT_NAME :       %s ",
                   settings.get('BOT_NAME'))
    log_scrap.info("--- run_generic_spider / USER_AGENT :     %s ",
                   settings.get('USER_AGENT'))
    log_scrap.info("--- run_generic_spider / ITEM_PIPELINES : %s ",
                   settings.get('ITEM_PIPELINES').__dict__)
    # specific settings for this scrapy process
    settings.set("CURRENT_SPIDER_ID", spider_id)
    settings.set("DOWNLOAD_DELAY", DOWNLOAD_DELAY)
    settings.set("RANDOMIZE_DOWNLOAD_DELAY", RANDOMIZE_DOWNLOAD_DELAY)

    ### initiating crawler process
    log_scrap.info("--- run_generic_spider / instanciate process ...")
    # process = CrawlerRunner()
    # process = CrawlerProcess()
    process = CrawlerRunner(settings=settings)

    ### adding CrawlerRunner as deferred
    def f(q):
        try:
            ### send/create custom spider from run_spider_config
            ### cf : https://stackoverflow.com/questions/35662146/dynamic-spider-generation-with-scrapy-subclass-init-error

            deferred = process.crawl(GenericSpider,
                                     user_id=user_id,
                                     datamodel=datamodel,
                                     spider_id=spider_id,
                                     spider_config_flat=spider_config_flat,
                                     test_limit=test_limit)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    ### putting task in queue and start
    q = Queue()
    p = Process(target=f, args=(q, ))
    p.start()
    result = q.get()
    p.join()

    if result is not None:
        raise result

    print "\n\n{}\n".format("> > > " * 20)


#############################################
### cool snippets

### convert to class object
# spider = globals()[spider]
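Per its docstring, run_generic_spider() is meant to be launched from any handler in the controller. A hedged usage sketch; the keyword names come from the signature above, and every value below is a placeholder:

# Illustrative call from a controller/handler; all values are placeholders.
run_generic_spider(
    user_id="user-123",
    spider_id="spider-456",
    datamodel="some_datamodel",
    run_spider_config={"spider": {"start_url": "https://example.com"}},  # nested config, flattened internally
    test_limit=10,
)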
Example #18
def crawl_and_scrape(url):
    """
    入力されたurlを起点に,再帰的にページをクロールし,取得した文章コンテンツを返す.

    Args:
        url (str): 再帰的クロールを開始するurl.
    
    Returns:
        (list): 取得したコンテンツのリスト.コンテンツは辞書形式:{"url":str, "title":str, "text":str}
    """

    # output_path is unique to the url's domain
    output_path = get_contents_path(url)

    # If this domain has already been crawled
    if os.path.exists(output_path):
        try:
            with open(output_path, encoding="utf-8") as f:
                contents = json.load(f)
                return contents
        except:
            os.remove(output_path)

    settings = {
        # "USER_AGENT":"",
        "EXTENSIONS": {
            #    'scrapy.extensions.telnet.TelnetConsole': None,
            'scrapy.extensions.closespider.CloseSpider': 1,
        },
        "CLOSESPIDER_TIMEOUT": 0,
        "CLOSESPIDER_ITEMCOUNT": 30,
        "CLOSESPIDER_PAGECOUNT": 0,
        "CLOSESPIDER_ERRORCOUNT": 0,
        "CONCURRENT_REQUESTS": 16,
        "DOWNLOAD_DELAY": 1,  # リクエストの間隔
        "DEPTH_LIMIT": 2,  # 再帰の深さ上限
        "FEED_FORMAT": "json",
        "FEED_URI": output_path,  # 出力ファイルパス
        "FEED_EXPORT_ENCODING": 'utf-8',
    }

    print("crawl start")

    # Run the crawl
    # process: CrawlerProcess = CrawlerProcess(settings=settings)
    # process.crawl(MySpider, [url])
    # process.start()  # the script will block here until the crawling is finished

    runner: CrawlerRunner = CrawlerRunner(settings=settings)
    d = runner.crawl(MySpider, [url])
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawl is finished

    # The scraping results have been saved to output_path.
    try:
        with open(output_path, encoding="utf-8") as f:
            contents = json.load(f)
    except:
        contents = None

    print("crawl end")

    return contents
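Per the docstring of Example #18, crawl_and_scrape() returns a list of {"url", "title", "text"} dicts, or None when the saved output cannot be read back. A hedged usage sketch with a placeholder URL:

# Illustrative usage; the URL is a placeholder.
contents = crawl_and_scrape("https://example.com")
if contents:
    for page in contents:
        print(page["url"], "-", page["title"])
else:
    print("crawl produced no readable output")

Because the helper runs reactor.run() and stops the reactor when the crawl finishes, it can only be called once per process; a Twisted reactor is not restartable.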
Example #19
def spider_process(spider, settings=None):
    """Runs a scrapy CrawlerRunner"""
    runner = CrawlerRunner(settings)
    deferred = runner.crawl(spider)
    deferred.addBoth(lambda _: reactor.stop())
    reactor.run()
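A hedged usage sketch for Example #19; MySpider stands in for any spider class visible to the project, and get_project_settings() is one way to supply the optional settings. As in Example #18, the call stops the reactor, so it is one-shot per process:

from scrapy.utils.project import get_project_settings

# Illustrative: block until MySpider (any spider class) finishes crawling.
spider_process(MySpider, settings=get_project_settings())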
Example #20
def consume(reactor, hosts='kafka-server1:9092'):
    topic = 'crawl-queue'
    client = yield ready_client(reactor, hosts, topic)
    partitions = client.topic_partitions[topic]
    print(f'PARTITIONS: {partitions}')

    settings = project.get_project_settings()

    runner = CrawlerRunner(settings=settings)
    spiders = make_spider_dict(settings=settings)

    def process(consumer, message_list):
        """
        This function is called for every batch of messages received from
        Kafka. It may return a Deferred, but this implementation just logs the
        messages received.
        """
        deferreds = []
        for m in message_list:
            log.debug("Got message %r", m)
            mo = json.loads(m.message.value)
            log.info(mo)
            log.info(consumers)

            try:
                spider_obj = spiders[mo['spider']]
            except KeyError as e:
                log.error(
                    f"Unable to find spider '{mo['spider']}'. Ignoring error {e}"
                )
                continue

            d = runner.crawl(spider_obj, rss_item=mo)
            deferreds.append(d)

        def consumer_commit(r):
            success = all(list(zip(*r))[0])

            if success:
                log.info("Committing to consumer!")
                d = consumer.commit()
                d.addCallback(lambda _: log.info("Succesfully commited."))

            else:
                log.error("A consumer failed. Not committing...")

        dl = defer.DeferredList(deferreds)
        dl.addBoth(consumer_commit)

        consumer.shutdown()

    consumers = [
        Consumer(client,
                 topic,
                 partition,
                 process,
                 consumer_group='scraper-group',
                 auto_offset_reset=OFFSET_EARLIEST,
                 buffer_size=1024) for partition in partitions
    ]

    def cb_closed(result):
        """
        Called when a consumer cleanly stops.
        """
        log.info("Consumer stopped")

    def eb_failed(failure):
        """
        Called when a consumer fails due to an uncaught exception in the
        processing callback or a network error on shutdown. In this case we
        simply log the error.
        """
        log.error("Consumer failed: %s", failure)

    def start_consumer(consumer):
        log.info("Consumer started.")
        d = consumer.start(OFFSET_COMMITTED)
        d.addCallbacks(cb_closed, eb_failed)

        return d

    def stop_consumers():
        log.info("\n")
        log.info("Time is up, stopping consumers...")
        d = defer.gatherResults([c.shutdown() for c in consumers])
        d.addCallback(lambda result: client.close())
        return d

    yield defer.gatherResults([start_consumer(c) for c in consumers]
                              #[task.deferLater(reactor, 10.0, stop_consumers)]
                              )
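From the process() callback above, each message on the 'crawl-queue' topic is expected to be JSON whose 'spider' key names a spider known to make_spider_dict(); the whole decoded object is handed to the crawl as rss_item. A hedged sketch of such a message body (field values are illustrative):

import json

# Illustrative message for the 'crawl-queue' topic; only 'spider' is required
# by process(), the remaining fields simply ride along as rss_item.
message = {
    "spider": "example_feed_spider",        # must match a key in the spiders dict
    "url": "https://example.com/feed.xml",  # placeholder payload
    "title": "Example item",
}
payload = json.dumps(message).encode("utf-8")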
Example #21
 def setUp(self):
     self.mockserver = MockServer()
     self.mockserver.__enter__()
     self.runner = CrawlerRunner()
Example #22
 runner = CrawlerRunner(settings=Settings({
     'DOWNLOAD_DELAY': 3,
     'CONCURRENT_REQUESTS': 20,
     'ROBOTSTXT_OBEY': False,
     'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
     'AUTOTHROTTLE_ENABLED': True,
     'HTTPCACHE_ENABLED': False,  # Cache disabled (enable for testing)
     'HTTPCACHE_EXPIRATION_SECS': 0,
     'TELNETCONSOLE_PORT': None,
     'RETRY_ENABLED': False,
     'REDIRECT_ENABLED': True,
     'COOKIES_ENABLED': False,
     'REACTOR_THREADPOOL_MAXSIZE': 20,
     'DOWNLOAD_TIMEOUT': 30,  # To avoid loss of entries?
     # Retry many times since proxies often fail
     'RETRY_TIMES': 10,
     # Retry on most error codes since proxies fail for different reasons
     'RETRY_HTTP_CODES': [500, 503, 504, 400, 403, 404, 408],
     'DOWNLOADER_MIDDLEWARES': {
         'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
         'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
         'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 610,
         'random_useragent.RandomUserAgentMiddleware': 400,
         'rotating_proxies.middlewares.RotatingProxyMiddleware': 110,
         'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
     },
     'PROXY_LIST': PROXY_PATH,
     'PROXY_MODE': 0,
     'USER_AGENT_LIST': USER_PATH,
 }))
Example #23
 def test_crawler_runner_accepts_dict(self):
     runner = CrawlerRunner({'foo': 'bar'})
     self.assertEqual(runner.settings['foo'], 'bar')
     self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
Example #24
def crawl(settings=None):
    runner = CrawlerRunner(settings=settings)
    runner.crawl(ClusterSpider)
    runner.join()
Example #25
 def test_crawler_runner_bootstrap_successful_for_several(self):
     runner = CrawlerRunner()
     yield runner.crawl(NoRequestsSpider)
     yield runner.crawl(NoRequestsSpider)
     self.assertEqual(runner.bootstrap_failed, False)
Example #26
To run this app, run it directly:
    python flask_twisted.py
Alternatively, use Twisted's `twist` executable. This assumes you're in the directory where the
source files are located:
    PYTHONPATH=$(pwd) twist web --wsgi flask_twisted.app --port tcp:9000:interface=0.0.0.0
"""

import json

from flask import Flask
from scrapy.crawler import CrawlerRunner

from quote_scraper import QuoteSpider

app = Flask('Scrape With Flask')
crawl_runner = CrawlerRunner()  # requires the Twisted reactor to run
quotes_list = []  # store quotes
scrape_in_progress = False
scrape_complete = False


@app.route('/greeting')
@app.route('/greeting/<name>')
def greeting(name='World'):
    return 'Hello %s!' % (name)


@app.route('/crawl')
def crawl_for_quotes():
    """
    Scrape for quotes
Example #27
        current = response.xpath('//a[contains(@class, "card current")]//div[@class="temp"]/span[1]/text()').get()
        real_feel = response.xpath('//a[contains(@class, "card current")]//div[@class="real-feel"]/text()').get()

        # Data cleanup
        ciudad = ciudad.replace('\n', '').replace('\r', '').strip()
        current = current.replace('°', '').replace('\n', '').replace('\r', '').strip()
        real_feel = real_feel.replace('RealFeel®', '').replace('°', '').replace('\n', '').replace('\r', '').strip()
        
        # Save the data to a file
        f = open("./datos_clima_scrapy.csv", "a")
        f.write(ciudad + "," + current + "," + real_feel + "\n")
        f.close()
        print(ciudad)
        print(current)
        print(real_feel)
        print()

        # No need to yield here. yield is useful when saving the data
        # to a file, running Scrapy from the terminal.

# Logic to run a Scrapy extraction periodically, i.e. to automate it.
runner = CrawlerRunner()
task = LoopingCall(lambda: runner.crawl(ExtractorClima))  # To look into: anonymous functions in Python
task.start(20)  # Interval in seconds, counted from the program's first run, at which to repeat the extraction
reactor.run()

# Seconds in 1 day: 86400
# Seconds in 1 hour: 3600
# Seconds in 1 week: 604800
# Seconds in 1 month: 2.628e+6
# Seconds in 1 minute: 60
Example #28
0
    def f(q):
        try:
            print "running...", spider_name

            ### setting up output directory for the spider.
            detail_path = '{ymd}/{spider}'.format(
                ymd=st_time.strftime('%Y%m%d'), spider=spider_name)
            html_path = os.path.join(html_base_dir, detail_path)

            html_path = os.path.join(html_path, '{dttm}.html')
            if not os.path.exists(os.path.dirname(html_path)):
                os.makedirs(os.path.dirname(html_path))

            ### setting up log directory for the spider
            sp_log_path = log_path.format(spider=spider_name)
            sp_err_log_path = err_log_path.format(spider=spider_name)

            if not os.path.exists(os.path.dirname(sp_log_path)):
                os.makedirs(os.path.dirname(sp_log_path))
            if not os.path.exists(os.path.dirname(sp_err_log_path)):
                os.makedirs(os.path.dirname(sp_err_log_path))

            ### setting up logger for the spider
            logger = logging.getLogger(spider_name + '_logger')
            logger.setLevel(logging.DEBUG)

            debug_handler = logging.FileHandler(sp_log_path)
            error_handler = logging.FileHandler(sp_err_log_path)
            debug_handler.setLevel(logging.DEBUG)
            error_handler.setLevel(logging.WARNING)
            formatter = logging.Formatter(
                '%(asctime)s:%(module)s - %(message)s')
            debug_handler.setFormatter(formatter)
            error_handler.setFormatter(formatter)

            logger.addHandler(debug_handler)
            logger.addHandler(error_handler)

            logger.info('logger created')

            ### preparing spider object.
            settings = project.get_project_settings()
            settings.set('ITEM_PIPELINES', {pipeline: 1}, priority='cmdline')
            spider_loader = spiderloader.SpiderLoader.from_settings(settings)

            spider = spider_loader.load(spider_name)
            spider.html_path = html_path
            spider.proxies = proxies
            spider.use_proxy = use_proxy
            spider.logger = logger
            spider.sqllogger = sqllogger

            spider.repeat_count = 0
            spider.repeat_threshold = 10
            spider.error_count = 0
            spider.error_threshold = 5

            ### starting spider queue and spider
            crawler_runner = CrawlerRunner(settings)  #from Scrapy docs

            deferred = crawler_runner.crawl(spider)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)

        except Exception as e:
            q.put(e)
Example #29
from twisted.internet import reactor
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from spiders.gladiaspider import GladiaspiderSpider

import sys

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
settings = get_project_settings()
settings.set('FEED_FORMAT', 'csv')
settings.set('FEED_URI', 'stats.csv')

runner = CrawlerRunner(settings)
if len(sys.argv) > 2:
    d = runner.crawl(GladiaspiderSpider, max_r=sys.argv[1], player=sys.argv[2])
elif len(sys.argv) > 1:
    d = runner.crawl(GladiaspiderSpider, max_r=sys.argv[1])
else:
    d = runner.crawl(GladiaspiderSpider, max_r='100')
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until the crawling is finished
Example #30
def get_crawl_runner():
    return CrawlerRunner({
        'USER_AGENT': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0',
        'LOG_LEVEL': 'INFO',
    })