def run():
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl(mabelleSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
import os

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging

from utils.mover import move_nas

os.chdir('/root/spiderItem/pyusa/pyusa')

configure_logging()
runner = CrawlerRunner(get_project_settings())


@defer.inlineCallbacks
def crawl():
    yield runner.crawl('update_company_list')
    yield runner.crawl('update_company_docs')
    reactor.stop()


crawl()
reactor.run()

move_nas('/data/lq/usa/reports', '/data/usa')
move_nas('/data/lq/usa/announcements', '/data/usa_announcements')
def loop_crawl():
    runner = CrawlerRunner(get_project_settings())
    crawl(runner)
    reactor.run()
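# A minimal sketch (not from the original source) of the crawl(runner) helper
# that loop_crawl() assumes: the Deferred returned by runner.crawl() is used to
# schedule the next run via reactor.callLater. SomeSpider and the 60-second
# delay are illustrative placeholders, not names from the original project.
from twisted.internet import reactor

def crawl(runner):
    d = runner.crawl(SomeSpider)  # hypothetical spider class
    # once this crawl finishes, wait 60 seconds and schedule the next one
    d.addBoth(lambda _: reactor.callLater(60, crawl, runner))
    return d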
def __init__(self, session):
    self.session = session
    self.review_urls = []
    self.profile_urls = []
    settings = self.__configure_project()
    self.runner = CrawlerRunner(settings=settings)
def test_crawler_runner_accepts_None(self):
    runner = CrawlerRunner()
    self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
        item['store'] = final_product.store
        item['category'] = final_product.category
        item['animal'] = final_product.animal
        item['date'] = final_product.date
        item['date_str'] = final_product.date_str
        yield item

        next_page = response.css(
            'a.fa-chevron-right::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)


configure_logging()
RUNNER = CrawlerRunner()


@defer.inlineCallbacks
def crawl():
    """ Execute the spiders sequentially """
    yield RUNNER.crawl(TiendapetDogFoodSpider)
    yield RUNNER.crawl(TiendapetDogMedSpider)
    reactor.stop()


crawl()
reactor.run()  # the script will block here until the last crawl call is finished
        except:
            return parties_voted

        cdu_csu = parties_voted[cdu_cdu_pos - 1:cdu_cdu_pos + 2]
        cdu_csu_string = ''.join(cdu_csu)
        parties_voted_fixed = parties_voted[0:cdu_cdu_pos - 1] \
            + [cdu_csu_string] \
            + parties_voted[cdu_cdu_pos + 2:len(parties_voted)]
        return parties_voted_fixed

    def clean_umlaute(self, input):
        replacers = {'ä': 'ae',
                     'ö': 'oe',
                     'ü': 'ue',
                     'ß': 'ss',
                     'Ä': 'AE',
                     'Ö': 'OE',
                     'Ü': 'UE',
                     '–': '-'
                     }
        for key, value in replacers.items():
            input = input.replace(key, value)
        return input


process = CrawlerRunner()
crawler = process.crawl(UmfrageerbegnisseSpider)
crawler.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until the crawling is finished
def _runner(self):
    return CrawlerRunner({'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION'})
import os
import json

import crochet

from scraping.webscrapy.spiders.amazonscraper import AmazonspiderSpider
from scraping.webscrapy.spiders.playstationscraper import PlaystationspiderSpider
from scraping.webscrapy.spiders.nintendoscraper import NintendospiderSpider
from scraping.webscrapy.spiders.metacriticscraper import MetacriticspiderSpider
from scraping.webscrapy.spiders.howlongtobeatscraper import HowLongToBeatspiderSpider
from flask import Flask, render_template, jsonify, request, redirect, url_for
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.signalmanager import dispatcher
from scrapy.utils.project import get_project_settings

final_data = {}
crochet.setup()
output_data = []

crawl_runner = CrawlerRunner({
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 OPR/75.0.3969.259'
})

actual_path = os.path.dirname(os.path.abspath(__file__))
games_path = actual_path + "/game_list.json"
scrape_result_path = actual_path + "/scrape_result.json"
outputfile_amazon = actual_path + "/outputfile_amazon.json"
outputfile_howlongtobeat = actual_path + "/outputfile_howlongtobeat.json"
outputfile_playstation = actual_path + "/outputfile_playstation.json"
outputfile_metacritic = actual_path + "/outputfile_metacritic.json"

f = open(games_path)
games_data = json.load(f)


def doScraping(game):
def test_async_def_asyncio_parse(self):
    runner = CrawlerRunner({
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    })
    runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"),
                 mockserver=self.mockserver)
    with LogCapture() as log:
        yield runner.join()
    self.assertIn("Got response 200", str(log))
def main():
    username = raw_input('Username: ')
    password = getpass.getpass('Password: ')
    rule = {
        'keyword': raw_input('Keyword: '),
        'ori': int(raw_input('Try to download the original image? (1 original / 0 thumbnail): ') or "0"),
        'star': float(raw_input('Minimum star rating (default 0, max 5, up to two decimal places): ') or "0"),
        'fav': int(raw_input('Minimum number of favorites (default 0): ') or "0"),
        'start_page': int(raw_input('Start downloading from which page (default 1): ') or "1"),
        'end_page': int(raw_input('Download up to which page (default 5): ') or "5"),
        'doujinshi': (raw_input('Include doujinshi? (1 include / 0 exclude, default include): ') or "1") == "0" and "off" or "on",
        'manga': (raw_input('Include manga? (1 include / 0 exclude, default include): ') or "1") == "0" and "off" or "on",
        'artist_cg': (raw_input('Include artist_cg? (1 include / 0 exclude, default include): ') or "1") == "0" and "off" or "on",
        'game_cg': (raw_input('Include game_cg? (1 include / 0 exclude, default include): ') or "1") == "0" and "off" or "on",
        'western': (raw_input('Include western? (1 include / 0 exclude, default exclude): ') or "0") == "0" and "off" or "on",
        'non_h': (raw_input('Include non_h? (1 include / 0 exclude, default exclude): ') or "0") == "0" and "off" or "on",
        'image_set': (raw_input('Include image_set? (1 include / 0 exclude, default include): ') or "1") == "0" and "off" or "on",
        'cosplay': (raw_input('Include cosplay? (1 include / 0 exclude, default include): ') or "1") == "0" and "off" or "on",
        'asian_porn': (raw_input('Include asian_porn? (1 include / 0 exclude, default exclude): ') or "0") == "0" and "off" or "on",
        'misc': (raw_input('Include misc? (1 include / 0 exclude, default exclude): ') or "0") == "0" and "off" or "on",
    }

    settings = get_project_settings()
    # disable the scrapy log
    # configure_logging(settings)
    runner = CrawlerRunner(settings)
    runner.crawl(ExHentaiSpider, user={
        'username': username,
        'password': password,
    }, rule=rule)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def main():
    configure_logging()
    runner = CrawlerRunner()
    task = LoopingCall(lambda: runner.crawl(CovidGeoInfoSpider))
    task.start(60 * 15, now=True)
    reactor.run()
def crawl(url):
    runner = CrawlerRunner()
    d = runner.crawl(PhilomathSpider, start_urls=[url])
    d.addBoth(lambda _: reactor.stop())
    reactor.run(installSignalHandlers=0)
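# Hedged usage sketch, not part of the original snippet: a Twisted reactor
# cannot be restarted within one interpreter, and installSignalHandlers=0 hints
# that crawl() is meant to run off the main thread or process. One common way
# to call it repeatedly is from a short-lived child process; the use of
# multiprocessing here is an assumption.
from multiprocessing import Process

def crawl_in_subprocess(url):
    p = Process(target=crawl, args=(url,))  # run crawl() in its own process
    p.start()
    p.join()  # wait for the child (and its reactor) to finish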
def start(base_url, max_urls_to_scrap=50):
    '''More settings can be added here to change the spider behaviour
    https://docs.scrapy.org/en/latest/topics/settings.html'''
    process = CrawlerRunner(settings)
    return process.crawl(LinkSpider, start_urls=[base_url],
                         max_to_scrap=max_urls_to_scrap)
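# Hedged sketch of consuming the Deferred that start() returns. Using crochet
# (which appears elsewhere on this page) is an assumption; the original project
# may drive the reactor differently. The 300-second timeout is illustrative.
import crochet
crochet.setup()  # starts the Twisted reactor in a background thread

@crochet.wait_for(timeout=300)
def scrape_blocking(base_url):
    # runs in the reactor thread; the caller blocks until the crawl finishes
    return start(base_url)

# scrape_blocking('https://example.com')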
def crawl(reactor):
    runner = CrawlerRunner()
    d = runner.crawl(GeocachingSpider.GeocachingSpider)
    d.addCallback(getResult)
    d.addCallback(crawl2, runner)
    return d
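# Hedged usage sketch: a crawl(reactor) function that returns a Deferred is
# typically handed to twisted.internet.task.react, which starts the reactor,
# waits for the Deferred, and then shuts everything down. getResult and crawl2
# are defined elsewhere in the original project; the __main__ guard is an
# assumption.
from twisted.internet import task

if __name__ == '__main__':
    task.react(crawl)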
def test_crawler_runner_accepts_dict(self):
    runner = CrawlerRunner({'foo': 'bar'})
    self.assertEqual(runner.settings['foo'], 'bar')
    self.assertEqual(runner.settings['RETRY_ENABLED'],
                     default_settings.RETRY_ENABLED)
    self.assertIsInstance(runner.settings, Settings)
def run_generic_spider(user_id=None, spider_id=None, datamodel=None,
                       run_spider_config=None, test_limit=None):
    """ just launch run_generic_spider() from any handler in controller """

    print
    log_scrap.info("--- run_generic_spider / spider_id : %s ", spider_id)

    # !!! spider is launched from main.py level !!!
    # all relative routes referring to this...
    log_scrap.info("--- run_generic_spider / os.getcwd() : %s ", os.getcwd())

    ### flattening run_spider_config : from nested to flat dict
    log_scrap.info("--- run_generic_spider / 'flattenSpiderConfig()' on 'run_spider_config' --> 'spider_config_flat' ...")
    spider_config_flat = flattenSpiderConfig(run_spider_config)

    ### settings for crawler
    # cf : https://hackernoon.com/how-to-crawl-the-web-politely-with-scrapy-15fbe489573d

    # global settings for scrapy processes (see upper)
    log_scrap.info("--- run_generic_spider / BOT_NAME : %s ", settings.get('BOT_NAME'))
    log_scrap.info("--- run_generic_spider / USER_AGENT : %s ", settings.get('USER_AGENT'))
    log_scrap.info("--- run_generic_spider / ITEM_PIPELINES : %s ", settings.get('ITEM_PIPELINES').__dict__)

    # specific settings for this scrapy process
    settings.set("CURRENT_SPIDER_ID", spider_id)
    settings.set("DOWNLOAD_DELAY", DOWNLOAD_DELAY)
    settings.set("RANDOMIZE_DOWNLOAD_DELAY", RANDOMIZE_DOWNLOAD_DELAY)

    ### initiating crawler process
    log_scrap.info("--- run_generic_spider / instantiate process ...")
    # process = CrawlerRunner()
    # process = CrawlerProcess()
    process = CrawlerRunner(settings=settings)

    ### adding CrawlerRunner as deferred
    def f(q):
        try:
            ### send/create custom spider from run_spider_config
            ### cf : https://stackoverflow.com/questions/35662146/dynamic-spider-generation-with-scrapy-subclass-init-error
            deferred = process.crawl(GenericSpider,
                                     user_id=user_id,
                                     datamodel=datamodel,
                                     spider_id=spider_id,
                                     spider_config_flat=spider_config_flat,
                                     test_limit=test_limit)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    ### putting task in queue and start
    q = Queue()
    p = Process(target=f, args=(q, ))
    p.start()
    result = q.get()
    p.join()
    if result is not None:
        raise result

    print "\n\n{}\n".format("> > > " * 20)

    #############################################
    ### cool snippets

    ### convert to class object
    # spider = globals()[spider]
def crawl_and_scrape(url):
    """
    Recursively crawl pages starting from the given url and return the text content that was collected.

    Args:
        url (str): The url at which the recursive crawl starts.

    Returns:
        (list): List of collected contents. Each content is a dict: {"url": str, "title": str, "text": str}
    """
    # output_path is unique to the url's domain
    output_path = get_contents_path(url)

    # If this domain has already been crawled
    if os.path.exists(output_path):
        try:
            with open(output_path, encoding="utf-8") as f:
                contents = json.load(f)
            return contents
        except:
            os.remove(output_path)

    settings = {
        # "USER_AGENT": "",
        "EXTENSIONS": {
            # 'scrapy.extensions.telnet.TelnetConsole': None,
            'scrapy.extensions.closespider.CloseSpider': 1,
        },
        "CLOSESPIDER_TIMEOUT": 0,
        "CLOSESPIDER_ITEMCOUNT": 30,
        "CLOSESPIDER_PAGECOUNT": 0,
        "CLOSESPIDER_ERRORCOUNT": 0,
        "CONCURRENT_REQUESTS": 16,
        "DOWNLOAD_DELAY": 1,       # interval between requests
        "DEPTH_LIMIT": 2,          # recursion depth limit
        "FEED_FORMAT": "json",
        "FEED_URI": output_path,   # output file path
        "FEED_EXPORT_ENCODING": 'utf-8',
    }

    print("crawl start")

    # Run the crawl
    # process: CrawlerProcess = CrawlerProcess(settings=settings)
    # process.crawl(MySpider, [url])
    # process.start()  # the script will block here until the crawling is finished
    runner: CrawlerRunner = CrawlerRunner(settings=settings)
    d = runner.crawl(MySpider, [url])
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished

    # The scraping results are saved at output_path.
    try:
        with open(output_path, encoding="utf-8") as f:
            contents = json.load(f)
    except:
        contents = None

    print("crawl end")
    return contents
def spider_process(spider, settings=None):
    """Runs a scrapy CrawlerRunner"""
    runner = CrawlerRunner(settings)
    deferred = runner.crawl(spider)
    deferred.addBoth(lambda _: reactor.stop())
    reactor.run()
def consume(reactor, hosts='kafka-server1:9092'):
    topic = 'crawl-queue'
    client = yield ready_client(reactor, hosts, topic)

    partitions = client.topic_partitions[topic]
    print(f'PARTITIONS: {partitions}')

    settings = project.get_project_settings()
    runner = CrawlerRunner(settings=settings)
    spiders = make_spider_dict(settings=settings)

    def process(consumer, message_list):
        """
        This function is called for every batch of messages received from
        Kafka. It may return a Deferred, but this implementation just logs the
        messages received.
        """
        deferreds = []
        for m in message_list:
            log.debug("Got message %r", m)
            mo = json.loads(m.message.value)
            log.info(mo)
            log.info(consumers)
            try:
                spider_obj = spiders[mo['spider']]
            except KeyError as e:
                log.error(
                    f"Unable to find spider '{mo['spider']}'. Ignoring error {e}"
                )
                continue
            d = runner.crawl(spider_obj, rss_item=mo)
            deferreds.append(d)

        def consumer_commit(r):
            success = all(list(zip(*r))[0])
            if success:
                log.info("Committing to consumer!")
                d = consumer.commit()
                d.addCallback(lambda _: log.info("Successfully committed."))
            else:
                log.error("A consumer failed. Not committing...")

        dl = defer.DeferredList(deferreds)
        dl.addBoth(consumer_commit)
        consumer.shutdown()

    consumers = [
        Consumer(client, topic, partition, process,
                 consumer_group='scraper-group',
                 auto_offset_reset=OFFSET_EARLIEST,
                 buffer_size=1024)
        for partition in partitions
    ]

    def cb_closed(result):
        """
        Called when a consumer cleanly stops.
        """
        log.info("Consumer stopped")

    def eb_failed(failure):
        """
        Called when a consumer fails due to an uncaught exception in the
        processing callback or a network error on shutdown. In this case we
        simply log the error.
        """
        log.error("Consumer failed: %s", failure)

    def start_consumer(consumer):
        log.info("Consumer started.")
        d = consumer.start(OFFSET_COMMITTED)
        d.addCallbacks(cb_closed, eb_failed)
        return d

    def stop_consumers():
        log.info("\n")
        log.info("Time is up, stopping consumers...")
        d = defer.gatherResults([c.shutdown() for c in consumers])
        d.addCallback(lambda result: client.close())
        return d

    yield defer.gatherResults(
        [start_consumer(c) for c in consumers]
        # [task.deferLater(reactor, 10.0, stop_consumers)]
    )
def setUp(self):
    self.mockserver = MockServer()
    self.mockserver.__enter__()
    self.runner = CrawlerRunner()
runner = CrawlerRunner(settings=Settings({
    'DOWNLOAD_DELAY': 3,
    'CONCURRENT_REQUESTS': 20,
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': False,  # Cache enabled for testing
    'HTTPCACHE_EXPIRATION_SECS': 0,
    'TELNETCONSOLE_PORT': None,
    'RETRY_ENABLED': False,
    'REDIRECT_ENABLED': True,
    'COOKIES_ENABLED': False,
    'REACTOR_THREADPOOL_MAXSIZE': 20,
    'DOWNLOAD_TIMEOUT': 30,  # To avoid loss of entries?

    # Retry many times since proxies often fail
    'RETRY_TIMES': 10,
    # Retry on most error codes since proxies fail for different reasons
    'RETRY_HTTP_CODES': [500, 503, 504, 400, 403, 404, 408],

    'DOWNLOADER_MIDDLEWARES': {
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 610,
        'random_useragent.RandomUserAgentMiddleware': 400,
        'rotating_proxies.middlewares.RotatingProxyMiddleware': 110,
        'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
    },
    'PROXY_LIST': PROXY_PATH,
    'PROXY_MODE': 0,
    'USER_AGENT_LIST': USER_PATH
}))
def test_crawler_runner_accepts_dict(self):
    runner = CrawlerRunner({'foo': 'bar'})
    self.assertEqual(runner.settings['foo'], 'bar')
    self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
def crawl(settings=None):
    runner = CrawlerRunner(settings=settings)
    runner.crawl(ClusterSpider)
    runner.join()
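# Hedged note: runner.join() returns a Deferred and nothing above starts the
# Twisted reactor, so crawl() as written only schedules work. A minimal sketch
# of driving it to completion (the reactor wiring is an assumption, not part of
# the original snippet):
from twisted.internet import reactor

def crawl_blocking(settings=None):
    runner = CrawlerRunner(settings=settings)
    runner.crawl(ClusterSpider)
    d = runner.join()  # fires once every scheduled crawl has finished
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # blocks until the crawl is done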
def test_crawler_runner_bootstrap_successful_for_several(self):
    runner = CrawlerRunner()
    yield runner.crawl(NoRequestsSpider)
    yield runner.crawl(NoRequestsSpider)
    self.assertEqual(runner.bootstrap_failed, False)
To run this app, run it directly:

    python flask_twisted.py

Alternatively, use Twisted's `twist` executable. This assumes you're in the
directory where the source files are located:

    PYTHONPATH=$(pwd) twist web --wsgi flask_twisted.app --port tcp:9000:interface=0.0.0.0
"""
import json

from flask import Flask
from scrapy.crawler import CrawlerRunner

from quote_scraper import QuoteSpider

app = Flask('Scrape With Flask')
crawl_runner = CrawlerRunner()  # requires the Twisted reactor to run
quotes_list = []  # store quotes
scrape_in_progress = False
scrape_complete = False


@app.route('/greeting')
@app.route('/greeting/<name>')
def greeting(name='World'):
    return 'Hello %s!' % (name)


@app.route('/crawl')
def crawl_for_quotes():
    """
    Scrape for quotes
        current = response.xpath('//a[contains(@class, "card current")]//div[@class="temp"]/span[1]/text()').get()
        real_feel = response.xpath('//a[contains(@class, "card current")]//div[@class="real-feel"]/text()').get()

        # Data cleanup
        ciudad = ciudad.replace('\n', '').replace('\r', '').strip()
        current = current.replace('°', '').replace('\n', '').replace('\r', '').strip()
        real_feel = real_feel.replace('RealFeel®', '').replace('°', '').replace('\n', '').replace('\r', '').strip()

        # Save the data to a file
        f = open("./datos_clima_scrapy.csv", "a")
        f.write(ciudad + "," + current + "," + real_feel + "\n")
        f.close()

        print(ciudad)
        print(current)
        print(real_feel)
        print()

        # No need to yield here. yield is useful when saving the data to a
        # file while running Scrapy from the terminal.


# Logic to run a Scrapy extraction periodically, i.e. to automate it.
runner = CrawlerRunner()
task = LoopingCall(lambda: runner.crawl(ExtractorClima))  # To research: anonymous functions in Python
task.start(20)  # Time in seconds after the program's first run before repeating the extraction
reactor.run()

# Seconds in 1 day: 86400
# Seconds in 1 hour: 3600
# Seconds in 1 week: 604800
# Seconds in 1 month: 2.628e+6
# Seconds in 1 minute: 60
def f(q):
    try:
        print "running...", spider_name

        ### setting up output directory for the spider.
        detail_path = '{ymd}/{spider}'.format(
            ymd=st_time.strftime('%Y%m%d'), spider=spider_name)
        html_path = os.path.join(html_base_dir, detail_path)
        html_path = os.path.join(html_path, '{dttm}.html')
        if not os.path.exists(os.path.dirname(html_path)):
            os.makedirs(os.path.dirname(html_path))

        ### setting up log directory for the spider
        sp_log_path = log_path.format(spider=spider_name)
        sp_err_log_path = err_log_path.format(spider=spider_name)
        if not os.path.exists(os.path.dirname(sp_log_path)):
            os.makedirs(os.path.dirname(sp_log_path))
        if not os.path.exists(os.path.dirname(sp_err_log_path)):
            os.makedirs(os.path.dirname(sp_err_log_path))

        ### setting up logger for the spider
        logger = logging.getLogger(spider_name + '_logger')
        logger.setLevel(logging.DEBUG)
        debug_handler = logging.FileHandler(sp_log_path)
        error_handler = logging.FileHandler(sp_err_log_path)
        debug_handler.setLevel(logging.DEBUG)
        error_handler.setLevel(logging.WARNING)
        formatter = logging.Formatter(
            '%(asctime)s:%(module)s - %(message)s')
        debug_handler.setFormatter(formatter)
        error_handler.setFormatter(formatter)
        logger.addHandler(debug_handler)
        logger.addHandler(error_handler)
        logger.info('logger created')

        ### preparing spider object.
        settings = project.get_project_settings()
        settings.set('ITEM_PIPELINES', {pipeline: 1}, priority='cmdline')
        spider_loader = spiderloader.SpiderLoader.from_settings(settings)
        spider = spider_loader.load(spider_name)
        spider.html_path = html_path
        spider.proxies = proxies
        spider.use_proxy = use_proxy
        spider.logger = logger
        spider.sqllogger = sqllogger
        spider.repeat_count = 0
        spider.repeat_threshold = 10
        spider.error_count = 0
        spider.error_threshold = 5

        ### starting spider queue and spider
        crawler_runner = CrawlerRunner(settings)  # from Scrapy docs
        deferred = crawler_runner.crawl(spider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)
import sys

from twisted.internet import reactor
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

from spiders.gladiaspider import GladiaspiderSpider

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
settings = get_project_settings()
settings.set('FEED_FORMAT', 'csv')
settings.set('FEED_URI', 'stats.csv')
runner = CrawlerRunner(settings)

if len(sys.argv) > 2:
    d = runner.crawl(GladiaspiderSpider, max_r=sys.argv[1], player=sys.argv[2])
elif len(sys.argv) > 1:
    d = runner.crawl(GladiaspiderSpider, max_r=sys.argv[1])
else:
    d = runner.crawl(GladiaspiderSpider, max_r='100')

d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until the crawling is finished
def get_crawl_runner():
    return CrawlerRunner({
        'USER_AGENT': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0',
        'LOG_LEVEL': 'INFO',
    })