Example #1
from scrapy.settings.default_settings import SPIDER_MIDDLEWARES, DOWNLOADER_MIDDLEWARES

BOT_NAME = 'topical-spiders'

SPIDER_MODULES = ['topical_spiders.spiders']
NEWSPIDER_MODULE = 'topical_spiders.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'topic (+http://www.yourdomain.com)'

SPIDER_MIDDLEWARES.update({
    'crawlfrontier.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000,
})

DOWNLOADER_MIDDLEWARES.update({
    'crawlfrontier.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 1000,
})

FRONTIER_SETTINGS = 'frontier.settings'

SCHEDULER = 'crawlfrontier.contrib.scrapy.schedulers.frontier.CrawlFrontierScheduler'
SPIDER_MIDDLEWARES.update({
    'crawlfrontier.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1,
    'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': None,
    'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': None,
    'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': None,
    'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': None
})

#SEEDS_SOURCE = 'seeds.txt'
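
Example #1 hands crawl ordering to crawl-frontier through FRONTIER_SETTINGS = 'frontier.settings', but that module itself is not shown. A minimal sketch of what such a module might contain, assuming crawl-frontier's in-memory FIFO backend (the setting names come from the crawl-frontier docs; the values are illustrative assumptions):

# frontier/settings.py (hypothetical)
BACKEND = 'crawlfrontier.contrib.backends.memory.FIFO'  # keep the request queue in memory, FIFO order
MAX_REQUESTS = 2000      # finish the crawl after this many requests (0 means no limit)
MAX_NEXT_REQUESTS = 10   # how many requests the frontier hands to Scrapy per batch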
Example #2
from scrapy.settings.default_settings import SPIDER_MIDDLEWARES, DOWNLOADER_MIDDLEWARES

# Disable Scrapy's default depth/offsite/referer/URL-length middlewares
# so the frontier controls request filtering.
SPIDER_MIDDLEWARES.update({
    'scrapy.spidermiddlewares.depth.DepthMiddleware': None,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
    'scrapy.spidermiddlewares.referer.RefererMiddleware': None,
    'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None,
})

DOWNLOADER_MIDDLEWARES.update({
    'crawler.middlewares.IPCheckerMiddleware': 1,
    # 'crawler.middlewares.DenyBlacklistDomainMiddleware': 2,
    # 'crawler.middlewares.ReturnQueryURLMiddleware': 3,
    'crawler.middlewares.DownloadTooMuchAtOnceCheckerMiddleware': 4,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 1000,
})

ITEM_PIPELINES = {
    'crawler.pipelines.S3Pipeline': 800,
}

SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'

HTTPCACHE_ENABLED = False
REDIRECT_ENABLED = True
DOWNLOAD_TIMEOUT = 30
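
The ITEM_PIPELINES entry above points at a project-local 'crawler.pipelines.S3Pipeline' whose code is not part of the example. A hypothetical sketch of such a pipeline using boto3 (the bucket name, key scheme, and ITEMS_BUCKET setting are assumptions, not part of the original project):

import json

import boto3


class S3Pipeline:
    """Hypothetical pipeline that uploads each scraped item to S3 as JSON."""

    def __init__(self, bucket):
        self.bucket = bucket
        self.client = boto3.client('s3')
        self.count = 0

    @classmethod
    def from_crawler(cls, crawler):
        # ITEMS_BUCKET is an assumed custom setting, not a Scrapy built-in.
        return cls(bucket=crawler.settings.get('ITEMS_BUCKET', 'crawl-items'))

    def process_item(self, item, spider):
        self.count += 1
        key = '%s/item-%08d.json' % (spider.name, self.count)
        self.client.put_object(
            Bucket=self.bucket,
            Key=key,
            Body=json.dumps(dict(item)).encode('utf-8'),
        )
        return item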
Example #3
# -*- coding: utf-8 -*-
from scrapy.settings.default_settings import SPIDER_MIDDLEWARES, DOWNLOADER_MIDDLEWARES

FRONTERA_SETTINGS = 'bc.config.spider'

SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'
SPIDER_MIDDLEWARES.update({
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 999,
    'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1,
})
DOWNLOADER_MIDDLEWARES.update({
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 999,
})

BOT_NAME = 'bc'

SPIDER_MODULES = ['bc.spiders']
NEWSPIDER_MODULE = 'bc.spiders'

CONCURRENT_REQUESTS = 256
CONCURRENT_REQUESTS_PER_DOMAIN = 1

DOWNLOAD_DELAY = 0.0
DOWNLOAD_TIMEOUT = 180
RANDOMIZE_DOWNLOAD_DELAY = False

REACTOR_THREADPOOL_MAXSIZE = 30
DNS_TIMEOUT = 120

COOKIES_ENABLED = False
RETRY_ENABLED = False
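
Example #3 enables frontera's FileSeedLoader (priority 1 in SPIDER_MIDDLEWARES) but omits the SEEDS_SOURCE setting that tells it where to read seeds from. A minimal sketch of the missing piece (the file name and URLs are illustrative):

SEEDS_SOURCE = 'seeds.txt'

where seeds.txt is a plain-text file with one URL per line, e.g.:

http://example.com/
http://example.org/

The loader feeds these URLs to the spider as its start requests when the crawl opens.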
Example #4
from scrapy.settings.default_settings import SPIDER_MIDDLEWARES

SPIDER_MIDDLEWARES.update({
    'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None,
})

DOWNLOADER_MIDDLEWARES = {
    # Engine side
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    # Downloader side
}

DOWNLOADER_MIDDLEWARES.update({
    # Frontera
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 1000,
})

SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'
SPIDER_MIDDLEWARES.update({
    'scrapy_spider.spider_middlewares.file_seed_loader.SplashFileSeedLoader': 1,
})

# DOWNLOAD_HANDLERS = {
#     'http': 'scrapy_splash.HTTPDownloadHandler',
#     'https': 'scrapy_splash.HTTPDownloadHandler',
# }

# Splash Settings
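
The example breaks off at the '# Splash Settings' header. The scrapy-splash README documents the settings that normally accompany the middlewares above; a sketch assuming a Splash instance on localhost (the URL is an assumption):

SPLASH_URL = 'http://localhost:8050'  # address of the running Splash service (assumed local)

# Deduplicate requests taking Splash arguments into account
SPIDER_MIDDLEWARES.update({
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
})
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'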
Example #5
# Scrapy settings for shopbot project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy.settings.default_settings import DOWNLOADER_MIDDLEWARES

BOT_NAME = 'shopbot'

SPIDER_MODULES = ['shopbot.spiders']
NEWSPIDER_MODULE = 'shopbot.spiders'

# -----------------------------------------------------------------------------
# USER AGENT
# -----------------------------------------------------------------------------

DOWNLOADER_MIDDLEWARES.update({
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': 500,
})

USER_AGENTS = [
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/57.0.2987.110 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/61.0.3163.79 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) '
     'Gecko/20100101 '
     'Firefox/55.0'),  # firefox
]
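
With the middleware swap above, scrapy-useragents rotates through USER_AGENTS instead of sending Scrapy's single default User-Agent. A quick, hypothetical way to confirm which header each request actually carried (the spider is illustrative, not part of the project; httpbin echoes request headers back):

import scrapy


class UACheckSpider(scrapy.Spider):
    name = 'ua_check'
    # Scrapy issues start requests with dont_filter=True,
    # so duplicate start URLs are all fetched.
    start_urls = ['https://httpbin.org/headers'] * 3

    def parse(self, response):
        # Log the User-Agent header that was actually sent.
        self.logger.info(response.request.headers.get('User-Agent'))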