Example #1
# Scrapy settings for topical-spiders project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#
from scrapy.settings.default_settings import SPIDER_MIDDLEWARES, DOWNLOADER_MIDDLEWARES

BOT_NAME = 'topical-spiders'

SPIDER_MODULES = ['topical_spiders.spiders']
NEWSPIDER_MODULE = 'topical_spiders.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'topic (+http://www.yourdomain.com)'

SPIDER_MIDDLEWARES.update({
    'crawlfrontier.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000,
})

DOWNLOADER_MIDDLEWARES.update({
    'crawlfrontier.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 1000,
})

FRONTIER_SETTINGS = 'frontier.settings'

SCHEDULER = 'crawlfrontier.contrib.scrapy.schedulers.frontier.CrawlFrontierScheduler'
SPIDER_MIDDLEWARES.update({
    'crawlfrontier.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1,
    'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': None,
    'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': None,
    'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': None,
    'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': None,
})
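FRONTIER_SETTINGS points Crawl Frontier at a separate settings module for the frontier itself, which this example does not show. A minimal sketch of what frontier.settings could contain follows; the in-memory FIFO backend and the request limits are assumptions for illustration, not taken from the original project.

# frontier.settings -- minimal sketch (backend and limits are assumptions)
BACKEND = 'crawlfrontier.contrib.backends.memory.FIFO'
MAX_REQUESTS = 0           # 0 = no overall limit on crawled requests
MAX_NEXT_REQUESTS = 256    # batch size handed to the Scrapy scheduler per call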
Example #2
from scrapy.settings.default_settings import SPIDER_MIDDLEWARES

DEFAULT_REQUEST_HEADERS = {
    'Accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

ITEM_PIPELINES = {
    'scrapy_spider.pipelines.MySQLStorePipeline': 300,
}

SPIDER_MIDDLEWARES.update({
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000,
    'scrapy.spidermiddleware.depth.DepthMiddleware': None,
    'scrapy.spidermiddleware.offsite.OffsiteMiddleware': None,
    'scrapy.spidermiddleware.referer.RefererMiddleware': None,
    'scrapy.spidermiddleware.urllength.UrlLengthMiddleware': None
})

DOWNLOADER_MIDDLEWARES = {
    # Engine side
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    # Downloader side
}
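This excerpt registers the scrapy-splash downloader middlewares but shows none of the companion settings the scrapy-splash README asks for. If the project follows that README, the settings file would also carry something like the lines below; the Splash URL is the library's local default and is an assumption here.

SPLASH_URL = 'http://localhost:8050'   # address of the running Splash instance (assumed)
SPIDER_MIDDLEWARES.update({
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
})
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'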
Example #3
from scrapy.settings.default_settings import SPIDER_MIDDLEWARES

# Don't obey robots.txt rules
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

ITEM_PIPELINES = {
    'scrapy_spider.pipelines.MySQLStorePipeline': 300,
}

SPIDER_MIDDLEWARES.update({
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000,
    'scrapy.spidermiddleware.depth.DepthMiddleware': None,
    'scrapy.spidermiddleware.offsite.OffsiteMiddleware': None,
    'scrapy.spidermiddleware.referer.RefererMiddleware': None,
    'scrapy.spidermiddleware.urllength.UrlLengthMiddleware': None
})

DOWNLOADER_MIDDLEWARES = {
    # Engine side
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    # Downloader side
}

DOWNLOADER_MIDDLEWARES.update({
    # Frontera
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 1000,
})
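Both of the previous excerpts route items through scrapy_spider.pipelines.MySQLStorePipeline, which is not shown. A hypothetical, minimal version of such a pipeline is sketched below; the PyMySQL driver, the connection credentials, and the pages(url, title) table are all invented for illustration, not taken from the original project.

# pipelines.py -- hypothetical minimal MySQLStorePipeline
import pymysql


class MySQLStorePipeline(object):
    """Writes one database row per scraped item."""

    def open_spider(self, spider):
        # Credentials and database name are placeholders.
        self.conn = pymysql.connect(host='localhost', user='scrapy',
                                    password='scrapy', db='scrapy',
                                    charset='utf8mb4')

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        with self.conn.cursor() as cursor:
            cursor.execute('INSERT INTO pages (url, title) VALUES (%s, %s)',
                           (item.get('url'), item.get('title')))
        self.conn.commit()
        return item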
Example #4
# -*- coding: utf-8 -*-
from scrapy.settings.default_settings import SPIDER_MIDDLEWARES, DOWNLOADER_MIDDLEWARES

FRONTERA_SETTINGS = 'bc.config.spider'

SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'
SPIDER_MIDDLEWARES.update({
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 999,
    'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1,
})
DOWNLOADER_MIDDLEWARES.update({
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 999,
})

BOT_NAME = 'bc'

SPIDER_MODULES = ['bc.spiders']
NEWSPIDER_MODULE = 'bc.spiders'

CONCURRENT_REQUESTS = 256
CONCURRENT_REQUESTS_PER_DOMAIN = 1

DOWNLOAD_DELAY = 0.0
DOWNLOAD_TIMEOUT = 180
RANDOMIZE_DOWNLOAD_DELAY = False

REACTOR_THREADPOOL_MAXSIZE = 30
DNS_TIMEOUT = 120

COOKIES_ENABLED = False
RETRY_ENABLED = False
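This example enables Frontera's FileSeedLoader but does not show where the seed URLs come from. The loader reads them from the file named by the SEEDS_SOURCE setting, one URL per line, with lines starting with '#' ignored; the path below is an assumption.

SEEDS_SOURCE = 'seeds.txt'   # plain-text file with one seed URL per line (assumed path)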
Example #5
# -*- coding: utf-8 -*-
from scrapy.settings.default_settings import SPIDER_MIDDLEWARES, DOWNLOADER_MIDDLEWARES

FRONTERA_SETTINGS = 'bc.config.spider'

SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'
SPIDER_MIDDLEWARES.update({
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 999,
    'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1,
})
DOWNLOADER_MIDDLEWARES.update({
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 999,
})

BOT_NAME = 'bc'

SPIDER_MODULES = ['bc.spiders']
NEWSPIDER_MODULE = 'bc.spiders'

CONCURRENT_REQUESTS = 256
CONCURRENT_REQUESTS_PER_DOMAIN = 1

DOWNLOAD_DELAY = 0.0
DOWNLOAD_TIMEOUT = 180
RANDOMIZE_DOWNLOAD_DELAY = False

REACTOR_THREADPOOL_MAXSIZE = 30
DNS_TIMEOUT = 120