Example #1
    def login(self, response):
        print("logging in...")
        return FormRequest.from_response(
            response,
            formdata={'username': Settings().get('MUUSERNAME'),
                      'userpass': Settings().get('MUPASSWORD')},
            callback=self.after_login,
            # Don't forget dont_filter; without it, after_login will not be called.
            dont_filter=True,
            meta=response.meta,
        )
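The comment above refers to an `after_login` callback that is not shown; a minimal sketch of what it could look like, assuming a recent enough Scrapy for `response.follow` and a site-specific failure marker (both the marker string and the follow-up URL below are placeholders):

    def after_login(self, response):
        # The failure marker is purely illustrative; adapt it to the target site.
        if b"authentication failed" in response.body:
            self.logger.error("Login failed")
            return
        # Logged in: continue the crawl, reusing the meta carried over from login().
        return response.follow('/members', callback=self.parse, meta=response.meta)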
Example #2
    def instancialize_database(self):
        if self.dbase is None:
            dbname = Settings().get('DBNAME')
            dbuser = Settings().get('DBUSERNAME')
            dbpass = Settings().get('DBPASSWORD')
            dbhost = Settings().get('DBHOST')
            dbport = Settings().get('DBPORT')

            self.dbase = database.Database(dbname, dbuser, dbpass, dbhost, dbport)
            if not self.dbase.connect():
                self.dbase = None
                raise SystemExit
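A typical call site for this helper, assuming it lives on an item pipeline; the pipeline methods below are only a sketch and the persistence call is a hypothetical method of the custom `database.Database` wrapper:

    def open_spider(self, spider):
        # Connect once when the crawl starts; instancialize_database()
        # raises SystemExit if the connection cannot be established.
        self.instancialize_database()

    def process_item(self, item, spider):
        self.instancialize_database()  # no-op if already connected
        # Persist the item through the custom wrapper (hypothetical API).
        self.dbase.insert_item(dict(item))
        return item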
Example #3
def shub_conn():
    # Don't pass a default to `.get()` here, because that would evaluate
    # `settings.SH_API_KEY` anyway, and you might not have set it up locally.
    api_key = os.environ.get('SH_API_KEY') or Settings().get('SH_API_KEY')

    # NOTE: not really safe when the API key doesn't exist
    return ScrapinghubClient(api_key)
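A short usage sketch with python-scrapinghub; the project ID below is a placeholder:

client = shub_conn()
project = client.get_project(12345)  # placeholder project ID

# List the keys of the ten most recent finished jobs in the project
for job in project.jobs.iter(state='finished', count=10):
    print(job['key'])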
Example #4
def global_settings(namespace='project_settings'):
    """Unify Scrapy and Scrapinghub settings into one entrypoint.

    See the following for namespaces of Scrapinghub settings:
    http://shub.readthedocs.io/en/stable/custom-images-contract.html#shub-settings

    As a side effect, also try to make it work locally in order to have a smooth experience
    going back and forth between development and playground/production.

    NOTE 1: It turns out that shub doubly escapes settings before putting them
    into env['SHUB_SETTINGS']. This means that a '\n' in a setting will be
    stored as '\\\\n' in 'SHUB_SETTINGS'. json.loads does a single unescape,
    so we need to unescape a second time.

    NOTE 2: A good way of not having to update every file each time we change
    how we import settings is to use it this way:

            from kp_scrapers.lib.services.shub import global_settings as Settings

    """
    raw_shub_settings = (
        os.environ.get('SHUB_SETTINGS', '{}').encode('utf-8').decode('unicode_escape')
    )
    # Default to an empty dict so the merge below also works when the
    # namespace is absent (e.g. when running locally without shub).
    shub_settings = json.loads(raw_shub_settings).get(namespace, {})
    scrapy_settings = Settings().copy()
    # merge them
    scrapy_settings.update(shub_settings)

    return scrapy_settings
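Following NOTE 2 above, callers import the function under the `Settings` alias and read the merged values through the usual Scrapy settings API, for example:

from kp_scrapers.lib.services.shub import global_settings as Settings

settings = Settings()
# Scrapy project settings and the Scrapinghub 'project_settings' namespace are merged
print(settings.get('BOT_NAME'))
print(settings.getbool('HTTPCACHE_ENABLED'))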
Example #5
def run_single_spider(spider_name, cin):
    settings = Settings()

    process = CrawlerProcess(settings)

    spider_loader = spiderloader.SpiderLoader.from_settings(settings)

    try:
        # SpiderLoader.load() only resolves the spider class by name; spider
        # arguments such as `cin` are passed through CrawlerProcess.crawl().
        spider_object = spider_loader.load(spider_name)

        process.crawl(spider_object, cin=cin)

        process.start()

        # post_execution = SpiderPostExecution()
        # post_execution.engine_stopped([spider_object])
    except Exception:
        raise
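One way to invoke it, for instance from a small command-line wrapper; the script name and argument order are hypothetical:

if __name__ == '__main__':
    import sys

    # e.g. python run_one.py my_spider 1234567
    run_single_spider(sys.argv[1], sys.argv[2])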
Example #6
def run_spiders():
    settings = Settings()

    process = CrawlerProcess(settings)

    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    spiders = spider_loader.list()
    spider_objects = [spider_loader.load(name) for name in spiders]

    exec_report.number_of_scrapers = len(spider_objects)

    for spider in spider_objects:
        process.crawl(spider)

    process.start()

    post_execution = SpiderPostExecution()
    post_execution.engine_stopped(spider_objects)
Example #7
    def __init__(self, useCache=True):

        # Initialize the required resources
        # Scrapy needs to run inside twisted reactor -- Start the process
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        self.settings = {
            'DOWNLOAD_DELAY': 3,
            'CONCURRENT_REQUESTS': 20,
            'ROBOTSTXT_OBEY': False,
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
            'AUTOTHROTTLE_ENABLED': True,
            'HTTPCACHE_ENABLED': False,  # disabled here; enable the cache only for testing
            'HTTPCACHE_EXPIRATION_SECS': 0,
            'TELNETCONSOLE_PORT': None,
            'RETRY_ENABLED': False,
            'REDIRECT_ENABLED': False,
            'COOKIES_ENABLED': False,
            'REACTOR_THREADPOOL_MAXSIZE': 20,
            'DOWNLOAD_TIMEOUT': 30,  # To avoid loss of entries?
            # Retry many times since proxies often fail
            'RETRY_TIMES': 10,
            # Retry on most error codes since proxies fail for different reasons
            'RETRY_HTTP_CODES': [500, 503, 504, 400, 403, 404, 408],
            'DOWNLOADER_MIDDLEWARES': {
                'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
                'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
                'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 610,
                'random_useragent.RandomUserAgentMiddleware': 400,
                'rotating_proxies.middlewares.RotatingProxyMiddleware': 110,
                'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
            },
            'PROXY_LIST': PROXY_PATH,
            'PROXY_MODE': 0,
            'USER_AGENT_LIST': USER_PATH,
        }
        self.crawlRunner = CrawlerRunner(Settings(self.settings))
Example #8
    def runSpider(symbol):

        # Scrapy needs to run inside twisted reactor -- Start the process
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        settings = Settings({
            'ITEM_PIPELINES': {
                'src.server.Crawler.JSONPipeline.JSONPipeline': 100,
                'src.server.Crawler.RedisPipeline.RedisPipeline': 200
            },
            'DOWNLOAD_DELAY': 2,
            'ROBOTSTXT_OBEY': True,
            'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
            'AUTOTHROTTLE_ENABLED': True,
            'HTTPCACHE_ENABLED': False
        })

        runner = CrawlerRunner(settings=settings)

        d = runner.crawl(NASDAQSpider, symbol=symbol)
        d.addBoth(lambda _: reactor.stop())  # Callback to stop the reactor
        reactor.run()  # the script will block here until the crawling is finished
Example #9
# -*- coding: utf-8 -*-
import scrapy
import os
import json
from ..items import get_item_and_loader

from scrapy.utils.project import get_project_settings as Settings
settings = Settings()


class BibleSpider(scrapy.Spider):
    name = "bible"
    allowed_domains = ["www.audiobible.com"]
    start_urls = []

    def __init__(self,
                 data_store=settings.get('DATA_STORE'),
                 content_file=settings.get('CONTENT_FILE')):
        if data_store and content_file:
            p = os.path.join(data_store, content_file)
            if os.path.exists(p) and os.stat(p).st_size > 0:
                with open(p) as f:
                    for line in f:
                        data = json.loads(line)
                        if data['urls']:
                            for url in data['urls']:
                                self.start_urls.append(url)
            else:
                self.start_urls.append(
                    'http://www.audiobible.com/bible/bible.html')
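Since `data_store` and `content_file` are ordinary keyword arguments, the spider can also be driven programmatically and the defaults overridden; the paths below are placeholders:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings)
# Both keyword arguments are forwarded to BibleSpider.__init__
process.crawl(BibleSpider, data_store='/tmp/bible', content_file='content.jl')
process.start()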
Example #10
# -*- coding: utf-8 -*-

from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request
from scrapy.selector import Selector
from decouple import config
import re
import json
import requests
from urlparse import urljoin
from scrapy.utils.project import get_project_settings as Settings

STGS = Settings()

CITY_FROM = 'REC'
CITY_TO = 'RIO'
NUMBER_ADULTS = 2
DATE_GO = '2014-09-20'
DATE_BACK = '2014-09-25'


class JSCallError(Exception):
    def __init__(self, message):
        super(JSCallError, self).__init__(message)
        self.message = u'Error in the function call.'


def response_to_file(name, response):
    with open(name, 'wb') as f:
        f.write(response.body)
Example #11
 runner = CrawlerRunner(settings=Settings({
     'DOWNLOAD_DELAY': 3,
     'CONCURRENT_REQUESTS': 20,
     'ROBOTSTXT_OBEY': False,
     'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
     'AUTOTHROTTLE_ENABLED': True,
     'HTTPCACHE_ENABLED': False,  # disabled here; enable the cache only for testing
     'HTTPCACHE_EXPIRATION_SECS': 0,
     'TELNETCONSOLE_PORT': None,
     'RETRY_ENABLED': False,
     'REDIRECT_ENABLED': True,
     'COOKIES_ENABLED': False,
     'REACTOR_THREADPOOL_MAXSIZE': 20,
     'DOWNLOAD_TIMEOUT': 30,  # To avoid loss of entries?
     # Retry many times since proxies often fail
     'RETRY_TIMES': 10,
     # Retry on most error codes since proxies fail for different reasons
     'RETRY_HTTP_CODES': [500, 503, 504, 400, 403, 404, 408],
     'DOWNLOADER_MIDDLEWARES': {
         'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
         'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
         'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 610,
         'random_useragent.RandomUserAgentMiddleware': 400,
         'rotating_proxies.middlewares.RotatingProxyMiddleware': 110,
         'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
     },
     'PROXY_LIST': PROXY_PATH,
     'PROXY_MODE': 0,
     'USER_AGENT_LIST': USER_PATH,
 }))