Example #1
0
 def _print_setting(self, opts):
     if opts.get:
         print settings_.get(opts.get)
     elif opts.getbool:
         print settings_.getbool(opts.getbool)
     elif opts.getint:
         print settings_.getint(opts.getint)
     elif opts.getfloat:
         print settings_.getfloat(opts.getfloat)
     elif opts.getlist:
         print settings_.getlist(opts.getlist)
Example #2
0
    def __init__(self):
        """Set up memory-usage monitoring; disabled unless MEMUSAGE_ENABLED
        is set and a procfs is available."""
        if not settings.getbool('MEMUSAGE_ENABLED'):
            raise NotConfigured
        if not procfs_supported():
            raise NotConfigured

        mb = 1024 * 1024
        self.warned = False
        self.notify_mails = settings.getlist('MEMUSAGE_NOTIFY')
        # limits are configured in megabytes but tracked in bytes
        self.limit = settings.getint('MEMUSAGE_LIMIT_MB') * mb
        self.warning = settings.getint('MEMUSAGE_WARNING_MB') * mb
        self.report = settings.getbool('MEMUSAGE_REPORT')
        self.mail = MailSender()
        for handler, sig in ((self.engine_started, signals.engine_started),
                             (self.engine_stopped, signals.engine_stopped)):
            dispatcher.connect(handler, signal=sig)
Example #3
0
    def __init__(self):
        """Memory-usage extension; disabled unless MEMUSAGE_ENABLED is set
        and /proc exists (so only on procfs platforms)."""
        if not settings.getbool('MEMUSAGE_ENABLED'):
            raise NotConfigured
        if not os.path.exists('/proc'):
            raise NotConfigured

        self.warned = False
        self.notify_mails = settings.getlist('MEMUSAGE_NOTIFY')
        # settings are in megabytes; convert to bytes for comparisons
        self.limit = settings.getint('MEMUSAGE_LIMIT_MB')*1024*1024
        self.warning = settings.getint('MEMUSAGE_WARNING_MB')*1024*1024
        self.report = settings.getbool('MEMUSAGE_REPORT')
        self.mail = MailSender()
        dispatcher.connect(self.engine_started, signal=signals.engine_started)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
Example #4
0
    def __init__(self):
        """Memory-usage extension; disabled unless MEMUSAGE_ENABLED is set
        and procfs is supported on this platform."""
        if not settings.getbool("MEMUSAGE_ENABLED"):
            raise NotConfigured
        if not procfs_supported():
            raise NotConfigured

        self.warned = False
        self.notify_mails = settings.getlist("MEMUSAGE_NOTIFY")
        # settings are in megabytes; convert to bytes for comparisons
        self.limit = settings.getint("MEMUSAGE_LIMIT_MB") * 1024 * 1024
        self.warning = settings.getint("MEMUSAGE_WARNING_MB") * 1024 * 1024
        self.report = settings.getbool("MEMUSAGE_REPORT")
        self.mail = MailSender()
        dispatcher.connect(self.engine_started, signal=signals.engine_started)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
Example #5
0
 def __init__(self):
     """Read the REDIRECT_* settings; disabled unless REDIRECT_ENABLED."""
     if not settings.getbool('REDIRECT_ENABLED'):
         raise NotConfigured
     getint = settings.getint
     self.max_metarefresh_delay = getint('REDIRECT_MAX_METAREFRESH_DELAY')
     self.max_redirect_times = getint('REDIRECT_MAX_TIMES')
     self.priority_adjust = getint('REDIRECT_PRIORITY_ADJUST')
Example #6
0
 def __init__(self):
     """Telnet console factory; listens on TELNETCONSOLE_PORT once the
     reactor is running. Disabled unless TELNETCONSOLE_ENABLED."""
     if not settings.getbool('TELNETCONSOLE_ENABLED'):
         raise NotConfigured
     self.protocol = makeProtocol
     self.noisy = False
     reactor.callWhenRunning(reactor.listenTCP,
                             settings.getint('TELNETCONSOLE_PORT'), self)
Example #7
0
 def get_exporter_and_file(self):
     """Build the item exporter selected by EXPORT_FORMAT, writing to
     EXPORT_FILE.

     Returns an (exporter, file) pair; the caller is responsible for
     closing the file. Raises NotConfigured when the format or filename
     is unset, or when the format is not supported.
     """
     format = settings['EXPORT_FORMAT']
     filename = settings['EXPORT_FILE']
     if not format or not filename:
         raise NotConfigured
     exp_kwargs = {
         'fields_to_export': settings.getlist('EXPORT_FIELDS') or None,
         'export_empty_fields': settings.getbool('EXPORT_EMPTY', False),
         'encoding': settings.get('EXPORT_ENCODING', 'utf-8'),
     }
     # validate the format before opening the file, so an unsupported
     # format no longer leaks an open (and truncated) file handle
     supported = ('xml', 'csv', 'csv_headers', 'pprint', 'pickle',
                  'json', 'jsonlines')
     if format not in supported:
         raise NotConfigured("Unsupported export format: %s" % format)
     file = open(filename, 'wb')
     if format == 'xml':
         exp = exporter.XmlItemExporter(file, **exp_kwargs)
     elif format == 'csv':
         exp = exporter.CsvItemExporter(file, **exp_kwargs)
     elif format == 'csv_headers':
         exp = exporter.CsvItemExporter(file, include_headers_line=True,
                                        **exp_kwargs)
     elif format == 'pprint':
         exp = exporter.PprintItemExporter(file, **exp_kwargs)
     elif format == 'pickle':
         exp = exporter.PickleItemExporter(file, **exp_kwargs)
     elif format == 'json':
         # bug fix: 'json' previously built JsonLinesItemExporter and
         # 'jsonlines' built JsonItemExporter — the two were swapped
         exp = exporter.JsonItemExporter(file, **exp_kwargs)
     else:  # 'jsonlines'
         exp = exporter.JsonLinesItemExporter(file, **exp_kwargs)
     return exp, file
Example #8
0
 def __init__(self, settings=settings):
     """Read the HTTPCACHE_IGNORE_* settings that control cache skipping."""
     self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING', False)
     self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES', ['file'])
     codes = settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES', [])
     # list comprehension matches Python 2 map(), which returns a list
     self.ignore_http_codes = [int(code) for code in codes]
Example #9
0
 def __init__(self):
     """Telnet console bound to a host/port-range from settings; disabled
     unless TELNETCONSOLE_ENABLED."""
     if not settings.getbool('TELNETCONSOLE_ENABLED'):
         raise NotConfigured
     self.noisy = False
     # NOTE(review): map() assumed to return a list here (Python 2)
     self.portrange = map(int, settings.getlist('TELNETCONSOLE_PORT'))
     self.host = settings['TELNETCONSOLE_HOST']
     dispatcher.connect(self.start_listening, signals.engine_started)
     dispatcher.connect(self.stop_listening, signals.engine_stopped)
Example #10
0
def start(logfile=None, loglevel=None, logstdout=None):
    """Initialize and start logging facility"""
    global log_level, started

    # no-op when already started or logging is disabled
    if started or not settings.getbool('LOG_ENABLED'):
        return
    log_level = _get_log_level(loglevel)
    started = True

    # set log observer
    if log.defaultObserver: # check twisted log not already started
        # LOGFILE is presumably a legacy alias for LOG_FILE — TODO confirm
        logfile = logfile or settings['LOG_FILE'] or settings['LOGFILE']
        if logstdout is None:
            logstdout = settings.getbool('LOG_STDOUT')

        # log file (or stderr) stays open for the process lifetime
        file = open(logfile, 'a') if logfile else sys.stderr
        log.startLogging(file, setStdout=logstdout)
Example #11
0
 def __init__(self):
     """Auto-throttle extension; hooks spider and response signals.
     Disabled unless AUTOTHROTTLE_ENABLED."""
     if not settings.getbool('AUTOTHROTTLE_ENABLED'):
         raise NotConfigured
     for receiver, sig in ((self.spider_opened, signals.spider_opened),
                           (self.spider_closed, signals.spider_closed),
                           (self.response_received, signals.response_received)):
         dispatcher.connect(receiver, signal=sig)
     self.last_latencies = {}
     self.last_lat = {}
Example #12
0
 def __init__(self):
     """Web console site listening on WEBCONSOLE_PORT; disabled unless
     WEBCONSOLE_ENABLED."""
     if not settings.getbool('WEBCONSOLE_ENABLED'):
         raise NotConfigured
     server.Site.__init__(self, WebConsoleResource(),
                          logPath=settings['WEBCONSOLE_LOGFILE'])
     self.noisy = False
     reactor.callWhenRunning(reactor.listenTCP,
                             settings.getint('WEBCONSOLE_PORT'), self)
Example #13
0
    def __init__(self):
        """Robots.txt middleware state; disabled unless ROBOTSTXT_OBEY."""
        if not settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured

        # per-spider caches: parsers, visited netlocs, user agents
        self._parsers, self._spider_netlocs, self._useragents = {}, {}, {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #14
0
 def create_report(self, figures):
     """Build the memory-debugger report text from (label, value, unit)
     tuples, appending live-reference info when TRACK_REFS is on."""
     parts = ["SCRAPY MEMORY DEBUGGER RESULTS\n\n"]
     parts.extend("%-30s : %d %s\n" % figure for figure in figures)
     report = "".join(parts)
     if settings.getbool('TRACK_REFS'):
         report += os.linesep
         report += format_live_refs()
     return report
Example #15
0
    def __init__(self):
        """Robots.txt middleware state; disabled unless ROBOTSTXT_OBEY."""
        if not settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured

        # per-spider caches: parsers, visited netlocs, user agents
        self._parsers = {}
        self._spider_netlocs = {}
        self._useragents = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #16
0
    def __init__(self):
        """Memory-debug extension; mails MEMDEBUG_NOTIFY recipients.
        Disabled unless MEMDEBUG_ENABLED."""
        if not settings.getbool('MEMDEBUG_ENABLED'):
            raise NotConfigured

        self.mail = MailSender()
        self.rcpts = settings.getlist('MEMDEBUG_NOTIFY')

        dispatcher.connect(self.engine_started, signals.engine_started)
        dispatcher.connect(self.engine_stopped, signals.engine_stopped)
Example #17
0
class CookiesMiddleware(object):
    """This middleware enables working with sites that need cookies"""
    # class-level flag, read once at class-definition time
    debug = settings.getbool('COOKIES_DEBUG')

    def __init__(self):
        if not settings.getbool('COOKIES_ENABLED'):
            raise NotConfigured
        # one CookieJar per request.meta "cookiejar" key (None = default jar)
        self.jars = defaultdict(CookieJar)

    def process_request(self, request, spider):
        """Add the jar's stored cookies to the outgoing request."""
        if 'dont_merge_cookies' in request.meta:
            return

        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]
        cookies = self._get_request_cookies(jar, request)
        for cookie in cookies:
            jar.set_cookie_if_ok(cookie, request)

        # set Cookie header
        request.headers.pop('Cookie', None)
        jar.add_cookie_header(request)
        self._debug_cookie(request, spider)

    def process_response(self, request, response, spider):
        """Store cookies set by the response, then pass the response on."""
        if 'dont_merge_cookies' in request.meta:
            return response

        # extract cookies from Set-Cookie and drop invalid/expired cookies
        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]
        jar.extract_cookies(response, request)
        self._debug_set_cookie(response, spider)

        return response

    def _debug_cookie(self, request, spider):
        """Log outgoing Cookie headers when COOKIES_DEBUG is on."""
        if self.debug:
            cl = request.headers.getlist('Cookie')
            if cl:
                msg = "Sending cookies to: %s" % request + os.linesep
                msg += os.linesep.join("Cookie: %s" % c for c in cl)
                log.msg(msg, spider=spider, level=log.DEBUG)

    def _debug_set_cookie(self, response, spider):
        """Log incoming Set-Cookie headers when COOKIES_DEBUG is on."""
        if self.debug:
            cl = response.headers.getlist('Set-Cookie')
            if cl:
                msg = "Received cookies from: %s" % response + os.linesep
                msg += os.linesep.join("Set-Cookie: %s" % c for c in cl)
                log.msg(msg, spider=spider, level=log.DEBUG)

    def _get_request_cookies(self, jar, request):
        """Convert request.cookies into cookie objects by faking a response
        carrying equivalent Set-Cookie headers."""
        headers = {'Set-Cookie': ['%s=%s;' % (k, v) for k, v in request.cookies.iteritems()]}
        response = Response(request.url, headers=headers)
        cookies = jar.make_cookies(response, request)
        return cookies
Example #18
0
 def __init__(self):
     """Spider profiler; memory tracking only when procfs is readable.
     Disabled unless SPIDERPROFILER_ENABLED."""
     if not settings.getbool('SPIDERPROFILER_ENABLED'):
         raise NotConfigured
     mem_tracking = True
     try:
         get_vmvalue_from_procfs('VmSize')
     except RuntimeError:
         # procfs unavailable on this platform: skip memory figures
         mem_tracking = False
     self._mem_tracking = mem_tracking
     dispatcher.connect(self._request_received, signals.request_received)
Example #19
0
    def __init__(self, stats, settings=settings):
        """S3 cache-storage configuration: AWS credentials, bucket, options."""
        # Required settings
        for attr, key in (('S3_ACCESS_KEY', 'AWS_ACCESS_KEY_ID'),
                          ('S3_SECRET_KEY', 'AWS_SECRET_ACCESS_KEY'),
                          ('S3_CACHE_BUCKET', 'HISTORY_S3_BUCKET')):
            setattr(self, attr, settings.get(key))

        # Optional settings
        self.use_proxy = settings.getbool('HISTORY_USE_PROXY', True)
        self.SAVE_SOURCE = settings.get('HISTORY_SAVE_SOURCE')
        self.stats = stats
Example #20
0
    def _response_downloaded(self, response, callback, cb_kwargs, follow):
        """Generator yielding callback results and, optionally, follow-up
        requests extracted from the response."""
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        # link-following can be disabled globally via the setting
        if follow and settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True):
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item
Example #21
0
def start(logfile=None, loglevel=None, logstdout=None):
    """Start the Scrapy logging facility (idempotent; needs LOG_ENABLED)."""
    global started
    if started or not settings.getbool('LOG_ENABLED'):
        return
    started = True

    if log.defaultObserver: # check twisted log not already started
        loglevel = _get_log_level(loglevel)
        logfile = logfile or settings['LOG_FILE']
        # log file (or stderr) stays open for the process lifetime
        file = open(logfile, 'a') if logfile else sys.stderr
        if logstdout is None:
            logstdout = settings.getbool('LOG_STDOUT')
        sflo = ScrapyFileLogObserver(file, loglevel, settings['LOG_ENCODING'])
        _oldshowwarning = warnings.showwarning
        log.startLoggingWithObserver(sflo.emit, setStdout=logstdout)
        # restore warnings, wrongly silenced by Twisted
        warnings.showwarning = _oldshowwarning
        msg("Scrapy %s started (bot: %s)" % (scrapy.__version__, \
            settings['BOT_NAME']))
Example #22
0
File: log.py Project: errord/scrapy
def start(logfile=None, loglevel=None, logstdout=None):
    """Start the Scrapy logging facility (idempotent; needs LOG_ENABLED)."""
    global started
    if started or not settings.getbool('LOG_ENABLED'):
        return
    started = True

    if log.defaultObserver:  # check twisted log not already started
        loglevel = _get_log_level(loglevel)
        logfile = logfile or settings['LOG_FILE']
        # log file (or stderr) stays open for the process lifetime
        file = open(logfile, 'a') if logfile else sys.stderr
        if logstdout is None:
            logstdout = settings.getbool('LOG_STDOUT')
        sflo = ScrapyFileLogObserver(file, loglevel, settings['LOG_ENCODING'])
        _oldshowwarning = warnings.showwarning
        log.startLoggingWithObserver(sflo.emit, setStdout=logstdout)
        # restore warnings, wrongly silenced by Twisted
        warnings.showwarning = _oldshowwarning
        msg("Scrapy %s started (bot: %s)" % (scrapy.__version__, \
            settings['BOT_NAME']))
Example #23
0
    def _response_downloaded(self, response, callback, cb_kwargs, follow):
        """Generator yielding callback results and, optionally, follow-up
        requests extracted from the response."""
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        # link-following can be disabled globally via the setting
        if follow and settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True):
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item
Example #24
0
    def __init__(self):
        """SimpleDB stats collector: reads AWS credentials and ensures the
        configured stats domain exists."""
        super(SimpledbStatsCollector, self).__init__()
        self._sdbdomain = settings['STATS_SDB_DOMAIN']
        self._access_key = settings['AWS_ACCESS_KEY_ID']
        self._secret_key = settings['AWS_SECRET_ACCESS_KEY']

        self._async = settings.getbool('STATS_SDB_ASYNC')
        # deferred import so boto is only required when this collector is used
        import boto
        self.connect_sdb = boto.connect_sdb
        conn = self.connect_sdb(aws_access_key_id=self._access_key,
                                aws_secret_access_key=self._secret_key)
        conn.create_domain(self._sdbdomain)
Example #25
0
    def __init__(self):
        """Memory-debug extension; tracks libxml2 leaks when available.
        Disabled unless MEMDEBUG_ENABLED."""
        try:
            import libxml2
        except ImportError:
            # keep running without libxml2 leak figures
            self.libxml2 = None
        else:
            self.libxml2 = libxml2
        if not settings.getbool('MEMDEBUG_ENABLED'):
            raise NotConfigured

        dispatcher.connect(self.engine_started, signals.engine_started)
        dispatcher.connect(self.engine_stopped, signals.engine_stopped)
Example #26
0
 def engine_stopped(self):
     """Record memory-debug stats when the engine stops: libxml2 leaks,
     gc garbage, and (optionally) live object references."""
     if self.libxml2:
         self.libxml2.cleanupParser()
         stats.set_value('memdebug/libxml2_leaked_bytes', self.libxml2.debugMemory(1))
     gc.collect()
     stats.set_value('memdebug/gc_garbage_count', len(gc.garbage))
     if settings.getbool('TRACK_REFS'):
         for cls, wdict in live_refs.iteritems():
             # skip classes with no live instances
             if not wdict:
                 continue
             stats.set_value('memdebug/live_refs/%s' % cls.__name__, len(wdict))
Example #27
0
    def __init__(self):
        """Memory-debug extension; tracks libxml2 leaks when available.
        Disabled unless MEMDEBUG_ENABLED."""
        try:
            import libxml2
            self.libxml2 = libxml2
        except ImportError:
            # keep running without libxml2 leak figures
            self.libxml2 = None
        if not settings.getbool('MEMDEBUG_ENABLED'):
            raise NotConfigured

        dispatcher.connect(self.engine_started, signals.engine_started)
        dispatcher.connect(self.engine_stopped, signals.engine_stopped)
Example #28
0
    def __init__(self):
        """SimpleDB stats collector: reads AWS credentials and ensures the
        configured stats domain exists."""
        super(SimpledbStatsCollector, self).__init__()
        self._sdbdomain = settings['STATS_SDB_DOMAIN']
        self._access_key = settings['AWS_ACCESS_KEY_ID']
        self._secret_key = settings['AWS_SECRET_ACCESS_KEY']

        self._async = settings.getbool('STATS_SDB_ASYNC')
        # deferred import so boto is only required when this collector is used
        import boto
        self.connect_sdb = boto.connect_sdb
        self.connect_sdb(aws_access_key_id=self._access_key,
                         aws_secret_access_key=self._secret_key).create_domain(
                             self._sdbdomain)
Example #29
0
    def __init__(self):
        """Apply per-group setting overrides when crawling a single domain.
        Disabled unless GROUPSETTINGS_ENABLED."""
        if not settings.getbool("GROUPSETTINGS_ENABLED"):
            raise NotConfigured

        if command_executed and command_executed["name"] == "crawl":
            # import the module declaring default/group settings by name
            mod = __import__(settings["GROUPSETTINGS_MODULE"], {}, {}, [""])
            args = command_executed["args"]
            # NOTE(review): only "http://" is checked, so an https URL would
            # be treated as a domain name — confirm intended
            if len(args) == 1 and not args[0].startswith("http://"):
                domain = args[0]
                settings.overrides.update(mod.default_settings)
                for group, domains in mod.group_spiders.iteritems():
                    if domain in domains:
                        settings.overrides.update(mod.group_settings.get(group, {}))
Example #30
0
 def engine_stopped(self):
     """Record memory-debug stats when the engine stops: libxml2 leaks,
     gc garbage, and (optionally) live object references."""
     if self.libxml2:
         self.libxml2.cleanupParser()
         stats.set_value('memdebug/libxml2_leaked_bytes',
                         self.libxml2.debugMemory(1))
     gc.collect()
     stats.set_value('memdebug/gc_garbage_count', len(gc.garbage))
     if settings.getbool('TRACK_REFS'):
         for cls, wdict in live_refs.iteritems():
             # skip classes with no live instances
             if not wdict:
                 continue
             stats.set_value('memdebug/live_refs/%s' % cls.__name__,
                             len(wdict))
Example #31
0
    def __init__(self):
        """Memory-debug extension with mail notification; tracks libxml2
        leaks when available. Disabled unless MEMDEBUG_ENABLED."""
        try:
            import libxml2
            self.libxml2 = libxml2
        except ImportError:
            # keep running without libxml2 leak figures
            self.libxml2 = None
        if not settings.getbool('MEMDEBUG_ENABLED'):
            raise NotConfigured

        self.mail = MailSender()
        self.rcpts = settings.getlist('MEMDEBUG_NOTIFY')

        dispatcher.connect(self.engine_started, signals.engine_started)
        dispatcher.connect(self.engine_stopped, signals.engine_stopped)
Example #32
0
 def __init__(self):
     """Build the web-service resource tree and listen on WEBSERVICE_PORT.
     Disabled unless WEBSERVICE_ENABLED."""
     if not settings.getbool('WEBSERVICE_ENABLED'):
         raise NotConfigured
     logfile = settings['WEBSERVICE_LOGFILE']
     port = settings.getint('WEBSERVICE_PORT')
     root = RootResource()
     reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'],
                                    settings['WEBSERVICE_RESOURCES'])
     for res_path in reslist:
         res = load_object(res_path)()
         root.putChild(res.ws_name, res)
     server.Site.__init__(self, root, logPath=logfile)
     self.noisy = False
     reactor.callWhenRunning(reactor.listenTCP, port, self)
Example #33
0
    def _response_downloaded(self, response, callback, cb_kwargs, follow):
        """
        This is where any response arrives, and where it's decided whether
        to extract links or not from it, and if it will be parsed or not.
        It returns a list of requests/items.
        """
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        # link-following can be disabled globally via the setting
        if follow and settings.getbool("CRAWLSPIDER_FOLLOW_LINKS", True):
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item
Example #34
0
 def __init__(self, settings):
     """Configure feed export from the FEED_* settings; NotConfigured when
     the URI is unset or storage/exporter are unsupported."""
     self.settings = settings
     self.urifmt = settings['FEED_URI']
     if not self.urifmt:
         raise NotConfigured
     self.format = settings['FEED_FORMAT'].lower()
     self.storages = self._load_components('FEED_STORAGES')
     self.exporters = self._load_components('FEED_EXPORTERS')
     if not (self._storage_supported(self.urifmt)
             and self._exporter_supported(self.format)):
         raise NotConfigured
     self.store_empty = settings.getbool('FEED_STORE_EMPTY')
     uripar = settings['FEED_URI_PARAMS']
     if uripar:
         self._uripar = load_object(uripar)
     else:
         self._uripar = lambda x, y: None
Example #35
0
 def __init__(self, settings):
     """Configure feed export from the FEED_* settings; NotConfigured when
     the URI is unset or storage/exporter are unsupported."""
     self.settings = settings
     self.urifmt = settings['FEED_URI']
     if not self.urifmt:
         raise NotConfigured
     self.format = settings['FEED_FORMAT'].lower()
     self.storages = self._load_components('FEED_STORAGES')
     self.exporters = self._load_components('FEED_EXPORTERS')
     if not self._storage_supported(self.urifmt):
         raise NotConfigured
     if not self._exporter_supported(self.format):
         raise NotConfigured
     self.store_empty = settings.getbool('FEED_STORE_EMPTY')
     # optional callable customizing URI parameters; default is a no-op
     uripar = settings['FEED_URI_PARAMS']
     self._uripar = load_object(uripar) if uripar else lambda x, y: None
Example #36
0
 def __init__(self):
     """Web-service site bound to a host/port-range from settings.
     Disabled unless WEBSERVICE_ENABLED."""
     if not settings.getbool('WEBSERVICE_ENABLED'):
         raise NotConfigured
     logfile = settings['WEBSERVICE_LOGFILE']
     # NOTE(review): map() assumed to return a list here (Python 2)
     self.portrange = map(int, settings.getlist('WEBSERVICE_PORT'))
     self.host = settings['WEBSERVICE_HOST']
     root = RootResource()
     reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'], \
         settings['WEBSERVICE_RESOURCES'])
     for res_cls in map(load_object, reslist):
         res = res_cls()
         root.putChild(res.ws_name, res)
     server.Site.__init__(self, root, logPath=logfile)
     self.noisy = False
     dispatcher.connect(self.start_listening, signals.engine_started)
     dispatcher.connect(self.stop_listening, signals.engine_stopped)
Example #37
0
    def __init__(self):
        """Git-backed HTTP cache hooks; active only when GIT_CACHE_ENABLED."""
        if settings.getbool('GIT_CACHE_ENABLED'):
            cachedir = settings['HTTPCACHE_DIR']
            if os.path.exists(cachedir):
                self.work_tree = cachedir
            else:
                # fall back to the project's .scrapy directory
                self.work_tree = os.path.join(os.path.dirname(settings['PROJECT_ROOT']), '.scrapy', cachedir)

            # base git invocation pinned to the cache work tree
            self.basecmd = [
                'git',
                '--git-dir=%s' % os.path.join(self.work_tree, '.git'),
                '--work-tree=%s' % self.work_tree,
            ]

            dispatcher.connect(self.engine_started, signal=signals.engine_started)
            dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
Example #38
0
 def __init__(self):
     """Web-service site bound to a host/port-range from settings.
     Disabled unless WEBSERVICE_ENABLED."""
     if not settings.getbool('WEBSERVICE_ENABLED'):
         raise NotConfigured
     logfile = settings['WEBSERVICE_LOGFILE']
     # NOTE(review): map() assumed to return a list here (Python 2)
     self.portrange = map(int, settings.getlist('WEBSERVICE_PORT'))
     self.host = settings['WEBSERVICE_HOST']
     root = RootResource()
     reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'], \
         settings['WEBSERVICE_RESOURCES'])
     for res_cls in map(load_object, reslist):
         res = res_cls()
         root.putChild(res.ws_name, res)
     server.Site.__init__(self, root, logPath=logfile)
     self.noisy = False
     dispatcher.connect(self.start_listening, signals.engine_started)
     dispatcher.connect(self.stop_listening, signals.engine_stopped)
Example #39
0
 def __init__(self):
     """Feed exporter driven by FEED_* settings; hooks spider/item signals.
     NotConfigured when URI is unset or storage/exporter unsupported."""
     self.urifmt = settings['FEED_URI']
     if not self.urifmt:
         raise NotConfigured
     self.format = settings['FEED_FORMAT'].lower()
     self.storages = self._load_components('FEED_STORAGES')
     self.exporters = self._load_components('FEED_EXPORTERS')
     if not self._storage_supported(self.urifmt):
         raise NotConfigured
     if not self._exporter_supported(self.format):
         raise NotConfigured
     self.store_empty = settings.getbool('FEED_STORE_EMPTY')
     # optional callable customizing URI parameters; default is a no-op
     uripar = settings['FEED_URI_PARAMS']
     self._uripar = load_object(uripar) if uripar else lambda x, y: None
     # per-spider export slots
     self.slots = {}
     dispatcher.connect(self.open_spider, signals.spider_opened)
     dispatcher.connect(self.close_spider, signals.spider_closed)
     dispatcher.connect(self.item_passed, signals.item_passed)
Example #40
0
 def __init__(self):
     """Feed exporter driven by FEED_* settings; hooks spider/item signals.
     NotConfigured when URI is unset or storage/exporter unsupported."""
     self.urifmt = settings['FEED_URI']
     if not self.urifmt:
         raise NotConfigured
     self.format = settings['FEED_FORMAT'].lower()
     self.storages = self._load_components('FEED_STORAGES')
     self.exporters = self._load_components('FEED_EXPORTERS')
     if not self._storage_supported(self.urifmt):
         raise NotConfigured
     if not self._exporter_supported(self.format):
         raise NotConfigured
     self.store_empty = settings.getbool('FEED_STORE_EMPTY')
     # optional callable customizing URI parameters; default is a no-op
     uripar = settings['FEED_URI_PARAMS']
     self._uripar = load_object(uripar) if uripar else lambda x, y: None
     # per-spider export slots
     self.slots = {}
     dispatcher.connect(self.open_spider, signals.spider_opened)
     dispatcher.connect(self.close_spider, signals.spider_closed)
     dispatcher.connect(self.item_scraped, signals.item_scraped)
    def __init__(self, settings=settings):
        """History middleware: loads retrieve/store policies and the storage
        backend from the HISTORY setting; disabled when HISTORY is unset."""
        history = settings.get("HISTORY", None)
        if not history:
            raise NotConfigured()

        # EPOCH:
        #   == False: don't retrieve historical data
        #   == True : retrieve most recent version
        #   == datetime(): retrieve next version after datetime()
        self.epoch = self.parse_epoch(settings.get("EPOCH", False))

        # policies and backend are dotted paths resolved at startup
        self.retrieve_if = load_object(history.get("RETRIEVE_IF", "history.logic.RetrieveNever"))(settings)
        self.store_if = load_object(history.get("STORE_IF", "history.logic.StoreAlways"))(settings)
        self.storage = load_object(history.get("BACKEND", "history.storage.S3CacheStorage"))(settings)
        self.ignore_missing = settings.getbool("HTTPCACHE_IGNORE_MISSING")

        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
Example #42
0
    def __init__(self):
        """Git-backed HTTP cache hooks; active only when GIT_CACHE_ENABLED."""
        if settings.getbool('GIT_CACHE_ENABLED'):
            cachedir = settings['HTTPCACHE_DIR']
            if os.path.exists(cachedir):
                self.work_tree = cachedir
            else:
                # fall back to the project's .scrapy directory
                self.work_tree = os.path.join(
                    os.path.dirname(settings['PROJECT_ROOT']), '.scrapy',
                    cachedir)

            # base git invocation pinned to the cache work tree
            self.basecmd = [
                'git',
                '--git-dir=%s' % os.path.join(self.work_tree, '.git'),
                '--work-tree=%s' % self.work_tree,
            ]

            dispatcher.connect(self.engine_started,
                               signal=signals.engine_started)
            dispatcher.connect(self.engine_stopped,
                               signal=signals.engine_stopped)
Example #43
0
 def __init__(self, settings):
     """Feed exporter configured from FEED_* settings, including encoding,
     export fields, and JSON indentation."""
     self.settings = settings
     self.urifmt = settings['FEED_URI']
     if not self.urifmt:
         raise NotConfigured
     self.format = settings['FEED_FORMAT'].lower()
     self.export_encoding = settings['FEED_EXPORT_ENCODING']
     self.storages = self._load_components('FEED_STORAGES')
     self.exporters = self._load_components('FEED_EXPORTERS')
     if not self._storage_supported(self.urifmt):
         raise NotConfigured
     if not self._exporter_supported(self.format):
         raise NotConfigured
     self.store_empty = settings.getbool('FEED_STORE_EMPTY')
     self._exporting = False
     self.export_fields = settings.getlist('FEED_EXPORT_FIELDS') or None
     # indent only when explicitly configured (0 is a valid value)
     self.indent = None
     if settings.get('FEED_EXPORT_INDENT') is not None:
         self.indent = settings.getint('FEED_EXPORT_INDENT')
     # optional callable customizing URI parameters; default is a no-op
     uripar = settings['FEED_URI_PARAMS']
     self._uripar = load_object(uripar) if uripar else lambda x, y: None
Example #44
0
 def __init__(self, settings):
     """Feed exporter configured from FEED_* settings, including encoding,
     export fields, and JSON indentation."""
     self.settings = settings
     self.urifmt = settings['FEED_URI']
     if not self.urifmt:
         raise NotConfigured
     self.format = settings['FEED_FORMAT'].lower()
     self.export_encoding = settings['FEED_EXPORT_ENCODING']
     self.storages = self._load_components('FEED_STORAGES')
     self.exporters = self._load_components('FEED_EXPORTERS')
     if not self._storage_supported(self.urifmt):
         raise NotConfigured
     if not self._exporter_supported(self.format):
         raise NotConfigured
     self.store_empty = settings.getbool('FEED_STORE_EMPTY')
     self._exporting = False
     self.export_fields = settings.getlist('FEED_EXPORT_FIELDS') or None
     # indent only when explicitly configured (0 is a valid value)
     self.indent = None
     if settings.get('FEED_EXPORT_INDENT') is not None:
         self.indent = settings.getint('FEED_EXPORT_INDENT')
     # optional callable customizing URI parameters; default is a no-op
     uripar = settings['FEED_URI_PARAMS']
     self._uripar = load_object(uripar) if uripar else lambda x, y: None
Example #45
0
    def send(self, to, subject, body, cc=None, attachs=()):
        """Send a mail to *to* (list) with optional cc and attachments.

        attachs is an iterable of (name, mimetype, file-like) triples.
        In MAIL_DEBUG mode only a log line is emitted; otherwise returns
        the deferred from the underlying sendmail call.
        """
        if attachs:
            msg = MIMEMultipart()
        else:
            msg = MIMENonMultipart('text', 'plain')
        msg['From'] = self.mailfrom
        msg['To'] = COMMASPACE.join(to)
        msg['Date'] = formatdate(localtime=True)
        msg['Subject'] = subject
        # recipients = To + Cc (Cc also added to headers)
        rcpts = to[:]
        if cc:
            rcpts.extend(cc)
            msg['Cc'] = COMMASPACE.join(cc)

        if attachs:
            msg.attach(MIMEText(body))
            for attach_name, mimetype, f in attachs:
                part = MIMEBase(*mimetype.split('/'))
                part.set_payload(f.read())
                Encoders.encode_base64(part)
                part.add_header('Content-Disposition', 'attachment; filename="%s"' \
                    % attach_name)
                msg.attach(part)
        else:
            msg.set_payload(body)

        # notify listeners before attempting delivery
        send_catch_log(signal=mail_sent, to=to, subject=subject, body=body,
                       cc=cc, attach=attachs, msg=msg)

        if settings.getbool('MAIL_DEBUG'):
            log.msg('Debug mail sent OK: To=%s Cc=%s Subject="%s" Attachs=%d' % \
                (to, cc, subject, len(attachs)), level=log.DEBUG)
            return

        dfd = self._sendmail(rcpts, msg.as_string())
        dfd.addCallbacks(self._sent_ok, self._sent_failed,
            callbackArgs=[to, cc, subject, len(attachs)],
            errbackArgs=[to, cc, subject, len(attachs)])
        # keep the process alive until the mail deferred fires
        reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
        return dfd
Example #46
0
    def __init__(self, download_delay=None, max_concurrent_requests=None):
        """Per-spider download slot state: delay, concurrency and queues.

        Arguments default to the DOWNLOAD_DELAY / CONCURRENT_REQUESTS_PER_SPIDER
        settings when not given.
        """
        if download_delay is None:
            self._download_delay = settings.getfloat("DOWNLOAD_DELAY")
        else:
            self._download_delay = float(download_delay)
        if self._download_delay:
            # a delay forces serialized downloads
            self.max_concurrent_requests = 1
        elif max_concurrent_requests is None:
            self.max_concurrent_requests = settings.getint("CONCURRENT_REQUESTS_PER_SPIDER")
        else:
            self.max_concurrent_requests = max_concurrent_requests
        if self._download_delay and settings.getbool("RANDOMIZE_DOWNLOAD_DELAY"):
            # same policy as wget --random-wait
            self.random_delay_interval = (0.5 * self._download_delay, 1.5 * self._download_delay)
        else:
            self.random_delay_interval = None

        # request bookkeeping for this slot
        self.active = set()
        self.queue = []
        self.transferring = set()
        self.closing = False
        self.lastseen = 0
        self.next_request_calls = set()
Example #47
0
 def __init__(self):
     """Cookie middleware state; disabled unless COOKIES_ENABLED."""
     enabled = settings.getbool('COOKIES_ENABLED')
     if not enabled:
         raise NotConfigured
     # a fresh CookieJar is created on first access per spider key
     self.jars = defaultdict(CookieJar)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #48
0
 def __init__(self):
     """Stats store keyed by spider; the None key holds global stats."""
     self._stats = {None: {}}  # None is for global stats
     self._dump = settings.getbool('STATS_DUMP')
Example #49
0
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from scrapy.selector import Selector
from scrapy.http import Request
#from scrapy.http import FormRequest
from scrapy.conf import settings
#from scrapy.shell import inspect_response

from scrapy.utils.response import get_base_url

from tase.HistorySpider import HistorySpider
from tase.items import TaseItem

import tase.common

# Module-level configuration, read once at import time.
PROCESS_HISTORY = settings.getbool('PROCESS_HISTORY', False)
# period length — units not visible here; TODO confirm against usage
HISTORY_PERIOD = settings.getint('HISTORY_PERIOD', 2)
category_fund = settings.get('CATEGORY_FUND')


class FundSpider(HistorySpider):
    name = 'funds'
    allowed_domains = ['tase.co.il']
    start_urls = ['http://www.tase.co.il/TASEEng/MarketData/MutualFunds/']

    rules = (
        Rule(SgmlLinkExtractor(allow=[r'BuildCmb_6_1.js']),
             callback='parse_fund_list'),
        Rule(SgmlLinkExtractor(allow=('FundMainData\.htm', )),
             callback='parse_fund'),
        Rule(SgmlLinkExtractor(allow=(r'MutualFunds', )),
Example #50
0
 def __init__(self):
     """Create the per-spider cookie jar map.

     Raises NotConfigured when COOKIES_ENABLED is off so the framework
     disables this component.
     """
     if settings.getbool('COOKIES_ENABLED'):
         # One CookieJar per spider, created lazily on first access.
         self.jars = defaultdict(CookieJar)
     else:
         raise NotConfigured
Example #51
0
from tase.items import TaseItem
from tase.items import FinancialStatement

import urllib
from urlparse import urlparse
from urlparse import parse_qs
from urlparse import urljoin

import tase.common

from html2text import html2text

#import string
#import random

# Module-level configuration, read once from project settings at import time.
# Whether to also crawl historical data (off by default).
PROCESS_HISTORY = settings.getbool('PROCESS_HISTORY', False)
# NOTE(review): default is 2 but the comment says "1 month" -- confirm units.
HISTORY_PERIOD = settings.getint('HISTORY_PERIOD', 2)  # 1 month
# Category value for company items -- presumably attached to scraped items; verify against callers.
category_comp = settings.get('CATEGORY_COMP')
# Whether to crawl and parse financial statements (off by default).
PROCESS_FINANCIAL_STATEMENTS = settings.getbool('PROCESS_FINANCIAL_STATEMENTS',
                                                False)


class StockSpider(HistorySpider):
    name = 'stocks'
    allowed_domains = ['tase.co.il']
    start_urls = [
        'http://www.tase.co.il/eng/marketdata/stocks/marketdata/Pages/MarketData.aspx'
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=('MarketData\.aspx', )),
Example #52
0
class CookiesMiddleware(object):
    """This middleware enables working with sites that need cookies.

    Keeps one CookieJar per spider, merges stored cookies into outgoing
    requests and extracts Set-Cookie headers from responses back into
    the spider's jar.
    """
    # Read once at class-creation time; enables cookie debug logging.
    debug = settings.getbool('COOKIES_DEBUG')

    def __init__(self):
        # One jar per spider, created lazily on first access.
        self.jars = defaultdict(CookieJar)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def process_request(self, request, spider):
        """Attach the spider's stored cookies to an outgoing request."""
        if 'dont_merge_cookies' in request.meta:
            return

        jar = self.jars[spider]
        cookies = self._get_request_cookies(jar, request)
        for cookie in cookies:
            jar.set_cookie_if_ok(cookie, request)

        # Rebuild the Cookie header from the jar so stale values are dropped.
        request.headers.pop('Cookie', None)
        jar.add_cookie_header(request)
        self._debug_cookie(request)

    def process_response(self, request, response, spider):
        """Store cookies set by the response in the spider's jar."""
        if 'dont_merge_cookies' in request.meta:
            return response

        # extract cookies from Set-Cookie and drop invalid/expired cookies
        jar = self.jars[spider]
        jar.extract_cookies(response, request)
        self._debug_set_cookie(response)

        return response

    def spider_closed(self, spider):
        """Discard the jar of a finished spider to free memory."""
        self.jars.pop(spider, None)

    def _debug_cookie(self, request):
        """log Cookie header for request (cookie names only, no values)"""
        if self.debug:
            c = request.headers.get('Cookie')
            c = c and [p.split('=')[0] for p in c.split(';')]
            log.msg('Cookie: %s for %s' % (c, request.url), level=log.DEBUG)

    def _debug_set_cookie(self, response):
        """log Set-Cookies headers but exclude cookie values"""
        if self.debug:
            cl = response.headers.getlist('Set-Cookie')
            res = []
            for c in cl:
                # BUG FIX: partition() tolerates headers with no attribute
                # part (plain "name=value"), where split(';', 1) would raise
                # ValueError on unpacking.
                kv, _, tail = c.partition(';')
                k = kv.split('=', 1)[0]
                res.append('%s %s' % (k, tail))
            log.msg('Set-Cookie: %s from %s' % (res, response.url))

    def _get_request_cookies(self, jar, request):
        """Convert request.cookies into Cookie objects by wrapping them in
        a synthetic response, so the jar parses them uniformly."""
        headers = {
            'Set-Cookie':
            ['%s=%s;' % (k, v) for k, v in request.cookies.iteritems()]
        }
        response = Response(request.url, headers=headers)
        cookies = jar.make_cookies(response, request)
        return cookies
Example #53
0
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.conf import settings
#from scrapy.shell import inspect_response
from scrapy import log
from tase.items import NewsArticle

import urllib
from urlparse import urlparse
from urlparse import parse_qs
#from urlparse import urljoin

import tase.common

# Feature toggles read once from project settings at import time.
# Whether news crawling is enabled at all (off by default).
PROCESS_NEWS = settings.getbool('PROCESS_NEWS', False)
# Whether to also fetch historical news (off by default).
PROCESS_NEWS_HISTORY = settings.getbool('PROCESS_NEWS_HISTORY', False)
# Whether to fetch full article content, not just listings (on by default).
PROCESS_NEWS_CONTENT = settings.getbool('PROCESS_NEWS_CONTENT', True)


class NewsSpider(CrawlSpider):
    """Crawl spider for news articles on globes.co.il."""
    name = 'news'
    allowed_domains = ['globes.co.il']
    # Empty here -- presumably populated elsewhere at runtime; TODO confirm.
    start_urls = []

    rules = (
        #Rule(SgmlLinkExtractor(allow=(r'searchajax\.aspx\?',)), callback='parse_article_list'),
        # Follow search-results AJAX pages into the article-list parser.
        Rule(SgmlLinkExtractor(allow=('searchajax\.aspx', )),
             callback='parse_article_list'),
        #Rule(SgmlLinkExtractor(allow=('\/en\/article-',)), callback='parse_article'),
    )
Example #54
0
 def __init__(self):
     """Initialise stats storage and hook engine shutdown for dumping."""
     # Stats dicts keyed by spider; the None key holds global stats.
     self._stats = {None: {}}
     self._dump = settings.getbool('STATS_DUMP')
     dispatcher.connect(self._engine_stopped, signal=signals.engine_stopped)
Example #55
0
from scrapy.statscol import DummyStatsCollector
from scrapy.conf import settings
from scrapy.utils.misc import load_object

# Stats collection is optional; when disabled, a do-nothing collector
# avoids the bookkeeping overhead entirely.
if not settings.getbool('STATS_ENABLED'):
    stats = DummyStatsCollector()
else:
    stats = load_object(settings['STATS_CLASS'])()
Example #56
0
 def __init__(self, *args, **kwargs):
     """CSV exporter configured from project settings.

     Reads delimiter, field list, encoding and header behaviour from the
     EXPORT_* settings before delegating to the base exporter.
     """
     kwargs['delimiter'] = settings.get('EXPORT_CSV_DELIMITER', '\001')
     kwargs['fields_to_export'] = settings.getlist('EXPORT_FIELDS') or None
     # BUG FIX: the encoding is a single string, not a list -- use get(),
     # not getlist() (which would wrap/split the value into a list).
     kwargs['encoding'] = settings.get('EXPORT_ENCODING', 'utf-8')
     super(AppinfoCsvExporter, self).__init__(*args, **kwargs)
     # BUG FIX: Python spells the boolean literal True, not true (NameError),
     # and statements need no trailing semicolon.
     # NOTE(review): the key is lower-case unlike the EXPORT_* keys above;
     # settings keys are case-sensitive -- confirm EXPORT_CSV_HEADERS was
     # not intended.
     self.include_headers_line = settings.getbool('export_csv_headers', True)