Example #1
 def stats_spider_opened(self, spider):
     stats.set_value('start_time',
                     datetime.datetime.utcnow(),
                     spider=spider)
     stats.set_value('envinfo/host',
                     stats.get_value('envinfo/host'),
                     spider=spider)
     stats.inc_value('spider_count/opened')
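These snippets use the pre-1.0 global stats singleton (from scrapy.stats import stats) together with a spider= argument; both were later removed when stats became per-crawl. A minimal sketch of the equivalent hook in current Scrapy, assuming a hypothetical extension class named SpiderCountExtension:

 from scrapy import signals

 class SpiderCountExtension:
     def __init__(self, stats):
         self.stats = stats  # per-crawl StatsCollector

     @classmethod
     def from_crawler(cls, crawler):
         ext = cls(crawler.stats)
         crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
         return ext

     def spider_opened(self, spider):
         # Stats are scoped to the crawl, so no spider= argument is needed.
         self.stats.inc_value('spider_count/opened')

Such a class would be enabled through the EXTENSIONS setting of the project.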
Example #2
 def stats_spider_closing(self, spider, reason):
     stats.set_value('finish_time',
                     datetime.datetime.utcnow(),
                     spider=spider)
     stats.set_value('finish_status',
                     'OK' if reason == 'finished' else reason,
                     spider=spider)
     stats.inc_value('spider_count/%s' % reason, spider=spider)
Example #3
 def process_item(self, spider, item):
     sampled = stats.get_value("items_sampled", 0, spider=spider)
     if sampled < items_per_spider:
         self.items[item.guid] = item
         sampled += 1
         stats.set_value("items_sampled", sampled, spider=spider)
         log.msg("Sampled %s" % item, spider=spider, level=log.INFO)
         if close_spider and sampled == items_per_spider:
             scrapyengine.close_spider(spider)
     return item
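scrapyengine here is the pre-1.0 global execution engine. A hedged sketch of the same sampling cutoff as a modern item pipeline (SamplerPipeline and the cap of 100 are illustrative names, not from the original code):

 class SamplerPipeline:
     items_per_spider = 100  # illustrative cap

     def process_item(self, item, spider):
         # Note the modern argument order: (item, spider), not (spider, item).
         stats = spider.crawler.stats
         sampled = stats.get_value('items_sampled', 0) + 1
         stats.set_value('items_sampled', sampled)
         if sampled >= self.items_per_spider:
             # Ask the engine to close the spider once the sample is full.
             spider.crawler.engine.close_spider(spider, reason='items_sampled')
         return item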
Example #4
 def _check_limit(self):
     if self.get_virtual_size() > self.limit:
         stats.set_value('memusage/limit_reached', 1)
         mem = self.limit/1024/1024
         log.msg("Memory usage exceeded %dM. Shutting down Scrapy..." % mem, level=log.ERROR)
         if self.notify_mails:
             subj = "%s terminated: memory usage exceeded %dM at %s" % \
                     (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
             self._send_report(self.notify_mails, subj)
             stats.set_value('memusage/limit_notified', 1)
         self.crawler.stop()
Example #5
    def process_response(self, request, response, spider):
        """
        A response is leaving the Downloader. It was either retrieved
        from the web or from another middleware.

        Decide if we would like to store it in the history.
        """
        if self.store_if(spider, request, response):
            self.storage.store_response(spider, request, response)
            stats.set_value("history/cached", True, spider=spider)

        return response
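A downloader middleware like this only runs once it is registered in the project settings. A minimal sketch, assuming the class lives at myproject.middlewares.HistoryMiddleware (a hypothetical path; the order value is likewise illustrative):

 # settings.py
 DOWNLOADER_MIDDLEWARES = {
     # Lower numbers sit closer to the engine, higher numbers closer
     # to the downloader.
     'myproject.middlewares.HistoryMiddleware': 900,
 }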
Example #6
 def _check_warning(self):
     if self.warned:  # warn only once
         return
     if self.get_virtual_size() > self.warning:
         stats.set_value("memusage/warning_reached", 1)
         mem = self.warning / 1024 / 1024
         log.msg("Memory usage reached %dM" % mem, level=log.WARNING)
         if self.notify_mails:
             subj = "%s warning: memory usage reached %dM at %s" % (settings["BOT_NAME"], mem, socket.gethostname())
             self._send_report(self.notify_mails, subj)
             stats.set_value("memusage/warning_notified", 1)
         self.warned = True
Example #7
 def _check_limit(self):
     if self.get_virtual_size() > self.limit:
         stats.set_value('memusage/limit_reached', 1)
         mem = self.limit / 1024 / 1024
         log.msg("Memory usage exceeded %dM. Shutting down Scrapy..." % mem,
                 level=log.ERROR)
         if self.notify_mails:
             subj = "%s terminated: memory usage exceeded %dM at %s" % \
                     (settings['BOT_NAME'], mem, socket.gethostname())
             self._send_report(self.notify_mails, subj)
             stats.set_value('memusage/limit_notified', 1)
         crawler.stop()
Example #8
 def _filter(request):
     if isinstance(request, Request):
         depth = response.request.meta['depth'] + 1
         request.meta['depth'] = depth
         if self.maxdepth and depth > self.maxdepth:
             log.msg("Ignoring link (depth > %d): %s " % (self.maxdepth, request.url), \
                 level=log.DEBUG, spider=spider)
             return False
         elif self.stats:
             stats.inc_value('request_depth_count/%s' % depth, spider=spider)
             if depth > stats.get_value('request_depth_max', 0, spider=spider):
                 stats.set_value('request_depth_max', depth, spider=spider)
     return True
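This depth middleware is driven entirely by settings; Example #24 below shows the constructor reading them. A sketch of the corresponding settings.py entries of that era (DEPTH_LIMIT still exists in current Scrapy, while DEPTH_STATS has since been superseded by DEPTH_STATS_VERBOSE):

 # settings.py
 DEPTH_LIMIT = 3     # ignore requests more than 3 links deep
 DEPTH_STATS = True  # record request_depth_count/* and request_depth_max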
Example #9
 def _check_warning(self):
     if self.warned: # warn only once
         return
     if self.get_virtual_size() > self.warning:
         stats.set_value('memusage/warning_reached', 1)
         mem = self.warning/1024/1024
         log.msg("Memory usage reached %dM" % mem, level=log.WARNING)
         if self.notify_mails:
             subj = "%s warning: memory usage reached %dM at %s" % \
                     (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
             self._send_report(self.notify_mails, subj)
             stats.set_value('memusage/warning_notified', 1)
         self.warned = True
Example #10
 def engine_started(self):
     stats.set_value('memusage/startup', self.get_virtual_size())
     self.tasks = []
     tsk = task.LoopingCall(self.update)
     self.tasks.append(tsk)
     tsk.start(60.0, now=True)
     if self.limit:
         tsk = task.LoopingCall(self._check_limit)
         self.tasks.append(tsk)
         tsk.start(60.0, now=True)
     if self.warning:
         tsk = task.LoopingCall(self._check_warning)
         self.tasks.append(tsk)
         tsk.start(60.0, now=True)
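task.LoopingCall is plain Twisted rather than Scrapy API: it schedules a callable on the reactor at a fixed interval. A self-contained sketch outside Scrapy:

 from twisted.internet import task, reactor

 def heartbeat():
     print('still alive')

 loop = task.LoopingCall(heartbeat)
 loop.start(60.0, now=True)            # fire immediately, then every 60 seconds
 reactor.callLater(300, loop.stop)     # stop the loop after five minutes
 reactor.callLater(301, reactor.stop)
 reactor.run()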
Example #12
    def __init__(self):
        stats.set_value("envinfo/user", getpass.getuser())
        stats.set_value("envinfo/host", socket.gethostname())
        stats.set_value("envinfo/logfile", settings["LOG_FILE"])
        stats.set_value("envinfo/pid", os.getpid())

        dispatcher.connect(self.stats_spider_opened, signal=stats_spider_opened)
        dispatcher.connect(self.stats_spider_closing, signal=stats_spider_closing)
        dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
        dispatcher.connect(self.item_passed, signal=signals.item_passed)
        dispatcher.connect(self.item_dropped, signal=signals.item_dropped)
Example #13
    def __init__(self):
        stats.set_value('envinfo/user', getpass.getuser())
        stats.set_value('envinfo/host', socket.gethostname())
        stats.set_value('envinfo/logfile', settings['LOG_FILE'])
        stats.set_value('envinfo/pid', os.getpid())

        dispatcher.connect(self.stats_spider_opened,
                           signal=signals.stats_spider_opened)
        dispatcher.connect(self.stats_spider_closing,
                           signal=signals.stats_spider_closing)
        dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
        dispatcher.connect(self.item_passed, signal=signals.item_passed)
        dispatcher.connect(self.item_dropped, signal=signals.item_dropped)
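dispatcher.connect and the stats_spider_* signals are gone from current Scrapy; extensions receive the crawler and connect through crawler.signals instead. A hedged sketch of the same wiring today (EnvInfo is a hypothetical name; item_passed no longer exists as a separate signal):

 import getpass
 import os
 import socket

 from scrapy import signals

 class EnvInfo:
     @classmethod
     def from_crawler(cls, crawler):
         ext = cls()
         crawler.stats.set_value('envinfo/user', getpass.getuser())
         crawler.stats.set_value('envinfo/host', socket.gethostname())
         crawler.stats.set_value('envinfo/pid', os.getpid())
         crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
         crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
         return ext

     def item_scraped(self, item, response, spider):
         pass  # called for every item that made it through the pipelines

     def item_dropped(self, item, response, exception, spider):
         pass  # called when a pipeline raised DropItem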
Example #14
 def engine_stopped(self):
     if self.libxml2:
         self.libxml2.cleanupParser()
         stats.set_value('memdebug/libxml2_leaked_bytes', self.libxml2.debugMemory(1))
     gc.collect()
     stats.set_value('memdebug/gc_garbage_count', len(gc.garbage))
     if self.trackrefs:
         for cls, wdict in live_refs.iteritems():
             if not wdict:
                 continue
             stats.set_value('memdebug/live_refs/%s' % cls.__name__, len(wdict))
Example #15
 def new_callback(*args, **kwargs):
     tbefore = time()
     mbefore = self._memusage()
     r = function(*args, **kwargs)
     mafter = self._memusage()
     ct = time() - tbefore
     tcc = stats.get_value('profiling/total_callback_time', 0, spider=spider)
     sct = stats.get_value('profiling/slowest_callback_time', 0, spider=spider)
     stats.set_value('profiling/total_callback_time', tcc+ct, spider=spider)
     if ct > sct:
         stats.set_value('profiling/slowest_callback_time', ct, spider=spider)
         stats.set_value('profiling/slowest_callback_name', function.__name__, \
             spider=spider)
         stats.set_value('profiling/slowest_callback_url', args[0].url, \
             spider=spider)
     if self._memusage:
         stats.inc_value('profiling/total_mem_allocated_in_callbacks', \
             count=mafter-mbefore, spider=spider)
     return r
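Stripped of the memory bookkeeping, Example #15 is an ordinary decorator: wrap the callback, time the call, push the numbers into stats. A minimal sketch using functools.wraps, independent of the old profiling extension it came from:

 import time
 from functools import wraps

 def timed(stats, function):
     """Wrap a spider callback, tracking total and slowest call time."""
     @wraps(function)
     def new_callback(*args, **kwargs):
         before = time.time()
         result = function(*args, **kwargs)
         elapsed = time.time() - before
         stats.inc_value('profiling/total_callback_time', count=elapsed)
         if elapsed > stats.get_value('profiling/slowest_callback_time', 0):
             stats.set_value('profiling/slowest_callback_time', elapsed)
             stats.set_value('profiling/slowest_callback_name', function.__name__)
         return result
     return new_callback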
Example #16
 def engine_stopped(self):
     if self.libxml2:
         self.libxml2.cleanupParser()
         stats.set_value('memdebug/libxml2_leaked_bytes',
                         self.libxml2.debugMemory(1))
     gc.collect()
     stats.set_value('memdebug/gc_garbage_count', len(gc.garbage))
     if settings.getbool('TRACK_REFS'):
         for cls, wdict in live_refs.iteritems():
             if not wdict:
                 continue
             stats.set_value('memdebug/live_refs/%s' % cls.__name__,
                             len(wdict))
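live_refs comes from scrapy.utils.trackref, which still exists in current Scrapy; it maps each class derived from object_ref to a weak dictionary of live instances. A quick interactive check (iteritems() above is Python 2; use items() on Python 3):

 from scrapy.utils.trackref import live_refs, print_live_refs

 print_live_refs()  # summary of live object counts per tracked class
 for cls, wdict in live_refs.items():
     print(cls.__name__, len(wdict))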
Example #17
 def stats_spider_opened(self, spider):
     stats.set_value('start_time',
                     datetime.datetime.utcnow(),
                     spider=spider)
Example #18
 def stats_spider_closing(self, spider, reason):
     stats.set_value("finish_time", datetime.datetime.utcnow(), spider=spider)
     stats.set_value("finish_status", "OK" if reason == "finished" else reason, spider=spider)
     stats.inc_value("spider_count/%s" % reason, spider=spider)
Example #19
 def stats_spider_opened(self, spider):
     stats.set_value("start_time", datetime.datetime.utcnow(), spider=spider)
     stats.set_value("envinfo/host", stats.get_value("envinfo/host"), spider=spider)
     stats.inc_value("spider_count/opened")
Example #20
 def stats_spider_opened(self, spider):
     stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
     stats.set_value('envinfo/host', stats.get_value('envinfo/host'), spider=spider)
     stats.inc_value('spider_count/opened')
Example #21
from scrapy.spider import BaseSpider
from scrapy.stats import stats

from scrapy.shell import inspect_response as inspect

# python stdlib modules
from random import random
from urllib import urlencode

# local modules
from crange import Crange

# michbar modules
import michbar.settings
from michbar.items import MichbarItem

stats.set_value('prefixes', {})

class MichBarSpider(BaseSpider):
    name = 'MichBar'

    base_url = 'http://www.michbar.org/memberdirectory'
    index_url = base_url + '/results.cfm'
    detail_url = base_url + '/detail.cfm'

    allowed_domains = (
        'michbar.org',
    )

    def start_requests(self):
        prefix_stats = stats.get_value('prefixes')
        alpha = Crange()
Example #22
 def stats_spider_closing(self, spider, reason):
     stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
     stats.set_value('finish_reason', reason, spider=spider)
Example #23
 def stats_spider_closing(self, spider, reason):
     stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
     stats.set_value('finish_status', 'OK' if reason == 'finished' else reason, spider=spider)
     stats.inc_value('spider_count/%s' % reason, spider=spider)
Example #24
 def __init__(self):
     self.maxdepth = settings.getint('DEPTH_LIMIT')
     self.stats = settings.getbool('DEPTH_STATS')
     if self.stats and self.maxdepth:
         stats.set_value('envinfo/request_depth_limit', self.maxdepth)
Example #25
 def stats_spider_closing(self, spider, reason):
     stats.set_value('finish_time',
                     datetime.datetime.utcnow(),
                     spider=spider)
     stats.set_value('finish_reason', reason, spider=spider)
Example #26
 def stats_spider_opened(self, spider):
     stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)