Example #1
    def _send_report(self, rcpts, subject):
        """send notification mail with some additional useful info"""
        s = "Memory usage at engine startup : %dM\r\n" % (stats.get_value('memusage/startup')/1024/1024)
        s += "Maximum memory usage           : %dM\r\n" % (stats.get_value('memusage/max')/1024/1024)
        s += "Current memory usage           : %dM\r\n" % (self.get_virtual_size()/1024/1024)

        s += "ENGINE STATUS ------------------------------------------------------- \r\n"
        s += "\r\n"
        s += pformat(get_engine_status(self.crawler.engine))
        s += "\r\n"
        self.mail.send(rcpts, subject, s)
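These excerpts use the legacy Scrapy 0.x API, in which stats was a project-wide singleton object. A minimal sketch of the imports this report method appears to assume (the module paths are era-appropriate guesses, not shown in the excerpt):

    from pprint import pformat

    from scrapy.stats import stats                     # global stats singleton (Scrapy 0.x)
    from scrapy.utils.engine import get_engine_status  # snapshot of the running engine's state
    from scrapy.mail import MailSender                 # self.mail is presumably an instance of this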
Example #3
	def close_spider(self, spider):
		self.cur.close()
		self.con.close()
		stats_dump = open('stats.json', 'w')
		current_stats = stats.get_value('prefixes')
		stats_dump.write(json.dumps(current_stats))
		stats_dump.close()
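The same dump written with a context manager, which closes the file even if serialization fails (an equivalent alternative, not the original code):

    with open('stats.json', 'w') as stats_dump:
        json.dump(stats.get_value('prefixes'), stats_dump)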
Example #4
 def stats_spider_opened(self, spider):
     stats.set_value('start_time',
                     datetime.datetime.utcnow(),
                     spider=spider)
     stats.set_value('envinfo/host',
                     stats.get_value('envinfo/host'),
                     spider=spider)
     stats.inc_value('spider_count/opened')
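Handlers like this were typically connected to spider signals in the extension's constructor. A sketch assuming the pydispatch-based wiring of that Scrapy generation (the module paths are assumptions):

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.core import signals

    def __init__(self):
        # Call stats_spider_opened whenever a spider is opened.
        dispatcher.connect(self.stats_spider_opened, signal=signals.spider_opened)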
Example #5
 def process_item(self, spider, item):
     sampled = stats.get_value("items_sampled", 0, spider=spider)
     if sampled < items_per_spider:
         self.items[item.guid] = item
         sampled += 1
         stats.set_value("items_sampled", sampled, spider=spider)
         log.msg("Sampled %s" % item, spider=spider, level=log.INFO)
         if close_spider and sampled == items_per_spider:
             scrapyengine.close_spider(spider)
     return item
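items_per_spider, close_spider and scrapyengine are free names in this pipeline, so they are presumably bound at module level. A sketch of such bindings; the setting names are hypothetical, and only the settings object itself is attested elsewhere in these examples:

    from scrapy.conf import settings
    from scrapy.core.engine import scrapyengine  # engine singleton in Scrapy 0.x

    # Hypothetical setting names -- the excerpt does not show the originals.
    items_per_spider = settings.getint('ITEMSAMPLER_COUNT', 1)
    close_spider = settings.getbool('ITEMSAMPLER_CLOSE_SPIDER', False)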
Example #6
    def test_process_spider_output(self):
        req = Request('http://scrapytest.org')
        resp = Response('http://scrapytest.org')
        resp.request = req
        result = [Request('http://scrapytest.org')]

        out = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEquals(out, result)

        rdc = stats.get_value('request_depth_count/1', spider=self.spider)
        self.assertEquals(rdc, 1)

        req.meta['depth'] = 1

        out2 = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEquals(out2, [])

        rdm = stats.get_value('request_depth_max', spider=self.spider)
        self.assertEquals(rdm, 1)
Example #7
 def new_callback(*args, **kwargs):
     tbefore = time()
     mbefore = self._memusage()
     r = function(*args, **kwargs)
     mafter = self._memusage()
     ct = time() - tbefore
     tcc = stats.get_value('profiling/total_callback_time', 0, spider=spider)
     sct = stats.get_value('profiling/slowest_callback_time', 0, spider=spider)
     stats.set_value('profiling/total_callback_time', tcc+ct, spider=spider)
     if ct > sct:
         stats.set_value('profiling/slowest_callback_time', ct, spider=spider)
         stats.set_value('profiling/slowest_callback_name', function.__name__, \
             spider=spider)
         stats.set_value('profiling/slowest_callback_url', args[0].url, \
             spider=spider)
     if self._memusage:
         stats.inc_value('profiling/total_mem_allocated_in_callbacks', \
             count=mafter-mbefore, spider=spider)
     return r
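new_callback closes over function, spider and self, so it is presumably produced by a wrapper method that instruments a spider callback. A sketch of that enclosing scope (not part of the excerpt):

    from time import time

    def wrap_callback(self, function, spider):
        # Return an instrumented replacement for the original callback;
        # the body of new_callback is the code shown above in Example #7.
        def new_callback(*args, **kwargs):
            return function(*args, **kwargs)  # plus the profiling shown above
        return new_callback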
Example #8
	def process_item(self, item, spider):
		prefix, pid, info = item['prefix'], item['pid'], item['info']
		current_stats = stats.get_value('prefixes')[prefix]
		try:
			self.cur.execute('insert into data(pid, info, updated) values (?, ?, ?)', (pid, info, date.today()))
		except sqlite3.IntegrityError:
			self.cur.execute('update data set info=(?), updated=(?) where pid=(?)', (info, date.today(), pid))

		self.con.commit()
		current_stats['pids_processed'] += 1
		log.msg('Committed info for pid:%s in prefix:%s' % (pid, prefix))
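The try/except pair is a manual upsert; SQLite can express the same in one statement. An alternative, not the original code (note that OR REPLACE deletes and re-inserts the conflicting row rather than updating it in place):

    # Relies on the same unique constraint on pid that raises IntegrityError above.
    self.cur.execute('insert or replace into data(pid, info, updated) values (?, ?, ?)',
                     (pid, info, date.today()))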
Example #9
    def setUp(self):
        settings.disabled = False
        settings.overrides['DEPTH_LIMIT'] = 1
        settings.overrides['DEPTH_STATS'] = True

        self.spider = BaseSpider('scrapytest.org')

        stats.open_spider(self.spider)

        self.mw = DepthMiddleware()
        self.assertEquals(stats.get_value('envinfo/request_depth_limit'), 1)
Example #10
 def _filter(request):
     if isinstance(request, Request):
         depth = response.request.meta['depth'] + 1
         request.meta['depth'] = depth
         if self.maxdepth and depth > self.maxdepth:
             log.msg("Ignoring link (depth > %d): %s " % (self.maxdepth, request.url), \
                 level=log.DEBUG, spider=spider)
             return False
         elif self.stats:
             stats.inc_value('request_depth_count/%s' % depth, spider=spider)
             if depth > stats.get_value('request_depth_max', 0, spider=spider):
                 stats.set_value('request_depth_max', depth, spider=spider)
     return True
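_filter closes over response, spider and self, so it lives inside the middleware's process_spider_output. A sketch of how it is presumably applied there, mirroring Scrapy's DepthMiddleware:

    def process_spider_output(self, response, result, spider):
        def _filter(request):
            # ... body from Example #10 above ...
            return True
        # Lazily drop requests past the depth limit; non-Request results always pass.
        return (r for r in result or () if _filter(r))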
Example #11
	def parse_index(self, response):
		michbar.settings.USER_AGENT = '%s' % (str(random())[2:])
		prefix = response.meta['data']['LAST_NAME']
		current_stats = stats.get_value('prefixes')[prefix]

		hxs = HtmlXPathSelector(response)
		if response.meta['results'] is None:
			results = hxs.select('//td[@class="text"]/p/strong/text()')
			current_stats['pids_collected'] = 0
			current_stats['pids_processed'] = 0
			try:
				results = results.re('Search results:\s(\d+)').pop()
				current_stats['pids_total'] = results
			except IndexError:
				log.msg('There are no records for prefix:%s' % prefix, level=log.WARNING)
				if hxs.select('//b').re('No members found.'):
					current_stats['pids_total'] = None
					return
				inspect(response)
			response.meta['results'] = int(results)
			log.msg('There is a total of %s pids for %s.' % (results, prefix))

		results = response.meta['results']
		row_start = response.meta['row_start']
		per_page = response.meta['per_page']

		log.msg('Retrieved pids from %s to %s for %s.' % (row_start, row_start + per_page, prefix))
		response.meta['row_start'] += per_page
		if response.meta['row_start'] < results:
			log.msg('Requesting pids from %s to %s for %s.' % (row_start+per_page, row_start+per_page*2, prefix))
			response.meta['data'].update({
				'rowBegins':unicode(response.meta['row_start']),
				'reference':'/memberdirectory/results.cfm',
				'submit':'Next 25',
			})
			yield response.request.replace(formdata=response.meta['data'])

		pids = hxs.select('//td[@class="text"]/a').re('detail\.cfm\?PID=(\d+)')
		current_stats['pids_collected'] += len(pids)
		log.msg('Found %s pids for %s.' % (len(pids), prefix))
		if not pids:
			log.msg('No PIDS for prefix:%s row:%s' % (prefix, response.meta['row_start']), level=log.ERROR)
			inspect(response)

		for pid in pids:
			request = Request(url=self.detail_url+'?'+urlencode({'PID':pid}), method='GET', callback=self.parse_detail)
			request.meta.update({
					'pid':pid,
					'prefix':prefix,
			})
			yield request
Example #12
    def process_spider_output(self, response, result, spider):
        requests, items = [], []
        for r in result:
            if isinstance(r, Request):
                requests.append(r)
            else:
                items.append(r)

        if stats.get_value("items_sampled", spider=spider) >= items_per_spider:
            return []
        else:
            # TODO: this needs some revision, as keeping only the first item
            # may lead to differences when performing replays on sampled items
            return requests + items[0:1]
Example #13
	def start_requests(self):
		prefix_stats = stats.get_value('prefixes')
		alpha = Crange()
		for prefix in (''.join(x) for x in alpha.range('aa', 'zz', 2)):
			data = {
				'membertype':'1',
				'rowBegins':'1',
				'Seach_Type':'name',
				'reference':'/memberdirectory/content.html',
				'LAST_NAME':prefix,
			}
			request = FormRequest(url=self.index_url, formdata=data, dont_filter=True, callback=self.parse_index)
			request.meta.update({
				'data':data,
				'row_start':int(data['rowBegins']),
				'per_page':25,
				'results':None,
			})
			prefix_stats[prefix] = {}
			yield request
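Crange appears to be a custom helper that yields every two-letter combination from 'aa' through 'zz'. Assuming that behaviour, the standard library offers an equivalent:

    from itertools import product
    from string import ascii_lowercase

    # 'aa', 'ab', ..., 'zz' -- the same sequence the Crange-based generator yields.
    prefixes = (''.join(pair) for pair in product(ascii_lowercase, repeat=2))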
Example #14
 def test_process_response(self):
     self.mw.process_response(self.req, self.res, self.spider)
     self.assertEqual(stats.get_value('downloader/response_count', \
         spider=self.spider), 1)
Example #15
 def stats_spider_opened(self, spider):
     stats.set_value("start_time", datetime.datetime.utcnow(), spider=spider)
     stats.set_value("envinfo/host", stats.get_value("envinfo/host"), spider=spider)
     stats.inc_value("spider_count/opened")
Example #17
 def process_spider_input(self, response, spider):
     if stats.get_value("items_sampled", spider=spider) >= items_per_spider:
         return []
     elif max_response_size and max_response_size > len(response_httprepr(response)):
         return []
Example #18
 def spider_closed(self, spider, reason):
     if reason == "finished" and not stats.get_value("items_sampled", spider=spider):
         self.empty_domains.add(spider.domain_name)
     self.spiders_count += 1
     log.msg("Sampled %d domains so far (%d empty)" % (self.spiders_count, len(self.empty_domains)), level=log.INFO)
Example #19
 def test_process_exception(self):
     self.mw.process_exception(self.req, Exception(), self.spider)
     self.assertEqual(stats.get_value('downloader/exception_count', \
         spider=self.spider), 1)
Example #21
 def stats_spider_opened(self, spider):
     stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
     stats.set_value('envinfo/host', stats.get_value('envinfo/host'), spider=spider)
     stats.inc_value('spider_count/opened')
Example #22
 def get_value(self, key, spider):
     return "%s: %s" % (key, stats.get_value(key, spider=spider))
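A hypothetical call, assuming a per-spider stat like the start_time set in Example #4:

    line = self.get_value('start_time', spider)
    # -> "start_time: 2010-01-01 12:00:00"  (illustrative value)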