def _send_report(self, rcpts, subject):
    """send notification mail with some additional useful info"""
    s = "Memory usage at engine startup : %dM\r\n" % (stats.get_value('memusage/startup')/1024/1024)
    s += "Maximum memory usage : %dM\r\n" % (stats.get_value('memusage/max')/1024/1024)
    s += "Current memory usage : %dM\r\n" % (self.get_virtual_size()/1024/1024)
    s += "ENGINE STATUS ------------------------------------------------------- \r\n"
    s += "\r\n"
    s += pformat(get_engine_status(self.crawler.engine))
    s += "\r\n"
    self.mail.send(rcpts, subject, s)
def close_spider(self, spider):
    self.cur.close()
    self.con.close()
    # dump the per-prefix crawl stats gathered during the run
    with open('stats.json', 'w') as stats_dump:
        stats_dump.write(json.dumps(stats.get_value('prefixes')))
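# For reference, the resulting stats.json holds one entry per search prefix,
# based on the keys set in parse_index and process_item below (values here are
# illustrative, not real output):
#
#   {"aa": {"pids_total": "12", "pids_collected": 12, "pids_processed": 12},
#    "ab": {"pids_total": null}}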
def stats_spider_opened(self, spider):
    stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
    stats.set_value('envinfo/host', stats.get_value('envinfo/host'), spider=spider)
    stats.inc_value('spider_count/opened')
def process_item(self, spider, item):
    sampled = stats.get_value("items_sampled", 0, spider=spider)
    if sampled < items_per_spider:
        self.items[item.guid] = item
        sampled += 1
        stats.set_value("items_sampled", sampled, spider=spider)
        log.msg("Sampled %s" % item, spider=spider, level=log.INFO)
        # once the sampling quota is reached, optionally stop this spider
        if close_spider and sampled == items_per_spider:
            scrapyengine.close_spider(spider)
    return item
def test_process_spider_output(self):
    req = Request('http://scrapytest.org')
    resp = Response('http://scrapytest.org')
    resp.request = req
    result = [Request('http://scrapytest.org')]

    out = list(self.mw.process_spider_output(resp, result, self.spider))
    self.assertEquals(out, result)

    rdc = stats.get_value('request_depth_count/1', spider=self.spider)
    self.assertEquals(rdc, 1)

    req.meta['depth'] = 1
    out2 = list(self.mw.process_spider_output(resp, result, self.spider))
    self.assertEquals(out2, [])

    rdm = stats.get_value('request_depth_max', spider=self.spider)
    self.assertEquals(rdm, 1)
def new_callback(*args, **kwargs):
    tbefore = time()
    mbefore = self._memusage()
    r = function(*args, **kwargs)
    mafter = self._memusage()
    ct = time() - tbefore
    tcc = stats.get_value('profiling/total_callback_time', 0, spider=spider)
    sct = stats.get_value('profiling/slowest_callback_time', 0, spider=spider)
    stats.set_value('profiling/total_callback_time', tcc + ct, spider=spider)
    # track the slowest callback seen so far, along with its name and URL
    if ct > sct:
        stats.set_value('profiling/slowest_callback_time', ct, spider=spider)
        stats.set_value('profiling/slowest_callback_name', function.__name__,
                        spider=spider)
        stats.set_value('profiling/slowest_callback_url', args[0].url,
                        spider=spider)
    # record the memory delta when memory-usage tracking is available
    if self._memusage:
        stats.inc_value('profiling/total_mem_allocated_in_callbacks',
                        count=mafter - mbefore, spider=spider)
    return r
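# new_callback is written as a closure over `function` and `spider`; a minimal
# sketch of the wrapping helper that would produce it (an assumption for
# illustration -- the name _wrap_callback and its signature are hypothetical):
def _wrap_callback(self, function, spider):
    def new_callback(*args, **kwargs):
        # ... timing and memory instrumentation as above ...
        return function(*args, **kwargs)
    return new_callback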
def process_item(self, item, spider):
    prefix, pid, info = item['prefix'], item['pid'], item['info']
    current_stats = stats.get_value('prefixes')[prefix]
    try:
        self.cur.execute('insert into data(pid, info, updated) values (?, ?, ?)',
                         (pid, info, date.today()))
    except sqlite3.IntegrityError:
        # the pid already exists, so refresh its info and timestamp instead
        self.cur.execute('update data set info=(?), updated=(?) where pid=(?)',
                         (info, date.today(), pid))
    self.con.commit()
    current_stats['pids_processed'] += 1
    log.msg('Committed info for pid:%s in prefix:%s' % (pid, prefix))
    # pipelines must return the item so later pipeline stages receive it
    return item
def setUp(self):
    settings.disabled = False
    settings.overrides['DEPTH_LIMIT'] = 1
    settings.overrides['DEPTH_STATS'] = True
    self.spider = BaseSpider('scrapytest.org')
    stats.open_spider(self.spider)
    self.mw = DepthMiddleware()
    self.assertEquals(stats.get_value('envinfo/request_depth_limit'), 1)
def _filter(request):
    if isinstance(request, Request):
        depth = response.request.meta['depth'] + 1
        request.meta['depth'] = depth
        if self.maxdepth and depth > self.maxdepth:
            log.msg("Ignoring link (depth > %d): %s" % (self.maxdepth, request.url),
                    level=log.DEBUG, spider=spider)
            return False
        elif self.stats:
            stats.inc_value('request_depth_count/%s' % depth, spider=spider)
            if depth > stats.get_value('request_depth_max', 0, spider=spider):
                stats.set_value('request_depth_max', depth, spider=spider)
    return True
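# _filter closes over `response`, `spider` and `self`, so it must be defined
# inside the middleware's process_spider_output; a minimal sketch of that
# enclosing method (assumed here for illustration, not the original code):
def process_spider_output(self, response, result, spider):
    def _filter(request):
        # ... depth bookkeeping as above ...
        return True
    return (r for r in result or () if _filter(r))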
def parse_index(self, response):
    # set a random (numeric) user agent string for each index request
    michbar.settings.USER_AGENT = '%s' % (str(random())[2:])
    prefix = response.meta['data']['LAST_NAME']
    current_stats = stats.get_value('prefixes')[prefix]
    hxs = HtmlXPathSelector(response)

    if response.meta['results'] is None:
        results = hxs.select('//td[@class="text"]/p/strong/text()')
        current_stats['pids_collected'] = 0
        current_stats['pids_processed'] = 0
        try:
            results = results.re(r'Search results:\s(\d+)').pop()
            current_stats['pids_total'] = results
        except IndexError:
            log.msg('There are no records for prefix:%s' % prefix, level=log.WARNING)
            if hxs.select('//b').re('No members found.'):
                current_stats['pids_total'] = None
                return
            inspect(response)
        response.meta['results'] = int(results)
        log.msg('There is a total of %s pids for %s.' % (results, prefix))

    results = response.meta['results']
    row_start = response.meta['row_start']
    per_page = response.meta['per_page']
    log.msg('Retrieved pids from %s to %s for %s.' % (row_start, row_start + per_page, prefix))
    response.meta['row_start'] += per_page

    if response.meta['row_start'] < results:
        log.msg('Requesting pids from %s to %s for %s.' %
                (row_start + per_page, row_start + per_page * 2, prefix))
        response.meta['data'].update({
            'rowBegins': unicode(response.meta['row_start']),
            'reference': '/memberdirectory/results.cfm',
            'submit': 'Next 25',
        })
        yield response.request.replace(formdata=response.meta['data'])

    pids = hxs.select('//td[@class="text"]/a').re(r'detail\.cfm\?PID=(\d+)')
    current_stats['pids_collected'] += len(pids)
    log.msg('Found %s pids for %s.' % (len(pids), prefix))
    if not pids:
        log.msg('No PIDS for prefix:%s row:%s' % (prefix, response.meta['row_start']),
                level=log.ERROR)
        inspect(response)

    for pid in pids:
        request = Request(url=self.detail_url + '?' + urlencode({'PID': pid}),
                          method='GET', callback=self.parse_detail)
        request.meta.update({
            'pid': pid,
            'prefix': prefix,
        })
        yield request
def process_spider_output(self, response, result, spider):
    requests, items = [], []
    for r in result:
        if isinstance(r, Request):
            requests.append(r)
        else:
            items.append(r)
    if stats.get_value("items_sampled", spider=spider) >= items_per_spider:
        return []
    else:
        # TODO: this needs some revision, as keeping only the first item
        # may lead to differences when performing replays on sampled items
        return requests + items[0:1]
def start_requests(self):
    prefix_stats = stats.get_value('prefixes')
    alpha = Crange()
    for prefix in (''.join(x) for x in alpha.range('aa', 'zz', 2)):
        data = {
            'membertype': '1',
            'rowBegins': '1',
            'Seach_Type': 'name',  # (sic) field name expected by the remote form
            'reference': '/memberdirectory/content.html',
            'LAST_NAME': prefix,
        }
        request = FormRequest(url=self.index_url, formdata=data,
                              dont_filter=True, callback=self.parse_index)
        request.meta.update({
            'data': data,
            'row_start': int(data['rowBegins']),
            'per_page': 25,
            'results': None,
        })
        prefix_stats[prefix] = {}
        yield request
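# Crange is not defined in these snippets; a minimal stand-in (an assumption,
# not the original implementation) under which range('aa', 'zz', 2) yields
# every two-letter lowercase combination, as the join above expects:
from itertools import product
from string import ascii_lowercase

class Crange(object):
    def range(self, start, stop, width):
        # yield letter tuples, e.g. ('a', 'a'), ('a', 'b'), ..., ('z', 'z')
        for combo in product(ascii_lowercase, repeat=width):
            if start <= ''.join(combo) <= stop:
                yield combo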
def test_process_response(self):
    self.mw.process_response(self.req, self.res, self.spider)
    self.assertEqual(stats.get_value('downloader/response_count',
                                     spider=self.spider), 1)
def process_spider_input(self, response, spider):
    if stats.get_value("items_sampled", spider=spider) >= items_per_spider:
        return []
    elif max_response_size and max_response_size > len(response_httprepr(response)):
        return []
def spider_closed(self, spider, reason):
    if reason == "finished" and not stats.get_value("items_sampled", spider=spider):
        self.empty_domains.add(spider.domain_name)
    self.spiders_count += 1
    log.msg("Sampled %d domains so far (%d empty)" %
            (self.spiders_count, len(self.empty_domains)), level=log.INFO)
def test_process_exception(self):
    self.mw.process_exception(self.req, Exception(), self.spider)
    self.assertEqual(stats.get_value('downloader/exception_count',
                                     spider=self.spider), 1)
def get_value(self, key, spider):
    return "%s: %s" % (key, stats.get_value(key, spider=spider))
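# Example use of this helper (assuming `stats` is the shared stats collector
# and the method lives on a console/status helper object):
#   self.get_value('request_depth_max', spider)  ->  "request_depth_max: 3"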