class NrcBot(BaseSpider):
    """Base spider for bots that work through tasks stored in the database.

    On start the bot requests its botmaster URL. The callback then either
    processes the single task given as ``task_id`` on construction, or a batch
    of tasks read from the database. Subclasses override ``process_item()``.
    """

    name = 'NrcBot'
    allowed_domains = ['skytruth.org']
    db = None
    geodb = None
    task_id = None

    # Task status values; the item_* helpers below pass them to
    # db.setBotTaskStatus().
    status_processing = 'PROCESSING'
    status_done = 'DONE'
    status_dropped = 'SKIPPED'
    status_new = 'NEW'
    status_no_data = 'NODATA'

    # TODO: get this from config
    botmaster_url_template = 'http://ewn2.skytruth.org/nrc/botmaster.php?bot={0}'

    # Extra conditions handed to db.getBotTaskBatch(); when empty, no batch
    # query is made.
    task_conditions = {}

    # Default values - override in subclass
    # batch_size = 10  # number of items to process in one batch
    job_item_limit = 1000  # maximum total items to process in one job execution

    # Appended to alert messages when send_alert() is given no explicit context.
    alert_context = None

    def __init__(self, **kwargs):
        """Connect to both databases and keep the keyword arguments.

        ``kwargs`` is stored as ``self.task_params`` and later merged into the
        parameters of every task. If it contains ``task_id``, only that task
        is processed.
        """
        self.db = NrcDatabase()
        self.db.connect()
        self.geodb = GeoDatabase()
        self.geodb.connect()
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        if 'task_id' in kwargs:
            self.task_id = kwargs['task_id']
        self.task_params = kwargs
        self.exception_count = 0

    def start_requests(self):
        """Return the single request to the botmaster URL for this bot."""
        url = self.botmaster_url_template.format(self.name)
        return [Request(url, callback=self.parse)]

    def parse(self, response):
        """Callback for the botmaster request; yield the items of this job.

        The response body is not inspected. What gets processed depends only
        on whether a ``task_id`` was supplied to the constructor.
        """
        if self.task_id is not None:
            self.log('Processing item %s' % self.task_id, log.INFO)
            self.alert_context = 'task_id=%s' % self.task_id
            # "or []" tolerates a process_item() that returns None instead of
            # an iterable of items.
            for item in self.process_item(self.bot_task_params(self.task_id)) or []:
                yield item
        else:
            for item in self.process_items():
                yield item

    def process_items(self):
        """Iterate over tasks that have not yet been processed.

        Sends an alert if any task of this bot is still marked as processing,
        then processes at most ``job_item_limit`` tasks and yields every item
        that ``process_item()`` produces for them.
        """
        items_processed = 0

        stuck = self.db.getBotTaskCount(self.name, self.status_processing)
        stuck = stuck[0]['count']
        if stuck > 0:
            self.send_alert(
                'ERROR: %s tasks found for bot = %s with status = %s'
                % (stuck, self.name, self.status_processing))

        tasks = []
        tasks.extend(self.db.getBotTasks(self.name))
        if self.task_conditions:
            tasks.extend(
                self.db.getBotTaskBatch(self.name, self.job_item_limit,
                                        self.status_processing,
                                        self.task_conditions))

        if not tasks:
            self.log('Found no more tasks available. Terminating job.', log.INFO)
            return

        self.log('processsing a batch of %s items' % len(tasks), log.INFO)
        for task in tasks[:self.job_item_limit]:
            task_id = task['task_id']
            self.log('processing item %s' % task_id, log.INFO)
            self.alert_context = "task_id=%s" % task_id

            # actually process the report
            for item in self.process_item(self.bot_task_params(task_id)) or []:
                yield item
            items_processed += 1

        self.log('Terminating after processing %s items' % (items_processed), log.INFO)

    def bot_task_params(self, task_id):
        """Return the parameters of a task, or the task id if there are none.

        When the database holds key/value parameters for the task, the result
        is a dict of them plus ``task_id``, overlaid with the constructor
        keyword arguments. Otherwise (no parameters, or ``task_id`` is not an
        integer string) ``task_id`` itself is returned.
        """
        try:
            task_id = int(task_id)
            rows = self.db.getBotTaskParams(self.name, task_id)
        except ValueError:
            rows = None

        if not rows:
            return task_id

        params = dict((row['key'], row['value']) for row in rows)
        params['task_id'] = task_id
        params.update(self.task_params)
        return params

    def update_task_param(self, task_id, key, value):
        """Store one key/value parameter for the given task of this bot."""
        self.db.updateBotTaskParam(self.name, task_id, key, value)

    # override this in the subclass. Remember to call self.item_completed() when you are done to
    # mark the item as completed in the db
    def process_item(self, task_id):
        """Default task handler: mark the task as done and produce no items.

        ``task_id`` is whatever ``bot_task_params()`` returned, i.e. either a
        plain task id or a params dict containing ``task_id``.
        """
        if isinstance(task_id, dict):
            # Unwrap so the database receives the id, not the whole dict.
            task_id = task_id['task_id']
        self.item_completed(task_id)
        # Callers iterate over the result, so return an empty iterable
        # rather than None.
        return []

    def item_stored(self, item, id):
        """Hook that does nothing here - override in subclass."""
        pass

    def set_item_status(self, task_id, status):
        """Set an arbitrary status on a task of this bot."""
        self.db.setBotTaskStatus(task_id, self.name, status)

    def item_new(self, task_id):
        """Mark the task with ``status_new``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_new)

    def item_completed(self, task_id):
        """Mark the task with ``status_done``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_done)

    def item_dropped(self, task_id):
        """Mark the task with ``status_dropped``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_dropped)

    def item_processing(self, task_id):
        """Mark the task with ``status_processing``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_processing)

    def _smtp_deliver(self, from_address, to_address, payload):
        """Send one message through the configured SMTP server.

        The connection is always closed, even when starttls, login or
        sendmail raises. Exceptions propagate to the caller.
        """
        server = smtplib.SMTP('%s:%s' % (settings.MAIL_HOST, settings.MAIL_PORT))
        try:
            server.starttls()
            server.login(settings.MAIL_USER, settings.MAIL_PASS)
            server.sendmail(from_address, to_address, payload)
            server.quit()
        finally:
            # No-op after a successful quit(); releases the socket on failure.
            server.close()

    def send_email(self, from_address, to_address, mime_msg):
        """Send a MIME message; SMTP errors are logged instead of raised."""
        try:
            self._smtp_deliver(from_address, to_address, mime_msg.as_string())
        except smtplib.SMTPException as e:
            self.log('Eror sending email to %s: %s' % (to_address, e), log.ERROR)

    def send_alert(self, message, context=None):
        """Log an alert and mail it from MAIL_FROM to MAIL_TO.

        ``context`` defaults to ``self.alert_context`` and, when set, is
        appended to the message. Unlike ``send_email()``, SMTP errors are
        not caught here.
        """
        # Local import so this method does not depend on the file's import block.
        from email.utils import formatdate

        subject = 'Bot Alert: %s' % self.name
        if not context:
            context = self.alert_context
        if context:
            message = "%s\nCONTEXT: %s" % (message, context)
        self.log('Sending alert: %s -- %s' % (subject, message), log.WARNING)

        # The Date header must be an RFC 5322 date-time, not a bare YYYY-MM-DD.
        header = "Date: %s\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (
            formatdate(localtime=True), settings.MAIL_FROM, settings.MAIL_TO, subject)
        self._smtp_deliver(settings.MAIL_FROM, settings.MAIL_TO, header + message)

    # used for NRC reports only
    def make_tag(self, task_id, tag, comment=None):
        """Build an NrcTag item whose ``reportnum`` is ``task_id``."""
        loader = ItemLoader(NrcTag())
        loader.add_value('reportnum', task_id)
        loader.add_value('tag', tag)
        loader.add_value('comment', comment)
        return loader.load_item()

    # used for FeedEntry Tags
    def create_tag(self, feed_entry_id, tag, comment=''):
        """Build a FeedEntryTag item for the given feed entry."""
        loader = ItemLoader(FeedEntryTag())
        loader.add_value('feed_entry_id', feed_entry_id)
        loader.add_value('tag', tag)
        loader.add_value('comment', comment)
        return loader.load_item()

    def make_bot_task_error(self, task_id, code, message=''):
        """Build a BotTaskError item for this bot."""
        loader = ItemLoader(BotTaskError())
        # Input processor for 'message': truncate each value to 1023 characters.
        loader.message_in = lambda slist: [s[:1023] for s in slist]
        loader.add_value('task_id', task_id)
        loader.add_value('bot', self.name)
        loader.add_value('code', code)
        loader.add_value('message', message)
        return loader.load_item()

    # Use this as the errback in a request if you want to log request failures
    def error_callback(self, err):
        """Log a failed HTTP request as a warning."""
        self.log('HTTP request failed %s' % (err.getErrorMessage()), log.WARNING)

    def spider_closed(self, spider):
        """Handler connected to the spider_closed signal; currently a no-op."""
        # self.log("\n" + "\n".join ( ["%s = %s" % (k,v) for k,v in sorted(self.crawler.stats.get_stats(spider=self).items())] ), log.INFO)
        pass
class NrcBot(BaseSpider):
    """Base spider for bots that work through tasks stored in the database.

    On start the bot requests its botmaster URL. The callback then either
    processes the single task given as ``task_id`` on construction, or a batch
    of tasks read from the database. Subclasses override ``process_item()``.
    """

    name = 'NrcBot'
    allowed_domains = ['skytruth.org']
    db = None
    geodb = None
    task_id = None

    # Task status values; the item_* helpers below pass them to
    # db.setBotTaskStatus().
    status_processing = 'PROCESSING'
    status_done = 'DONE'
    status_dropped = 'SKIPPED'
    status_new = 'NEW'
    status_no_data = 'NODATA'
    status_updated = 'UPDATED'

    # TODO: get this from config
    botmaster_url_template = 'http://ewn2.skytruth.org/nrc/botmaster.php?bot={0}'

    # Extra conditions handed to db.getBotTaskBatch(); when empty, no batch
    # query is made.
    task_conditions = {}

    # Default values - override in subclass
    # batch_size = 10  # number of items to process in one batch
    job_item_limit = 1000  # maximum total items to process in one job execution

    # Appended to alert messages when send_alert() is given no explicit context.
    alert_context = None

    def __init__(self, **kwargs):
        """Connect to both databases and keep the keyword arguments.

        ``kwargs`` is stored as ``self.task_params`` and later merged into the
        parameters of every task. If it contains ``task_id``, only that task
        is processed.
        """
        self.db = NrcDatabase()
        self.db.connect()
        self.geodb = GeoDatabase()
        self.geodb.connect()
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        if 'task_id' in kwargs:
            self.task_id = kwargs['task_id']
        self.task_params = kwargs
        self.exception_count = 0

    def start_requests(self):
        """Return the single request to the botmaster URL for this bot."""
        url = self.botmaster_url_template.format(self.name)
        return [Request(url, callback=self.parse)]

    def parse(self, response):
        """Callback for the botmaster request; yield the items of this job.

        The response body is not inspected. What gets processed depends only
        on whether a ``task_id`` was supplied to the constructor.
        """
        if self.task_id is not None:
            self.log('Processing item %s' % self.task_id, log.INFO)
            self.alert_context = 'task_id=%s' % self.task_id
            # "or []" tolerates a process_item() that returns None instead of
            # an iterable of items.
            for item in self.process_item(self.bot_task_params(self.task_id)) or []:
                yield item
        else:
            for item in self.process_items():
                yield item

    def process_items(self):
        """Iterate over tasks that have not yet been processed.

        Sends an alert if any task of this bot is still marked as processing,
        then processes at most ``job_item_limit`` tasks and yields every item
        that ``process_item()`` produces for them.
        """
        items_processed = 0

        stuck = self.db.getBotTaskCount(self.name, self.status_processing)
        stuck = stuck[0]['count']
        if stuck > 0:
            self.send_alert(
                'ERROR: %s tasks found for bot = %s with status = %s'
                % (stuck, self.name, self.status_processing))

        tasks = []
        tasks.extend(self.db.getBotTasks(self.name))
        if self.task_conditions:
            tasks.extend(
                self.db.getBotTaskBatch(self.name, self.job_item_limit,
                                        self.status_processing,
                                        self.task_conditions))

        if not tasks:
            self.log('Found no more tasks available. Terminating job.', log.INFO)
            return

        self.log('processsing a batch of %s items' % len(tasks), log.INFO)
        for task in tasks[:self.job_item_limit]:
            task_id = task['task_id']
            self.log('processing item %s' % task_id, log.INFO)
            self.alert_context = "task_id=%s" % task_id

            # actually process the report
            for item in self.process_item(self.bot_task_params(task_id)) or []:
                yield item
            items_processed += 1

        self.log('Terminating after processing %s items' % (items_processed), log.INFO)

    def bot_task_params(self, task_id):
        """Return the parameters of a task, or the task id if there are none.

        When the database holds key/value parameters for the task, the result
        is a dict of them plus ``task_id``, overlaid with the constructor
        keyword arguments. Otherwise (no parameters, or ``task_id`` is not an
        integer string) ``task_id`` itself is returned.
        """
        try:
            task_id = int(task_id)
            rows = self.db.getBotTaskParams(self.name, task_id)
        except ValueError:
            rows = None

        if not rows:
            return task_id

        params = dict((row['key'], row['value']) for row in rows)
        params['task_id'] = task_id
        params.update(self.task_params)
        return params

    def update_task_param(self, task_id, key, value):
        """Store one key/value parameter for the given task of this bot."""
        self.db.updateBotTaskParam(self.name, task_id, key, value)

    # override this in the subclass. Remember to call self.item_completed() when you are done to
    # mark the item as completed in the db
    def process_item(self, task_id):
        """Default task handler: mark the task as done and produce no items.

        ``task_id`` is whatever ``bot_task_params()`` returned, i.e. either a
        plain task id or a params dict containing ``task_id``.
        """
        if isinstance(task_id, dict):
            # Unwrap so the database receives the id, not the whole dict.
            task_id = task_id['task_id']
        self.item_completed(task_id)
        # Callers iterate over the result, so return an empty iterable
        # rather than None.
        return []

    def item_stored(self, item, id):
        """Hook that does nothing here - override in subclass."""
        pass

    def set_item_status(self, task_id, status):
        """Set an arbitrary status on a task of this bot."""
        self.db.setBotTaskStatus(task_id, self.name, status)

    def item_new(self, task_id):
        """Mark the task with ``status_new``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_new)

    def item_updated(self, task_id):
        """Mark the task with ``status_updated``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_updated)

    def item_completed(self, task_id):
        """Mark the task with ``status_done``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_done)

    def item_dropped(self, task_id):
        """Mark the task with ``status_dropped``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_dropped)

    def item_processing(self, task_id):
        """Mark the task with ``status_processing``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_processing)

    def _smtp_deliver(self, from_address, to_address, payload):
        """Send one message through the configured SMTP server.

        The connection is always closed, even when starttls, login or
        sendmail raises. Exceptions propagate to the caller.
        """
        server = smtplib.SMTP('%s:%s' % (settings.MAIL_HOST, settings.MAIL_PORT))
        try:
            server.starttls()
            server.login(settings.MAIL_USER, settings.MAIL_PASS)
            server.sendmail(from_address, to_address, payload)
            server.quit()
        finally:
            # No-op after a successful quit(); releases the socket on failure.
            server.close()

    def send_email(self, from_address, to_address, mime_msg):
        """Send a MIME message; SMTP errors are logged instead of raised."""
        try:
            self._smtp_deliver(from_address, to_address, mime_msg.as_string())
        except smtplib.SMTPException as e:
            self.log('Eror sending email to %s: %s' % (to_address, e), log.ERROR)

    def send_alert(self, message, context=None):
        """Log an alert and mail it from MAIL_FROM to MAIL_TO.

        ``context`` defaults to ``self.alert_context`` and, when set, is
        appended to the message. Unlike ``send_email()``, SMTP errors are
        not caught here.
        """
        # Local import so this method does not depend on the file's import block.
        from email.utils import formatdate

        subject = 'Bot Alert: %s' % self.name
        if not context:
            context = self.alert_context
        if context:
            message = "%s\nCONTEXT: %s" % (message, context)
        self.log('Sending alert: %s -- %s' % (subject, message), log.WARNING)

        # The Date header must be an RFC 5322 date-time, not a bare YYYY-MM-DD.
        header = "Date: %s\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (
            formatdate(localtime=True), settings.MAIL_FROM, settings.MAIL_TO, subject)
        self._smtp_deliver(settings.MAIL_FROM, settings.MAIL_TO, header + message)

    # used for NRC reports only
    def make_tag(self, task_id, tag, comment=None):
        """Build an NrcTag item whose ``reportnum`` is ``task_id``."""
        loader = ItemLoader(NrcTag())
        loader.add_value('reportnum', task_id)
        loader.add_value('tag', tag)
        loader.add_value('comment', comment)
        return loader.load_item()

    # used for FeedEntry Tags
    def create_tag(self, feed_entry_id, tag, comment=''):
        """Build a FeedEntryTag item for the given feed entry."""
        loader = ItemLoader(FeedEntryTag())
        loader.add_value('feed_entry_id', feed_entry_id)
        loader.add_value('tag', tag)
        loader.add_value('comment', comment)
        return loader.load_item()

    def make_bot_task_error(self, task_id, code, message=''):
        """Build a BotTaskError item for this bot."""
        loader = ItemLoader(BotTaskError())
        # Input processor for 'message': truncate each value to 1023 characters.
        loader.message_in = lambda slist: [s[:1023] for s in slist]
        loader.add_value('task_id', task_id)
        loader.add_value('bot', self.name)
        loader.add_value('code', code)
        loader.add_value('message', message)
        return loader.load_item()

    # Use this as the errback in a request if you want to log request failures
    def error_callback(self, err):
        """Log a failed HTTP request as a warning."""
        self.log('HTTP request failed %s' % (err.getErrorMessage()), log.WARNING)

    def spider_closed(self, spider):
        """Handler connected to the spider_closed signal; currently a no-op."""
        # self.log("\n" + "\n".join ( ["%s = %s" % (k,v) for k,v in sorted(self.crawler.stats.get_stats(spider=self).items())] ), log.INFO)
        pass
class NrcScraper(BaseSpider):
    """Spider that searches the NRC site for incident reports in a date range.

    The date range is derived from the constructor keyword arguments and the
    database (see ``__init__``). Search result rows are loaded into
    ``NrcScrapedReport`` items; result pages are followed via the pagination
    link.
    """

    name = "NrcScraper"
    allowed_domains = ["nrc.uscg.mil", "maps.googleapis.com"]
    # old url: "http://www.nrc.uscg.mil/apex/f?p=109"
    start_urls = ["http://www.nrc.uscg.mil/pls/apex/f?p=109:1:0:::::"]
    db = None

    def __init__(self, **kwargs):
        """Connect to the database and work out the date range to request.

        Recognised keyword arguments:
          enddate   -- 'YYYY-MM-DD'; defaults to today.
          startdate -- 'YYYY-MM-DD'; defaults to ten days before the latest
                       report date in the database, or ten days before
                       enddate when the database reports none.
          target    -- passed to db.getNextNrcScraperTarget(); if that
                       returns a row, its startdate/enddate replace the
                       values above.
        """
        self.db = NrcDatabase()
        self.db.connect()
        self.exception_count = 0

        self.enddate = date.today()
        if 'enddate' in kwargs:
            self.enddate = datetime.strptime(kwargs['enddate'], '%Y-%m-%d')

        interval = timedelta(days=10)  # ten days
        last_report_dt = self.db.latestReportDate()
        if last_report_dt:
            self.startdate = last_report_dt - interval
        else:
            self.startdate = self.enddate - interval
        if 'startdate' in kwargs:
            self.startdate = datetime.strptime(kwargs['startdate'], '%Y-%m-%d')

        if 'target' in kwargs:
            # ignore default date range and get it from the schedule table in the database instead
            target = self.db.getNextNrcScraperTarget(kwargs['target'])
            if target:
                self.startdate = target["startdate"]
                self.enddate = target["enddate"]

    def parse(self, response):
        """Submit the search form for the configured date range."""
        startdate = self.startdate
        enddate = self.enddate
        self.log('Requesting NRC Incident reports from {0} to {1}'.format(startdate, enddate), log.INFO)

        request = FormRequest.from_response(
            response,
            formdata={
                'p_t04': startdate.strftime("%d-%b-%y"),
                'p_t05': enddate.strftime("%d-%b-%y"),
                'p_t14': '100',
                'p_request': 'Go',
            },
            callback=self.search_results)
        return [request]

    def search_results(self, response):
        """Yield an item for every new report row, then follow the next page.

        Rows whose report number already exists in the database are skipped.
        After a new item has been yielded, its report number is marked 'DONE'
        for this bot.
        """
        text = unicode(response.body, response.encoding)
        hxs = HtmlXPathSelector(text=text)
        reports = hxs.select('//table[@class="t16Standard"]/tr')

        if len(reports) == 0:
            self.log('Incident report data not present in response', log.ERROR)
        else:
            # Skip the first report record because this is the header row
            reports.pop(0)
            if len(reports) == 0:
                self.log('No incident reports found in response', log.WARNING)
            else:
                self.log('Retrieved {0} incident reports'.format(len(reports)), log.INFO)

        for report in reports:
            loader = XPathItemLoader(NrcScrapedReport(), report)
            loader.context['base_url'] = response.url
            for name, params in NrcScrapedReport.fields.items():
                loader.add_xpath(name, params['xpath'])
            item = loader.load_item()

            if self.db.reportExists(item['reportnum']):
                self.log('Report {0} already exists. Skipping to next report.'.format(item['reportnum']), log.INFO)
                continue

            # NOTE: downloading the full report page (item['full_report_url'])
            # and the materials page (item['materials_url']) is disabled, so
            # parse_full_report() and parse_materials() are not scheduled from
            # here. The disabled code requested each page only when
            # db.fullReportExists() / db.materialExists() reported it missing.
            yield item
            self.db.setBotTaskStatus(item['reportnum'], self.name, 'DONE')

        # get next page of results
        next_links = hxs.select('//td[@class="pagination"][4]/a/@href')
        if len(next_links) > 0:
            yield Request(urljoin(response.url, next_links[0].extract()),
                          callback=self.search_results)

    def parse_full_report(self, response):
        """Yield an NrcScrapedFullReport item for a full report page."""
        # need to work around weird bug where lxml can't handle encode=WINDOWS-1252
        # so pull out the body, convert to utf-8 and create a new TextResponse object to contain it
        # since XPathItemLoader requires a Response object
        text = unicode(response.body, response.encoding)
        utf8_response = TextResponse(url=response.url, body=text.encode('utf-8'), encoding='utf-8')

        loader = XPathItemLoader(NrcScrapedFullReport(), response=utf8_response)
        url_parts = urlsplit(response.url)
        loader.add_value('reportnum', parse_qs(url_parts.query)['standard_web inc_seq'])
        loader.add_xpath('full_report_body', '//body')
        loader.add_value('full_report_url', response.url)
        item = loader.load_item()

        reportnum = item['reportnum']
        yield item
        self.db.setBotTaskStatus(reportnum, self.name, 'DONE')

    def parse_materials(self, response):
        """Yield one NrcScrapedMaterial item per row of a materials page.

        The report number is taken from the ``P3_SEQNOS:<digits>`` part of
        the response URL.
        """
        text = unicode(response.body, response.encoding)
        hxs = HtmlXPathSelector(text=text)
        materials = hxs.select('//table[@class="t16Standard"]/tr')

        if len(materials) == 0:
            self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
        else:
            # Skip the first report record because this is the header row
            materials.pop(0)
            if len(materials) == 0:
                self.log('No materials records found in response from {0}'.format(response.url), log.INFO)
            else:
                self.log('Retrieved {0} materials records'.format(len(materials)), log.INFO)

        for material in materials:
            loader = XPathItemLoader(NrcScrapedMaterial(), material)
            loader.add_value('reportnum', response.url, TakeFirst(), re=r'P3_SEQNOS:(\d+)')
            for name, params in NrcScrapedMaterial.fields.items():
                # Only fields that declare an xpath are read from the row.
                if 'xpath' in params:
                    loader.add_xpath(name, params['xpath'])
            yield loader.load_item()

    def item_stored(self, item, id):
        """Hook that does nothing for this spider."""
        return