Beispiel #1
0
 def __init__(self, **kwargs):
     """Connect to both databases and record the task parameters.

     All keyword arguments are stored as ``self.task_params``.  When a
     ``task_id`` keyword is supplied it is also stored as ``self.task_id``.
     """
     # NRC database connection.
     self.db = NrcDatabase()
     self.db.connect()

     # Geo database connection.
     self.geodb = GeoDatabase()
     self.geodb.connect()

     # Have spider_closed() called when the spider_closed signal is sent.
     dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

     # Only touch self.task_id when the caller actually passed one.
     try:
         self.task_id = kwargs['task_id']
     except KeyError:
         pass

     self.task_params = kwargs
     self.exception_count = 0
Beispiel #2
0
 def __init__(self, **kwargs):
     """Set up database connections and remember the keyword arguments.

     The keyword arguments are kept as ``self.task_params``; a ``task_id``
     keyword, if present, is additionally stored as ``self.task_id``.
     """
     # Connect to the NRC database first, then the geo database.
     self.db = NrcDatabase()
     self.db.connect()
     self.geodb = GeoDatabase()
     self.geodb.connect()

     # Register spider_closed() as the handler for the spider_closed signal.
     dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

     # Leave self.task_id alone unless a task_id keyword was given.
     try:
         self.task_id = kwargs['task_id']
     except KeyError:
         pass

     self.task_params = kwargs
     self.exception_count = 0
Beispiel #3
0
class NrcBot(BaseSpider):
    """Base spider for bots that work through tasks stored in the database.

    The spider requests the botmaster URL for this bot.  From the ``parse``
    callback it either processes the single task passed as the ``task_id``
    keyword argument, or iterates over the tasks returned by the database.
    Subclasses override ``process_item`` to do the actual work.
    """

    name = 'NrcBot'
    allowed_domains = ['skytruth.org']
    db = None
    geodb = None
    task_id = None

    # Status values written through db.setBotTaskStatus().
    status_processing = 'PROCESSING'
    status_done = 'DONE'
    status_dropped = 'SKIPPED'
    status_new = 'NEW'
    status_no_data = 'NODATA'
    status_updated = 'UPDATED'

    #TODO: get this from config
    botmaster_url_template = 'http://ewn2.skytruth.org/nrc/botmaster.php?bot={0}'

    # Extra conditions passed to db.getBotTaskBatch(); an empty dict means
    # no batch query is made.
    task_conditions = {}

    # Default values - override in subclass
#    batch_size = 10  # number of items to process in one batch
    job_item_limit = 1000  # maximum total items to process in one job execution

    # Text appended to alerts as "CONTEXT: ..." when no explicit context is given.
    alert_context = None

    def __init__(self, **kwargs):
        """Connect to both databases and record the task parameters.

        All keyword arguments are stored as ``self.task_params``.  When a
        ``task_id`` keyword is supplied it is also stored as ``self.task_id``.
        """
        self.db = NrcDatabase()
        self.db.connect()
        self.geodb = GeoDatabase()
        self.geodb.connect()
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        if 'task_id' in kwargs:
            self.task_id = kwargs['task_id']
        self.task_params = kwargs
        self.exception_count = 0

    def start_requests(self):
        """Return a single request for this bot's botmaster URL."""
        url = self.botmaster_url_template.format(self.name)
        return [Request(url, callback=self.parse)]

    # Parse execution params from botmaster
    def parse(self, response):
        """Yield the items produced for one task or for all pending tasks.

        When ``self.task_id`` is set only that task is processed; otherwise
        the work is delegated to ``process_items``.
        """
        # NOTE(review): the extracted <name> value is not used anywhere in
        # this method; kept so the response is still run through the selector.
        hxs = XmlXPathSelector(response)
        name = hxs.select('//name').extract()

        if self.task_id is not None:
            self.log('Processing item %s' % self.task_id, log.INFO)
            self.alert_context = 'task_id=%s' % self.task_id
            for item in self.process_item(self.bot_task_params(self.task_id)):
                yield item
        else:
            for item in self.process_items():
                yield item

    # iterate over records that have not yet been processed
    def process_items(self):
        """Yield the items produced for each task fetched from the database.

        An alert is sent first if any task of this bot is still marked with
        the processing status.  At most ``job_item_limit`` tasks are handled.
        """
        items_processed = 0
        rows = self.db.getBotTaskCount(self.name, self.status_processing)
        stuck = rows[0]['count']
        if stuck > 0:
            self.send_alert('ERROR: %s tasks found for bot = %s with status = %s' % (stuck, self.name, self.status_processing))

        tasks = list(self.db.getBotTasks(self.name))

        if self.task_conditions:
            tasks.extend(self.db.getBotTaskBatch(self.name, self.job_item_limit, self.status_processing, self.task_conditions))

        if not tasks:
            self.log('Found no more tasks available.  Terminating job.', log.INFO)
            return

        self.log('processsing a batch of %s items' % len(tasks), log.INFO)

        for task in tasks[:self.job_item_limit]:
            task_id = task['task_id']

            self.log('processing item %s' % task_id, log.INFO)

            self.alert_context = "task_id=%s" % task_id

            # actually process the report
            for item in self.process_item(self.bot_task_params(task_id)):
                yield item

            items_processed += 1

        self.log('Terminating after processing %s items' % (items_processed), log.INFO)

    def bot_task_params(self, task_id):
        """Return the parameters stored for ``task_id``, or the id itself.

        When ``task_id`` converts to an int and the database returns
        parameter rows, the result is a dict built from the rows' ``key`` and
        ``value`` entries, plus ``'task_id'``, updated with the spider's own
        keyword arguments.  Otherwise the task id is returned unchanged
        (as an int when the conversion succeeded).
        """
        try:
            task_id = int(task_id)
            params = self.db.getBotTaskParams(self.name, task_id)
        except ValueError:
            # Non-numeric task ids have no stored parameters.
            params = None

        if params:
            params = dict((p['key'], p['value']) for p in params)
            params['task_id'] = task_id
            params.update(self.task_params)
            return params
        return task_id

    def update_task_param(self, task_id, key, value):
        """Store one parameter value for the given task of this bot."""
        self.db.updateBotTaskParam(self.name, task_id, key, value)

    # override this in the subclass.  Remember to call self.item_completed() when you are done to
    # mark the item as completed in the db
    def process_item(self, task_id):
        """Default task handler: mark the task as done and produce no items.

        ``task_id`` is whatever ``bot_task_params`` returned, i.e. either a
        parameter dict containing ``'task_id'`` or the bare task id.

        Returns an empty list so that ``parse`` and ``process_items`` can
        iterate over the result.
        """
        if isinstance(task_id, dict):
            # bot_task_params() hands over a dict when parameters exist;
            # the status update needs the id, not the whole dict.
            task_id = task_id['task_id']
        self.item_completed(task_id)
        return []

    # does nothing - override in subclass
    def item_stored(self, item, id):
        """Hook with no default behaviour; override in a subclass."""
        pass

    def set_item_status(self, task_id, status):
        """Set an arbitrary status on the given task of this bot."""
        self.db.setBotTaskStatus(task_id, self.name, status)

    def item_new(self, task_id):
        """Set the task status to ``status_new``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_new)

    def item_updated(self, task_id):
        """Set the task status to ``status_updated``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_updated)

    def item_completed(self, task_id):
        """Set the task status to ``status_done``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_done)

    def item_dropped(self, task_id):
        """Set the task status to ``status_dropped``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_dropped)

    def item_processing(self, task_id):
        """Set the task status to ``status_processing``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_processing)

    def _smtp_send(self, from_address, to_address, payload):
        """Send ``payload`` over a STARTTLS session using the mail settings.

        The connection is shut down even when a step fails.  Errors raised
        while sending are propagated to the caller.
        """
        server = smtplib.SMTP('%s:%s' % (settings.MAIL_HOST, settings.MAIL_PORT))
        try:
            server.starttls()
            server.login(settings.MAIL_USER, settings.MAIL_PASS)
            server.sendmail(from_address, to_address, payload)
        finally:
            try:
                server.quit()
            except smtplib.SMTPServerDisconnected:
                # The session is already gone; just release the socket.
                server.close()

    def send_email(self, from_address, to_address, mime_msg):
        """Send a MIME message; SMTP errors are logged instead of raised."""
        try:
            self._smtp_send(from_address, to_address, mime_msg.as_string())
        except smtplib.SMTPException as e:
            self.log('Eror sending email to %s: %s' % (to_address, e), log.ERROR)

    def send_alert(self, message, context=None):
        """Log an alert and mail it to ``settings.MAIL_TO``.

        ``context`` defaults to ``self.alert_context``; when a context is
        available it is appended to the message.  Unlike ``send_email``,
        SMTP errors are not caught here.
        """
        # Local import: only this method needs it.
        from email.utils import formatdate

        subject = 'Bot Alert: %s' % self.name
        if not context:
            context = self.alert_context
        if context:
            message = "%s\nCONTEXT: %s" % (message, context)
        self.log('Sending alert: %s -- %s' % (subject, message), log.WARNING)

        # RFC 5322 date-time for the Date header.
        senddate = formatdate(localtime=True)

        header = "Date: %s\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (senddate, settings.MAIL_FROM, settings.MAIL_TO, subject)

        self._smtp_send(settings.MAIL_FROM, settings.MAIL_TO, header + message)

    # used for NRC reports only
    def make_tag(self, task_id, tag, comment=None):
        """Build an ``NrcTag`` item for the given report number."""
        t = ItemLoader(NrcTag())
        t.add_value('reportnum', task_id)
        t.add_value('tag', tag)
        t.add_value('comment', comment)
        return t.load_item()

    # used for FeedEntry Tags
    def create_tag(self, feed_entry_id, tag, comment=''):
        """Build a ``FeedEntryTag`` item for the given feed entry."""
        l = ItemLoader(FeedEntryTag())
        l.add_value('feed_entry_id', feed_entry_id)
        l.add_value('tag', tag)
        l.add_value('comment', comment)
        return l.load_item()

    def make_bot_task_error(self, task_id, code, message=''):
        """Build a ``BotTaskError`` item for this bot.

        Each message value is cut to at most 1023 characters.
        """
        t = ItemLoader(BotTaskError())
        t.message_in = lambda slist: [s[:1023] for s in slist]
        t.add_value('task_id', task_id)
        t.add_value('bot', self.name)
        t.add_value('code', code)
        t.add_value('message', message)
        return t.load_item()

    # Use this as the errback in a request if you want to log request failures
    def error_callback(self, err):
        """Log the error message of a failed HTTP request as a warning."""
        self.log('HTTP request failed %s' % (err.getErrorMessage()), log.WARNING)

    def spider_closed(self, spider):
        """Handler connected to the spider_closed signal; does nothing."""
#        self.log("\n" + "\n".join ( ["%s = %s" % (k,v) for k,v in sorted(self.crawler.stats.get_stats(spider=self).items())] ), log.INFO)
        pass
Beispiel #4
0
class NrcBot(BaseSpider):
    """Base spider for bots that work through tasks stored in the database.

    The spider requests the botmaster URL for this bot.  From the ``parse``
    callback it either processes the single task passed as the ``task_id``
    keyword argument, or iterates over the tasks returned by the database.
    Subclasses override ``process_item`` to do the actual work.
    """

    name = 'NrcBot'
    allowed_domains = ['skytruth.org']
    db = None
    geodb = None
    task_id = None

    # Status values written through db.setBotTaskStatus().
    status_processing = 'PROCESSING'
    status_done = 'DONE'
    status_dropped = 'SKIPPED'
    status_new = 'NEW'
    status_no_data = 'NODATA'

    #TODO: get this from config
    botmaster_url_template = 'http://ewn2.skytruth.org/nrc/botmaster.php?bot={0}'

    # Extra conditions passed to db.getBotTaskBatch(); an empty dict means
    # no batch query is made.
    task_conditions = {}

    # Default values - override in subclass
    #    batch_size = 10  # number of items to process in one batch
    job_item_limit = 1000  # maximum total items to process in one job execution

    # Text appended to alerts as "CONTEXT: ..." when no explicit context is given.
    alert_context = None

    def __init__(self, **kwargs):
        """Connect to both databases and record the task parameters.

        All keyword arguments are stored as ``self.task_params``.  When a
        ``task_id`` keyword is supplied it is also stored as ``self.task_id``.
        """
        self.db = NrcDatabase()
        self.db.connect()
        self.geodb = GeoDatabase()
        self.geodb.connect()
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        if 'task_id' in kwargs:
            self.task_id = kwargs['task_id']
        self.task_params = kwargs
        self.exception_count = 0

    def start_requests(self):
        """Return a single request for this bot's botmaster URL."""
        url = self.botmaster_url_template.format(self.name)
        return [Request(url, callback=self.parse)]

    # Parse execution params from botmaster
    def parse(self, response):
        """Yield the items produced for one task or for all pending tasks.

        When ``self.task_id`` is set only that task is processed; otherwise
        the work is delegated to ``process_items``.
        """
        # NOTE(review): the extracted <name> value is not used anywhere in
        # this method; kept so the response is still run through the selector.
        hxs = XmlXPathSelector(response)
        name = hxs.select('//name').extract()

        if self.task_id is not None:
            self.log('Processing item %s' % self.task_id, log.INFO)
            self.alert_context = 'task_id=%s' % self.task_id
            for item in self.process_item(self.bot_task_params(self.task_id)):
                yield item
        else:
            for item in self.process_items():
                yield item

    # iterate over records that have not yet been processed
    def process_items(self):
        """Yield the items produced for each task fetched from the database.

        An alert is sent first if any task of this bot is still marked with
        the processing status.  At most ``job_item_limit`` tasks are handled.
        """
        items_processed = 0
        rows = self.db.getBotTaskCount(self.name, self.status_processing)
        stuck = rows[0]['count']
        if stuck > 0:
            self.send_alert(
                'ERROR: %s tasks found for bot = %s with status = %s' %
                (stuck, self.name, self.status_processing))

        tasks = list(self.db.getBotTasks(self.name))

        if self.task_conditions:
            tasks.extend(
                self.db.getBotTaskBatch(self.name, self.job_item_limit,
                                        self.status_processing,
                                        self.task_conditions))

        if not tasks:
            self.log('Found no more tasks available.  Terminating job.',
                     log.INFO)
            return

        self.log('processsing a batch of %s items' % len(tasks), log.INFO)

        for task in tasks[:self.job_item_limit]:
            task_id = task['task_id']

            self.log('processing item %s' % task_id, log.INFO)

            self.alert_context = "task_id=%s" % task_id

            # actually process the report
            for item in self.process_item(self.bot_task_params(task_id)):
                yield item

            items_processed += 1

        self.log('Terminating after processing %s items' % (items_processed),
                 log.INFO)

    def bot_task_params(self, task_id):
        """Return the parameters stored for ``task_id``, or the id itself.

        When ``task_id`` converts to an int and the database returns
        parameter rows, the result is a dict built from the rows' ``key`` and
        ``value`` entries, plus ``'task_id'``, updated with the spider's own
        keyword arguments.  Otherwise the task id is returned unchanged
        (as an int when the conversion succeeded).
        """
        try:
            task_id = int(task_id)
            params = self.db.getBotTaskParams(self.name, task_id)
        except ValueError:
            # Non-numeric task ids have no stored parameters.
            params = None

        if params:
            params = dict((p['key'], p['value']) for p in params)
            params['task_id'] = task_id
            params.update(self.task_params)
            return params
        return task_id

    def update_task_param(self, task_id, key, value):
        """Store one parameter value for the given task of this bot."""
        self.db.updateBotTaskParam(self.name, task_id, key, value)

    # override this in the subclass.  Remember to call self.item_completed() when you are done to
    # mark the item as completed in the db
    def process_item(self, task_id):
        """Default task handler: mark the task as done and produce no items.

        ``task_id`` is whatever ``bot_task_params`` returned, i.e. either a
        parameter dict containing ``'task_id'`` or the bare task id.

        Returns an empty list so that ``parse`` and ``process_items`` can
        iterate over the result.
        """
        if isinstance(task_id, dict):
            # bot_task_params() hands over a dict when parameters exist;
            # the status update needs the id, not the whole dict.
            task_id = task_id['task_id']
        self.item_completed(task_id)
        return []

    # does nothing - override in subclass
    def item_stored(self, item, id):
        """Hook with no default behaviour; override in a subclass."""
        pass

    def set_item_status(self, task_id, status):
        """Set an arbitrary status on the given task of this bot."""
        self.db.setBotTaskStatus(task_id, self.name, status)

    def item_new(self, task_id):
        """Set the task status to ``status_new``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_new)

    def item_completed(self, task_id):
        """Set the task status to ``status_done``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_done)

    def item_dropped(self, task_id):
        """Set the task status to ``status_dropped``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_dropped)

    def item_processing(self, task_id):
        """Set the task status to ``status_processing``."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_processing)

    def _smtp_send(self, from_address, to_address, payload):
        """Send ``payload`` over a STARTTLS session using the mail settings.

        The connection is shut down even when a step fails.  Errors raised
        while sending are propagated to the caller.
        """
        server = smtplib.SMTP('%s:%s' %
                              (settings.MAIL_HOST, settings.MAIL_PORT))
        try:
            server.starttls()
            server.login(settings.MAIL_USER, settings.MAIL_PASS)
            server.sendmail(from_address, to_address, payload)
        finally:
            try:
                server.quit()
            except smtplib.SMTPServerDisconnected:
                # The session is already gone; just release the socket.
                server.close()

    def send_email(self, from_address, to_address, mime_msg):
        """Send a MIME message; SMTP errors are logged instead of raised."""
        try:
            self._smtp_send(from_address, to_address, mime_msg.as_string())
        except smtplib.SMTPException as e:
            self.log('Eror sending email to %s: %s' % (to_address, e),
                     log.ERROR)

    def send_alert(self, message, context=None):
        """Log an alert and mail it to ``settings.MAIL_TO``.

        ``context`` defaults to ``self.alert_context``; when a context is
        available it is appended to the message.  Unlike ``send_email``,
        SMTP errors are not caught here.
        """
        # Local import: only this method needs it.
        from email.utils import formatdate

        subject = 'Bot Alert: %s' % self.name
        if not context:
            context = self.alert_context
        if context:
            message = "%s\nCONTEXT: %s" % (message, context)
        self.log('Sending alert: %s -- %s' % (subject, message), log.WARNING)

        # RFC 5322 date-time for the Date header.
        senddate = formatdate(localtime=True)

        header = "Date: %s\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (
            senddate, settings.MAIL_FROM, settings.MAIL_TO, subject)

        self._smtp_send(settings.MAIL_FROM, settings.MAIL_TO, header + message)

    # used for NRC reports only
    def make_tag(self, task_id, tag, comment=None):
        """Build an ``NrcTag`` item for the given report number."""
        t = ItemLoader(NrcTag())
        t.add_value('reportnum', task_id)
        t.add_value('tag', tag)
        t.add_value('comment', comment)
        return t.load_item()

    # used for FeedEntry Tags
    def create_tag(self, feed_entry_id, tag, comment=''):
        """Build a ``FeedEntryTag`` item for the given feed entry."""
        l = ItemLoader(FeedEntryTag())
        l.add_value('feed_entry_id', feed_entry_id)
        l.add_value('tag', tag)
        l.add_value('comment', comment)
        return l.load_item()

    def make_bot_task_error(self, task_id, code, message=''):
        """Build a ``BotTaskError`` item for this bot.

        Each message value is cut to at most 1023 characters.
        """
        t = ItemLoader(BotTaskError())
        t.message_in = lambda slist: [s[:1023] for s in slist]
        t.add_value('task_id', task_id)
        t.add_value('bot', self.name)
        t.add_value('code', code)
        t.add_value('message', message)
        return t.load_item()

    # Use this as the errback in a request if you want to log request failures
    def error_callback(self, err):
        """Log the error message of a failed HTTP request as a warning."""
        self.log('HTTP request failed %s' % (err.getErrorMessage()),
                 log.WARNING)

    def spider_closed(self, spider):
        """Handler connected to the spider_closed signal; does nothing."""
        #        self.log("\n" + "\n".join ( ["%s = %s" % (k,v) for k,v in sorted(self.crawler.stats.get_stats(spider=self).items())] ), log.INFO)
        pass