Esempio n. 1
0
 def __init__(self, **kwargs):
     """Open both database connections and remember the task arguments.

     All keyword arguments are kept verbatim in ``self.task_params``.  When
     a ``task_id`` keyword is supplied it is also stored in ``self.task_id``.
     """
     self.db = NrcDatabase()
     self.db.connect()
     self.geodb = GeoDatabase()
     self.geodb.connect()

     # Have spider_closed() called when the spider_closed signal is sent.
     dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

     try:
         self.task_id = kwargs['task_id']
     except KeyError:
         # No task_id argument given: self.task_id is left untouched.
         pass

     self.task_params = kwargs
     self.exception_count = 0
Esempio n. 2
0
 def __init__(self, **kwargs):
     """Open both database connections and remember the task arguments.

     All keyword arguments are kept verbatim in ``self.task_params``.  When
     a ``task_id`` keyword is supplied it is also stored in ``self.task_id``.
     """
     self.db = NrcDatabase()
     self.db.connect()
     self.geodb = GeoDatabase()
     self.geodb.connect()

     # Have spider_closed() called when the spider_closed signal is sent.
     dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

     try:
         self.task_id = kwargs['task_id']
     except KeyError:
         # No task_id argument given: self.task_id is left untouched.
         pass

     self.task_params = kwargs
     self.exception_count = 0
Esempio n. 3
0
    def __init__(self, **kwargs):
        """Connect to the NRC database and work out the date range to scrape.

        Recognised keyword arguments (all optional):
          enddate   -- 'YYYY-MM-DD' string; defaults to today's date.
          startdate -- 'YYYY-MM-DD' string; defaults to ten days before the
                       latest report date returned by the database, or ten
                       days before the end date when the database returns
                       nothing.
          target    -- passed to getNextNrcScraperTarget(); when that returns
                       a truthy row, its "startdate" and "enddate" values
                       replace both dates.
        """
        date_format = '%Y-%m-%d'
        lookback = timedelta(days=10)  # ten days

        self.db = NrcDatabase()
        self.db.connect()
        self.exception_count = 0

        if 'enddate' in kwargs:
            self.enddate = datetime.strptime(kwargs['enddate'], date_format)
        else:
            self.enddate = date.today()

        # Start ten days before the newest stored report, falling back to
        # ten days before the end date.
        latest_report = self.db.latestReportDate()
        anchor = latest_report if latest_report else self.enddate
        self.startdate = anchor - lookback
        if 'startdate' in kwargs:
            self.startdate = datetime.strptime(kwargs['startdate'], date_format)

        if 'target' in kwargs:
            # ignore default date range and get it from the schedule table
            # in the database instead
            scheduled = self.db.getNextNrcScraperTarget(kwargs['target'])
            if scheduled:
                self.startdate = scheduled["startdate"]
                self.enddate = scheduled["enddate"]
Esempio n. 4
0
class NrcBot(BaseSpider):
    """Base spider for bots that work through tasks stored in the database.

    A run first requests the botmaster URL for this bot.  It then processes
    either the single task given by the ``task_id`` spider argument or the
    tasks returned by the database.  Subclasses override ``process_item``
    (and optionally ``item_stored``) and may set ``task_conditions`` and
    ``job_item_limit``.
    """
    name = 'NrcBot'
    allowed_domains = ['skytruth.org']
    db = None
    geodb = None
    task_id = None

    # Status values written for a task through setBotTaskStatus()
    status_processing = 'PROCESSING'
    status_done = 'DONE'
    status_dropped = 'SKIPPED'
    status_new = 'NEW'
    status_no_data = 'NODATA'
    status_updated = 'UPDATED'

    #TODO: get this from config
    botmaster_url_template = 'http://ewn2.skytruth.org/nrc/botmaster.php?bot={0}'
    # Conditions handed to getBotTaskBatch(); while empty, no batch is fetched
    task_conditions = {}

    # Default values - override in subclass
    # batch_size = 10  # number of items to process in one batch
    job_item_limit = 1000  # maximum total items to process in one job execution

    # Appended to alert messages when send_alert() gets no explicit context
    alert_context = None

    def __init__(self, **kwargs):
        """Open both database connections and remember the task arguments.

        All keyword arguments are kept verbatim in ``self.task_params``.
        When a ``task_id`` keyword is supplied it is also stored in
        ``self.task_id``.
        """
        self.db = NrcDatabase()
        self.db.connect()
        self.geodb = GeoDatabase()
        self.geodb.connect()
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        if 'task_id' in kwargs:
            self.task_id = kwargs['task_id']
        self.task_params = kwargs
        self.exception_count = 0

    def start_requests(self):
        """Return the single initial request: the botmaster URL for this bot."""
        url = self.botmaster_url_template.format(self.name)
        return [Request(url, callback=self.parse)]

    # Parse execution params from botmaster
    def parse(self, response):
        """Handle the botmaster response and yield the items for this run.

        With ``self.task_id`` set, only that task is processed; otherwise
        the tasks found by ``process_items`` are processed.
        """
        # NOTE: the <name> element is extracted but its value is not used.
        hxs = XmlXPathSelector(response)
        name = hxs.select('//name').extract()

        if self.task_id is not None:
            self.log('Processing item %s' % self.task_id, log.INFO)
            self.alert_context = 'task_id=%s' % self.task_id
            items = self.process_item(self.bot_task_params(self.task_id))
            # Tolerate a process_item() that returns None instead of items
            for item in items or []:
                yield item
        else:
            for item in self.process_items():
                yield item

    # iterate over records that have not yet been processed
    def process_items(self):
        """Yield the items produced for every pending task of this bot.

        Sends an alert first when tasks with status PROCESSING already
        exist.  At most ``job_item_limit`` tasks are handled per call.
        """
        stuck = self.db.getBotTaskCount(self.name, self.status_processing)
        stuck = stuck[0]['count']
        if stuck > 0:
            self.send_alert(
                'ERROR: %s tasks found for bot = %s with status = %s'
                % (stuck, self.name, self.status_processing))

        tasks = []
        tasks.extend(self.db.getBotTasks(self.name))
        if self.task_conditions:
            tasks.extend(self.db.getBotTaskBatch(
                self.name, self.job_item_limit, self.status_processing,
                self.task_conditions))

        if not tasks:
            self.log('Found no more tasks available.  Terminating job.', log.INFO)
            return
        self.log('processsing a batch of %s items' % len(tasks), log.INFO)

        items_processed = 0
        for task in tasks[:self.job_item_limit]:
            task_id = task['task_id']
            self.log('processing item %s' % task_id, log.INFO)
            self.alert_context = "task_id=%s" % task_id

            # actually process the report
            items = self.process_item(self.bot_task_params(task_id))
            # Tolerate a process_item() that returns None instead of items
            for item in items or []:
                yield item

            items_processed += 1

        self.log('Terminating after processing %s items' % (items_processed), log.INFO)

    def bot_task_params(self, task_id):
        """Return the parameters stored for a task, or the bare task id.

        When the database holds parameters for the task, the result is a
        dict of their key/value pairs plus ``'task_id'``, updated with the
        spider arguments in ``self.task_params``.  Otherwise, including
        when a ValueError is raised while converting or looking up the id,
        ``task_id`` itself is returned.
        """
        try:
            task_id = int(task_id)
            params = self.db.getBotTaskParams(self.name, task_id)
        except ValueError:
            params = None

        if not params:
            return task_id

        task = dict((p['key'], p['value']) for p in params)
        task['task_id'] = task_id
        # Spider arguments take precedence over the stored parameters
        task.update(self.task_params)
        return task

    def update_task_param(self, task_id, key, value):
        """Store one parameter value for a task of this bot."""
        self.db.updateBotTaskParam(self.name, task_id, key, value)

    # override this in the subclass.  Remember to call self.item_completed() when you are done to
    # mark the item as completed in the db
    def process_item(self, task_id):
        """Default task handler: mark the task done and produce no items.

        ``task_id`` is whatever ``bot_task_params`` returned, so it may be
        a params dict; in that case its ``'task_id'`` entry is used.
        Returns an empty list because callers iterate over the result.
        """
        if isinstance(task_id, dict):
            task_id = task_id.get('task_id')
        self.item_completed(task_id)
        return []

    # does nothing - override in subclass
    def item_stored(self, item, id):
        """Hook that does nothing here; override in a subclass."""
        pass

    def set_item_status(self, task_id, status):
        """Set an arbitrary status on a task of this bot."""
        self.db.setBotTaskStatus(task_id, self.name, status)

    def item_new(self, task_id):
        """Mark the task as NEW."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_new)

    def item_updated(self, task_id):
        """Mark the task as UPDATED."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_updated)

    def item_completed(self, task_id):
        """Mark the task as DONE."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_done)

    def item_dropped(self, task_id):
        """Mark the task as SKIPPED."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_dropped)

    def item_processing(self, task_id):
        """Mark the task as PROCESSING."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_processing)

    def _smtp_send(self, from_address, to_address, payload):
        """Deliver one message through the SMTP server named in settings.

        The connection is closed even when TLS setup, login or delivery
        raises, so a failed send does not leave a socket open.
        """
        server = smtplib.SMTP('%s:%s' % (settings.MAIL_HOST, settings.MAIL_PORT))
        try:
            server.starttls()
            server.login(settings.MAIL_USER, settings.MAIL_PASS)
            server.sendmail(from_address, to_address, payload)
            server.quit()
        finally:
            # close() does nothing more once quit() has succeeded
            server.close()

    def send_email(self, from_address, to_address, mime_msg):
        """Send a MIME message; SMTP errors are logged, not raised."""
        try:
            self._smtp_send(from_address, to_address, mime_msg.as_string())
        except smtplib.SMTPException as e:
            self.log('Eror sending email to %s: %s' % (to_address, e), log.ERROR)

    def send_alert(self, message, context=None):
        """Log an alert and mail it from MAIL_FROM to MAIL_TO.

        ``context`` defaults to ``self.alert_context``; when one is
        available it is appended to the message.  Unlike ``send_email``,
        errors raised while sending propagate to the caller.
        """
        subject = 'Bot Alert: %s' % self.name
        if not context:
            context = self.alert_context
        if context:
            message = "%s\nCONTEXT: %s" % (message, context)
        self.log('Sending alert: %s -- %s' % (subject, message), log.WARNING)

        # NOTE(review): the Date header carries only YYYY-MM-DD -- confirm
        # that the receiving mail system accepts this form.
        senddate = datetime.now().strftime('%Y-%m-%d')
        header = "Date: %s\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (
            senddate, settings.MAIL_FROM, settings.MAIL_TO, subject)

        self._smtp_send(settings.MAIL_FROM, settings.MAIL_TO, header + message)

    # used for NRC reports only
    def make_tag(self, task_id, tag, comment=None):
        """Build an NrcTag item for the given report number."""
        t = ItemLoader(NrcTag())
        t.add_value('reportnum', task_id)
        t.add_value('tag', tag)
        t.add_value('comment', comment)
        return t.load_item()

    # used for FeedEntry Tags
    def create_tag(self, feed_entry_id, tag, comment=''):
        """Build a FeedEntryTag item for the given feed entry."""
        l = ItemLoader(FeedEntryTag())
        l.add_value('feed_entry_id', feed_entry_id)
        l.add_value('tag', tag)
        l.add_value('comment', comment)
        return l.load_item()

    def make_bot_task_error(self, task_id, code, message=''):
        """Build a BotTaskError item for this bot."""
        t = ItemLoader(BotTaskError())
        # Truncate every message value to at most 1023 characters
        t.message_in = lambda slist: [s[:1023] for s in slist]
        t.add_value('task_id', task_id)
        t.add_value('bot', self.name)
        t.add_value('code', code)
        t.add_value('message', message)
        return t.load_item()

    # Use this as the errback in a request if you want to log request failures
    def error_callback(self, err):
        """Log a failed HTTP request as a warning."""
        self.log('HTTP request failed %s' % (err.getErrorMessage()), log.WARNING)

    def spider_closed(self, spider):
        """Signal handler for spider_closed; currently does nothing."""
#        self.log("\n" + "\n".join ( ["%s = %s" % (k,v) for k,v in sorted(self.crawler.stats.get_stats(spider=self).items())] ), log.INFO)
        pass
Esempio n. 5
0
class NrcBot(BaseSpider):
    """Base spider for bots that work through tasks stored in the database.

    A run first requests the botmaster URL for this bot.  It then processes
    either the single task given by the ``task_id`` spider argument or the
    tasks returned by the database.  Subclasses override ``process_item``
    (and optionally ``item_stored``) and may set ``task_conditions`` and
    ``job_item_limit``.
    """
    name = 'NrcBot'
    allowed_domains = ['skytruth.org']
    db = None
    geodb = None
    task_id = None

    # Status values written for a task through setBotTaskStatus()
    status_processing = 'PROCESSING'
    status_done = 'DONE'
    status_dropped = 'SKIPPED'
    status_new = 'NEW'
    status_no_data = 'NODATA'

    #TODO: get this from config
    botmaster_url_template = 'http://ewn2.skytruth.org/nrc/botmaster.php?bot={0}'
    # Conditions handed to getBotTaskBatch(); while empty, no batch is fetched
    task_conditions = {}

    # Default values - override in subclass
    #    batch_size = 10  # number of items to process in one batch
    job_item_limit = 1000  # maximum total items to process in one job execution

    # Appended to alert messages when send_alert() gets no explicit context
    alert_context = None

    def __init__(self, **kwargs):
        """Open both database connections and remember the task arguments.

        All keyword arguments are kept verbatim in ``self.task_params``.
        When a ``task_id`` keyword is supplied it is also stored in
        ``self.task_id``.
        """
        self.db = NrcDatabase()
        self.db.connect()
        self.geodb = GeoDatabase()
        self.geodb.connect()
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        if 'task_id' in kwargs:
            self.task_id = kwargs['task_id']
        self.task_params = kwargs
        self.exception_count = 0

    def start_requests(self):
        """Return the single initial request: the botmaster URL for this bot."""
        url = self.botmaster_url_template.format(self.name)
        return [Request(url, callback=self.parse)]

    # Parse execution params from botmaster
    def parse(self, response):
        """Handle the botmaster response and yield the items for this run.

        With ``self.task_id`` set, only that task is processed; otherwise
        the tasks found by ``process_items`` are processed.
        """
        # NOTE: the <name> element is extracted but its value is not used.
        hxs = XmlXPathSelector(response)
        name = hxs.select('//name').extract()

        if self.task_id is not None:
            self.log('Processing item %s' % self.task_id, log.INFO)
            self.alert_context = 'task_id=%s' % self.task_id
            items = self.process_item(self.bot_task_params(self.task_id))
            # Tolerate a process_item() that returns None instead of items
            for item in items or []:
                yield item
        else:
            for item in self.process_items():
                yield item

    # iterate over records that have not yet been processed
    def process_items(self):
        """Yield the items produced for every pending task of this bot.

        Sends an alert first when tasks with status PROCESSING already
        exist.  At most ``job_item_limit`` tasks are handled per call.
        """
        stuck = self.db.getBotTaskCount(self.name, self.status_processing)
        stuck = stuck[0]['count']
        if stuck > 0:
            self.send_alert(
                'ERROR: %s tasks found for bot = %s with status = %s' %
                (stuck, self.name, self.status_processing))

        tasks = []
        tasks.extend(self.db.getBotTasks(self.name))
        if self.task_conditions:
            tasks.extend(
                self.db.getBotTaskBatch(self.name, self.job_item_limit,
                                        self.status_processing,
                                        self.task_conditions))

        if not tasks:
            self.log('Found no more tasks available.  Terminating job.',
                     log.INFO)
            return
        self.log('processsing a batch of %s items' % len(tasks), log.INFO)

        items_processed = 0
        for task in tasks[:self.job_item_limit]:
            task_id = task['task_id']
            self.log('processing item %s' % task_id, log.INFO)
            self.alert_context = "task_id=%s" % task_id

            # actually process the report
            items = self.process_item(self.bot_task_params(task_id))
            # Tolerate a process_item() that returns None instead of items
            for item in items or []:
                yield item

            items_processed += 1

        self.log('Terminating after processing %s items' % (items_processed),
                 log.INFO)

    def bot_task_params(self, task_id):
        """Return the parameters stored for a task, or the bare task id.

        When the database holds parameters for the task, the result is a
        dict of their key/value pairs plus ``'task_id'``, updated with the
        spider arguments in ``self.task_params``.  Otherwise, including
        when a ValueError is raised while converting or looking up the id,
        ``task_id`` itself is returned.
        """
        try:
            task_id = int(task_id)
            params = self.db.getBotTaskParams(self.name, task_id)
        except ValueError:
            params = None

        if not params:
            return task_id

        task = dict((p['key'], p['value']) for p in params)
        task['task_id'] = task_id
        # Spider arguments take precedence over the stored parameters
        task.update(self.task_params)
        return task

    def update_task_param(self, task_id, key, value):
        """Store one parameter value for a task of this bot."""
        self.db.updateBotTaskParam(self.name, task_id, key, value)

    # override this in the subclass.  Remember to call self.item_completed() when you are done to
    # mark the item as completed in the db
    def process_item(self, task_id):
        """Default task handler: mark the task done and produce no items.

        ``task_id`` is whatever ``bot_task_params`` returned, so it may be
        a params dict; in that case its ``'task_id'`` entry is used.
        Returns an empty list because callers iterate over the result.
        """
        if isinstance(task_id, dict):
            task_id = task_id.get('task_id')
        self.item_completed(task_id)
        return []

    # does nothing - override in subclass
    def item_stored(self, item, id):
        """Hook that does nothing here; override in a subclass."""
        pass

    def set_item_status(self, task_id, status):
        """Set an arbitrary status on a task of this bot."""
        self.db.setBotTaskStatus(task_id, self.name, status)

    def item_new(self, task_id):
        """Mark the task as NEW."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_new)

    def item_completed(self, task_id):
        """Mark the task as DONE."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_done)

    def item_dropped(self, task_id):
        """Mark the task as SKIPPED."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_dropped)

    def item_processing(self, task_id):
        """Mark the task as PROCESSING."""
        self.db.setBotTaskStatus(task_id, self.name, self.status_processing)

    def _smtp_send(self, from_address, to_address, payload):
        """Deliver one message through the SMTP server named in settings.

        The connection is closed even when TLS setup, login or delivery
        raises, so a failed send does not leave a socket open.
        """
        server = smtplib.SMTP('%s:%s' %
                              (settings.MAIL_HOST, settings.MAIL_PORT))
        try:
            server.starttls()
            server.login(settings.MAIL_USER, settings.MAIL_PASS)
            server.sendmail(from_address, to_address, payload)
            server.quit()
        finally:
            # close() does nothing more once quit() has succeeded
            server.close()

    def send_email(self, from_address, to_address, mime_msg):
        """Send a MIME message; SMTP errors are logged, not raised."""
        try:
            self._smtp_send(from_address, to_address, mime_msg.as_string())
        except smtplib.SMTPException as e:
            self.log('Eror sending email to %s: %s' % (to_address, e),
                     log.ERROR)

    def send_alert(self, message, context=None):
        """Log an alert and mail it from MAIL_FROM to MAIL_TO.

        ``context`` defaults to ``self.alert_context``; when one is
        available it is appended to the message.  Unlike ``send_email``,
        errors raised while sending propagate to the caller.
        """
        subject = 'Bot Alert: %s' % self.name
        if not context:
            context = self.alert_context
        if context:
            message = "%s\nCONTEXT: %s" % (message, context)
        self.log('Sending alert: %s -- %s' % (subject, message), log.WARNING)

        # NOTE(review): the Date header carries only YYYY-MM-DD -- confirm
        # that the receiving mail system accepts this form.
        senddate = datetime.now().strftime('%Y-%m-%d')
        header = "Date: %s\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (
            senddate, settings.MAIL_FROM, settings.MAIL_TO, subject)

        self._smtp_send(settings.MAIL_FROM, settings.MAIL_TO, header + message)

    # used for NRC reports only
    def make_tag(self, task_id, tag, comment=None):
        """Build an NrcTag item for the given report number."""
        t = ItemLoader(NrcTag())
        t.add_value('reportnum', task_id)
        t.add_value('tag', tag)
        t.add_value('comment', comment)
        return t.load_item()

    # used for FeedEntry Tags
    def create_tag(self, feed_entry_id, tag, comment=''):
        """Build a FeedEntryTag item for the given feed entry."""
        l = ItemLoader(FeedEntryTag())
        l.add_value('feed_entry_id', feed_entry_id)
        l.add_value('tag', tag)
        l.add_value('comment', comment)
        return l.load_item()

    def make_bot_task_error(self, task_id, code, message=''):
        """Build a BotTaskError item for this bot."""
        t = ItemLoader(BotTaskError())
        # Truncate every message value to at most 1023 characters
        t.message_in = lambda slist: [s[:1023] for s in slist]
        t.add_value('task_id', task_id)
        t.add_value('bot', self.name)
        t.add_value('code', code)
        t.add_value('message', message)
        return t.load_item()

    # Use this as the errback in a request if you want to log request failures
    def error_callback(self, err):
        """Log a failed HTTP request as a warning."""
        self.log('HTTP request failed %s' % (err.getErrorMessage()),
                 log.WARNING)

    def spider_closed(self, spider):
        """Signal handler for spider_closed; currently does nothing."""
        #        self.log("\n" + "\n".join ( ["%s = %s" % (k,v) for k,v in sorted(self.crawler.stats.get_stats(spider=self).items())] ), log.INFO)
        pass
Esempio n. 6
0
class NrcScraper(BaseSpider):
    """Spider that scrapes NRC incident report listings for a date range.

    ``__init__`` chooses the date range, ``parse`` submits the search form
    and ``search_results`` walks the result pages, yielding one
    NrcScrapedReport item for every report not already in the database.
    """
    name = "NrcScraper"
    allowed_domains = ["nrc.uscg.mil", "maps.googleapis.com"]

    # old url: "http://www.nrc.uscg.mil/apex/f?p=109"
    start_urls = [
        "http://www.nrc.uscg.mil/pls/apex/f?p=109:1:0:::::"
    ]
    db = None

    def __init__(self, **kwargs):
        """Connect to the NRC database and work out the date range to scrape.

        Recognised keyword arguments (all optional):
          enddate   -- 'YYYY-MM-DD' string; defaults to today's date.
          startdate -- 'YYYY-MM-DD' string; defaults to ten days before the
                       latest report date returned by the database, or ten
                       days before the end date when the database returns
                       nothing.
          target    -- passed to getNextNrcScraperTarget(); when that returns
                       a truthy row, its "startdate" and "enddate" values
                       replace both dates.
        """
        self.db = NrcDatabase()
        self.db.connect()
        self.enddate = date.today()
        self.exception_count = 0
        if 'enddate' in kwargs:
            self.enddate = datetime.strptime(kwargs['enddate'], '%Y-%m-%d')

        interval = timedelta(days=10)  # ten days
        last_report_dt = self.db.latestReportDate()
        if last_report_dt:
            self.startdate = last_report_dt - interval
        else:
            self.startdate = self.enddate - interval
        if 'startdate' in kwargs:
            self.startdate = datetime.strptime(kwargs['startdate'], '%Y-%m-%d')

        if 'target' in kwargs:
            # ignore default date range and get it from the schedule table in the database instead
            t = self.db.getNextNrcScraperTarget(kwargs['target'])
            if t:
                self.startdate = t["startdate"]
                self.enddate = t["enddate"]

    def parse(self, response):
        """Submit the search form for the date range chosen in __init__."""
        enddate = self.enddate
        startdate = self.startdate
        self.log('Requesting NRC Incident reports from {0} to {1}'.format(startdate, enddate), log.INFO)
        request = FormRequest.from_response(
            response,
            formdata={
                'p_t04': startdate.strftime("%d-%b-%y"),
                'p_t05': enddate.strftime("%d-%b-%y"),
                'p_t14': '100',
                'p_request': 'Go'
            },
            callback=self.search_results)
        return [request]

    def search_results(self, response):
        """Yield an item per new report on this page, then the next page.

        Reports the database already knows are skipped.  Each yielded
        report is afterwards marked DONE for this bot.
        """
        text = unicode(response.body, response.encoding)
        hxs = HtmlXPathSelector(text=text)
        reports = hxs.select('//table[@class="t16Standard"]/tr')
        if not reports:
            self.log('Incident report data not present in response', log.ERROR)
        else:
            # Skip the first report record because this is the header row
            reports.pop(0)
            if not reports:
                self.log('No incident reports found in response', log.WARNING)
            else:
                self.log('Retrieved {0} incident reports'.format(len(reports)), log.INFO)

        for report in reports:
            l = XPathItemLoader(NrcScrapedReport(), report)
            l.context['base_url'] = response.url
            for name, params in NrcScrapedReport.fields.items():
                l.add_xpath(name, params['xpath'])
            item = l.load_item()
            if self.db.reportExists(item['reportnum']):
                self.log('Report {0} already exists.  Skipping to next report.'.format(item['reportnum']), log.INFO)
            else:
                # NOTE: these two requests are built but not yielded; the
                # downloads that used them are disabled in the commented-out
                # code below.
                f_request = Request(
                    item['full_report_url'],
                    callback=self.parse_full_report)
                m_request = Request(
                    item['materials_url'],
                    callback=self.parse_materials)
                yield item
                self.db.setBotTaskStatus(item['reportnum'], self.name, 'DONE')

#                if self.db.fullReportExists (item['reportnum']):
#                    self.log('Full report Report {0} already exists.  Skipping download.'.format(item['reportnum']), log.INFO)
#                else:
#                    yield f_request
#
#                if self.db.materialExists (item['reportnum']):
#                    self.log('Materials record(s) already exist for report {0}.  Skipping download.'.format(item['reportnum']), log.INFO)
#               else:
#                    yield m_request

        # get next page of results
        next_page = hxs.select('//td[@class="pagination"][4]/a/@href')
        if len(next_page) > 0:
            yield Request(urljoin(response.url, next_page[0].extract()), callback=self.search_results)

    def parse_full_report(self, response):
        """Yield a NrcScrapedFullReport item for a full-report page.

        The report number is read from the response URL's query string.
        After the item is yielded the report is marked DONE for this bot.
        """
        # need to work around weird bug where lxml can't handle encode=WINDOWS-1252
        # so pull out the body, convert to utf-8 and create a new TextResponse object to contain it
        # since XPathItemLoader requires a Response object
        text = unicode(response.body, response.encoding)
        t = TextResponse(url=response.url, body=text.encode('utf-8'), encoding='utf-8')

        l = XPathItemLoader(NrcScrapedFullReport(), response=t)
        url_parts = urlsplit(response.url)
        l.add_value('reportnum', parse_qs(url_parts.query)['standard_web inc_seq'])
        l.add_xpath('full_report_body', '//body')
        l.add_value('full_report_url', response.url)
        item = l.load_item()
        reportnum = item['reportnum']
        yield item
        self.db.setBotTaskStatus(reportnum, self.name, 'DONE')

    def parse_materials(self, response):
        """Yield a NrcScrapedMaterial item per row of the materials table.

        The report number is taken from the ``P3_SEQNOS:<digits>`` part of
        the response URL.
        """
        text = unicode(response.body, response.encoding)
        hxs = HtmlXPathSelector(text=text)
        materials = hxs.select('//table[@class="t16Standard"]/tr')
        if not materials:
            self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
        else:
            # Skip the first record because this is the header row
            materials.pop(0)
            if not materials:
                self.log('No materials records found in response', log.INFO)
            else:
                self.log('Retrieved {0} materials records'.format(len(materials)), log.INFO)

        for material in materials:
            l = XPathItemLoader(NrcScrapedMaterial(), material)
            l.add_value('reportnum', response.url, TakeFirst(), re=r'P3_SEQNOS:(\d+)')
            for name, params in NrcScrapedMaterial.fields.items():
                # Only fields that declare an xpath are read from the row
                if 'xpath' in params:
                    l.add_xpath(name, params['xpath'])
            item = l.load_item()
            yield item

    def item_stored(self, item, id):
        """Hook that does nothing in this spider."""
        return