# Only the stdlib imports below are certain; Seed, Page, NoSeedYet,
# UnicodeRetriever, RetrievalError, parse_date, strip_tags,
# save_locations_for_page, and settings come from the surrounding project.
import datetime
import logging


class SpecializedCrawler(object):
    """
    Base class for Page crawlers.
    """

    schema = None
    seed_url = None
    date_headline_re = None
    date_format = None
    retriever = None

    def __init__(self):
        try:
            self.seed = Seed.objects.get(url=self.seed_url)
        except Seed.DoesNotExist:
            raise NoSeedYet('You need to add a Seed with the URL %r' % self.seed_url)
        self.logger = logging.getLogger('eb.retrieval.%s.%s' % (settings.SHORT_NAME, self.schema))
        if self.retriever is None:
            self.retriever = UnicodeRetriever(cache=None, sleep=self.seed.delay)

    def save_page(self, unique_id):
        """
        Downloads the page with the given unique ID (possibly a numeric ID, or
        a URL) and saves it as a Page object. Returns the Page object, or None
        if the page couldn't be found.

        The page won't be retrieved/saved if it's already in the database. In
        this case, the existing Page object will be returned.
        """
        self.logger.debug('save_page(%s)', unique_id)
        retrieval_url = self.retrieval_url(unique_id)
        public_url = self.public_url(unique_id)

        try:
            p = Page.objects.get(seed__id=self.seed.id, url=public_url)
        except Page.DoesNotExist:
            pass
        else:
            self.logger.debug('Skipping already-saved URL %s', public_url)
            return p

        try:
            html = self.retriever.get_html(retrieval_url).strip()
        except (RetrievalError, UnicodeDecodeError):
            return None
        if not html:
            self.logger.debug('Got empty page for %s', retrieval_url)
            return None
        self.logger.debug('Got VALID page for %s', retrieval_url)

        m = self.date_headline_re.search(html)
        if not m:
            self.logger.debug('Could not find date/headline on %s', retrieval_url)
            return None
        article_date = m.group('article_date')
        article_headline = m.group('article_headline')
        try:
            article_date = parse_date(article_date, self.date_format)
        except ValueError:
            self.logger.debug('Got unparseable date %r on %s', article_date, retrieval_url)
            return None
        article_headline = strip_tags(article_headline)
        if len(article_headline) > 255:
            article_headline = article_headline[:252] + '...'

        p = Page.objects.create(
            seed=self.seed,
            url=public_url,
            scraped_url=retrieval_url,
            html=html,
            when_crawled=datetime.datetime.now(),
            is_article=True,
            is_pdf=False,
            is_printer_friendly=False,
            article_headline=article_headline,
            article_date=article_date,
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
            times_skipped=0,
            robot_report='',
        )
        self.logger.debug('Created Page ID %s', p.id)
        save_locations_for_page(p)
        return p

    ######################################
    # METHODS SUBCLASSES SHOULD OVERRIDE #
    ######################################

    def public_url(self, unique_id):
        "Given the ID value, returns the URL that we should publish."
        raise NotImplementedError()

    def retrieval_url(self, unique_id):
        "Given the ID value, returns the URL that we should scrape."
        return self.public_url(unique_id)
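
A minimal sketch of a concrete subclass. The schema slug, seed URL, regex,
date format, and URL pattern below are hypothetical placeholders, not part of
the original code; the only hard requirements are that date_headline_re define
the named groups 'article_date' and 'article_headline', and that a Seed row
with seed_url already exist.

import re

class ExampleNewsCrawler(SpecializedCrawler):
    schema = 'news-articles'  # hypothetical schema slug
    seed_url = 'http://example.com/news/'  # must match an existing Seed row
    date_headline_re = re.compile(
        r'<h1>(?P<article_headline>.*?)</h1>\s*'
        r'<span class="date">(?P<article_date>[^<]+)</span>',
        re.DOTALL)
    date_format = '%B %d, %Y'  # must match what parse_date expects

    def public_url(self, unique_id):
        # Articles are addressed by numeric ID on the public site.
        return 'http://example.com/news/article/%s/' % unique_id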
Example #2

# Schema, Blob, and UnicodeRetriever come from the surrounding project;
# the imports below cover the rest.
import time
from datetime import datetime

from lxml.html import document_fromstring

class ZoningUpdater(object):
    def __init__(self):
        self.url = 'http://sfgov.org/site/planning_meeting.asp?id=15840'
        self.retriever = UnicodeRetriever()
        self.delay = 2

    def update(self):
        for year in self.get_years(self.url):
            self.update_year(year['url'])

    def get_years(self, url):
        html = self.retriever.get_html(url)
        t = document_fromstring(html)
        for a in t.xpath("//table[@id='Table4']//a"):
            # The last 8 characters of the href are dropped; this is
            # site-specific trimming inherited from the original scraper.
            year_url = 'http://sfgov.org/site/planning_meeting.asp%s' % a.get('href')[:-8]
            yield {'url': year_url, 'year': a.text}

    def update_year(self, url):
        minutes_schema = Schema.objects.get(slug='zoning-minutes')
        agendas_schema = Schema.objects.get(slug='zoning-agenda')
        for page in self.get_minutes(url):
            self.save_page(page, minutes_schema)
        for page in self.get_agendas(url):
            self.save_page(page, agendas_schema)

    def get_minutes(self, url):
        return self._helper(url, 'Minutes')

    def get_agendas(self, url):
        return self._helper(url, 'Agendas')

    def _helper(self, url, item_type):
        html = self.retriever.get_html(url)
        t = document_fromstring(html)
        for a in t.xpath("//a[@name='%s']/parent::td/parent::tr/following-sibling::*[4]//a" % item_type):
            # a.text can be None (e.g. image-only links), so guard the lower() call.
            if '(cancellation notice)' in (a.text or '').lower():
                continue
            url = 'http://sfgov.org/site/%s' % a.get('href')
            yield {'title': a.text, 'url': url}

    def save_page(self, page, schema):
        url = page['url']
        # If we've already retrieved the page, there's no need to retrieve
        # it again.
        if Blob.objects.filter(url=url).exists():
            return

        # Fetch the html for the page and save it
        html = self.retriever.get_html(url + '&mode=text')
        # Blob.save() returns None, so there's no point binding the result.
        Blob(
            schema=schema,
            title=page['title'],
            url=url,
            html=html,
            is_pdf=False,
            when_crawled=datetime.now(),
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
        ).save()

        time.sleep(self.delay)
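
A minimal driver sketch, assuming a configured Django environment and existing
Schema rows with slugs 'zoning-minutes' and 'zoning-agenda':

if __name__ == '__main__':
    updater = ZoningUpdater()
    # Walks every year listed on the seed page and saves each agenda and
    # minutes document as a Blob, sleeping between fetches.
    updater.update()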