    def test_RecoversGracefullyAfterIntegrityError(self):
        from phillyleg.models import LegFile
        from django.db.utils import DatabaseError

        # Start from a known state, then insert a record with key 123.
        LegFile.objects.all().delete()
        LegFile.objects.create(title='testing', key=123)

        ds = CouncilmaticDataStoreWrapper()
        try:
            # Both saves collide with the existing key; _save_or_ignore
            # should swallow the failure rather than propagate it.
            ds._save_or_ignore(LegFile, {'title': 'testing', 'key': 123})
            ds._save_or_ignore(LegFile, {'title': 'testing', 'key': 123})
        except DatabaseError:
            self.fail("Shouldn't have raised a DatabaseError")
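For context, _save_or_ignore is expected to trap the duplicate-key failure itself rather than let it reach the caller. A minimal sketch of such a method, assuming Django's DatabaseError hierarchy (an illustration only, not the wrapper's actual implementation):

    from django.db import transaction
    from django.db.utils import DatabaseError

    class CouncilmaticDataStoreWrapper(object):
        def _save_or_ignore(self, ModelClass, record):
            try:
                # An atomic block keeps a failed INSERT from aborting any
                # surrounding transaction (PostgreSQL requires this).
                with transaction.atomic():
                    ModelClass.objects.create(**record)
                return True
            except DatabaseError:
                # A row with this key already exists; skip it quietly.
                return False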
    def _get_new_files(self, force_download):
        # Create the datastore and source wrapper objects
        ds = CouncilmaticDataStoreWrapper()
        source = ScraperWikiSourceWrapper()

        # Pick up from the most recent key already in the datastore
        curr_key = ds.get_latest_key()

        while True:
            # The source advances the key and returns the new content, or
            # None for the content once there is nothing newer.
            curr_key, source_obj = source.check_for_new_content(
                curr_key, force_download)

            if source_obj is None:
                break

            record, attachments, actions, minutes = \
                source.scrape_legis_file(curr_key, source_obj)
            ds.save_legis_file(record, attachments, actions, minutes)
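The loop above leans on a simple contract: check_for_new_content returns the next key along with its content, and returns None for the content once the feed is exhausted. A hypothetical stub that satisfies that contract, useful for exercising the loop without network access (StubSource and contents_by_key are invented names, not part of the project):

    class StubSource(object):
        """Stand-in for ScraperWikiSourceWrapper with canned content."""

        def __init__(self, contents_by_key):
            self.contents_by_key = contents_by_key

        def check_for_new_content(self, curr_key, force_download):
            next_key = curr_key + 1
            if next_key in self.contents_by_key:
                return next_key, self.contents_by_key[next_key]
            # Nothing newer than curr_key: tell the caller to stop.
            return curr_key, None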
Example #4
    def handle(self, *args, **options):
        log = logging.getLogger()
        log.setLevel(logging.INFO)

        # Create the datastore and source wrapper objects
        ds = self.ds = CouncilmaticDataStoreWrapper()
        source = self.source = PhillyLegistarSiteWrapper(
            settings.LEGISLATION['ROOT'])

        # Seed the PDF cache with already-downloaded content.
        #
        # Downloading and parsing PDF content really slows down the scraping
        # process; if every PDF had to be redownloaded on each run, refreshing
        # the old files would take a very long time. Seeding the source cache
        # with the datastore's existing PDF data ensures already-downloaded
        # PDFs are not fetched again.
        #
        # Hopefully this won't be too much of a burden on memory :).
        source.init_pdf_cache(ds.pdf_mapping)

        update_files = options['update_files']

        try:
            self._get_new_files()
            if update_files:
                self._get_updated_files()
        except TooManyGeocodeRequests:
            # The geocoding service has cut us off for now; exit cleanly and
            # let the next scheduled run pick up where this one left off.
            sys.exit(0)
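The options['update_files'] lookup implies the command declares a matching option. A sketch of how that wiring could look with the add_arguments hook Django management commands use (the flag name and help text are inferred, not confirmed; older Django versions declared the same option via an optparse option_list instead):

    from django.core.management.base import BaseCommand

    class Command(BaseCommand):
        help = 'Scrape legislative files into the Councilmatic datastore.'

        def add_arguments(self, parser):
            # argparse stores this flag under options['update_files'].
            parser.add_argument('--update-files', action='store_true',
                                dest='update_files', default=False,
                                help='Also refresh files already scraped.')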