Beispiel #1
0
    def import_data(self, data_items):
        """Import a batch of dicts and tally what happened to each of them.

        Every item yielded by ``self._prepare_imports(data_items)`` is handed to
        ``self.import_item``.  The returned object id is remembered in
        ``self.json_to_db_id`` under the item's JSON id and counted under the
        action name that ``import_item`` returned alongside it.

        ``self.postimport()`` runs once, after the last item was processed.

        Returns a one-entry dict mapping ``self._type`` to a record holding the
        per-action counts, the per-action lists of object ids and the
        ``start``/``end`` timestamps.
        """
        actions = ('insert', 'update', 'noop')

        # counters first, then the timestamp, then the per-action id lists
        record = {action: 0 for action in actions}
        record['start'] = utcnow()
        record['records'] = {action: [] for action in actions}

        ids_by_action = record['records']
        for json_id, item in self._prepare_imports(data_items):
            db_id, action = self.import_item(item)
            self.json_to_db_id[json_id] = db_id
            ids_by_action[action].append(db_id)
            record[action] += 1

        # everything is loaded now, so cross-object work can happen here
        self.postimport()

        record['end'] = utcnow()
        return {self._type: record}
Beispiel #2
0
    def import_data(self, data_items):
        """Import several dicts in one go and return a summary of the run.

        Items come from ``self._prepare_imports(data_items)`` as
        ``(json_id, data)`` pairs.  Each ``data`` goes through
        ``self.import_item``, which returns the object id and the name of the
        action taken; both are recorded, and the id is also stored in
        ``self.json_to_db_id[json_id]``.

        After the loop ``self.postimport()`` is invoked, then the end time is
        stamped.  The summary is returned as ``{self._type: record}``.
        """
        # running totals per action, plus when we started
        record = dict.fromkeys(('insert', 'update', 'noop'), 0)
        record['start'] = utcnow()
        record['records'] = {'insert': [], 'update': [], 'noop': []}

        prepared = self._prepare_imports(data_items)
        for json_id, payload in prepared:
            result = self.import_item(payload)
            obj_id, what = result
            self.json_to_db_id[json_id] = obj_id
            record['records'][what].append(obj_id)
            record[what] = record[what] + 1

        # with all objects present, run inter-object resolution and other tasks
        self.postimport()

        record['end'] = utcnow()

        summary = {self._type: record}
        return summary
Beispiel #3
0
    def do_handle(self, args, other, juris):
        """Run the requested actions for ``juris`` and return the run report.

        ``other`` is the list of extra command-line words in the form
        ``(scraper (key=value)*)+``.  When it is empty every scraper declared
        on the jurisdiction is selected with no arguments.

        ``args.actions`` is replaced in place by ``ALL_ACTIONS`` when it is
        empty.  If ``'import'`` is among the actions, ``django.setup()`` is
        called before anything runs and the report is passed to
        ``save_report`` afterwards, whether the run succeeded or failed.

        Raises ``CommandError`` when the jurisdiction declares no scrapers,
        when a ``key=value`` word precedes any scraper name, or when a word
        names an unknown scraper.  Any exception raised while scraping or
        importing is recorded in the report and then re-raised.
        """
        available_scrapers = getattr(juris, 'scrapers', {})
        if not available_scrapers:
            raise CommandError('no scrapers defined on jurisdiction')

        if not other:
            # nothing requested explicitly: run everything, without arguments
            scrapers = {name: {} for name in available_scrapers.keys()}
        else:
            # walk the arg list, format: (scraper (k:v)+)+
            scrapers = OrderedDict()
            cur_scraper = None
            for arg in other:
                if '=' not in arg:
                    if arg not in juris.scrapers:
                        raise CommandError(
                            'no such scraper: module={} scraper={}'.format(args.module, arg))
                    cur_scraper = arg
                    scrapers[cur_scraper] = {}
                    continue

                if not cur_scraper:
                    raise CommandError('argument {} before scraper name'.format(arg))
                key, value = arg.split('=', 1)
                scrapers[cur_scraper][key] = value

        # args is mutated on purpose so that later stages see the final actions
        if not args.actions:
            args.actions = ALL_ACTIONS

        if 'import' in args.actions:
            django.setup()

        # announce the plan before doing any work
        plan = {'module': args.module, 'actions': args.actions, 'scrapers': scrapers}
        report = {'plan': plan, 'start': utils.utcnow()}
        print_report(report)

        self.check_session_list(juris)

        try:
            if 'scrape' in args.actions:
                report['scrape'] = self.do_scrape(juris, args, scrapers)
            if 'import' in args.actions:
                report['import'] = self.do_import(juris, args)
            report['success'] = True
        except Exception as exc:
            report.update(
                success=False,
                exception=exc,
                traceback=traceback.format_exc(),
            )
            # a failed run is still persisted when importing was requested
            if 'import' in args.actions:
                save_report(report, juris.jurisdiction_id)
            raise
        else:
            if 'import' in args.actions:
                save_report(report, juris.jurisdiction_id)

        print_report(report)
        return report
Beispiel #4
0
def save_report(report, jurisdiction):
    """Persist a run report as a ``RunPlan`` with its scrape and import rows.

    ``report['end']`` is set to the current time before anything is written.
    One scraper row is created per entry of ``report['scrape']`` (with one
    scraped-object row per object type), and one imported-object row per entry
    of ``report['import']`` that has at least one non-zero count.

    ``jurisdiction`` is used as the ``jurisdiction_id`` of the new plan.
    """
    from pupa.models import RunPlan

    # stamp the end time on the report itself
    report['end'] = utils.utcnow()

    plan = RunPlan.objects.create(
        jurisdiction_id=jurisdiction,
        success=report['success'],
        start_time=report['start'],
        end_time=report['end'],
        exception=report.get('exception', ''),
        traceback=report.get('traceback', ''),
    )

    scrape_results = report.get('scrape', {})
    for scraper, details in scrape_results.items():
        # the plan is only consulted when there is a scrape result to store
        scraper_args = report['plan']['scrapers'].get(scraper, {})
        args = ' '.join('{k}={v}'.format(k=k, v=v) for k, v in scraper_args.items())
        sr = plan.scrapers.create(
            scraper=scraper,
            args=args,
            start_time=details['start'],
            end_time=details['end'],
        )
        for object_type, num in details['objects'].items():
            sr.scraped_objects.create(object_type=object_type, count=num)

    import_results = report.get('import', {})
    for object_type, changes in import_results.items():
        # types for which nothing at all happened are not stored
        if not any(changes[action] for action in ('insert', 'update', 'noop')):
            continue
        plan.imported_objects.create(
            object_type=object_type,
            insert_count=changes['insert'],
            update_count=changes['update'],
            noop_count=changes['noop'],
            start_time=changes['start'],
            end_time=changes['end'],
        )
Beispiel #5
0
    def handle(self, args, other):
        """Resolve the jurisdiction for ``args.module``, run it, return the report.

        ``other`` holds the extra command-line words in the form
        ``(scraper (key=value)*)+``; when it is empty all scrapers declared on
        the jurisdiction run without arguments.  An empty ``args.actions`` is
        replaced in place by ``ALL_ACTIONS``.

        When ``'import'`` is among the actions the report is saved with
        ``save_report`` on success and on failure; ``forward_report`` is only
        called after a successful run.

        Raises ``CommandError`` for a jurisdiction without scrapers, for a
        ``key=value`` word that comes before any scraper name, and for an
        unknown scraper name.  Exceptions from scraping or importing are
        recorded in the report and re-raised.
        """
        juris = self.get_jurisdiction(args.module)

        available_scrapers = getattr(juris, 'scrapers', {})
        if not available_scrapers:
            raise CommandError('no scrapers defined on jurisdiction')

        if other:
            # arg list format: (scraper (k:v)+)+
            scrapers = OrderedDict()
            cur_scraper = None
            for arg in other:
                is_option = '=' in arg
                if is_option and not cur_scraper:
                    raise CommandError('argument {} before scraper name'.format(arg))
                if is_option:
                    name, _, value = arg.partition('=')
                    scrapers[cur_scraper][name] = value
                elif arg in juris.scrapers:
                    cur_scraper = arg
                    scrapers[cur_scraper] = {}
                else:
                    message = 'no such scraper: module={} scraper={}'.format(args.module, arg)
                    raise CommandError(message)
        else:
            scrapers = {key: {} for key in available_scrapers.keys()}

        # args is changed in place so the final action list travels with it
        if not args.actions:
            args.actions = ALL_ACTIONS

        # show the plan up front
        report = {
            'plan': {
                'module': args.module,
                'actions': args.actions,
                'scrapers': scrapers,
            },
            'start': utils.utcnow(),
        }
        print_report(report)

        self.check_session_list(juris)

        try:
            if 'scrape' in args.actions:
                report['scrape'] = self.do_scrape(juris, args, scrapers)
            if 'import' in args.actions:
                report['import'] = self.do_import(juris, args)
            report['success'] = True
        except Exception as exc:
            report.update(
                success=False,
                exception=exc,
                traceback=traceback.format_exc(),
            )
            if 'import' in args.actions:
                save_report(report, juris.jurisdiction_id)
            raise
        else:
            if 'import' in args.actions:
                save_report(report, juris.jurisdiction_id)
                forward_report(report, juris.jurisdiction_id)

        print_report(report)
        return report
Beispiel #6
0
    def do_scrape(self, **kwargs):
        """Run ``self.scrape(**kwargs)``, save what it yields, return a record.

        Each yielded value is saved with ``self.save_object``; a value that has
        an ``__iter__`` attribute is treated as a group and its members are
        saved one by one.  ``self.output_names`` is reset before scraping
        starts and is used afterwards to count objects per type.

        The returned record has ``start``/``end`` timestamps, ``skipped``
        (``self.skipped`` if set, otherwise 0) and ``objects``, a mapping of
        type to the number of names collected for it.

        Raises ``ScrapeError`` if ``self.output_names`` is still empty after
        the scrape.
        """
        counts = defaultdict(int)
        record = {'objects': counts}
        self.output_names = defaultdict(set)

        record['start'] = utils.utcnow()
        scraped = self.scrape(**kwargs) or []
        for result in scraped:
            # normalise single objects to a one-element batch
            batch = result if hasattr(result, '__iter__') else (result,)
            for item in batch:
                self.save_object(item)
        record['end'] = utils.utcnow()

        record['skipped'] = getattr(self, 'skipped', 0)

        if not self.output_names:
            raise ScrapeError('no objects returned from scrape')

        for object_type, names in self.output_names.items():
            counts[object_type] += len(names)
        return record
Beispiel #7
0
    def do_scrape(self, **kwargs):
        """Scrape, save every resulting object and summarise the run.

        Values produced by ``self.scrape(**kwargs)`` go to
        ``self.save_object``.  Values exposing ``__iter__`` are unpacked and
        their members saved individually.  ``self.output_names`` is replaced
        with a fresh mapping before the scrape and read afterwards to build
        the per-type object counts.

        Returns a dict with ``objects`` (type -> count), ``start``, ``end`` and
        ``skipped`` (``self.skipped`` when present, else 0).

        Raises ``ScrapeError``, naming this scraper's class, when nothing ended
        up in ``self.output_names``.
        """
        record = {'objects': defaultdict(int)}
        self.output_names = defaultdict(set)

        def save_all(value):
            # groups are saved member by member, single objects directly
            if not hasattr(value, '__iter__'):
                self.save_object(value)
                return
            for member in value:
                self.save_object(member)

        record['start'] = utils.utcnow()
        for scraped in self.scrape(**kwargs) or []:
            save_all(scraped)
        record['end'] = utils.utcnow()
        record['skipped'] = getattr(self, 'skipped', 0)

        if not self.output_names:
            scraper_name = self.__class__.__name__
            raise ScrapeError('no objects returned from {} scrape'.format(scraper_name))

        object_counts = record['objects']
        for type_name, names in self.output_names.items():
            object_counts[type_name] += len(names)

        return record
Beispiel #8
0
def save_report(report, jurisdiction):
    """Store a run report as a ``RunPlan`` unless the jurisdiction is unknown.

    ``report['end']`` is always set to the current time.  If no jurisdiction
    with primary key ``jurisdiction`` exists, a warning is logged on the
    ``pupa`` logger and nothing is written.

    Otherwise a ``RunPlan`` is created, followed by one scraper row per entry
    of ``report['scrape']`` (each with its scraped-object counts) and one
    imported-object row per entry of ``report['import']`` that has a non-zero
    insert, update or noop count.
    """
    from pupa.models import RunPlan
    from opencivicdata.core.models import Jurisdiction as JurisdictionModel

    # the end time is recorded even when the plan ends up not being saved
    report['end'] = utils.utcnow()

    # A failure on the very first run means the jurisdiction was never
    # created; in that case no RunPlan is stored until one run has succeeded.
    try:
        JurisdictionModel.objects.get(pk=jurisdiction)
    except JurisdictionModel.DoesNotExist:
        message = 'could not save RunPlan, no successful runs of {} yet'.format(jurisdiction)
        logging.getLogger("pupa").warning(message)
        return

    plan = RunPlan.objects.create(
        jurisdiction_id=jurisdiction,
        success=report['success'],
        start_time=report['start'],
        end_time=report['end'],
        exception=report.get('exception', ''),
        traceback=report.get('traceback', ''),
    )

    for scraper, details in report.get('scrape', {}).items():
        # the plan is looked up per scraper, only when there is something to store
        scraper_args = report['plan']['scrapers'].get(scraper, {})
        args = ' '.join('{k}={v}'.format(k=k, v=v) for k, v in scraper_args.items())
        sr = plan.scrapers.create(
            scraper=scraper,
            args=args,
            start_time=details['start'],
            end_time=details['end'],
        )
        for object_type, num in details['objects'].items():
            sr.scraped_objects.create(object_type=object_type, count=num)

    for object_type, changes in report.get('import', {}).items():
        # skip types where no insert, update or noop was counted
        if not any(changes[action] for action in ('insert', 'update', 'noop')):
            continue
        plan.imported_objects.create(
            object_type=object_type,
            insert_count=changes['insert'],
            update_count=changes['update'],
            noop_count=changes['noop'],
            start_time=changes['start'],
            end_time=changes['end'],
        )
Beispiel #9
0
def save_report(report, jurisdiction):
    """Write ``report`` to the database as a ``RunPlan`` and its child rows.

    The report's ``end`` key is set to the current time first.  When the
    jurisdiction with primary key ``jurisdiction`` cannot be found, a warning
    goes to the ``pupa`` logger and the function returns without saving.

    For an existing jurisdiction the plan is created together with one scraper
    row per ``report['scrape']`` entry, the scraped-object counts of each, and
    one imported-object row per ``report['import']`` entry with any non-zero
    insert/update/noop count.
    """
    from pupa.models import RunPlan
    from opencivicdata.core.models import Jurisdiction as JurisdictionModel

    # record when the run finished
    finished = utils.utcnow()
    report['end'] = finished

    # The jurisdiction does not exist yet if the first run ever failed, so
    # RunPlan creation is skipped until at least one run went through.
    try:
        JurisdictionModel.objects.get(pk=jurisdiction)
    except JurisdictionModel.DoesNotExist:
        pupa_logger = logging.getLogger("pupa")
        pupa_logger.warning(
            'could not save RunPlan, no successful runs of {} yet'.format(jurisdiction))
        return

    plan_fields = dict(
        jurisdiction_id=jurisdiction,
        success=report['success'],
        start_time=report['start'],
        end_time=report['end'],
        exception=report.get('exception', ''),
        traceback=report.get('traceback', ''),
    )
    plan = RunPlan.objects.create(**plan_fields)

    for scraper, details in report.get('scrape', {}).items():
        configured = report['plan']['scrapers'].get(scraper, {})
        pairs = ('{k}={v}'.format(k=k, v=v) for k, v in configured.items())
        sr = plan.scrapers.create(
            scraper=scraper,
            args=' '.join(pairs),
            start_time=details['start'],
            end_time=details['end'],
        )
        for object_type, num in details['objects'].items():
            sr.scraped_objects.create(object_type=object_type, count=num)

    for object_type, changes in report.get('import', {}).items():
        touched = changes['insert'] or changes['update'] or changes['noop']
        if not touched:
            # nothing was counted for this type, so no row is written
            continue
        plan.imported_objects.create(
            object_type=object_type,
            insert_count=changes['insert'],
            update_count=changes['update'],
            noop_count=changes['noop'],
            start_time=changes['start'],
            end_time=changes['end'],
        )
Beispiel #10
0
def save_report(report, jurisdiction):
    """Save ``report`` as a ``RunPlan`` plus its scraper and import rows.

    Sets ``report['end']`` to the current time, creates the plan with
    ``jurisdiction`` as its ``jurisdiction_id``, then adds:

    * one scraper row per entry of ``report['scrape']``, with the scraper's
      planned arguments rendered as space-separated ``key=value`` pairs, and
      one scraped-object row per object type it reported;
    * one imported-object row per entry of ``report['import']`` whose insert,
      update or noop count is non-zero.
    """
    from pupa.models import RunPlan

    # the report gets its end time before it is written out
    report['end'] = utils.utcnow()

    plan = RunPlan.objects.create(
        jurisdiction_id=jurisdiction,
        success=report['success'],
        start_time=report['start'],
        end_time=report['end'],
        exception=report.get('exception', ''),
        traceback=report.get('traceback', ''),
    )

    for scraper, details in report.get('scrape', {}).items():
        planned = report['plan']['scrapers'].get(scraper, {})
        rendered = ['{k}={v}'.format(k=k, v=v) for k, v in planned.items()]
        sr = plan.scrapers.create(
            scraper=scraper,
            args=' '.join(rendered),
            start_time=details['start'],
            end_time=details['end'],
        )
        object_counts = details['objects']
        for object_type, num in object_counts.items():
            sr.scraped_objects.create(object_type=object_type, count=num)

    for object_type, changes in report.get('import', {}).items():
        # only types with at least one counted action get a row
        if not any(changes[action] for action in ('insert', 'update', 'noop')):
            continue
        plan.imported_objects.create(
            object_type=object_type,
            insert_count=changes['insert'],
            update_count=changes['update'],
            noop_count=changes['noop'],
            start_time=changes['start'],
            end_time=changes['end'],
        )