Example #1
    def validate_model(self):
        if self.model_valid:
            return

        log.info("Validating model")
        try:
            self.model = Model().deserialize(self.model)
            self.model_valid = True
        except Invalid as e:
            raise ModelValidationError(e)
Example #2
class BaseImporter(object):
    def __init__(self, data, model, source_file="<stream>"):
        self.data = data
        self.model = model
        self.model_valid = None
        self.source_file = source_file
        self.errors = []
        self.on_error = lambda e: log.warning(e)
        self._generate_fields()

    def run(self,
            dry_run=False,
            max_errors=None,
            max_lines=None,
            raise_errors=False,
            build_indices=True):
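        # Full import pipeline: validate the model, describe its dimensions
        # to the loader, validate and import each line of data, then generate
        # views and build the search indices. With dry_run=True only the
        # validation steps are performed.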

        self.dry_run = dry_run
        self.max_errors = max_errors
        self.do_build_indices = build_indices
        self.raise_errors = raise_errors

        self.validate_model()
        self.describe_dimensions()

        self.validator = make_validator(self.fields)

        self.line_number = 0

        for line_number, line in enumerate(self.lines, start=1):
            if max_lines and line_number > max_lines:
                break

            self.line_number = line_number
            self.process_line(line)

        if self.line_number == 0:
            self.add_error("Didn't read any lines of data")

        self.generate_views()
        self.build_indices()

        if self.errors:
            log.error("Finished import with %d errors:", len(self.errors))
            for err in self.errors:
                log.error(" - %s", err)
        else:
            log.info("Finished import with no errors!")

    @property
    def lines(self):
        raise NotImplementedError("lines not implemented in BaseImporter")

    @property
    def mapping(self):
        return self.model['mapping']

    @property
    def views(self):
        return self.model.get('views', [])

    def validate_model(self):
        if self.model_valid:
            return

        log.info("Validating model")
        try:
            self.model = Model().deserialize(self.model)
            self.model_valid = True
        except Invalid as e:
            raise ModelValidationError(e)

    def describe_dimensions(self):
        if self.dry_run:
            return False

        log.info("Describing dimensions")
        for dimension, mapping in self.mapping.items():
            self.loader.create_dimension(
                dimension,
                mapping.get("label"),
                type=mapping.get('type'),
                datatype=mapping.get('datatype'),
                fields=mapping.get('fields', []),
                facet=mapping.get('facet'),
                description=mapping.get("description")
            )

    def generate_views(self):
        if self.dry_run:
            return False

        log.info("Generating aggregates and views")
        self.loader.flush_aggregates()
        for view in self.views:
            entity = ENTITY_TYPES.get(view.get('entity'))
            self.loader.create_view(
                entity,
                view.get('filters', {}),
                name=view.get('name'),
                label=view.get('label'),
                dimension=view.get('dimension'),
                breakdown=view.get('breakdown'),
                view_filters=view.get('view_filters', {})
            )
        self.loader.compute_aggregates()

    def build_indices(self):
        if self.dry_run or not self.do_build_indices:
            return False

        log.info("Building search indices")
        solr.drop_index(self.model['dataset']['name'])
        solr.build_index(self.model['dataset']['name'])

    @property
    def loader(self):
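        # Lazily construct the Loader from the model's dataset section;
        # 'description' and 'currency' are popped so the remaining dataset
        # keys become the loader's metadata.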
        if not hasattr(self, '_loader'):
            dataset = self.model.get('dataset').copy()

            self._loader = Loader(
                dataset_name=dataset.get('name'),
                unique_keys=dataset.get('unique_keys', ['_csv_import_fp']),
                label=dataset.get('label'),
                description=dataset.pop('description'),
                currency=dataset.pop('currency'),
                time_axis=times.GRANULARITY.get(dataset.get(
                    'temporal_granularity',
                    'year'
                )),
                metadata=dataset
            )
        return self._loader

    def process_line(self, line):
        if self.line_number % 1000 == 0:
            log.info('Imported %s lines', self.line_number)

        try:
            _line = self.validator.deserialize(line)
            if not self.dry_run:
                self.import_line(_line)
        except (Invalid, ImporterError) as e:
            self.add_error(e)

    def import_line(self, line):
        raise NotImplementedError("import_line not implemented in BaseImporter")

    def add_error(self, exception):
        err = DataError(exception=exception,
                        line_number=self.line_number,
                        source_file=self.source_file)

        if self.raise_errors:
            raise err

        self.on_error(err)
        self.errors.append(err)

        if self.max_errors and len(self.errors) >= self.max_errors:
            all_errors = "".join(map(lambda x: "\n  " + str(x), self.errors))
            raise TooManyErrorsError("The following errors occurred:" + all_errors)

    def _generate_fields(self):
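        # Flatten the model mapping into a list of field descriptors
        # (dimension, source column, datatype); make_validator() turns this
        # list into the per-line validator applied in process_line().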
        def _field(dimension, mapping, column_name, is_end=False):
            return {
                'dimension': dimension,
                'field': mapping.get(column_name),
                'datatype': mapping.get('datatype'),
                'is_end': is_end
            }

        fields = []

        for dimension, mapping in self.mapping.items():
            if mapping.get('type') == 'value':
                fields.append(_field(dimension, mapping, 'column'))

                if mapping.get('end_column'):
                    fields.append(_field(dimension,
                                         mapping,
                                         'end_column',
                                         True))
            else:
                for field in mapping.get('fields', []):
                    fields.append(_field(dimension, field, 'column'))

        self.fields = fields
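
The lines property and import_line() are left unimplemented above, so BaseImporter is only usable through a subclass that supplies both. The examples also reference helpers from the surrounding project (Model, Invalid, make_validator, ENTITY_TYPES, Loader, solr, times and the error classes) that are not shown here. As a rough sketch only, a concrete importer could look like the hypothetical CSVImporter below; the model dict merely mirrors the keys BaseImporter itself reads, and the real schema is whatever Model().deserialize() enforces.

import csv

class CSVImporter(BaseImporter):
    """Hypothetical subclass for illustration; not part of the examples above."""

    @property
    def lines(self):
        # Assumes self.data is a file-like object with CSV text; every row
        # becomes a dict that run() feeds to the validator and process_line().
        return csv.DictReader(self.data)

    def import_line(self, line):
        # A real importer would hand the validated row to self.loader here;
        # that part of the Loader API is not shown above, so the sketch just
        # collects the rows in memory.
        self.imported = getattr(self, 'imported', [])
        self.imported.append(line)

# Shape of the model as read by BaseImporter (dataset, mapping, views); the
# actual required fields are defined by the Model schema, not shown here.
model = {
    'dataset': {
        'name': 'example',
        'label': 'Example dataset',
        'description': 'Illustrative only',
        'currency': 'EUR',
        'temporal_granularity': 'year',
    },
    'mapping': {
        'amount': {'type': 'value', 'column': 'amount', 'datatype': 'float'},
    },
    'views': [],
}

# Usage, assuming the model passes Model().deserialize():
#
#     with open('spending.csv') as fh:
#         importer = CSVImporter(fh, model, source_file='spending.csv')
#         importer.run(dry_run=True)   # validate every row, load nothing
#
# A full import (dimensions, rows, views, indices) would call run() without
# dry_run on a fresh importer.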