Example #1
def test_dump_restore(log, spec, releases, tmpdir):
    for _ in spec.process_items(releases):
        pass
    spec.dump(tmpdir / "result.json")
    spec2 = DataPreprocessor.restore(tmpdir / "result.json")

    for name, table in spec.tables.items():
        assert table == spec2.tables[name]
    for key in (
            "schema",
            "root_tables",
            "combined_tables",
            "header_separator",
            "tables",
            "table_threshold",
            "total_items",
    ):
        assert key in spec2.__dict__
    with patch("builtins.open", mock_open(read_data="invalid")):
        spec2 = DataPreprocessor.restore(tmpdir / "result.json")
        log.assert_has_calls([call("Invalid pickle file. Can't restore.")])

    with patch("builtins.open", mock_open(read_data=b"invalid")):
        spec2 = DataPreprocessor.restore(tmpdir / "result.json")
        log.assert_has_calls([call("Invalid pickle file. Can't restore.")])
Example #2
    def create(self, request, *args, upload_id=None, url_id=None):
        data = request.data or request.POST
        kind = data.get("kind", DataSelection.CUSTOM)
        headings_type = DataSelection.OCDS

        if kind != DataSelection.OCDS_LITE:
            serializer = self.get_serializer_class()(data=data)
            if serializer.is_valid():
                datasource = (Url.objects.get(id=url_id)
                              if url_id else Upload.objects.get(id=upload_id))
                selection = DataSelection.objects.create(
                    kind=kind, headings_type=headings_type)
                spec = DataPreprocessor.restore(datasource.analyzed_file.path)
                for table in serializer.data["tables"]:
                    _table = Table.objects.create(**table)
                    _table.should_split = spec[_table.name].splitted
                    _table.save()
                    selection.tables.add(_table)
                datasource.selections.add(selection)
                return Response(self.get_serializer_class()(selection).data,
                                status=status.HTTP_201_CREATED)
            else:
                return Response({"detail": serializer.errors},
                                status=status.HTTP_400_BAD_REQUEST)
        else:
            datasource = (Url.objects.get(id=url_id)
                          if url_id else Upload.objects.get(id=upload_id))
            if not datasource.available_tables:
                return Response(
                    {"detail": _("Datasource without available tables")},
                    status=status.HTTP_400_BAD_REQUEST)
            lang_code = get_language()
            lang_prefix = lang_code.split("-")[0]
            headings_type = f"{lang_prefix}_user_friendly"
            selection = DataSelection.objects.create(
                kind=kind, headings_type=headings_type)
            spec = DataPreprocessor.restore(datasource.analyzed_file.path)
            for available_table in datasource.available_tables:
                if available_table["name"] in OCDS_LITE_CONFIG["tables"]:
                    _name = available_table["name"]
                    _split = OCDS_LITE_CONFIG["tables"][_name].get(
                        "split", False)
                    _table = Table.objects.create(name=_name, split=_split)
                    child_tables_data = spec.tables[_name].child_tables
                    if _split and child_tables_data:
                        for child_table in child_tables_data:
                            _include = child_table in OCDS_LITE_CONFIG[
                                "tables"][_name].get("child_tables", {})
                            _child_table = Table.objects.create(
                                name=child_table, include=_include)
                            _table.array_tables.add(_child_table)
                    selection.tables.add(_table)
            datasource.selections.add(selection)
            return Response(self.get_serializer_class()(selection).data,
                            status=status.HTTP_201_CREATED)
Example #3
class FileAnalyzer:
    """Main utility for analyzing files

    :param workdir: Working directory
    :param schema: JSON schema file to use with the data
    :param root_tables: Path configuration which should become root tables
    :param combined_tables: Path configuration for tables with multiple sources
    :param root_key: Field name to access records
    """

    def __init__(
        self,
        workdir,
        schema=None,
        state_file=None,
        root_tables=ROOT_TABLES,
        combined_tables=COMBINED_TABLES,
        root_key="releases",
        language=LOCALE,
        table_threshold=TABLE_THRESHOLD,
    ):
        self.workdir = Path(workdir)
        if state_file:
            self.spec = DataPreprocessor.restore(state_file)
        else:
            self.spec = DataPreprocessor(
                schema,
                root_tables,
                combined_tables=combined_tables,
                language=language,
                table_threshold=table_threshold,
            )
        self.root_key = root_key

    def analyze_file(self, filename, with_preview=True):
        """Analyze provided file
        :param filename: Input filename
        :param with_preview: Generate preview during analysis
        """
        path = self.workdir / filename
        with open(path, "rb") as fd:
            items = iter_file(fd, self.root_key)
            for count in self.spec.process_items(items, with_preview=with_preview):
                yield fd.tell(), count

    def dump_to_file(self, filename):
        """Save analyzed information to file

        :param filename: Output filename in working directory
        """
        path = self.workdir / filename
        self.spec.dump(path)
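
A minimal usage sketch for the FileAnalyzer above, assuming the working directory already contains a release package. The workdir path and file names ("releases.json", "analyzed.state") are hypothetical placeholders, not names from the project.

# Hypothetical usage of the FileAnalyzer shown above.
analyzer = FileAnalyzer("/tmp/workdir", root_key="releases")

# analyze_file() is a generator yielding (bytes_read, item_count), so the
# caller can report progress while DataPreprocessor builds its tables.
for bytes_read, count in analyzer.analyze_file("releases.json", with_preview=False):
    print(f"read {bytes_read} bytes, processed {count} items")

# Persist the analysis so it can be reloaded later via DataPreprocessor.restore().
analyzer.dump_to_file("analyzed.state")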
Example #4
 def __init__(
     self,
     workdir,
     schema=None,
     state_file=None,
     root_tables=ROOT_TABLES,
     combined_tables=COMBINED_TABLES,
     pkg_type="releases",
     language=LOCALE,
     table_threshold=TABLE_THRESHOLD,
 ):
     self.workdir = Path(workdir)
     self.multiple_values = False
     self.schema = schema
     self.root_tables = root_tables
     self.combined_tables = combined_tables
     self.language = language
     self.table_threshold = table_threshold
     if state_file:
         self.spec = DataPreprocessor.restore(state_file)
         self.sort_tables()
     else:
         self.spec = None
     self.pkg_type = pkg_type
     self.order = None
Example #5
 def update(self, request, *args, **kwargs):
     try:
         if "url_id" in kwargs:
             datasource = Url.objects.get(id=kwargs["url_id"])
         elif "upload_id" in kwargs:
             datasource = Upload.objects.get(id=kwargs["upload_id"])
         table = Table.objects.get(id=kwargs["id"])
         spec = DataPreprocessor.restore(datasource.analyzed_file.path)
         update_fields = []
         for key in ("split", "include", "heading"):
             if key in request.data:
                 setattr(table, key, request.data[key])
                 update_fields.append(key)
         if update_fields:
             table.save(update_fields=update_fields)
         is_array_tables = len(table.array_tables.all())
         if "split" in request.data and request.data[
                 "split"] and not is_array_tables:
             child_tables = spec.tables[table.name].child_tables
             self._split_table(table, spec.tables, datasource, child_tables)
         serializer = self.get_serializer_class()(table)
          sources = (table.dataselection_set.all() or
                     table.array_tables.all()[0].dataselection_set.all())
         if sources:
             sources[0].flattens.all().delete()
         return Response(serializer.data)
     except FileNotFoundError as e:
         extra = {
             "MESSAGE_ID": "update_table_failed",
             "DATASOURCE_ID": str(datasource.id),
             "TABLE_ID": kwargs["id"],
             "ERROR_MSG": str(e),
             "EXPIRED_AT": datasource.expired_at.isoformat(),
         }
         logger.info("Error while update table %s" % str(e), extra=extra)
         return Response({"detail": _("Datasource expired.")},
                         status=status.HTTP_404_NOT_FOUND)
     except OSError as e:
         extra = {
             "MESSAGE_ID": "update_table_failed",
             "DATASOURCE_ID": str(datasource.id),
             "TABLE_ID": kwargs["id"],
             "ERROR_MSG": str(e),
         }
         logger.info("Error while update table %s" % str(e), extra=extra)
         return Response(
             {
                 "detail":
                 _("Currently, the space limit was reached. Please try again later."
                   )
             },
             status=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
         )
Example #6
def set_column_headings(selection, analyzed_file_path):
    current_language_code = get_language()
    spec = DataPreprocessor.restore(analyzed_file_path)
    if selection.headings_type.startswith("es"):
        activate("es")
    for table in selection.tables.all():
        table.column_headings = get_column_headings(selection, spec, table)
        table.save(update_fields=["column_headings"])
        if table.split:
            for a_table in table.array_tables.all():
                a_table.column_headings = get_column_headings(
                    selection, spec, a_table)
                a_table.save(update_fields=["column_headings"])
    activate(current_language_code)
Example #7
 def __init__(
     self,
     workdir,
     schema=None,
     state_file=None,
     root_tables=ROOT_TABLES,
     combined_tables=COMBINED_TABLES,
     root_key="releases",
     language=LOCALE,
     table_threshold=TABLE_THRESHOLD,
 ):
     self.workdir = Path(workdir)
     if state_file:
         self.spec = DataPreprocessor.restore(state_file)
     else:
         self.spec = DataPreprocessor(
             schema,
             root_tables,
             combined_tables=combined_tables,
             language=language,
             table_threshold=table_threshold,
         )
     self.root_key = root_key
Example #8
 def analyze_file(self, filenames, with_preview=True):
     """Analyze provided file
     :param filename: Input filename
     :param with_preview: Generate preview during analysis
     """
     if not isinstance(filenames, list):
         filenames = [filenames]
     path = self.workdir / filenames[0]
     (
         input_format,
         _is_concatenated,
         _is_array,
     ) = detect_format(path=path, reader=get_reader(path))
     LOGGER.info(_("Input file is {}").format(input_format))
     self.multiple_values = _is_concatenated
     self.parse_schema(input_format, self.schema)
     if self.spec is None:
         self.spec = DataPreprocessor(
             self.schema,
             self.root_tables,
             combined_tables=self.combined_tables,
             language=self.language,
             table_threshold=self.table_threshold,
             multiple_values=self.multiple_values,
             pkg_type=self.pkg_type,
         )
     for filename in filenames:
         path = self.workdir / filename
         reader = get_reader(path)
         with reader(path, "rb") as fd:
             items = iter_file(fd,
                               self.pkg_type,
                               multiple_values=self.multiple_values)
              for count in self.spec.process_items(items, with_preview=with_preview):
                 yield fd.tell(), count
     self.sort_tables()
Example #9
File: utils.py Project: lttga/test
def get_flatten_options(selection):
    selections = {}
    exclude_tables_list = []
    spec = None

    if selection.kind == selection.OCDS_LITE:
        datasource = selection.url_set.all() or selection.upload_set.all()
        spec = DataPreprocessor.restore(datasource[0].analyzed_file.path)
    get_options_for_table(selections,
                          exclude_tables_list,
                          selection,
                          selection.tables,
                          analyzed_data=spec)
    options = {"selection": selections}
    if exclude_tables_list:
        options["exclude"] = exclude_tables_list
    return options
Example #10
def test_resolve_schema_uri():
    dp = DataPreprocessor(schema_path,
                          TEST_ROOT_TABLES,
                          combined_tables=TEST_COMBINED_TABLES)
    assert isinstance(dp.schema, dict)
Example #11
def available_tables():
    spec = DataPreprocessor.restore(ANALYZED_DATA_PATH)
    # with open(ANALYZED_DATA_PATH) as fd:
    #     data = json.loads(fd.read())
    _available_tables, unavailable_tables = retrieve_tables(spec)
    return _available_tables, unavailable_tables
Example #12
def spec_analyzed(schema, releases):
    dp = DataPreprocessor(schema, TEST_ROOT_TABLES, combined_tables=TEST_COMBINED_TABLES, with_preview=True)
    for _ in dp.process_items(releases):
        pass
    return dp
Example #13
 def list(self,
          request,
          url_id=None,
          upload_id=None,
          selection_id=None,
          table_id=None):
     table = Table.objects.get(id=table_id)
     if url_id:
         datasource = Url.objects.get(id=url_id)
     elif upload_id:
         datasource = Upload.objects.get(id=upload_id)
     datasource_dir = os.path.dirname(datasource.file.path)
     selection = DataSelection.objects.get(id=selection_id)
     try:
         spec = DataPreprocessor.restore(datasource.analyzed_file.path)
         data = []
         if table.split:
             preview_path = f"{datasource_dir}/{table.name}.csv"
             if not os.path.exists(preview_path):
                 store_preview_csv(COLUMNS, PREVIEW_ROWS,
                                   spec.tables[table.name], preview_path)
             with open(preview_path) as csvfile:
                 preview = {
                     "name": spec.tables[table.name].name,
                     "id": str(table.id),
                     "preview": csvfile.read(),
                     "heading": table.heading,
                 }
                 if selection.headings_type != selection.OCDS:
                     preview["column_headings"] = table.column_headings
             data.append(preview)
             for child_table in table.array_tables.all():
                 if not child_table.include:
                     continue
                 preview_path = f"{datasource_dir}/{child_table.name}_combined.csv"
                 with open(preview_path) as csvfile:
                     preview = {
                         "name": spec.tables[child_table.name].name,
                         "id": str(child_table.id),
                         "preview": csvfile.read(),
                         "heading": child_table.heading,
                     }
                     if selection.headings_type != selection.OCDS:
                          preview["column_headings"] = child_table.column_headings
                 data.append(preview)
         else:
             preview_path = f"{datasource_dir}/{table.name}_combined.csv"
             if not os.path.exists(preview_path):
                 store_preview_csv(COMBINED_COLUMNS, COMBINED_PREVIEW_ROWS,
                                   spec.tables[table.name], preview_path)
             with open(preview_path) as csvfile:
                 preview = {
                     "name": spec.tables[table.name].name,
                     "id": str(table.id),
                     "preview": csvfile.read(),
                     "heading": table.heading,
                 }
                 if selection.headings_type != selection.OCDS:
                     preview["column_headings"] = table.column_headings
                 data.append(preview)
         return Response(data)
     except FileNotFoundError as e:
         extra = {
             "MESSAGE_ID": "get_preview_failed",
             "DATASOURCE_ID": str(datasource.id),
             "TABLE_ID": table_id,
             "ERROR_MSG": str(e),
             "EXPIRED_AT": datasource.expired_at.isoformat(),
         }
         logger.info("Error while get table preview %s" % str(e),
                     extra=extra)
         return Response({"detail": _("Datasource expired.")},
                         status=status.HTTP_404_NOT_FOUND)
     except OSError as e:
         extra = {
             "MESSAGE_ID": "create_preview_failed",
             "DATASOURCE_ID": str(datasource.id),
             "TABLE_ID": table_id,
             "ERROR_MSG": str(e),
         }
         logger.info("Error while create preview %s" % str(e), extra=extra)
         return Response(
             {
                 "detail":
                 _("Currently, the space limit was reached. Please try again later."
                   )
             },
             status=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
         )
Example #14
File: tasks.py Project: lttga/test
def validate_data(object_id, model=None, lang_code="en"):
    with internationalization(lang_code=lang_code):
        logger_context = {"DATASOURCE_ID": object_id, "TASK": "validate_data"}
        ds_model, serializer = get_serializer_by_model(model, logger_context)
        channel_layer = get_channel_layer()
        if not ds_model:
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {
                    "type": "task.validate",
                    "error": _("Model %s for datasource not found") % model
                },
            )
            return
        try:
            is_valid = False
            datasource = ds_model.objects.get(id=object_id)

            datasource.status = "validation"
            datasource.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.validate",
                    "datasource":
                    serializer.to_representation(instance=datasource)
                },
            )

            logger.debug("Start validation for %s file" % object_id)
            with open(SCHEMA_PATH) as fd:
                schema = json.loads(fd.read())
            resource = ""
            if is_release_package(datasource.file.path):
                resource = "releases"
            elif is_record_package(datasource.file.path):
                resource = "records"
            if resource:
                path = pathlib.Path(datasource.file.path)
                workdir = path.parent
                filename = path.name
                total = path.stat().st_size
                analyzer = FileAnalyzer(workdir,
                                        schema=schema,
                                        root_key=resource,
                                        root_tables=ROOT_TABLES,
                                        combined_tables=COMBINED_TABLES)
                timestamp = time.time()
                for read, count in analyzer.analyze_file(filename,
                                                         with_preview=True):
                    if (time.time() - timestamp) <= 1:
                        continue
                    async_to_sync(channel_layer.group_send)(
                        f"datasource_{datasource.id}",
                        {
                            "type": "task.validate",
                            "datasource": {
                                "id": str(datasource.id)
                            },
                            "progress": {
                                "rows": count,
                                "percentage":
                                (read / total) * 100 if total else 0,
                                "size": total,
                                "read": read,
                            },
                        },
                    )
                    timestamp = time.time()
                is_valid = True

            datasource.validation.is_valid = is_valid
            datasource.root_key = resource
            datasource.validation.save(update_fields=["is_valid"])
            datasource.save(update_fields=["root_key"])

            if is_valid and not datasource.available_tables and not datasource.analyzed_file:
                _file = ContentFile(b"")
                datasource.analyzed_file.save("new", _file)
                analyzer.spec.dump(datasource.analyzed_file.path)
                available_tables, unavailable_tables = retrieve_tables(
                    analyzer.spec)
                datasource.available_tables = available_tables
                datasource.unavailable_tables = unavailable_tables
                datasource.save(
                    update_fields=["available_tables", "unavailable_tables"])
            elif is_valid and datasource.analyzed_file:
                spec = DataPreprocessor.restore(datasource.analyzed_file.path)
                available_tables, unavailable_tables = retrieve_tables(spec)
                datasource.available_tables = available_tables
                datasource.unavailable_tables = unavailable_tables
                datasource.save(
                    update_fields=["available_tables", "unavailable_tables"])

            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.validate",
                    "datasource":
                    serializer.to_representation(instance=datasource)
                },
            )
        except ObjectDoesNotExist:
            logger_context["MODEL"] = model
            logger_context["MESSAGE_ID"] = "datasource_not_found"
            logger.info("Datasource %s %s not found" % (model, object_id),
                        extra=logger_context)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {
                    "type": "task.validate",
                    "error": _("Datasource %s not found") % object_id
                },
            )
        except (ijson.JSONError, ijson.IncompleteJSONError) as e:
            logger.info(
                "Error while validating data %s" % object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_ERROR": str(e)
                },
            )
            message = _("Error while validating data `%s`") % str(e)
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.validate",
                    "error": message
                },
            )
        except OSError as e:
            logger.exception(
                "Error while validating data %s" % object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_ERROR": str(e)
                },
            )
            message = _(
                "Currently, the space limit was reached. Please try again later."
            )
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.validate",
                    "error": message
                },
            )
        except Exception as e:
            logger.exception(
                "Error while validating data %s" % object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_ERROR": str(e)
                },
            )
            message = _("Error while validating data `%s`") % str(e)
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.validate",
                    "error": message
                },
            )
Example #15
File: tasks.py Project: lttga/test
def flatten_data(flatten_id, model=None, lang_code="en_US"):
    with internationalization(lang_code=lang_code):
        logger_context = {
            "FLATTEN_ID": flatten_id,
            "TASK": "flatten_data",
            "MODEL": model
        }
        channel_layer = get_channel_layer()
        if model not in getters:
            extra = {
                "MESSAGE_ID": "model_not_registered",
                "MODEL": model,
                "TASK": "flatten_data",
                "FLATTEN_ID": flatten_id,
            }
            logger.info("Model %s not registered in getters" % model,
                        extra=extra)
            return
        try:
            serializer = FlattenSerializer()
            flatten = Flatten.objects.get(id=flatten_id)
            selection = flatten.dataselection_set.all()[0]
            datasource = getattr(selection, f"{model.lower()}_set").all()[0]
            flatten.status = "processing"
            flatten.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.flatten",
                    "flatten": serializer.to_representation(instance=flatten)
                },
            )
            spec = DataPreprocessor.restore(datasource.analyzed_file.path)
            total_rows = spec.total_items
            opt = get_flatten_options(selection)
            logger.debug(
                "Generate options for export",
                extra={
                    "MESSAGE_ID": "generate_flatten_options",
                    "DATASOURCE_ID": str(datasource.id),
                    "MODEL": model,
                    "SELECTION_ID": str(selection.id),
                    "FLATTEN_ID": str(flatten.id),
                    "OPTIONS": opt,
                },
            )
            options = FlattenOptions(**opt)
            workdir = pathlib.Path(datasource.file.path).parent
            formats = {"csv": None, "xlsx": None}
            if flatten.export_format == flatten.CSV:
                workdir = workdir / "export"
                if not workdir.exists():
                    os.makedirs(workdir)
                formats[flatten.export_format] = workdir
            else:
                formats[flatten.export_format] = "result.xlsx"
            flattener = FileFlattener(workdir,
                                      options,
                                      spec.tables,
                                      root_key=datasource.root_key,
                                      **formats)
            timestamp = time.time()
            for count in flattener.flatten_file(datasource.file.path):
                if (time.time() - timestamp) <= 1:
                    continue
                async_to_sync(channel_layer.group_send)(
                    f"datasource_{datasource.id}",
                    {
                        "type": "task.flatten",
                        "flatten": {
                            "id": str(flatten.id)
                        },
                        "progress": {
                            "total_rows":
                            total_rows,
                            "processed":
                            count,
                            "percentage": (count / total_rows) *
                            100 if total_rows else total_rows,
                        },
                    },
                )
                timestamp = time.time()
            if flatten.export_format == flatten.CSV:
                target_file = f"{workdir}/{datasource.id}.zip"
                zip_files(workdir, target_file, extension="csv")
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = f"{datasource.id}.zip"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                os.remove(fd.name)
            else:
                target_file = f"{workdir}/result.xlsx"
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = "result.xlsx"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                os.remove(fd.name)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.flatten",
                    "flatten": serializer.to_representation(instance=flatten)
                },
            )
        except ObjectDoesNotExist:
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_not_found"
            logger.info("Flatten %s for %s model not found" %
                        (flatten_id, model),
                        extra=extra)
        except OSError as e:
            extra = deepcopy(logger_context)
            extra.update({
                "MESSAGE_ID": "flatten_no_left_space",
                "DATASOURCE_ID": str(datasource.id),
                "ERROR_MSG": str(e)
            })
            logger.info("Flatten %s for %s model failed: %s" %
                        (flatten_id, model, e),
                        extra=extra)
            flatten.status = "failed"
            flatten.error = _(
                "Currently, the space limit was reached. Please try again later."
            )
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.flatten",
                    "flatten": serializer.to_representation(instance=flatten)
                },
            )
        except (TypeError, Exception) as e:
            error_message = str(e)
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_failed"
            extra["ERROR_MESSAGE"] = error_message
            logger.error(
                "Flatten %s for %s datasource %s failed" %
                (flatten_id, model, datasource.id),
                extra=extra,
                exc_info=True,
            )
            flatten.status = "failed"
            flatten.error = error_message
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.flatten",
                    "flatten": serializer.to_representation(instance=flatten)
                },
            )
Example #16
def spec(schema):
    return DataPreprocessor(schema, TEST_ROOT_TABLES, combined_tables=TEST_COMBINED_TABLES)
Example #17
def validate_data(object_id, model=None, lang_code="en"):

    with internationalization(lang_code=lang_code):
        logger_context = {"DATASOURCE_ID": object_id, "TASK": "validate_data"}
        ds_model, serializer = get_serializer_by_model(model, logger_context)
        channel_layer = get_channel_layer()
        if not ds_model:
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {
                    "type": "task.validate",
                    "error": _("Model %s for datasource not found") % model
                },
            )
            return
        try:
            is_valid = False
            datasource = ds_model.objects.get(id=object_id)

            datasource.status = "validation"
            datasource.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.validate",
                    "datasource":
                    serializer.to_representation(instance=datasource)
                },
            )

            logger.debug("Start validation for %s file", object_id)
            paths = [
                pathlib.Path(file.file.path)
                for file in datasource.files.all()
            ]
            workdir = paths[0].parent
            filenames = [pathlib.Path(path).name for path in paths]

            total = sum([
                pathlib.Path(path).stat().st_size
                if get_reader(path) == open else gz_size(path)
                for path in paths
            ])
            analyzer = FileAnalyzer(workdir,
                                    root_tables=ROOT_TABLES,
                                    combined_tables=COMBINED_TABLES)

            timestamp = time.time()
            filepaths = [workdir / filename for filename in filenames]
            for read, count in analyzer.analyze_file(filepaths,
                                                     with_preview=True):
                if (time.time() - timestamp) <= 1:
                    continue
                async_to_sync(channel_layer.group_send)(
                    f"datasource_{datasource.id}",
                    {
                        "type": "task.validate",
                        "datasource": {
                            "id": str(datasource.id)
                        },
                        "progress": {
                            "rows": count,
                            "percentage": (read / total) * 100 if total else 0,
                            "size": total,
                            "read": read,
                        },
                    },
                )
                timestamp = time.time()
            is_valid = True

            datasource.validation.is_valid = is_valid
            datasource.root_key = analyzer.pkg_type
            datasource.validation.save(update_fields=["is_valid"])
            datasource.order = ", ".join(analyzer.order)
            datasource.save()

            if is_valid and not datasource.available_tables and not datasource.analyzed_file:
                _file = ContentFile(b"")
                datasource.analyzed_file.save("new", _file)
                analyzer.spec.dump(datasource.analyzed_file.path)
                available_tables, unavailable_tables = retrieve_tables(
                    analyzer.spec)
                datasource.available_tables = available_tables
                datasource.unavailable_tables = unavailable_tables
                datasource.save(
                    update_fields=["available_tables", "unavailable_tables"])
            elif is_valid and datasource.analyzed_file:
                spec = DataPreprocessor.restore(datasource.analyzed_file.path)
                available_tables, unavailable_tables = retrieve_tables(spec)
                datasource.available_tables = available_tables
                datasource.unavailable_tables = unavailable_tables
                datasource.save(
                    update_fields=["available_tables", "unavailable_tables"])

            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.validate",
                    "datasource":
                    serializer.to_representation(instance=datasource)
                },
            )
        except ObjectDoesNotExist:
            logger_context["MODEL"] = model
            logger_context["MESSAGE_ID"] = "datasource_not_found"
            logger.info("Datasource %s %s not found",
                        model,
                        object_id,
                        extra=logger_context)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {
                    "type": "task.validate",
                    "error": _("Datasource %s not found") % object_id
                },
            )
        except (ijson.JSONError, ijson.IncompleteJSONError) as e:
            logger.info(
                "Error while validating data %s",
                object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_EXCEPTION": e.__class__.__name__,
                    "STR_ERROR": str(e),
                },
            )
            message = _("Error while validating data `%s`") % str(e)
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.validate",
                    "error": message
                },
            )
        except OSError as e:
            logger.exception(
                "Error while validating data %s",
                object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_EXCEPTION": e.__class__.__name__,
                    "STR_ERROR": str(e),
                },
            )
            message = (_(
                "Currently, the space limit was reached. Please try again later."
            ) if "[Errno 28]" in str(e) else _(
                "Something went wrong during processing of your file, please contact support"
            ))
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.validate",
                    "error": message
                },
            )
        except Exception as e:
            logger.exception(
                "Error while validating data %s",
                object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_EXCEPTION": e.__class__.__name__,
                    "STR_ERROR": str(e),
                },
            )
            message = _("Error while validating data `%s`") % str(e)
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.validate",
                    "error": message
                },
            )
Example #18
def flatten_data(flatten_id, model=None, lang_code="en_US"):
    with internationalization(lang_code=lang_code):
        logger_context = {
            "FLATTEN_ID": flatten_id,
            "TASK": "flatten_data",
            "MODEL": model
        }
        channel_layer = get_channel_layer()
        if model not in getters:
            extra = {
                "MESSAGE_ID": "model_not_registered",
                "MODEL": model,
                "TASK": "flatten_data",
                "FLATTEN_ID": flatten_id,
            }
            logger.info("Model %s not registered in getters",
                        model,
                        extra=extra)
            return
        try:
            serializer = FlattenSerializer()
            flatten = Flatten.objects.get(id=flatten_id)
            selection = flatten.dataselection_set.all()[0]
            datasource = getattr(selection, f"{model.lower()}_set").all()[0]
            flatten.status = "processing"
            flatten.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.flatten",
                    "flatten": serializer.to_representation(instance=flatten)
                },
            )
            spec = DataPreprocessor.restore(datasource.analyzed_file.path)
            total_rows = spec.total_items
            opt = get_flatten_options(selection)
            # In case of exclusion of child tables, 'split' of root table should be set to 'False' for proper export
            # TODO: There should be better way to handle this (probably on library side)
            if "exclude" in opt:
                for _table in opt["exclude"]:
                    _parent = spec.tables[_table].parent
                    if _parent != "" and _parent.name in opt["selection"]:
                        opt["selection"][_parent.name]["split"] = False
            logger.debug(
                "Generate options for export",
                extra={
                    "MESSAGE_ID": "generate_flatten_options",
                    "DATASOURCE_ID": str(datasource.id),
                    "MODEL": model,
                    "SELECTION_ID": str(selection.id),
                    "FLATTEN_ID": str(flatten.id),
                    "OPTIONS": opt,
                },
            )
            options = FlattenOptions(**opt)
            files = [file.file.path for file in datasource.files.all()]
            workdir = pathlib.Path(files[0]).parent
            formats = {"csv": None, "xlsx": None}
            if flatten.export_format == flatten.CSV:
                workdir = workdir / "export"
                if not workdir.exists():
                    os.makedirs(workdir)
                formats[flatten.export_format] = workdir
            else:
                formats[flatten.export_format] = "result.xlsx"
            flattener = FileFlattener(
                workdir,
                options,
                tables=spec.tables,
                pkg_type=datasource.root_key,
                multiple_values=getattr(spec, "multiple_values", False),
                schema=spec.schema,
                **formats,
            )
            timestamp = time.time()
            for count in flattener.flatten_file(files):
                if (time.time() - timestamp) <= 1:
                    continue
                async_to_sync(channel_layer.group_send)(
                    f"datasource_{datasource.id}",
                    {
                        "type": "task.flatten",
                        "flatten": {
                            "id": str(flatten.id)
                        },
                        "progress": {
                            "total_rows":
                            total_rows,
                            "processed":
                            count,
                            "percentage": (count / total_rows) *
                            100 if total_rows else total_rows,
                        },
                    },
                )
                timestamp = time.time()

            if flatten.export_format == flatten.CSV:
                target_file = f"{workdir}/{datasource.id}.zip"
                zip_files(workdir, target_file, extension="csv")
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = f"{datasource.id}.zip"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                os.remove(fd.name)
            else:
                target_file = f"{workdir}/result.xlsx"
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = "result.xlsx"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                os.remove(fd.name)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.flatten",
                    "flatten": serializer.to_representation(instance=flatten)
                },
            )
        except ObjectDoesNotExist:
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_not_found"
            logger.info("Flatten %s for %s model not found",
                        flatten_id,
                        model,
                        extra=extra)
        except OSError as e:
            extra = deepcopy(logger_context)
            extra.update({
                "MESSAGE_ID": "flatten_no_left_space",
                "DATASOURCE_ID": str(datasource.id),
                "ERROR_MSG": str(e)
            })
            logger.info("Flatten %s for %s model failed: %s",
                        flatten_id,
                        model,
                        e,
                        extra=extra)
            flatten.status = "failed"
            flatten.error = (_(
                "Currently, the space limit was reached. Please try again later."
            ) if "[Errno 28]" in str(e) else _(
                "Something went wrong during processing of your file, please contact support"
            ))
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.flatten",
                    "flatten": serializer.to_representation(instance=flatten)
                },
            )
        except (TypeError, Exception) as e:
            error_message = str(e)
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_failed"
            extra["ERROR_MESSAGE"] = error_message
            logger.error(
                "Flatten %s for %s datasource %s failed",
                flatten_id,
                model,
                datasource.id,
                extra=extra,
                exc_info=True,
            )
            flatten.status = "failed"
            flatten.error = error_message
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {
                    "type": "task.flatten",
                    "flatten": serializer.to_representation(instance=flatten)
                },
            )
Example #19
class FileAnalyzer:
    """Main utility for analyzing files
    :param workdir: Working directory
    :param schema: JSON schema file to use with the data
    :param root_tables: Path configuration which should become root tables
    :param combined_tables: Path configuration for tables with multiple sources
    :param pkg_type: Field name to access records
    :param language: Language to use for the human-readable headings
    :param table_threshold: The maximum number of elements in an array before it is split into a table
    """
    def __init__(
        self,
        workdir,
        schema=None,
        state_file=None,
        root_tables=ROOT_TABLES,
        combined_tables=COMBINED_TABLES,
        pkg_type="releases",
        language=LOCALE,
        table_threshold=TABLE_THRESHOLD,
    ):
        self.workdir = Path(workdir)
        self.multiple_values = False
        self.schema = schema
        self.root_tables = root_tables
        self.combined_tables = combined_tables
        self.language = language
        self.table_threshold = table_threshold
        if state_file:
            self.spec = DataPreprocessor.restore(state_file)
            self.sort_tables()
        else:
            self.spec = None
        self.pkg_type = pkg_type
        self.order = None

    def analyze_file(self, filenames, with_preview=True):
        """Analyze provided file
        :param filename: Input filename
        :param with_preview: Generate preview during analysis
        """
        if not isinstance(filenames, list):
            filenames = [filenames]
        path = self.workdir / filenames[0]
        (
            input_format,
            _is_concatenated,
            _is_array,
        ) = detect_format(path=path, reader=get_reader(path))
        LOGGER.info(_("Input file is {}").format(input_format))
        self.multiple_values = _is_concatenated
        self.parse_schema(input_format, self.schema)
        if self.spec is None:
            self.spec = DataPreprocessor(
                self.schema,
                self.root_tables,
                combined_tables=self.combined_tables,
                language=self.language,
                table_threshold=self.table_threshold,
                multiple_values=self.multiple_values,
                pkg_type=self.pkg_type,
            )
        for filename in filenames:
            path = self.workdir / filename
            reader = get_reader(path)
            with reader(path, "rb") as fd:
                items = iter_file(fd,
                                  self.pkg_type,
                                  multiple_values=self.multiple_values)
                for count in self.spec.process_items(items, with_preview=with_preview):
                    yield fd.tell(), count
        self.sort_tables()

    def dump_to_file(self, filename):
        """Save analyzed information to file
        :param filename: Output filename in working directory
        """
        path = self.workdir / filename
        self.spec.dump(path)

    def parse_schema(self, input_format, schema=None):
        if schema:
            schema = resolve_file_uri(schema)
        if "release" in input_format:
            pkg_type = "releases"
            getter = attrgetter("release_package_schema")
        else:
            pkg_type = "records"
            getter = attrgetter("record_package_schema")
        url = DEFAULT_SCHEMA_URL[pkg_type].get(
            self.language[:2], DEFAULT_SCHEMA_URL[pkg_type]["en"])
        if not schema:
            LOGGER.info(
                _("No schema provided, using version {}").format(
                    CURRENT_SCHEMA_TAG))
            profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {},
                                     schema_base_url=url)
            schema = getter(profile)()
        title = schema.get("title", "").lower()
        if not title:
            raise ValueError(
                _("Incomplete schema, please make sure your data is correct"))
        if "package" in title:
            # TODO: is this a good way to get the release/record schema?
            schema = jsonref.JsonRef.replace_refs(schema)
            schema = schema["properties"][pkg_type]["items"]

        self.schema = schema
        self.pkg_type = pkg_type

    def sort_tables(self):
        """
        Sort tables according to order of arrays in schema
        :return:
        """
        self.order = get_order(self.spec.schema["properties"].keys())
        out_schema_tables = {
            name: table
            for name, table in self.spec.tables.items()
            if name.split("_")[0] not in self.order
        }
        within_schema_tables = {
            name: table
            for name, table in self.spec.tables.items()
            if name.split("_")[0] in self.order
        }

        sorted_tables = dict(
            sorted(
                within_schema_tables.items(),
                key=lambda sheet: self.order.index(sheet[0].split("_")[0])
                if sheet[0].split("_")[0] in self.order else -1,
            ))
        self.spec.tables = {**sorted_tables, **out_schema_tables}
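
A hedged sketch of the multi-file variant above; the workdir and file names are hypothetical placeholders. Passing state_file on a later run restores a previously dumped DataPreprocessor instead of re-analyzing from scratch.

# Hypothetical usage of the multi-file FileAnalyzer shown above.
analyzer = FileAnalyzer("/tmp/workdir", pkg_type="releases")

# analyze_file() accepts a single filename or a list of filenames; each yield
# reports the current file offset and the running item count.
for offset, count in analyzer.analyze_file(["part1.json", "part2.json"]):
    print(f"offset={offset}, items={count}")
analyzer.dump_to_file("analyzed.state")

# A later run can reuse the dumped state instead of analyzing again.
resumed = FileAnalyzer("/tmp/workdir", state_file="/tmp/workdir/analyzed.state")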
Example #20
def spec_analyzed(schema, releases):
    dp = DataPreprocessor(schema, TEST_ROOT_TABLES, combined_tables=TEST_COMBINED_TABLES)
    for _ in dp.process_items(releases):
        pass
    return dp
Example #21
    def update(self, request, *args, **kwargs):
        try:
            datasource = (Url.objects.get(id=kwargs["url_id"])
                          if "url_id" in kwargs else Upload.objects.get(
                              id=kwargs["upload_id"]))
            table = Table.objects.get(id=kwargs["id"])
            spec = DataPreprocessor.restore(datasource.analyzed_file.path)
            update_fields = []
            for key in ("split", "include", "heading"):
                if key in request.data:
                    setattr(table, key, request.data[key])
                    # Remove "grandchildren" (child tables of child tables) if such are present
                    if key in ("split",
                               "include") and request.data[key] is False:
                        if table.array_tables and not table.parent:
                            for array_table in list(table.array_tables.all()):
                                setattr(array_table, key, False)
                                array_table.save()
                        if table.array_tables and table.parent:
                            parent = table.array_tables.all()[0]
                            for array_table in list(parent.array_tables.all()):
                                setattr(
                                    array_table,
                                    key,
                                    False if array_table.parent == table.name
                                    else getattr(array_table, key),
                                )
                                array_table.save()
                    # Forbid merge of table if any of child arrays is unmergeable
                    if (key == "split" and request.data[key] is False
                            and table.array_tables and False in [
                                _table.mergeable
                                for _table in list(table.array_tables.all())
                                if _table.include is True
                            ]):

                        return Response(
                            {
                                "detail":
                                _("Cannot merge '%(table_name)s' - child arrays are too large"
                                  ) % {
                                      "table_name": table.name
                                  }
                            },
                            status=status.HTTP_400_BAD_REQUEST,
                        )

                    update_fields.append(key)
            if update_fields:
                table.save(update_fields=update_fields)
            is_array_tables = len(table.array_tables.all())
            if "split" in request.data and request.data[
                    "split"] and not is_array_tables:
                child_tables = spec.tables[table.name].child_tables
                self._split_table(table, spec.tables, datasource, child_tables)
            serializer = self.get_serializer_class()(table)
            sources = (table.dataselection_set.all() or
                       table.array_tables.all()[0].dataselection_set.all())
            if sources:
                sources[0].flattens.all().delete()
            return Response(serializer.data)
        except FileNotFoundError as e:
            extra = {
                "MESSAGE_ID": "update_table_failed",
                "DATASOURCE_ID": str(datasource.id),
                "TABLE_ID": kwargs["id"],
                "ERROR_MSG": str(e),
                "EXPIRED_AT": datasource.expired_at.isoformat(),
            }
            logger.info("Error while update table %s" % str(e), extra=extra)
            return Response({"detail": _("Datasource expired.")},
                            status=status.HTTP_404_NOT_FOUND)
        except OSError as e:
            extra = {
                "MESSAGE_ID": "update_table_failed",
                "DATASOURCE_ID": str(datasource.id),
                "TABLE_ID": kwargs["id"],
                "ERROR_MSG": str(e),
            }
            logger.info("Error while update table %s" % str(e), extra=extra)
            return Response(
                {
                    "detail":
                    _("Currently, the space limit was reached. Please try again later."
                      )
                },
                status=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
            )