Python DataPreprocessor.dump Examples

Programming Language: Python

Namespace/Package Name: spoonbill.stats

Class/Type: DataPreprocessor

Method/Function: dump

Examples at hotexamples.com: 2

Python DataPreprocessor.dump - 2 examples found. These are the top-rated real-world Python examples of spoonbill.stats.DataPreprocessor.dump, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

restore(14)

DataPreprocessor(6)

process_items(4)

dump(2)

Example #1

Show file

File: __init__.py Project: lttga/test2

class FileAnalyzer:
    """Entry point for analyzing data files located in a working directory.

    :param workdir: Working directory; input and output filenames are
        resolved relative to it
    :param schema: Json schema file to use with data
    :param state_file: When truthy, the analysis state is loaded with
        ``DataPreprocessor.restore`` and the schema/table arguments are unused
    :param root_tables: Path configuration which should become root tables
    :param combined_tables: Path configuration for tables with multiple sources
    :param root_key: Field name to access records
    :param language: Forwarded to ``DataPreprocessor`` as ``language``
    :param table_threshold: Forwarded to ``DataPreprocessor`` as
        ``table_threshold``
    """

    def __init__(
        self,
        workdir,
        schema=None,
        state_file=None,
        root_tables=ROOT_TABLES,
        combined_tables=COMBINED_TABLES,
        root_key="releases",
        language=LOCALE,
        table_threshold=TABLE_THRESHOLD,
    ):
        self.workdir = Path(workdir)
        # Either resume from a saved state or start a fresh preprocessor.
        self.spec = (
            DataPreprocessor.restore(state_file)
            if state_file
            else DataPreprocessor(
                schema,
                root_tables,
                combined_tables=combined_tables,
                language=language,
                table_threshold=table_threshold,
            )
        )
        self.root_key = root_key

    def analyze_file(self, filename, with_preview=True):
        """Run the analysis over a single file, reporting progress lazily.

        This is a generator: for every value produced by
        ``self.spec.process_items`` it yields a ``(position, value)`` tuple,
        where ``position`` is the current offset of the open file handle.

        :param filename: Input filename, relative to the working directory
        :param with_preview: Generate preview during analysis
        """
        source = self.workdir / filename
        with open(source, "rb") as stream:
            records = iter_file(stream, self.root_key)
            progress = self.spec.process_items(records, with_preview=with_preview)
            for processed in progress:
                yield stream.tell(), processed

    def dump_to_file(self, filename):
        """Write the collected analysis state to a file.

        :param filename: Output filename in working directory
        """
        self.spec.dump(self.workdir / filename)

Example #2

Show file

File: __init__.py Project: open-contracting/spoonbill

class FileAnalyzer:
    """Main utility for analyzing files

    :param workdir: Working directory
    :param schema: Json schema file to use with data
    :param state_file: When truthy, the analysis state is loaded with
        ``DataPreprocessor.restore`` and the tables are sorted immediately
    :param root_tables: Path configuration which should become root tables
    :param combined_tables: Path configuration for tables with multiple sources
    :param pkg_type: Field name to access records
    :param language: Language to use for the human-readable headings
    :param table_threshold: The maximum number of elements in an array before it is split into a table
    """

    def __init__(
        self,
        workdir,
        schema=None,
        state_file=None,
        root_tables=ROOT_TABLES,
        combined_tables=COMBINED_TABLES,
        pkg_type="releases",
        language=LOCALE,
        table_threshold=TABLE_THRESHOLD,
    ):
        self.workdir = Path(workdir)
        self.multiple_values = False
        self.schema = schema
        self.root_tables = root_tables
        self.combined_tables = combined_tables
        self.language = language
        self.table_threshold = table_threshold
        self.pkg_type = pkg_type
        # Must be initialised *before* the restore branch: sort_tables()
        # assigns self.order, and resetting it afterwards would throw the
        # freshly computed order away.
        self.order = None
        if state_file:
            self.spec = DataPreprocessor.restore(state_file)
            self.sort_tables()
        else:
            self.spec = None

    def analyze_file(self, filenames, with_preview=True):
        """Analyze provided file(s)

        This is a generator: for every value produced by
        ``self.spec.process_items`` it yields a ``(position, value)`` tuple,
        where ``position`` is the current offset of the open file handle.
        The input format is detected from the first file only; the tables
        are sorted once all files have been consumed.

        :param filenames: Input filename, or list of input filenames,
            relative to the working directory
        :param with_preview: Generate preview during analysis
        """
        # NOTE(review): with_preview is accepted but not forwarded to
        # process_items() below - confirm whether that is intentional.
        if not isinstance(filenames, list):
            filenames = [filenames]
        path = self.workdir / filenames[0]
        input_format, is_concatenated, _is_array = detect_format(
            path=path, reader=get_reader(path)
        )
        LOGGER.info(_("Input file is {}").format(input_format))
        self.multiple_values = is_concatenated
        self.parse_schema(input_format, self.schema)
        if self.spec is None:
            self.spec = DataPreprocessor(
                self.schema,
                self.root_tables,
                combined_tables=self.combined_tables,
                language=self.language,
                table_threshold=self.table_threshold,
                multiple_values=self.multiple_values,
                pkg_type=self.pkg_type,
            )
        for filename in filenames:
            path = self.workdir / filename
            reader = get_reader(path)
            with reader(path, "rb") as fd:
                items = iter_file(
                    fd, self.pkg_type, multiple_values=self.multiple_values
                )
                for count in self.spec.process_items(items):
                    yield fd.tell(), count
        self.sort_tables()

    def dump_to_file(self, filename):
        """Save analyzed information to file

        :param filename: Output filename in working directory
        """
        path = self.workdir / filename
        self.spec.dump(path)

    def parse_schema(self, input_format, schema=None):
        """Resolve the schema to analyze with and store it on the instance

        Sets ``self.schema`` and ``self.pkg_type``. The package type is
        "releases" when ``input_format`` contains "release", otherwise
        "records".

        :param input_format: Detected input format name
        :param schema: Optional schema; when truthy it is passed through
            ``resolve_file_uri``, otherwise a default schema is built with
            ``ProfileBuilder``
        :raises ValueError: If the resulting schema has no title
        """
        if schema:
            schema = resolve_file_uri(schema)
        if "release" in input_format:
            pkg_type = "releases"
            getter = attrgetter("release_package_schema")
        else:
            pkg_type = "records"
            getter = attrgetter("record_package_schema")
        # Prefer the schema URL for the two-letter language code, falling
        # back to the English one.
        url = DEFAULT_SCHEMA_URL[pkg_type].get(
            self.language[:2], DEFAULT_SCHEMA_URL[pkg_type]["en"]
        )
        if not schema:
            LOGGER.info(
                _("No schema provided, using version {}").format(CURRENT_SCHEMA_TAG)
            )
            profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {}, schema_base_url=url)
            schema = getter(profile)()
        title = schema.get("title", "").lower()
        if not title:
            raise ValueError(
                _("Incomplete schema, please make sure your data is correct")
            )
        if "package" in title:
            # TODO: is this a good way to get release/record schema
            schema = jsonref.JsonRef.replace_refs(schema)
            schema = schema["properties"][pkg_type]["items"]

        self.schema = schema
        self.pkg_type = pkg_type

    def sort_tables(self):
        """
        Sort tables according to order of arrays in schema

        Tables whose name prefix (the part before the first "_") appears in
        the schema order come first, sorted by that order; all remaining
        tables follow in their existing relative order. Updates
        ``self.order`` and replaces ``self.spec.tables``.

        :return:
        """
        self.order = get_order(self.spec.schema["properties"].keys())
        # Map each name to its first position so every sort key is a dict
        # lookup instead of a list.index() scan (same result as index()).
        rank = {}
        for position, name in enumerate(self.order):
            rank.setdefault(name, position)

        # Single pass: split tables into those known to the schema order and
        # the rest, keeping insertion order within each group.
        within_schema_tables = {}
        out_schema_tables = {}
        for name, table in self.spec.tables.items():
            if name.split("_")[0] in rank:
                within_schema_tables[name] = table
            else:
                out_schema_tables[name] = table

        # sorted() is stable, so tables sharing a prefix keep their order.
        sorted_tables = dict(
            sorted(
                within_schema_tables.items(),
                key=lambda sheet: rank[sheet[0].split("_")[0]],
            )
        )
        self.spec.tables = {**sorted_tables, **out_schema_tables}