Example #1
0
class FileAnalyzer:
    """Entry point for analyzing data files kept in a working directory

    :param workdir: Working directory; filenames passed to the methods are resolved against it
    :param schema: Json schema file to use with data
    :param state_file: When truthy, the analysis state is restored from it and
        ``schema``, ``root_tables``, ``combined_tables``, ``language`` and
        ``table_threshold`` are not used
    :param root_tables: Path configuration which should become root tables
    :param combined_tables: Path configuration for tables with multiple sources
    :param root_key: Field name to access records
    :param language: Forwarded as ``language`` to ``DataPreprocessor``
    :param table_threshold: Forwarded as ``table_threshold`` to ``DataPreprocessor``
    """

    def __init__(
        self,
        workdir,
        schema=None,
        state_file=None,
        root_tables=ROOT_TABLES,
        combined_tables=COMBINED_TABLES,
        root_key="releases",
        language=LOCALE,
        table_threshold=TABLE_THRESHOLD,
    ):
        self.workdir = Path(workdir)
        if state_file:
            # A saved state replaces every schema/table option.
            spec = DataPreprocessor.restore(state_file)
        else:
            options = {
                "combined_tables": combined_tables,
                "language": language,
                "table_threshold": table_threshold,
            }
            spec = DataPreprocessor(schema, root_tables, **options)
        self.spec = spec
        self.root_key = root_key

    def analyze_file(self, filename, with_preview=True):
        """Analyze provided file

        This is a generator; the file is opened and read only while it is
        being iterated.

        :param filename: Input filename
        :param with_preview: Generate preview during analysis
        :return: Yields ``(position, count)`` pairs, where ``position`` is the
            current offset in the input file and ``count`` is the value
            produced by ``process_items``
        """
        source = self.workdir / filename
        with open(source, "rb") as stream:
            records = iter_file(stream, self.root_key)
            progress = self.spec.process_items(records, with_preview=with_preview)
            for processed in progress:
                yield stream.tell(), processed

    def dump_to_file(self, filename):
        """Save analyzed information to file

        :param filename: Output filename in working directory
        """
        self.spec.dump(self.workdir / filename)
Example #2
0
class FileAnalyzer:
    """Main utility for analyzing files

    :param workdir: Working directory
    :param schema: Json schema file to use with data
    :param state_file: When truthy, the analysis state is restored from it
        instead of being built on the first ``analyze_file`` call
    :param root_tables: Path configuration which should become root tables
    :param combined_tables: Path configuration for tables with multiple sources
    :param pkg_type: Field name to access records
    :param language: Language to use for the human-readable headings
    :param table_threshold: The maximum number of elements in an array before it is split into a table
    """

    def __init__(
        self,
        workdir,
        schema=None,
        state_file=None,
        root_tables=ROOT_TABLES,
        combined_tables=COMBINED_TABLES,
        pkg_type="releases",
        language=LOCALE,
        table_threshold=TABLE_THRESHOLD,
    ):
        self.workdir = Path(workdir)
        self.multiple_values = False
        self.schema = schema
        self.root_tables = root_tables
        self.combined_tables = combined_tables
        self.language = language
        self.table_threshold = table_threshold
        self.pkg_type = pkg_type
        # Must be initialised before sort_tables() runs below, otherwise the
        # order it computes would be overwritten with None.
        self.order = None
        if state_file:
            self.spec = DataPreprocessor.restore(state_file)
            self.sort_tables()
        else:
            # Created by analyze_file once the input format is known.
            self.spec = None

    def analyze_file(self, filenames, with_preview=True):
        """Analyze provided file(s)

        This is a generator: nothing happens until it is iterated, and the
        tables are sorted only after it has been fully consumed. The input
        format is detected from the first file only and applied to all files.

        :param filenames: Input filename, or list of input filenames
        :param with_preview: Generate preview during analysis
        :return: Yields ``(position, count)`` pairs, where ``position`` is the
            current offset in the file being read and ``count`` is the value
            produced by ``process_items``
        """
        if not isinstance(filenames, list):
            filenames = [filenames]
        path = self.workdir / filenames[0]
        input_format, is_concatenated, _is_array = detect_format(
            path=path, reader=get_reader(path)
        )
        LOGGER.info(_("Input file is {}").format(input_format))
        self.multiple_values = is_concatenated
        self.parse_schema(input_format, self.schema)
        if self.spec is None:
            self.spec = DataPreprocessor(
                self.schema,
                self.root_tables,
                combined_tables=self.combined_tables,
                language=self.language,
                table_threshold=self.table_threshold,
                multiple_values=self.multiple_values,
                pkg_type=self.pkg_type,
            )
        for filename in filenames:
            path = self.workdir / filename
            reader = get_reader(path)
            with reader(path, "rb") as fd:
                items = iter_file(
                    fd, self.pkg_type, multiple_values=self.multiple_values
                )
                # Forward the caller's choice instead of silently ignoring it.
                for count in self.spec.process_items(
                    items, with_preview=with_preview
                ):
                    yield fd.tell(), count
        self.sort_tables()

    def dump_to_file(self, filename):
        """Save analyzed information to file

        :param filename: Output filename in working directory
        """
        path = self.workdir / filename
        self.spec.dump(path)

    def parse_schema(self, input_format, schema=None):
        """Select the schema and package type matching the input format

        Stores the result in ``self.schema`` and ``self.pkg_type``; the
        ``pkg_type`` given to the constructor is replaced by ``"releases"``
        or ``"records"``.

        :param input_format: Detected input format; if it contains
            ``"release"`` the release package is used, otherwise the record
            package
        :param schema: Schema to use; when falsy, the default schema for
            ``CURRENT_SCHEMA_TAG`` is built instead
        :raises ValueError: If the schema has no (or an empty) ``title``
        """
        if schema:
            schema = resolve_file_uri(schema)
        if "release" in input_format:
            pkg_type = "releases"
            getter = attrgetter("release_package_schema")
        else:
            pkg_type = "records"
            getter = attrgetter("record_package_schema")
        # Prefer the schema URL for the configured language, else English.
        url = DEFAULT_SCHEMA_URL[pkg_type].get(
            self.language[:2], DEFAULT_SCHEMA_URL[pkg_type]["en"]
        )
        if not schema:
            LOGGER.info(
                _("No schema provided, using version {}").format(
                    CURRENT_SCHEMA_TAG))
            profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {},
                                     schema_base_url=url)
            # getter returns the bound schema-building method; call it.
            schema = getter(profile)()
        title = schema.get("title", "").lower()
        if not title:
            raise ValueError(
                _("Incomplete schema, please make sure your data is correct"))
        if "package" in title:
            # TODO: is this a good way to get release/record schema
            # A package schema wraps the item schema; unwrap it.
            schema = jsonref.JsonRef.replace_refs(schema)
            schema = schema["properties"][pkg_type]["items"]

        self.schema = schema
        self.pkg_type = pkg_type

    def sort_tables(self):
        """
        Sort tables according to order of arrays in schema

        Tables whose name prefix (the part before the first ``_``) is present
        in the order returned by ``get_order`` come first, sorted by their
        position in it; all other tables follow in their existing order.
        :return:
        """
        order = get_order(self.spec.schema["properties"].keys())
        self.order = order
        in_schema = {}
        out_of_schema = {}
        for name, table in self.spec.tables.items():
            bucket = in_schema if name.split("_")[0] in order else out_of_schema
            bucket[name] = table

        # sorted() is stable, so tables sharing a prefix keep their order.
        ordered = sorted(
            in_schema.items(),
            key=lambda item: order.index(item[0].split("_")[0]),
        )
        self.spec.tables = {**dict(ordered), **out_of_schema}