Esempio n. 1
0
    def writerow(self, table, row):
        """
        Write a row to the output file.

        :param table: Table name (key into ``self.names`` / ``self.col_index``)
        :param row: Mapping of column path -> value
        """

        table_name = self.names.get(table, table)
        sheet = self.workbook.get_worksheet_by_name(table_name)
        columns = self.col_index[table]
        if not columns:
            LOGGER.error(_("Invalid table {}").format(table))
            return

        for column, value in row.items():
            # stringify booleans so they stay readable in the sheet
            if isinstance(value, bool):
                value = str(value)
            try:
                col_index = columns[column]
            except KeyError:
                # bugfix: message previously read "This a software bug"
                LOGGER.error(
                    _("Operation produced invalid path. This is a software bug, please send issue to developers"
                      ))
                LOGGER.error(
                    _("Failed to write column {} to xlsx sheet {}").format(
                        column, table))
                return
            try:
                sheet.write(self.row_counters[table], col_index, value)
            except XlsxWriterException as err:
                LOGGER.error(
                    _("Failed to write column {} to xlsx sheet {} with error {}"
                      ).format(column, table, err))

        self.row_counters[table] += 1
Esempio n. 2
0
    def parse_schema(self, input_format, schema=None):
        """
        Resolve and store the package schema used for flattening.

        :param input_format: Detected input format name; "release" in it selects
            the release package schema, otherwise the record package schema
        :param schema: Optional schema location resolved via ``resolve_file_uri``;
            when omitted, a schema is built from ``CURRENT_SCHEMA_TAG``
        :raises ValueError: If the resolved schema has no ``title``
        """
        if schema:
            schema = resolve_file_uri(schema)
        if "release" in input_format:
            pkg_type = "releases"
            getter = attrgetter("release_package_schema")
        else:
            pkg_type = "records"
            getter = attrgetter("record_package_schema")
        # pick a language-specific schema URL, falling back to English
        url = DEFAULT_SCHEMA_URL[pkg_type].get(
            self.language[:2], DEFAULT_SCHEMA_URL[pkg_type]["en"])
        if not schema:
            LOGGER.info(
                _("No schema provided, using version {}").format(
                    CURRENT_SCHEMA_TAG))
            profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {},
                                     schema_base_url=url)
            schema = getter(profile)()
        title = schema.get("title", "").lower()
        if not title:
            raise ValueError(
                _("Incomplete schema, please make sure your data is correct"))
        if "package" in title:
            # TODO: is this a good way to get the release/record schema?
            # unwrap the package schema down to the single-item schema
            schema = jsonref.JsonRef.replace_refs(schema)
            schema = schema["properties"][pkg_type]["items"]

        self.schema = schema
        self.pkg_type = pkg_type
Esempio n. 3
0
    def __enter__(self):
        """
        Open one CSV file per table and write the header rows.

        :return: self, so the writer can be used as a context manager
        """

        for name, table in self.tables.items():
            table_name, headers = self.init_sheet(name, table)

            try:
                path = self.workdir / f"{table_name}.csv"
                LOGGER.info(
                    _("Dumping table '{}' to file '{}'").format(
                        table_name, path))
                # newline="" is required by the csv module; without it the
                # writer emits extra blank lines on \r\n platforms
                fd = open(path, "w", newline="")
            except (IOError, OSError) as e:
                LOGGER.error(
                    _("Failed to open file {} with error {}").format(path, e))
                return
            writer = csv.DictWriter(fd, headers)
            self.fds.append(fd)
            self.writers[name] = writer

        for name, writer in self.writers.items():
            headers = self.headers[name]
            try:
                writer.writerow(headers)
            except ValueError as err:
                # bugfix: message previously read "Failed to headers"
                LOGGER.error(
                    _("Failed to write headers with error {}").format(err))
        return self
Esempio n. 4
0
File: csv.py Progetto: lttga/test2
 def writerow(self, table, row):
     """Write a single row to the CSV writer registered for *table*.

     :param table: Table name (key into ``self.writers``)
     :param row: Mapping of column name -> value
     """
     try:
         self.writers[table].writerow(row)
     except ValueError as err:
         # bugfix: message previously read "This a software bug"
         LOGGER.error(
             _("Operation produced invalid path. This is a software bug, please send issue to developers"
               ))
         LOGGER.error(
             _("Failed to write row {} with error {}").format(
                 row.get("rowID"), err))
     except KeyError:
         LOGGER.error(_("Invalid table {}").format(table))
Esempio n. 5
0
    def _init_options(self, tables):
        """
        Apply per-table flattening options (count, unnest, repeat) to *tables*.

        :param tables: Mapping of table name -> table object
        """
        for table in tables.values():

            name = table.name
            count = self.options.count
            options = self.options.selection[name]
            unnest = options.unnest
            split = options.split
            repeat = options.repeat

            if count:
                # add a companion "<array>Count" column for every array path
                for array in table.arrays:
                    parts = array.split("/")
                    parts[-1] = f"{parts[-1]}Count"
                    path = "/".join(parts)
                    target = self._types_cache.get(array) or table
                    combined = split and table.should_split
                    if combined:
                        # add count columns only if table is rolled up
                        # in other way it could be frustrating
                        # e.g. it may generate columns for whole array (/tender/items/200/additionalClassificationsCount)
                        # NOTE(review): inside this branch ``combined`` is always
                        # truthy, so ``combined_only=not combined`` is always
                        # False -- confirm this is intended
                        target.add_column(
                            path,
                            "integer",
                            _(path, self.language),
                            additional=True,
                            combined_only=not combined,
                            propagate=False,
                        )
                        target.inc_column(path, path)
            if unnest:
                # promote the selected combined columns into the split table
                for col_id in unnest:
                    col = table.combined_columns[col_id]
                    table.columns[col_id] = col
            if repeat:
                # copy the selected columns into every child table
                for col_id in repeat:
                    columns = table.columns if split else table.combined_columns
                    title = table.titles.get(col_id)
                    col = columns.get(col_id)
                    if not col:
                        LOGGER.warning(
                            _("Ignoring repeat column {} because it is not in table {}"
                              ).format(col_id, name))
                        continue
                    for c_name in table.child_tables:
                        child_table = self.tables.get(c_name)
                        if child_table:
                            # if false means table isnt rolled up
                            child_table.columns[col_id] = col
                            child_table.combined_columns[col_id] = col
                            child_table.titles[col_id] = title
Esempio n. 6
0
 def __init__(self, workdir, tables, options, filename="result.xlsx"):
     """Create the xlsx workbook writer.

     :param workdir: Directory in which the workbook is created
     :param tables: Table objects to dump
     :param options: Flattening options
     :param filename: Name of the output workbook file
     """
     super().__init__(workdir, tables, options)
     self.col_index = collections.defaultdict(dict)
     path = workdir / filename
     # bugfix: keep the path so __exit__ can log it (self.path was never set)
     self.path = path
     # bugfix: log message previously read "to file to file"
     LOGGER.info(_("Dumping all sheets to file '{}'").format(path))
     # constant_memory keeps memory usage flat while dumping large files
     self.workbook = xlsxwriter.Workbook(path, {"constant_memory": True})
     self.row_counters = {}
Esempio n. 7
0
 def __exit__(self, *args):
     """
     Close the workbook.
     """
     # bugfix: log message previously read "to file to file"
     LOGGER.info(
         _("Dumped all sheets to file '{}'").format(self.path))
     self.workbook.close()
Esempio n. 8
0
 def dump(self, path):
     """Serialize this object to *path* with pickle.

     Errors opening or writing the file are logged, not raised.
     """
     try:
         with open(path, "wb") as out:
             pickle.dump(self, out)
     except OSError as exc:  # IOError is an alias of OSError
         LOGGER.error(
             _("Failed to dump DataPreprocessor to file. Error: {}").format(
                 exc))
Esempio n. 9
0
    def restore(_cls, path):
        """Restore DataPreprocessor from file

        :param path: Full path to file
        :return: The unpickled object, or ``None`` for an invalid pickle
        """
        with open(path, "rb") as fd:
            try:
                return pickle.load(fd)
            except (TypeError, pickle.UnpicklingError):
                LOGGER.error(_("Invalid pickle file. Can't restore."))
Esempio n. 10
0
 def add_joinable_column(self, abs_pointer, pointer):
     """Register *pointer* on the current table as a joinable column.

     :param abs_pointer: Absolute JSON path as found in the data
     :param pointer: Schema-relative column path
     """
     table = self.current_table
     LOGGER.debug(
         _("Detected additional column: %s in %s table") %
         (abs_pointer, table.name))
     table.types[pointer] = JOINABLE
     table.add_column(
         pointer,
         JOINABLE,
         pointer,
         additional=True,
         abs_path=abs_pointer,
         header=pointer,
     )
Esempio n. 11
0
    def add_column(self,
                   path,
                   item_type,
                   title,
                   *,
                   propagated=False,
                   additional=False,
                   abs_path=None,
                   header=None):
        """
        Add a new column to the table.

        :param path: The column's path
        :param item_type: The column's expected type
        :param title: Column title
        :param propagated: Column is being added to a parent table
        :param additional: Mark this column as missing in schema
        :param abs_path: The column's full JSON path
        :param header: Header value stored on the column and in ``titles``
        """
        # avoid the shared-mutable-default pitfall; [] preserves old behavior
        if header is None:
            header = []
        combined_path = combine_path(self, path)
        col = Column(path, combined_path, title, item_type, header=header)
        array = self.is_array(path)
        if additional:
            if array:
                # when we analyzing file we need to keep index from data not to use 0
                # e.g. /tender/items/166/relatedLot
                combined_path = abs_path
                col = replace(col, path=combined_path)
            LOGGER.debug(
                _("Detected additional column: %s in %s table") %
                (path, self.name))
            self.additional_columns[combined_path] = col

        if not propagated:
            self.columns[combined_path] = col
        self.combined_columns[combined_path] = col

        if propagated:
            self.array_columns[combined_path] = col
            self.array_positions[array] = combined_path
        if not self.is_root:
            self.parent.add_column(path,
                                   item_type,
                                   title,
                                   propagated=True,
                                   header=header)
        for p in (path, combined_path):
            # bugfix: previously checked ``path`` instead of ``p``, so
            # ``combined_path`` never received a title once ``path`` was set
            if p not in self.titles:
                self.titles[p] = header
        if path not in self.types:
            self.types[path] = item_type
Esempio n. 12
0
    def dump(self, path):
        """
        Dump the data processor's state to a file.

        :param path: Full path to file
        """
        try:
            with open(path, "wb") as out_fd:
                pickle.dump(self, out_fd)
        except OSError as exc:  # IOError is an alias of OSError
            LOGGER.error(
                _("Failed to dump DataPreprocessor to file. Error: {}").format(
                    exc))
Esempio n. 13
0
 def init_repeat(self, table, options):
     """Copy every column named in ``options.repeat`` into all child tables.

     Columns not present in *table* are skipped with a warning.
     """
     source = table.columns if options.split else table.combined_columns
     for col_id in options.repeat:
         column = source.get(col_id)
         if not column:
             LOGGER.warning(
                 _("Ignoring repeat column {} because it is not in table {}"
                   ).format(col_id, table.name))
             continue
         for child_name in table.child_tables:
             child = self.tables.get(child_name)
             if not child:
                 continue
             child.columns[col_id] = column
             child.combined_columns[col_id] = column
             child.titles[col_id] = col_id
Esempio n. 14
0
    def __enter__(self):
        """Create one worksheet per table and write its header row.

        :return: self, so the writer can be used as a context manager
        """
        for name, table in self.tables.items():
            table_name, headers = self.init_sheet(name, table)
            sheet = self.workbook.add_worksheet(table_name)

            for position, (col_name, header_value) in enumerate(headers.items()):
                self.col_index[name][col_name] = position
                try:
                    sheet.write(0, position, header_value)
                except XlsxWriterException as err:
                    LOGGER.error(
                        _("Failed to write header {} to xlsx sheet {} with error {}"
                          ).format(col_name, name, err))
            # data rows start right below the header row
            self.row_counters[name] = 1
        return self
Esempio n. 15
0
File: spec.py Progetto: lttga/test2
    def add_column(
        self,
        path,
        item_type,
        title,
        *,
        combined_only=False,
        propagate=True,
        additional=False,
        abs_path=None,
    ):
        """Add new column to the table

        :param path: Column path
        :param item_type: Column expected type
        :param title: Column title
        :param combined_only: Make this column available only in combined version of table
        :param propagate: Also register the column on the parent table
        :param additional: Mark this column as missing in schema
        :param abs_path: The column's full JSON path, used for additional array columns
        """
        is_array = self.is_array(path)
        combined_path = combine_path(self, path)
        if not combined_only:
            self.columns[combined_path] = Column(title, item_type, combined_path)
        # new column to track hits differently
        self.combined_columns[combined_path] = Column(title, item_type, combined_path)

        if additional:
            if is_array:
                # when we analyzing file we need to keep index from data not to use 0
                # e.g. /tender/items/166/relatedLot
                combined_path = abs_path
            LOGGER.debug(_("Detected additional column: %s in %s table") % (path, self.name))
            self.additional_columns[combined_path] = Column(title, item_type, combined_path)

        for p in (path, combined_path):
            self.titles[p] = title
        if not self.is_root and propagate:
            # keep the parent table's view of columns in sync
            self.parent.add_column(
                path,
                item_type,
                title,
                combined_only=combined_only,
                additional=additional,
                abs_path=abs_path,
            )
Esempio n. 16
0
File: spec.py Progetto: lttga/test2
 def __post_init__(self):
     """Normalize column mappings after (de)serialization.

     Rebuilds plain-dict entries into ``Column`` dataclasses and makes sure
     the default fields are always present with their localized titles.
     """
     for attr in (
         "columns",
         "combined_columns",
         "additional_columns",
     ):
         obj = getattr(self, attr, {})
         if obj:
             rebuilt = OrderedDict()
             for name, col in obj.items():
                 if not is_dataclass(col):
                     col = Column(**col)
                 rebuilt[name] = col
             setattr(self, attr, rebuilt)
     # bugfix: this default-fields pass was nested inside the loop above and
     # ran three times; once is enough and the end state is identical
     cols = DEFAULT_FIELDS if self.is_root else DEFAULT_FIELDS_COMBINED
     for col in cols:
         if col not in self.columns:
             self.columns[col] = Column(col, "string", col)
         if col not in self.combined_columns:
             self.combined_columns[col] = Column(col, "string", col)
         self.titles[col] = _(col)
Esempio n. 17
0
 def add_additional_table(self, pointer, abs_pointer, parent_key, key,
                          item):
     """Register a child table found in the data but absent from the schema.

     :param pointer: Schema-relative path of the new array
     :param abs_pointer: Absolute JSON path as found in the data
     :param parent_key: Key of the enclosing object
     :param key: Key of the array itself
     :param item: The array's items, used to pre-create columns
     """
     LOGGER.debug(_("Detected additional table: %s") % pointer)
     self.current_table.types[pointer] = ["array"]
     child = add_child_table(self.current_table, pointer, parent_key, key)
     self._add_table(child, pointer)
     # add columns beforehand because it might be required
     # to recalculate and reorder headers when enlarging the array
     # there must be a better way but it should work for now
     for extended_item in item:
         for path_, value in flatten(extended_item, reducer="path").items():
             column_pointer = self.join_path(pointer, path_)
             if column_pointer in self.current_table:
                 continue
             self.current_table.add_column(
                 column_pointer,
                 self.guess_type(value),
                 column_pointer,
                 abs_path=self.join_path(abs_pointer, path_),
                 header=column_pointer,
             )
Esempio n. 18
0
 def analyze_file(self, filenames, with_preview=True):
     """Analyze provided file(s), yielding progress while processing.

     :param filenames: Input filename or list of filenames
     :param with_preview: Generate preview during analysis
     :yield: ``(bytes_read, object_count)`` tuples as items are processed
     """
     if not isinstance(filenames, list):
         filenames = [filenames]
     path = self.workdir / filenames[0]
     # format is detected from the first file only; presumably all files
     # share the same format -- TODO confirm
     (
         input_format,
         _is_concatenated,
         _is_array,
     ) = detect_format(path=path, reader=get_reader(path))
     LOGGER.info(_("Input file is {}").format(input_format))
     self.multiple_values = _is_concatenated
     self.parse_schema(input_format, self.schema)
     if self.spec is None:
         self.spec = DataPreprocessor(
             self.schema,
             self.root_tables,
             combined_tables=self.combined_tables,
             language=self.language,
             table_threshold=self.table_threshold,
             multiple_values=self.multiple_values,
             pkg_type=self.pkg_type,
         )
     for filename in filenames:
         path = self.workdir / filename
         reader = get_reader(path)
         with reader(path, "rb") as fd:
             items = iter_file(fd,
                               self.pkg_type,
                               multiple_values=self.multiple_values)
             for count in self.spec.process_items(items):
                 # fd.tell() gives byte progress for progress bars
                 yield fd.tell(), count
     self.sort_tables()
Esempio n. 19
0
    def parse_schema(self):
        """Extract all available information from schema"""
        if isinstance(self.schema, (str, Path)):
            self.schema = resolve_file_uri(self.schema)
        # inline all $ref pointers so the schema can be walked directly
        self.schema = jsonref.JsonRef.replace_refs(self.schema)
        self.init_tables(self.root_tables)
        if self.combined_tables:
            self.init_tables(self.combined_tables, is_combined=True)
        separator = self.header_separator
        # depth-first walk over (path, parent_key, parent, schema fragment)
        to_analyze = deque([("", "", {}, self.schema)])

        # TODO: check if recursion is better for field ordering
        while to_analyze:
            path, parent_key, parent, prop = to_analyze.pop()
            if prop.get("deprecated"):
                continue
            # TODO: handle oneOf anyOf allOf
            properties = prop.get("properties", {})
            if properties:
                for key, item in properties.items():
                    if item.get("deprecated"):
                        continue
                    # jsonref proxies expose the original $ref metadata here
                    if hasattr(item, "__reference__"
                               ) and item.__reference__.get("deprecated"):
                        continue

                    typeset = extract_type(item)
                    pointer = separator.join([path, key])
                    self.current_table = self.get_table(pointer)
                    if not self.current_table:
                        continue

                    self.current_table.types[pointer] = typeset
                    if "object" in typeset:
                        to_analyze.append((pointer, key, properties, item))
                    elif "array" in typeset:
                        items = item["items"]
                        items_type = extract_type(items)
                        if set(items_type) & {"array", "object"}:
                            if pointer not in self.current_table.path:
                                # found child array, need to create child table
                                key = self.name_check(parent_key, key)
                                self._add_table(
                                    add_child_table(self.current_table,
                                                    pointer, parent_key, key),
                                    pointer)
                            to_analyze.append(
                                (pointer, key, properties, items))
                        else:
                            # array of scalars, so this becomes a single joinable column
                            typeset = ARRAY.format(items_type)
                            self.current_table.types[pointer] = JOINABLE
                            self.current_table.add_column(
                                pointer, typeset, _(pointer, self.language))
                    else:
                        if self.current_table.is_combined:
                            pointer = separator + separator.join(
                                (parent_key, key))
                        self.current_table.add_column(
                            pointer, typeset, _(pointer, self.language))
            else:
                # TODO: not sure what to do here
                continue
Esempio n. 20
0
File: cli.py Progetto: lttga/test2
import click
import click_logging
from ocdsextensionregistry import ProfileBuilder
from ocdskit.util import detect_format

from spoonbill import FileAnalyzer, FileFlattener
from spoonbill.common import COMBINED_TABLES, ROOT_TABLES, TABLE_THRESHOLD
from spoonbill.flatten import FlattenOptions
from spoonbill.i18n import LOCALE, _
from spoonbill.utils import read_lines, resolve_file_uri

# module-level logger wired into click's logging integration
LOGGER = logging.getLogger("spoonbill")
click_logging.basic_config(LOGGER)

# OCDS schema tag used when the user supplies no schema
CURRENT_SCHEMA_TAG = "1__1__5"
# progress-bar label templates
ANALYZED_LABEL = _("  Processed {} objects")
FLATTENED_LABEL = _("  Flattened {} objects")


class CommaSeparated(click.ParamType):
    """Click option type to convert comma separated string into list"""

    name = "comma"

    def convert(self, value, param, ctx):  # noqa
        """Return *value* split on commas, each element lower-cased."""
        if not value:
            return []
        pieces = value.split(",")
        return [piece.lower() for piece in pieces]


def read_option_file(option, option_file):
Esempio n. 21
0
File: cli.py Progetto: lttga/test2
def cli(
    filename,
    schema,
    selection,
    split,
    threshold,
    state_file,
    xlsx,
    csv,
    combine,
    unnest,
    unnest_file,
    only,
    only_file,
    repeat,
    repeat_file,
    count,
    human,
    language,
):
    """Spoonbill cli entry point"""
    click.echo(_("Detecting input file format"))
    # TODO: handle line separated json
    # TODO: handle single release/record
    (
        input_format,
        _is_concatenated,
        _is_array,
    ) = detect_format(filename)
    # validate the requested output locations before doing any work
    if csv:
        csv = pathlib.Path(csv).resolve()
        if not csv.exists():
            raise click.BadParameter(
                _("Desired location {} does not exists").format(csv))
    if xlsx:
        xlsx = pathlib.Path(xlsx).resolve()
        if not xlsx.parent.exists():
            raise click.BadParameter(
                _("Desired location {} does not exists").format(xlsx.parent))
    click.echo(
        _("Input file is {}").format(click.style(input_format, fg="green")))
    is_package = "package" in input_format
    combine_choice = combine if combine else ""
    if not is_package:
        # TODO: fix this
        click.echo("Single releases are not supported by now")
        return
    if schema:
        schema = resolve_file_uri(schema)
    # pick the release or record package schema, building the default
    # profile when the user supplied no schema
    if "release" in input_format:
        root_key = "releases"
        if not schema:
            click.echo(
                _("No schema provided, using version {}").format(
                    click.style(CURRENT_SCHEMA_TAG, fg="cyan")))
            profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {})
            schema = profile.release_package_schema()
    else:
        root_key = "records"
        if not schema:
            click.echo(
                _("No schema provided, using version {}").format(
                    click.style(CURRENT_SCHEMA_TAG, fg="cyan")))
            profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {})
            schema = profile.record_package_schema()
    title = schema.get("title", "").lower()
    if not title:
        raise ValueError(
            _("Incomplete schema, please make sure your data is correct"))
    if "package" in title:
        # TODO: is this a good way to get the release/record schema?
        schema = schema["properties"][root_key]["items"]

    path = pathlib.Path(filename)
    workdir = path.parent
    filename = path.name
    selection = selection or ROOT_TABLES.keys()
    combine = combine or COMBINED_TABLES.keys()
    root_tables = get_selected_tables(ROOT_TABLES, selection)
    combined_tables = get_selected_tables(COMBINED_TABLES, combine)

    if state_file:
        click.secho(_("Restoring from provided state file"), bold=True)
        analyzer = FileAnalyzer(workdir, state_file=state_file)
    else:
        click.secho(
            _("State file not supplied, going to analyze input file first"),
            bold=True)
        analyzer = FileAnalyzer(
            workdir,
            schema=schema,
            root_key=root_key,
            root_tables=root_tables,
            combined_tables=combined_tables,
            language=language,
            table_threshold=threshold,
        )
        click.echo(_("Analyze options:"))
        click.echo(
            _(" - table threshold => {}").format(
                click.style(str(threshold), fg="cyan")))
        click.echo(
            _(" - language        => {}").format(
                click.style(language, fg="cyan")))
        click.echo(
            _("Processing file: {}").format(click.style(str(path), fg="cyan")))
        total = path.stat().st_size
        progress = 0
        # Progress bar not showing with small files
        # https://github.com/pallets/click/pull/1296/files
        with click.progressbar(width=0,
                               show_percent=True,
                               show_pos=True,
                               length=total) as bar:
            for read, number in analyzer.analyze_file(filename,
                                                      with_preview=True):
                bar.label = ANALYZED_LABEL.format(
                    click.style(str(number), fg="cyan"))
                bar.update(read - progress)
                progress = read
        click.secho(_("Done processing. Analyzed objects: {}").format(
            click.style(str(number + 1), fg="red")),
                    fg="green")
        # NOTE(review): this f-string has no placeholders; it looks like it
        # should be f"{filename}.state" -- confirm against the original source
        state_file = pathlib.Path(f"(unknown).state")
        state_file_path = workdir / state_file
        click.echo(
            _("Dumping analyzed data to '{}'").format(
                click.style(str(state_file_path.absolute()), fg="cyan")))
        analyzer.dump_to_file(state_file)

    click.echo(
        _("Flattening file: {}").format(click.style(str(path), fg="cyan")))

    if unnest and unnest_file:
        raise click.UsageError(
            _("Conflicting options: unnest and unnest-file"))
    if repeat and repeat_file:
        raise click.UsageError(
            _("Conflicting options: repeat and repeat-file"))
    if only and only_file:
        raise click.UsageError(_("Conflicting options: only and only-file"))

    options = {"selection": {}, "count": count}
    unnest = read_option_file(unnest, unnest_file)
    repeat = read_option_file(repeat, repeat_file)
    only = read_option_file(only, only_file)

    for name in selection:
        table = analyzer.spec[name]
        if table.total_rows == 0:
            click.echo(
                _("Ignoring empty table {}").format(click.style(name,
                                                                fg="red")))
            continue

        # NOTE(review): reassigning ``unnest``/``only``/``repeat`` inside this
        # loop narrows the lists for every subsequent table in ``selection``;
        # per-table variables are probably intended -- confirm
        unnest = [col for col in unnest if col in table.combined_columns]
        if unnest:
            click.echo(
                _("Unnesting columns {} for table {}").format(
                    click.style(",".join(unnest), fg="cyan"),
                    click.style(name, fg="cyan")))

        only = [col for col in only if col in table]
        if only:
            click.echo(
                _("Using only columns {} for table {}").format(
                    click.style(",".join(only), fg="cyan"),
                    click.style(name, fg="cyan")))

        repeat = [col for col in repeat if col in table]
        if repeat:
            click.echo(
                _("Repeating columns {} in all child table of {}").format(
                    click.style(",".join(repeat), fg="cyan"),
                    click.style(name, fg="cyan")))

        options["selection"][name] = {
            "split": split or analyzer.spec[name].should_split,
            "pretty_headers": human,
            "unnest": unnest,
            "only": only,
            "repeat": repeat,
        }
    options = FlattenOptions(**options)
    flattener = FileFlattener(
        workdir,
        options,
        analyzer.spec.tables,
        root_key=root_key,
        csv=csv,
        xlsx=xlsx,
        language=language,
    )

    all_tables = chain([table for table in flattener.flattener.tables.keys()],
                       combine_choice)

    click.echo(
        _("Going to export tables: {}").format(
            click.style(",".join(all_tables), fg="magenta")))

    click.echo(_("Processed tables:"))
    for table in flattener.flattener.tables.keys():
        message = _("{}: {} rows").format(
            table, flattener.flattener.tables[table].total_rows)
        if not flattener.flattener.tables[table].is_root:
            # child tables are drawn indented below their parent
            message = "└-----" + message
            click.echo(message)
        else:
            click.echo(message)
    click.echo(_("Flattening input file"))
    with click.progressbar(
            flattener.flatten_file(filename),
            length=analyzer.spec.total_items + 1,
            width=0,
            show_percent=True,
            show_pos=True,
    ) as bar:
        for count in bar:
            bar.label = FLATTENED_LABEL.format(
                click.style(str(count + 1), fg="cyan"))

    click.secho(_("Done flattening. Flattened objects: {}").format(
        click.style(str(count + 1), fg="red")),
                fg="green")
Esempio n. 22
0
    def process_items(self, releases, with_preview=True):
        """Analyze releases

        Iterate over every item in provided list to
        calculate metrics and optionally generate preview for combined and split version of the table

        :param releases: Iterator of items to analyze
        :param with_preview: If set to True generates previews for each table
        """
        separator = self.header_separator
        for count, release in enumerate(releases):
            to_analyze = deque([("", "", "", {}, release)])
            ocid = release["ocid"]
            top_level_id = release["id"]

            while to_analyze:
                abs_path, path, parent_key, parent, record = to_analyze.pop()
                for key, item in record.items():
                    pointer = separator.join([path, key])
                    self.current_table = self.get_table(pointer)
                    if not self.current_table:
                        continue
                    item_type = self.current_table.types.get(pointer)
                    if pointer in self.current_table.path:
                        # strict match like /parties, /tender
                        row_id = generate_row_id(ocid, record.get("id", ""),
                                                 parent_key, top_level_id)
                        c = item if isinstance(item, list) else [item]
                        for _nop in c:
                            self.current_table.inc()
                            if with_preview and count < PREVIEW_ROWS:
                                parent_table = not self.current_table.is_root and parent_key
                                self.add_preview_row(ocid, record.get("id"),
                                                     row_id, parent.get("id"),
                                                     parent_table)

                    # TODO: this validation should probably be smarter with arrays
                    if item_type and item_type != JOINABLE and not validate_type(
                            item_type, item):
                        LOGGER.error("Mismatched type on %s expected %s" %
                                     (pointer, item_type))
                        continue

                    if isinstance(item, dict):
                        to_analyze.append((
                            separator.join([abs_path, key]),
                            pointer,
                            key,
                            record,
                            item,
                        ))
                    elif item and isinstance(item, list):
                        abs_pointer = separator.join([abs_path, key])
                        if not isinstance(item[0], dict) and not item_type:
                            LOGGER.debug(
                                _("Detected additional column: %s in %s table")
                                % (abs_pointer, root.name))
                            item_type = JOINABLE
                            self.current_table.add_column(
                                pointer,
                                JOINABLE,
                                _(pointer, self.language),
                                additional=True,
                                abs_path=abs_pointer,
                            )
                        if item_type == JOINABLE:
                            self.current_table.inc_column(abs_pointer, pointer)
                            if with_preview and count < PREVIEW_ROWS:
                                value = JOINABLE_SEPARATOR.join(item)
                                self.current_table.set_preview_path(
                                    abs_pointer, pointer, value,
                                    self.table_threshold)
                        elif self.current_table.is_root or self.current_table.is_combined:
                            for value in item:
                                to_analyze.append((
                                    abs_pointer,
                                    pointer,
                                    key,
                                    record,
                                    value,
                                ))
                        else:
                            parent_table = self.current_table.parent
                            if pointer not in parent_table.arrays:
                                LOGGER.debug(
                                    _("Detected additional table: %s") %
                                    pointer)
                                self.current_table.types[pointer] = ["array"]
                                parent_table = self.current_table
                                # TODO: do we need to mark this table as additional
                                self._add_table(
                                    add_child_table(self.current_table,
                                                    pointer, parent_key, key),
                                    pointer)
                                self.add_preview_row(ocid, record.get("id"),
                                                     row_id, parent.get("id"),
                                                     parent_table)

                            if parent_table.set_array(pointer, item):
                                should_split = len(
                                    item) >= self.table_threshold
                                if should_split:
                                    parent_table.should_split = True
                                    self.current_table.roll_up = True
                                recalculate_headers(parent_table, pointer,
                                                    abs_path, key, item,
                                                    should_split, separator)

                            for i, value in enumerate(item):
                                if isinstance(value, dict):
                                    abs_pointer = separator.join(
                                        [abs_path, key, str(i)])
                                    to_analyze.append((
                                        abs_pointer,
                                        pointer,
                                        parent_key,
                                        record,
                                        value,
                                    ))
                    else:
                        root = get_root(self.current_table)
                        abs_pointer = separator.join((abs_path, key))
                        if self.current_table.is_combined:
                            LOGGER.debug(
                                _("Path %s is targeted to combined table %s") %
                                (pointer, self.current_table.name))
                            pointer = separator + separator.join(
                                (parent_key, key))
                            abs_pointer = pointer
                        if abs_pointer not in root.combined_columns:
                            self.current_table.add_column(
                                pointer,
                                PYTHON_TO_JSON_TYPE.get(
                                    type(item).__name__, "N/A"),
                                _(pointer, self.language),
                                additional=True,
                                abs_path=abs_pointer,
                            )
                        self.current_table.inc_column(abs_pointer, pointer)
                        if item and with_preview and count < PREVIEW_ROWS:
                            self.current_table.set_preview_path(
                                abs_pointer, pointer, item,
                                self.table_threshold)
            yield count
        self.total_items = count
Esempio n. 23
0
def get_selected_tables(base, selection):
    """Return the subset of *base* tables whose names appear in *selection*.

    Key order of *base* is preserved in the result.

    :param base: Mapping of table name -> table definition
    :param selection: Iterable of table names requested by the user
    :raises click.BadParameter: on the first selected name absent from *base*
    """
    for name in selection:
        if name not in base:
            msg = _("Wrong selection, table '{}' does not exist").format(name)
            raise click.BadParameter(msg)
    # Build the membership set once so filtering is O(len(base) + len(selection))
    # instead of a linear scan of `selection` per table.
    selected = set(selection)
    return {name: tab for name, tab in base.items() if name in selected}
Esempio n. 24
0
def cli(
    filename,
    schema,
    selection,
    threshold,
    state_file,
    xlsx,
    csv,
    combine,
    exclude,
    unnest,
    unnest_file,
    only,
    only_file,
    repeat,
    repeat_file,
    count,
    human,
    language,
):
    """Spoonbill cli entry point.

    Analyzes the input OCDS file (or restores a previous analysis from
    *state_file*), then flattens it into CSV and/or XLSX tables according to
    the selection/unnest/only/repeat options.
    """
    # Validate output destinations up front so we fail before any heavy work.
    if csv:
        csv = pathlib.Path(csv).resolve()
        if not csv.exists():
            raise click.BadParameter(
                _("Desired location {} does not exists").format(csv))
    if xlsx:
        xlsx = pathlib.Path(xlsx).resolve()
        if not xlsx.parent.exists():
            raise click.BadParameter(
                _("Desired location {} does not exists").format(xlsx.parent))

    path = pathlib.Path(filename)
    workdir = path.parent
    filename = path.name
    # Empty selections fall back to "all known tables".
    selection = selection or ROOT_TABLES.keys()
    combine = combine or COMBINED_TABLES.keys()
    root_tables = get_selected_tables(ROOT_TABLES, selection)
    combined_tables = get_selected_tables(COMBINED_TABLES, combine)

    if state_file:
        click.secho(_("Restoring from provided state file"), bold=True)
        analyzer = FileAnalyzer(workdir, state_file=state_file)
    else:
        click.secho(
            _("State file not supplied, going to analyze input file first"),
            bold=True)
        analyzer = FileAnalyzer(
            workdir,
            schema=schema,
            root_tables=root_tables,
            combined_tables=combined_tables,
            language=language,
            table_threshold=threshold,
        )
        click.echo(_("Analyze options:"))
        for name, option in ("threshold", str(threshold)), ("language",
                                                            language):
            click.echo(
                _(" - {:30} => {}").format(name, click.style(option,
                                                             fg="cyan")))
        click.echo(
            _("Processing file: {}").format(click.style(str(path), fg="cyan")))
        total = path.stat().st_size
        progress = 0
        # Guard against an input that yields no objects at all: `number` would
        # otherwise be unbound below and raise NameError.
        number = -1
        # Progress bar not showing with small files
        # https://github.com/pallets/click/pull/1296/files
        with click.progressbar(width=0,
                               show_percent=True,
                               show_pos=True,
                               length=total) as bar:
            for read, number in analyzer.analyze_file(filename,
                                                      with_preview=False):
                bar.label = ANALYZED_LABEL.format(
                    click.style(str(number), fg="cyan"))
                bar.update(read - progress)
                progress = read
        click.secho(_("Done processing. Analyzed objects: {}").format(
            click.style(str(number + 1), fg="red")),
                    fg="green")
        if isinstance(filename, list):
            state_file = pathlib.Path(f"{filename[0]}.state")
        else:
            # Bug fix: the f-string previously had no placeholder, so every
            # run wrote a literal "(unknown).state" file instead of deriving
            # the state-file name from the input file.
            state_file = pathlib.Path(f"{filename}.state")
        state_file_path = workdir / state_file
        click.echo(
            _("Dumping analyzed data to '{}'").format(
                click.style(str(state_file_path.absolute()), fg="cyan")))
        analyzer.dump_to_file(state_file)

    click.echo(
        _("Flattening file: {}").format(click.style(str(path), fg="cyan")))

    # Mutually exclusive option pairs: inline value vs. value read from file.
    if unnest and unnest_file:
        raise click.UsageError(
            _("Conflicting options: unnest and unnest-file"))
    if repeat and repeat_file:
        raise click.UsageError(
            _("Conflicting options: repeat and repeat-file"))
    if only and only_file:
        raise click.UsageError(_("Conflicting options: only and only-file"))
    if exclude:
        click.echo(
            _("Ignoring tables (excluded by user): {}").format(
                click.style(",".join(exclude), fg="red")))

    options = {"selection": {}, "count": count, "exclude": exclude}
    unnest = read_option_file(unnest, unnest_file)
    repeat = read_option_file(repeat, repeat_file)
    only = read_option_file(only, only_file)

    for name in list(selection) + list(combine):
        table = analyzer.spec[name]
        if table.total_rows == 0:
            click.echo(
                _("Ignoring empty table {}").format(click.style(name,
                                                                fg="red")))
            continue
        options["selection"][name] = {
            "split": analyzer.spec[name].splitted,
            "pretty_headers": human,
        }
        # Per-column options apply only to real (non-combined) tables.
        if not analyzer.spec[name].is_combined:
            unnest_in_table = [
                col for col in unnest if col in table.combined_columns
            ]
            if unnest_in_table:
                click.echo(
                    _("Unnesting columns {} for table {}").format(
                        click.style(",".join(unnest_in_table), fg="cyan"),
                        click.style(name, fg="cyan")))

            only_in_table = [col for col in only if col in table]
            if only_in_table:
                click.echo(
                    _("Using only columns {} for table {}").format(
                        click.style(",".join(only_in_table), fg="cyan"),
                        click.style(name, fg="cyan")))

            repeat_in_table = [col for col in repeat if col in table]
            if repeat_in_table:
                click.echo(
                    _("Repeating columns {} in all child table of {}").format(
                        click.style(",".join(repeat_in_table), fg="cyan"),
                        click.style(name, fg="cyan")))
            options["selection"][name]["only"] = only_in_table
            options["selection"][name]["repeat"] = repeat_in_table
            options["selection"][name]["unnest"] = unnest_in_table

    options = FlattenOptions(**options)
    flattener = FileFlattener(
        workdir,
        options,
        analyzer,
        csv=csv,
        xlsx=xlsx,
        language=language,
    )

    click.echo(
        _("Going to export tables: {}").format(
            click.style(",".join(flattener.flattener.tables.keys()),
                        fg="magenta")))
    click.echo(_("Processed tables:"))
    for table_name, table in flattener.flattener.tables.items():
        msg = _(" - {:30} => {} rows") if table.is_root else _(
            " ---- {:27} => {} rows")
        message = msg.format(table_name,
                             click.style(str(table.total_rows), fg="cyan"))
        click.echo(message)
    click.echo(_("Flattening input file"))
    with click.progressbar(
            flattener.flatten_file(filename),
            length=analyzer.spec.total_items + 1,
            width=0,
            show_percent=True,
            show_pos=True,
    ) as bar:
        # NOTE: this loop variable shadows the `count` CLI flag, which was
        # already consumed into `options` above, so the shadowing is harmless.
        for count in bar:
            bar.label = FLATTENED_LABEL.format(
                click.style(str(count + 1), fg="cyan"))

    click.secho(_("Done flattening. Flattened objects: {}").format(
        click.style(str(count + 1), fg="red")),
                fg="green")
Esempio n. 25
0
import logging
import pathlib

import click
import click_logging

from spoonbill import FileAnalyzer, FileFlattener
from spoonbill.common import COMBINED_TABLES, ROOT_TABLES, TABLE_THRESHOLD
from spoonbill.flatten import FlattenOptions
from spoonbill.i18n import LOCALE, _
from spoonbill.utils import read_lines

# Module-wide logger, wired into click's output handling so CLI logging
# options control its verbosity.
LOGGER = logging.getLogger("spoonbill")
click_logging.basic_config(LOGGER)

# Progress-bar label templates, passed through the project's i18n `_` helper.
ANALYZED_LABEL = _("  Processed {} objects")
FLATTENED_LABEL = _("  Flattened {} objects")
# Make click respond to both -h and --help.
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


class CommaSeparated(click.ParamType):
    """Click option type to convert comma separated string into list"""

    name = "comma"

    def convert(self, value, param, ctx):  # noqa
        """Split *value* on commas; empty/None input yields an empty list."""
        if not value:
            return []
        # str.split already returns a list -- no need to re-wrap it
        # in a comprehension.
        return value.split(",")

Esempio n. 26
0
    def process_items(self, releases, with_preview=True):
        """
        Analyze releases.

        Iterates over every release to calculate metrics and optionally generates previews for combined and split
        versions of each table.

        Each release is walked breadth-first via a deque of
        ``(abs_path, path, parent_key, parent, record)`` tuples. Every JSON
        pointer is routed to its table, rows and columns are counted, and
        additional columns/child tables are registered as they are discovered.
        Yields the zero-based index of each processed release; after the loop
        the last index is stored in ``self.total_items``.

        :param releases: The releases to analyze
        :param with_preview: Whether to generate previews for each table.
            NOTE(review): the body consults ``self.with_preview`` rather than
            this parameter -- confirm the parameter is actually honored.
        """
        for count, release in enumerate(releases):

            # Seed the traversal with the release root; tuple layout is
            # (absolute path, schema path, parent key, parent object, node).
            to_analyze = deque([("", "", "", {}, release)])
            rows = Rows(ocid=release["ocid"],
                        buyer=release.get("buyer", {}),
                        data=defaultdict(list))
            while to_analyze:
                abs_path, path, parent_key, parent, record = to_analyze.popleft(
                )
                # Scalars queued from base-table arrays have no .items();
                # only mapping-like nodes are expanded.
                if hasattr(record, "items"):
                    for key, item in record.items():
                        pointer = self.join_path(path, key)

                        self.current_table = self.get_table(pointer)
                        if not self.current_table:
                            # Pointer is not mapped to any selected table.
                            continue

                        if self.is_new_row(pointer):
                            self.inc_table_rows(item, rows, parent_key, record)

                        self.extend_table_types(pointer, item)
                        item_type = self.current_table.types.get(pointer)

                        # Value whose type contradicts the recorded type is
                        # skipped entirely.
                        if not self.is_type_matched(pointer, item, item_type):
                            continue

                        if isinstance(item, dict):
                            # Nested object: queue it for traversal.
                            to_analyze.append((
                                self.join_path(abs_path, key),
                                pointer,
                                key,
                                record,
                                item,
                            ))
                        elif item and isinstance(item, list):
                            abs_pointer = self.join_path(abs_path, key)

                            # An array of scalars with no declared type becomes
                            # a joinable (separator-concatenated) column.
                            if not isinstance(item[0], dict) and not item_type:
                                item_type = JOINABLE
                                self.add_joinable_column(abs_pointer, pointer)

                            if item_type == JOINABLE:
                                if pointer not in self.current_table:
                                    self.add_joinable_column(
                                        abs_pointer, pointer)
                                self.current_table.inc_column(
                                    abs_pointer, pointer)
                                # Previews only for the first PREVIEW_ROWS
                                # releases; uses self.with_preview (see
                                # docstring note).
                                if self.with_preview and count < PREVIEW_ROWS:
                                    value = JOINABLE_SEPARATOR.join(
                                        [str(i) for i in item])
                                    self.current_table.set_preview_path(
                                        abs_pointer, pointer, value,
                                        self.table_threshold)
                            elif self.is_base_table():
                                # Base tables traverse array members in place
                                # instead of spawning a child table.
                                for value in item:
                                    to_analyze.append((
                                        abs_pointer,
                                        pointer,
                                        key,
                                        record,
                                        value,
                                    ))
                            else:
                                parent_table = self.current_table.parent
                                if pointer not in parent_table.arrays:
                                    # Array not declared in the schema:
                                    # register it as an additional child table.
                                    LOGGER.debug(
                                        _("Detected additional table: %s") %
                                        pointer)
                                    self.current_table.types[pointer] = [
                                        "array"
                                    ]
                                    parent_table = self.current_table
                                    self.add_additional_table(
                                        pointer, abs_pointer, parent_key, key,
                                        item)
                                    self.add_preview_row(
                                        rows, record.get("id", ""), parent_key)

                                # set_array presumably reports a change in the
                                # stored array state -- TODO confirm semantics.
                                if parent_table.set_array(pointer, item):
                                    self.handle_array_expanded(
                                        pointer, item, abs_path, key)

                                # Only dict members are queued; index becomes
                                # part of the absolute pointer.
                                for i, value in enumerate(item):
                                    if isinstance(value, dict):
                                        abs_pointer = self.join_path(
                                            abs_path, key, str(i))
                                        to_analyze.append((
                                            abs_pointer,
                                            pointer,
                                            parent_key,
                                            record,
                                            value,
                                        ))
                        else:
                            # Scalar value: ensure a column exists, then count.
                            abs_pointer = self.join_path(abs_path, key)
                            if self.current_table.is_combined:
                                pointer, abs_pointer = self.get_paths_for_combined_table(
                                    parent_key, key)
                            col = self.current_table.columns.get(pointer)
                            if col:
                                if abs_pointer not in self.current_table:
                                    # NOTE(review): rebinding `parent` here
                                    # shadows the tuple element unpacked above;
                                    # it is not used again afterwards.
                                    parent = self.current_table.parent
                                    parent.add_array_column(
                                        col,
                                        pointer,
                                        abs_pointer,
                                        max=self.table_threshold)
                            else:
                                # Unknown pointer: record it as an additional
                                # column with a guessed type.
                                self.current_table.add_column(
                                    pointer,
                                    self.guess_type(item),
                                    pointer,
                                    additional=True,
                                    abs_path=abs_pointer,
                                )
                            self.current_table.inc_column(abs_pointer, pointer)
                            if item and self.with_preview and count < PREVIEW_ROWS:
                                # /buyer values are excluded from previews;
                                # presumably handled via the Rows buyer field
                                # -- verify against callers.
                                if not pointer.startswith("/buyer"):
                                    self.current_table.set_preview_path(
                                        abs_pointer, pointer, item,
                                        self.table_threshold)
            yield count
        self.clean_up_missing_arrays()
        self.total_items = count