def writerow(self, table, row):
    """Write a row to the output file."""
    table_name = self.names.get(table, table)
    sheet = self.workbook.get_worksheet_by_name(table_name)
    columns = self.col_index[table]
    if not columns:
        LOGGER.error(_("Invalid table {}").format(table))
        return
    for column, value in row.items():
        if isinstance(value, bool):
            value = str(value)
        try:
            col_index = columns[column]
        except KeyError:
            LOGGER.error(
                _("Operation produced an invalid path. This is a software bug, please send an issue to the developers"))
            LOGGER.error(
                _("Failed to write column {} to xlsx sheet {}").format(column, table))
            return
        try:
            sheet.write(self.row_counters[table], col_index, value)
        except XlsxWriterException as err:
            LOGGER.error(
                _("Failed to write column {} to xlsx sheet {} with error {}").format(
                    column, table, err))
    self.row_counters[table] += 1
def parse_schema(self, input_format, schema=None):
    if schema:
        schema = resolve_file_uri(schema)
    if "release" in input_format:
        pkg_type = "releases"
        getter = attrgetter("release_package_schema")
    else:
        pkg_type = "records"
        getter = attrgetter("record_package_schema")
    url = DEFAULT_SCHEMA_URL[pkg_type].get(
        self.language[:2], DEFAULT_SCHEMA_URL[pkg_type]["en"])
    if not schema:
        LOGGER.info(
            _("No schema provided, using version {}").format(CURRENT_SCHEMA_TAG))
        profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {}, schema_base_url=url)
        schema = getter(profile)()
    title = schema.get("title", "").lower()
    if not title:
        raise ValueError(
            _("Incomplete schema, please make sure your data is correct"))
    if "package" in title:
        # TODO: is this a good way to get the release/record schema?
        schema = jsonref.JsonRef.replace_refs(schema)
        schema = schema["properties"][pkg_type]["items"]
    self.schema = schema
    self.pkg_type = pkg_type
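# A minimal sketch of the getter indirection used in parse_schema above,
# assuming only the standard library: attrgetter("release_package_schema")
# fetches the bound method from the profile object, and the trailing call
# produces the schema dict. FakeProfile is a hypothetical stand-in for
# ocdsextensionregistry's ProfileBuilder.
from operator import attrgetter


class FakeProfile:
    def release_package_schema(self):
        return {"title": "Release Package Schema"}


getter = attrgetter("release_package_schema")
schema = getter(FakeProfile())()  # -> {"title": "Release Package Schema"}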
def __enter__(self):
    """Write the headers to the output file."""
    for name, table in self.tables.items():
        table_name, headers = self.init_sheet(name, table)
        try:
            path = self.workdir / f"{table_name}.csv"
            LOGGER.info(
                _("Dumping table '{}' to file '{}'").format(table_name, path))
            fd = open(path, "w")
        except (IOError, OSError) as e:
            LOGGER.error(
                _("Failed to open file {} with error {}").format(path, e))
            return
        writer = csv.DictWriter(fd, headers)
        self.fds.append(fd)
        self.writers[name] = writer
    for name, writer in self.writers.items():
        headers = self.headers[name]
        try:
            writer.writerow(headers)
        except ValueError as err:
            LOGGER.error(_("Failed to write headers with error {}").format(err))
    return self
def writerow(self, table, row):
    """Write row to output file"""
    try:
        self.writers[table].writerow(row)
    except ValueError as err:
        LOGGER.error(
            _("Operation produced an invalid path. This is a software bug, please send an issue to the developers"))
        LOGGER.error(
            _("Failed to write row {} with error {}").format(row.get("rowID"), err))
    except KeyError:
        LOGGER.error(_("Invalid table {}").format(table))
def _init_options(self, tables):
    for table in tables.values():
        name = table.name
        count = self.options.count
        options = self.options.selection[name]
        unnest = options.unnest
        split = options.split
        repeat = options.repeat

        if count:
            for array in table.arrays:
                parts = array.split("/")
                parts[-1] = f"{parts[-1]}Count"
                path = "/".join(parts)
                target = self._types_cache.get(array) or table
                combined = split and table.should_split
                if combined:
                    # Add count columns only if the table is rolled up;
                    # otherwise it could be confusing, e.g. it may generate
                    # columns for the whole array
                    # (/tender/items/200/additionalClassificationsCount).
                    target.add_column(
                        path,
                        "integer",
                        _(path, self.language),
                        additional=True,
                        combined_only=not combined,
                        propagate=False,
                    )
                    target.inc_column(path, path)
        if unnest:
            for col_id in unnest:
                col = table.combined_columns[col_id]
                table.columns[col_id] = col
        if repeat:
            for col_id in repeat:
                columns = table.columns if split else table.combined_columns
                title = table.titles.get(col_id)
                col = columns.get(col_id)
                if not col:
                    LOGGER.warning(
                        _("Ignoring repeat column {} because it is not in table {}").format(
                            col_id, name))
                    continue
                for c_name in table.child_tables:
                    child_table = self.tables.get(c_name)
                    # A missing child table means the table isn't rolled up.
                    if child_table:
                        child_table.columns[col_id] = col
                        child_table.combined_columns[col_id] = col
                        child_table.titles[col_id] = title
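# A worked example of the Count-column path derivation in _init_options
# above, using an illustrative array path.
array = "/tender/items"
parts = array.split("/")
parts[-1] = f"{parts[-1]}Count"
"/".join(parts)  # -> "/tender/itemsCount"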
def __init__(self, workdir, tables, options, filename="result.xlsx"):
    super().__init__(workdir, tables, options)
    self.col_index = collections.defaultdict(dict)
    # Keep the path around; __exit__ uses it for the closing log message.
    self.path = workdir / filename
    LOGGER.info(_("Dumping all sheets to file '{}'").format(self.path))
    self.workbook = xlsxwriter.Workbook(self.path, {"constant_memory": True})
    self.row_counters = {}
def __exit__(self, *args):
    """Close the workbook."""
    LOGGER.info(_("Dumped all sheets to file '{}'").format(self.path))
    self.workbook.close()
def dump(self, path):
    """Dump table objects to file system"""
    try:
        with open(path, "wb") as fd:
            pickle.dump(self, fd)
    except (OSError, IOError) as e:
        LOGGER.error(
            _("Failed to dump DataPreprocessor to file. Error: {}").format(e))
def restore(_cls, path):
    """Restore DataPreprocessor from file

    :param path: Full path to file
    """
    try:
        with open(path, "rb") as fd:
            return pickle.load(fd)
    except (TypeError, pickle.UnpicklingError):
        LOGGER.error(_("Invalid pickle file. Can't restore."))
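# Hedged usage sketch for the dump/restore pair above: a plain pickle
# round-trip through a state file. The State class and file name are
# illustrative, not part of spoonbill.
import pickle


class State:
    def __init__(self, total_items):
        self.total_items = total_items


with open("result.state", "wb") as fd:
    pickle.dump(State(42), fd)
with open("result.state", "rb") as fd:
    restored = pickle.load(fd)
assert restored.total_items == 42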
def add_joinable_column(self, abs_pointer, pointer):
    LOGGER.debug(
        _("Detected additional column: %s in %s table")
        % (abs_pointer, self.current_table.name))
    self.current_table.types[pointer] = JOINABLE
    self.current_table.add_column(pointer,
                                  JOINABLE,
                                  pointer,
                                  additional=True,
                                  abs_path=abs_pointer,
                                  header=pointer)
def add_column(self,
               path,
               item_type,
               title,
               *,
               propagated=False,
               additional=False,
               abs_path=None,
               header=[]):
    """Add a new column to the table.

    :param path: The column's path
    :param item_type: The column's expected type
    :param title: Column title
    :param propagated: Add column to parent table
    :param additional: Mark this column as missing in schema
    :param abs_path: The column's full JSON path
    :param header: The column's header
    """
    combined_path = combine_path(self, path)
    col = Column(path, combined_path, title, item_type, header=header)
    array = self.is_array(path)
    if additional:
        if array:
            # When analyzing a file we need to keep the index from the data
            # instead of using 0, e.g. /tender/items/166/relatedLot.
            combined_path = abs_path
            col = replace(col, path=combined_path)
        LOGGER.debug(
            _("Detected additional column: %s in %s table") % (path, self.name))
        self.additional_columns[combined_path] = col
    if not propagated:
        self.columns[combined_path] = col
        self.combined_columns[combined_path] = col
    if propagated:
        self.array_columns[combined_path] = col
        self.array_positions[array] = combined_path
    if not self.is_root:
        self.parent.add_column(path, item_type, title, propagated=True, header=header)
    for p in (path, combined_path):
        if p not in self.titles:
            self.titles[p] = header
    if path not in self.types:
        self.types[path] = item_type
def dump(self, path):
    """Dump the data processor's state to a file.

    :param path: Full path to file
    """
    try:
        with open(path, "wb") as fd:
            pickle.dump(self, fd)
    except (OSError, IOError) as e:
        LOGGER.error(
            _("Failed to dump DataPreprocessor to file. Error: {}").format(e))
def init_repeat(self, table, options):
    for col_id in options.repeat:
        columns = table.columns if options.split else table.combined_columns
        col = columns.get(col_id)
        if not col:
            LOGGER.warning(
                _("Ignoring repeat column {} because it is not in table {}").format(
                    col_id, table.name))
            continue
        for c_name in table.child_tables:
            child_table = self.tables.get(c_name)
            if child_table:
                child_table.columns[col_id] = col
                child_table.combined_columns[col_id] = col
                child_table.titles[col_id] = col_id
def __enter__(self):
    """Write headers to output file"""
    for name, table in self.tables.items():
        table_name, headers = self.init_sheet(name, table)
        sheet = self.workbook.add_worksheet(table_name)
        for col_index, col_name in enumerate(headers):
            self.col_index[name][col_name] = col_index
            try:
                sheet.write(0, col_index, headers[col_name])
            except XlsxWriterException as err:
                LOGGER.error(
                    _("Failed to write header {} to xlsx sheet {} with error {}").format(
                        col_name, name, err))
        self.row_counters[name] = 1
    return self
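# Hedged usage sketch: the xlsx writer above is a context manager, so
# __enter__ creates the sheets and headers and __exit__ closes the workbook.
# The class name XlsxWriter and the row contents are assumptions for
# illustration; the actual class name may differ.
with XlsxWriter(workdir, tables, options) as writer:
    writer.writerow("parties", {"ocid": "ocds-213czf-1", "id": "1"})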
def add_column(
    self,
    path,
    item_type,
    title,
    *,
    combined_only=False,
    propagate=True,
    additional=False,
    abs_path=None,
):
    """Add new column to the table

    :param path: Column path
    :param item_type: Column expected type
    :param title: Column title
    :param combined_only: Make this column available only in combined version of table
    :param propagate: Propagate this column to the parent table
    :param additional: Mark this column as missing in schema
    :param abs_path: The column's full JSON path
    """
    is_array = self.is_array(path)
    combined_path = combine_path(self, path)
    if not combined_only:
        self.columns[combined_path] = Column(title, item_type, combined_path)
    # new column to track hits differently
    self.combined_columns[combined_path] = Column(title, item_type, combined_path)
    if additional:
        if is_array:
            # When analyzing a file we need to keep the index from the data
            # instead of using 0, e.g. /tender/items/166/relatedLot.
            combined_path = abs_path
        LOGGER.debug(
            _("Detected additional column: %s in %s table") % (path, self.name))
        self.additional_columns[combined_path] = Column(title, item_type, combined_path)
    for p in (path, combined_path):
        self.titles[p] = title
    if not self.is_root and propagate:
        self.parent.add_column(
            path,
            item_type,
            title,
            combined_only=combined_only,
            additional=additional,
            abs_path=abs_path,
        )
def __post_init__(self):
    for attr in (
        "columns",
        "combined_columns",
        "additional_columns",
    ):
        obj = getattr(self, attr, {})
        if obj:
            init = OrderedDict()
            for name, col in obj.items():
                if not is_dataclass(col):
                    col = Column(**col)
                init[name] = col
            setattr(self, attr, init)
    cols = DEFAULT_FIELDS if self.is_root else DEFAULT_FIELDS_COMBINED
    for col in cols:
        if col not in self.columns:
            self.columns[col] = Column(col, "string", col)
        if col not in self.combined_columns:
            self.combined_columns[col] = Column(col, "string", col)
        self.titles[col] = _(col)
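# Minimal sketch of the rehydration pattern in __post_init__ above: after
# deserialization, plain dicts are promoted back to Column dataclasses.
# The Column fields here are illustrative.
from dataclasses import dataclass, is_dataclass


@dataclass
class Column:
    title: str
    type: str
    id: str


raw = {"/parties/name": {"title": "Name", "type": "string", "id": "/parties/name"}}
columns = {key: col if is_dataclass(col) else Column(**col) for key, col in raw.items()}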
def add_additional_table(self, pointer, abs_pointer, parent_key, key, item):
    LOGGER.debug(_("Detected additional table: %s") % pointer)
    self.current_table.types[pointer] = ["array"]
    self._add_table(
        add_child_table(self.current_table, pointer, parent_key, key), pointer)
    # Add columns beforehand because it might be required to recalculate and
    # reorder headers when enlarging an array. There must be a better way,
    # but this works for now.
    for extended_item in item:
        for path_, it in flatten(extended_item, reducer="path").items():
            ppointer = self.join_path(pointer, path_)
            if ppointer not in self.current_table:
                self.current_table.add_column(
                    ppointer,
                    self.guess_type(it),
                    ppointer,
                    abs_path=self.join_path(abs_pointer, path_),
                    header=ppointer,
                )
def analyze_file(self, filenames, with_preview=True):
    """Analyze the provided file(s)

    :param filenames: Input filename or list of filenames
    :param with_preview: Generate preview during analysis
    """
    if not isinstance(filenames, list):
        filenames = [filenames]
    path = self.workdir / filenames[0]
    (
        input_format,
        _is_concatenated,
        _is_array,
    ) = detect_format(path=path, reader=get_reader(path))
    LOGGER.info(_("Input file is {}").format(input_format))
    self.multiple_values = _is_concatenated
    self.parse_schema(input_format, self.schema)
    if self.spec is None:
        self.spec = DataPreprocessor(
            self.schema,
            self.root_tables,
            combined_tables=self.combined_tables,
            language=self.language,
            table_threshold=self.table_threshold,
            multiple_values=self.multiple_values,
            pkg_type=self.pkg_type,
        )
    for filename in filenames:
        path = self.workdir / filename
        reader = get_reader(path)
        with reader(path, "rb") as fd:
            items = iter_file(fd, self.pkg_type, multiple_values=self.multiple_values)
            for count in self.spec.process_items(items):
                yield fd.tell(), count
    self.sort_tables()
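# Hedged usage sketch for analyze_file: it is a generator yielding
# (bytes_read, item_count) pairs, which is what drives the cli progress bar.
# The constructor arguments are trimmed for illustration and the filename is
# hypothetical.
import pathlib

analyzer = FileAnalyzer(pathlib.Path("."), schema=None)
for read, number in analyzer.analyze_file("releases.json"):
    print(f"{number} items analyzed, {read} bytes read")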
def parse_schema(self):
    """Extract all available information from the schema"""
    if isinstance(self.schema, (str, Path)):
        self.schema = resolve_file_uri(self.schema)
    self.schema = jsonref.JsonRef.replace_refs(self.schema)
    self.init_tables(self.root_tables)
    if self.combined_tables:
        self.init_tables(self.combined_tables, is_combined=True)
    separator = self.header_separator
    to_analyze = deque([("", "", {}, self.schema)])
    # TODO: check if recursion is better for field ordering
    while to_analyze:
        path, parent_key, parent, prop = to_analyze.pop()
        if prop.get("deprecated"):
            continue
        # TODO: handle oneOf anyOf allOf
        properties = prop.get("properties", {})
        if properties:
            for key, item in properties.items():
                if item.get("deprecated"):
                    continue
                if hasattr(item, "__reference__") and item.__reference__.get("deprecated"):
                    continue
                typeset = extract_type(item)
                pointer = separator.join([path, key])
                self.current_table = self.get_table(pointer)
                if not self.current_table:
                    continue
                self.current_table.types[pointer] = typeset
                if "object" in typeset:
                    to_analyze.append((pointer, key, properties, item))
                elif "array" in typeset:
                    items = item["items"]
                    items_type = extract_type(items)
                    if set(items_type) & {"array", "object"}:
                        if pointer not in self.current_table.path:
                            # Found a child array, need to create a child table.
                            key = self.name_check(parent_key, key)
                            self._add_table(
                                add_child_table(self.current_table, pointer,
                                                parent_key, key),
                                pointer)
                        to_analyze.append((pointer, key, properties, items))
                    else:
                        # We are in an array of strings, so this becomes a
                        # single joinable column.
                        typeset = ARRAY.format(items_type)
                        self.current_table.types[pointer] = JOINABLE
                        self.current_table.add_column(
                            pointer, typeset, _(pointer, self.language))
                else:
                    if self.current_table.is_combined:
                        pointer = separator + separator.join((parent_key, key))
                    self.current_table.add_column(
                        pointer, typeset, _(pointer, self.language))
        else:
            # TODO: not sure what to do here
            continue
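# Hedged sketch of the $ref inlining that parse_schema relies on:
# jsonref.JsonRef.replace_refs resolves JSON Schema references so the walker
# above can treat the schema as a plain nested dict. The schema here is
# illustrative.
import jsonref

schema = {
    "definitions": {"Identifier": {"type": "string"}},
    "properties": {"id": {"$ref": "#/definitions/Identifier"}},
}
resolved = jsonref.JsonRef.replace_refs(schema)
resolved["properties"]["id"]["type"]  # -> "string"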
import logging
import pathlib
from itertools import chain

import click
import click_logging
from ocdsextensionregistry import ProfileBuilder
from ocdskit.util import detect_format

from spoonbill import FileAnalyzer, FileFlattener
from spoonbill.common import COMBINED_TABLES, ROOT_TABLES, TABLE_THRESHOLD
from spoonbill.flatten import FlattenOptions
from spoonbill.i18n import LOCALE, _
from spoonbill.utils import read_lines, resolve_file_uri

LOGGER = logging.getLogger("spoonbill")
click_logging.basic_config(LOGGER)

CURRENT_SCHEMA_TAG = "1__1__5"
ANALYZED_LABEL = _(" Processed {} objects")
FLATTENED_LABEL = _(" Flattened {} objects")


class CommaSeparated(click.ParamType):
    """Click option type to convert comma separated string into list"""

    name = "comma"

    def convert(self, value, param, ctx):  # noqa
        if not value:
            return []
        return [v.lower() for v in value.split(",")]


def read_option_file(option, option_file):
    # Sketch of the (truncated) helper body, assuming the read_lines import
    # above: a file with one value per line overrides the inline option.
    if option_file:
        option = read_lines(option_file)
    return option
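# Hedged usage sketch for the CommaSeparated type above; the option name is
# illustrative. Passing "--only A,b" on the command line arrives in the
# callback as ["a", "b"].
@click.command()
@click.option("--only", type=CommaSeparated(), default="")
def demo(only):
    click.echo(only)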
def cli(
    filename,
    schema,
    selection,
    split,
    threshold,
    state_file,
    xlsx,
    csv,
    combine,
    unnest,
    unnest_file,
    only,
    only_file,
    repeat,
    repeat_file,
    count,
    human,
    language,
):
    """Spoonbill cli entry point"""
    click.echo(_("Detecting input file format"))
    # TODO: handle line separated json
    # TODO: handle single release/record
    (
        input_format,
        _is_concatenated,
        _is_array,
    ) = detect_format(filename)
    if csv:
        csv = pathlib.Path(csv).resolve()
        if not csv.exists():
            raise click.BadParameter(
                _("Desired location {} does not exist").format(csv))
    if xlsx:
        xlsx = pathlib.Path(xlsx).resolve()
        if not xlsx.parent.exists():
            raise click.BadParameter(
                _("Desired location {} does not exist").format(xlsx.parent))
    click.echo(_("Input file is {}").format(click.style(input_format, fg="green")))
    is_package = "package" in input_format
    combine_choice = combine if combine else ""
    if not is_package:
        # TODO: fix this
        click.echo("Single releases are not supported yet")
        return
    if schema:
        schema = resolve_file_uri(schema)
    if "release" in input_format:
        root_key = "releases"
        if not schema:
            click.echo(
                _("No schema provided, using version {}").format(
                    click.style(CURRENT_SCHEMA_TAG, fg="cyan")))
            profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {})
            schema = profile.release_package_schema()
    else:
        root_key = "records"
        if not schema:
            click.echo(
                _("No schema provided, using version {}").format(
                    click.style(CURRENT_SCHEMA_TAG, fg="cyan")))
            profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {})
            schema = profile.record_package_schema()
    title = schema.get("title", "").lower()
    if not title:
        raise ValueError(
            _("Incomplete schema, please make sure your data is correct"))
    if "package" in title:
        # TODO: is this a good way to get the release/record schema?
        schema = schema["properties"][root_key]["items"]

    path = pathlib.Path(filename)
    workdir = path.parent
    filename = path.name
    selection = selection or ROOT_TABLES.keys()
    combine = combine or COMBINED_TABLES.keys()
    root_tables = get_selected_tables(ROOT_TABLES, selection)
    combined_tables = get_selected_tables(COMBINED_TABLES, combine)
    if state_file:
        click.secho(_("Restoring from provided state file"), bold=True)
        analyzer = FileAnalyzer(workdir, state_file=state_file)
    else:
        click.secho(
            _("State file not supplied, going to analyze input file first"),
            bold=True)
        analyzer = FileAnalyzer(
            workdir,
            schema=schema,
            root_key=root_key,
            root_tables=root_tables,
            combined_tables=combined_tables,
            language=language,
            table_threshold=threshold,
        )
        click.echo(_("Analyze options:"))
        click.echo(
            _(" - table threshold => {}").format(
                click.style(str(threshold), fg="cyan")))
        click.echo(
            _(" - language => {}").format(click.style(language, fg="cyan")))
        click.echo(
            _("Processing file: {}").format(click.style(str(path), fg="cyan")))
        total = path.stat().st_size
        progress = 0
        # Progress bar not showing with small files
        # https://github.com/pallets/click/pull/1296/files
        with click.progressbar(width=0,
                               show_percent=True,
                               show_pos=True,
                               length=total) as bar:
            for read, number in analyzer.analyze_file(filename, with_preview=True):
                bar.label = ANALYZED_LABEL.format(
                    click.style(str(number), fg="cyan"))
                bar.update(read - progress)
                progress = read
        click.secho(
            _("Done processing. Analyzed objects: {}").format(
                click.style(str(number + 1), fg="red")),
            fg="green")
        state_file = pathlib.Path(f"{filename}.state")
        state_file_path = workdir / state_file
        click.echo(
            _("Dumping analyzed data to '{}'").format(
                click.style(str(state_file_path.absolute()), fg="cyan")))
        analyzer.dump_to_file(state_file)

    click.echo(_("Flattening file: {}").format(click.style(str(path), fg="cyan")))
    if unnest and unnest_file:
        raise click.UsageError(_("Conflicting options: unnest and unnest-file"))
    if repeat and repeat_file:
        raise click.UsageError(_("Conflicting options: repeat and repeat-file"))
    if only and only_file:
        raise click.UsageError(_("Conflicting options: only and only-file"))

    options = {"selection": {}, "count": count}
    unnest = read_option_file(unnest, unnest_file)
    repeat = read_option_file(repeat, repeat_file)
    only = read_option_file(only, only_file)

    for name in selection:
        table = analyzer.spec[name]
        if table.total_rows == 0:
            click.echo(
                _("Ignoring empty table {}").format(click.style(name, fg="red")))
            continue
        unnest = [col for col in unnest if col in table.combined_columns]
        if unnest:
            click.echo(
                _("Unnesting columns {} for table {}").format(
                    click.style(",".join(unnest), fg="cyan"),
                    click.style(name, fg="cyan")))
        only = [col for col in only if col in table]
        if only:
            click.echo(
                _("Using only columns {} for table {}").format(
                    click.style(",".join(only), fg="cyan"),
                    click.style(name, fg="cyan")))
        repeat = [col for col in repeat if col in table]
        if repeat:
            click.echo(
                _("Repeating columns {} in all child tables of {}").format(
                    click.style(",".join(repeat), fg="cyan"),
                    click.style(name, fg="cyan")))
        options["selection"][name] = {
            "split": split or analyzer.spec[name].should_split,
            "pretty_headers": human,
            "unnest": unnest,
            "only": only,
            "repeat": repeat,
        }
    options = FlattenOptions(**options)
    flattener = FileFlattener(
        workdir,
        options,
        analyzer.spec.tables,
        root_key=root_key,
        csv=csv,
        xlsx=xlsx,
        language=language,
    )
    all_tables = chain(flattener.flattener.tables.keys(), combine_choice)
    click.echo(
        _("Going to export tables: {}").format(
            click.style(",".join(all_tables), fg="magenta")))
    click.echo(_("Processed tables:"))
    for table in flattener.flattener.tables.keys():
        message = _("{}: {} rows").format(
            table, flattener.flattener.tables[table].total_rows)
        if not flattener.flattener.tables[table].is_root:
            message = "└-----" + message
        click.echo(message)
    click.echo(_("Flattening input file"))
    with click.progressbar(
        flattener.flatten_file(filename),
        length=analyzer.spec.total_items + 1,
        width=0,
        show_percent=True,
        show_pos=True,
    ) as bar:
        for count in bar:
            bar.label = FLATTENED_LABEL.format(click.style(str(count + 1), fg="cyan"))
    click.secho(
        _("Done flattening. Flattened objects: {}").format(
            click.style(str(count + 1), fg="red")),
        fg="green")
def process_items(self, releases, with_preview=True):
    """Analyze releases

    Iterate over every item in the provided list to calculate metrics and
    optionally generate a preview for the combined and split versions of the
    table.

    :param releases: Iterator of items to analyze
    :param with_preview: If set to True, generate previews for each table
    """
    separator = self.header_separator
    for count, release in enumerate(releases):
        to_analyze = deque([("", "", "", {}, release)])
        ocid = release["ocid"]
        top_level_id = release["id"]
        while to_analyze:
            abs_path, path, parent_key, parent, record = to_analyze.pop()
            for key, item in record.items():
                pointer = separator.join([path, key])
                self.current_table = self.get_table(pointer)
                if not self.current_table:
                    continue
                item_type = self.current_table.types.get(pointer)
                if pointer in self.current_table.path:
                    # strict match like /parties, /tender
                    row_id = generate_row_id(ocid, record.get("id", ""),
                                             parent_key, top_level_id)
                    c = item if isinstance(item, list) else [item]
                    for _nop in c:
                        self.current_table.inc()
                        if with_preview and count < PREVIEW_ROWS:
                            parent_table = not self.current_table.is_root and parent_key
                            self.add_preview_row(ocid, record.get("id"), row_id,
                                                 parent.get("id"), parent_table)
                # TODO: this validation should probably be smarter with arrays
                if item_type and item_type != JOINABLE and not validate_type(item_type, item):
                    LOGGER.error("Mismatched type on %s expected %s" % (pointer, item_type))
                    continue
                if isinstance(item, dict):
                    to_analyze.append((
                        separator.join([abs_path, key]),
                        pointer,
                        key,
                        record,
                        item,
                    ))
                elif item and isinstance(item, list):
                    abs_pointer = separator.join([abs_path, key])
                    if not isinstance(item[0], dict) and not item_type:
                        LOGGER.debug(
                            _("Detected additional column: %s in %s table")
                            % (abs_pointer, self.current_table.name))
                        item_type = JOINABLE
                        self.current_table.add_column(
                            pointer,
                            JOINABLE,
                            _(pointer, self.language),
                            additional=True,
                            abs_path=abs_pointer,
                        )
                    if item_type == JOINABLE:
                        self.current_table.inc_column(abs_pointer, pointer)
                        if with_preview and count < PREVIEW_ROWS:
                            value = JOINABLE_SEPARATOR.join(item)
                            self.current_table.set_preview_path(
                                abs_pointer, pointer, value, self.table_threshold)
                    elif self.current_table.is_root or self.current_table.is_combined:
                        for value in item:
                            to_analyze.append((
                                abs_pointer,
                                pointer,
                                key,
                                record,
                                value,
                            ))
                    else:
                        parent_table = self.current_table.parent
                        if pointer not in parent_table.arrays:
                            LOGGER.debug(_("Detected additional table: %s") % pointer)
                            self.current_table.types[pointer] = ["array"]
                            parent_table = self.current_table
                            # TODO: do we need to mark this table as additional?
                            self._add_table(
                                add_child_table(self.current_table, pointer,
                                                parent_key, key),
                                pointer)
                            self.add_preview_row(ocid, record.get("id"), row_id,
                                                 parent.get("id"), parent_table)
                        if parent_table.set_array(pointer, item):
                            should_split = len(item) >= self.table_threshold
                            if should_split:
                                parent_table.should_split = True
                                self.current_table.roll_up = True
                            recalculate_headers(parent_table, pointer, abs_path,
                                                key, item, should_split, separator)
                        for i, value in enumerate(item):
                            if isinstance(value, dict):
                                abs_pointer = separator.join([abs_path, key, str(i)])
                                to_analyze.append((
                                    abs_pointer,
                                    pointer,
                                    parent_key,
                                    record,
                                    value,
                                ))
                else:
                    root = get_root(self.current_table)
                    abs_pointer = separator.join((abs_path, key))
                    if self.current_table.is_combined:
                        LOGGER.debug(
                            _("Path %s is targeted to combined table %s")
                            % (pointer, self.current_table.name))
                        pointer = separator + separator.join((parent_key, key))
                        abs_pointer = pointer
                    if abs_pointer not in root.combined_columns:
                        self.current_table.add_column(
                            pointer,
                            PYTHON_TO_JSON_TYPE.get(type(item).__name__, "N/A"),
                            _(pointer, self.language),
                            additional=True,
                            abs_path=abs_pointer,
                        )
                    self.current_table.inc_column(abs_pointer, pointer)
                    if item and with_preview and count < PREVIEW_ROWS:
                        self.current_table.set_preview_path(
                            abs_pointer, pointer, item, self.table_threshold)
        yield count
    self.total_items = count
def get_selected_tables(base, selection):
    for name in selection:
        if name not in base:
            msg = _("Wrong selection, table '{}' does not exist").format(name)
            raise click.BadParameter(msg)
    return {name: tab for name, tab in base.items() if name in selection}
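# Usage sketch for get_selected_tables with illustrative table names: unknown
# names fail fast with click.BadParameter, known ones are filtered through.
base = {"tenders": {}, "parties": {}}
get_selected_tables(base, ["tenders"])  # -> {"tenders": {}}
get_selected_tables(base, ["missing"])  # raises click.BadParameter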
def cli(
    filename,
    schema,
    selection,
    threshold,
    state_file,
    xlsx,
    csv,
    combine,
    exclude,
    unnest,
    unnest_file,
    only,
    only_file,
    repeat,
    repeat_file,
    count,
    human,
    language,
):
    """Spoonbill cli entry point"""
    if csv:
        csv = pathlib.Path(csv).resolve()
        if not csv.exists():
            raise click.BadParameter(
                _("Desired location {} does not exist").format(csv))
    if xlsx:
        xlsx = pathlib.Path(xlsx).resolve()
        if not xlsx.parent.exists():
            raise click.BadParameter(
                _("Desired location {} does not exist").format(xlsx.parent))
    path = pathlib.Path(filename)
    workdir = path.parent
    filename = path.name
    selection = selection or ROOT_TABLES.keys()
    combine = combine or COMBINED_TABLES.keys()
    root_tables = get_selected_tables(ROOT_TABLES, selection)
    combined_tables = get_selected_tables(COMBINED_TABLES, combine)
    if state_file:
        click.secho(_("Restoring from provided state file"), bold=True)
        analyzer = FileAnalyzer(workdir, state_file=state_file)
    else:
        click.secho(
            _("State file not supplied, going to analyze input file first"),
            bold=True)
        analyzer = FileAnalyzer(
            workdir,
            schema=schema,
            root_tables=root_tables,
            combined_tables=combined_tables,
            language=language,
            table_threshold=threshold,
        )
        click.echo(_("Analyze options:"))
        for name, option in ("threshold", str(threshold)), ("language", language):
            click.echo(
                _(" - {:30} => {}").format(name, click.style(option, fg="cyan")))
        click.echo(
            _("Processing file: {}").format(click.style(str(path), fg="cyan")))
        total = path.stat().st_size
        progress = 0
        # Progress bar not showing with small files
        # https://github.com/pallets/click/pull/1296/files
        with click.progressbar(width=0,
                               show_percent=True,
                               show_pos=True,
                               length=total) as bar:
            for read, number in analyzer.analyze_file(filename, with_preview=False):
                bar.label = ANALYZED_LABEL.format(
                    click.style(str(number), fg="cyan"))
                bar.update(read - progress)
                progress = read
        click.secho(
            _("Done processing. Analyzed objects: {}").format(
                click.style(str(number + 1), fg="red")),
            fg="green")
        if isinstance(filename, list):
            state_file = pathlib.Path(f"{filename[0]}.state")
        else:
            state_file = pathlib.Path(f"{filename}.state")
        state_file_path = workdir / state_file
        click.echo(
            _("Dumping analyzed data to '{}'").format(
                click.style(str(state_file_path.absolute()), fg="cyan")))
        analyzer.dump_to_file(state_file)

    click.echo(_("Flattening file: {}").format(click.style(str(path), fg="cyan")))
    if unnest and unnest_file:
        raise click.UsageError(_("Conflicting options: unnest and unnest-file"))
    if repeat and repeat_file:
        raise click.UsageError(_("Conflicting options: repeat and repeat-file"))
    if only and only_file:
        raise click.UsageError(_("Conflicting options: only and only-file"))
    if exclude:
        click.echo(
            _("Ignoring tables (excluded by user): {}").format(
                click.style(",".join(exclude), fg="red")))

    options = {"selection": {}, "count": count, "exclude": exclude}
    unnest = read_option_file(unnest, unnest_file)
    repeat = read_option_file(repeat, repeat_file)
    only = read_option_file(only, only_file)

    for name in list(selection) + list(combine):
        table = analyzer.spec[name]
        if table.total_rows == 0:
            click.echo(
                _("Ignoring empty table {}").format(click.style(name, fg="red")))
            continue
        options["selection"][name] = {
            "split": analyzer.spec[name].splitted,
            "pretty_headers": human,
        }
        if not analyzer.spec[name].is_combined:
            unnest_in_table = [col for col in unnest if col in table.combined_columns]
            if unnest_in_table:
                click.echo(
                    _("Unnesting columns {} for table {}").format(
                        click.style(",".join(unnest_in_table), fg="cyan"),
                        click.style(name, fg="cyan")))
            only_in_table = [col for col in only if col in table]
            if only_in_table:
                click.echo(
                    _("Using only columns {} for table {}").format(
                        click.style(",".join(only_in_table), fg="cyan"),
                        click.style(name, fg="cyan")))
            repeat_in_table = [col for col in repeat if col in table]
            if repeat_in_table:
                click.echo(
                    _("Repeating columns {} in all child tables of {}").format(
                        click.style(",".join(repeat_in_table), fg="cyan"),
                        click.style(name, fg="cyan")))
            options["selection"][name]["only"] = only_in_table
            options["selection"][name]["repeat"] = repeat_in_table
            options["selection"][name]["unnest"] = unnest_in_table
    options = FlattenOptions(**options)
    flattener = FileFlattener(
        workdir,
        options,
        analyzer,
        csv=csv,
        xlsx=xlsx,
        language=language,
    )
    click.echo(
        _("Going to export tables: {}").format(
            click.style(",".join(flattener.flattener.tables.keys()), fg="magenta")))
    click.echo(_("Processed tables:"))
    for table_name, table in flattener.flattener.tables.items():
        msg = _(" - {:30} => {} rows") if table.is_root else _(" ---- {:27} => {} rows")
        message = msg.format(table_name,
                             click.style(str(table.total_rows), fg="cyan"))
        click.echo(message)
    click.echo(_("Flattening input file"))
    with click.progressbar(
        flattener.flatten_file(filename),
        length=analyzer.spec.total_items + 1,
        width=0,
        show_percent=True,
        show_pos=True,
    ) as bar:
        for count in bar:
            bar.label = FLATTENED_LABEL.format(click.style(str(count + 1), fg="cyan"))
    click.secho(
        _("Done flattening. Flattened objects: {}").format(
            click.style(str(count + 1), fg="red")),
        fg="green")
import logging
import pathlib

import click
import click_logging

from spoonbill import FileAnalyzer, FileFlattener
from spoonbill.common import COMBINED_TABLES, ROOT_TABLES, TABLE_THRESHOLD
from spoonbill.flatten import FlattenOptions
from spoonbill.i18n import LOCALE, _
from spoonbill.utils import read_lines

LOGGER = logging.getLogger("spoonbill")
click_logging.basic_config(LOGGER)

ANALYZED_LABEL = _(" Processed {} objects")
FLATTENED_LABEL = _(" Flattened {} objects")
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


class CommaSeparated(click.ParamType):
    """Click option type to convert comma separated string into list"""

    name = "comma"

    def convert(self, value, param, ctx):  # noqa
        if not value:
            return []
        return value.split(",")
def process_items(self, releases, with_preview=True):
    """Analyze releases.

    Iterates over every release to calculate metrics and optionally generates
    previews for combined and split versions of each table.

    :param releases: The releases to analyze
    :param with_preview: Whether to generate previews for each table
    """
    for count, release in enumerate(releases):
        to_analyze = deque([("", "", "", {}, release)])
        rows = Rows(ocid=release["ocid"],
                    buyer=release.get("buyer", {}),
                    data=defaultdict(list))
        while to_analyze:
            abs_path, path, parent_key, parent, record = to_analyze.popleft()
            if hasattr(record, "items"):
                for key, item in record.items():
                    pointer = self.join_path(path, key)
                    self.current_table = self.get_table(pointer)
                    if not self.current_table:
                        continue
                    if self.is_new_row(pointer):
                        self.inc_table_rows(item, rows, parent_key, record)
                    self.extend_table_types(pointer, item)
                    item_type = self.current_table.types.get(pointer)
                    if not self.is_type_matched(pointer, item, item_type):
                        continue
                    if isinstance(item, dict):
                        to_analyze.append((
                            self.join_path(abs_path, key),
                            pointer,
                            key,
                            record,
                            item,
                        ))
                    elif item and isinstance(item, list):
                        abs_pointer = self.join_path(abs_path, key)
                        if not isinstance(item[0], dict) and not item_type:
                            item_type = JOINABLE
                            self.add_joinable_column(abs_pointer, pointer)
                        if item_type == JOINABLE:
                            if pointer not in self.current_table:
                                self.add_joinable_column(abs_pointer, pointer)
                            self.current_table.inc_column(abs_pointer, pointer)
                            if self.with_preview and count < PREVIEW_ROWS:
                                value = JOINABLE_SEPARATOR.join([str(i) for i in item])
                                self.current_table.set_preview_path(
                                    abs_pointer, pointer, value, self.table_threshold)
                        elif self.is_base_table():
                            for value in item:
                                to_analyze.append((
                                    abs_pointer,
                                    pointer,
                                    key,
                                    record,
                                    value,
                                ))
                        else:
                            parent_table = self.current_table.parent
                            if pointer not in parent_table.arrays:
                                LOGGER.debug(
                                    _("Detected additional table: %s") % pointer)
                                self.current_table.types[pointer] = ["array"]
                                parent_table = self.current_table
                                self.add_additional_table(
                                    pointer, abs_pointer, parent_key, key, item)
                                self.add_preview_row(
                                    rows, record.get("id", ""), parent_key)
                            if parent_table.set_array(pointer, item):
                                self.handle_array_expanded(
                                    pointer, item, abs_path, key)
                            for i, value in enumerate(item):
                                if isinstance(value, dict):
                                    abs_pointer = self.join_path(
                                        abs_path, key, str(i))
                                    to_analyze.append((
                                        abs_pointer,
                                        pointer,
                                        parent_key,
                                        record,
                                        value,
                                    ))
                    else:
                        abs_pointer = self.join_path(abs_path, key)
                        if self.current_table.is_combined:
                            pointer, abs_pointer = self.get_paths_for_combined_table(
                                parent_key, key)
                        col = self.current_table.columns.get(pointer)
                        if col:
                            if abs_pointer not in self.current_table:
                                parent = self.current_table.parent
                                parent.add_array_column(
                                    col, pointer, abs_pointer,
                                    max=self.table_threshold)
                        else:
                            self.current_table.add_column(
                                pointer,
                                self.guess_type(item),
                                pointer,
                                additional=True,
                                abs_path=abs_pointer,
                            )
                        self.current_table.inc_column(abs_pointer, pointer)
                        if item and self.with_preview and count < PREVIEW_ROWS:
                            if not pointer.startswith("/buyer"):
                                self.current_table.set_preview_path(
                                    abs_pointer, pointer, item, self.table_threshold)
        yield count
    self.clean_up_missing_arrays()
    self.total_items = count
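# Minimal sketch of the deque-driven traversal that process_items uses: each
# (path, value) pair is expanded iteratively instead of recursively, dicts by
# key and lists by index. The document is illustrative.
from collections import deque

doc = {"tender": {"items": [{"id": "1"}]}}
to_analyze = deque([("", doc)])
while to_analyze:
    path, node = to_analyze.popleft()
    if isinstance(node, dict):
        for key, value in node.items():
            to_analyze.append((f"{path}/{key}", value))
    elif isinstance(node, list):
        for i, value in enumerate(node):
            to_analyze.append((f"{path}/{i}", value))
    else:
        print(path, node)  # prints: /tender/items/0/id 1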