def extract_dict_to_error_path(path, input):
    output = {}
    for k in input:
        if isinstance(input[k], list):
            res = extract_list_to_error_path(path + [k], input[k])
            for p in res:
                assert p not in output, _("Already have key {}").format(p)
                output[p] = res[p]
        elif isinstance(input[k], dict):
            res = extract_dict_to_error_path(path + [k], input[k])
            for p in res:
                assert p not in output, _("Already have key {}").format(p)
                output[p] = res[p]
        elif isinstance(input[k], Cell):
            p = tuple(path + [k])
            assert p not in output, _("Already have key {}").format(p)
            output[p] = [input[k].cell_location]
            for sub_cell in input[k].sub_cells:
                assert sub_cell.cell_value == input[k].cell_value, _(
                    "Two sub-cells have different values: {}, {}").format(
                        input[k].cell_value, sub_cell.cell_value)
                output[p].append(sub_cell.cell_location)
        else:
            raise Exception(
                _("Unexpected result type in the JSON cell tree: {}").format(
                    input[k]))
    return output
def parse(self):
    if self.root_list_path is None:
        root_json_list = self.root_json_dict
    else:
        root_json_list = path_search(self.root_json_dict,
                                     self.root_list_path.split("/"))
    for json_dict in root_json_list:
        if json_dict is None:
            # This is particularly useful for IATI XML, in order to not
            # fall over on an empty activity, e.g. <iati-activity/>
            continue
        self.parse_json_dict(json_dict, sheet=self.main_sheet)
    if self.remove_empty_schema_columns:
        # Remove sheets with no lines of data
        for sheet_name, sheet in list(self.sub_sheets.items()):
            if not sheet.lines:
                del self.sub_sheets[sheet_name]
    if self.preserve_fields_input:
        nonexistent_input_paths = []
        for field in self.preserve_fields_input:
            if field not in self.seen_paths:
                nonexistent_input_paths.append(field)
        if len(nonexistent_input_paths) > 0:
            warn(
                _("You wanted to preserve the following fields which are not present in the input data: {}"
                  ).format(nonexistent_input_paths))
def write_sheet(self, sheet_name, sheet):
    worksheet = odf.table.Table(name=sheet_name)
    sheet_header = list(sheet)
    header_row = odf.table.TableRow()
    for header in sheet_header:
        header_row.addElement(self._make_cell(header))
    worksheet.addElement(header_row)
    for sheet_line in sheet.lines:
        row = odf.table.TableRow()
        for header in sheet_header:
            value = sheet_line.get(header)
            if isinstance(value, str):
                new_value = ILLEGAL_CHARACTERS_RE.sub("", value)
                if new_value != value:
                    warn(
                        _("Character(s) in '{}' are not allowed in a spreadsheet cell. Those character(s) will be removed"
                          ).format(value),
                        DataErrorWarning,
                    )
                    value = new_value
            row.addElement(self._make_cell(value))
        worksheet.addElement(row)
    self.workbook.spreadsheet.addElement(worksheet)
def extract_list_to_error_path(path, input):
    output = {}
    for i, item in enumerate(input):
        res = extract_dict_to_error_path(path + [i], item)
        for p in res:
            assert p not in output, _("Already have key {}").format(p)
            output[p] = res[p]
    return output
def parse(self):
    fields = self.parse_schema_dict("", self.root_schema_dict)
    for field, title in fields:
        if self.use_titles:
            if not title:
                warn(
                    _("Field {} does not have a title, skipping.").format(
                        field))
            else:
                self.main_sheet.append(title)
                self.main_sheet.titles[field] = title
        else:
            self.main_sheet.append(field)
def extract_dict_to_value(input):
    output = OrderedDict()
    for k in input:
        if isinstance(input[k], list):
            output[k] = extract_list_to_value(input[k])
        elif isinstance(input[k], dict):
            output[k] = extract_dict_to_value(input[k])
        elif isinstance(input[k], Cell):
            output[k] = input[k].cell_value
        else:
            raise Exception(
                _("Unexpected result type in the JSON cell tree: {}").format(
                    input[k]))
    return output
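# Illustrative sketch (data made up): how the extract_* helpers above walk a
# cell tree. Assumes the module's Cell(cell_value, cell_location) constructor
# and an empty sub_cells default, as suggested by its uses elsewhere in this
# file.
#
#   >>> tree = {"id": Cell("1", ("main", "A", 2, "id"))}
#   >>> extract_dict_to_value(tree)
#   OrderedDict([('id', '1')])
#   >>> extract_dict_to_error_path([], tree)
#   {('id',): [('main', 'A', 2, 'id')]}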
def fancy_unflatten(self, with_cell_source_map, with_heading_source_map):
    cell_tree = self.do_unflatten()
    result = extract_list_to_value(cell_tree)
    ordered_cell_source_map = None
    heading_source_map = None
    if with_cell_source_map or with_heading_source_map:
        cell_source_map = extract_list_to_error_path(
            [] if self.root_is_list else [self.root_list_path], cell_tree)
        ordered_items = sorted(cell_source_map.items())
        row_source_map = OrderedDict()
        heading_source_map = OrderedDict()
        for path, _unused in ordered_items:
            cells = cell_source_map[path]
            # Prepare row_source_map key
            key = "/".join(str(x) for x in path[:-1])
            if key not in row_source_map:
                row_source_map[key] = []
            if with_heading_source_map:
                # Prepare heading_source_map key
                header_path_parts = []
                for x in path:
                    try:
                        int(x)
                    except:
                        header_path_parts.append(x)
                header_path = "/".join(header_path_parts)
                if header_path not in heading_source_map:
                    heading_source_map[header_path] = []
            # Populate the row and heading source maps
            for cell in cells:
                sheet, col, row, header = cell
                if (sheet, row) not in row_source_map[key]:
                    row_source_map[key].append((sheet, row))
                if with_heading_source_map:
                    if (sheet, header) not in heading_source_map[header_path]:
                        heading_source_map[header_path].append(
                            (sheet, header))
        if with_cell_source_map:
            ordered_cell_source_map = OrderedDict(
                ("/".join(str(x) for x in path), location)
                for path, location in ordered_items)
            for key in row_source_map:
                assert key not in ordered_cell_source_map, _(
                    "Row/cell collision: {}").format(key)
                ordered_cell_source_map[key] = row_source_map[key]
    return result, ordered_cell_source_map, heading_source_map
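# Illustrative sketch (sheet and heading names made up, assuming the root list
# path is "main" and the root is not a list): the shape of the three values
# returned by fancy_unflatten, as built by the code above.
#
#   result                   -> the unflattened data, e.g. [{"id": "1"}]
#   ordered_cell_source_map  -> {"main/0/id": [("main", "A", 2, "id")],   # cells
#                                "main/0":    [("main", 2)]}              # rows
#   heading_source_map       -> {"main/id":   [("main", "id")]}           # headings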
def write_sheet(self, sheet_name, sheet):
    sheet_header = list(sheet)
    worksheet = self.workbook.create_sheet()
    worksheet.title = self.sheet_prefix + sheet_name
    worksheet.append(sheet_header)
    for sheet_line in sheet.lines:
        line = []
        for header in sheet_header:
            value = sheet_line.get(header)
            if isinstance(value, str):
                new_value = ILLEGAL_CHARACTERS_RE.sub("", value)
                if new_value != value:
                    warn(
                        _("Character(s) in '{}' are not allowed in a spreadsheet cell. Those character(s) will be removed"
                          ).format(value),
                        DataErrorWarning,
                    )
                    value = new_value
            line.append(value)
        worksheet.append(line)
def read_sheets(self):
    try:
        self.workbook = openpyxl.load_workbook(self.input_name,
                                               data_only=True)
    except BadZipFile as e:  # noqa
        # TODO when we have python3 only add 'from e' to show exception chain
        raise BadXLSXZipFile(
            _("The supplied file has extension .xlsx but isn't an XLSX file."))
    self.sheet_names_map = OrderedDict(
        (sheet_name, sheet_name)
        for sheet_name in self.workbook.sheetnames)
    if self.include_sheets:
        for sheet in list(self.sheet_names_map):
            if sheet not in self.include_sheets:
                self.sheet_names_map.pop(sheet)
    for sheet in self.exclude_sheets or []:
        self.sheet_names_map.pop(sheet, None)
    sheet_names = list(sheet for sheet in self.sheet_names_map.keys())
    self.sub_sheet_names = sheet_names
    self.configure_sheets()
def __init__(
    self,
    json_filename=None,
    root_json_dict=None,
    schema_parser=None,
    root_list_path=None,
    root_id="ocid",
    use_titles=False,
    xml=False,
    id_name="id",
    filter_field=None,
    filter_value=None,
    preserve_fields=None,
    remove_empty_schema_columns=False,
    rollup=False,
    truncation_length=3,
):
    self.sub_sheets = {}
    self.main_sheet = Sheet()
    self.root_list_path = root_list_path
    self.root_id = root_id
    self.use_titles = use_titles
    self.truncation_length = truncation_length
    self.id_name = id_name
    self.xml = xml
    self.filter_field = filter_field
    self.filter_value = filter_value
    self.remove_empty_schema_columns = remove_empty_schema_columns
    self.seen_paths = set()

    if schema_parser:
        self.main_sheet = copy.deepcopy(schema_parser.main_sheet)
        self.sub_sheets = copy.deepcopy(schema_parser.sub_sheets)
        if remove_empty_schema_columns:
            # Don't use columns from the schema parser
            # (avoids empty columns)
            self.main_sheet.columns = []
            for sheet_name, sheet in list(self.sub_sheets.items()):
                sheet.columns = []
        self.schema_parser = schema_parser
    else:
        self.schema_parser = None

    self.rollup = False
    if rollup:
        if schema_parser and len(schema_parser.rollup) > 0:
            # If rollUp is present in the schema this takes precedence over direct input.
            self.rollup = schema_parser.rollup
            if isinstance(rollup, (list,)) and (
                    len(rollup) > 1
                    or (len(rollup) == 1 and rollup[0] is not True)):
                warn(
                    _("Using rollUp values from schema, ignoring direct input."
                      ))
        elif isinstance(rollup, (list,)):
            if len(rollup) == 1 and os.path.isfile(rollup[0]):
                # Parse file, one json path per line.
                rollup_from_file = set()
                with open(rollup[0]) as rollup_file:
                    for line in rollup_file:
                        line = line.strip()
                        rollup_from_file.add(line)
                self.rollup = rollup_from_file
            # Rollup args passed directly at the commandline
            elif len(rollup) == 1 and rollup[0] is True:
                warn(
                    _("No fields to rollup found (pass json path directly, as a list in a file, or via a schema)"
                      ))
            else:
                self.rollup = set(rollup)
        else:
            warn(
                _("Invalid value passed for rollup (pass json path directly, as a list in a file, or via a schema)"
                  ))

    if self.xml:
        with codecs.open(json_filename, "rb") as xml_file:
            top_dict = xmltodict.parse(
                xml_file,
                force_list=(root_list_path,),
                force_cdata=True,
            )
            # AFAICT, this should be true for *all* XML files
            assert len(top_dict) == 1
            root_json_dict = list(top_dict.values())[0]
            list_dict_consistency(root_json_dict)
        json_filename = None

    if json_filename is None and root_json_dict is None:
        raise ValueError(
            _("Either json_filename or root_json_dict must be supplied"))

    if json_filename is not None and root_json_dict is not None:
        raise ValueError(
            _("Only one of json_filename or root_json_dict should be supplied"))

    if json_filename:
        with codecs.open(json_filename, encoding="utf-8") as json_file:
            try:
                self.root_json_dict = json.load(
                    json_file,
                    object_pairs_hook=OrderedDict,
                    parse_float=Decimal)
            except UnicodeError as err:
                raise BadlyFormedJSONErrorUTF8(*err.args)
            except ValueError as err:
                raise BadlyFormedJSONError(*err.args)
    else:
        self.root_json_dict = root_json_dict

    if preserve_fields:
        # Extract fields to be preserved from input file (one path per line)
        preserve_fields_all = []
        preserve_fields_input = []
        with open(preserve_fields) as preserve_fields_file:
            for line in preserve_fields_file:
                line = line.strip()
                path_fields = line.rsplit("/", 1)
                preserve_fields_all = (preserve_fields_all + path_fields +
                                       [line.rstrip("/")])
                preserve_fields_input = preserve_fields_input + [
                    line.rstrip("/")
                ]

        self.preserve_fields = set(preserve_fields_all)
        self.preserve_fields_input = set(preserve_fields_input)

        try:
            input_not_in_schema = set()
            for field in self.preserve_fields_input:
                if field not in self.schema_parser.flattened.keys():
                    input_not_in_schema.add(field)
            warn(
                _("You wanted to preserve the following fields which are not present in the supplied schema: {}"
                  ).format(list(input_not_in_schema)))
        except AttributeError:
            # no schema
            pass
    else:
        self.preserve_fields = None
        self.preserve_fields_input = None
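# Hypothetical usage sketch (data made up): constructing the parser from an
# in-memory list rather than a file, then flattening it. Exactly one of
# json_filename and root_json_dict may be given, per the checks above.
#
#   >>> parser = JSONParser(
#   ...     root_json_dict=[{"id": "1", "title": "First"}],
#   ...     root_list_path=None,   # the supplied value is already the root list
#   ...     rollup=["items"],      # direct roll-up paths (no schema, no file)
#   ... )
#   >>> parser.parse()
#   >>> len(parser.main_sheet.lines)
#   1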
def parse_json_dict(
    self,
    json_dict,
    sheet,
    json_key=None,
    parent_name="",
    flattened_dict=None,
    parent_id_fields=None,
    top_level_of_sub_sheet=False,
):
    """
    Parse a json dictionary.

    json_dict - the json dictionary
    sheet - a sheet.Sheet object representing the resulting spreadsheet
    json_key - the key that maps to this JSON dict, either directly to the
               dict, or to a dict that this list contains. Is None if this
               dict is contained in root_json_list directly.
    """
    # Possibly main_sheet should be main_sheet_columns, but this is
    # currently named for consistency with schema.py
    if self.use_titles:
        sheet_key = sheet_key_title
    else:
        sheet_key = sheet_key_field

    parent_id_fields = copy.copy(parent_id_fields) or OrderedDict()
    if flattened_dict is None:
        flattened_dict = {}
        top = True
    else:
        top = False

    if parent_name == "" and self.filter_field and self.filter_value:
        if self.filter_field not in json_dict:
            return
        if json_dict[self.filter_field] != self.filter_value:
            return

    if top_level_of_sub_sheet:
        # Add the IDs for the top level of object in an array
        for k, v in parent_id_fields.items():
            if self.xml:
                flattened_dict[sheet_key(sheet, k)] = v["#text"]
            else:
                flattened_dict[sheet_key(sheet, k)] = v

    if self.root_id and self.root_id in json_dict:
        parent_id_fields[sheet_key(sheet, self.root_id)] = json_dict[self.root_id]

    if self.id_name in json_dict:
        parent_id_fields[sheet_key(sheet, parent_name + self.id_name)] = json_dict[self.id_name]

    for key, value in json_dict.items():
        # Keep a unique list of all the JSON paths in the data that have been seen.
        parent_path = parent_name.replace("/0", "")
        full_path = parent_path + key
        self.seen_paths.add(full_path)

        if self.preserve_fields:
            siblings = False
            for field in self.preserve_fields:
                if parent_path in field:
                    siblings = True
            if siblings and full_path not in self.preserve_fields:
                continue

        if type(value) in BASIC_TYPES:
            if self.xml and key == "#text":
                # Handle the text output from xmltodict
                key = ""
                parent_name = parent_name.strip("/")
            flattened_dict[sheet_key(sheet, parent_name + key)] = value
        elif hasattr(value, "items"):
            self.parse_json_dict(
                value,
                sheet=sheet,
                json_key=key,
                parent_name=parent_name + key + "/",
                flattened_dict=flattened_dict,
                parent_id_fields=parent_id_fields,
            )
        elif hasattr(value, "__iter__"):
            if all(type(x) in BASIC_TYPES for x in value):
                # Check for an array of BASIC types
                # TODO Make this check the schema
                # TODO Error if any of the values contain the separator
                # TODO Support doubly nested arrays
                flattened_dict[sheet_key(sheet, parent_name + key)] = ";".join(
                    map(str, value))
            else:
                if (self.rollup and parent_name == ""
                    ):  # Rollup only currently possible to main sheet
                    if self.use_titles and not self.schema_parser:
                        warn(
                            _("Warning: No schema was provided so column headings are JSON keys, not titles."
                              ))
                    if len(value) == 1:
                        for k, v in value[0].items():
                            if (self.preserve_fields
                                    and parent_name + key + "/" + k
                                    not in self.preserve_fields):
                                continue

                            if type(v) not in BASIC_TYPES:
                                raise ValueError(
                                    _("Rolled up values must be basic types"))
                            else:
                                if self.schema_parser:
                                    # We want titles and there's a schema and rollUp is in it
                                    if (self.use_titles
                                            and parent_name + key + "/0/" + k
                                            in self.schema_parser.main_sheet.titles):
                                        flattened_dict[sheet_key_title(
                                            sheet,
                                            parent_name + key + "/0/" + k)] = v
                                    # We want titles and there's a schema but rollUp isn't in it
                                    # so the titles for rollup properties aren't in the main sheet
                                    # so we need to try to get the titles from a subsheet
                                    elif (self.use_titles
                                            and parent_name + key in self.rollup
                                            and self.schema_parser.sub_sheet_titles.get((
                                                parent_name,
                                                key,
                                            )) in self.schema_parser.sub_sheets):
                                        relevant_subsheet = self.schema_parser.sub_sheets.get(
                                            self.schema_parser.sub_sheet_titles.get((
                                                parent_name,
                                                key,
                                            )))
                                        if relevant_subsheet is not None:
                                            rollup_field_title = sheet_key_title(
                                                relevant_subsheet,
                                                parent_name + key + "/0/" + k,
                                            )
                                            flattened_dict[sheet_key(
                                                sheet, rollup_field_title)] = v
                                    # We don't want titles even though there's a schema
                                    elif not self.use_titles and (
                                            parent_name + key + "/0/" + k
                                            in self.schema_parser.main_sheet
                                            or parent_name + key in self.rollup):
                                        flattened_dict[sheet_key(
                                            sheet,
                                            parent_name + key + "/0/" + k)] = v
                                # No schema, so no titles
                                elif parent_name + key in self.rollup:
                                    flattened_dict[sheet_key(
                                        sheet,
                                        parent_name + key + "/0/" + k)] = v
                    elif len(value) > 1:
                        for k in set(
                                sum((list(x.keys()) for x in value), [])):
                            if (self.preserve_fields
                                    and parent_name + key + "/" + k
                                    not in self.preserve_fields):
                                continue

                            if (self.schema_parser
                                    and parent_name + key + "/0/" + k
                                    in self.schema_parser.main_sheet):
                                warn(
                                    _('More than one value supplied for "{}". Could not provide rollup, so adding a warning to the relevant cell(s) in the spreadsheet.'
                                      ).format(parent_name + key))
                                flattened_dict[sheet_key(
                                    sheet, parent_name + key + "/0/" + k
                                )] = _(
                                    "WARNING: More than one value supplied, consult the relevant sub-sheet for the data."
                                )
                            elif parent_name + key in self.rollup:
                                warn(
                                    _('More than one value supplied for "{}". Could not provide rollup, so adding a warning to the relevant cell(s) in the spreadsheet.'
                                      ).format(parent_name + key))
                                flattened_dict[sheet_key(
                                    sheet, parent_name + key + "/0/" + k
                                )] = _(
                                    "WARNING: More than one value supplied, consult the relevant sub-sheet for the data."
                                )

                if (self.use_titles and self.schema_parser and (
                        parent_name,
                        key,
                ) in self.schema_parser.sub_sheet_titles):
                    sub_sheet_name = self.schema_parser.sub_sheet_titles[(
                        parent_name,
                        key,
                    )]
                else:
                    sub_sheet_name = make_sub_sheet_name(
                        parent_name, key,
                        truncation_length=self.truncation_length)
                if sub_sheet_name not in self.sub_sheets:
                    self.sub_sheets[sub_sheet_name] = Sheet(
                        name=sub_sheet_name)

                for json_dict in value:
                    if json_dict is None:
                        continue
                    self.parse_json_dict(
                        json_dict,
                        sheet=self.sub_sheets[sub_sheet_name],
                        json_key=key,
                        parent_id_fields=parent_id_fields,
                        parent_name=parent_name + key + "/0/",
                        top_level_of_sub_sheet=True,
                    )
        else:
            raise ValueError(_("Unsupported type {}").format(type(value)))

    if top:
        sheet.lines.append(flattened_dict)
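# Illustrative sketch (data made up): which branch of parse_json_dict handles
# each key of one input object, with no schema and default settings.
#
#   {"id": "1",                        # BASIC_TYPES -> written to the main sheet row
#    "tags": ["a", "b"],               # array of basic types -> joined as "a;b"
#    "buyer": {"name": "Org 1"},       # nested dict -> recursed with parent_name "buyer/"
#    "items": [{"name": "Widget"}]}    # array of objects -> parsed into a sub-sheet
#
# The exact column keys depend on sheet_key_field/sheet_key_title and
# make_sub_sheet_name, which are defined elsewhere in the module.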
def unflatten_main_with_parser(parser, line, timezone, xml, id_name):
    unflattened = OrderedDict()
    for path, cell in line.items():
        # Skip blank cells
        if cell.cell_value is None or cell.cell_value == "":
            continue
        current_path = unflattened
        path_list = [item.rstrip("[]") for item in str(path).split("/")]
        for num, path_item in enumerate(path_list):
            if isint(path_item):
                if num == 0:
                    warn(
                        _('Column "{}" has been ignored because it is a number.'
                          ).format(path),
                        DataErrorWarning,
                    )
                continue
            current_type = None
            path_till_now = "/".join(
                [item for item in path_list[:num + 1] if not isint(item)])
            if parser:
                current_type = parser.flattened.get(path_till_now)
            try:
                next_path_item = path_list[num + 1]
            except IndexError:
                next_path_item = ""

            # Quick solution to avoid casting of date as datetime in spreadsheet > xml
            if xml:
                if (type(cell.cell_value) == datetime.datetime
                        and not next_path_item):
                    if "datetime" not in str(path):
                        current_type = "date"

            ## Array
            list_index = -1
            if isint(next_path_item):
                if current_type and current_type != "array":
                    raise ValueError(
                        _("There is an array at '{}' when the schema says there should be a '{}'"
                          ).format(path_till_now, current_type))
                list_index = int(next_path_item)
                current_type = "array"

            if current_type == "array":
                list_as_dict = current_path.get(path_item)
                if list_as_dict is None:
                    list_as_dict = ListAsDict()
                    current_path[path_item] = list_as_dict
                elif type(list_as_dict) is not ListAsDict:
                    warn(
                        _("Column {} has been ignored, because it treats {} as an array, but another column does not."
                          ).format(path, path_till_now),
                        DataErrorWarning,
                    )
                    break
                new_path = list_as_dict.get(list_index)
                if new_path is None:
                    new_path = OrderedDict()
                    list_as_dict[list_index] = new_path
                current_path = new_path
                if not xml or num < len(path_list) - 2:
                    # In xml "arrays" can have text values, if they're the final element
                    # This corresponds to a tag with text, but also possibly attributes
                    continue

            ## Object
            if current_type == "object" or (not current_type and next_path_item):
                new_path = current_path.get(path_item)
                if new_path is None:
                    new_path = OrderedDict()
                    current_path[path_item] = new_path
                elif type(new_path) is ListAsDict or not hasattr(
                        new_path, "items"):
                    warn(
                        _("Column {} has been ignored, because it treats {} as an object, but another column does not."
                          ).format(path, path_till_now),
                        DataErrorWarning,
                    )
                    break
                current_path = new_path
                continue
            if (current_type and current_type not in ["object", "array"]
                    and next_path_item):
                raise ValueError(
                    _("There is an object or list at '{}' but it should be an {}"
                      ).format(path_till_now, current_type))

            ## Other Types
            current_path_value = current_path.get(path_item)
            if not xml and (type(current_path_value) is ListAsDict
                            or hasattr(current_path_value, "items")):
                # ^ xml can have an object/array that also has a text value
                warn(
                    _("Column {} has been ignored, because another column treats it as an array or object"
                      ).format(path_till_now),
                    DataErrorWarning,
                )
                continue

            value = cell.cell_value
            if xml and current_type == "array":
                # In xml "arrays" can have text values, if they're the final element
                # However the type of the text value itself should not be "array",
                # as that would split the text on commas, which we don't want.
                # https://github.com/OpenDataServices/cove/issues/1030
                converted_value = convert_type("", value, timezone)
            else:
                converted_value = convert_type(current_type or "", value, timezone)
            cell.cell_value = converted_value
            if converted_value is not None and converted_value != "":
                if xml:
                    # For XML we want to support text and attributes at the
                    # same level, e.g.
                    # <my-element a="b">some text</my-element>
                    # which we represent in a dict as:
                    # {"@a":"b", "text()": "some text"}
                    # To ensure we can attach attributes everywhere, all
                    # element text must be added as a dict with a `text()` key.
                    if path_item.startswith("@"):
                        current_path[path_item] = cell
                    else:
                        if current_type == "array":
                            current_path["text()"] = cell
                        elif path_item not in current_path:
                            current_path[path_item] = {"text()": cell}
                        else:
                            current_path[path_item]["text()"] = cell
                else:
                    current_path[path_item] = cell
    unflattened = list_as_dicts_to_temporary_dicts(unflattened, id_name, xml)
    return unflattened
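# Illustrative sketch (heading made up): how a column heading becomes a nested
# path in the loop above. Numeric components become indices in a ListAsDict;
# other components become nested OrderedDict keys; the final component holds
# the Cell. list_as_dicts_to_temporary_dicts() then converts the ListAsDict
# layers into TemporaryDicts, which do_unflatten later turns into real lists
# via temporarydicts_to_lists().
#
#   >>> [item.rstrip("[]") for item in "items/0/name".split("/")]
#   ['items', '0', 'name']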
def __init__(
    self,
    schema_filename=None,
    root_schema_dict=None,
    rollup=False,
    root_id=None,
    use_titles=False,
    disable_local_refs=False,
    truncation_length=3,
    exclude_deprecated_fields=False,
):
    self.sub_sheets = {}
    self.main_sheet = Sheet()
    self.sub_sheet_mapping = {}
    self.do_rollup = rollup
    self.rollup = set()
    self.root_id = root_id
    self.use_titles = use_titles
    self.sub_sheet_titles = {}
    self.truncation_length = truncation_length
    self.title_lookup = TitleLookup()
    self.flattened = {}
    self.exclude_deprecated_fields = exclude_deprecated_fields

    if root_schema_dict is None and schema_filename is None:
        raise ValueError(
            _("One of schema_filename or root_schema_dict must be supplied"))
    if root_schema_dict is not None and schema_filename is not None:
        raise ValueError(
            _("Only one of schema_filename or root_schema_dict should be supplied"))
    if schema_filename:
        if schema_filename.startswith("http"):
            import requests
            r = requests.get(schema_filename)
            self.root_schema_dict = jsonref.loads(
                r.text, object_pairs_hook=OrderedDict)
        else:
            if disable_local_refs:
                with codecs.open(schema_filename,
                                 encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(
                        schema_file,
                        object_pairs_hook=OrderedDict,
                        loader=JsonLoaderLocalRefsDisabled(),
                    )
            else:
                if sys.version_info[:2] > (3, 0):
                    base_uri = pathlib.Path(
                        os.path.realpath(schema_filename)).as_uri()
                else:
                    base_uri = urlparse.urljoin(
                        "file:",
                        urllib.pathname2url(
                            os.path.abspath(schema_filename)),
                    )
                with codecs.open(schema_filename,
                                 encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(
                        schema_file,
                        object_pairs_hook=OrderedDict,
                        base_uri=base_uri,
                    )
    else:
        self.root_schema_dict = root_schema_dict
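# Hypothetical usage sketch (file name made up): building template columns from
# a schema. parse() fills main_sheet with one entry per top-level field and
# creates a Sheet in sub_sheets for each array of objects.
#
#   >>> sparser = SchemaParser(schema_filename="release-schema.json",
#   ...                        root_id="ocid", use_titles=False)
#   >>> sparser.parse()
#   >>> list(sparser.main_sheet)    # field paths (or titles, if use_titles=True)
#   >>> sorted(sparser.sub_sheets)  # one sheet name per array of objects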
def parse_schema_dict(
    self,
    parent_path,
    schema_dict,
    parent_id_fields=None,
    title_lookup=None,
    parent_title="",
):
    if parent_path:
        parent_path = parent_path + "/"
    parent_id_fields = parent_id_fields or []
    title_lookup = self.title_lookup if title_lookup is None else title_lookup

    if ("type" in schema_dict and schema_dict["type"] == "array"
            and "items" in schema_dict and "oneOf" in schema_dict["items"]):
        for oneOf in schema_dict["items"]["oneOf"]:
            if "type" in oneOf and oneOf["type"] == "object":
                for field, child_title in self.parse_schema_dict(
                        parent_path,
                        oneOf,
                        parent_id_fields=parent_id_fields,
                        title_lookup=title_lookup,
                        parent_title=parent_title,
                ):
                    yield (field, child_title)
    elif "properties" in schema_dict:
        if "id" in schema_dict["properties"]:
            if self.use_titles:
                id_fields = parent_id_fields + [
                    (parent_title if parent_title is not None else parent_path)
                    + (schema_dict["properties"]["id"].get("title") or "id")
                ]
            else:
                id_fields = parent_id_fields + [parent_path + "id"]
        else:
            id_fields = parent_id_fields

        for property_name, property_schema_dict in schema_dict[
                "properties"].items():
            if self.exclude_deprecated_fields and property_schema_dict.get(
                    "deprecated"):
                continue
            if (self.exclude_deprecated_fields
                    and hasattr(property_schema_dict, "__reference__")
                    and property_schema_dict.__reference__.get("deprecated")):
                continue

            property_type_set = get_property_type_set(property_schema_dict)

            if (hasattr(property_schema_dict, "__reference__")
                    and "title" in property_schema_dict.__reference__):
                title = property_schema_dict.__reference__["title"]
            else:
                title = property_schema_dict.get("title")
            if title:
                title_lookup[title] = TitleLookup()
                title_lookup[title].property_name = property_name

            if "object" in property_type_set:
                self.flattened[parent_path + property_name] = "object"
                for field, child_title in self.parse_schema_dict(
                        parent_path + property_name,
                        property_schema_dict,
                        parent_id_fields=id_fields,
                        title_lookup=title_lookup.get(title),
                        parent_title=parent_title + title + ":"
                        if parent_title is not None and title else None,
                ):
                    yield (
                        property_name + "/" + field,
                        # TODO ambiguous use of "title"
                        (title + ":" + child_title
                         if title and child_title else None),
                    )
            elif "array" in property_type_set:
                flattened_key = parent_path.replace("/0/", "/") + property_name
                self.flattened[flattened_key] = "array"
                type_set = get_property_type_set(property_schema_dict["items"])
                if "string" in type_set or not type_set:
                    self.flattened[flattened_key] = "string_array"
                    yield property_name, title
                elif "number" in type_set:
                    self.flattened[flattened_key] = "number_array"
                    yield property_name, title
                elif "array" in type_set:
                    self.flattened[flattened_key] = "array_array"
                    nested_type_set = get_property_type_set(
                        property_schema_dict["items"]["items"])
                    if "string" in nested_type_set or "number" in nested_type_set:
                        yield property_name, title
                    else:
                        raise ValueError
                elif "object" in type_set:
                    if title:
                        title_lookup[title].property_name = property_name
                    if self.use_titles and parent_title is not None:
                        sub_sheet_name = make_sub_sheet_name(
                            parent_title,
                            title or property_name,
                            truncation_length=self.truncation_length,
                            path_separator=":",
                        )
                        self.sub_sheet_titles[(
                            parent_path,
                            property_name,
                        )] = sub_sheet_name
                    else:
                        sub_sheet_name = make_sub_sheet_name(
                            parent_path,
                            property_name,
                            truncation_length=self.truncation_length)
                    # self.sub_sheet_mapping[parent_name+'/'+property_name] = sub_sheet_name
                    if sub_sheet_name not in self.sub_sheets:
                        self.sub_sheets[sub_sheet_name] = Sheet(
                            root_id=self.root_id, name=sub_sheet_name)
                    sub_sheet = self.sub_sheets[sub_sheet_name]
                    sub_sheet.title_lookup = title_lookup.get(title)

                    for field in id_fields:
                        sub_sheet.add_field(field, id_field=True)
                        sub_sheet.titles[title_lookup.lookup_header(
                            field)] = field
                    fields = self.parse_schema_dict(
                        parent_path + property_name + "/0",
                        property_schema_dict["items"],
                        parent_id_fields=id_fields,
                        title_lookup=title_lookup.get(title),
                        parent_title=parent_title + title + ":"
                        if parent_title is not None and title else None,
                    )

                    rollup_fields = set()
                    for field, child_title in fields:
                        full_path = parent_path + property_name + "/0/" + field
                        if self.use_titles:
                            if not child_title or parent_title is None:
                                warn(
                                    _("Field {}{}/0/{} is missing a title, skipping."
                                      ).format(parent_path, property_name,
                                               field))
                            elif not title:
                                warn(
                                    _("Field {}{} does not have a title, skipping it and all its children."
                                      ).format(parent_path, property_name))
                            else:
                                # This code only works for arrays that are at 0 or 1 layer of nesting
                                full_title = (parent_title + title + ":" +
                                              child_title)
                                sub_sheet.add_field(full_title)
                                sub_sheet.titles[full_path] = full_title
                        else:
                            sub_sheet.add_field(full_path)
                        if (self.do_rollup and "rollUp" in property_schema_dict
                                and field in property_schema_dict["rollUp"]):
                            rollup_fields.add(field)
                            self.rollup.add(full_path)
                        yield property_name + "/0/" + field, (
                            title + ":" + child_title
                            if title and child_title else None)

                    # Check that all items in rollUp are in the schema
                    if self.do_rollup and "rollUp" in property_schema_dict:
                        missedRollUp = (set(property_schema_dict["rollUp"]) -
                                        rollup_fields)
                        if missedRollUp:
                            warn("{} in rollUp but not in schema".format(
                                ", ".join(missedRollUp)))
                else:
                    raise ValueError(
                        _('Unknown type_set: {}, did you forget to explicitly set the "type" key on "items"?'
                          ).format(type_set))
            elif "string" in property_type_set or not property_type_set:
                # We only check for date here, because it's the only format
                # for which we need to specially transform the input
                if property_schema_dict.get("format") == "date":
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "date"
                else:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "string"
                yield property_name, title
            elif "number" in property_type_set:
                self.flattened[parent_path.replace("/0/", "/") +
                               property_name] = "number"
                yield property_name, title
            elif "integer" in property_type_set:
                self.flattened[parent_path.replace("/0/", "/") +
                               property_name] = "integer"
                yield property_name, title
            elif "boolean" in property_type_set:
                self.flattened[parent_path.replace("/0/", "/") +
                               property_name] = "boolean"
                yield property_name, title
            else:
                warn(
                    _('Unrecognised types {} for property "{}" with context "{}", '
                      "so this property has been ignored.").format(
                          repr(property_type_set), property_name, parent_path))
    else:
        warn(
            _('Skipping field "{}", because it has no properties.').format(
                parent_path))
def merge(base, mergee, debug_info=None):
    if not debug_info:
        debug_info = {}
    for key, v in mergee.items():
        if isinstance(v, Cell):
            value = v.cell_value
        else:
            value = v
        if key in base:
            if isinstance(value, TemporaryDict):
                if not isinstance(base[key], TemporaryDict):
                    warnings_for_ignored_columns(
                        v,
                        _("because it treats {} as an array, but another column does not"
                          ).format(key),
                    )
                    continue
                for temporarydict_key, temporarydict_value in value.items():
                    if temporarydict_key in base[key]:
                        merge(
                            base[key][temporarydict_key],
                            temporarydict_value,
                            debug_info,
                        )
                    else:
                        assert temporarydict_key not in base[key], _(
                            "Overwriting cell {} by mistake").format(
                                temporarydict_value)
                        base[key][temporarydict_key] = temporarydict_value
                for temporarydict_value in value.items_no_keyfield:
                    base[key].items_no_keyfield.append(temporarydict_value)
            elif isinstance(value, dict):
                if isinstance(base[key], dict):
                    merge(base[key], value, debug_info)
                else:
                    warnings_for_ignored_columns(
                        v,
                        _("because it treats {} as an object, but another column does not"
                          ).format(key),
                    )
            else:
                if not isinstance(base[key], Cell):
                    id_info = '{} "{}"'.format(
                        debug_info.get("id_name"),
                        debug_info.get(debug_info.get("id_name")),
                    )
                    if debug_info.get("root_id"):
                        id_info = ('{} "{}", '.format(
                            debug_info.get("root_id"),
                            debug_info.get("root_id_or_none"),
                        ) + id_info)
                    warnings_for_ignored_columns(
                        v,
                        _("because another column treats it as an array or object"
                          ))
                    continue
                base_value = base[key].cell_value
                if base_value != value:
                    id_info = '{} "{}"'.format(
                        debug_info.get("id_name"),
                        debug_info.get(debug_info.get("id_name")),
                    )
                    if debug_info.get("root_id"):
                        id_info = ('{} "{}", '.format(
                            debug_info.get("root_id"),
                            debug_info.get("root_id_or_none"),
                        ) + id_info)
                    warn(
                        _('You may have a duplicate Identifier: We couldn\'t merge these rows with the {}: field "{}" in sheet "{}": one cell has the value: "{}", the other cell has the value: "{}"'
                          ).format(
                              id_info,
                              key,
                              debug_info.get("sheet_name"),
                              base_value,
                              value,
                          ),
                        DataErrorWarning,
                    )
                else:
                    base[key].sub_cells.append(v)
        else:
            # This happens when a parent record finds the first child record of a known type
            base[key] = v
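# Illustrative sketch (values made up): merging two unflattened rows that share
# an id. Assumes the module's Cell(cell_value, cell_location) constructor and
# sub_cells attribute, as used elsewhere in this file.
#
#   >>> base = {"id": Cell("1", ("main", "A", 2, "id"))}
#   >>> mergee = {"id": Cell("1", ("other", "A", 3, "id"))}
#   >>> merge(base, mergee)
#   >>> base["id"].sub_cells[0].cell_location
#   ('other', 'A', 3, 'id')
#
# If the two cell values differed, merge() would instead emit the
# "You may have a duplicate Identifier" DataErrorWarning and keep the base value.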
def do_unflatten(self):
    main_sheet_by_ocid = OrderedDict()
    sheets = list(self.get_sub_sheets_lines())
    for i, sheet in enumerate(sheets):
        sheet_name, lines = sheet
        try:
            actual_headings = self.get_sheet_headings(sheet_name)
            # If sheet is empty or too many lines have been skipped
            if not actual_headings:
                continue
            found = OrderedDict()
            last_col = len(actual_headings)
            # We want to ignore data in earlier columns, so we look
            # through the data backwards
            for i, actual_heading in enumerate(reversed(actual_headings)):
                if actual_heading is None:
                    continue
                if actual_heading in found:
                    found[actual_heading].append((last_col - i) - 1)
                else:
                    found[actual_heading] = [i]
            for actual_heading in reversed(found):
                if len(found[actual_heading]) > 1:
                    keeping = found[actual_heading][0]  # noqa
                    ignoring = found[actual_heading][1:]
                    ignoring.reverse()
                    if len(ignoring) >= 3:
                        warn(
                            (_('Duplicate heading "{}" found, ignoring '
                               'the data in columns {} and {} (sheet: "{}").'
                               )).format(
                                   actual_heading,
                                   ", ".join([
                                       _get_column_letter(x + 1)
                                       for x in ignoring[:-1]
                                   ]),
                                   _get_column_letter(ignoring[-1] + 1),
                                   sheet_name,
                               ),
                            DataErrorWarning,
                        )
                    elif len(found[actual_heading]) == 3:
                        warn(
                            (_('Duplicate heading "{}" found, ignoring '
                               'the data in columns {} and {} (sheet: "{}").'
                               )).format(
                                   actual_heading,
                                   _get_column_letter(ignoring[0] + 1),
                                   _get_column_letter(ignoring[1] + 1),
                                   sheet_name,
                               ),
                            DataErrorWarning,
                        )
                    else:
                        warn(
                            (_('Duplicate heading "{}" found, ignoring '
                               'the data in column {} (sheet: "{}").')).format(
                                   actual_heading,
                                   _get_column_letter(ignoring[0] + 1),
                                   sheet_name,
                               ),
                            DataErrorWarning,
                        )
        except NotImplementedError:
            # The ListInput type used in the tests doesn't support getting headings.
            actual_headings = None
        for j, line in enumerate(lines):
            if all(x is None or x == "" for x in line.values()):
                # if all(x == '' for x in line.values()):
                continue
            root_id_or_none = line.get(self.root_id) if self.root_id else None
            cells = OrderedDict()
            for k, header in enumerate(line):
                heading = actual_headings[k] if actual_headings else header
                if self.vertical_orientation:
                    # This is misleading as it specifies the row number as the
                    # distance vertically and the horizontal 'letter' as a number.
                    # https://github.com/OpenDataServices/flatten-tool/issues/153
                    cells[header] = Cell(
                        line[header], (sheet_name, str(k + 1), j + 2, heading))
                else:
                    cells[header] = Cell(
                        line[header],
                        (sheet_name, _get_column_letter(k + 1), j + 2, heading),
                    )
            unflattened = unflatten_main_with_parser(
                self.parser, cells, self.timezone, self.xml, self.id_name)
            if root_id_or_none not in main_sheet_by_ocid:
                main_sheet_by_ocid[root_id_or_none] = TemporaryDict(
                    self.id_name, xml=self.xml)

            def inthere(unflattened, id_name):
                if self.xml and not isinstance(
                        unflattened.get(self.id_name), Cell):
                    # For an XML tag
                    return unflattened[id_name]["text()"].cell_value
                else:
                    # For a JSON, or an XML attribute
                    return unflattened[id_name].cell_value

            if (self.id_name in unflattened
                    and inthere(unflattened, self.id_name)
                    in main_sheet_by_ocid[root_id_or_none]):
                if self.xml and not isinstance(
                        unflattened.get(self.id_name), Cell):
                    unflattened_id = unflattened.get(
                        self.id_name)["text()"].cell_value
                else:
                    unflattened_id = unflattened.get(self.id_name).cell_value
                merge(
                    main_sheet_by_ocid[root_id_or_none][unflattened_id],
                    unflattened,
                    {
                        "sheet_name": sheet_name,
                        "root_id": self.root_id,
                        "root_id_or_none": root_id_or_none,
                        "id_name": self.id_name,
                        self.id_name: unflattened_id,
                    },
                )
            else:
                main_sheet_by_ocid[root_id_or_none].append(unflattened)
    temporarydicts_to_lists(main_sheet_by_ocid)
    return sum(main_sheet_by_ocid.values(), [])
def convert_type(type_string, value, timezone=pytz.timezone("UTC")):
    if value == "" or value is None:
        return None
    if type_string == "number":
        try:
            return Decimal(value)
        except (TypeError, ValueError, InvalidOperation):
            warn(
                _('Non-numeric value "{}" found in number column, returning as string instead.'
                  ).format(value),
                DataErrorWarning,
            )
            return str(value)
    elif type_string == "integer":
        try:
            return int(value)
        except (TypeError, ValueError):
            warn(
                _('Non-integer value "{}" found in integer column, returning as string instead.'
                  ).format(value),
                DataErrorWarning,
            )
            return str(value)
    elif type_string == "boolean":
        value = str(value)
        if value.lower() in ["true", "1"]:
            return True
        elif value.lower() in ["false", "0"]:
            return False
        else:
            warn(
                _('Unrecognised value for boolean: "{}", returning as string instead'
                  ).format(value),
                DataErrorWarning,
            )
            return str(value)
    elif type_string in ("array", "array_array", "string_array", "number_array"):
        value = str(value)
        if type_string == "number_array":
            try:
                if "," in value:
                    return [[Decimal(y) for y in x.split(",")]
                            for x in value.split(";")]
                else:
                    return [Decimal(x) for x in value.split(";")]
            except (TypeError, ValueError, InvalidOperation):
                warn(
                    _('Non-numeric value "{}" found in number array column, returning as string array instead.'
                      ).format(value),
                    DataErrorWarning,
                )
        if "," in value:
            return [x.split(",") for x in value.split(";")]
        else:
            return value.split(";")
    elif type_string == "string":
        if type(value) == datetime.datetime:
            return timezone.localize(value).isoformat()
        return str(value)
    elif type_string == "date":
        if type(value) == datetime.datetime:
            return value.date().isoformat()
        return str(value)
    elif type_string == "":
        if type(value) == datetime.datetime:
            return timezone.localize(value).isoformat()
        if type(value) == float and int(value) == value:
            return int(value)
        return value if type(value) in [int] else str(value)
    else:
        raise ValueError('Unrecognised type: "{}"'.format(type_string))
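# Illustrative sketch: a few conversions performed by convert_type, following
# the branches above (";" separates array items, "," separates nested items).
#
#   >>> import datetime
#   >>> convert_type("integer", "7")
#   7
#   >>> convert_type("boolean", "TRUE")
#   True
#   >>> convert_type("string_array", "a;b")
#   ['a', 'b']
#   >>> convert_type("number_array", "1;2.5")
#   [Decimal('1'), Decimal('2.5')]
#   >>> convert_type("date", datetime.datetime(2020, 1, 2, 3, 4))
#   '2020-01-02'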