Code Example #1
File: input.py Project: Xtuden-com/flatten-tool
def extract_dict_to_error_path(path, input):
    output = {}
    for k in input:
        if isinstance(input[k], list):
            res = extract_list_to_error_path(path + [k], input[k])
            for p in res:
                assert p not in output, _("Already have key {}").format(p)
                output[p] = res[p]
        elif isinstance(input[k], dict):
            res = extract_dict_to_error_path(path + [k], input[k])
            for p in res:
                assert p not in output, _("Already have key {}").format(p)
                output[p] = res[p]
        elif isinstance(input[k], Cell):
            p = tuple(path + [k])
            assert p not in output, _("Already have key {}").format(p)
            output[p] = [input[k].cell_location]
            for sub_cell in input[k].sub_cells:
                assert sub_cell.cell_value == input[k].cell_value, _(
                    "Two sub-cells have different values: {}, {}").format(
                        input[k].cell_value, sub_cell.cell_value)
                output[p].append(sub_cell.cell_location)
        else:
            raise Exception(
                _("Unexpected result type in the JSON cell tree: {}").format(
                    input[k]))
    return output
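
A minimal usage sketch, assuming the function above is pasted alongside the stand-ins below; Cell and _ here are hypothetical placeholders for the real Cell class and gettext alias in input.py:

class Cell:  # hypothetical stand-in: only the attributes the function reads
    def __init__(self, cell_value, cell_location):
        self.cell_value = cell_value
        self.cell_location = cell_location
        self.sub_cells = []

_ = lambda s: s  # stand-in for the module's gettext alias

tree = {"buyer": {"name": Cell("Org", ("main", "B", 2, "buyer/name"))}}
print(extract_dict_to_error_path([], tree))
# {('buyer', 'name'): [('main', 'B', 2, 'buyer/name')]}
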
Code Example #2
    def parse(self):
        if self.root_list_path is None:
            root_json_list = self.root_json_dict
        else:
            root_json_list = path_search(self.root_json_dict,
                                         self.root_list_path.split("/"))
        for json_dict in root_json_list:
            if json_dict is None:
                # This is particularly useful for IATI XML, in order to not
                # fall over on an empty activity, e.g. <iati-activity/>
                continue
            self.parse_json_dict(json_dict, sheet=self.main_sheet)

        if self.remove_empty_schema_columns:
            # Remove sheets with no lines of data
            for sheet_name, sheet in list(self.sub_sheets.items()):
                if not sheet.lines:
                    del self.sub_sheets[sheet_name]

        if self.preserve_fields_input:
            nonexistent_input_paths = []
            for field in self.preserve_fields_input:
                if field not in self.seen_paths:
                    nonexistent_input_paths.append(field)
            if len(nonexistent_input_paths) > 0:
                warn(
                    _("You wanted to preserve the following fields which are not present in the input data: {}"
                      ).format(nonexistent_input_paths))
Code Example #3
File: output.py Project: Xtuden-com/flatten-tool
    def write_sheet(self, sheet_name, sheet):

        worksheet = odf.table.Table(name=sheet_name)
        sheet_header = list(sheet)

        header_row = odf.table.TableRow()

        for header in sheet_header:
            header_row.addElement(self._make_cell(header))

        worksheet.addElement(header_row)

        for sheet_line in sheet.lines:
            row = odf.table.TableRow()
            for header in sheet_header:
                value = sheet_line.get(header)
                if isinstance(value, str):
                    new_value = ILLEGAL_CHARACTERS_RE.sub("", value)
                    if new_value != value:
                        warn(
                            _("Character(s) in '{}' are not allowed in a spreadsheet cell. Those character(s) will be removed"
                              ).format(value),
                            DataErrorWarning,
                        )
                    value = new_value
                row.addElement(self._make_cell(value))
            worksheet.addElement(row)

        self.workbook.spreadsheet.addElement(worksheet)
Code Example #4
File: input.py Project: Xtuden-com/flatten-tool
def extract_list_to_error_path(path, input):
    output = {}
    for i, item in enumerate(input):
        res = extract_dict_to_error_path(path + [i], item)
        for p in res:
            assert p not in output, _("Already have key {}").format(p)
            output[p] = res[p]
    return output
Code Example #5
 def parse(self):
     fields = self.parse_schema_dict("", self.root_schema_dict)
     for field, title in fields:
         if self.use_titles:
             if not title:
                 warn(_("Field {} does not have a title, skipping.").format(field))
             else:
                 self.main_sheet.append(title)
                 self.main_sheet.titles[field] = title
         else:
             self.main_sheet.append(field)
Code Example #6
File: input.py Project: Xtuden-com/flatten-tool
def extract_dict_to_value(input):
    output = OrderedDict()
    for k in input:
        if isinstance(input[k], list):
            output[k] = extract_list_to_value(input[k])
        elif isinstance(input[k], dict):
            output[k] = extract_dict_to_value(input[k])
        elif isinstance(input[k], Cell):
            output[k] = input[k].cell_value
        else:
            raise Exception(
                _("Unexpected result type in the JSON cell tree: {}").format(
                    input[k]))
    return output
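
A minimal usage sketch; the Cell class below is a hypothetical stand-in defined next to the function (only cell_value is read here):

from collections import OrderedDict

class Cell:  # hypothetical stand-in
    def __init__(self, cell_value):
        self.cell_value = cell_value
        self.sub_cells = []

tree = {"id": Cell("1"), "buyer": {"name": Cell("Org")}}
print(extract_dict_to_value(tree))
# OrderedDict([('id', '1'), ('buyer', OrderedDict([('name', 'Org')]))])
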
Code Example #7
File: input.py Project: Xtuden-com/flatten-tool
 def fancy_unflatten(self, with_cell_source_map, with_heading_source_map):
     cell_tree = self.do_unflatten()
     result = extract_list_to_value(cell_tree)
     ordered_cell_source_map = None
     heading_source_map = None
     if with_cell_source_map or with_heading_source_map:
         cell_source_map = extract_list_to_error_path(
             [] if self.root_is_list else [self.root_list_path], cell_tree)
         ordered_items = sorted(cell_source_map.items())
         row_source_map = OrderedDict()
         heading_source_map = OrderedDict()
         for path, _unused in ordered_items:
             cells = cell_source_map[path]
             # Prepare row_source_map key
             key = "/".join(str(x) for x in path[:-1])
             if key not in row_source_map:
                 row_source_map[key] = []
             if with_heading_source_map:
                 # Prepare heading_source_map key
                 header_path_parts = []
                 for x in path:
                     try:
                         int(x)
                     except (TypeError, ValueError):
                         header_path_parts.append(x)
                 header_path = "/".join(header_path_parts)
                 if header_path not in heading_source_map:
                     heading_source_map[header_path] = []
             # Populate the row and header source maps
             for cell in cells:
                 sheet, col, row, header = cell
                 if (sheet, row) not in row_source_map[key]:
                     row_source_map[key].append((sheet, row))
                 if with_heading_source_map:
                     if (sheet,
                             header) not in heading_source_map[header_path]:
                         heading_source_map[header_path].append(
                             (sheet, header))
     if with_cell_source_map:
         ordered_cell_source_map = OrderedDict(
             ("/".join(str(x) for x in path), location)
             for path, location in ordered_items)
         for key in row_source_map:
             assert key not in ordered_cell_source_map, _(
                 "Row/cell collision: {}").format(key)
             ordered_cell_source_map[key] = row_source_map[key]
     return result, ordered_cell_source_map, heading_source_map
Code Example #8
File: output.py Project: Xtuden-com/flatten-tool
 def write_sheet(self, sheet_name, sheet):
     sheet_header = list(sheet)
     worksheet = self.workbook.create_sheet()
     worksheet.title = self.sheet_prefix + sheet_name
     worksheet.append(sheet_header)
     for sheet_line in sheet.lines:
         line = []
         for header in sheet_header:
             value = sheet_line.get(header)
             if isinstance(value, str):
                 new_value = ILLEGAL_CHARACTERS_RE.sub("", value)
                 if new_value != value:
                     warn(
                         _("Character(s) in '{}' are not allowed in a spreadsheet cell. Those character(s) will be removed"
                           ).format(value),
                         DataErrorWarning,
                     )
                 value = new_value
             line.append(value)
         worksheet.append(line)
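
The character-cleaning step can be tried in isolation. This sketch assumes ILLEGAL_CHARACTERS_RE is openpyxl's regex of characters that cannot be written to a worksheet cell, which appears to be what this module uses:

from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

raw = "bad\x08value"  # contains a control character that XLSX cells reject
print(ILLEGAL_CHARACTERS_RE.sub("", raw))  # "badvalue"
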
Code Example #9
File: input.py Project: Xtuden-com/flatten-tool
    def read_sheets(self):
        try:
            self.workbook = openpyxl.load_workbook(self.input_name,
                                                   data_only=True)
        except BadZipFile as e:  # noqa
            # TODO when we are Python 3 only, add 'from e' to show the exception chain
            raise BadXLSXZipFile(
                _("The supplied file has extension .xlsx but isn't an XLSX file."
                  ))

        self.sheet_names_map = OrderedDict(
            (sheet_name, sheet_name)
            for sheet_name in self.workbook.sheetnames)
        if self.include_sheets:
            for sheet in list(self.sheet_names_map):
                if sheet not in self.include_sheets:
                    self.sheet_names_map.pop(sheet)
        for sheet in self.exclude_sheets or []:
            self.sheet_names_map.pop(sheet, None)

        sheet_names = list(sheet for sheet in self.sheet_names_map.keys())
        self.sub_sheet_names = sheet_names
        self.configure_sheets()
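
A small sketch of the include/exclude filtering applied to the workbook's sheet names, using plain data instead of an openpyxl workbook (the settings are hypothetical):

from collections import OrderedDict

sheetnames = ["main", "tags", "notes"]
include_sheets = ["main", "tags"]
exclude_sheets = ["tags"]

sheet_names_map = OrderedDict((name, name) for name in sheetnames)
if include_sheets:
    for sheet in list(sheet_names_map):
        if sheet not in include_sheets:
            sheet_names_map.pop(sheet)
for sheet in exclude_sheets or []:
    sheet_names_map.pop(sheet, None)

print(list(sheet_names_map))  # ['main']
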
Code Example #10
    def __init__(
        self,
        json_filename=None,
        root_json_dict=None,
        schema_parser=None,
        root_list_path=None,
        root_id="ocid",
        use_titles=False,
        xml=False,
        id_name="id",
        filter_field=None,
        filter_value=None,
        preserve_fields=None,
        remove_empty_schema_columns=False,
        rollup=False,
        truncation_length=3,
    ):
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.root_list_path = root_list_path
        self.root_id = root_id
        self.use_titles = use_titles
        self.truncation_length = truncation_length
        self.id_name = id_name
        self.xml = xml
        self.filter_field = filter_field
        self.filter_value = filter_value
        self.remove_empty_schema_columns = remove_empty_schema_columns
        self.seen_paths = set()

        if schema_parser:
            self.main_sheet = copy.deepcopy(schema_parser.main_sheet)
            self.sub_sheets = copy.deepcopy(schema_parser.sub_sheets)
            if remove_empty_schema_columns:
                # Don't use columns from the schema parser
                # (avoids empty columns)
                self.main_sheet.columns = []
                for sheet_name, sheet in list(self.sub_sheets.items()):
                    sheet.columns = []
            self.schema_parser = schema_parser
        else:
            self.schema_parser = None

        self.rollup = False
        if rollup:
            if schema_parser and len(schema_parser.rollup) > 0:
                # If rollUp is present in the schema this takes precedence over direct input.
                self.rollup = schema_parser.rollup
                if isinstance(rollup,
                              (list, )) and (len(rollup) > 1 or
                                             (len(rollup) == 1
                                              and rollup[0] is not True)):
                    warn(
                        _("Using rollUp values from schema, ignoring direct input."
                          ))
            elif isinstance(rollup, (list, )):
                if len(rollup) == 1 and os.path.isfile(rollup[0]):
                    # Parse file, one json path per line.
                    rollup_from_file = set()
                    with open(rollup[0]) as rollup_file:
                        for line in rollup_file:
                            line = line.strip()
                            rollup_from_file.add(line)
                    self.rollup = rollup_from_file
                    # Rollup args passed directly on the command line
                elif len(rollup) == 1 and rollup[0] is True:
                    warn(
                        _("No fields to rollup found (pass json path directly, as a list in a file, or via a schema)"
                          ))
                else:
                    self.rollup = set(rollup)
            else:
                warn(
                    _("Invalid value passed for rollup (pass json path directly, as a list in a file, or via a schema)"
                      ))

        if self.xml:
            with codecs.open(json_filename, "rb") as xml_file:
                top_dict = xmltodict.parse(
                    xml_file,
                    force_list=(root_list_path, ),
                    force_cdata=True,
                )
                # AFAICT, this should be true for *all* XML files
                assert len(top_dict) == 1
                root_json_dict = list(top_dict.values())[0]
                list_dict_consistency(root_json_dict)
            json_filename = None

        if json_filename is None and root_json_dict is None:
            raise ValueError(
                _("Etiher json_filename or root_json_dict must be supplied"))

        if json_filename is not None and root_json_dict is not None:
            raise ValueError(
                _("Only one of json_file or root_json_dict should be supplied")
            )

        if json_filename:
            with codecs.open(json_filename, encoding="utf-8") as json_file:
                try:
                    self.root_json_dict = json.load(
                        json_file,
                        object_pairs_hook=OrderedDict,
                        parse_float=Decimal)
                except UnicodeError as err:
                    raise BadlyFormedJSONErrorUTF8(*err.args)
                except ValueError as err:
                    raise BadlyFormedJSONError(*err.args)
        else:
            self.root_json_dict = root_json_dict

        if preserve_fields:
            # Extract fields to be preserved from input file (one path per line)
            preserve_fields_all = []
            preserve_fields_input = []
            with open(preserve_fields) as preserve_fields_file:
                for line in preserve_fields_file:
                    line = line.strip()
                    path_fields = line.rsplit("/", 1)
                    preserve_fields_all = (preserve_fields_all + path_fields +
                                           [line.rstrip("/")])
                    preserve_fields_input = preserve_fields_input + [
                        line.rstrip("/")
                    ]

            self.preserve_fields = set(preserve_fields_all)
            self.preserve_fields_input = set(preserve_fields_input)

            try:
                input_not_in_schema = set()
                for field in self.preserve_fields_input:
                    if field not in self.schema_parser.flattened.keys():
                        input_not_in_schema.add(field)
                warn(
                    _("You wanted to preserve the following fields which are not present in the supplied schema: {}"
                      ).format(list(input_not_in_schema)))
            except AttributeError:
                # no schema
                pass
        else:
            self.preserve_fields = None
            self.preserve_fields_input = None
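
A small illustration of how each line of the preserve-fields file is expanded: rsplit("/", 1) keeps the parent path and the final segment, and the full path is added as well:

line = "buyer/name"
path_fields = line.rsplit("/", 1)            # ['buyer', 'name']
preserve_fields = set(path_fields + [line.rstrip("/")])
print(preserve_fields)                       # {'buyer', 'name', 'buyer/name'}
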
Code Example #11
    def parse_json_dict(
        self,
        json_dict,
        sheet,
        json_key=None,
        parent_name="",
        flattened_dict=None,
        parent_id_fields=None,
        top_level_of_sub_sheet=False,
    ):
        """
        Parse a json dictionary.

        json_dict - the json dictionary
        sheet - a sheet.Sheet object representing the resulting spreadsheet
        json_key - the key that maps to this JSON dict, either directly to the dict, or to a dict that this list contains.  Is None if this dict is contained in root_json_list directly.
        """
        # Possibly main_sheet should be main_sheet_columns, but this is
        # currently named for consistency with schema.py

        if self.use_titles:
            sheet_key = sheet_key_title
        else:
            sheet_key = sheet_key_field

        parent_id_fields = copy.copy(parent_id_fields) or OrderedDict()
        if flattened_dict is None:
            flattened_dict = {}
            top = True
        else:
            top = False

        if parent_name == "" and self.filter_field and self.filter_value:
            if self.filter_field not in json_dict:
                return
            if json_dict[self.filter_field] != self.filter_value:
                return

        if top_level_of_sub_sheet:
            # Add the IDs for the top-level object in an array
            for k, v in parent_id_fields.items():
                if self.xml:
                    flattened_dict[sheet_key(sheet, k)] = v["#text"]
                else:
                    flattened_dict[sheet_key(sheet, k)] = v

        if self.root_id and self.root_id in json_dict:
            parent_id_fields[sheet_key(sheet,
                                       self.root_id)] = json_dict[self.root_id]

        if self.id_name in json_dict:
            parent_id_fields[sheet_key(sheet, parent_name +
                                       self.id_name)] = json_dict[self.id_name]

        for key, value in json_dict.items():

            # Keep a unique list of all the JSON paths in the data that have been seen.
            parent_path = parent_name.replace("/0", "")
            full_path = parent_path + key
            self.seen_paths.add(full_path)

            if self.preserve_fields:

                siblings = False
                for field in self.preserve_fields:
                    if parent_path in field:
                        siblings = True
                if siblings and full_path not in self.preserve_fields:
                    continue

            if type(value) in BASIC_TYPES:
                if self.xml and key == "#text":
                    # Handle the text output from xmltodict
                    key = ""
                    parent_name = parent_name.strip("/")
                flattened_dict[sheet_key(sheet, parent_name + key)] = value
            elif hasattr(value, "items"):
                self.parse_json_dict(
                    value,
                    sheet=sheet,
                    json_key=key,
                    parent_name=parent_name + key + "/",
                    flattened_dict=flattened_dict,
                    parent_id_fields=parent_id_fields,
                )
            elif hasattr(value, "__iter__"):
                if all(type(x) in BASIC_TYPES for x in value):
                    # Check for an array of BASIC types
                    # TODO Make this check the schema
                    # TODO Error if any of the values contain the separator
                    # TODO Support doubly nested arrays
                    flattened_dict[sheet_key(sheet,
                                             parent_name + key)] = ";".join(
                                                 map(str, value))
                else:
                    if (self.rollup and parent_name == ""
                        ):  # Rollup only currently possible to main sheet

                        if self.use_titles and not self.schema_parser:
                            warn(
                                _("Warning: No schema was provided so column headings are JSON keys, not titles."
                                  ))

                        if len(value) == 1:
                            for k, v in value[0].items():

                                if (self.preserve_fields
                                        and parent_name + key + "/" + k
                                        not in self.preserve_fields):
                                    continue

                                if type(v) not in BASIC_TYPES:
                                    raise ValueError(
                                        _("Rolled up values must be basic types"
                                          ))
                                else:
                                    if self.schema_parser:
                                        # We want titles and there's a schema and rollUp is in it
                                        if (self.use_titles
                                                and parent_name + key + "/0/" +
                                                k in self.schema_parser.
                                                main_sheet.titles):
                                            flattened_dict[sheet_key_title(
                                                sheet, parent_name + key +
                                                "/0/" + k)] = v

                                        # We want titles and there's a schema but rollUp isn't in it
                                        # so the titles for rollup properties aren't in the main sheet
                                        # so we need to try to get the titles from a subsheet
                                        elif (self.use_titles and
                                              parent_name + key in self.rollup
                                              and self.schema_parser.
                                              sub_sheet_titles.get((
                                                  parent_name,
                                                  key,
                                              )) in
                                              self.schema_parser.sub_sheets):
                                            relevant_subsheet = self.schema_parser.sub_sheets.get(
                                                self.schema_parser.
                                                sub_sheet_titles.get((
                                                    parent_name,
                                                    key,
                                                )))
                                            if relevant_subsheet is not None:
                                                rollup_field_title = sheet_key_title(
                                                    relevant_subsheet,
                                                    parent_name + key + "/0/" +
                                                    k,
                                                )
                                                flattened_dict[sheet_key(
                                                    sheet,
                                                    rollup_field_title)] = v

                                        # We don't want titles even though there's a schema
                                        elif not self.use_titles and (
                                                parent_name + key + "/0/" + k
                                                in self.schema_parser.
                                                main_sheet or parent_name + key
                                                in self.rollup):
                                            flattened_dict[sheet_key(
                                                sheet, parent_name + key +
                                                "/0/" + k)] = v

                                    # No schema, so no titles
                                    elif parent_name + key in self.rollup:
                                        flattened_dict[sheet_key(
                                            sheet,
                                            parent_name + key + "/0/" + k)] = v

                        elif len(value) > 1:
                            for k in set(
                                    sum((list(x.keys()) for x in value), [])):

                                if (self.preserve_fields
                                        and parent_name + key + "/" + k
                                        not in self.preserve_fields):
                                    continue

                                if (self.schema_parser
                                        and parent_name + key + "/0/" + k
                                        in self.schema_parser.main_sheet):
                                    warn(
                                        _('More than one value supplied for "{}". Could not provide rollup, so adding a warning to the relevant cell(s) in the spreadsheet.'
                                          ).format(parent_name + key))
                                    flattened_dict[sheet_key(
                                        sheet, parent_name + key + "/0/" + k
                                    )] = _(
                                        "WARNING: More than one value supplied, consult the relevant sub-sheet for the data."
                                    )
                                elif parent_name + key in self.rollup:
                                    warn(
                                        _('More than one value supplied for "{}". Could not provide rollup, so adding a warning to the relevant cell(s) in the spreadsheet.'
                                          ).format(parent_name + key))
                                    flattened_dict[sheet_key(
                                        sheet, parent_name + key + "/0/" + k
                                    )] = _(
                                        "WARNING: More than one value supplied, consult the relevant sub-sheet for the data."
                                    )

                    if (self.use_titles and self.schema_parser and (
                            parent_name,
                            key,
                    ) in self.schema_parser.sub_sheet_titles):
                        sub_sheet_name = self.schema_parser.sub_sheet_titles[(
                            parent_name,
                            key,
                        )]
                    else:
                        sub_sheet_name = make_sub_sheet_name(
                            parent_name,
                            key,
                            truncation_length=self.truncation_length)
                    if sub_sheet_name not in self.sub_sheets:
                        self.sub_sheets[sub_sheet_name] = Sheet(
                            name=sub_sheet_name)

                    for json_dict in value:
                        if json_dict is None:
                            continue
                        self.parse_json_dict(
                            json_dict,
                            sheet=self.sub_sheets[sub_sheet_name],
                            json_key=key,
                            parent_id_fields=parent_id_fields,
                            parent_name=parent_name + key + "/0/",
                            top_level_of_sub_sheet=True,
                        )
            else:
                raise ValueError(_("Unsupported type {}").format(type(value)))

        if top:
            sheet.lines.append(flattened_dict)
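
The method above is driven by Sheet objects and schema titles, but the core flattening idea can be shown with a standalone sketch (not flatten-tool's API, just an illustration): nested keys are joined with "/", and arrays of basic values are joined with ";". Arrays of objects, which the real method routes to sub-sheets, are omitted here.

BASIC = (str, int, float, bool)

def flatten(obj, parent=""):
    flat = {}
    for key, value in obj.items():
        if isinstance(value, dict):
            flat.update(flatten(value, parent + key + "/"))
        elif isinstance(value, list):
            # Only arrays of basic values are handled here; arrays of objects
            # are routed to sub-sheets by the real method
            flat[parent + key] = ";".join(map(str, value))
        else:
            flat[parent + key] = value
    return flat

print(flatten({"id": 1, "buyer": {"name": "Org"}, "tags": ["a", "b"]}))
# {'id': 1, 'buyer/name': 'Org', 'tags': 'a;b'}
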
Code Example #12
File: input.py Project: Xtuden-com/flatten-tool
def unflatten_main_with_parser(parser, line, timezone, xml, id_name):
    unflattened = OrderedDict()
    for path, cell in line.items():
        # Skip blank cells
        if cell.cell_value is None or cell.cell_value == "":
            continue
        current_path = unflattened
        path_list = [item.rstrip("[]") for item in str(path).split("/")]
        for num, path_item in enumerate(path_list):
            if isint(path_item):
                if num == 0:
                    warn(
                        _('Column "{}" has been ignored because it is a number.'
                          ).format(path),
                        DataErrorWarning,
                    )
                continue
            current_type = None
            path_till_now = "/".join(
                [item for item in path_list[:num + 1] if not isint(item)])
            if parser:
                current_type = parser.flattened.get(path_till_now)
            try:
                next_path_item = path_list[num + 1]
            except IndexError:
                next_path_item = ""

            # Quick solution to avoid casting a date as a datetime in spreadsheet > xml
            if xml:
                if type(cell.cell_value
                        ) == datetime.datetime and not next_path_item:
                    if "datetime" not in str(path):
                        current_type = "date"

            ## Array
            list_index = -1
            if isint(next_path_item):
                if current_type and current_type != "array":
                    raise ValueError(
                        _("There is an array at '{}' when the schema says there should be a '{}'"
                          ).format(path_till_now, current_type))
                list_index = int(next_path_item)
                current_type = "array"

            if current_type == "array":
                list_as_dict = current_path.get(path_item)
                if list_as_dict is None:
                    list_as_dict = ListAsDict()
                    current_path[path_item] = list_as_dict
                elif type(list_as_dict) is not ListAsDict:
                    warn(
                        _("Column {} has been ignored, because it treats {} as an array, but another column does not."
                          ).format(path, path_till_now),
                        DataErrorWarning,
                    )
                    break
                new_path = list_as_dict.get(list_index)
                if new_path is None:
                    new_path = OrderedDict()
                    list_as_dict[list_index] = new_path
                current_path = new_path
                if not xml or num < len(path_list) - 2:
                    # In xml "arrays" can have text values, if they're the final element
                    # This corresponds to a tag with text, but also possibly attributes
                    continue

            ## Object
            if current_type == "object" or (not current_type
                                            and next_path_item):
                new_path = current_path.get(path_item)
                if new_path is None:
                    new_path = OrderedDict()
                    current_path[path_item] = new_path
                elif type(new_path) is ListAsDict or not hasattr(
                        new_path, "items"):
                    warn(
                        _("Column {} has been ignored, because it treats {} as an object, but another column does not."
                          ).format(path, path_till_now),
                        DataErrorWarning,
                    )
                    break
                current_path = new_path
                continue
            if (current_type and current_type not in ["object", "array"]
                    and next_path_item):
                raise ValueError(
                    _("There is an object or list at '{}' but it should be an {}"
                      ).format(path_till_now, current_type))

            ## Other Types
            current_path_value = current_path.get(path_item)
            if not xml and (type(current_path_value) is ListAsDict
                            or hasattr(current_path_value, "items")):
                #   ^
                # xml can have an object/array that also has a text value
                warn(
                    _("Column {} has been ignored, because another column treats it as an array or object"
                      ).format(path_till_now),
                    DataErrorWarning,
                )
                continue

            value = cell.cell_value
            if xml and current_type == "array":
                # In xml "arrays" can have text values, if they're the final element
                # However the type of the text value itself should not be "array",
                # as that would split the text on commas, which we don't want.
                # https://github.com/OpenDataServices/cove/issues/1030
                converted_value = convert_type("", value, timezone)
            else:
                converted_value = convert_type(current_type or "", value,
                                               timezone)
            cell.cell_value = converted_value
            if converted_value is not None and converted_value != "":
                if xml:
                    # For XML we want to support text and attributes at the
                    # same level, e.g.
                    # <my-element a="b">some text</my-element>
                    # which we represent in a dict as:
                    # {"@a":"b", "text()": "some text"}
                    # To ensure we can attach attributes everywhere, all
                    # element text must be added as a dict with a `text()` key.
                    if path_item.startswith("@"):
                        current_path[path_item] = cell
                    else:
                        if current_type == "array":
                            current_path["text()"] = cell
                        elif path_item not in current_path:
                            current_path[path_item] = {"text()": cell}
                        else:
                            current_path[path_item]["text()"] = cell
                else:
                    current_path[path_item] = cell

    unflattened = list_as_dicts_to_temporary_dicts(unflattened, id_name, xml)
    return unflattened
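
A minimal sketch of the same idea without Cells, schema types or the XML handling (again, not flatten-tool's API): split "/"-separated column paths back into nested dicts, treating numeric segments as list positions much as ListAsDict does above.

def unflatten(flat):
    root = {}
    for path, value in flat.items():
        current = root
        parts = path.split("/")
        for part in parts[:-1]:
            key = int(part) if part.isdigit() else part
            current = current.setdefault(key, {})
        current[parts[-1]] = value
    return to_lists(root)

def to_lists(node):
    # Dicts whose keys are all ints (list positions) become real lists
    if not isinstance(node, dict):
        return node
    if node and all(isinstance(k, int) for k in node):
        return [to_lists(node[k]) for k in sorted(node)]
    return {k: to_lists(v) for k, v in node.items()}

print(unflatten({"id": "1", "tags/0/name": "a", "tags/1/name": "b"}))
# {'id': '1', 'tags': [{'name': 'a'}, {'name': 'b'}]}
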
Code Example #13
    def __init__(
        self,
        schema_filename=None,
        root_schema_dict=None,
        rollup=False,
        root_id=None,
        use_titles=False,
        disable_local_refs=False,
        truncation_length=3,
        exclude_deprecated_fields=False,
    ):
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.do_rollup = rollup
        self.rollup = set()
        self.root_id = root_id
        self.use_titles = use_titles
        self.sub_sheet_titles = {}
        self.truncation_length = truncation_length
        self.title_lookup = TitleLookup()
        self.flattened = {}
        self.exclude_deprecated_fields = exclude_deprecated_fields

        if root_schema_dict is None and schema_filename is None:
            raise ValueError(
                _("One of schema_filename or root_schema_dict must be supplied"
                  ))
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError(
                _("Only one of schema_filename or root_schema_dict should be supplied"
                  ))
        if schema_filename:
            if schema_filename.startswith("http"):
                import requests

                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(
                    r.text, object_pairs_hook=OrderedDict)
            else:
                if disable_local_refs:
                    with codecs.open(schema_filename,
                                     encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            loader=JsonLoaderLocalRefsDisabled(),
                        )
                else:
                    if sys.version_info[:2] > (3, 0):
                        base_uri = pathlib.Path(
                            os.path.realpath(schema_filename)).as_uri()
                    else:
                        base_uri = urlparse.urljoin(
                            "file:",
                            urllib.pathname2url(
                                os.path.abspath(schema_filename)),
                        )
                    with codecs.open(schema_filename,
                                     encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            base_uri=base_uri,
                        )

        else:
            self.root_schema_dict = root_schema_dict
Code Example #14
    def parse_schema_dict(
        self,
        parent_path,
        schema_dict,
        parent_id_fields=None,
        title_lookup=None,
        parent_title="",
    ):
        if parent_path:
            parent_path = parent_path + "/"
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup

        if ("type" in schema_dict and schema_dict["type"] == "array"
                and "items" in schema_dict
                and "oneOf" in schema_dict["items"]):
            for oneOf in schema_dict["items"]["oneOf"]:
                if "type" in oneOf and oneOf["type"] == "object":
                    for field, child_title in self.parse_schema_dict(
                            parent_path,
                            oneOf,
                            parent_id_fields=parent_id_fields,
                            title_lookup=title_lookup,
                            parent_title=parent_title,
                    ):
                        yield (field, child_title)

        elif "properties" in schema_dict:
            if "id" in schema_dict["properties"]:
                if self.use_titles:
                    id_fields = parent_id_fields + [
                        (parent_title
                         if parent_title is not None else parent_path) +
                        (schema_dict["properties"]["id"].get("title") or "id")
                    ]
                else:
                    id_fields = parent_id_fields + [parent_path + "id"]
            else:
                id_fields = parent_id_fields

            for property_name, property_schema_dict in schema_dict[
                    "properties"].items():
                if self.exclude_deprecated_fields and property_schema_dict.get(
                        "deprecated"):
                    continue

                if (self.exclude_deprecated_fields
                        and hasattr(property_schema_dict, "__reference__") and
                        property_schema_dict.__reference__.get("deprecated")):
                    continue

                property_type_set = get_property_type_set(property_schema_dict)

                if (hasattr(property_schema_dict, "__reference__")
                        and "title" in property_schema_dict.__reference__):
                    title = property_schema_dict.__reference__["title"]
                else:
                    title = property_schema_dict.get("title")
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                if "object" in property_type_set:
                    self.flattened[parent_path + property_name] = "object"
                    for field, child_title in self.parse_schema_dict(
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title + title + ":"
                            if parent_title is not None and title else None,
                    ):
                        yield (
                            property_name + "/" + field,
                            # TODO ambiguous use of "title"
                            (title + ":" +
                             child_title if title and child_title else None),
                        )

                elif "array" in property_type_set:
                    flattened_key = parent_path.replace("/0/",
                                                        "/") + property_name
                    self.flattened[flattened_key] = "array"
                    type_set = get_property_type_set(
                        property_schema_dict["items"])
                    if "string" in type_set or not type_set:
                        self.flattened[flattened_key] = "string_array"
                        yield property_name, title
                    elif "number" in type_set:
                        self.flattened[flattened_key] = "number_array"
                        yield property_name, title
                    elif "array" in type_set:
                        self.flattened[flattened_key] = "array_array"
                        nested_type_set = get_property_type_set(
                            property_schema_dict["items"]["items"])
                        if "string" in nested_type_set or "number" in nested_type_set:
                            yield property_name, title
                        else:
                            raise ValueError
                    elif "object" in type_set:
                        if title:
                            title_lookup[title].property_name = property_name

                        if self.use_titles and parent_title is not None:
                            sub_sheet_name = make_sub_sheet_name(
                                parent_title,
                                title or property_name,
                                truncation_length=self.truncation_length,
                                path_separator=":",
                            )
                            self.sub_sheet_titles[(
                                parent_path,
                                property_name,
                            )] = sub_sheet_name
                        else:
                            sub_sheet_name = make_sub_sheet_name(
                                parent_path,
                                property_name,
                                truncation_length=self.truncation_length,
                            )
                        # self.sub_sheet_mapping[parent_name+'/'+property_name] = sub_sheet_name

                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(
                                root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        sub_sheet.title_lookup = title_lookup.get(title)

                        for field in id_fields:
                            sub_sheet.add_field(field, id_field=True)
                            sub_sheet.titles[title_lookup.lookup_header(
                                field)] = field
                        fields = self.parse_schema_dict(
                            parent_path + property_name + "/0",
                            property_schema_dict["items"],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title + title + ":"
                            if parent_title is not None and title else None,
                        )

                        rollup_fields = set()
                        for field, child_title in fields:
                            full_path = parent_path + property_name + "/0/" + field
                            if self.use_titles:
                                if not child_title or parent_title is None:
                                    warn(
                                        _("Field {}{}/0/{} is missing a title, skipping."
                                          ).format(parent_path, property_name,
                                                   field))
                                elif not title:
                                    warn(
                                        _("Field {}{} does not have a title, skipping it and all its children."
                                          ).format(parent_path, property_name))
                                else:
                                    # This code only works for arrays that are at 0 or 1 layer of nesting
                                    full_title = (parent_title + title + ":" +
                                                  child_title)
                                    sub_sheet.add_field(full_title)
                                    sub_sheet.titles[full_path] = full_title
                            else:
                                sub_sheet.add_field(full_path)
                            if (self.do_rollup
                                    and "rollUp" in property_schema_dict and
                                    field in property_schema_dict["rollUp"]):
                                rollup_fields.add(field)
                                self.rollup.add(full_path)
                                yield property_name + "/0/" + field, (
                                    title + ":" + child_title
                                    if title and child_title else None)

                        # Check that all items in rollUp are in the schema
                        if self.do_rollup and "rollUp" in property_schema_dict:
                            missedRollUp = (
                                set(property_schema_dict["rollUp"]) -
                                rollup_fields)
                            if missedRollUp:
                                warn("{} in rollUp but not in schema".format(
                                    ", ".join(missedRollUp)))

                    else:
                        raise ValueError(
                            _('Unknown type_set: {}, did you forget to explicitly set the "type" key on "items"?'
                              ).format(type_set))
                elif "string" in property_type_set or not property_type_set:
                    # We only check for date here, because it's the only format
                    # for which we need to specially transform the input
                    if property_schema_dict.get("format") == "date":
                        self.flattened[parent_path.replace("/0/", "/") +
                                       property_name] = "date"
                    else:
                        self.flattened[parent_path.replace("/0/", "/") +
                                       property_name] = "string"
                    yield property_name, title
                elif "number" in property_type_set:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "number"
                    yield property_name, title
                elif "integer" in property_type_set:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "integer"
                    yield property_name, title
                elif "boolean" in property_type_set:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "boolean"
                    yield property_name, title
                else:
                    warn(
                        _('Unrecognised types {} for property "{}" with context "{}", '
                          "so this property has been ignored.").format(
                              repr(property_type_set), property_name,
                              parent_path))

        else:
            warn(
                _('Skipping field "{}", because it has no properties.').format(
                    parent_path))
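
A much simplified sketch of the idea behind this method (not the real implementation, which also handles titles, ids, sub-sheets, rollUp and oneOf): walk the schema's properties and build a flattened path-to-type map, recording fields of array items under the parent path.

def flatten_schema(schema, parent_path=""):
    flattened = {}
    for name, prop in schema.get("properties", {}).items():
        path = parent_path + name
        prop_type = prop.get("type")
        if prop_type == "object":
            flattened[path] = "object"
            flattened.update(flatten_schema(prop, path + "/"))
        elif prop_type == "array":
            items = prop.get("items", {})
            if items.get("type") == "object":
                flattened[path] = "array"
                flattened.update(flatten_schema(items, path + "/"))
            else:
                flattened[path] = (items.get("type") or "string") + "_array"
        else:
            flattened[path] = prop_type or "string"
    return flattened

schema = {"properties": {
    "id": {"type": "string"},
    "buyer": {"type": "object", "properties": {"name": {"type": "string"}}},
    "items": {"type": "array", "items": {
        "type": "object", "properties": {"quantity": {"type": "number"}}}},
}}
print(flatten_schema(schema))
# {'id': 'string', 'buyer': 'object', 'buyer/name': 'string',
#  'items': 'array', 'items/quantity': 'number'}
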
Code Example #15
File: input.py Project: Xtuden-com/flatten-tool
def merge(base, mergee, debug_info=None):
    if not debug_info:
        debug_info = {}
    for key, v in mergee.items():
        if isinstance(v, Cell):
            value = v.cell_value
        else:
            value = v
        if key in base:
            if isinstance(value, TemporaryDict):
                if not isinstance(base[key], TemporaryDict):
                    warnings_for_ignored_columns(
                        v,
                        _("because it treats {} as an array, but another column does not"
                          ).format(key),
                    )
                    continue
                for temporarydict_key, temporarydict_value in value.items():
                    if temporarydict_key in base[key]:
                        merge(
                            base[key][temporarydict_key],
                            temporarydict_value,
                            debug_info,
                        )
                    else:
                        assert temporarydict_key not in base[key], _(
                            "Overwriting cell {} by mistake").format(
                                temporarydict_value)
                        base[key][temporarydict_key] = temporarydict_value
                for temporarydict_value in value.items_no_keyfield:
                    base[key].items_no_keyfield.append(temporarydict_value)
            elif isinstance(value, dict):
                if isinstance(base[key], dict):
                    merge(base[key], value, debug_info)
                else:
                    warnings_for_ignored_columns(
                        v,
                        _("because it treats {} as an object, but another column does not"
                          ).format(key),
                    )
            else:
                if not isinstance(base[key], Cell):
                    id_info = '{} "{}"'.format(
                        debug_info.get("id_name"),
                        debug_info.get(debug_info.get("id_name")),
                    )
                    if debug_info.get("root_id"):
                        id_info = ('{} "{}", '.format(
                            debug_info.get("root_id"),
                            debug_info.get("root_id_or_none"),
                        ) + id_info)
                    warnings_for_ignored_columns(
                        v,
                        _("because another column treats it as an array or object"
                          ))
                    continue
                base_value = base[key].cell_value
                if base_value != value:
                    id_info = '{} "{}"'.format(
                        debug_info.get("id_name"),
                        debug_info.get(debug_info.get("id_name")),
                    )
                    if debug_info.get("root_id"):
                        id_info = ('{} "{}", '.format(
                            debug_info.get("root_id"),
                            debug_info.get("root_id_or_none"),
                        ) + id_info)
                    warn(
                        _('You may have a duplicate Identifier: We couldn\'t merge these rows with the {}: field "{}" in sheet "{}": one cell has the value: "{}", the other cell has the value: "{}"'
                          ).format(
                              id_info,
                              key,
                              debug_info.get("sheet_name"),
                              base_value,
                              value,
                          ),
                        DataErrorWarning,
                    )
                else:
                    base[key].sub_cells.append(v)
        else:
            # This happens when a parent record finds the first child record of a known type
            base[key] = v
Code Example #16
File: input.py Project: Xtuden-com/flatten-tool
    def do_unflatten(self):
        main_sheet_by_ocid = OrderedDict()
        sheets = list(self.get_sub_sheets_lines())
        for i, sheet in enumerate(sheets):
            sheet_name, lines = sheet
            try:
                actual_headings = self.get_sheet_headings(sheet_name)
                # If sheet is empty or too many lines have been skipped
                if not actual_headings:
                    continue
                found = OrderedDict()
                last_col = len(actual_headings)
                # We want to ignore data in earlier columns, so we look
                # through the data backwards
                for i, actual_heading in enumerate(reversed(actual_headings)):
                    if actual_heading is None:
                        continue
                    if actual_heading in found:
                        found[actual_heading].append((last_col - i) - 1)
                    else:
                        found[actual_heading] = [i]
                for actual_heading in reversed(found):
                    if len(found[actual_heading]) > 1:
                        keeping = found[actual_heading][0]  # noqa
                        ignoring = found[actual_heading][1:]
                        ignoring.reverse()
                        if len(ignoring) >= 3:
                            warn(
                                (_('Duplicate heading "{}" found, ignoring '
                                   'the data in columns {} and {} (sheet: "{}").'
                                   )).format(
                                       actual_heading,
                                       ", ".join([
                                           _get_column_letter(x + 1)
                                           for x in ignoring[:-1]
                                       ]),
                                       _get_column_letter(ignoring[-1] + 1),
                                       sheet_name,
                                   ),
                                DataErrorWarning,
                            )
                        elif len(found[actual_heading]) == 3:
                            warn(
                                (_('Duplicate heading "{}" found, ignoring '
                                   'the data in columns {} and {} (sheet: "{}").'
                                   )).format(
                                       actual_heading,
                                       _get_column_letter(ignoring[0] + 1),
                                       _get_column_letter(ignoring[1] + 1),
                                       sheet_name,
                                   ),
                                DataErrorWarning,
                            )
                        else:
                            warn(
                                (_('Duplicate heading "{}" found, ignoring '
                                   'the data in column {} (sheet: "{}").')
                                 ).format(
                                     actual_heading,
                                     _get_column_letter(ignoring[0] + 1),
                                     sheet_name,
                                 ),
                                DataErrorWarning,
                            )
            except NotImplementedError:
                # The ListInput type used in the tests doesn't support getting headings.
                actual_headings = None
            for j, line in enumerate(lines):
                if all(x is None or x == "" for x in line.values()):
                    # if all(x == '' for x in line.values()):
                    continue
                root_id_or_none = line.get(
                    self.root_id) if self.root_id else None
                cells = OrderedDict()
                for k, header in enumerate(line):
                    heading = actual_headings[k] if actual_headings else header
                    if self.vertical_orientation:
                        # This is misleading as it specifies the row number as the distance vertically
                        # and the horizontal 'letter' as a number.
                        # https://github.com/OpenDataServices/flatten-tool/issues/153
                        cells[header] = Cell(
                            line[header],
                            (sheet_name, str(k + 1), j + 2, heading))
                    else:
                        cells[header] = Cell(
                            line[header],
                            (sheet_name, _get_column_letter(k + 1), j + 2,
                             heading),
                        )
                unflattened = unflatten_main_with_parser(
                    self.parser, cells, self.timezone, self.xml, self.id_name)
                if root_id_or_none not in main_sheet_by_ocid:
                    main_sheet_by_ocid[root_id_or_none] = TemporaryDict(
                        self.id_name, xml=self.xml)

                def inthere(unflattened, id_name):
                    if self.xml and not isinstance(
                            unflattened.get(self.id_name), Cell):
                        # For an XML tag
                        return unflattened[id_name]["text()"].cell_value
                    else:
                        # For a JSON, or an XML attribute
                        return unflattened[id_name].cell_value

                if (self.id_name in unflattened
                        and inthere(unflattened, self.id_name)
                        in main_sheet_by_ocid[root_id_or_none]):
                    if self.xml and not isinstance(
                            unflattened.get(self.id_name), Cell):
                        unflattened_id = unflattened.get(
                            self.id_name)["text()"].cell_value
                    else:
                        unflattened_id = unflattened.get(
                            self.id_name).cell_value
                    merge(
                        main_sheet_by_ocid[root_id_or_none][unflattened_id],
                        unflattened,
                        {
                            "sheet_name": sheet_name,
                            "root_id": self.root_id,
                            "root_id_or_none": root_id_or_none,
                            "id_name": self.id_name,
                            self.id_name: unflattened_id,
                        },
                    )
                else:
                    main_sheet_by_ocid[root_id_or_none].append(unflattened)
        temporarydicts_to_lists(main_sheet_by_ocid)
        return sum(main_sheet_by_ocid.values(), [])
Code Example #17
File: input.py Project: Xtuden-com/flatten-tool
def convert_type(type_string, value, timezone=pytz.timezone("UTC")):
    if value == "" or value is None:
        return None
    if type_string == "number":
        try:
            return Decimal(value)
        except (TypeError, ValueError, InvalidOperation):
            warn(
                _('Non-numeric value "{}" found in number column, returning as string instead.'
                  ).format(value),
                DataErrorWarning,
            )
            return str(value)
    elif type_string == "integer":
        try:
            return int(value)
        except (TypeError, ValueError):
            warn(
                _('Non-integer value "{}" found in integer column, returning as string instead.'
                  ).format(value),
                DataErrorWarning,
            )
            return str(value)
    elif type_string == "boolean":
        value = str(value)
        if value.lower() in ["true", "1"]:
            return True
        elif value.lower() in ["false", "0"]:
            return False
        else:
            warn(
                _('Unrecognised value for boolean: "{}", returning as string instead'
                  ).format(value),
                DataErrorWarning,
            )
            return str(value)
    elif type_string in ("array", "array_array", "string_array",
                         "number_array"):
        value = str(value)
        if type_string == "number_array":
            try:
                if "," in value:
                    return [[Decimal(y) for y in x.split(",")]
                            for x in value.split(";")]
                else:
                    return [Decimal(x) for x in value.split(";")]
            except (TypeError, ValueError, InvalidOperation):
                warn(
                    _('Non-numeric value "{}" found in number array column, returning as string array instead.'
                      ).format(value),
                    DataErrorWarning,
                )
        if "," in value:
            return [x.split(",") for x in value.split(";")]
        else:
            return value.split(";")
    elif type_string == "string":
        if type(value) == datetime.datetime:
            return timezone.localize(value).isoformat()
        return str(value)
    elif type_string == "date":
        if type(value) == datetime.datetime:
            return value.date().isoformat()
        return str(value)
    elif type_string == "":
        if type(value) == datetime.datetime:
            return timezone.localize(value).isoformat()
        if type(value) == float and int(value) == value:
            return int(value)
        return value if type(value) in [int] else str(value)
    else:
        raise ValueError('Unrecognised type: "{}"'.format(type_string))
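
A minimal usage sketch, assuming convert_type and the imports it relies on (Decimal, pytz, datetime, warn, DataErrorWarning and the gettext alias) are in scope as in input.py:

print(convert_type("integer", "42"))          # 42
print(convert_type("boolean", "TRUE"))        # True
print(convert_type("string_array", "a;b"))    # ['a', 'b']
print(convert_type("number", "1.50"))         # Decimal('1.50')
print(convert_type("", ""))                   # None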