def test_empty_lines(tmpdir):
    """Sheets with headers but no data rows should produce header-only output.

    Builds a MockParser whose main sheet has fields but an empty ``lines``
    list, writes it in every registered output format, then checks the XLSX
    and CSV results contain only the header rows.
    """
    subsheet = Sheet(root_id='ocid')
    subsheet.add_field('c')
    parser = MockParser(['a', 'd'], {'b': subsheet})
    parser.main_sheet.lines = []
    # Write the same parser once per registered format (suffix per format).
    for format_name, spreadsheet_output_class in output.FORMATS.items():
        spreadsheet_output = spreadsheet_output_class(
            parser=parser,
            main_sheet_name='release',
            output_name=os.path.join(
                tmpdir.strpath,
                'release' + output.FORMATS_SUFFIX[format_name]))
        spreadsheet_output.write_sheets()
    # Check XLSX: one worksheet per sheet, each holding only the header row.
    wb = openpyxl.load_workbook(tmpdir.join('release.xlsx').strpath)
    assert wb.sheetnames == ['release', 'b']
    rows = list(wb['release'].rows)
    assert len(rows) == 1
    assert [x.value for x in rows[0]] == ['a', 'd']
    b_rows = list(wb['b'].rows)
    assert len(b_rows) == 1
    assert [x.value for x in b_rows[0]] == ['ocid', 'c']
    # Check CSV: a directory containing one .csv per sheet, header line only.
    assert set(tmpdir.join('release').listdir()) == set([
        tmpdir.join('release').join('release.csv'),
        tmpdir.join('release').join('b.csv')
    ])
    assert tmpdir.join('release', 'release.csv').read().strip('\r\n') == 'a,d'
    assert tmpdir.join('release', 'b.csv').read().strip('\r\n') == 'ocid,c'
def __init__(self,
             schema_filename=None,
             root_schema_dict=None,
             rollup=False,
             root_id=None,
             use_titles=False,
             disable_local_refs=False,
             truncation_length=3,
             exclude_deprecated_fields=False):
    """Set up parser state and load the JSON schema.

    Exactly one of ``schema_filename`` (local path or http(s) URL) and
    ``root_schema_dict`` (already-parsed schema) must be supplied;
    a ValueError is raised otherwise.
    """
    self.sub_sheets = {}
    self.main_sheet = Sheet()
    self.sub_sheet_mapping = {}
    # The rollup *request* flag is kept separately from self.rollup, which
    # accumulates the actual rolled-up paths during parsing.
    self.do_rollup = rollup
    self.rollup = set()
    self.root_id = root_id
    self.use_titles = use_titles
    self.truncation_length = truncation_length
    self.title_lookup = TitleLookup()
    self.flattened = {}
    self.exclude_deprecated_fields = exclude_deprecated_fields
    if root_schema_dict is None and schema_filename is None:
        raise ValueError(
            'One of schema_filename or root_schema_dict must be supplied')
    if root_schema_dict is not None and schema_filename is not None:
        raise ValueError(
            'Only one of schema_filename or root_schema_dict should be supplied'
        )
    if schema_filename:
        if schema_filename.startswith('http'):
            # Remote schema: fetch over HTTP(S), resolve $refs from the text.
            import requests
            r = requests.get(schema_filename)
            self.root_schema_dict = jsonref.loads(
                r.text, object_pairs_hook=OrderedDict)
        else:
            if disable_local_refs:
                # Use a loader that refuses to follow local-file $refs.
                with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(
                        schema_file,
                        object_pairs_hook=OrderedDict,
                        loader=JsonLoaderLocalRefsDisabled())
            else:
                # Build a file:// base URI so relative $refs resolve next to
                # the schema file (Python 2 and 3 construct it differently).
                if sys.version_info[:2] > (3, 0):
                    base_uri = pathlib.Path(
                        os.path.realpath(schema_filename)).as_uri()
                else:
                    base_uri = urlparse.urljoin(
                        'file:',
                        urllib.pathname2url(
                            os.path.abspath(schema_filename)))
                with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(
                        schema_file,
                        object_pairs_hook=OrderedDict,
                        base_uri=base_uri)
    else:
        self.root_schema_dict = root_schema_dict
def test_populated_lines(tmpdir):
    """Data rows in the main sheet and a sub-sheet appear in every output format."""
    subsheet = Sheet(root_id='ocid')
    subsheet.add_field('c')
    parser = MockParser(['a'], {})
    parser.main_sheet.lines = [{'a': 'cell1'}, {'a': 'cell2'}]
    subsheet.lines = [{'c': 'cell3'}, {'c': 'cell4'}]
    parser.sub_sheets['b'] = subsheet
    for format_name, spreadsheet_output_class in output.FORMATS.items():
        spreadsheet_output = spreadsheet_output_class(
            parser=parser,
            main_sheet_name='release',
            output_name=os.path.join(
                tmpdir.strpath,
                'release' + output.FORMATS_SUFFIX[format_name]))
        spreadsheet_output.write_sheets()
    # Check XLSX
    wb = openpyxl.load_workbook(tmpdir.join('release.xlsx').strpath)
    # Fix: Workbook.get_sheet_names() was deprecated and then removed in
    # openpyxl 3.x; .sheetnames is the supported spelling.
    assert wb.sheetnames == ['release', 'b']
    # Fix: Worksheet.rows is a generator in openpyxl >= 2.4, so it must be
    # materialised before len()/indexing (the old .rows[0] raised TypeError).
    rows = list(wb['release'].rows)
    assert len(rows) == 3
    assert [x.value for x in rows[0]] == ['a']
    assert [x.value for x in rows[1]] == ['cell1']
    assert [x.value for x in rows[2]] == ['cell2']
    b_rows = list(wb['b'].rows)
    assert len(b_rows) == 3
    assert [x.value for x in b_rows[0]] == ['ocid', 'c']
    assert [x.value for x in b_rows[1]] == [None, 'cell3']
    assert [x.value for x in b_rows[2]] == [None, 'cell4']
    # Check CSV
    assert set(tmpdir.join('release').listdir()) == set([
        tmpdir.join('release').join('release.csv'),
        tmpdir.join('release').join('b.csv')
    ])
    assert tmpdir.join('release', 'release.csv').read().strip('\r\n').replace(
        '\r', '') == 'a\ncell1\ncell2'
    assert tmpdir.join('release', 'b.csv').read().strip('\r\n').replace(
        '\r', '') == 'ocid,c\n,cell3\n,cell4'
def test_empty_lines(tmpdir):
    """Header-only output is written when the sheets contain no data rows."""
    subsheet = Sheet(root_id='ocid')
    subsheet.add_field('c')
    parser = MockParser(['a', 'd'], {'b': subsheet})
    parser.main_sheet.lines = []
    for format_name, spreadsheet_output_class in output.FORMATS.items():
        spreadsheet_output = spreadsheet_output_class(
            parser=parser,
            main_sheet_name='release',
            output_name=os.path.join(
                tmpdir.strpath,
                'release' + output.FORMATS_SUFFIX[format_name]))
        spreadsheet_output.write_sheets()
    # Check XLSX
    wb = openpyxl.load_workbook(tmpdir.join('release.xlsx').strpath)
    # Fix: Workbook.get_sheet_names() was deprecated and then removed in
    # openpyxl 3.x; .sheetnames is the supported spelling.
    assert wb.sheetnames == ['release', 'b']
    # Fix: Worksheet.rows is a generator in openpyxl >= 2.4, so it must be
    # materialised before len()/indexing (the old .rows[0] raised TypeError).
    rows = list(wb['release'].rows)
    assert len(rows) == 1
    assert [x.value for x in rows[0]] == ['a', 'd']
    b_rows = list(wb['b'].rows)
    assert len(b_rows) == 1
    assert [x.value for x in b_rows[0]] == ['ocid', 'c']
    # Check CSV
    assert set(tmpdir.join('release').listdir()) == set([
        tmpdir.join('release').join('release.csv'),
        tmpdir.join('release').join('b.csv')
    ])
    assert tmpdir.join('release', 'release.csv').read().strip('\r\n') == 'a,d'
    assert tmpdir.join('release', 'b.csv').read().strip('\r\n') == 'ocid,c'
def test_populated_lines(tmpdir):
    """Data rows appear in the XLSX, CSV and ODS output for every sheet."""
    subsheet = Sheet(root_id="ocid")
    subsheet.add_field("c")
    parser = MockParser(["a"], {})
    parser.main_sheet.lines = [{"a": "cell1"}, {"a": "cell2"}]
    subsheet.lines = [{"c": "cell3"}, {"c": "cell4"}]
    parser.sub_sheets["b"] = subsheet
    # Write the same parser once per registered output format.
    for format_name, spreadsheet_output_class in output.FORMATS.items():
        spreadsheet_output = spreadsheet_output_class(
            parser=parser,
            main_sheet_name="release",
            output_name=os.path.join(
                tmpdir.strpath, "release" + output.FORMATS_SUFFIX[format_name]
            ),
        )
        spreadsheet_output.write_sheets()
    # Check XLSX
    wb = openpyxl.load_workbook(tmpdir.join("release.xlsx").strpath)
    assert wb.sheetnames == ["release", "b"]
    rows = list(wb["release"].rows)
    assert len(rows) == 3
    assert [x.value for x in rows[0]] == ["a"]
    assert [x.value for x in rows[1]] == ["cell1"]
    assert [x.value for x in rows[2]] == ["cell2"]
    b_rows = list(wb["b"].rows)
    assert len(b_rows) == 3
    assert [x.value for x in b_rows[0]] == ["ocid", "c"]
    assert [x.value for x in b_rows[1]] == [None, "cell3"]
    assert [x.value for x in b_rows[2]] == [None, "cell4"]
    # Check CSV
    assert set(tmpdir.join("release").listdir()) == set(
        [
            tmpdir.join("release").join("release.csv"),
            tmpdir.join("release").join("b.csv"),
        ]
    )
    assert (
        tmpdir.join("release", "release.csv").read().strip("\r\n").replace("\r", "")
        == "a\ncell1\ncell2"
    )
    assert (
        tmpdir.join("release", "b.csv").read().strip("\r\n").replace("\r", "")
        == "ocid,c\n,cell3\n,cell4"
    )
    # Check ODS - currently broken test
    # NOTE(review): the assertions below assume ODSReader.getSheet() returns
    # None for empty leading cells — confirm against the ODSReader helper.
    odswb = ODSReader(tmpdir.join("release.ods").strpath)
    ods_rows = odswb.getSheet("release")
    assert len(ods_rows) == 3
    assert [x for x in ods_rows[0]] == ["a"]
    assert [x for x in ods_rows[1]] == ["cell1"]
    assert [x for x in ods_rows[2]] == ["cell2"]
    ods_b_rows = odswb.getSheet("b")
    assert len(ods_b_rows) == 3
    assert [x for x in ods_b_rows[0]] == ["ocid", "c"]
    assert [x for x in ods_b_rows[1]] == [None, "cell3"]
    assert [x for x in ods_b_rows[2]] == [None, "cell4"]
def __init__(self,
             json_filename=None,
             root_json_dict=None,
             schema_parser=None,
             root_list_path=None,
             root_id='ocid',
             use_titles=False,
             xml=False,
             id_name='id',
             filter_field=None,
             filter_value=None):
    """Load the JSON (or XML) input that will be flattened.

    Exactly one of ``json_filename`` and ``root_json_dict`` must be given;
    when ``xml`` is true, ``json_filename`` names the XML file instead.
    Raises ValueError on bad argument combinations and BadlyFormedJSONError
    when the input file is not valid JSON.
    """
    self.sub_sheets = {}
    self.main_sheet = Sheet()
    self.root_list_path = root_list_path
    self.root_id = root_id
    self.use_titles = use_titles
    self.id_name = id_name
    self.xml = xml
    self.filter_field = filter_field
    self.filter_value = filter_value
    if schema_parser:
        self.main_sheet = schema_parser.main_sheet
        self.sub_sheets = schema_parser.sub_sheets
        # Rollup is pulled from the schema_parser, as rollup is only possible
        # if a schema parser is specified
        self.rollup = schema_parser.rollup
        self.schema_parser = schema_parser
    else:
        self.rollup = False
        # Fix: self.schema_parser was only assigned in the branch above, so
        # any later attribute access without a schema raised AttributeError.
        self.schema_parser = None
    if self.xml:
        with codecs.open(json_filename, 'rb') as xml_file:
            top_dict = xmltodict.parse(
                xml_file,
                force_list=(root_list_path, ),
                force_cdata=True,
            )
            # AFAICT, this should be true for *all* XML files
            assert len(top_dict) == 1
            root_json_dict = list(top_dict.values())[0]
        json_filename = None
    if json_filename is None and root_json_dict is None:
        # Fix: "Etiher" typo in the original message.
        raise ValueError(
            'Either json_filename or root_json_dict must be supplied')
    if json_filename is not None and root_json_dict is not None:
        raise ValueError(
            'Only one of json_file or root_json_dict should be supplied')
    if json_filename:
        with codecs.open(json_filename, encoding='utf-8') as json_file:
            try:
                # parse_float=Decimal preserves the numbers exactly as
                # written instead of going through binary floats.
                self.root_json_dict = json.load(
                    json_file,
                    object_pairs_hook=OrderedDict,
                    parse_float=Decimal)
            except ValueError as err:
                raise BadlyFormedJSONError(*err.args)
    else:
        self.root_json_dict = root_json_dict
def __init__(
    self,
    schema_filename=None,
    root_schema_dict=None,
    main_sheet_name="main",
    rollup=False,
    root_id="ocid",
    use_titles=False,
):
    """Record parser options and obtain the root schema dictionary.

    The schema comes either from ``schema_filename`` (a local path or an
    http(s) URL) or from an already-parsed ``root_schema_dict`` — exactly
    one of the two must be supplied, otherwise ValueError is raised.
    """
    # Plain state first.
    self.main_sheet = Sheet()
    self.main_sheet_name = main_sheet_name
    self.sub_sheets = {}
    self.sub_sheet_mapping = {}
    self.rollup = rollup
    self.root_id = root_id
    self.use_titles = use_titles

    # Exactly one schema source is allowed.
    neither_given = root_schema_dict is None and schema_filename is None
    both_given = root_schema_dict is not None and schema_filename is not None
    if neither_given:
        raise ValueError("One of schema_filename or root_schema_dict must be supplied")
    if both_given:
        raise ValueError("Only one of schema_filename or root_schema_dict should be supplied")

    if not schema_filename:
        self.root_schema_dict = root_schema_dict
        return

    if schema_filename.startswith("http"):
        # Remote schema, fetched over HTTP(S).
        import requests

        response = requests.get(schema_filename)
        self.root_schema_dict = jsonref.loads(response.text, object_pairs_hook=OrderedDict)
    else:
        # Local schema file.
        with codecs.open(schema_filename, encoding="utf-8") as fp:
            self.root_schema_dict = jsonref.load(fp, object_pairs_hook=OrderedDict)
def test_empty_lines(tmpdir):
    """Header-only rows are written to XLSX, CSV and ODS when there is no data."""
    subsheet = Sheet(root_id="ocid")
    subsheet.add_field("c")
    parser = MockParser(["a", "d"], {"b": subsheet})
    parser.main_sheet.lines = []
    # Write the same parser once per registered output format.
    for format_name, spreadsheet_output_class in output.FORMATS.items():
        spreadsheet_output = spreadsheet_output_class(
            parser=parser,
            main_sheet_name="release",
            output_name=os.path.join(
                tmpdir.strpath, "release" + output.FORMATS_SUFFIX[format_name]
            ),
        )
        spreadsheet_output.write_sheets()
    # Check XLSX
    wb = openpyxl.load_workbook(tmpdir.join("release.xlsx").strpath)
    assert wb.sheetnames == ["release", "b"]
    rows = list(wb["release"].rows)
    assert len(rows) == 1
    assert [x.value for x in rows[0]] == ["a", "d"]
    b_rows = list(wb["b"].rows)
    assert len(b_rows) == 1
    assert [x.value for x in b_rows[0]] == ["ocid", "c"]
    # Check CSV
    assert set(tmpdir.join("release").listdir()) == set(
        [
            tmpdir.join("release").join("release.csv"),
            tmpdir.join("release").join("b.csv"),
        ]
    )
    assert tmpdir.join("release", "release.csv").read().strip("\r\n") == "a,d"
    assert tmpdir.join("release", "b.csv").read().strip("\r\n") == "ocid,c"
    # Check ODS
    odswb = ODSReader(tmpdir.join("release.ods").strpath)
    ods_rows = odswb.getSheet("release")
    assert len(ods_rows) == 1
    assert [x for x in ods_rows[0]] == ["a", "d"]
    ods_b_rows = odswb.getSheet("b")
    assert len(ods_b_rows) == 1
    assert [x for x in ods_b_rows[0]] == ["ocid", "c"]
def test_sub_sheet_list_like():
    # A Sheet must behave like a list — json_input.py appends fields to it
    # directly (see flattentool/json_input.py), so it has to be appendable
    # and iterable.
    sheet = Sheet()
    assert list(sheet) == []
    for letter in ('a', 'b'):
        sheet.append(letter)
    assert list(sheet) == ['a', 'b']
    # add_field() is an alternative spelling that also appends...
    sheet.add_field('c')
    assert list(sheet) == ['a', 'b', 'c']
    # ...except id fields, which are inserted at the front of the list.
    sheet.add_field('d', id_field=True)
    assert list(sheet) == ['d', 'a', 'b', 'c']
def test_populated_lines(tmpdir):
    """Data rows in the main sheet and a sub-sheet appear in the XLSX and CSV output."""
    subsheet = Sheet(root_id='ocid')
    subsheet.add_field('c')
    parser = MockParser(['a'], {})
    parser.main_sheet.lines = [{'a': 'cell1'}, {'a': 'cell2'}]
    subsheet.lines = [{'c': 'cell3'}, {'c': 'cell4'}]
    parser.sub_sheets['b'] = subsheet
    # Write the same parser once per registered output format.
    for format_name, spreadsheet_output_class in output.FORMATS.items():
        spreadsheet_output = spreadsheet_output_class(
            parser=parser,
            main_sheet_name='release',
            output_name=os.path.join(
                tmpdir.strpath,
                'release' + output.FORMATS_SUFFIX[format_name]))
        spreadsheet_output.write_sheets()
    # Check XLSX: header row followed by the two data rows per sheet.
    wb = openpyxl.load_workbook(tmpdir.join('release.xlsx').strpath)
    assert wb.sheetnames == ['release', 'b']
    rows = list(wb['release'].rows)
    assert len(rows) == 3
    assert [x.value for x in rows[0]] == ['a']
    assert [x.value for x in rows[1]] == ['cell1']
    assert [x.value for x in rows[2]] == ['cell2']
    b_rows = list(wb['b'].rows)
    assert len(b_rows) == 3
    assert [x.value for x in b_rows[0]] == ['ocid', 'c']
    # The ocid column is empty because the sub-sheet lines carry no 'ocid'.
    assert [x.value for x in b_rows[1]] == [None, 'cell3']
    assert [x.value for x in b_rows[2]] == [None, 'cell4']
    # Check CSV
    assert set(tmpdir.join('release').listdir()) == set([
        tmpdir.join('release').join('release.csv'),
        tmpdir.join('release').join('b.csv')
    ])
    assert tmpdir.join('release', 'release.csv').read().strip('\r\n').replace(
        '\r', '') == 'a\ncell1\ncell2'
    assert tmpdir.join('release', 'b.csv').read().strip('\r\n').replace(
        '\r', '') == 'ocid,c\n,cell3\n,cell4'
def __init__(self,
             schema_filename=None,
             root_schema_dict=None,
             main_sheet_name='main',
             rollup=False,
             root_id='ocid',
             use_titles=False):
    """Record options and resolve the schema to a dictionary.

    Exactly one of ``schema_filename`` (local path or http(s) URL) and
    ``root_schema_dict`` must be passed; ValueError is raised otherwise.
    """
    # Plain state first.
    self.main_sheet = Sheet()
    self.main_sheet_name = main_sheet_name
    self.sub_sheets = {}
    self.sub_sheet_mapping = {}
    self.rollup = rollup
    self.root_id = root_id
    self.use_titles = use_titles
    self.title_lookup = TitleLookup()
    self.flattened = {}

    # Exactly one way of supplying the schema is allowed.
    if root_schema_dict is None and schema_filename is None:
        raise ValueError(
            'One of schema_filename or root_schema_dict must be supplied')
    if root_schema_dict is not None and schema_filename is not None:
        raise ValueError(
            'Only one of schema_filename or root_schema_dict should be supplied'
        )

    if not schema_filename:
        self.root_schema_dict = root_schema_dict
        return

    if schema_filename.startswith('http'):
        # Remote schema, fetched over HTTP(S).
        import requests

        response = requests.get(schema_filename)
        self.root_schema_dict = jsonref.loads(
            response.text, object_pairs_hook=OrderedDict)
    else:
        # Local schema file.
        with codecs.open(schema_filename, encoding="utf-8") as fp:
            self.root_schema_dict = jsonref.load(
                fp, object_pairs_hook=OrderedDict)
def __init__(self,
             json_filename=None,
             root_json_dict=None,
             main_sheet_name='main',
             schema_parser=None,
             root_list_path=None,
             root_id='ocid',
             use_titles=False):
    """Load the JSON input to flatten, optionally guided by a schema parser.

    Exactly one of ``json_filename`` and ``root_json_dict`` must be given;
    ValueError is raised otherwise, and BadlyFormedJSONError when the input
    file is not valid JSON.
    """
    self.sub_sheets = {}
    self.main_sheet = Sheet()
    self.main_sheet_name = main_sheet_name
    self.root_list_path = root_list_path
    self.root_id = root_id
    self.use_titles = use_titles
    if schema_parser:
        # Strip the first path component so mapping keys are relative to
        # the root list.
        self.sub_sheet_mapping = {
            '/'.join(k.split('/')[1:]): v
            for k, v in schema_parser.sub_sheet_mapping.items()
        }
        self.main_sheet = schema_parser.main_sheet
        self.sub_sheets = schema_parser.sub_sheets
        # Rollup is pulled from the schema_parser, as rollup is only possible
        # if a schema parser is specified
        self.rollup = schema_parser.rollup
        self.schema_parser = schema_parser
    else:
        self.sub_sheet_mapping = {}
        self.rollup = False
        # Fix: self.schema_parser was only assigned in the branch above, so
        # any later attribute access without a schema raised AttributeError.
        self.schema_parser = None
    if json_filename is None and root_json_dict is None:
        # Fix: "Etiher" typo in the original message.
        raise ValueError(
            'Either json_filename or root_json_dict must be supplied')
    if json_filename is not None and root_json_dict is not None:
        raise ValueError(
            'Only one of json_file or root_json_dict should be supplied')
    if json_filename:
        with codecs.open(json_filename, encoding='utf-8') as json_file:
            try:
                # parse_float=Decimal preserves numbers exactly as written.
                self.root_json_dict = json.load(
                    json_file,
                    object_pairs_hook=OrderedDict,
                    parse_float=Decimal)
            except ValueError as err:
                raise BadlyFormedJSONError(*err.args)
    else:
        self.root_json_dict = root_json_dict
def __init__(self,
             schema_filename=None,
             root_schema_dict=None,
             rollup=False,
             root_id=None,
             use_titles=False,
             disable_local_refs=False,
             truncation_length=3,
             exclude_deprecated_fields=False):
    """Set up parser state and load the JSON schema.

    Exactly one of ``schema_filename`` (local path or http(s) URL) and
    ``root_schema_dict`` must be supplied; ValueError is raised otherwise.
    """
    self.sub_sheets = {}
    self.main_sheet = Sheet()
    self.sub_sheet_mapping = {}
    self.rollup = rollup
    self.root_id = root_id
    self.use_titles = use_titles
    self.truncation_length = truncation_length
    self.title_lookup = TitleLookup()
    self.flattened = {}
    self.exclude_deprecated_fields = exclude_deprecated_fields
    if root_schema_dict is None and schema_filename is None:
        raise ValueError('One of schema_filename or root_schema_dict must be supplied')
    if root_schema_dict is not None and schema_filename is not None:
        raise ValueError('Only one of schema_filename or root_schema_dict should be supplied')
    if schema_filename:
        if schema_filename.startswith('http'):
            # Remote schema: fetch over HTTP(S), resolve $refs from the text.
            import requests
            r = requests.get(schema_filename)
            self.root_schema_dict = jsonref.loads(r.text, object_pairs_hook=OrderedDict)
        else:
            if disable_local_refs:
                # Use a loader that refuses to follow local-file $refs.
                with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict, loader=JsonLoaderLocalRefsDisabled())
            else:
                # Build a file:// base URI so relative $refs resolve next to
                # the schema file (Python 2 and 3 construct it differently).
                if sys.version_info[:2] > (3, 0):
                    base_uri = pathlib.Path(os.path.realpath(schema_filename)).as_uri()
                else:
                    base_uri = urlparse.urljoin('file:', urllib.pathname2url(os.path.abspath(schema_filename)))
                with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict, base_uri=base_uri)
    else:
        self.root_schema_dict = root_schema_dict
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure."""

    def __init__(self,
                 schema_filename=None,
                 root_schema_dict=None,
                 rollup=False,
                 root_id=None,
                 use_titles=False,
                 disable_local_refs=False,
                 truncation_length=3,
                 exclude_deprecated_fields=False):
        """Set up state and load the schema from a file/URL or a dict.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        supplied; ValueError is raised otherwise.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        # The rollup *request* flag is separate from self.rollup, which
        # accumulates the actual rolled-up paths during parsing.
        self.do_rollup = rollup
        self.rollup = set()
        self.root_id = root_id
        self.use_titles = use_titles
        self.truncation_length = truncation_length
        self.title_lookup = TitleLookup()
        self.flattened = {}
        self.exclude_deprecated_fields = exclude_deprecated_fields
        if root_schema_dict is None and schema_filename is None:
            raise ValueError(
                'One of schema_filename or root_schema_dict must be supplied')
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError(
                'Only one of schema_filename or root_schema_dict should be supplied'
            )
        if schema_filename:
            if schema_filename.startswith('http'):
                # Remote schema: fetch over HTTP(S).
                import requests
                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(
                    r.text, object_pairs_hook=OrderedDict)
            else:
                if disable_local_refs:
                    # Use a loader that refuses to follow local-file $refs.
                    with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            loader=JsonLoaderLocalRefsDisabled())
                else:
                    # Build a file:// base URI so relative $refs resolve next
                    # to the schema file (Python 2/3 construct it differently).
                    if sys.version_info[:2] > (3, 0):
                        base_uri = pathlib.Path(
                            os.path.realpath(schema_filename)).as_uri()
                    else:
                        base_uri = urlparse.urljoin(
                            'file:',
                            urllib.pathname2url(
                                os.path.abspath(schema_filename)))
                    with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            base_uri=base_uri)
        else:
            self.root_schema_dict = root_schema_dict

    def parse(self):
        """Walk the schema, populating main_sheet (and sub_sheets as a side effect)."""
        fields = self.parse_schema_dict('', self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn('Field {} does not have a title, skipping.'.format(
                        field))
                else:
                    self.main_sheet.append(title)
                    self.main_sheet.titles[field] = title
            else:
                self.main_sheet.append(field)

    def parse_schema_dict(self,
                          parent_path,
                          schema_dict,
                          parent_id_fields=None,
                          title_lookup=None,
                          parent_title=''):
        """Recursively walk ``schema_dict``, yielding (field, title) pairs.

        Side effects: records field types in self.flattened, creates
        sub-sheets in self.sub_sheets for arrays of objects, and collects
        rolled-up paths in self.rollup when self.do_rollup is set.
        """
        if parent_path:
            parent_path = parent_path + '/'
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup
        if 'type' in schema_dict and schema_dict['type'] == 'array' \
                and 'items' in schema_dict and 'oneOf' in schema_dict['items']:
            # Array whose items are a oneOf: recurse into each object option.
            for oneOf in schema_dict['items']['oneOf']:
                if 'type' in oneOf and oneOf['type'] == 'object':
                    for field, child_title in self.parse_schema_dict(
                            parent_path,
                            oneOf,
                            parent_id_fields=parent_id_fields,
                            title_lookup=title_lookup,
                            parent_title=parent_title):
                        yield (field, child_title)
        elif 'properties' in schema_dict:
            if 'id' in schema_dict['properties']:
                if self.use_titles:
                    id_fields = parent_id_fields + [
                        (parent_title if parent_title is not None else
                         parent_path) +
                        (schema_dict['properties']['id'].get('title') or 'id')
                    ]
                else:
                    id_fields = parent_id_fields + [parent_path + 'id']
            else:
                id_fields = parent_id_fields
            for property_name, property_schema_dict in schema_dict[
                    'properties'].items():
                if self.exclude_deprecated_fields and property_schema_dict.get(
                        'deprecated'):
                    continue
                property_type_set = get_property_type_set(property_schema_dict)
                title = property_schema_dict.get('title')
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name
                if 'object' in property_type_set:
                    self.flattened[parent_path + property_name] = "object"
                    for field, child_title in self.parse_schema_dict(
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title + title + ':'
                            if parent_title is not None and title else None):
                        yield (
                            property_name + '/' + field,
                            # TODO ambiguous use of "title"
                            (title + ':' + child_title
                             if title and child_title else None))
                elif 'array' in property_type_set:
                    flattened_key = parent_path.replace('/0/',
                                                        '/') + property_name
                    self.flattened[flattened_key] = "array"
                    type_set = get_property_type_set(
                        property_schema_dict['items'])
                    if 'string' in type_set or not type_set:
                        self.flattened[flattened_key] = "string_array"
                        yield property_name, title
                    elif 'number' in type_set:
                        self.flattened[flattened_key] = "number_array"
                        yield property_name, title
                    elif 'array' in type_set:
                        self.flattened[flattened_key] = "array_array"
                        nested_type_set = get_property_type_set(
                            property_schema_dict['items']['items'])
                        if 'string' in nested_type_set or 'number' in nested_type_set:
                            yield property_name, title
                        else:
                            # Fix: the bare `raise ValueError` gave no clue
                            # what went wrong; include the offending types.
                            raise ValueError(
                                'Unsupported array-of-array item types: {}'
                                .format(nested_type_set))
                    elif 'object' in type_set:
                        if title:
                            title_lookup[title].property_name = property_name
                        sub_sheet_name = make_sub_sheet_name(
                            parent_path,
                            property_name,
                            truncation_length=self.truncation_length)
                        #self.sub_sheet_mapping[parent_name+'/'+property_name] = sub_sheet_name
                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(
                                root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        sub_sheet.title_lookup = title_lookup.get(title)
                        # Propagate the parent id fields into the sub-sheet.
                        for field in id_fields:
                            sub_sheet.add_field(field, id_field=True)
                            sub_sheet.titles[title_lookup.lookup_header(
                                field)] = field
                        fields = self.parse_schema_dict(
                            parent_path + property_name + '/0',
                            property_schema_dict['items'],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title + title + ':'
                            if parent_title is not None and title else None)
                        rollup_fields = set()
                        for field, child_title in fields:
                            full_path = parent_path + property_name + '/0/' + field
                            if self.use_titles:
                                if not child_title or parent_title is None:
                                    warn(
                                        'Field {}{}/0/{} is missing a title, skipping.'
                                        .format(parent_path, property_name,
                                                field))
                                elif not title:
                                    warn(
                                        'Field {}{} does not have a title, skipping it and all its children.'
                                        .format(parent_path, property_name))
                                else:
                                    # This code only works for arrays that are at 0 or 1 layer of nesting
                                    full_title = parent_title + title + ':' + child_title
                                    sub_sheet.add_field(full_title)
                                    sub_sheet.titles[full_path] = full_title
                            else:
                                sub_sheet.add_field(full_path)
                            if self.do_rollup and 'rollUp' in property_schema_dict and field in property_schema_dict[
                                    'rollUp']:
                                rollup_fields.add(field)
                                self.rollup.add(full_path)
                            yield property_name + '/0/' + field, (
                                title + ':' + child_title
                                if title and child_title else None)
                        # Check that all items in rollUp are in the schema
                        if self.do_rollup and 'rollUp' in property_schema_dict:
                            missedRollUp = set(
                                property_schema_dict['rollUp']) - rollup_fields
                            if missedRollUp:
                                warn('{} in rollUp but not in schema'.format(
                                    ', '.join(missedRollUp)))
                    else:
                        # Fix: "explicity" -> "explicitly" in the message.
                        raise ValueError(
                            'Unknown type_set: {}, did you forget to explicitly set the "type" key on "items"?'
                            .format(type_set))
                elif 'string' in property_type_set or not property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "string"
                    yield property_name, title
                elif 'number' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "number"
                    yield property_name, title
                elif 'integer' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "integer"
                    yield property_name, title
                elif 'boolean' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "boolean"
                    yield property_name, title
                else:
                    # Fix: the two adjacent literals previously concatenated
                    # without a space ('..."{}",so this property...').
                    warn(
                        'Unrecognised types {} for property "{}" with context "{}", '
                        'so this property has been ignored.'.format(
                            repr(property_type_set), property_name,
                            parent_path))
        else:
            warn('Skipping field "{}", because it has no properties.'.format(
                parent_path))
def parse_schema_dict(self,
                      parent_path,
                      schema_dict,
                      parent_id_fields=None,
                      title_lookup=None,
                      parent_title=''):
    """Recursively walk ``schema_dict``, yielding (field, title) pairs.

    Side effects: records field types in self.flattened, creates sub-sheets
    in self.sub_sheets for arrays of objects, and collects rolled-up paths
    in self.rollup when self.do_rollup is set.
    """
    if parent_path:
        parent_path = parent_path + '/'
    parent_id_fields = parent_id_fields or []
    title_lookup = self.title_lookup if title_lookup is None else title_lookup
    if 'type' in schema_dict and schema_dict['type'] == 'array' \
            and 'items' in schema_dict and 'oneOf' in schema_dict['items']:
        # Array whose items are a oneOf: recurse into each object option.
        for oneOf in schema_dict['items']['oneOf']:
            if 'type' in oneOf and oneOf['type'] == 'object':
                for field, child_title in self.parse_schema_dict(
                        parent_path,
                        oneOf,
                        parent_id_fields=parent_id_fields,
                        title_lookup=title_lookup,
                        parent_title=parent_title):
                    yield (field, child_title)
    elif 'properties' in schema_dict:
        if 'id' in schema_dict['properties']:
            if self.use_titles:
                id_fields = parent_id_fields + [
                    (parent_title if parent_title is not None else
                     parent_path) +
                    (schema_dict['properties']['id'].get('title') or 'id')
                ]
            else:
                id_fields = parent_id_fields + [parent_path + 'id']
        else:
            id_fields = parent_id_fields
        for property_name, property_schema_dict in schema_dict[
                'properties'].items():
            if self.exclude_deprecated_fields and property_schema_dict.get(
                    'deprecated'):
                continue
            property_type_set = get_property_type_set(property_schema_dict)
            title = property_schema_dict.get('title')
            if title:
                title_lookup[title] = TitleLookup()
                title_lookup[title].property_name = property_name
            if 'object' in property_type_set:
                self.flattened[parent_path + property_name] = "object"
                for field, child_title in self.parse_schema_dict(
                        parent_path + property_name,
                        property_schema_dict,
                        parent_id_fields=id_fields,
                        title_lookup=title_lookup.get(title),
                        parent_title=parent_title + title + ':'
                        if parent_title is not None and title else None):
                    yield (
                        property_name + '/' + field,
                        # TODO ambiguous use of "title"
                        (title + ':' + child_title
                         if title and child_title else None))
            elif 'array' in property_type_set:
                flattened_key = parent_path.replace('/0/', '/') + property_name
                self.flattened[flattened_key] = "array"
                type_set = get_property_type_set(property_schema_dict['items'])
                if 'string' in type_set or not type_set:
                    self.flattened[flattened_key] = "string_array"
                    yield property_name, title
                elif 'number' in type_set:
                    self.flattened[flattened_key] = "number_array"
                    yield property_name, title
                elif 'array' in type_set:
                    self.flattened[flattened_key] = "array_array"
                    nested_type_set = get_property_type_set(
                        property_schema_dict['items']['items'])
                    if 'string' in nested_type_set or 'number' in nested_type_set:
                        yield property_name, title
                    else:
                        # Fix: the bare `raise ValueError` gave no clue what
                        # went wrong; include the offending type set.
                        raise ValueError(
                            'Unsupported array-of-array item types: {}'.format(
                                nested_type_set))
                elif 'object' in type_set:
                    if title:
                        title_lookup[title].property_name = property_name
                    sub_sheet_name = make_sub_sheet_name(
                        parent_path,
                        property_name,
                        truncation_length=self.truncation_length)
                    #self.sub_sheet_mapping[parent_name+'/'+property_name] = sub_sheet_name
                    if sub_sheet_name not in self.sub_sheets:
                        self.sub_sheets[sub_sheet_name] = Sheet(
                            root_id=self.root_id, name=sub_sheet_name)
                    sub_sheet = self.sub_sheets[sub_sheet_name]
                    sub_sheet.title_lookup = title_lookup.get(title)
                    # Propagate the parent id fields into the sub-sheet.
                    for field in id_fields:
                        sub_sheet.add_field(field, id_field=True)
                        sub_sheet.titles[title_lookup.lookup_header(
                            field)] = field
                    fields = self.parse_schema_dict(
                        parent_path + property_name + '/0',
                        property_schema_dict['items'],
                        parent_id_fields=id_fields,
                        title_lookup=title_lookup.get(title),
                        parent_title=parent_title + title + ':'
                        if parent_title is not None and title else None)
                    rollup_fields = set()
                    for field, child_title in fields:
                        full_path = parent_path + property_name + '/0/' + field
                        if self.use_titles:
                            if not child_title or parent_title is None:
                                warn(
                                    'Field {}{}/0/{} is missing a title, skipping.'
                                    .format(parent_path, property_name, field))
                            elif not title:
                                warn(
                                    'Field {}{} does not have a title, skipping it and all its children.'
                                    .format(parent_path, property_name))
                            else:
                                # This code only works for arrays that are at 0 or 1 layer of nesting
                                full_title = parent_title + title + ':' + child_title
                                sub_sheet.add_field(full_title)
                                sub_sheet.titles[full_path] = full_title
                        else:
                            sub_sheet.add_field(full_path)
                        if self.do_rollup and 'rollUp' in property_schema_dict and field in property_schema_dict[
                                'rollUp']:
                            rollup_fields.add(field)
                            self.rollup.add(full_path)
                        yield property_name + '/0/' + field, (
                            title + ':' + child_title
                            if title and child_title else None)
                    # Check that all items in rollUp are in the schema
                    if self.do_rollup and 'rollUp' in property_schema_dict:
                        missedRollUp = set(
                            property_schema_dict['rollUp']) - rollup_fields
                        if missedRollUp:
                            warn('{} in rollUp but not in schema'.format(
                                ', '.join(missedRollUp)))
                else:
                    # Fix: "explicity" -> "explicitly" in the message.
                    raise ValueError(
                        'Unknown type_set: {}, did you forget to explicitly set the "type" key on "items"?'
                        .format(type_set))
            elif 'string' in property_type_set or not property_type_set:
                self.flattened[parent_path.replace('/0/', '/') +
                               property_name] = "string"
                yield property_name, title
            elif 'number' in property_type_set:
                self.flattened[parent_path.replace('/0/', '/') +
                               property_name] = "number"
                yield property_name, title
            elif 'integer' in property_type_set:
                self.flattened[parent_path.replace('/0/', '/') +
                               property_name] = "integer"
                yield property_name, title
            elif 'boolean' in property_type_set:
                self.flattened[parent_path.replace('/0/', '/') +
                               property_name] = "boolean"
                yield property_name, title
            else:
                # Fix: the two adjacent literals previously concatenated
                # without a space ('..."{}",so this property...').
                warn(
                    'Unrecognised types {} for property "{}" with context "{}", '
                    'so this property has been ignored.'.format(
                        repr(property_type_set), property_name, parent_path))
    else:
        warn('Skipping field "{}", because it has no properties.'.format(
            parent_path))
def __init__(
    self,
    json_filename=None,
    root_json_dict=None,
    schema_parser=None,
    root_list_path=None,
    root_id="ocid",
    use_titles=False,
    xml=False,
    id_name="id",
    filter_field=None,
    filter_value=None,
    preserve_fields=None,
    remove_empty_schema_columns=False,
    rollup=False,
    truncation_length=3,
):
    """Load the JSON (or XML) input to flatten and resolve all options.

    Exactly one of ``json_filename`` and ``root_json_dict`` must be given
    (for XML input, ``json_filename`` names the XML file). ``rollup`` may
    be True, a list of json paths, or a single-element list naming a file
    of paths; ``preserve_fields`` names a file listing fields to keep.
    """
    self.sub_sheets = {}
    self.main_sheet = Sheet()
    self.root_list_path = root_list_path
    self.root_id = root_id
    self.use_titles = use_titles
    self.truncation_length = truncation_length
    self.id_name = id_name
    self.xml = xml
    self.filter_field = filter_field
    self.filter_value = filter_value
    self.remove_empty_schema_columns = remove_empty_schema_columns
    self.seen_paths = set()
    if schema_parser:
        # Deep-copied so this parser can mutate sheets without affecting
        # the schema parser's own copies.
        self.main_sheet = copy.deepcopy(schema_parser.main_sheet)
        self.sub_sheets = copy.deepcopy(schema_parser.sub_sheets)
        if remove_empty_schema_columns:
            # Don't use columns from the schema parser
            # (avoids empty columns)
            self.main_sheet.columns = []
            for sheet_name, sheet in list(self.sub_sheets.items()):
                sheet.columns = []
        self.schema_parser = schema_parser
    else:
        self.schema_parser = None
    self.rollup = False
    if rollup:
        if schema_parser and len(schema_parser.rollup) > 0:
            # If rollUp is present in the schema this takes precedence over direct input.
            self.rollup = schema_parser.rollup
            if isinstance(rollup, (list, )) and (
                    len(rollup) > 1 or
                (len(rollup) == 1 and rollup[0] is not True)):
                warn(
                    _("Using rollUp values from schema, ignoring direct input."
                      ))
        elif isinstance(rollup, (list, )):
            if len(rollup) == 1 and os.path.isfile(rollup[0]):
                # Parse file, one json path per line.
                rollup_from_file = set()
                with open(rollup[0]) as rollup_file:
                    for line in rollup_file:
                        line = line.strip()
                        rollup_from_file.add(line)
                self.rollup = rollup_from_file
            # Rollup args passed directly at the commandline
            elif len(rollup) == 1 and rollup[0] is True:
                warn(
                    _("No fields to rollup found (pass json path directly, as a list in a file, or via a schema)"
                      ))
            else:
                self.rollup = set(rollup)
        else:
            warn(
                _("Invalid value passed for rollup (pass json path directly, as a list in a file, or via a schema)"
                  ))
    if self.xml:
        with codecs.open(json_filename, "rb") as xml_file:
            top_dict = xmltodict.parse(
                xml_file,
                force_list=(root_list_path, ),
                force_cdata=True,
            )
            # AFAICT, this should be true for *all* XML files
            assert len(top_dict) == 1
            root_json_dict = list(top_dict.values())[0]
            list_dict_consistency(root_json_dict)
        json_filename = None
    if json_filename is None and root_json_dict is None:
        # NOTE(review): "Etiher" is a typo, but it is a gettext msgid —
        # fix it together with the translation catalogues.
        raise ValueError(
            _("Etiher json_filename or root_json_dict must be supplied"))
    if json_filename is not None and root_json_dict is not None:
        raise ValueError(
            _("Only one of json_file or root_json_dict should be supplied")
        )
    if json_filename:
        with codecs.open(json_filename, encoding="utf-8") as json_file:
            try:
                # parse_float=Decimal preserves numbers exactly as written.
                self.root_json_dict = json.load(
                    json_file,
                    object_pairs_hook=OrderedDict,
                    parse_float=Decimal)
            except UnicodeError as err:
                raise BadlyFormedJSONErrorUTF8(*err.args)
            except ValueError as err:
                raise BadlyFormedJSONError(*err.args)
    else:
        self.root_json_dict = root_json_dict
    if preserve_fields:
        # Extract fields to be preserved from input file (one path per line)
        preserve_fields_all = []
        preserve_fields_input = []
        with open(preserve_fields) as preserve_fields_file:
            for line in preserve_fields_file:
                line = line.strip()
                path_fields = line.rsplit("/", 1)
                # Keep both the full path and its components so ancestor
                # paths are also preserved.
                preserve_fields_all = (preserve_fields_all + path_fields +
                                       [line.rstrip("/")])
                preserve_fields_input = preserve_fields_input + [
                    line.rstrip("/")
                ]
        self.preserve_fields = set(preserve_fields_all)
        self.preserve_fields_input = set(preserve_fields_input)
        try:
            input_not_in_schema = set()
            for field in self.preserve_fields_input:
                if field not in self.schema_parser.flattened.keys():
                    input_not_in_schema.add(field)
            warn(
                _("You wanted to preserve the following fields which are not present in the supplied schema: {}"
                  ).format(list(input_not_in_schema)))
        except AttributeError:
            # no schema
            pass
    else:
        self.preserve_fields = None
        self.preserve_fields_input = None
def parse_json_dict(
    self,
    json_dict,
    sheet,
    json_key=None,
    parent_name="",
    flattened_dict=None,
    parent_id_fields=None,
    top_level_of_sub_sheet=False,
):
    """
    Parse a json dictionary.

    json_dict - the json dictionary
    sheet - a sheet.Sheet object representing the resulting spreadsheet
    json_key - the key that maps to this JSON dict, either directly to the
               dict, or to a dict that this list contains.  Is None if this
               dict is contained in root_json_list directly.
    parent_name - slash-terminated JSON-path prefix of this dict (e.g.
                  "tender/items/0/"); "" at the top level.
    flattened_dict - accumulator for the current output row; when None this
                     call is the top of a row and the finished row is
                     appended to sheet.lines at the end.
    parent_id_fields - ordered mapping of id-column keys to values inherited
                       from enclosing objects; copied so siblings don't
                       share mutations.
    top_level_of_sub_sheet - True when this dict is an element of an array
                             being written to its own sub-sheet, in which
                             case the inherited id columns are emitted.
    """
    # Possibly main_sheet should be main_sheet_columns, but this is
    # currently named for consistency with schema.py

    # Choose how cells are keyed: by schema title or by JSON field path.
    if self.use_titles:
        sheet_key = sheet_key_title
    else:
        sheet_key = sheet_key_field

    # Shallow-copy so additions here don't leak back into the caller's dict.
    parent_id_fields = copy.copy(parent_id_fields) or OrderedDict()

    # A fresh accumulator marks this invocation as the top of an output row.
    if flattened_dict is None:
        flattened_dict = {}
        top = True
    else:
        top = False

    # Optional row filtering: silently drop top-level objects that don't
    # match the configured filter field/value.
    if parent_name == "" and self.filter_field and self.filter_value:
        if self.filter_field not in json_dict:
            return
        if json_dict[self.filter_field] != self.filter_value:
            return

    if top_level_of_sub_sheet:
        # Add the IDs for the top level of object in an array
        for k, v in parent_id_fields.items():
            if self.xml:
                # xmltodict wraps text content under the "#text" key
                flattened_dict[sheet_key(sheet, k)] = v["#text"]
            else:
                flattened_dict[sheet_key(sheet, k)] = v

    # Propagate the root id (e.g. "ocid") and this object's id down to any
    # sub-sheets created for nested arrays.
    if self.root_id and self.root_id in json_dict:
        parent_id_fields[sheet_key(sheet, self.root_id)] = json_dict[self.root_id]

    if self.id_name in json_dict:
        parent_id_fields[sheet_key(sheet, parent_name + self.id_name)] = json_dict[self.id_name]

    for key, value in json_dict.items():

        # Keep a unique list of all the JSON paths in the data that have been seen.
        parent_path = parent_name.replace("/0", "")
        full_path = parent_path + key
        self.seen_paths.add(full_path)

        if self.preserve_fields:
            # Only skip a field when at least one preserved path shares this
            # parent (i.e. the user expressed an opinion about this level).
            siblings = False
            for field in self.preserve_fields:
                if parent_path in field:
                    siblings = True
            if siblings and full_path not in self.preserve_fields:
                continue

        if type(value) in BASIC_TYPES:
            if self.xml and key == "#text":
                # Handle the text output from xmltodict
                key = ""
                parent_name = parent_name.strip("/")
            flattened_dict[sheet_key(sheet, parent_name + key)] = value
        elif hasattr(value, "items"):
            # Nested object: flatten into the same row with an extended path.
            self.parse_json_dict(
                value,
                sheet=sheet,
                json_key=key,
                parent_name=parent_name + key + "/",
                flattened_dict=flattened_dict,
                parent_id_fields=parent_id_fields,
            )
        elif hasattr(value, "__iter__"):
            if all(type(x) in BASIC_TYPES for x in value):
                # Check for an array of BASIC types
                # TODO Make this check the schema
                # TODO Error if the any of the values contain the seperator
                # TODO Support doubly nested arrays
                flattened_dict[sheet_key(sheet, parent_name + key)] = ";".join(
                    map(str, value))
            else:
                # Array of objects: optionally roll single-element arrays up
                # into the main sheet, then emit every element to a sub-sheet.
                if (self.rollup and parent_name == ""
                    ):  # Rollup only currently possible to main sheet
                    if self.use_titles and not self.schema_parser:
                        warn(
                            _("Warning: No schema was provided so column headings are JSON keys, not titles."
                              ))
                    if len(value) == 1:
                        # Single element: copy its basic-typed values into
                        # the parent row under ".../0/<field>" columns.
                        for k, v in value[0].items():
                            if (self.preserve_fields and parent_name + key +
                                    "/" + k not in self.preserve_fields):
                                continue
                            if type(v) not in BASIC_TYPES:
                                raise ValueError(
                                    _("Rolled up values must be basic types"))
                            else:
                                if self.schema_parser:
                                    # We want titles and there's a schema and rollUp is in it
                                    if (self.use_titles
                                            and parent_name + key + "/0/" + k
                                            in self.schema_parser.main_sheet.titles):
                                        flattened_dict[sheet_key_title(
                                            sheet,
                                            parent_name + key + "/0/" + k)] = v
                                    # We want titles and there's a schema but rollUp isn't in it
                                    # so the titles for rollup properties aren't in the main sheet
                                    # so we need to try to get the titles from a subsheet
                                    elif (self.use_titles
                                          and parent_name + key in self.rollup
                                          and self.schema_parser.sub_sheet_titles.get((
                                              parent_name,
                                              key,
                                          )) in self.schema_parser.sub_sheets):
                                        relevant_subsheet = self.schema_parser.sub_sheets.get(
                                            self.schema_parser.sub_sheet_titles.get((
                                                parent_name,
                                                key,
                                            )))
                                        if relevant_subsheet is not None:
                                            rollup_field_title = sheet_key_title(
                                                relevant_subsheet,
                                                parent_name + key + "/0/" + k,
                                            )
                                            flattened_dict[sheet_key(
                                                sheet, rollup_field_title)] = v
                                    # We don't want titles even though there's a schema
                                    elif not self.use_titles and (
                                            parent_name + key + "/0/" + k
                                            in self.schema_parser.main_sheet
                                            or parent_name + key in self.rollup):
                                        flattened_dict[sheet_key(
                                            sheet,
                                            parent_name + key + "/0/" + k)] = v
                                # No schema, so no titles
                                elif parent_name + key in self.rollup:
                                    flattened_dict[sheet_key(
                                        sheet,
                                        parent_name + key + "/0/" + k)] = v
                    elif len(value) > 1:
                        # Several elements: rollup is impossible, so write a
                        # warning cell instead of a value for each rollup column.
                        for k in set(
                                sum((list(x.keys()) for x in value), [])):
                            if (self.preserve_fields and parent_name + key +
                                    "/" + k not in self.preserve_fields):
                                continue
                            if (self.schema_parser
                                    and parent_name + key + "/0/" + k
                                    in self.schema_parser.main_sheet):
                                warn(
                                    _('More than one value supplied for "{}". Could not provide rollup, so adding a warning to the relevant cell(s) in the spreadsheet.'
                                      ).format(parent_name + key))
                                flattened_dict[sheet_key(
                                    sheet, parent_name + key + "/0/" + k
                                )] = _(
                                    "WARNING: More than one value supplied, consult the relevant sub-sheet for the data."
                                )
                            elif parent_name + key in self.rollup:
                                warn(
                                    _('More than one value supplied for "{}". Could not provide rollup, so adding a warning to the relevant cell(s) in the spreadsheet.'
                                      ).format(parent_name + key))
                                flattened_dict[sheet_key(
                                    sheet, parent_name + key + "/0/" + k
                                )] = _(
                                    "WARNING: More than one value supplied, consult the relevant sub-sheet for the data."
                                )

                # Prefer the schema-derived sub-sheet title when titles are in
                # use; otherwise derive a (possibly truncated) name from the path.
                if (self.use_titles and self.schema_parser and (
                        parent_name,
                        key,
                ) in self.schema_parser.sub_sheet_titles):
                    sub_sheet_name = self.schema_parser.sub_sheet_titles[(
                        parent_name,
                        key,
                    )]
                else:
                    sub_sheet_name = make_sub_sheet_name(
                        parent_name, key,
                        truncation_length=self.truncation_length)
                if sub_sheet_name not in self.sub_sheets:
                    self.sub_sheets[sub_sheet_name] = Sheet(
                        name=sub_sheet_name)

                # Each array element becomes its own row on the sub-sheet.
                for json_dict in value:
                    if json_dict is None:
                        continue
                    self.parse_json_dict(
                        json_dict,
                        sheet=self.sub_sheets[sub_sheet_name],
                        json_key=key,
                        parent_id_fields=parent_id_fields,
                        parent_name=parent_name + key + "/0/",
                        top_level_of_sub_sheet=True,
                    )
        else:
            raise ValueError(_("Unsupported type {}").format(type(value)))

    # Only the outermost call appends the finished row.
    if top:
        sheet.lines.append(flattened_dict)
def parse_schema_dict(
    self,
    parent_path,
    schema_dict,
    parent_id_fields=None,
    title_lookup=None,
    parent_title="",
):
    """
    Recursively walk a JSON-schema fragment, yielding (field, title) pairs
    for every flattenable property.

    Side effects: records each property's kind in self.flattened, creates
    Sheet objects in self.sub_sheets for arrays of objects, and adds rolled
    up paths to self.rollup.

    parent_path - slash-joined path of ancestor properties ("" at root);
                  "/0" segments mark array nesting.
    schema_dict - the schema fragment for this level.
    parent_id_fields - id column paths inherited from ancestors, copied into
                       every sub-sheet so rows can be joined back.
    title_lookup - TitleLookup node for this level (root lookup when None).
    parent_title - colon-joined ancestor titles, or None once any ancestor
                   lacked a title (which disables titles below it).
    """
    if parent_path:
        parent_path = parent_path + "/"
    parent_id_fields = parent_id_fields or []
    title_lookup = self.title_lookup if title_lookup is None else title_lookup

    # An array whose items use oneOf: recurse into each object alternative,
    # flattening them all at this same level.
    if ("type" in schema_dict and schema_dict["type"] == "array"
            and "items" in schema_dict and "oneOf" in schema_dict["items"]):
        for oneOf in schema_dict["items"]["oneOf"]:
            if "type" in oneOf and oneOf["type"] == "object":
                for field, child_title in self.parse_schema_dict(
                        parent_path,
                        oneOf,
                        parent_id_fields=parent_id_fields,
                        title_lookup=title_lookup,
                        parent_title=parent_title,
                ):
                    yield (field, child_title)
    elif "properties" in schema_dict:
        # An "id" property at this level becomes an id column that is
        # propagated to all descendant sub-sheets.
        if "id" in schema_dict["properties"]:
            if self.use_titles:
                id_fields = parent_id_fields + [
                    (parent_title
                     if parent_title is not None else parent_path) +
                    (schema_dict["properties"]["id"].get("title") or "id")
                ]
            else:
                id_fields = parent_id_fields + [parent_path + "id"]
        else:
            id_fields = parent_id_fields

        for property_name, property_schema_dict in schema_dict[
                "properties"].items():
            if self.exclude_deprecated_fields and property_schema_dict.get(
                    "deprecated"):
                continue

            property_type_set = get_property_type_set(property_schema_dict)

            title = property_schema_dict.get("title")
            if title:
                title_lookup[title] = TitleLookup()
                title_lookup[title].property_name = property_name

            if "object" in property_type_set:
                # Nested object: flatten its children into this sheet with a
                # "parent/child" path (and "Parent:Child" title).
                self.flattened[parent_path + property_name] = "object"
                for field, child_title in self.parse_schema_dict(
                        parent_path + property_name,
                        property_schema_dict,
                        parent_id_fields=id_fields,
                        title_lookup=title_lookup.get(title),
                        parent_title=parent_title + title + ":"
                        if parent_title is not None and title else None,
                ):
                    yield (
                        property_name + "/" + field,
                        # TODO ambiguous use of "title"
                        (title + ":" + child_title
                         if title and child_title else None),
                    )
            elif "array" in property_type_set:
                # Normalise away the "/0/" array markers for the flattened key.
                flattened_key = parent_path.replace("/0/",
                                                    "/") + property_name
                self.flattened[flattened_key] = "array"
                type_set = get_property_type_set(
                    property_schema_dict["items"])
                if "string" in type_set or not type_set:
                    self.flattened[flattened_key] = "string_array"
                    yield property_name, title
                elif "number" in type_set:
                    self.flattened[flattened_key] = "number_array"
                    yield property_name, title
                elif "array" in type_set:
                    # Doubly nested arrays are only supported for basic types.
                    self.flattened[flattened_key] = "array_array"
                    nested_type_set = get_property_type_set(
                        property_schema_dict["items"]["items"])
                    if "string" in nested_type_set or "number" in nested_type_set:
                        yield property_name, title
                    else:
                        raise ValueError
                elif "object" in type_set:
                    # Array of objects: gets its own sub-sheet.
                    if title:
                        title_lookup[title].property_name = property_name
                    sub_sheet_name = make_sub_sheet_name(
                        parent_path,
                        property_name,
                        truncation_length=self.truncation_length,
                    )
                    # self.sub_sheet_mapping[parent_name+'/'+property_name] = sub_sheet_name
                    if sub_sheet_name not in self.sub_sheets:
                        self.sub_sheets[sub_sheet_name] = Sheet(
                            root_id=self.root_id, name=sub_sheet_name)
                    sub_sheet = self.sub_sheets[sub_sheet_name]
                    sub_sheet.title_lookup = title_lookup.get(title)

                    # Inherited id columns come first so sub-sheet rows can be
                    # related back to their parents.
                    for field in id_fields:
                        sub_sheet.add_field(field, id_field=True)
                        sub_sheet.titles[title_lookup.lookup_header(
                            field)] = field
                    fields = self.parse_schema_dict(
                        parent_path + property_name + "/0",
                        property_schema_dict["items"],
                        parent_id_fields=id_fields,
                        title_lookup=title_lookup.get(title),
                        parent_title=parent_title + title + ":"
                        if parent_title is not None and title else None,
                    )

                    rollup_fields = set()
                    for field, child_title in fields:
                        full_path = parent_path + property_name + "/0/" + field
                        if self.use_titles:
                            if not child_title or parent_title is None:
                                warn(
                                    "Field {}{}/0/{} is missing a title, skipping."
                                    .format(parent_path, property_name,
                                            field))
                            elif not title:
                                warn(
                                    "Field {}{} does not have a title, skipping it and all its children."
                                    .format(parent_path, property_name))
                            else:
                                # This code only works for arrays that are at 0 or 1 layer of nesting
                                full_title = (parent_title + title + ":" +
                                              child_title)
                                sub_sheet.add_field(full_title)
                                sub_sheet.titles[full_path] = full_title
                        else:
                            sub_sheet.add_field(full_path)
                        # Track which declared rollUp fields actually exist.
                        if (self.do_rollup
                                and "rollUp" in property_schema_dict
                                and field in property_schema_dict["rollUp"]):
                            rollup_fields.add(field)
                            self.rollup.add(full_path)
                        yield property_name + "/0/" + field, (
                            title + ":" + child_title
                            if title and child_title else None)

                    # Check that all items in rollUp are in the schema
                    if self.do_rollup and "rollUp" in property_schema_dict:
                        missedRollUp = (
                            set(property_schema_dict["rollUp"]) -
                            rollup_fields)
                        if missedRollUp:
                            warn("{} in rollUp but not in schema".format(
                                ", ".join(missedRollUp)))
                else:
                    raise ValueError(
                        'Unknown type_set: {}, did you forget to explicity set the "type" key on "items"?'
                        .format(type_set))
            elif "string" in property_type_set or not property_type_set:
                # Untyped properties are treated as strings.
                self.flattened[parent_path.replace("/0/", "/") +
                               property_name] = "string"
                yield property_name, title
            elif "number" in property_type_set:
                self.flattened[parent_path.replace("/0/", "/") +
                               property_name] = "number"
                yield property_name, title
            elif "integer" in property_type_set:
                self.flattened[parent_path.replace("/0/", "/") +
                               property_name] = "integer"
                yield property_name, title
            elif "boolean" in property_type_set:
                self.flattened[parent_path.replace("/0/", "/") +
                               property_name] = "boolean"
                yield property_name, title
            else:
                warn(
                    'Unrecognised types {} for property "{}" with context "{}",'
                    "so this property has been ignored.".format(
                        repr(property_type_set), property_name, parent_path))
    else:
        warn('Skipping field "{}", because it has no properties.'.format(
            parent_path))
def parse_schema_dict(self,
                      parent_name,
                      parent_path,
                      schema_dict,
                      parent_id_fields=None,
                      title_lookup=None):
    """
    Recursively walk a JSON-schema fragment, yielding (field, title) pairs
    for every flattenable property.

    Side effects: records each property's kind in self.flattened and creates
    Sheet objects in self.sub_sheets for arrays of objects.

    parent_name - slash-joined sheet-oriented name of this level, used for
                  sub-sheet mapping keys and id fields.
    parent_path - slash-joined JSON path of this level ("" at root).
    schema_dict - the schema fragment for this level.
    parent_id_fields - id field names inherited from ancestors.
    title_lookup - TitleLookup node for this level (root lookup when None).
    """
    if parent_path:
        parent_path = parent_path + '/'
    parent_id_fields = parent_id_fields or []
    title_lookup = self.title_lookup if title_lookup is None else title_lookup
    if 'properties' in schema_dict:
        # An "id" property at this level is propagated to descendant
        # sub-sheets so rows can be joined back to their parents.
        if 'id' in schema_dict['properties']:
            id_fields = parent_id_fields + [parent_name + '/id']
        else:
            id_fields = parent_id_fields
        for property_name, property_schema_dict in schema_dict[
                'properties'].items():
            property_type_set = get_property_type_set(property_schema_dict)

            title = property_schema_dict.get('title')
            if title:
                title_lookup[title] = TitleLookup()
                title_lookup[title].property_name = property_name

            if 'object' in property_type_set:
                # Nested object: flatten its children into this sheet with a
                # "parent/child" field name (and "Parent:Child" title).
                self.flattened[parent_path + property_name] = "object"
                for field, child_title in self.parse_schema_dict(
                        parent_name + '/' + property_name,
                        parent_path + property_name,
                        property_schema_dict,
                        parent_id_fields=id_fields,
                        title_lookup=title_lookup.get(title)):
                    yield (
                        property_name + '/' + field,
                        # TODO ambiguous use of "title"
                        (title + ':' + child_title
                         if title and child_title else None))
            elif 'array' in property_type_set:
                self.flattened[parent_path + property_name] = "array"
                type_set = get_property_type_set(
                    property_schema_dict['items'])
                if 'string' in type_set:
                    # Arrays of basic types are tagged with a ":array" suffix.
                    self.flattened[parent_path +
                                   property_name] = "string_array"
                    yield property_name + ':array', title
                elif 'array' in type_set:
                    # Doubly nested arrays are only supported for strings.
                    self.flattened[parent_path +
                                   property_name] = "array_array"
                    if 'string' in get_property_type_set(
                            property_schema_dict['items']['items']):
                        yield property_name + ':array', title
                    else:
                        raise ValueError
                elif 'object' in type_set:
                    # Array of objects: gets its own sub-sheet, named after
                    # the $ref it resolves to when one is available.
                    if title:
                        title_lookup[
                            title].property_name = property_name + '[]'
                    if hasattr(property_schema_dict['items'],
                               '__reference__'):
                        sub_sheet_name = property_schema_dict[
                            'items'].__reference__['$ref'].split('/')[-1]
                    else:
                        sub_sheet_name = property_name
                    self.sub_sheet_mapping[parent_name + '/' +
                                           property_name] = sub_sheet_name
                    if sub_sheet_name not in self.sub_sheets:
                        self.sub_sheets[sub_sheet_name] = Sheet(
                            root_id=self.root_id, name=sub_sheet_name)
                    sub_sheet = self.sub_sheets[sub_sheet_name]
                    sub_sheet.title_lookup = title_lookup.get(title)

                    # Inherited id columns come first.
                    for field in id_fields:
                        sub_sheet.add_field(field + ':' + property_name,
                                            id_field=True)
                    fields = self.parse_schema_dict(
                        parent_name + '/' + property_name + '[]',
                        parent_path + property_name,
                        property_schema_dict['items'],
                        parent_id_fields=id_fields,
                        title_lookup=title_lookup.get(title))
                    rolledUp = set()
                    for field, child_title in fields:
                        if self.use_titles:
                            if not child_title:
                                warn(
                                    'Field {} does not have a title, skipping.'
                                    .format(field))
                            else:
                                sub_sheet.add_field(child_title)
                        else:
                            sub_sheet.add_field(field)
                        # Track which declared rollUp fields actually exist.
                        if self.rollup and 'rollUp' in property_schema_dict and field in property_schema_dict[
                                'rollUp']:
                            rolledUp.add(field)
                        yield property_name + '[]/' + field, (
                            title + ':' + child_title
                            if title and child_title else None)
                    # Check that all items in rollUp are in the schema
                    if self.rollup and 'rollUp' in property_schema_dict:
                        missedRollUp = set(
                            property_schema_dict['rollUp']) - rolledUp
                        if missedRollUp:
                            warn('{} in rollUp but not in schema'.format(
                                ', '.join(missedRollUp)))
                else:
                    raise ValueError
            elif 'string' in property_type_set:
                self.flattened[parent_path + property_name] = "string"
                yield property_name, title
            elif 'number' in property_type_set:
                self.flattened[parent_path + property_name] = "number"
                yield property_name + ':number', title
            elif 'integer' in property_type_set:
                self.flattened[parent_path + property_name] = "integer"
                yield property_name + ':integer', title
            elif 'boolean' in property_type_set:
                self.flattened[parent_path + property_name] = "boolean"
                yield property_name + ':boolean', title
            else:
                warn(
                    'Unrecognised types {} for property "{}" with context "{}",'
                    'so this property has been ignored.'.format(
                        repr(property_type_set), property_name, parent_name))
    else:
        warn('Skipping field "{}", because it has no properties.'.format(
            parent_name))
def __init__(self, main_sheet, sub_sheets):
    """Wrap the given main sheet and each named sub-sheet in Sheet objects."""
    self.main_sheet = Sheet(main_sheet)
    wrapped = {}
    for sheet_name, sheet_spec in sub_sheets.items():
        wrapped[sheet_name] = Sheet(sheet_spec)
    self.sub_sheets = wrapped
def parse_json_dict(self,
                    json_dict,
                    sheet,
                    json_key=None,
                    parent_name='',
                    flattened_dict=None,
                    parent_id_fields=None,
                    top_level_of_sub_sheet=False):
    """
    Parse a json dictionary.

    json_dict - the json dictionary
    sheet - a sheet.Sheet object representing the resulting spreadsheet
    json_key - the key that maps to this JSON dict, either directly to the
               dict, or to a dict that this list contains.  Is None if this
               dict is contained in root_json_list directly.
    parent_name - slash-terminated JSON-path prefix of this dict; "" at the
                  top level.
    flattened_dict - accumulator for the current output row; when None this
                     call is the top of a row and the finished row is
                     appended to sheet.lines at the end.
    parent_id_fields - mapping of id-column keys to values inherited from
                       enclosing objects; copied so siblings don't share
                       mutations.
    top_level_of_sub_sheet - True when this dict is an element of an array
                             being written to its own sub-sheet.
    """
    # Possibly main_sheet should be main_sheet_columns, but this is
    # currently named for consistency with schema.py

    # Choose how cells are keyed: by schema title or by JSON field path.
    if self.use_titles:
        sheet_key = sheet_key_title
    else:
        sheet_key = sheet_key_field

    # Shallow-copy so additions here don't leak back into the caller's dict.
    parent_id_fields = copy.copy(parent_id_fields) or OrderedDict()
    if flattened_dict is None:
        flattened_dict = {}
        top = True
    else:
        top = False

    # Optional row filtering: silently drop top-level objects that don't
    # match the configured filter field/value.
    if parent_name == '' and self.filter_field and self.filter_value:
        if self.filter_field not in json_dict:
            return
        if json_dict[self.filter_field] != self.filter_value:
            return

    if top_level_of_sub_sheet:
        # Only add the IDs for the top level of object in an array
        for k, v in parent_id_fields.items():
            if self.xml:
                # xmltodict wraps text content under the "#text" key
                flattened_dict[sheet_key(sheet, k)] = v['#text']
            else:
                flattened_dict[sheet_key(sheet, k)] = v

    # Propagate the root id and this object's id down to any sub-sheets.
    if self.root_id and self.root_id in json_dict:
        parent_id_fields[sheet_key(sheet,
                                   self.root_id)] = json_dict[self.root_id]

    if self.id_name in json_dict:
        parent_id_fields[sheet_key(
            sheet, parent_name + self.id_name)] = json_dict[self.id_name]

    for key, value in json_dict.items():
        if type(value) in BASIC_TYPES:
            if self.xml and key == '#text':
                # Handle the text output from xmltodict
                key = ''
                parent_name = parent_name.strip('/')
            flattened_dict[sheet_key(sheet, parent_name + key)] = value
        elif hasattr(value, 'items'):
            # Nested object: flatten into the same row with an extended path.
            self.parse_json_dict(value,
                                 sheet=sheet,
                                 json_key=key,
                                 parent_name=parent_name + key + '/',
                                 flattened_dict=flattened_dict,
                                 parent_id_fields=parent_id_fields)
        elif hasattr(value, '__iter__'):
            if all(type(x) in BASIC_TYPES for x in value):
                # Check for an array of BASIC types
                # TODO Make this check the schema
                # TODO Error if the any of the values contain the seperator
                # TODO Support doubly nested arrays
                flattened_dict[sheet_key(sheet,
                                         parent_name + key)] = ';'.join(
                                             map(six.text_type, value))
            else:
                # Array of objects: optionally roll single-element arrays up
                # into the main sheet, then emit every element to a sub-sheet.
                if self.rollup and parent_name == '':  # Rollup only currently possible to main sheet
                    if len(value) == 1:
                        # Single element: copy its basic-typed values into the
                        # parent row under ".../0/<field>" columns.
                        for k, v in value[0].items():
                            if self.use_titles and parent_name + key + '/0/' + k in self.schema_parser.main_sheet.titles:
                                if type(v) in BASIC_TYPES:
                                    flattened_dict[sheet_key_title(
                                        sheet,
                                        parent_name + key + '/0/' + k)] = v
                                else:
                                    raise ValueError(
                                        'Rolled up values must be basic types'
                                    )
                            elif not self.use_titles and parent_name + key + '/0/' + k in self.schema_parser.main_sheet:
                                if type(v) in BASIC_TYPES:
                                    flattened_dict[sheet_key(
                                        sheet,
                                        parent_name + key + '/0/' + k)] = v
                                else:
                                    raise ValueError(
                                        'Rolled up values must be basic types'
                                    )
                    elif len(value) > 1:
                        # Several elements: rollup is impossible, so write a
                        # warning cell instead of a value for each column.
                        for k in set(
                                sum((list(x.keys()) for x in value), [])):
                            warn(
                                'More than one value supplied for "{}". Could not provide rollup, so adding a warning to the relevant cell(s) in the spreadsheet.'
                                .format(parent_name + key))
                            if parent_name + key + '/0/' + k in self.schema_parser.main_sheet:
                                flattened_dict[sheet_key(
                                    sheet, parent_name + key + '/0/' + k
                                )] = 'WARNING: More than one value supplied, consult the relevant sub-sheet for the data.'

                sub_sheet_name = make_sub_sheet_name(parent_name, key)
                if sub_sheet_name not in self.sub_sheets:
                    self.sub_sheets[sub_sheet_name] = Sheet(
                        name=sub_sheet_name)

                # Each array element becomes its own row on the sub-sheet.
                for json_dict in value:
                    self.parse_json_dict(
                        json_dict,
                        sheet=self.sub_sheets[sub_sheet_name],
                        json_key=key,
                        parent_id_fields=parent_id_fields,
                        parent_name=parent_name + key + '/0/',
                        top_level_of_sub_sheet=True)
        else:
            raise ValueError('Unsupported type {}'.format(type(value)))

    # Only the outermost call appends the finished row.
    if top:
        sheet.lines.append(flattened_dict)
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure."""

    def __init__(self,
                 schema_filename=None,
                 root_schema_dict=None,
                 main_sheet_name='main',
                 rollup=False,
                 root_id='ocid',
                 use_titles=False):
        """
        Load the schema, either from a file/URL or from an already-parsed
        dict. Exactly one of schema_filename and root_schema_dict must be
        given.

        schema_filename - path or http(s) URL of a JSON schema.
        root_schema_dict - an already-loaded schema dict.
        main_sheet_name - name used for the top-level sheet.
        rollup - whether to honour "rollUp" annotations in the schema.
        root_id - name of the identifier field propagated to sub-sheets.
        use_titles - use schema titles (rather than field paths) as headers.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.main_sheet_name = main_sheet_name
        self.rollup = rollup
        self.root_id = root_id
        self.use_titles = use_titles
        self.title_lookup = TitleLookup()
        self.flattened = {}

        if root_schema_dict is None and schema_filename is None:
            raise ValueError(
                'One of schema_filename or root_schema_dict must be supplied')
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError(
                'Only one of schema_filename or root_schema_dict should be supplied'
            )
        if schema_filename:
            if schema_filename.startswith('http'):
                # Deferred import so requests is only needed for remote schemas.
                import requests
                r = requests.get(schema_filename)
                # jsonref resolves $ref references while loading; OrderedDict
                # preserves the schema's property order.
                self.root_schema_dict = jsonref.loads(
                    r.text, object_pairs_hook=OrderedDict)
            else:
                with codecs.open(schema_filename,
                                 encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(
                        schema_file, object_pairs_hook=OrderedDict)
        else:
            self.root_schema_dict = root_schema_dict

    def parse(self):
        """Walk the schema and populate main_sheet (and sub_sheets) columns."""
        fields = self.parse_schema_dict(self.main_sheet_name, '',
                                        self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn('Field {} does not have a title, skipping.'.format(
                        field))
                else:
                    self.main_sheet.append(title)
            else:
                self.main_sheet.append(field)

    def parse_schema_dict(self,
                          parent_name,
                          parent_path,
                          schema_dict,
                          parent_id_fields=None,
                          title_lookup=None):
        """
        Recursively walk a schema fragment, yielding (field, title) pairs for
        every flattenable property. Records each property's kind in
        self.flattened and creates sub-sheets for arrays of objects.
        """
        if parent_path:
            parent_path = parent_path + '/'
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup
        if 'properties' in schema_dict:
            # An "id" property at this level is propagated to descendant
            # sub-sheets so rows can be joined back to their parents.
            if 'id' in schema_dict['properties']:
                id_fields = parent_id_fields + [parent_name + '/id']
            else:
                id_fields = parent_id_fields
            for property_name, property_schema_dict in schema_dict[
                    'properties'].items():
                property_type_set = get_property_type_set(
                    property_schema_dict)

                title = property_schema_dict.get('title')
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                if 'object' in property_type_set:
                    # Nested object: flatten its children into this sheet.
                    self.flattened[parent_path + property_name] = "object"
                    for field, child_title in self.parse_schema_dict(
                            parent_name + '/' + property_name,
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title)):
                        yield (
                            property_name + '/' + field,
                            # TODO ambiguous use of "title"
                            (title + ':' + child_title
                             if title and child_title else None))
                elif 'array' in property_type_set:
                    self.flattened[parent_path + property_name] = "array"
                    type_set = get_property_type_set(
                        property_schema_dict['items'])
                    if 'string' in type_set:
                        # Arrays of basic types get a ":array" suffix.
                        self.flattened[parent_path +
                                       property_name] = "string_array"
                        yield property_name + ':array', title
                    elif 'array' in type_set:
                        # Doubly nested arrays: strings only.
                        self.flattened[parent_path +
                                       property_name] = "array_array"
                        if 'string' in get_property_type_set(
                                property_schema_dict['items']['items']):
                            yield property_name + ':array', title
                        else:
                            raise ValueError
                    elif 'object' in type_set:
                        # Array of objects: gets its own sub-sheet, named
                        # after the $ref it resolves to when available.
                        if title:
                            title_lookup[
                                title].property_name = property_name + '[]'
                        if hasattr(property_schema_dict['items'],
                                   '__reference__'):
                            sub_sheet_name = property_schema_dict[
                                'items'].__reference__['$ref'].split('/')[-1]
                        else:
                            sub_sheet_name = property_name
                        self.sub_sheet_mapping[parent_name + '/' +
                                               property_name] = sub_sheet_name
                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(
                                root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        sub_sheet.title_lookup = title_lookup.get(title)

                        # Inherited id columns come first.
                        for field in id_fields:
                            sub_sheet.add_field(field + ':' + property_name,
                                                id_field=True)
                        fields = self.parse_schema_dict(
                            parent_name + '/' + property_name + '[]',
                            parent_path + property_name,
                            property_schema_dict['items'],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title))
                        rolledUp = set()
                        for field, child_title in fields:
                            if self.use_titles:
                                if not child_title:
                                    warn(
                                        'Field {} does not have a title, skipping.'
                                        .format(field))
                                else:
                                    sub_sheet.add_field(child_title)
                            else:
                                sub_sheet.add_field(field)
                            # Track which declared rollUp fields exist.
                            if self.rollup and 'rollUp' in property_schema_dict and field in property_schema_dict[
                                    'rollUp']:
                                rolledUp.add(field)
                            yield property_name + '[]/' + field, (
                                title + ':' + child_title
                                if title and child_title else None)
                        # Check that all items in rollUp are in the schema
                        if self.rollup and 'rollUp' in property_schema_dict:
                            missedRollUp = set(
                                property_schema_dict['rollUp']) - rolledUp
                            if missedRollUp:
                                warn('{} in rollUp but not in schema'.format(
                                    ', '.join(missedRollUp)))
                    else:
                        raise ValueError
                elif 'string' in property_type_set:
                    self.flattened[parent_path + property_name] = "string"
                    yield property_name, title
                elif 'number' in property_type_set:
                    self.flattened[parent_path + property_name] = "number"
                    yield property_name + ':number', title
                elif 'integer' in property_type_set:
                    self.flattened[parent_path + property_name] = "integer"
                    yield property_name + ':integer', title
                elif 'boolean' in property_type_set:
                    self.flattened[parent_path + property_name] = "boolean"
                    yield property_name + ':boolean', title
                else:
                    warn(
                        'Unrecognised types {} for property "{}" with context "{}",'
                        'so this property has been ignored.'.format(
                            repr(property_type_set), property_name,
                            parent_name))
        else:
            warn('Skipping field "{}", because it has no properties.'.format(
                parent_name))
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure."""

    def __init__(self, schema_filename=None, root_schema_dict=None, rollup=False, root_id=None, use_titles=False,
                 disable_local_refs=False, truncation_length=3, exclude_deprecated_fields=False):
        """
        Load the schema, either from a file/URL or from an already-parsed
        dict. Exactly one of schema_filename and root_schema_dict must be
        given.

        schema_filename - path or http(s) URL of a JSON schema.
        root_schema_dict - an already-loaded schema dict.
        rollup - whether to honour "rollUp" annotations in the schema.
        root_id - name of the identifier field propagated to sub-sheets.
        use_titles - use schema titles (rather than field paths) as headers.
        disable_local_refs - forbid local-file $ref resolution while loading.
        truncation_length - max length of each path segment in generated
                            sub-sheet names.
        exclude_deprecated_fields - skip properties marked "deprecated".
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.rollup = rollup
        self.root_id = root_id
        self.use_titles = use_titles
        self.truncation_length = truncation_length
        self.title_lookup = TitleLookup()
        self.flattened = {}
        self.exclude_deprecated_fields = exclude_deprecated_fields

        if root_schema_dict is None and schema_filename is None:
            raise ValueError('One of schema_filename or root_schema_dict must be supplied')
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError('Only one of schema_filename or root_schema_dict should be supplied')
        if schema_filename:
            if schema_filename.startswith('http'):
                # Deferred import so requests is only needed for remote schemas.
                import requests
                r = requests.get(schema_filename)
                # jsonref resolves $ref references while loading; OrderedDict
                # preserves the schema's property order.
                self.root_schema_dict = jsonref.loads(r.text, object_pairs_hook=OrderedDict)
            else:
                if disable_local_refs:
                    with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict,
                                                             loader=JsonLoaderLocalRefsDisabled())
                else:
                    # Build a base URI so relative $refs resolve against the
                    # schema file's own location.
                    if sys.version_info[:2] > (3, 0):
                        base_uri = pathlib.Path(os.path.realpath(schema_filename)).as_uri()
                    else:
                        base_uri = urlparse.urljoin('file:', urllib.pathname2url(os.path.abspath(schema_filename)))
                    with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict,
                                                             base_uri=base_uri)
        else:
            self.root_schema_dict = root_schema_dict

    def parse(self):
        """Walk the schema and populate main_sheet columns (and titles)."""
        fields = self.parse_schema_dict('', self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn('Field {} does not have a title, skipping.'.format(field))
                else:
                    self.main_sheet.append(title)
                    self.main_sheet.titles[field] = title
            else:
                self.main_sheet.append(field)

    def parse_schema_dict(self, parent_path, schema_dict, parent_id_fields=None, title_lookup=None, parent_title=''):
        """
        Recursively walk a schema fragment, yielding (field, title) pairs for
        every flattenable property. Records each property's kind in
        self.flattened and creates sub-sheets for arrays of objects.

        parent_title is the colon-joined ancestor titles, or None once any
        ancestor lacked a title (which disables titles below it).
        """
        if parent_path:
            parent_path = parent_path + '/'
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup

        # An array whose items use oneOf: recurse into each object
        # alternative, flattening them all at this same level.
        if 'type' in schema_dict and schema_dict['type'] == 'array' \
                and 'items' in schema_dict and 'oneOf' in schema_dict['items']:
            for oneOf in schema_dict['items']['oneOf']:
                if 'type' in oneOf and oneOf['type'] == 'object':
                    for field, child_title in self.parse_schema_dict(
                            parent_path, oneOf,
                            parent_id_fields=parent_id_fields,
                            title_lookup=title_lookup,
                            parent_title=parent_title):
                        yield (
                            field,
                            child_title
                        )
        elif 'properties' in schema_dict:
            # An "id" property at this level is propagated to descendant
            # sub-sheets so rows can be joined back to their parents.
            if 'id' in schema_dict['properties']:
                if self.use_titles:
                    id_fields = parent_id_fields + [(parent_title if parent_title is not None else parent_path)+(schema_dict['properties']['id'].get('title') or 'id')]
                else:
                    id_fields = parent_id_fields + [parent_path+'id']
            else:
                id_fields = parent_id_fields
            for property_name, property_schema_dict in schema_dict['properties'].items():
                if self.exclude_deprecated_fields and property_schema_dict.get('deprecated'):
                    continue

                property_type_set = get_property_type_set(property_schema_dict)

                title = property_schema_dict.get('title')
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                if 'object' in property_type_set:
                    # Nested object: flatten its children into this sheet.
                    self.flattened[parent_path+property_name] = "object"
                    for field, child_title in self.parse_schema_dict(
                            parent_path+property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title+title+':' if parent_title is not None and title else None):
                        yield (
                            property_name+'/'+field,
                            # TODO ambiguous use of "title"
                            (title+':'+child_title if title and child_title else None)
                        )
                elif 'array' in property_type_set:
                    # Normalise away "/0/" array markers for the flattened key.
                    flattened_key = parent_path.replace('/0/', '/')+property_name
                    self.flattened[flattened_key] = "array"
                    type_set = get_property_type_set(property_schema_dict['items'])
                    if 'string' in type_set or not type_set:
                        self.flattened[flattened_key] = "string_array"
                        yield property_name, title
                    elif 'number' in type_set:
                        self.flattened[flattened_key] = "number_array"
                        yield property_name, title
                    elif 'array' in type_set:
                        # Doubly nested arrays: basic types only.
                        self.flattened[flattened_key] = "array_array"
                        nested_type_set = get_property_type_set(property_schema_dict['items']['items'])
                        if 'string' in nested_type_set or 'number' in nested_type_set:
                            yield property_name, title
                        else:
                            raise ValueError
                    elif 'object' in type_set:
                        # Array of objects: gets its own sub-sheet.
                        if title:
                            title_lookup[title].property_name = property_name
                        sub_sheet_name = make_sub_sheet_name(parent_path, property_name, truncation_length=self.truncation_length)
                        #self.sub_sheet_mapping[parent_name+'/'+property_name] = sub_sheet_name
                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        sub_sheet.title_lookup = title_lookup.get(title)

                        # Inherited id columns come first.
                        for field in id_fields:
                            sub_sheet.add_field(field, id_field=True)
                            sub_sheet.titles[title_lookup.lookup_header(field)] = field
                        fields = self.parse_schema_dict(
                            parent_path+property_name+'/0',
                            property_schema_dict['items'],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title+title+':' if parent_title is not None and title else None)

                        rolledUp = set()

                        for field, child_title in fields:
                            full_path = parent_path+property_name+'/0/'+field
                            if self.use_titles:
                                if not child_title or parent_title is None:
                                    warn('Field {}{}/0/{} is missing a title, skipping.'.format(parent_path, property_name, field))
                                elif not title:
                                    warn('Field {}{} does not have a title, skipping it and all its children.'.format(parent_path, property_name))
                                else:
                                    # This code only works for arrays that are at 0 or 1 layer of nesting
                                    full_title = parent_title+title+':'+child_title
                                    sub_sheet.add_field(full_title)
                                    sub_sheet.titles[full_path] = full_title
                            else:
                                sub_sheet.add_field(full_path)
                            # Track which declared rollUp fields actually exist.
                            if self.rollup and 'rollUp' in property_schema_dict and field in property_schema_dict['rollUp']:
                                rolledUp.add(field)
                            yield property_name+'/0/'+field, (title+':'+child_title if title and child_title else None)

                        # Check that all items in rollUp are in the schema
                        if self.rollup and 'rollUp' in property_schema_dict:
                            missedRollUp = set(property_schema_dict['rollUp']) - rolledUp
                            if missedRollUp:
                                warn('{} in rollUp but not in schema'.format(', '.join(missedRollUp)))
                    else:
                        raise ValueError('Unknown type_set: {}, did you forget to explicity set the "type" key on "items"?'.format(type_set))
                elif 'string' in property_type_set or not property_type_set:
                    # Untyped properties are treated as strings.
                    self.flattened[parent_path.replace('/0/', '/')+property_name] = "string"
                    yield property_name, title
                elif 'number' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/')+property_name] = "number"
                    yield property_name, title
                elif 'integer' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/')+property_name] = "integer"
                    yield property_name, title
                elif 'boolean' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/')+property_name] = "boolean"
                    yield property_name, title
                else:
                    warn('Unrecognised types {} for property "{}" with context "{}",'
                         'so this property has been ignored.'.format(
                             repr(property_type_set), property_name, parent_path))
        else:
            warn('Skipping field "{}", because it has no properties.'.format(parent_path))
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure.

    Walks the schema's ``properties`` tree and, for every leaf field,
    records a flattened slash-separated path (and, when ``use_titles``
    is set, a human-readable title).  Scalar fields end up on
    ``self.main_sheet``; each array-of-objects property gets its own
    ``Sheet`` in ``self.sub_sheets``.  ``self.flattened`` maps each
    flattened path to a type label ("string", "number", "array",
    "string_array", ...), with ``/0/`` array markers collapsed.
    """

    def __init__(
        self,
        schema_filename=None,
        root_schema_dict=None,
        rollup=False,
        root_id=None,
        use_titles=False,
        disable_local_refs=False,
        truncation_length=3,
        exclude_deprecated_fields=False,
    ):
        """Load the root schema from a file/URL or accept it as a dict.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        supplied; otherwise a ``ValueError`` is raised.

        :param schema_filename: path or http(s) URL of the JSON schema file.
        :param root_schema_dict: the schema as an already-parsed mapping.
        :param rollup: when true, honour ``rollUp`` lists found in the
            schema (the flag is stored as ``self.do_rollup``; the set of
            rolled-up full paths accumulates in ``self.rollup``).
        :param root_id: id field name propagated to every sub-sheet.
        :param use_titles: emit human-readable titles instead of paths.
        :param disable_local_refs: forbid resolution of local file ``$ref``s
            (uses ``JsonLoaderLocalRefsDisabled`` — defined elsewhere).
        :param truncation_length: passed to ``make_sub_sheet_name`` when
            shortening sub-sheet names.
        :param exclude_deprecated_fields: skip properties flagged
            ``deprecated`` (directly or on their ``__reference__``).
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        # NOTE: the boolean "rollup" argument is kept as do_rollup, while
        # self.rollup is repurposed as the set of rolled-up field paths.
        self.do_rollup = rollup
        self.rollup = set()
        self.root_id = root_id
        self.use_titles = use_titles
        # (parent_path, property_name) -> generated sub-sheet title.
        self.sub_sheet_titles = {}
        self.truncation_length = truncation_length
        self.title_lookup = TitleLookup()
        self.flattened = {}
        self.exclude_deprecated_fields = exclude_deprecated_fields

        if root_schema_dict is None and schema_filename is None:
            raise ValueError(
                _("One of schema_filename or root_schema_dict must be supplied"
                  ))
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError(
                _("Only one of schema_filename or root_schema_dict should be supplied"
                  ))
        if schema_filename:
            if schema_filename.startswith("http"):
                import requests

                r = requests.get(schema_filename)
                # OrderedDict keeps schema property order, which drives
                # column ordering in the output sheets.
                self.root_schema_dict = jsonref.loads(
                    r.text, object_pairs_hook=OrderedDict)
            else:
                if disable_local_refs:
                    with codecs.open(schema_filename,
                                     encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            loader=JsonLoaderLocalRefsDisabled(),
                        )
                else:
                    # Build a file:// base URI so relative $refs resolve
                    # next to the schema file.  The Python 2 branch uses
                    # urlparse/urllib because pathlib is unavailable there.
                    if sys.version_info[:2] > (3, 0):
                        base_uri = pathlib.Path(
                            os.path.realpath(schema_filename)).as_uri()
                    else:
                        base_uri = urlparse.urljoin(
                            "file:",
                            urllib.pathname2url(
                                os.path.abspath(schema_filename)),
                        )
                    with codecs.open(schema_filename,
                                     encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            base_uri=base_uri,
                        )
        else:
            self.root_schema_dict = root_schema_dict

    def parse(self):
        """Populate the main sheet from the root schema.

        Fields without a title are skipped (with a warning) when
        ``use_titles`` is set; otherwise the raw path is appended.
        """
        fields = self.parse_schema_dict("", self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn(
                        _("Field {} does not have a title, skipping.").format(
                            field))
                else:
                    self.main_sheet.append(title)
                    self.main_sheet.titles[field] = title
            else:
                self.main_sheet.append(field)

    def parse_schema_dict(
        self,
        parent_path,
        schema_dict,
        parent_id_fields=None,
        title_lookup=None,
        parent_title="",
    ):
        """Recursively yield ``(field_path, title)`` pairs for *schema_dict*.

        Side effects: fills ``self.flattened`` with type labels, creates
        sub-sheets for arrays of objects, and populates *title_lookup*
        (a ``TitleLookup`` node; the root instance lives on
        ``self.title_lookup``).  Yield order determines column order.

        :param parent_path: slash-terminated path prefix of this schema.
        :param parent_id_fields: id field paths inherited from ancestors,
            copied onto each sub-sheet as id columns.
        :param parent_title: colon-separated title prefix, or None once an
            ancestor lacked a title (which disables titles below it).
        """
        if parent_path:
            parent_path = parent_path + "/"
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup

        # An array whose items use oneOf: recurse into each object
        # alternative at the same path level.
        if ("type" in schema_dict and schema_dict["type"] == "array"
                and "items" in schema_dict and "oneOf" in schema_dict["items"]):
            for oneOf in schema_dict["items"]["oneOf"]:
                if "type" in oneOf and oneOf["type"] == "object":
                    for field, child_title in self.parse_schema_dict(
                            parent_path,
                            oneOf,
                            parent_id_fields=parent_id_fields,
                            title_lookup=title_lookup,
                            parent_title=parent_title,
                    ):
                        yield (field, child_title)
        elif "properties" in schema_dict:
            # Extend the inherited id columns with this level's "id", using
            # its title when titles are in play.
            if "id" in schema_dict["properties"]:
                if self.use_titles:
                    id_fields = parent_id_fields + [
                        (parent_title if parent_title is not None else
                         parent_path) +
                        (schema_dict["properties"]["id"].get("title") or "id")
                    ]
                else:
                    id_fields = parent_id_fields + [parent_path + "id"]
            else:
                id_fields = parent_id_fields

            for property_name, property_schema_dict in schema_dict[
                    "properties"].items():
                # Deprecation may be flagged on the property itself or on
                # the $ref it was resolved from.
                if self.exclude_deprecated_fields and property_schema_dict.get(
                        "deprecated"):
                    continue
                if (self.exclude_deprecated_fields
                        and hasattr(property_schema_dict, "__reference__")
                        and property_schema_dict.__reference__.get("deprecated")):
                    continue

                property_type_set = get_property_type_set(property_schema_dict)

                # A title on the referencing side ($ref) wins over the
                # referenced schema's own title.
                if (hasattr(property_schema_dict, "__reference__")
                        and "title" in property_schema_dict.__reference__):
                    title = property_schema_dict.__reference__["title"]
                else:
                    title = property_schema_dict.get("title")
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                if "object" in property_type_set:
                    self.flattened[parent_path + property_name] = "object"
                    for field, child_title in self.parse_schema_dict(
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title + title + ":"
                            if parent_title is not None and title else None,
                    ):
                        yield (
                            property_name + "/" + field,
                            # TODO ambiguous use of "title"
                            (title + ":" + child_title
                             if title and child_title else None),
                        )
                elif "array" in property_type_set:
                    # Collapse array markers so the flattened key matches
                    # the path callers use.
                    flattened_key = parent_path.replace("/0/",
                                                        "/") + property_name
                    self.flattened[flattened_key] = "array"
                    type_set = get_property_type_set(
                        property_schema_dict["items"])
                    if "string" in type_set or not type_set:
                        # Untyped items are treated as strings.
                        self.flattened[flattened_key] = "string_array"
                        yield property_name, title
                    elif "number" in type_set:
                        self.flattened[flattened_key] = "number_array"
                        yield property_name, title
                    elif "array" in type_set:
                        self.flattened[flattened_key] = "array_array"
                        nested_type_set = get_property_type_set(
                            property_schema_dict["items"]["items"])
                        if "string" in nested_type_set or "number" in nested_type_set:
                            yield property_name, title
                        else:
                            # Arrays of arrays of anything else are
                            # unsupported.
                            raise ValueError
                    elif "object" in type_set:
                        # Array of objects: becomes its own sub-sheet.
                        if title:
                            title_lookup[title].property_name = property_name
                        if self.use_titles and parent_title is not None:
                            sub_sheet_name = make_sub_sheet_name(
                                parent_title,
                                title or property_name,
                                truncation_length=self.truncation_length,
                                path_separator=":",
                            )
                            self.sub_sheet_titles[(
                                parent_path,
                                property_name,
                            )] = sub_sheet_name
                        else:
                            sub_sheet_name = make_sub_sheet_name(
                                parent_path,
                                property_name,
                                truncation_length=self.truncation_length,
                            )
                        # self.sub_sheet_mapping[parent_name+'/'+property_name] = sub_sheet_name
                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(
                                root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        sub_sheet.title_lookup = title_lookup.get(title)

                        # Inherited id columns come first on every sub-sheet.
                        for field in id_fields:
                            sub_sheet.add_field(field, id_field=True)
                            sub_sheet.titles[title_lookup.lookup_header(
                                field)] = field
                        fields = self.parse_schema_dict(
                            parent_path + property_name + "/0",
                            property_schema_dict["items"],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title + title + ":"
                            if parent_title is not None and title else None,
                        )

                        rollup_fields = set()
                        for field, child_title in fields:
                            full_path = parent_path + property_name + "/0/" + field
                            if self.use_titles:
                                if not child_title or parent_title is None:
                                    warn(
                                        _("Field {}{}/0/{} is missing a title, skipping."
                                          ).format(parent_path, property_name,
                                                   field))
                                elif not title:
                                    warn(
                                        _("Field {}{} does not have a title, skipping it and all its children."
                                          ).format(parent_path, property_name))
                                else:
                                    # This code only works for arrays that are at 0 or 1 layer of nesting
                                    full_title = (parent_title + title + ":" +
                                                  child_title)
                                    sub_sheet.add_field(full_title)
                                    sub_sheet.titles[full_path] = full_title
                            else:
                                sub_sheet.add_field(full_path)
                            if (self.do_rollup
                                    and "rollUp" in property_schema_dict
                                    and field in property_schema_dict["rollUp"]):
                                rollup_fields.add(field)
                                self.rollup.add(full_path)
                            yield property_name + "/0/" + field, (
                                title + ":" + child_title
                                if title and child_title else None)

                        # Check that all items in rollUp are in the schema
                        if self.do_rollup and "rollUp" in property_schema_dict:
                            missedRollUp = (
                                set(property_schema_dict["rollUp"]) -
                                rollup_fields)
                            if missedRollUp:
                                warn("{} in rollUp but not in schema".format(
                                    ", ".join(missedRollUp)))
                    else:
                        raise ValueError(
                            _('Unknown type_set: {}, did you forget to explicity set the "type" key on "items"?'
                              ).format(type_set))
                elif "string" in property_type_set or not property_type_set:
                    # We only check for date here, because its the only format
                    # for which we need to specially transform the input
                    if property_schema_dict.get("format") == "date":
                        self.flattened[parent_path.replace("/0/", "/") +
                                       property_name] = "date"
                    else:
                        self.flattened[parent_path.replace("/0/", "/") +
                                       property_name] = "string"
                    yield property_name, title
                elif "number" in property_type_set:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "number"
                    yield property_name, title
                elif "integer" in property_type_set:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "integer"
                    yield property_name, title
                elif "boolean" in property_type_set:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "boolean"
                    yield property_name, title
                else:
                    warn(
                        _('Unrecognised types {} for property "{}" with context "{}",'
                          "so this property has been ignored.").format(
                              repr(property_type_set), property_name,
                              parent_path))
        else:
            warn(
                _('Skipping field "{}", because it has no properties.').format(
                    parent_path))
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure.

    Older variant of the parser: paths are rooted at ``main_sheet_name``,
    array fields carry a ``[]`` path marker, and scalar fields get a
    ``:type`` suffix (e.g. ``price:number``).  Scalar fields land on
    ``self.main_sheet``; each array-of-objects property gets its own
    ``Sheet`` in ``self.sub_sheets``.
    """

    def __init__(
        self,
        schema_filename=None,
        root_schema_dict=None,
        main_sheet_name="main",
        rollup=False,
        root_id="ocid",
        use_titles=False,
    ):
        """Load the root schema from a file/URL or accept it as a dict.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        supplied; otherwise a ``ValueError`` is raised.

        :param schema_filename: path or http(s) URL of the JSON schema file.
        :param root_schema_dict: the schema as an already-parsed mapping.
        :param main_sheet_name: name used as the root of all field paths.
        :param rollup: when true, honour ``rollUp`` lists in the schema.
        :param root_id: id field name propagated to every sub-sheet.
        :param use_titles: emit human-readable titles instead of paths.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        # flattened path -> sub-sheet name, for arrays of objects.
        self.sub_sheet_mapping = {}
        self.main_sheet_name = main_sheet_name
        self.rollup = rollup
        self.root_id = root_id
        self.use_titles = use_titles
        if root_schema_dict is None and schema_filename is None:
            raise ValueError("One of schema_filename or root_schema_dict must be supplied")
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError("Only one of schema_filename or root_schema_dict should be supplied")
        if schema_filename:
            if schema_filename.startswith("http"):
                import requests

                r = requests.get(schema_filename)
                # OrderedDict keeps schema property order, which drives
                # column ordering in the output sheets.
                self.root_schema_dict = jsonref.loads(r.text, object_pairs_hook=OrderedDict)
            else:
                with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict)
        else:
            self.root_schema_dict = root_schema_dict

    def parse(self):
        """Populate the main sheet from the root schema.

        Fields without a title are skipped (with a warning) when
        ``use_titles`` is set; titles are always recorded in
        ``main_sheet.titles`` (keyed by title, unlike the newer parser).
        """
        fields = self.parse_schema_dict(self.main_sheet_name, self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn("Field {} does not have a title, skipping.".format(field))
                else:
                    self.main_sheet.append(title)
            else:
                self.main_sheet.append(field)
            if title:
                self.main_sheet.titles[title] = field

    def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
        """Recursively yield ``(field_path, title)`` pairs for *schema_dict*.

        Side effects: creates sub-sheets for arrays of objects and fills
        ``self.sub_sheet_mapping``.  Yield order determines column order.

        :param parent_name: path prefix of this schema level (no trailing
            separator; segments are joined with "/").
        :param parent_id_fields: id field paths inherited from ancestors,
            copied onto each sub-sheet as id columns.
        """
        parent_id_fields = parent_id_fields or []
        if "properties" in schema_dict:
            # Extend the inherited id columns with this level's "id".
            if "id" in schema_dict["properties"]:
                id_fields = parent_id_fields + [parent_name + "/id"]
            else:
                id_fields = parent_id_fields
            for property_name, property_schema_dict in schema_dict["properties"].items():
                property_type_set = get_property_type_set(property_schema_dict)
                title = property_schema_dict.get("title")
                if "object" in property_type_set:
                    for field, child_title in self.parse_schema_dict(
                        parent_name + "/" + property_name,
                        property_schema_dict,
                        parent_id_fields=id_fields,
                    ):
                        yield property_name + "/" + field, (
                            title + ":" + child_title if title and child_title else None
                        )  # TODO ambiguous use of "title"
                elif "array" in property_type_set:
                    type_set = get_property_type_set(property_schema_dict["items"])
                    if "string" in type_set:
                        yield property_name + ":array", title
                    elif "array" in type_set:
                        if "string" in get_property_type_set(property_schema_dict["items"]["items"]):
                            yield property_name + ":array", title
                        else:
                            # Arrays of arrays of non-strings are unsupported.
                            raise ValueError
                    elif "object" in type_set:
                        # Array of objects: becomes its own sub-sheet, named
                        # after the $ref it was resolved from when available.
                        if hasattr(property_schema_dict["items"], "__reference__"):
                            sub_sheet_name = property_schema_dict["items"].__reference__["$ref"].split("/")[-1]
                        else:
                            sub_sheet_name = property_name
                        self.sub_sheet_mapping[parent_name + "/" + property_name] = sub_sheet_name
                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        # Inherited id columns come first on every sub-sheet.
                        for field in id_fields:
                            sub_sheet.add_field(field + ":" + property_name, id_field=True)
                        fields = self.parse_schema_dict(
                            parent_name + "/" + property_name + "[]",
                            property_schema_dict["items"],
                            parent_id_fields=id_fields,
                        )
                        rolledUp = set()
                        for field, child_title in fields:
                            if self.use_titles:
                                if not child_title:
                                    warn("Field {} does not have a title, skipping.".format(field))
                                else:
                                    sub_sheet.add_field(child_title)
                            else:
                                sub_sheet.add_field(field)
                            if child_title:
                                self.sub_sheets[sub_sheet_name].titles[child_title] = field
                            if (
                                self.rollup
                                and "rollUp" in property_schema_dict
                                and field in property_schema_dict["rollUp"]
                            ):
                                rolledUp.add(field)
                            yield property_name + "[]/" + field, (
                                title + ":" + child_title if title and child_title else None
                            )
                        # Check that all items in rollUp are in the schema
                        if self.rollup and "rollUp" in property_schema_dict:
                            missedRollUp = set(property_schema_dict["rollUp"]) - rolledUp
                            if missedRollUp:
                                warn("{} in rollUp but not in schema".format(", ".join(missedRollUp)))
                    else:
                        # items present but with an unrecognised type set.
                        raise ValueError
                elif "string" in property_type_set:
                    yield property_name, title
                elif "number" in property_type_set:
                    yield property_name + ":number", title
                elif "integer" in property_type_set:
                    yield property_name + ":integer", title
                elif "boolean" in property_type_set:
                    yield property_name + ":boolean", title
                else:
                    warn(
                        'Unrecognised types {} for property "{}" with context "{}",'
                        "so this property has been ignored.".format(repr(property_type_set), property_name, parent_name)
                    )
        else:
            warn('Skipping field "{}", because it has no properties.'.format(parent_name))