Esempio n. 1
0
def test_empty_lines(tmpdir):
    """With no data lines, each written sheet contains only its header row.

    Writes every registered output format, then inspects the XLSX and CSV
    results.
    """
    child_sheet = Sheet(root_id='ocid')
    child_sheet.add_field('c')
    parser = MockParser(['a', 'd'], {'b': child_sheet})
    parser.main_sheet.lines = []

    # One output per registered format, all sharing the "release" base name.
    for fmt, writer_cls in output.FORMATS.items():
        target = os.path.join(
            tmpdir.strpath, 'release' + output.FORMATS_SUFFIX[fmt])
        writer = writer_cls(
            parser=parser, main_sheet_name='release', output_name=target)
        writer.write_sheets()

    # XLSX: both sheets exist and hold exactly one (header) row.
    workbook = openpyxl.load_workbook(tmpdir.join('release.xlsx').strpath)
    assert workbook.sheetnames == ['release', 'b']
    for sheet_name, header in (('release', ['a', 'd']), ('b', ['ocid', 'c'])):
        sheet_rows = list(workbook[sheet_name].rows)
        assert len(sheet_rows) == 1
        assert [cell.value for cell in sheet_rows[0]] == header

    # CSV: one file per sheet inside a "release" directory, header line only.
    csv_dir = tmpdir.join('release')
    assert set(csv_dir.listdir()) == {
        csv_dir.join('release.csv'),
        csv_dir.join('b.csv'),
    }
    assert csv_dir.join('release.csv').read().strip('\r\n') == 'a,d'
    assert csv_dir.join('b.csv').read().strip('\r\n') == 'ocid,c'
Esempio n. 2
0
    def __init__(self,
                 schema_filename=None,
                 root_schema_dict=None,
                 rollup=False,
                 root_id=None,
                 use_titles=False,
                 disable_local_refs=False,
                 truncation_length=3,
                 exclude_deprecated_fields=False):
        """Initialise parser state and load the root schema.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        supplied.

        :param schema_filename: path of a local schema file, or an
            ``http://`` / ``https://`` URL to fetch the schema from.
        :param root_schema_dict: an already-loaded schema, stored as-is.
        :param rollup: stored as ``self.do_rollup``; ``self.rollup`` itself
            starts out as an empty set.
        :param root_id: stored as ``self.root_id``.
        :param use_titles: stored as ``self.use_titles``.
        :param disable_local_refs: when true, a local schema file is loaded
            with a ``JsonLoaderLocalRefsDisabled`` loader instead of being
            given a ``file:`` base URI.
        :param truncation_length: stored as ``self.truncation_length``.
        :param exclude_deprecated_fields: stored as
            ``self.exclude_deprecated_fields``.
        :raises ValueError: if neither or both of ``schema_filename`` and
            ``root_schema_dict`` are supplied.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.do_rollup = rollup
        self.rollup = set()
        self.root_id = root_id
        self.use_titles = use_titles
        self.truncation_length = truncation_length
        self.title_lookup = TitleLookup()
        self.flattened = {}
        self.exclude_deprecated_fields = exclude_deprecated_fields

        if root_schema_dict is None and schema_filename is None:
            raise ValueError(
                'One of schema_filename or root_schema_dict must be supplied')
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError(
                'Only one of schema_filename or root_schema_dict should be supplied'
            )
        if schema_filename:
            # Match the URL scheme explicitly: a bare 'http' prefix would also
            # capture local files such as "http_schema.json".
            if schema_filename.startswith(('http://', 'https://')):
                import requests
                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(
                    r.text, object_pairs_hook=OrderedDict)
            else:
                if disable_local_refs:
                    with codecs.open(schema_filename,
                                     encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            loader=JsonLoaderLocalRefsDisabled())
                else:
                    # Build a file: URI for the schema so jsonref has a base
                    # to resolve references against.
                    if sys.version_info[:2] > (3, 0):
                        base_uri = pathlib.Path(
                            os.path.realpath(schema_filename)).as_uri()
                    else:
                        base_uri = urlparse.urljoin(
                            'file:',
                            urllib.pathname2url(
                                os.path.abspath(schema_filename)))
                    with codecs.open(schema_filename,
                                     encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            base_uri=base_uri)

        else:
            self.root_schema_dict = root_schema_dict
Esempio n. 3
0
def test_populated_lines(tmpdir):
    """Write a main sheet and a sub-sheet with two data lines each and check the output.

    Every registered output format is written; the XLSX and CSV results are
    then inspected.
    """
    subsheet = Sheet(root_id='ocid')
    subsheet.add_field('c')
    parser = MockParser(['a'], {})
    parser.main_sheet.lines = [{'a': 'cell1'}, {'a': 'cell2'}]
    subsheet.lines = [{'c': 'cell3'}, {'c': 'cell4'}]
    parser.sub_sheets['b'] = subsheet
    for format_name, spreadsheet_output_class in output.FORMATS.items():
        spreadsheet_output = spreadsheet_output_class(
            parser=parser,
            main_sheet_name='release',
            output_name=os.path.join(tmpdir.strpath, 'release'+output.FORMATS_SUFFIX[format_name]))
        spreadsheet_output.write_sheets()

    # Check XLSX
    wb = openpyxl.load_workbook(tmpdir.join('release.xlsx').strpath)
    # `sheetnames` is the supported replacement for the deprecated
    # get_sheet_names().
    assert wb.sheetnames == ['release', 'b']
    # Worksheet.rows can be a generator, which supports neither len() nor
    # indexing, so materialise it first.
    rows = list(wb['release'].rows)
    assert len(rows) == 3
    assert [x.value for x in rows[0]] == ['a']
    assert [x.value for x in rows[1]] == ['cell1']
    assert [x.value for x in rows[2]] == ['cell2']
    b_rows = list(wb['b'].rows)
    assert len(b_rows) == 3
    assert [x.value for x in b_rows[0]] == ['ocid', 'c']
    assert [x.value for x in b_rows[1]] == [None, 'cell3']
    assert [x.value for x in b_rows[2]] == [None, 'cell4']

    # Check CSV
    assert set(tmpdir.join('release').listdir()) == set([
        tmpdir.join('release').join('release.csv'),
        tmpdir.join('release').join('b.csv')
    ])
    # Carriage returns are removed so the comparison does not depend on the
    # line terminator used in the files.
    assert tmpdir.join('release', 'release.csv').read().strip('\r\n').replace('\r', '') == 'a\ncell1\ncell2'
    assert tmpdir.join('release', 'b.csv').read().strip('\r\n').replace('\r', '') == 'ocid,c\n,cell3\n,cell4'
Esempio n. 4
0
def test_empty_lines(tmpdir):
    """With no data lines, each written sheet contains only its header row.

    Every registered output format is written; the XLSX and CSV results are
    then inspected.
    """
    subsheet = Sheet(root_id='ocid')
    subsheet.add_field('c')
    parser = MockParser(['a', 'd'], {'b': subsheet})
    parser.main_sheet.lines = []
    for format_name, spreadsheet_output_class in output.FORMATS.items():
        spreadsheet_output = spreadsheet_output_class(
            parser=parser,
            main_sheet_name='release',
            output_name=os.path.join(tmpdir.strpath, 'release'+output.FORMATS_SUFFIX[format_name]))
        spreadsheet_output.write_sheets()

    # Check XLSX
    wb = openpyxl.load_workbook(tmpdir.join('release.xlsx').strpath)
    # `sheetnames` is the supported replacement for the deprecated
    # get_sheet_names().
    assert wb.sheetnames == ['release', 'b']
    # Worksheet.rows can be a generator, which supports neither len() nor
    # indexing, so materialise it first.
    rows = list(wb['release'].rows)
    assert len(rows) == 1
    assert [x.value for x in rows[0]] == ['a', 'd']
    b_rows = list(wb['b'].rows)
    assert len(b_rows) == 1
    assert [x.value for x in b_rows[0]] == ['ocid', 'c']

    # Check CSV
    assert set(tmpdir.join('release').listdir()) == set([
        tmpdir.join('release').join('release.csv'),
        tmpdir.join('release').join('b.csv')
    ])
    assert tmpdir.join('release', 'release.csv').read().strip('\r\n') == 'a,d'
    assert tmpdir.join('release', 'b.csv').read().strip('\r\n') == 'ocid,c'
Esempio n. 5
0
def test_populated_lines(tmpdir):
    """Write two data lines per sheet and verify the XLSX, CSV and ODS output."""
    child_sheet = Sheet(root_id="ocid")
    child_sheet.add_field("c")
    parser = MockParser(["a"], {})
    parser.main_sheet.lines = [{"a": "cell1"}, {"a": "cell2"}]
    child_sheet.lines = [{"c": "cell3"}, {"c": "cell4"}]
    parser.sub_sheets["b"] = child_sheet

    # One output per registered format, all sharing the "release" base name.
    for fmt, writer_cls in output.FORMATS.items():
        target = os.path.join(tmpdir.strpath, "release" + output.FORMATS_SUFFIX[fmt])
        writer = writer_cls(
            parser=parser, main_sheet_name="release", output_name=target
        )
        writer.write_sheets()

    # Expected cell values per sheet, header row first.
    expected_sheets = (
        ("release", [["a"], ["cell1"], ["cell2"]]),
        ("b", [["ocid", "c"], [None, "cell3"], [None, "cell4"]]),
    )

    # XLSX
    workbook = openpyxl.load_workbook(tmpdir.join("release.xlsx").strpath)
    assert workbook.sheetnames == ["release", "b"]
    for sheet_name, expected_rows in expected_sheets:
        sheet_rows = list(workbook[sheet_name].rows)
        assert len(sheet_rows) == 3
        for row, expected_row in zip(sheet_rows, expected_rows):
            assert [cell.value for cell in row] == expected_row

    # CSV: one file per sheet inside a "release" directory.
    csv_dir = tmpdir.join("release")
    assert set(csv_dir.listdir()) == {
        csv_dir.join("release.csv"),
        csv_dir.join("b.csv"),
    }
    # Carriage returns are removed so the comparison does not depend on the
    # line terminator used in the files.
    main_csv = csv_dir.join("release.csv").read().strip("\r\n").replace("\r", "")
    assert main_csv == "a\ncell1\ncell2"
    sub_csv = csv_dir.join("b.csv").read().strip("\r\n").replace("\r", "")
    assert sub_csv == "ocid,c\n,cell3\n,cell4"

    # ODS - the original author marked this check as "currently broken".
    ods_book = ODSReader(tmpdir.join("release.ods").strpath)
    for sheet_name, expected_rows in expected_sheets:
        ods_sheet_rows = ods_book.getSheet(sheet_name)
        assert len(ods_sheet_rows) == 3
        for index, expected_row in enumerate(expected_rows):
            assert list(ods_sheet_rows[index]) == expected_row
Esempio n. 6
0
    def __init__(self,
                 json_filename=None,
                 root_json_dict=None,
                 schema_parser=None,
                 root_list_path=None,
                 root_id='ocid',
                 use_titles=False,
                 xml=False,
                 id_name='id',
                 filter_field=None,
                 filter_value=None):
        """Initialise parser state and load the input document.

        Exactly one of ``json_filename`` and ``root_json_dict`` must end up
        supplied.  When ``xml`` is true, ``json_filename`` is read as XML and
        converted into ``root_json_dict`` before that check runs.

        :param json_filename: path of the JSON file (or XML file when ``xml``
            is true) to load.
        :param root_json_dict: an already-loaded document, stored as-is.
        :param schema_parser: optional schema parser whose ``main_sheet``,
            ``sub_sheets`` and ``rollup`` are reused; without one,
            ``self.rollup`` is ``False``.
        :param root_list_path: stored as ``self.root_list_path``; also passed
            to ``xmltodict.parse`` as the only ``force_list`` entry.
        :param root_id: stored as ``self.root_id``.
        :param use_titles: stored as ``self.use_titles``.
        :param xml: read ``json_filename`` as XML instead of JSON.
        :param id_name: stored as ``self.id_name``.
        :param filter_field: stored as ``self.filter_field``.
        :param filter_value: stored as ``self.filter_value``.
        :raises ValueError: if neither or both of ``json_filename`` and
            ``root_json_dict`` are supplied.
        :raises BadlyFormedJSONError: if the JSON file cannot be decoded.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.root_list_path = root_list_path
        self.root_id = root_id
        self.use_titles = use_titles
        self.id_name = id_name
        self.xml = xml
        self.filter_field = filter_field
        self.filter_value = filter_value
        if schema_parser:
            self.main_sheet = schema_parser.main_sheet
            self.sub_sheets = schema_parser.sub_sheets
            # Rollup is pulled from the schema_parser, as rollup is only possible if a schema parser is specified
            self.rollup = schema_parser.rollup
            self.schema_parser = schema_parser
        else:
            self.rollup = False

        if self.xml:
            with codecs.open(json_filename, 'rb') as xml_file:
                top_dict = xmltodict.parse(
                    xml_file,
                    force_list=(root_list_path, ),
                    force_cdata=True,
                )
                # AFAICT, this should be true for *all* XML files
                assert len(top_dict) == 1
                root_json_dict = list(top_dict.values())[0]
            # The XML has been converted to a dict, so clear the filename and
            # let the checks below treat it like a supplied root_json_dict.
            json_filename = None

        if json_filename is None and root_json_dict is None:
            raise ValueError(
                'Either json_filename or root_json_dict must be supplied')

        if json_filename is not None and root_json_dict is not None:
            raise ValueError(
                'Only one of json_filename or root_json_dict should be supplied')

        if json_filename:
            with codecs.open(json_filename, encoding='utf-8') as json_file:
                try:
                    # Floats are parsed as Decimal and key order is kept.
                    self.root_json_dict = json.load(
                        json_file,
                        object_pairs_hook=OrderedDict,
                        parse_float=Decimal)
                except ValueError as err:
                    raise BadlyFormedJSONError(*err.args)
        else:
            self.root_json_dict = root_json_dict
Esempio n. 7
0
    def __init__(
        self,
        schema_filename=None,
        root_schema_dict=None,
        main_sheet_name="main",
        rollup=False,
        root_id="ocid",
        use_titles=False,
    ):
        """Initialise parser state and load the root schema.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        supplied.

        :param schema_filename: path of a local schema file, or an
            ``http://`` / ``https://`` URL to fetch the schema from.
        :param root_schema_dict: an already-loaded schema, stored as-is.
        :param main_sheet_name: stored as ``self.main_sheet_name``.
        :param rollup: stored as ``self.rollup``.
        :param root_id: stored as ``self.root_id``.
        :param use_titles: stored as ``self.use_titles``.
        :raises ValueError: if neither or both of ``schema_filename`` and
            ``root_schema_dict`` are supplied.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.main_sheet_name = main_sheet_name
        self.rollup = rollup
        self.root_id = root_id
        self.use_titles = use_titles

        if root_schema_dict is None and schema_filename is None:
            raise ValueError("One of schema_filename or root_schema_dict must be supplied")
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError("Only one of schema_filename or root_schema_dict should be supplied")
        if schema_filename:
            # Match the URL scheme explicitly: a bare "http" prefix would also
            # capture local files such as "http_schema.json".
            if schema_filename.startswith(("http://", "https://")):
                import requests

                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(r.text, object_pairs_hook=OrderedDict)
            else:
                with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict)
        else:
            self.root_schema_dict = root_schema_dict
Esempio n. 8
0
def test_empty_lines(tmpdir):
    """With no data lines, the XLSX, CSV and ODS outputs hold header rows only."""
    child_sheet = Sheet(root_id="ocid")
    child_sheet.add_field("c")
    parser = MockParser(["a", "d"], {"b": child_sheet})
    parser.main_sheet.lines = []

    # One output per registered format, all sharing the "release" base name.
    for fmt, writer_cls in output.FORMATS.items():
        target = os.path.join(tmpdir.strpath, "release" + output.FORMATS_SUFFIX[fmt])
        writer = writer_cls(
            parser=parser, main_sheet_name="release", output_name=target
        )
        writer.write_sheets()

    # Expected header row for each sheet.
    expected_headers = (("release", ["a", "d"]), ("b", ["ocid", "c"]))

    # XLSX
    workbook = openpyxl.load_workbook(tmpdir.join("release.xlsx").strpath)
    assert workbook.sheetnames == ["release", "b"]
    for sheet_name, header in expected_headers:
        sheet_rows = list(workbook[sheet_name].rows)
        assert len(sheet_rows) == 1
        assert [cell.value for cell in sheet_rows[0]] == header

    # CSV: one file per sheet inside a "release" directory.
    csv_dir = tmpdir.join("release")
    assert set(csv_dir.listdir()) == {
        csv_dir.join("release.csv"),
        csv_dir.join("b.csv"),
    }
    assert csv_dir.join("release.csv").read().strip("\r\n") == "a,d"
    assert csv_dir.join("b.csv").read().strip("\r\n") == "ocid,c"

    # ODS
    ods_book = ODSReader(tmpdir.join("release.ods").strpath)
    for sheet_name, header in expected_headers:
        ods_sheet_rows = ods_book.getSheet(sheet_name)
        assert len(ods_sheet_rows) == 1
        assert list(ods_sheet_rows[0]) == header
Esempio n. 9
0
def test_sub_sheet_list_like():
    """A Sheet iterates like a list of its fields and grows via append() and add_field()."""
    # .append() is used in json_input.py at https://github.com/OpenDataServices/flatten-tool/blob/master/flattentool/json_input.py#L33
    sheet = Sheet()

    # A new sheet iterates as empty.
    assert list(sheet) == []

    # append() adds fields at the end.
    for name in ('a', 'b'):
        sheet.append(name)
    assert list(sheet) == ['a', 'b']

    # add_field() without options also adds at the end ...
    sheet.add_field('c')
    assert list(sheet) == ['a', 'b', 'c']

    # ... whereas an id field is placed at the start.
    sheet.add_field('d', id_field=True)
    assert list(sheet) == ['d', 'a', 'b', 'c']
Esempio n. 10
0
def test_populated_lines(tmpdir):
    """Write two data lines per sheet and verify the XLSX and CSV output."""
    child_sheet = Sheet(root_id='ocid')
    child_sheet.add_field('c')
    parser = MockParser(['a'], {})
    parser.main_sheet.lines = [{'a': 'cell1'}, {'a': 'cell2'}]
    child_sheet.lines = [{'c': 'cell3'}, {'c': 'cell4'}]
    parser.sub_sheets['b'] = child_sheet

    # One output per registered format, all sharing the "release" base name.
    for fmt, writer_cls in output.FORMATS.items():
        target = os.path.join(
            tmpdir.strpath, 'release' + output.FORMATS_SUFFIX[fmt])
        writer = writer_cls(
            parser=parser, main_sheet_name='release', output_name=target)
        writer.write_sheets()

    # XLSX: header row followed by the two data rows of each sheet.
    expected_sheets = (
        ('release', [['a'], ['cell1'], ['cell2']]),
        ('b', [['ocid', 'c'], [None, 'cell3'], [None, 'cell4']]),
    )
    workbook = openpyxl.load_workbook(tmpdir.join('release.xlsx').strpath)
    assert workbook.sheetnames == ['release', 'b']
    for sheet_name, expected_rows in expected_sheets:
        sheet_rows = list(workbook[sheet_name].rows)
        assert len(sheet_rows) == 3
        for row, expected_row in zip(sheet_rows, expected_rows):
            assert [cell.value for cell in row] == expected_row

    # CSV: one file per sheet inside a "release" directory.
    csv_dir = tmpdir.join('release')
    assert set(csv_dir.listdir()) == {
        csv_dir.join('release.csv'),
        csv_dir.join('b.csv'),
    }
    # Carriage returns are removed so the comparison does not depend on the
    # line terminator used in the files.
    main_csv = csv_dir.join('release.csv').read().strip('\r\n').replace('\r', '')
    assert main_csv == 'a\ncell1\ncell2'
    sub_csv = csv_dir.join('b.csv').read().strip('\r\n').replace('\r', '')
    assert sub_csv == 'ocid,c\n,cell3\n,cell4'
Esempio n. 11
0
    def __init__(self,
                 schema_filename=None,
                 root_schema_dict=None,
                 main_sheet_name='main',
                 rollup=False,
                 root_id='ocid',
                 use_titles=False):
        """Initialise parser state and load the root schema.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        supplied.

        :param schema_filename: path of a local schema file, or an
            ``http://`` / ``https://`` URL to fetch the schema from.
        :param root_schema_dict: an already-loaded schema, stored as-is.
        :param main_sheet_name: stored as ``self.main_sheet_name``.
        :param rollup: stored as ``self.rollup``.
        :param root_id: stored as ``self.root_id``.
        :param use_titles: stored as ``self.use_titles``.
        :raises ValueError: if neither or both of ``schema_filename`` and
            ``root_schema_dict`` are supplied.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.main_sheet_name = main_sheet_name
        self.rollup = rollup
        self.root_id = root_id
        self.use_titles = use_titles
        self.title_lookup = TitleLookup()
        self.flattened = {}

        if root_schema_dict is None and schema_filename is None:
            raise ValueError(
                'One of schema_filename or root_schema_dict must be supplied')
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError(
                'Only one of schema_filename or root_schema_dict should be supplied'
            )
        if schema_filename:
            # Match the URL scheme explicitly: a bare 'http' prefix would also
            # capture local files such as "http_schema.json".
            if schema_filename.startswith(('http://', 'https://')):
                import requests
                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(
                    r.text, object_pairs_hook=OrderedDict)
            else:
                with codecs.open(schema_filename,
                                 encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(
                        schema_file, object_pairs_hook=OrderedDict)
        else:
            self.root_schema_dict = root_schema_dict
def test_sub_sheet_list_like():
    """Check that a Sheet can be appended to and iterated like a list of fields."""
    # .append() is used in json_input.py at https://github.com/OpenDataServices/flatten-tool/blob/master/flattentool/json_input.py#L33
    fields = Sheet()
    assert list(fields) == []

    # Plain appends keep insertion order.
    fields.append('a')
    fields.append('b')
    expected = ['a', 'b']
    assert list(fields) == expected

    # add_field() behaves like append() by default ...
    fields.add_field('c')
    expected = expected + ['c']
    assert list(fields) == expected

    # ... but with id_field=True the new field goes to the front.
    fields.add_field('d', id_field=True)
    expected = ['d'] + expected
    assert list(fields) == expected
Esempio n. 13
0
    def __init__(self,
                 json_filename=None,
                 root_json_dict=None,
                 main_sheet_name='main',
                 schema_parser=None,
                 root_list_path=None,
                 root_id='ocid',
                 use_titles=False):
        """Initialise parser state and load the input JSON.

        Exactly one of ``json_filename`` and ``root_json_dict`` must be
        supplied.

        :param json_filename: path of the JSON file to load.
        :param root_json_dict: an already-loaded document, stored as-is.
        :param main_sheet_name: stored as ``self.main_sheet_name``.
        :param schema_parser: optional schema parser whose ``main_sheet``,
            ``sub_sheets``, ``rollup`` and (re-keyed) ``sub_sheet_mapping``
            are reused; without one the mapping is empty and ``self.rollup``
            is ``False``.
        :param root_list_path: stored as ``self.root_list_path``.
        :param root_id: stored as ``self.root_id``.
        :param use_titles: stored as ``self.use_titles``.
        :raises ValueError: if neither or both of ``json_filename`` and
            ``root_json_dict`` are supplied.
        :raises BadlyFormedJSONError: if the JSON file cannot be decoded.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.main_sheet_name = main_sheet_name
        self.root_list_path = root_list_path
        self.root_id = root_id
        self.use_titles = use_titles
        if schema_parser:
            # Drop the first '/'-separated component from each mapping key.
            self.sub_sheet_mapping = {
                '/'.join(k.split('/')[1:]): v
                for k, v in schema_parser.sub_sheet_mapping.items()
            }
            self.main_sheet = schema_parser.main_sheet
            self.sub_sheets = schema_parser.sub_sheets
            # Rollup is pulled from the schema_parser, as rollup is only possible if a schema parser is specified
            self.rollup = schema_parser.rollup
            self.schema_parser = schema_parser
        else:
            self.sub_sheet_mapping = {}
            self.rollup = False

        if json_filename is None and root_json_dict is None:
            raise ValueError(
                'Either json_filename or root_json_dict must be supplied')

        if json_filename is not None and root_json_dict is not None:
            raise ValueError(
                'Only one of json_filename or root_json_dict should be supplied')

        if json_filename:
            with codecs.open(json_filename, encoding='utf-8') as json_file:
                try:
                    # Floats are parsed as Decimal and key order is kept.
                    self.root_json_dict = json.load(
                        json_file,
                        object_pairs_hook=OrderedDict,
                        parse_float=Decimal)
                except ValueError as err:
                    raise BadlyFormedJSONError(*err.args)
        else:
            self.root_json_dict = root_json_dict
Esempio n. 14
0
    def __init__(self, schema_filename=None, root_schema_dict=None, rollup=False, root_id=None, use_titles=False,
                 disable_local_refs=False, truncation_length=3, exclude_deprecated_fields=False):
        """Initialise parser state and load the root schema.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        supplied.

        :param schema_filename: path of a local schema file, or an
            ``http://`` / ``https://`` URL to fetch the schema from.
        :param root_schema_dict: an already-loaded schema, stored as-is.
        :param rollup: stored as ``self.rollup``.
        :param root_id: stored as ``self.root_id``.
        :param use_titles: stored as ``self.use_titles``.
        :param disable_local_refs: when true, a local schema file is loaded
            with a ``JsonLoaderLocalRefsDisabled`` loader instead of being
            given a ``file:`` base URI.
        :param truncation_length: stored as ``self.truncation_length``.
        :param exclude_deprecated_fields: stored as
            ``self.exclude_deprecated_fields``.
        :raises ValueError: if neither or both of ``schema_filename`` and
            ``root_schema_dict`` are supplied.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.rollup = rollup
        self.root_id = root_id
        self.use_titles = use_titles
        self.truncation_length = truncation_length
        self.title_lookup = TitleLookup()
        self.flattened = {}
        self.exclude_deprecated_fields = exclude_deprecated_fields

        if root_schema_dict is None and schema_filename is None:
            raise ValueError('One of schema_filename or root_schema_dict must be supplied')
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError('Only one of schema_filename or root_schema_dict should be supplied')
        if schema_filename:
            # Match the URL scheme explicitly: a bare 'http' prefix would also
            # capture local files such as "http_schema.json".
            if schema_filename.startswith(('http://', 'https://')):
                import requests
                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(r.text, object_pairs_hook=OrderedDict)
            else:
                if disable_local_refs:
                    with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict,
                                                             loader=JsonLoaderLocalRefsDisabled())
                else:
                    # Build a file: URI for the schema so jsonref has a base
                    # to resolve references against.
                    if sys.version_info[:2] > (3, 0):
                        base_uri = pathlib.Path(os.path.realpath(schema_filename)).as_uri()
                    else:
                        base_uri = urlparse.urljoin('file:', urllib.pathname2url(os.path.abspath(schema_filename)))
                    with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict,
                                                             base_uri=base_uri)

        else:
            self.root_schema_dict = root_schema_dict
Esempio n. 15
0
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure."""
    def __init__(self,
                 schema_filename=None,
                 root_schema_dict=None,
                 rollup=False,
                 root_id=None,
                 use_titles=False,
                 disable_local_refs=False,
                 truncation_length=3,
                 exclude_deprecated_fields=False):
        """Initialise parser state and load the root schema.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        supplied.

        :param schema_filename: path of a local schema file, or an
            ``http://`` / ``https://`` URL to fetch the schema from.
        :param root_schema_dict: an already-loaded schema, stored as-is.
        :param rollup: stored as ``self.do_rollup``; ``self.rollup`` itself
            starts out as an empty set that ``parse_schema_dict`` adds
            rolled-up field paths to.
        :param root_id: stored as ``self.root_id`` and passed to each
            sub-sheet that ``parse_schema_dict`` creates.
        :param use_titles: use schema titles rather than field paths as
            column headings.
        :param disable_local_refs: when true, a local schema file is loaded
            with a ``JsonLoaderLocalRefsDisabled`` loader instead of being
            given a ``file:`` base URI.
        :param truncation_length: passed to ``make_sub_sheet_name`` when
            naming sub-sheets.
        :param exclude_deprecated_fields: skip properties whose schema has a
            truthy ``deprecated`` entry.
        :raises ValueError: if neither or both of ``schema_filename`` and
            ``root_schema_dict`` are supplied.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.do_rollup = rollup
        self.rollup = set()
        self.root_id = root_id
        self.use_titles = use_titles
        self.truncation_length = truncation_length
        self.title_lookup = TitleLookup()
        self.flattened = {}
        self.exclude_deprecated_fields = exclude_deprecated_fields

        if root_schema_dict is None and schema_filename is None:
            raise ValueError(
                'One of schema_filename or root_schema_dict must be supplied')
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError(
                'Only one of schema_filename or root_schema_dict should be supplied'
            )
        if schema_filename:
            # Match the URL scheme explicitly: a bare 'http' prefix would also
            # capture local files such as "http_schema.json".
            if schema_filename.startswith(('http://', 'https://')):
                import requests
                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(
                    r.text, object_pairs_hook=OrderedDict)
            else:
                if disable_local_refs:
                    with codecs.open(schema_filename,
                                     encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            loader=JsonLoaderLocalRefsDisabled())
                else:
                    # Build a file: URI for the schema so jsonref has a base
                    # to resolve references against.
                    if sys.version_info[:2] > (3, 0):
                        base_uri = pathlib.Path(
                            os.path.realpath(schema_filename)).as_uri()
                    else:
                        base_uri = urlparse.urljoin(
                            'file:',
                            urllib.pathname2url(
                                os.path.abspath(schema_filename)))
                    with codecs.open(schema_filename,
                                     encoding="utf-8") as schema_file:
                        self.root_schema_dict = jsonref.load(
                            schema_file,
                            object_pairs_hook=OrderedDict,
                            base_uri=base_uri)

        else:
            self.root_schema_dict = root_schema_dict

    def parse(self):
        """Populate ``self.main_sheet`` from the root schema.

        Each ``(field, title)`` pair produced by ``parse_schema_dict`` for
        the root schema becomes one main-sheet column: the field path itself
        when ``use_titles`` is off, otherwise the title, which is also
        recorded in ``main_sheet.titles`` under the field path.  With
        ``use_titles`` on, fields without a title are skipped with a warning.
        """
        for field, title in self.parse_schema_dict('', self.root_schema_dict):
            if not self.use_titles:
                self.main_sheet.append(field)
                continue
            if title:
                self.main_sheet.append(title)
                self.main_sheet.titles[field] = title
            else:
                warn('Field {} does not have a title, skipping.'.format(field))

    def parse_schema_dict(self,
                          parent_path,
                          schema_dict,
                          parent_id_fields=None,
                          title_lookup=None,
                          parent_title=''):
        if parent_path:
            parent_path = parent_path + '/'
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup

        if 'type' in schema_dict and schema_dict['type'] == 'array' \
                and 'items' in schema_dict and 'oneOf' in schema_dict['items']:
            for oneOf in schema_dict['items']['oneOf']:
                if 'type' in oneOf and oneOf['type'] == 'object':
                    for field, child_title in self.parse_schema_dict(
                            parent_path,
                            oneOf,
                            parent_id_fields=parent_id_fields,
                            title_lookup=title_lookup,
                            parent_title=parent_title):
                        yield (field, child_title)

        elif 'properties' in schema_dict:
            if 'id' in schema_dict['properties']:
                if self.use_titles:
                    id_fields = parent_id_fields + [
                        (parent_title
                         if parent_title is not None else parent_path) +
                        (schema_dict['properties']['id'].get('title') or 'id')
                    ]
                else:
                    id_fields = parent_id_fields + [parent_path + 'id']
            else:
                id_fields = parent_id_fields

            for property_name, property_schema_dict in schema_dict[
                    'properties'].items():
                if self.exclude_deprecated_fields and property_schema_dict.get(
                        'deprecated'):
                    continue

                property_type_set = get_property_type_set(property_schema_dict)

                title = property_schema_dict.get('title')
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                if 'object' in property_type_set:
                    self.flattened[parent_path + property_name] = "object"
                    for field, child_title in self.parse_schema_dict(
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title + title + ':'
                            if parent_title is not None and title else None):
                        yield (
                            property_name + '/' + field,
                            # TODO ambiguous use of "title"
                            (title + ':' +
                             child_title if title and child_title else None))

                elif 'array' in property_type_set:
                    flattened_key = parent_path.replace('/0/',
                                                        '/') + property_name
                    self.flattened[flattened_key] = "array"
                    type_set = get_property_type_set(
                        property_schema_dict['items'])
                    if 'string' in type_set or not type_set:
                        self.flattened[flattened_key] = "string_array"
                        yield property_name, title
                    elif 'number' in type_set:
                        self.flattened[flattened_key] = "number_array"
                        yield property_name, title
                    elif 'array' in type_set:
                        self.flattened[flattened_key] = "array_array"
                        nested_type_set = get_property_type_set(
                            property_schema_dict['items']['items'])
                        if 'string' in nested_type_set or 'number' in nested_type_set:
                            yield property_name, title
                        else:
                            raise ValueError
                    elif 'object' in type_set:
                        if title:
                            title_lookup[title].property_name = property_name

                        sub_sheet_name = make_sub_sheet_name(
                            parent_path,
                            property_name,
                            truncation_length=self.truncation_length)
                        #self.sub_sheet_mapping[parent_name+'/'+property_name] = sub_sheet_name

                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(
                                root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        sub_sheet.title_lookup = title_lookup.get(title)

                        for field in id_fields:
                            sub_sheet.add_field(field, id_field=True)
                            sub_sheet.titles[title_lookup.lookup_header(
                                field)] = field
                        fields = self.parse_schema_dict(
                            parent_path + property_name + '/0',
                            property_schema_dict['items'],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title + title + ':'
                            if parent_title is not None and title else None)

                        rollup_fields = set()
                        for field, child_title in fields:
                            full_path = parent_path + property_name + '/0/' + field
                            if self.use_titles:
                                if not child_title or parent_title is None:
                                    warn(
                                        'Field {}{}/0/{} is missing a title, skipping.'
                                        .format(parent_path, property_name,
                                                field))
                                elif not title:
                                    warn(
                                        'Field {}{} does not have a title, skipping it and all its children.'
                                        .format(parent_path, property_name))
                                else:
                                    # This code only works for arrays that are at 0 or 1 layer of nesting
                                    full_title = parent_title + title + ':' + child_title
                                    sub_sheet.add_field(full_title)
                                    sub_sheet.titles[full_path] = full_title
                            else:
                                sub_sheet.add_field(full_path)
                            if self.do_rollup and 'rollUp' in property_schema_dict and field in property_schema_dict[
                                    'rollUp']:
                                rollup_fields.add(field)
                                self.rollup.add(full_path)
                                yield property_name + '/0/' + field, (
                                    title + ':' + child_title
                                    if title and child_title else None)

                        # Check that all items in rollUp are in the schema
                        if self.do_rollup and 'rollUp' in property_schema_dict:
                            missedRollUp = set(
                                property_schema_dict['rollUp']) - rollup_fields
                            if missedRollUp:
                                warn('{} in rollUp but not in schema'.format(
                                    ', '.join(missedRollUp)))

                    else:
                        raise ValueError(
                            'Unknown type_set: {}, did you forget to explicity set the "type" key on "items"?'
                            .format(type_set))
                elif 'string' in property_type_set or not property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "string"
                    yield property_name, title
                elif 'number' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "number"
                    yield property_name, title
                elif 'integer' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "integer"
                    yield property_name, title
                elif 'boolean' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "boolean"
                    yield property_name, title
                else:
                    warn(
                        'Unrecognised types {} for property "{}" with context "{}",'
                        'so this property has been ignored.'.format(
                            repr(property_type_set), property_name,
                            parent_path))

        else:
            warn('Skipping field "{}", because it has no properties.'.format(
                parent_path))
Esempio n. 16
0
    def parse_schema_dict(self,
                          parent_path,
                          schema_dict,
                          parent_id_fields=None,
                          title_lookup=None,
                          parent_title=''):
        """Walk a schema fragment and yield the fields that belong on the current sheet.

        This is a recursive generator. For every property of ``schema_dict`` it
        records the property's kind in ``self.flattened`` and yields
        ``(field_path, title)`` tuples, where ``field_path`` is relative to
        ``parent_path``. Arrays of objects are not yielded directly: their
        fields are added to a sub sheet stored in ``self.sub_sheets`` and only
        fields named in the property's ``rollUp`` list are yielded (when
        ``self.do_rollup`` is set).

        Args:
            parent_path: Slash-separated path of the enclosing schema node; a
                trailing '/' is appended here when it is non-empty.
            schema_dict: The schema fragment to walk (reads the 'type',
                'items' and 'properties' keys).
            parent_id_fields: Id column headers inherited from enclosing
                nodes; treated as an empty list when falsy.
            title_lookup: Title lookup for this level; falls back to
                ``self.title_lookup`` when None.
            parent_title: Colon-terminated title prefix of the enclosing
                nodes. None is passed down once an ancestor has no usable
                title, which later suppresses title-based headers.

        Yields:
            ``(field_path, title)`` tuples; ``title`` may be None.

        Raises:
            ValueError: For an array whose item type is not handled, or an
                array of arrays whose inner items are neither string nor
                number.
        """
        if parent_path:
            parent_path = parent_path + '/'
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup

        # An array whose items are a oneOf: walk every alternative that is an
        # object, as if its properties belonged to this node directly.
        if 'type' in schema_dict and schema_dict['type'] == 'array' \
                and 'items' in schema_dict and 'oneOf' in schema_dict['items']:
            for oneOf in schema_dict['items']['oneOf']:
                if 'type' in oneOf and oneOf['type'] == 'object':
                    for field, child_title in self.parse_schema_dict(
                            parent_path,
                            oneOf,
                            parent_id_fields=parent_id_fields,
                            title_lookup=title_lookup,
                            parent_title=parent_title):
                        yield (field, child_title)

        elif 'properties' in schema_dict:
            # If this node has an 'id' property, add it to the id columns that
            # every sub sheet created below this node will carry.
            if 'id' in schema_dict['properties']:
                if self.use_titles:
                    # Title-based header; falls back to the path prefix when
                    # there is no usable parent title, and to the literal
                    # 'id' when the id property has no title.
                    id_fields = parent_id_fields + [
                        (parent_title
                         if parent_title is not None else parent_path) +
                        (schema_dict['properties']['id'].get('title') or 'id')
                    ]
                else:
                    id_fields = parent_id_fields + [parent_path + 'id']
            else:
                id_fields = parent_id_fields

            for property_name, property_schema_dict in schema_dict[
                    'properties'].items():
                if self.exclude_deprecated_fields and property_schema_dict.get(
                        'deprecated'):
                    continue

                property_type_set = get_property_type_set(property_schema_dict)

                # Register the title so that headers can later be resolved
                # back to this property name.
                title = property_schema_dict.get('title')
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                if 'object' in property_type_set:
                    # NOTE(review): unlike the other branches, this key keeps
                    # any '/0/' segments of parent_path - confirm intended.
                    self.flattened[parent_path + property_name] = "object"
                    # Nested object: its fields stay on the current sheet,
                    # prefixed with this property's name (and title).
                    for field, child_title in self.parse_schema_dict(
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title + title + ':'
                            if parent_title is not None and title else None):
                        yield (
                            property_name + '/' + field,
                            # TODO ambiguous use of "title"
                            (title + ':' +
                             child_title if title and child_title else None))

                elif 'array' in property_type_set:
                    flattened_key = parent_path.replace('/0/',
                                                        '/') + property_name
                    # Provisional value; refined below once the item type is
                    # known (it stays "array" for arrays of objects).
                    self.flattened[flattened_key] = "array"
                    type_set = get_property_type_set(
                        property_schema_dict['items'])
                    # Items with no declared type are treated as strings.
                    if 'string' in type_set or not type_set:
                        self.flattened[flattened_key] = "string_array"
                        yield property_name, title
                    elif 'number' in type_set:
                        self.flattened[flattened_key] = "number_array"
                        yield property_name, title
                    elif 'array' in type_set:
                        self.flattened[flattened_key] = "array_array"
                        nested_type_set = get_property_type_set(
                            property_schema_dict['items']['items'])
                        if 'string' in nested_type_set or 'number' in nested_type_set:
                            yield property_name, title
                        else:
                            # Raised without a message: only string/number
                            # inner items are accepted for arrays of arrays.
                            raise ValueError
                    elif 'object' in type_set:
                        # Array of objects: the items get their own sub sheet.
                        if title:
                            title_lookup[title].property_name = property_name

                        sub_sheet_name = make_sub_sheet_name(
                            parent_path,
                            property_name,
                            truncation_length=self.truncation_length)
                        #self.sub_sheet_mapping[parent_name+'/'+property_name] = sub_sheet_name

                        # Reuse the sheet if another property already produced
                        # the same sub sheet name.
                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(
                                root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        sub_sheet.title_lookup = title_lookup.get(title)

                        # Id columns of all enclosing nodes come first.
                        for field in id_fields:
                            sub_sheet.add_field(field, id_field=True)
                            sub_sheet.titles[title_lookup.lookup_header(
                                field)] = field
                        # This is a generator: the recursion only runs while
                        # the loop below consumes it.
                        fields = self.parse_schema_dict(
                            parent_path + property_name + '/0',
                            property_schema_dict['items'],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title + title + ':'
                            if parent_title is not None and title else None)

                        rollup_fields = set()
                        for field, child_title in fields:
                            full_path = parent_path + property_name + '/0/' + field
                            if self.use_titles:
                                # With titles enabled, a field is only added
                                # when the whole title chain is available.
                                if not child_title or parent_title is None:
                                    warn(
                                        'Field {}{}/0/{} is missing a title, skipping.'
                                        .format(parent_path, property_name,
                                                field))
                                elif not title:
                                    warn(
                                        'Field {}{} does not have a title, skipping it and all its children.'
                                        .format(parent_path, property_name))
                                else:
                                    # This code only works for arrays that are at 0 or 1 layer of nesting
                                    full_title = parent_title + title + ':' + child_title
                                    sub_sheet.add_field(full_title)
                                    sub_sheet.titles[full_path] = full_title
                            else:
                                sub_sheet.add_field(full_path)
                            # Rolled-up fields are additionally yielded so the
                            # caller can place them on the current sheet.
                            if self.do_rollup and 'rollUp' in property_schema_dict and field in property_schema_dict[
                                    'rollUp']:
                                rollup_fields.add(field)
                                self.rollup.add(full_path)
                                yield property_name + '/0/' + field, (
                                    title + ':' + child_title
                                    if title and child_title else None)

                        # Check that all items in rollUp are in the schema
                        if self.do_rollup and 'rollUp' in property_schema_dict:
                            missedRollUp = set(
                                property_schema_dict['rollUp']) - rollup_fields
                            if missedRollUp:
                                warn('{} in rollUp but not in schema'.format(
                                    ', '.join(missedRollUp)))

                    else:
                        # Any item type set not matched above ends up here,
                        # e.g. one that contains only 'integer'.
                        raise ValueError(
                            'Unknown type_set: {}, did you forget to explicity set the "type" key on "items"?'
                            .format(type_set))
                # Scalar properties: record the type and yield the field. A
                # property with no declared type is treated as a string.
                elif 'string' in property_type_set or not property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "string"
                    yield property_name, title
                elif 'number' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "number"
                    yield property_name, title
                elif 'integer' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "integer"
                    yield property_name, title
                elif 'boolean' in property_type_set:
                    self.flattened[parent_path.replace('/0/', '/') +
                                   property_name] = "boolean"
                    yield property_name, title
                else:
                    warn(
                        'Unrecognised types {} for property "{}" with context "{}",'
                        'so this property has been ignored.'.format(
                            repr(property_type_set), property_name,
                            parent_path))

        else:
            # Neither a oneOf array nor a node with 'properties': nothing to
            # flatten, so only warn.
            warn('Skipping field "{}", because it has no properties.'.format(
                parent_path))
Esempio n. 17
0
    def __init__(
        self,
        json_filename=None,
        root_json_dict=None,
        schema_parser=None,
        root_list_path=None,
        root_id="ocid",
        use_titles=False,
        xml=False,
        id_name="id",
        filter_field=None,
        filter_value=None,
        preserve_fields=None,
        remove_empty_schema_columns=False,
        rollup=False,
        truncation_length=3,
    ):
        """Load the input document and set up the sheets it will be flattened into.

        Args:
            json_filename: Path of the input file. Read as UTF-8 JSON, or
                parsed with xmltodict when ``xml`` is true.
            root_json_dict: Already-parsed input data. Exactly one of
                ``json_filename`` and ``root_json_dict`` must be given.
            schema_parser: Optional schema parser. Its ``main_sheet`` and
                ``sub_sheets`` are deep-copied as the starting sheets, and its
                ``rollup`` and ``flattened`` attributes are consulted.
            root_list_path: Stored on the instance; in XML mode it is also
                the element name forced to be a list.
            root_id: Stored on the instance as ``root_id``.
            use_titles: Stored on the instance as ``use_titles``.
            xml: When true, ``json_filename`` is opened and parsed as XML.
            id_name: Stored on the instance as ``id_name``.
            filter_field: Stored on the instance as ``filter_field``.
            filter_value: Stored on the instance as ``filter_value``.
            preserve_fields: Path of a text file listing one field path per
                line; when falsy, ``preserve_fields`` and
                ``preserve_fields_input`` are set to None.
            remove_empty_schema_columns: When true, the columns copied from
                the schema parser are cleared.
            rollup: Falsy for no rollup. Otherwise a list holding either
                ``True`` alone, the path of a file with one path per line, or
                the paths themselves. Rollup paths found on the schema parser
                take precedence over this argument.
            truncation_length: Stored on the instance as ``truncation_length``.

        Raises:
            ValueError: If neither or both of ``json_filename`` and
                ``root_json_dict`` are supplied.
            BadlyFormedJSONErrorUTF8: If the JSON file is not valid UTF-8.
            BadlyFormedJSONError: If the JSON file cannot be parsed.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.root_list_path = root_list_path
        self.root_id = root_id
        self.use_titles = use_titles
        self.truncation_length = truncation_length
        self.id_name = id_name
        self.xml = xml
        self.filter_field = filter_field
        self.filter_value = filter_value
        self.remove_empty_schema_columns = remove_empty_schema_columns
        self.seen_paths = set()

        if schema_parser:
            # Work on copies so that parsing data never mutates the schema
            # parser's own sheets.
            self.main_sheet = copy.deepcopy(schema_parser.main_sheet)
            self.sub_sheets = copy.deepcopy(schema_parser.sub_sheets)
            if remove_empty_schema_columns:
                # Don't use columns from the schema parser
                # (avoids empty columns)
                self.main_sheet.columns = []
                for sheet in self.sub_sheets.values():
                    sheet.columns = []
            self.schema_parser = schema_parser
        else:
            self.schema_parser = None

        self.rollup = False
        if rollup:
            if schema_parser and len(schema_parser.rollup) > 0:
                # If rollUp is present in the schema this takes precedence over direct input.
                self.rollup = schema_parser.rollup
                if isinstance(rollup, list) and (
                        len(rollup) > 1
                        or (len(rollup) == 1 and rollup[0] is not True)):
                    warn(
                        _("Using rollUp values from schema, ignoring direct input."
                          ))
            elif isinstance(rollup, list):
                # The bare-flag case must be tested before the filesystem
                # check: True is the integer 1, so os.path.isfile(True) stats
                # file descriptor 1 and is true whenever stdout is redirected
                # to a regular file.
                if len(rollup) == 1 and rollup[0] is True:
                    warn(
                        _("No fields to rollup found (pass json path directly, as a list in a file, or via a schema)"
                          ))
                elif len(rollup) == 1 and os.path.isfile(rollup[0]):
                    # Parse file, one json path per line.
                    with open(rollup[0]) as rollup_file:
                        self.rollup = {line.strip() for line in rollup_file}
                else:
                    # Rollup args passed directly at the commandline
                    self.rollup = set(rollup)
            else:
                warn(
                    _("Invalid value passed for rollup (pass json path directly, as a list in a file, or via a schema)"
                      ))

        if self.xml:
            # NOTE(review): in XML mode json_filename is opened
            # unconditionally and any root_json_dict argument is overwritten
            # by the parsed document.
            with codecs.open(json_filename, "rb") as xml_file:
                top_dict = xmltodict.parse(
                    xml_file,
                    force_list=(root_list_path, ),
                    force_cdata=True,
                )
                # AFAICT, this should be true for *all* XML files
                assert len(top_dict) == 1
                root_json_dict = list(top_dict.values())[0]
                list_dict_consistency(root_json_dict)
            # The document is now held in root_json_dict; clear the filename
            # so the checks below treat it as supplied data.
            json_filename = None

        # NOTE(review): "Etiher" below is a typo; it is left untouched because
        # the string goes through _() and may be a translation key - confirm
        # before correcting.
        if json_filename is None and root_json_dict is None:
            raise ValueError(
                _("Etiher json_filename or root_json_dict must be supplied"))

        if json_filename is not None and root_json_dict is not None:
            raise ValueError(
                _("Only one of json_file or root_json_dict should be supplied")
            )

        if json_filename:
            with codecs.open(json_filename, encoding="utf-8") as json_file:
                try:
                    # Keep key order and exact decimal values from the input.
                    self.root_json_dict = json.load(
                        json_file,
                        object_pairs_hook=OrderedDict,
                        parse_float=Decimal)
                # UnicodeError is a subclass of ValueError, so it has to be
                # handled first.
                except UnicodeError as err:
                    raise BadlyFormedJSONErrorUTF8(*err.args) from err
                except ValueError as err:
                    raise BadlyFormedJSONError(*err.args) from err
        else:
            self.root_json_dict = root_json_dict

        if preserve_fields:
            # Extract fields to be preserved from input file (one path per line)
            preserved_paths = set()
            requested_paths = set()
            with open(preserve_fields) as preserve_fields_file:
                for line in preserve_fields_file:
                    line = line.strip()
                    requested = line.rstrip("/")
                    # Keep the parent path and the last segment as well as the
                    # full path, exactly as rsplit returns them.
                    preserved_paths.update(line.rsplit("/", 1))
                    preserved_paths.add(requested)
                    requested_paths.add(requested)

            self.preserve_fields = preserved_paths
            self.preserve_fields_input = requested_paths

            # Without a schema parser (or one that has no ``flattened``
            # mapping) there is nothing to validate the request against.
            schema_fields = getattr(self.schema_parser, "flattened", None)
            if schema_fields is not None:
                input_not_in_schema = sorted(
                    field for field in requested_paths
                    if field not in schema_fields)
                # Only warn when something is actually missing.
                if input_not_in_schema:
                    warn(
                        _("You wanted to preserve the following fields which are not present in the supplied schema: {}"
                          ).format(input_not_in_schema))
        else:
            self.preserve_fields = None
            self.preserve_fields_input = None
Esempio n. 18
0
    def parse_json_dict(
        self,
        json_dict,
        sheet,
        json_key=None,
        parent_name="",
        flattened_dict=None,
        parent_id_fields=None,
        top_level_of_sub_sheet=False,
    ):
        """
        Flatten one JSON dictionary into a row of ``sheet``.

        Scalar values and nested dictionaries are written into a single row
        dict; lists of non-basic values are sent to sub sheets (one row per
        item) by recursive calls. When this call created the row dict itself
        (``flattened_dict`` was None) the finished row is appended to
        ``sheet.lines``.

        json_dict - the json dictionary
        sheet - a sheet.Sheet object representing the resulting spreadsheet
        json_key - the key that maps to this JSON dict, either directly to the dict, or to a dict that this list contains.  Is None if this dict is contained in root_json_list directly.
        parent_name - path prefix of this dict, ending in "/" (or "" at the top level)
        flattened_dict - the row being built; None starts a new row
        parent_id_fields - mapping of id column key to value inherited from enclosing dicts
        top_level_of_sub_sheet - True when json_dict is an item of a list that is written to a sub sheet

        Raises ValueError for a value of unsupported type, or when a value
        that is being rolled up is not a basic type.
        """
        # Possibly main_sheet should be main_sheet_columns, but this is
        # currently named for consistency with schema.py

        # Pick the helper that turns a path into the key used in the row dict
        # (presumably title-based vs. field-path-based headers).
        if self.use_titles:
            sheet_key = sheet_key_title
        else:
            sheet_key = sheet_key_field

        # Copy so that ids added at this level are not written into the
        # caller's mapping.
        parent_id_fields = copy.copy(parent_id_fields) or OrderedDict()
        if flattened_dict is None:
            # This call owns the row and appends it to the sheet at the end.
            flattened_dict = {}
            top = True
        else:
            top = False

        # Filtering only applies to top-level dicts; a non-matching dict is
        # dropped entirely (no row is appended).
        if parent_name == "" and self.filter_field and self.filter_value:
            if self.filter_field not in json_dict:
                return
            if json_dict[self.filter_field] != self.filter_value:
                return

        if top_level_of_sub_sheet:
            # Add the IDs for the top level of object in an array
            for k, v in parent_id_fields.items():
                if self.xml:
                    # In XML mode the id value is a mapping; use its text.
                    flattened_dict[sheet_key(sheet, k)] = v["#text"]
                else:
                    flattened_dict[sheet_key(sheet, k)] = v

        # Remember this dict's own identifiers for rows created below it.
        if self.root_id and self.root_id in json_dict:
            parent_id_fields[sheet_key(sheet,
                                       self.root_id)] = json_dict[self.root_id]

        if self.id_name in json_dict:
            parent_id_fields[sheet_key(sheet, parent_name +
                                       self.id_name)] = json_dict[self.id_name]

        for key, value in json_dict.items():

            # Keep a unique list of all the JSON paths in the data that have been seen.
            parent_path = parent_name.replace("/0", "")
            full_path = parent_path + key
            self.seen_paths.add(full_path)

            if self.preserve_fields:

                # Skip this key when some preserved path contains parent_path
                # (a substring test) but the key's own path is not preserved.
                siblings = False
                for field in self.preserve_fields:
                    if parent_path in field:
                        siblings = True
                if siblings and full_path not in self.preserve_fields:
                    continue

            if type(value) in BASIC_TYPES:
                if self.xml and key == "#text":
                    # Handle the text output from xmltodict
                    key = ""
                    # NOTE(review): this rebinds parent_name for the remaining
                    # keys of this dict as well - confirm that is intended.
                    parent_name = parent_name.strip("/")
                flattened_dict[sheet_key(sheet, parent_name + key)] = value
            elif hasattr(value, "items"):
                # Nested dict: keep writing into the same row, with a longer
                # path prefix.
                self.parse_json_dict(
                    value,
                    sheet=sheet,
                    json_key=key,
                    parent_name=parent_name + key + "/",
                    flattened_dict=flattened_dict,
                    parent_id_fields=parent_id_fields,
                )
            elif hasattr(value, "__iter__"):
                if all(type(x) in BASIC_TYPES for x in value):
                    # Check for an array of BASIC types
                    # TODO Make this check the schema
                    # TODO Error if the any of the values contain the seperator
                    # TODO Support doubly nested arrays
                    flattened_dict[sheet_key(sheet,
                                             parent_name + key)] = ";".join(
                                                 map(str, value))
                else:
                    if (self.rollup and parent_name == ""
                        ):  # Rollup only currently possible to main sheet

                        if self.use_titles and not self.schema_parser:
                            warn(
                                _("Warning: No schema was provided so column headings are JSON keys, not titles."
                                  ))

                        # Exactly one item: its values can be copied into the
                        # current row.
                        if len(value) == 1:
                            for k, v in value[0].items():

                                if (self.preserve_fields
                                        and parent_name + key + "/" + k
                                        not in self.preserve_fields):
                                    continue

                                if type(v) not in BASIC_TYPES:
                                    raise ValueError(
                                        _("Rolled up values must be basic types"
                                          ))
                                else:
                                    if self.schema_parser:
                                        # We want titles and there's a schema and rollUp is in it
                                        if (self.use_titles
                                                and parent_name + key + "/0/" +
                                                k in self.schema_parser.
                                                main_sheet.titles):
                                            flattened_dict[sheet_key_title(
                                                sheet, parent_name + key +
                                                "/0/" + k)] = v

                                        # We want titles and there's a schema but rollUp isn't in it
                                        # so the titles for rollup properties aren't in the main sheet
                                        # so we need to try to get the titles from a subsheet
                                        elif (self.use_titles and
                                              parent_name + key in self.rollup
                                              and self.schema_parser.
                                              sub_sheet_titles.get((
                                                  parent_name,
                                                  key,
                                              )) in
                                              self.schema_parser.sub_sheets):
                                            relevant_subsheet = self.schema_parser.sub_sheets.get(
                                                self.schema_parser.
                                                sub_sheet_titles.get((
                                                    parent_name,
                                                    key,
                                                )))
                                            if relevant_subsheet is not None:
                                                rollup_field_title = sheet_key_title(
                                                    relevant_subsheet,
                                                    parent_name + key + "/0/" +
                                                    k,
                                                )
                                                flattened_dict[sheet_key(
                                                    sheet,
                                                    rollup_field_title)] = v

                                        # We don't want titles even though there's a schema
                                        elif not self.use_titles and (
                                                parent_name + key + "/0/" + k
                                                in self.schema_parser.
                                                main_sheet or parent_name + key
                                                in self.rollup):
                                            flattened_dict[sheet_key(
                                                sheet, parent_name + key +
                                                "/0/" + k)] = v

                                    # No schema, so no titles
                                    elif parent_name + key in self.rollup:
                                        flattened_dict[sheet_key(
                                            sheet,
                                            parent_name + key + "/0/" + k)] = v

                        # Several items cannot be rolled up into one cell: the
                        # cell gets a warning text instead of a value.
                        elif len(value) > 1:
                            # Every key that occurs in any of the items.
                            for k in set(
                                    sum((list(x.keys()) for x in value), [])):

                                if (self.preserve_fields
                                        and parent_name + key + "/" + k
                                        not in self.preserve_fields):
                                    continue

                                if (self.schema_parser
                                        and parent_name + key + "/0/" + k
                                        in self.schema_parser.main_sheet):
                                    warn(
                                        _('More than one value supplied for "{}". Could not provide rollup, so adding a warning to the relevant cell(s) in the spreadsheet.'
                                          ).format(parent_name + key))
                                    flattened_dict[sheet_key(
                                        sheet, parent_name + key + "/0/" + k
                                    )] = _(
                                        "WARNING: More than one value supplied, consult the relevant sub-sheet for the data."
                                    )
                                elif parent_name + key in self.rollup:
                                    warn(
                                        _('More than one value supplied for "{}". Could not provide rollup, so adding a warning to the relevant cell(s) in the spreadsheet.'
                                          ).format(parent_name + key))
                                    flattened_dict[sheet_key(
                                        sheet, parent_name + key + "/0/" + k
                                    )] = _(
                                        "WARNING: More than one value supplied, consult the relevant sub-sheet for the data."
                                    )

                    # Regardless of rollup, every item of the list is written
                    # to a sub sheet. Prefer the name recorded by the schema
                    # parser when titles are in use, otherwise derive it from
                    # the path.
                    if (self.use_titles and self.schema_parser and (
                            parent_name,
                            key,
                    ) in self.schema_parser.sub_sheet_titles):
                        sub_sheet_name = self.schema_parser.sub_sheet_titles[(
                            parent_name,
                            key,
                        )]
                    else:
                        sub_sheet_name = make_sub_sheet_name(
                            parent_name,
                            key,
                            truncation_length=self.truncation_length)
                    if sub_sheet_name not in self.sub_sheets:
                        self.sub_sheets[sub_sheet_name] = Sheet(
                            name=sub_sheet_name)

                    # NOTE: this loop variable rebinds the json_dict
                    # parameter. The enclosing loop is unaffected because its
                    # items() iterator was created before the rebinding, and
                    # json_dict is not read again afterwards.
                    for json_dict in value:
                        if json_dict is None:
                            continue
                        # No flattened_dict is passed, so each item starts a
                        # new row on the sub sheet.
                        self.parse_json_dict(
                            json_dict,
                            sheet=self.sub_sheets[sub_sheet_name],
                            json_key=key,
                            parent_id_fields=parent_id_fields,
                            parent_name=parent_name + key + "/0/",
                            top_level_of_sub_sheet=True,
                        )
            else:
                raise ValueError(_("Unsupported type {}").format(type(value)))

        if top:
            sheet.lines.append(flattened_dict)
Esempio n. 19
0
    def parse_schema_dict(
        self,
        parent_path,
        schema_dict,
        parent_id_fields=None,
        title_lookup=None,
        parent_title="",
    ):
        """Recursively flatten one level of a JSON schema.

        This is a generator. It yields ``(field, title)`` tuples, where
        ``field`` is a property path relative to ``parent_path`` and
        ``title`` is the property's schema title (a colon-joined chain of
        titles for nested properties), or ``None`` when a title is missing.
        Fields found inside an array of objects are added to that array's
        sub sheet instead, and are only yielded to the caller when rollup is
        enabled and the field is listed in the array's ``rollUp`` key.

        Side effects:
          * ``self.flattened`` records a type label for each visited path.
          * ``title_lookup`` gets an entry for every titled property.
          * ``self.sub_sheets`` gains a ``Sheet`` per array of objects.
          * ``self.rollup`` collects the full paths of rolled-up fields.

        Args:
            parent_path: Path of the schema being parsed; a trailing "/" is
                appended when it is non-empty.
            schema_dict: The schema fragment to walk.
            parent_id_fields: Id columns inherited from enclosing objects;
                ``None`` is treated as an empty list.
            title_lookup: Lookup in which titles are registered; defaults to
                ``self.title_lookup``.
            parent_title: Title prefix accumulated so far. ``None`` signals
                that some ancestor had no title.

        Raises:
            ValueError: If an array's items have a type this method does not
                handle, or a nested array does not contain strings/numbers.
        """
        if parent_path:
            parent_path = parent_path + "/"
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup

        if ("type" in schema_dict and schema_dict["type"] == "array"
                and "items" in schema_dict
                and "oneOf" in schema_dict["items"]):
            # The schema is an array whose items are a oneOf: flatten every
            # alternative that is an object and pass its fields straight up.
            for oneOf in schema_dict["items"]["oneOf"]:
                if "type" in oneOf and oneOf["type"] == "object":
                    for field, child_title in self.parse_schema_dict(
                            parent_path,
                            oneOf,
                            parent_id_fields=parent_id_fields,
                            title_lookup=title_lookup,
                            parent_title=parent_title,
                    ):
                        yield (field, child_title)

        elif "properties" in schema_dict:
            # An "id" property at this level becomes an extra id column that
            # every sub sheet created below this level will carry.
            if "id" in schema_dict["properties"]:
                if self.use_titles:
                    id_fields = parent_id_fields + [
                        (parent_title
                         if parent_title is not None else parent_path) +
                        (schema_dict["properties"]["id"].get("title") or "id")
                    ]
                else:
                    id_fields = parent_id_fields + [parent_path + "id"]
            else:
                id_fields = parent_id_fields

            for property_name, property_schema_dict in schema_dict[
                    "properties"].items():
                if self.exclude_deprecated_fields and property_schema_dict.get(
                        "deprecated"):
                    continue

                property_type_set = get_property_type_set(property_schema_dict)

                title = property_schema_dict.get("title")
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                if "object" in property_type_set:
                    self.flattened[parent_path + property_name] = "object"
                    # Nested object: its fields stay on the current sheet,
                    # prefixed with this property's name (and title).
                    for field, child_title in self.parse_schema_dict(
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title + title + ":"
                            if parent_title is not None and title else None,
                    ):
                        yield (
                            property_name + "/" + field,
                            # TODO ambiguous use of "title"
                            (title + ":" +
                             child_title if title and child_title else None),
                        )

                elif "array" in property_type_set:
                    # Keys in self.flattened drop the "/0/" array markers.
                    flattened_key = parent_path.replace("/0/",
                                                        "/") + property_name
                    self.flattened[flattened_key] = "array"
                    type_set = get_property_type_set(
                        property_schema_dict["items"])
                    if "string" in type_set or not type_set:
                        # Items with no declared type are treated as strings.
                        self.flattened[flattened_key] = "string_array"
                        yield property_name, title
                    elif "number" in type_set:
                        self.flattened[flattened_key] = "number_array"
                        yield property_name, title
                    elif "array" in type_set:
                        self.flattened[flattened_key] = "array_array"
                        nested_type_set = get_property_type_set(
                            property_schema_dict["items"]["items"])
                        if "string" in nested_type_set or "number" in nested_type_set:
                            yield property_name, title
                        else:
                            # Same exception type as before, now with enough
                            # context to locate the offending property.
                            raise ValueError(
                                "Unsupported nested array item types {} for "
                                'property "{}{}": only nested arrays of '
                                "strings or numbers are supported".format(
                                    repr(nested_type_set), parent_path,
                                    property_name))
                    elif "object" in type_set:
                        if title:
                            title_lookup[title].property_name = property_name

                        sub_sheet_name = make_sub_sheet_name(
                            parent_path,
                            property_name,
                            truncation_length=self.truncation_length,
                        )
                        # self.sub_sheet_mapping[parent_name+'/'+property_name] = sub_sheet_name

                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(
                                root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        sub_sheet.title_lookup = title_lookup.get(title)

                        # Every inherited id becomes an id column of the sub
                        # sheet so its rows can be tied back to their parents.
                        for field in id_fields:
                            sub_sheet.add_field(field, id_field=True)
                            sub_sheet.titles[title_lookup.lookup_header(
                                field)] = field
                        fields = self.parse_schema_dict(
                            parent_path + property_name + "/0",
                            property_schema_dict["items"],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=parent_title + title + ":"
                            if parent_title is not None and title else None,
                        )

                        rollup_fields = set()
                        for field, child_title in fields:
                            full_path = parent_path + property_name + "/0/" + field
                            if self.use_titles:
                                if not child_title or parent_title is None:
                                    warn(
                                        "Field {}{}/0/{} is missing a title, skipping."
                                        .format(parent_path, property_name,
                                                field))
                                elif not title:
                                    warn(
                                        "Field {}{} does not have a title, skipping it and all its children."
                                        .format(parent_path, property_name))
                                else:
                                    # This code only works for arrays that are at 0 or 1 layer of nesting
                                    full_title = (parent_title + title + ":" +
                                                  child_title)
                                    sub_sheet.add_field(full_title)
                                    sub_sheet.titles[full_path] = full_title
                            else:
                                sub_sheet.add_field(full_path)
                            # Rolled-up fields are additionally yielded so the
                            # caller can place them on the parent sheet.
                            if (self.do_rollup
                                    and "rollUp" in property_schema_dict and
                                    field in property_schema_dict["rollUp"]):
                                rollup_fields.add(field)
                                self.rollup.add(full_path)
                                yield property_name + "/0/" + field, (
                                    title + ":" + child_title
                                    if title and child_title else None)

                        # Check that all items in rollUp are in the schema
                        if self.do_rollup and "rollUp" in property_schema_dict:
                            missedRollUp = (
                                set(property_schema_dict["rollUp"]) -
                                rollup_fields)
                            if missedRollUp:
                                warn("{} in rollUp but not in schema".format(
                                    ", ".join(missedRollUp)))

                    else:
                        raise ValueError(
                            'Unknown type_set: {}, did you forget to explicity set the "type" key on "items"?'
                            .format(type_set))
                elif "string" in property_type_set or not property_type_set:
                    # Properties with no declared type are treated as strings.
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "string"
                    yield property_name, title
                elif "number" in property_type_set:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "number"
                    yield property_name, title
                elif "integer" in property_type_set:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "integer"
                    yield property_name, title
                elif "boolean" in property_type_set:
                    self.flattened[parent_path.replace("/0/", "/") +
                                   property_name] = "boolean"
                    yield property_name, title
                else:
                    warn(
                        'Unrecognised types {} for property "{}" with context "{}",'
                        "so this property has been ignored.".format(
                            repr(property_type_set), property_name,
                            parent_path))

        else:
            warn('Skipping field "{}", because it has no properties.'.format(
                parent_path))
Esempio n. 20
0
    def parse_schema_dict(self,
                          parent_name,
                          parent_path,
                          schema_dict,
                          parent_id_fields=None,
                          title_lookup=None):
        """Recursively flatten one level of a JSON schema.

        This is a generator. It yields ``(field, title)`` tuples, where
        ``field`` is the property path relative to this level (suffixed with
        ``:array``, ``:number``, ``:integer`` or ``:boolean`` for those
        types) and ``title`` is the schema title, or ``None`` when a title is
        missing. Fields found inside an array of objects are added to that
        array's sub sheet and are only yielded when ``self.rollup`` is set
        and the field is listed in the array's ``rollUp`` key.

        Side effects:
          * ``self.flattened`` records a type label for each visited path.
          * ``title_lookup`` gets an entry for every titled property.
          * ``self.sub_sheets`` / ``self.sub_sheet_mapping`` gain an entry
            for each array of objects.

        Args:
            parent_name: Name used to build id columns, sub sheet mapping
                keys and warning messages.
            parent_path: Path used for ``self.flattened`` keys; a trailing
                "/" is appended when it is non-empty.
            schema_dict: The schema fragment to walk.
            parent_id_fields: Id columns inherited from enclosing objects;
                ``None`` is treated as an empty list.
            title_lookup: Lookup in which titles are registered; defaults to
                ``self.title_lookup``.

        Raises:
            ValueError: If an array's items are neither strings, arrays of
                strings, nor objects.
        """
        if parent_path:
            parent_path = parent_path + '/'
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup
        if 'properties' in schema_dict:
            # An "id" property at this level becomes an extra id column for
            # every sub sheet created below this level.
            if 'id' in schema_dict['properties']:
                id_fields = parent_id_fields + [parent_name + '/id']
            else:
                id_fields = parent_id_fields

            for property_name, property_schema_dict in schema_dict[
                    'properties'].items():
                property_type_set = get_property_type_set(property_schema_dict)

                title = property_schema_dict.get('title')
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                if 'object' in property_type_set:
                    self.flattened[parent_path + property_name] = "object"
                    # Nested object: its fields stay on the current sheet,
                    # prefixed with this property's name (and title).
                    for field, child_title in self.parse_schema_dict(
                            parent_name + '/' + property_name,
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title)):
                        yield (
                            property_name + '/' + field,
                            # TODO ambiguous use of "title"
                            (title + ':' +
                             child_title if title and child_title else None))

                elif 'array' in property_type_set:
                    self.flattened[parent_path + property_name] = "array"
                    type_set = get_property_type_set(
                        property_schema_dict['items'])
                    if 'string' in type_set:
                        self.flattened[parent_path +
                                       property_name] = "string_array"
                        yield property_name + ':array', title
                    elif 'array' in type_set:
                        self.flattened[parent_path +
                                       property_name] = "array_array"
                        if 'string' in get_property_type_set(
                                property_schema_dict['items']['items']):
                            yield property_name + ':array', title
                        else:
                            # Same exception type as before, now with enough
                            # context to locate the offending property.
                            raise ValueError(
                                'Unsupported nested array for property '
                                '"{}{}": only arrays of arrays of strings '
                                'are supported'.format(
                                    parent_path, property_name))
                    elif 'object' in type_set:
                        if title:
                            title_lookup[
                                title].property_name = property_name + '[]'
                        # Items that came from a $ref name the sub sheet
                        # after the last segment of the reference.
                        if hasattr(property_schema_dict['items'],
                                   '__reference__'):
                            sub_sheet_name = property_schema_dict[
                                'items'].__reference__['$ref'].split('/')[-1]
                        else:
                            sub_sheet_name = property_name

                        self.sub_sheet_mapping[parent_name + '/' +
                                               property_name] = sub_sheet_name

                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(
                                root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        sub_sheet.title_lookup = title_lookup.get(title)

                        for field in id_fields:
                            sub_sheet.add_field(field + ':' + property_name,
                                                id_field=True)
                        fields = self.parse_schema_dict(
                            parent_name + '/' + property_name + '[]',
                            parent_path + property_name,
                            property_schema_dict['items'],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title))

                        rolledUp = set()

                        for field, child_title in fields:
                            if self.use_titles:
                                if not child_title:
                                    warn(
                                        'Field {} does not have a title, skipping.'
                                        .format(field))
                                else:
                                    sub_sheet.add_field(child_title)
                            else:
                                sub_sheet.add_field(field)
                            # Rolled-up fields are additionally yielded so the
                            # caller can place them on the parent sheet.
                            if self.rollup and 'rollUp' in property_schema_dict and field in property_schema_dict[
                                    'rollUp']:
                                rolledUp.add(field)
                                yield property_name + '[]/' + field, (
                                    title + ':' + child_title
                                    if title and child_title else None)

                        # Check that all items in rollUp are in the schema
                        if self.rollup and 'rollUp' in property_schema_dict:
                            missedRollUp = set(
                                property_schema_dict['rollUp']) - rolledUp
                            if missedRollUp:
                                warn('{} in rollUp but not in schema'.format(
                                    ', '.join(missedRollUp)))
                    else:
                        raise ValueError(
                            'Unknown type_set: {} for the items of array '
                            'property "{}{}"'.format(
                                repr(type_set), parent_path, property_name))
                elif 'string' in property_type_set:
                    self.flattened[parent_path + property_name] = "string"
                    yield property_name, title
                elif 'number' in property_type_set:
                    self.flattened[parent_path + property_name] = "number"
                    yield property_name + ':number', title
                elif 'integer' in property_type_set:
                    self.flattened[parent_path + property_name] = "integer"
                    yield property_name + ':integer', title
                elif 'boolean' in property_type_set:
                    self.flattened[parent_path + property_name] = "boolean"
                    yield property_name + ':boolean', title
                else:
                    warn(
                        'Unrecognised types {} for property "{}" with context "{}",'
                        'so this property has been ignored.'.format(
                            repr(property_type_set), property_name,
                            parent_name))
        else:
            warn('Skipping field "{}", because it has no properties.'.format(
                parent_name))
Esempio n. 21
0
 def __init__(self, main_sheet, sub_sheets):
     """Wrap the main-sheet argument and each sub-sheet value in a Sheet."""
     self.main_sheet = Sheet(main_sheet)
     wrapped_sub_sheets = {}
     for sheet_name, sheet_arg in sub_sheets.items():
         wrapped_sub_sheets[sheet_name] = Sheet(sheet_arg)
     self.sub_sheets = wrapped_sub_sheets
Esempio n. 22
0
    def parse_json_dict(self,
                        json_dict,
                        sheet,
                        json_key=None,
                        parent_name='',
                        flattened_dict=None,
                        parent_id_fields=None,
                        top_level_of_sub_sheet=False):
        """
        Parse a json dictionary.

        json_dict - the json dictionary
        sheet - a sheet.Sheet object representing the resulting spreadsheet
        json_key - the key that maps to this JSON dict, either directly to the dict, or to a dict that this list contains.  Is None if this dict is contained in root_json_list directly.
        parent_name - path prefix (ending in '/' when non-empty) put in front of every key of json_dict
        flattened_dict - the row being filled in. When None a new row is started and appended to sheet.lines once json_dict has been processed
        parent_id_fields - mapping of id column to value inherited from enclosing objects. It is copied, so ids added here are only seen by nested calls
        top_level_of_sub_sheet - when true, the inherited ids are written into the row

        Basic values become cells of the current row, nested dicts are
        flattened into the same row, arrays of basic values are joined with
        ';' and arrays of objects produce one row per object in a sub sheet
        (created in self.sub_sheets on first use).

        Raises ValueError for a value of an unsupported type, or when a
        rolled up value is not a basic type.
        """
        # Possibly main_sheet should be main_sheet_columns, but this is
        # currently named for consistency with schema.py

        if self.use_titles:
            sheet_key = sheet_key_title
        else:
            sheet_key = sheet_key_field

        parent_id_fields = copy.copy(parent_id_fields) or OrderedDict()
        if flattened_dict is None:
            flattened_dict = {}
            top = True
        else:
            top = False

        # The filter only applies to objects at the root (empty parent_name);
        # a filtered-out object contributes no row at all.
        if parent_name == '' and self.filter_field and self.filter_value:
            if self.filter_field not in json_dict:
                return
            if json_dict[self.filter_field] != self.filter_value:
                return

        if top_level_of_sub_sheet:
            # Only add the IDs for the top level of object in an array
            for k, v in parent_id_fields.items():
                if self.xml:
                    flattened_dict[sheet_key(sheet, k)] = v['#text']
                else:
                    flattened_dict[sheet_key(sheet, k)] = v

        if self.root_id and self.root_id in json_dict:
            parent_id_fields[sheet_key(sheet,
                                       self.root_id)] = json_dict[self.root_id]

        if self.id_name in json_dict:
            parent_id_fields[sheet_key(sheet, parent_name +
                                       self.id_name)] = json_dict[self.id_name]

        for key, value in json_dict.items():
            if type(value) in BASIC_TYPES:
                if self.xml and key == '#text':
                    # Handle the text output from xmltodict: the text belongs
                    # to the element itself, so drop the trailing '/'.  A
                    # local is used so that sibling keys visited after
                    # '#text' still get the full parent_name prefix.
                    cell_path = parent_name.strip('/')
                else:
                    cell_path = parent_name + key
                flattened_dict[sheet_key(sheet, cell_path)] = value
            elif hasattr(value, 'items'):
                self.parse_json_dict(value,
                                     sheet=sheet,
                                     json_key=key,
                                     parent_name=parent_name + key + '/',
                                     flattened_dict=flattened_dict,
                                     parent_id_fields=parent_id_fields)
            elif hasattr(value, '__iter__'):
                if all(type(x) in BASIC_TYPES for x in value):
                    # Check for an array of BASIC types
                    # TODO Make this check the schema
                    # TODO Error if the any of the values contain the seperator
                    # TODO Support doubly nested arrays
                    flattened_dict[sheet_key(sheet,
                                             parent_name + key)] = ';'.join(
                                                 map(six.text_type, value))
                else:
                    if self.rollup and parent_name == '':  # Rollup only currently possible to main sheet
                        if len(value) == 1:
                            # A single object: copy the values whose columns
                            # exist on the schema's main sheet into this row.
                            for k, v in value[0].items():
                                if self.use_titles and parent_name + key + '/0/' + k in self.schema_parser.main_sheet.titles:
                                    if type(v) in BASIC_TYPES:
                                        flattened_dict[sheet_key_title(
                                            sheet,
                                            parent_name + key + '/0/' + k)] = v
                                    else:
                                        raise ValueError(
                                            'Rolled up values must be basic types'
                                        )
                                elif not self.use_titles and parent_name + key + '/0/' + k in self.schema_parser.main_sheet:
                                    if type(v) in BASIC_TYPES:
                                        flattened_dict[sheet_key(
                                            sheet,
                                            parent_name + key + '/0/' + k)] = v
                                    else:
                                        raise ValueError(
                                            'Rolled up values must be basic types'
                                        )
                        elif len(value) > 1:
                            # Several objects cannot be rolled up: warn and
                            # put a notice in the cells that would have held
                            # the rolled up values.
                            for k in set(
                                    sum((list(x.keys()) for x in value), [])):
                                warn(
                                    'More than one value supplied for "{}". Could not provide rollup, so adding a warning to the relevant cell(s) in the spreadsheet.'
                                    .format(parent_name + key))
                                if parent_name + key + '/0/' + k in self.schema_parser.main_sheet:
                                    flattened_dict[sheet_key(
                                        sheet, parent_name + key + '/0/' + k
                                    )] = 'WARNING: More than one value supplied, consult the relevant sub-sheet for the data.'

                    sub_sheet_name = make_sub_sheet_name(parent_name, key)
                    if sub_sheet_name not in self.sub_sheets:
                        self.sub_sheets[sub_sheet_name] = Sheet(
                            name=sub_sheet_name)

                    # A distinct loop name avoids shadowing the json_dict
                    # parameter of this call.
                    for array_item in value:
                        if array_item is None:
                            # A null entry has no fields to flatten.
                            continue
                        self.parse_json_dict(
                            array_item,
                            sheet=self.sub_sheets[sub_sheet_name],
                            json_key=key,
                            parent_id_fields=parent_id_fields,
                            parent_name=parent_name + key + '/0/',
                            top_level_of_sub_sheet=True)
            else:
                raise ValueError('Unsupported type {}'.format(type(value)))

        if top:
            sheet.lines.append(flattened_dict)
Esempio n. 23
0
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure."""
    def __init__(self,
                 schema_filename=None,
                 root_schema_dict=None,
                 main_sheet_name='main',
                 rollup=False,
                 root_id='ocid',
                 use_titles=False):
        """Load the schema and set up empty sheets.

        Exactly one of ``schema_filename`` (a local path, or a URL starting
        with "http") and ``root_schema_dict`` (an already loaded schema) must
        be given.

        Raises:
            ValueError: If neither or both schema sources are supplied.
        """
        # Validate the arguments first so that a bad call fails before any
        # state is built.
        if root_schema_dict is None and schema_filename is None:
            raise ValueError(
                'One of schema_filename or root_schema_dict must be supplied')
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError(
                'Only one of schema_filename or root_schema_dict should be supplied'
            )

        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.main_sheet_name = main_sheet_name
        self.rollup = rollup
        self.root_id = root_id
        self.use_titles = use_titles
        self.title_lookup = TitleLookup()
        self.flattened = {}

        if schema_filename:
            if schema_filename.startswith('http'):
                # Imported lazily so requests is only needed for remote
                # schemas.
                import requests
                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(
                    r.text, object_pairs_hook=OrderedDict)
            else:
                with codecs.open(schema_filename,
                                 encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(
                        schema_file, object_pairs_hook=OrderedDict)
        else:
            self.root_schema_dict = root_schema_dict

    def parse(self):
        """Flatten the root schema and fill in the main sheet's columns.

        Each yielded field (or its title when ``use_titles`` is set) is
        appended to ``self.main_sheet``; untitled fields are skipped with a
        warning in title mode.
        """
        fields = self.parse_schema_dict(self.main_sheet_name, '',
                                        self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn('Field {} does not have a title, skipping.'.format(
                        field))
                else:
                    self.main_sheet.append(title)
            else:
                self.main_sheet.append(field)

    def parse_schema_dict(self,
                          parent_name,
                          parent_path,
                          schema_dict,
                          parent_id_fields=None,
                          title_lookup=None):
        """Recursively flatten one level of a JSON schema.

        This is a generator. It yields ``(field, title)`` tuples, where
        ``field`` is the property path relative to this level (suffixed with
        ``:array``, ``:number``, ``:integer`` or ``:boolean`` for those
        types) and ``title`` is the schema title, or ``None`` when a title is
        missing. Fields found inside an array of objects are added to that
        array's sub sheet and are only yielded when ``self.rollup`` is set
        and the field is listed in the array's ``rollUp`` key.

        Side effects:
          * ``self.flattened`` records a type label for each visited path.
          * ``title_lookup`` gets an entry for every titled property.
          * ``self.sub_sheets`` / ``self.sub_sheet_mapping`` gain an entry
            for each array of objects.

        Args:
            parent_name: Name used to build id columns, sub sheet mapping
                keys and warning messages.
            parent_path: Path used for ``self.flattened`` keys; a trailing
                "/" is appended when it is non-empty.
            schema_dict: The schema fragment to walk.
            parent_id_fields: Id columns inherited from enclosing objects;
                ``None`` is treated as an empty list.
            title_lookup: Lookup in which titles are registered; defaults to
                ``self.title_lookup``.

        Raises:
            ValueError: If an array's items are neither strings, arrays of
                strings, nor objects.
        """
        if parent_path:
            parent_path = parent_path + '/'
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup
        if 'properties' in schema_dict:
            # An "id" property at this level becomes an extra id column for
            # every sub sheet created below this level.
            if 'id' in schema_dict['properties']:
                id_fields = parent_id_fields + [parent_name + '/id']
            else:
                id_fields = parent_id_fields

            for property_name, property_schema_dict in schema_dict[
                    'properties'].items():
                property_type_set = get_property_type_set(property_schema_dict)

                title = property_schema_dict.get('title')
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                if 'object' in property_type_set:
                    self.flattened[parent_path + property_name] = "object"
                    # Nested object: its fields stay on the current sheet,
                    # prefixed with this property's name (and title).
                    for field, child_title in self.parse_schema_dict(
                            parent_name + '/' + property_name,
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title)):
                        yield (
                            property_name + '/' + field,
                            # TODO ambiguous use of "title"
                            (title + ':' +
                             child_title if title and child_title else None))

                elif 'array' in property_type_set:
                    self.flattened[parent_path + property_name] = "array"
                    type_set = get_property_type_set(
                        property_schema_dict['items'])
                    if 'string' in type_set:
                        self.flattened[parent_path +
                                       property_name] = "string_array"
                        yield property_name + ':array', title
                    elif 'array' in type_set:
                        self.flattened[parent_path +
                                       property_name] = "array_array"
                        if 'string' in get_property_type_set(
                                property_schema_dict['items']['items']):
                            yield property_name + ':array', title
                        else:
                            # Same exception type as before, now with enough
                            # context to locate the offending property.
                            raise ValueError(
                                'Unsupported nested array for property '
                                '"{}{}": only arrays of arrays of strings '
                                'are supported'.format(
                                    parent_path, property_name))
                    elif 'object' in type_set:
                        if title:
                            title_lookup[
                                title].property_name = property_name + '[]'
                        # Items that came from a $ref name the sub sheet
                        # after the last segment of the reference.
                        if hasattr(property_schema_dict['items'],
                                   '__reference__'):
                            sub_sheet_name = property_schema_dict[
                                'items'].__reference__['$ref'].split('/')[-1]
                        else:
                            sub_sheet_name = property_name

                        self.sub_sheet_mapping[parent_name + '/' +
                                               property_name] = sub_sheet_name

                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(
                                root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        sub_sheet.title_lookup = title_lookup.get(title)

                        for field in id_fields:
                            sub_sheet.add_field(field + ':' + property_name,
                                                id_field=True)
                        fields = self.parse_schema_dict(
                            parent_name + '/' + property_name + '[]',
                            parent_path + property_name,
                            property_schema_dict['items'],
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title))

                        rolledUp = set()

                        for field, child_title in fields:
                            if self.use_titles:
                                if not child_title:
                                    warn(
                                        'Field {} does not have a title, skipping.'
                                        .format(field))
                                else:
                                    sub_sheet.add_field(child_title)
                            else:
                                sub_sheet.add_field(field)
                            # Rolled-up fields are additionally yielded so the
                            # caller can place them on the parent sheet.
                            if self.rollup and 'rollUp' in property_schema_dict and field in property_schema_dict[
                                    'rollUp']:
                                rolledUp.add(field)
                                yield property_name + '[]/' + field, (
                                    title + ':' + child_title
                                    if title and child_title else None)

                        # Check that all items in rollUp are in the schema
                        if self.rollup and 'rollUp' in property_schema_dict:
                            missedRollUp = set(
                                property_schema_dict['rollUp']) - rolledUp
                            if missedRollUp:
                                warn('{} in rollUp but not in schema'.format(
                                    ', '.join(missedRollUp)))
                    else:
                        raise ValueError(
                            'Unknown type_set: {} for the items of array '
                            'property "{}{}"'.format(
                                repr(type_set), parent_path, property_name))
                elif 'string' in property_type_set:
                    self.flattened[parent_path + property_name] = "string"
                    yield property_name, title
                elif 'number' in property_type_set:
                    self.flattened[parent_path + property_name] = "number"
                    yield property_name + ':number', title
                elif 'integer' in property_type_set:
                    self.flattened[parent_path + property_name] = "integer"
                    yield property_name + ':integer', title
                elif 'boolean' in property_type_set:
                    self.flattened[parent_path + property_name] = "boolean"
                    yield property_name + ':boolean', title
                else:
                    warn(
                        'Unrecognised types {} for property "{}" with context "{}",'
                        'so this property has been ignored.'.format(
                            repr(property_type_set), property_name,
                            parent_name))
        else:
            warn('Skipping field "{}", because it has no properties.'.format(
                parent_name))
Esempio n. 24
0
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure.

    Running :meth:`parse` fills in:

    * ``main_sheet`` - the columns of the main sheet;
    * ``sub_sheets`` - sub-sheet name -> ``Sheet`` for every array of objects;
    * ``flattened`` - field path -> type name (``"string"``, ``"number"``,
      ``"integer"``, ``"boolean"``, ``"object"``, ``"array"``,
      ``"string_array"``, ``"number_array"`` or ``"array_array"``).
    """

    def __init__(self, schema_filename=None, root_schema_dict=None, rollup=False, root_id=None, use_titles=False,
                 disable_local_refs=False, truncation_length=3, exclude_deprecated_fields=False):
        """Store the parsing options and load the root schema.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        supplied.

        Args:
            schema_filename: Path of a schema file, or a location starting
                with ``http`` which is downloaded with ``requests``.
            root_schema_dict: An already loaded schema; used as-is.
            rollup: When true, fields named in a property's ``rollUp`` list
                are also yielded to the parent sheet.
            root_id: Passed as ``root_id`` to every sub ``Sheet`` created.
            use_titles: Use schema titles instead of field paths as column
                names.
            disable_local_refs: Load a local schema file with
                ``JsonLoaderLocalRefsDisabled`` instead of giving jsonref a
                ``file:`` base URI.
            truncation_length: Forwarded to ``make_sub_sheet_name``.
            exclude_deprecated_fields: Skip properties whose schema has a
                truthy ``deprecated`` entry.

        Raises:
            ValueError: If neither or both of ``schema_filename`` and
                ``root_schema_dict`` are supplied.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.rollup = rollup
        self.root_id = root_id
        self.use_titles = use_titles
        self.truncation_length = truncation_length
        self.title_lookup = TitleLookup()
        self.flattened = {}
        self.exclude_deprecated_fields = exclude_deprecated_fields

        if root_schema_dict is None and schema_filename is None:
            raise ValueError('One of schema_filename or root_schema_dict must be supplied')
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError('Only one of schema_filename or root_schema_dict should be supplied')
        if schema_filename:
            if schema_filename.startswith('http'):
                # Imported lazily so that requests is only needed for remote schemas.
                import requests
                # NOTE(review): no timeout and no HTTP status check here - confirm
                # whether callers rely on that before tightening it.
                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(r.text, object_pairs_hook=OrderedDict)
            elif disable_local_refs:
                with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict,
                                                         loader=JsonLoaderLocalRefsDisabled())
            else:
                # Give jsonref a file: base URI built from the schema's own location.
                if sys.version_info[:2] > (3, 0):
                    base_uri = pathlib.Path(os.path.realpath(schema_filename)).as_uri()
                else:
                    base_uri = urlparse.urljoin('file:', urllib.pathname2url(os.path.abspath(schema_filename)))
                with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict,
                                                         base_uri=base_uri)
        else:
            self.root_schema_dict = root_schema_dict

    def parse(self):
        """Fill ``main_sheet`` (and, as a side effect, the sub sheets).

        With ``use_titles`` each column is the field's title and the
        ``field -> title`` pair is recorded in ``main_sheet.titles``; fields
        without a title are skipped with a warning. Otherwise each column is
        the field path.
        """
        fields = self.parse_schema_dict('', self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn('Field {} does not have a title, skipping.'.format(field))
                else:
                    self.main_sheet.append(title)
                    self.main_sheet.titles[field] = title
            else:
                self.main_sheet.append(field)

    def parse_schema_dict(self, parent_path, schema_dict, parent_id_fields=None, title_lookup=None, parent_title=''):
        """Walk ``schema_dict`` and yield ``(field_path, title)`` pairs.

        The yielded pairs are the columns that belong on the caller's sheet:
        scalar properties, arrays of strings/numbers, the fields of nested
        objects (prefixed with ``property_name/``) and, when rollup is
        enabled, the rolled-up fields of arrays of objects. Fields of arrays
        of objects are added to a sub sheet in ``self.sub_sheets`` as a side
        effect, and every visited property is recorded in ``self.flattened``.

        This is a generator: none of the side effects happen until it is
        iterated.

        Args:
            parent_path: Path of the schema being walked, without a trailing
                slash (``''`` for the root).
            schema_dict: The (sub-)schema to walk.
            parent_id_fields: ``id`` columns inherited from enclosing objects.
            title_lookup: Title lookup for this level; defaults to
                ``self.title_lookup``.
            parent_title: Colon-terminated title prefix of the enclosing
                objects, or ``None`` once an ancestor had no title.

        Raises:
            ValueError: If an array's item types are not supported.
        """
        if parent_path:
            parent_path = parent_path + '/'
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup

        if 'type' in schema_dict and schema_dict['type'] == 'array' \
                and 'items' in schema_dict and 'oneOf' in schema_dict['items']:
            # Array whose items are a oneOf: walk every object alternative and
            # pass its fields through unchanged.
            # NOTE(review): parent_path already ends with '/' here and the
            # recursive call appends another one - confirm this is intended.
            for oneOf in schema_dict['items']['oneOf']:
                if 'type' in oneOf and oneOf['type'] == 'object':
                    for field, child_title in self.parse_schema_dict(
                            parent_path,
                            oneOf,
                            parent_id_fields=parent_id_fields,
                            title_lookup=title_lookup,
                            parent_title=parent_title):
                        yield field, child_title

        elif 'properties' in schema_dict:
            # An object with an "id" contributes one more id column to every
            # sub sheet created below it.
            if 'id' in schema_dict['properties']:
                if self.use_titles:
                    id_prefix = parent_title if parent_title is not None else parent_path
                    id_title = schema_dict['properties']['id'].get('title') or 'id'
                    id_fields = parent_id_fields + [id_prefix + id_title]
                else:
                    id_fields = parent_id_fields + [parent_path + 'id']
            else:
                id_fields = parent_id_fields

            for property_name, property_schema_dict in schema_dict['properties'].items():
                if self.exclude_deprecated_fields and property_schema_dict.get('deprecated'):
                    continue

                property_type_set = get_property_type_set(property_schema_dict)

                title = property_schema_dict.get('title')
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                # Title prefix handed to children; None as soon as a title is missing.
                child_parent_title = parent_title + title + ':' if parent_title is not None and title else None
                # Key used in self.flattened for everything except plain objects:
                # array indexes ("/0/") are dropped from the path.
                flattened_key = parent_path.replace('/0/', '/') + property_name

                if 'object' in property_type_set:
                    # NOTE(review): unlike the other branches this key keeps
                    # any "/0/" segments - confirm whether that is intended.
                    self.flattened[parent_path + property_name] = "object"
                    for field, child_title in self.parse_schema_dict(
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=child_parent_title):
                        yield (
                            property_name + '/' + field,
                            # TODO ambiguous use of "title"
                            (title + ':' + child_title if title and child_title else None)
                        )

                elif 'array' in property_type_set:
                    self.flattened[flattened_key] = "array"
                    type_set = get_property_type_set(property_schema_dict['items'])
                    if 'string' in type_set or not type_set:
                        self.flattened[flattened_key] = "string_array"
                        yield property_name, title
                    elif 'number' in type_set:
                        self.flattened[flattened_key] = "number_array"
                        yield property_name, title
                    elif 'array' in type_set:
                        self.flattened[flattened_key] = "array_array"
                        nested_type_set = get_property_type_set(property_schema_dict['items']['items'])
                        if 'string' in nested_type_set or 'number' in nested_type_set:
                            yield property_name, title
                        else:
                            raise ValueError(
                                'Unsupported item types {} in nested array "{}", '
                                'only strings and numbers are supported.'.format(
                                    repr(nested_type_set), flattened_key))
                    elif 'object' in type_set:
                        if title:
                            title_lookup[title].property_name = property_name

                        sub_sheet_name = make_sub_sheet_name(parent_path, property_name,
                                                             truncation_length=self.truncation_length)

                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        child_lookup = title_lookup.get(title)
                        sub_sheet.title_lookup = child_lookup

                        # The id columns of all enclosing objects come first.
                        for field in id_fields:
                            sub_sheet.add_field(field, id_field=True)
                            sub_sheet.titles[title_lookup.lookup_header(field)] = field
                        fields = self.parse_schema_dict(
                            parent_path + property_name + '/0',
                            property_schema_dict['items'],
                            parent_id_fields=id_fields,
                            title_lookup=child_lookup,
                            parent_title=child_parent_title)

                        # Names requested for roll-up; empty when rollup is off
                        # or the property does not declare any.
                        if self.rollup and 'rollUp' in property_schema_dict:
                            rollup_requested = property_schema_dict['rollUp']
                        else:
                            rollup_requested = ()
                        rolledUp = set()

                        for field, child_title in fields:
                            full_path = parent_path + property_name + '/0/' + field
                            if self.use_titles:
                                if not child_title or parent_title is None:
                                    warn('Field {}{}/0/{} is missing a title, skipping.'.format(parent_path, property_name, field))
                                elif not title:
                                    warn('Field {}{} does not have a title, skipping it and all its children.'.format(parent_path, property_name))
                                else:
                                    # This code only works for arrays that are at 0 or 1 layer of nesting
                                    full_title = parent_title + title + ':' + child_title
                                    sub_sheet.add_field(full_title)
                                    sub_sheet.titles[full_path] = full_title
                            else:
                                sub_sheet.add_field(full_path)
                            if field in rollup_requested:
                                # Rolled-up fields are also offered to the parent sheet.
                                rolledUp.add(field)
                                yield property_name + '/0/' + field, (title + ':' + child_title if title and child_title else None)

                        # Check that all items in rollUp are in the schema
                        missedRollUp = set(rollup_requested) - rolledUp
                        if missedRollUp:
                            warn('{} in rollUp but not in schema'.format(', '.join(missedRollUp)))
                    else:
                        raise ValueError('Unknown type_set: {}, did you forget to explicitly set the "type" key on "items"?'.format(type_set))
                elif 'string' in property_type_set or not property_type_set:
                    # A property without any type is treated as a string.
                    self.flattened[flattened_key] = "string"
                    yield property_name, title
                elif 'number' in property_type_set:
                    self.flattened[flattened_key] = "number"
                    yield property_name, title
                elif 'integer' in property_type_set:
                    self.flattened[flattened_key] = "integer"
                    yield property_name, title
                elif 'boolean' in property_type_set:
                    self.flattened[flattened_key] = "boolean"
                    yield property_name, title
                else:
                    warn('Unrecognised types {} for property "{}" with context "{}", '
                         'so this property has been ignored.'.format(
                             repr(property_type_set),
                             property_name,
                             parent_path))
        else:
            warn('Skipping field "{}", because it has no properties.'.format(parent_path))
Esempio n. 25
0
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure.

    Running :meth:`parse` fills in:

    * ``main_sheet`` - the columns of the main sheet;
    * ``sub_sheets`` - sub-sheet name -> ``Sheet`` for every array of objects;
    * ``sub_sheet_titles`` - ``(parent_path, property_name)`` -> title-based
      sub-sheet name (only populated when titles are in use);
    * ``rollup`` - the full paths of all fields that were rolled up;
    * ``flattened`` - field path -> type name (``"string"``, ``"date"``,
      ``"number"``, ``"integer"``, ``"boolean"``, ``"object"``, ``"array"``,
      ``"string_array"``, ``"number_array"`` or ``"array_array"``).
    """

    def __init__(
        self,
        schema_filename=None,
        root_schema_dict=None,
        rollup=False,
        root_id=None,
        use_titles=False,
        disable_local_refs=False,
        truncation_length=3,
        exclude_deprecated_fields=False,
    ):
        """Store the parsing options and load the root schema.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` must be
        supplied.

        Args:
            schema_filename: Path of a schema file, or a location starting
                with ``http`` which is downloaded with ``requests``.
            root_schema_dict: An already loaded schema; used as-is.
            rollup: Stored as ``do_rollup``. When true, fields named in a
                property's ``rollUp`` list are also yielded to the parent
                sheet and recorded in ``self.rollup``.
            root_id: Passed as ``root_id`` to every sub ``Sheet`` created.
            use_titles: Use schema titles instead of field paths as column
                and sub-sheet names.
            disable_local_refs: Load a local schema file with
                ``JsonLoaderLocalRefsDisabled`` instead of giving jsonref a
                ``file:`` base URI.
            truncation_length: Forwarded to ``make_sub_sheet_name``.
            exclude_deprecated_fields: Skip properties marked as deprecated.

        Raises:
            ValueError: If neither or both of ``schema_filename`` and
                ``root_schema_dict`` are supplied.
        """
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.do_rollup = rollup
        self.rollup = set()
        self.root_id = root_id
        self.use_titles = use_titles
        self.sub_sheet_titles = {}
        self.truncation_length = truncation_length
        self.title_lookup = TitleLookup()
        self.flattened = {}
        self.exclude_deprecated_fields = exclude_deprecated_fields

        if root_schema_dict is None and schema_filename is None:
            raise ValueError(
                _("One of schema_filename or root_schema_dict must be supplied"))
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError(
                _("Only one of schema_filename or root_schema_dict should be supplied"))
        if schema_filename:
            if schema_filename.startswith("http"):
                # Imported lazily so that requests is only needed for remote schemas.
                import requests

                # NOTE(review): no timeout and no HTTP status check here - confirm
                # whether callers rely on that before tightening it.
                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(
                    r.text, object_pairs_hook=OrderedDict)
            elif disable_local_refs:
                with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(
                        schema_file,
                        object_pairs_hook=OrderedDict,
                        loader=JsonLoaderLocalRefsDisabled(),
                    )
            else:
                # Give jsonref a file: base URI built from the schema's own location.
                if sys.version_info[:2] > (3, 0):
                    base_uri = pathlib.Path(
                        os.path.realpath(schema_filename)).as_uri()
                else:
                    base_uri = urlparse.urljoin(
                        "file:",
                        urllib.pathname2url(os.path.abspath(schema_filename)),
                    )
                with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(
                        schema_file,
                        object_pairs_hook=OrderedDict,
                        base_uri=base_uri,
                    )
        else:
            self.root_schema_dict = root_schema_dict

    def parse(self):
        """Fill ``main_sheet`` (and, as a side effect, the sub sheets).

        With ``use_titles`` each column is the field's title and the
        ``field -> title`` pair is recorded in ``main_sheet.titles``; fields
        without a title are skipped with a warning. Otherwise each column is
        the field path.
        """
        fields = self.parse_schema_dict("", self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn(
                        _("Field {} does not have a title, skipping.").format(
                            field))
                else:
                    self.main_sheet.append(title)
                    self.main_sheet.titles[field] = title
            else:
                self.main_sheet.append(field)

    def parse_schema_dict(
        self,
        parent_path,
        schema_dict,
        parent_id_fields=None,
        title_lookup=None,
        parent_title="",
    ):
        """Walk ``schema_dict`` and yield ``(field_path, title)`` pairs.

        The yielded pairs are the columns that belong on the caller's sheet:
        scalar properties, arrays of strings/numbers, the fields of nested
        objects (prefixed with ``property_name/``) and, when rollup is
        enabled, the rolled-up fields of arrays of objects. Fields of arrays
        of objects are added to a sub sheet in ``self.sub_sheets`` as a side
        effect, and every visited property is recorded in ``self.flattened``.

        This is a generator: none of the side effects happen until it is
        iterated.

        Args:
            parent_path: Path of the schema being walked, without a trailing
                slash (``""`` for the root).
            schema_dict: The (sub-)schema to walk.
            parent_id_fields: ``id`` columns inherited from enclosing objects.
            title_lookup: Title lookup for this level; defaults to
                ``self.title_lookup``.
            parent_title: Colon-terminated title prefix of the enclosing
                objects, or ``None`` once an ancestor had no title.

        Raises:
            ValueError: If an array's item types are not supported.
        """
        if parent_path:
            parent_path = parent_path + "/"
        parent_id_fields = parent_id_fields or []
        title_lookup = self.title_lookup if title_lookup is None else title_lookup

        if ("type" in schema_dict and schema_dict["type"] == "array"
                and "items" in schema_dict
                and "oneOf" in schema_dict["items"]):
            # Array whose items are a oneOf: walk every object alternative and
            # pass its fields through unchanged.
            # NOTE(review): parent_path already ends with "/" here and the
            # recursive call appends another one - confirm this is intended.
            for oneOf in schema_dict["items"]["oneOf"]:
                if "type" in oneOf and oneOf["type"] == "object":
                    for field, child_title in self.parse_schema_dict(
                            parent_path,
                            oneOf,
                            parent_id_fields=parent_id_fields,
                            title_lookup=title_lookup,
                            parent_title=parent_title,
                    ):
                        yield (field, child_title)

        elif "properties" in schema_dict:
            # An object with an "id" contributes one more id column to every
            # sub sheet created below it.
            if "id" in schema_dict["properties"]:
                if self.use_titles:
                    id_prefix = parent_title if parent_title is not None else parent_path
                    id_title = schema_dict["properties"]["id"].get("title") or "id"
                    id_fields = parent_id_fields + [id_prefix + id_title]
                else:
                    id_fields = parent_id_fields + [parent_path + "id"]
            else:
                id_fields = parent_id_fields

            for property_name, property_schema_dict in schema_dict["properties"].items():
                # A property is deprecated when either the property itself or
                # the reference object it was resolved from says so.
                if self.exclude_deprecated_fields and (
                        property_schema_dict.get("deprecated")
                        or (hasattr(property_schema_dict, "__reference__")
                            and property_schema_dict.__reference__.get("deprecated"))):
                    continue

                property_type_set = get_property_type_set(property_schema_dict)

                # A title on the reference object wins over the property's own title.
                if (hasattr(property_schema_dict, "__reference__")
                        and "title" in property_schema_dict.__reference__):
                    title = property_schema_dict.__reference__["title"]
                else:
                    title = property_schema_dict.get("title")
                if title:
                    title_lookup[title] = TitleLookup()
                    title_lookup[title].property_name = property_name

                # Title prefix handed to children; None as soon as a title is missing.
                child_parent_title = (parent_title + title + ":"
                                      if parent_title is not None and title else None)
                # Key used in self.flattened for everything except plain objects:
                # array indexes ("/0/") are dropped from the path.
                flattened_key = parent_path.replace("/0/", "/") + property_name

                if "object" in property_type_set:
                    # NOTE(review): unlike the other branches this key keeps
                    # any "/0/" segments - confirm whether that is intended.
                    self.flattened[parent_path + property_name] = "object"
                    for field, child_title in self.parse_schema_dict(
                            parent_path + property_name,
                            property_schema_dict,
                            parent_id_fields=id_fields,
                            title_lookup=title_lookup.get(title),
                            parent_title=child_parent_title,
                    ):
                        yield (
                            property_name + "/" + field,
                            # TODO ambiguous use of "title"
                            (title + ":" + child_title
                             if title and child_title else None),
                        )

                elif "array" in property_type_set:
                    self.flattened[flattened_key] = "array"
                    type_set = get_property_type_set(
                        property_schema_dict["items"])
                    if "string" in type_set or not type_set:
                        self.flattened[flattened_key] = "string_array"
                        yield property_name, title
                    elif "number" in type_set:
                        self.flattened[flattened_key] = "number_array"
                        yield property_name, title
                    elif "array" in type_set:
                        self.flattened[flattened_key] = "array_array"
                        nested_type_set = get_property_type_set(
                            property_schema_dict["items"]["items"])
                        if "string" in nested_type_set or "number" in nested_type_set:
                            yield property_name, title
                        else:
                            raise ValueError(
                                _('Unsupported item types {} in nested array "{}", '
                                  "only strings and numbers are supported."
                                  ).format(repr(nested_type_set), flattened_key))
                    elif "object" in type_set:
                        if title:
                            title_lookup[title].property_name = property_name

                        if self.use_titles and parent_title is not None:
                            sub_sheet_name = make_sub_sheet_name(
                                parent_title,
                                title or property_name,
                                truncation_length=self.truncation_length,
                                path_separator=":",
                            )
                            self.sub_sheet_titles[(parent_path, property_name)] = sub_sheet_name
                        else:
                            sub_sheet_name = make_sub_sheet_name(
                                parent_path,
                                property_name,
                                truncation_length=self.truncation_length,
                            )

                        if sub_sheet_name not in self.sub_sheets:
                            self.sub_sheets[sub_sheet_name] = Sheet(
                                root_id=self.root_id, name=sub_sheet_name)
                        sub_sheet = self.sub_sheets[sub_sheet_name]
                        child_lookup = title_lookup.get(title)
                        sub_sheet.title_lookup = child_lookup

                        # The id columns of all enclosing objects come first.
                        for field in id_fields:
                            sub_sheet.add_field(field, id_field=True)
                            sub_sheet.titles[title_lookup.lookup_header(field)] = field
                        fields = self.parse_schema_dict(
                            parent_path + property_name + "/0",
                            property_schema_dict["items"],
                            parent_id_fields=id_fields,
                            title_lookup=child_lookup,
                            parent_title=child_parent_title,
                        )

                        # Names requested for roll-up; empty when rollup is off
                        # or the property does not declare any.
                        if self.do_rollup and "rollUp" in property_schema_dict:
                            rollup_requested = property_schema_dict["rollUp"]
                        else:
                            rollup_requested = ()
                        rollup_fields = set()

                        for field, child_title in fields:
                            full_path = parent_path + property_name + "/0/" + field
                            if self.use_titles:
                                if not child_title or parent_title is None:
                                    warn(
                                        _("Field {}{}/0/{} is missing a title, skipping."
                                          ).format(parent_path, property_name, field))
                                elif not title:
                                    warn(
                                        _("Field {}{} does not have a title, skipping it and all its children."
                                          ).format(parent_path, property_name))
                                else:
                                    # This code only works for arrays that are at 0 or 1 layer of nesting
                                    full_title = parent_title + title + ":" + child_title
                                    sub_sheet.add_field(full_title)
                                    sub_sheet.titles[full_path] = full_title
                            else:
                                sub_sheet.add_field(full_path)
                            if field in rollup_requested:
                                # Rolled-up fields are also offered to the parent sheet.
                                rollup_fields.add(field)
                                self.rollup.add(full_path)
                                yield property_name + "/0/" + field, (
                                    title + ":" + child_title
                                    if title and child_title else None)

                        # Check that all items in rollUp are in the schema
                        missedRollUp = set(rollup_requested) - rollup_fields
                        if missedRollUp:
                            warn(_("{} in rollUp but not in schema").format(
                                ", ".join(missedRollUp)))

                    else:
                        # The text inside _() is a translation lookup key, so it
                        # is left untouched (typo included).
                        raise ValueError(
                            _('Unknown type_set: {}, did you forget to explicity set the "type" key on "items"?'
                              ).format(type_set))
                elif "string" in property_type_set or not property_type_set:
                    # We only check for date here, because its the only format
                    # for which we need to specially transform the input
                    if property_schema_dict.get("format") == "date":
                        self.flattened[flattened_key] = "date"
                    else:
                        self.flattened[flattened_key] = "string"
                    yield property_name, title
                elif "number" in property_type_set:
                    self.flattened[flattened_key] = "number"
                    yield property_name, title
                elif "integer" in property_type_set:
                    self.flattened[flattened_key] = "integer"
                    yield property_name, title
                elif "boolean" in property_type_set:
                    self.flattened[flattened_key] = "boolean"
                    yield property_name, title
                else:
                    # The text inside _() is a translation lookup key, so it is
                    # left untouched.
                    warn(
                        _('Unrecognised types {} for property "{}" with context "{}",'
                          "so this property has been ignored.").format(
                              repr(property_type_set), property_name,
                              parent_path))

        else:
            warn(
                _('Skipping field "{}", because it has no properties.').format(
                    parent_path))
Esempio n. 26
0
class SchemaParser(object):
    """Parse the fields of a JSON schema into a flattened structure.

    After :meth:`parse` has run, ``main_sheet`` holds the flattened columns of
    the root schema, ``sub_sheets`` maps a sub-sheet name to the ``Sheet``
    built for each array of objects, and ``sub_sheet_mapping`` maps the path
    of each such array to the name of the sub-sheet it was written to.
    """

    def __init__(
        self,
        schema_filename=None,
        root_schema_dict=None,
        main_sheet_name="main",
        rollup=False,
        root_id="ocid",
        use_titles=False,
    ):
        """Store the parser options and load the root schema.

        Exactly one of ``schema_filename`` and ``root_schema_dict`` is
        expected.

        :param schema_filename: path of a schema file, or a URL when the value
            starts with ``"http"``; loaded with ``jsonref``.
        :param root_schema_dict: an already loaded schema, used as-is.
        :param main_sheet_name: parent name used for the root schema's fields.
        :param rollup: when true, fields listed under a property's ``rollUp``
            key are also yielded to the parent sheet.
        :param root_id: passed as ``root_id`` to every sub-sheet created.
        :param use_titles: when true, columns are named after schema titles
            instead of field paths.
        :raises ValueError: if neither or both schema sources are given.
        """
        # Validate the arguments before building any state, so that a bad
        # call fails immediately and leaves nothing half-initialised.
        if root_schema_dict is None and schema_filename is None:
            raise ValueError("One of schema_filename or root_schema_dict must be supplied")
        if root_schema_dict is not None and schema_filename is not None:
            raise ValueError("Only one of schema_filename or root_schema_dict should be supplied")

        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        self.main_sheet_name = main_sheet_name
        self.rollup = rollup
        self.root_id = root_id
        self.use_titles = use_titles

        if schema_filename:
            if schema_filename.startswith("http"):
                # Imported lazily so that requests is only needed for URLs.
                import requests

                # NOTE: the request is made without a timeout and the HTTP
                # status is not checked before the body is parsed.
                r = requests.get(schema_filename)
                self.root_schema_dict = jsonref.loads(r.text, object_pairs_hook=OrderedDict)
            else:
                with codecs.open(schema_filename, encoding="utf-8") as schema_file:
                    self.root_schema_dict = jsonref.load(schema_file, object_pairs_hook=OrderedDict)
        else:
            self.root_schema_dict = root_schema_dict

    @staticmethod
    def _join_titles(title, child_title):
        """Return ``"title:child_title"``, or None if either part is missing."""
        if title and child_title:
            return title + ":" + child_title
        return None

    def parse(self):
        """Flatten the root schema into ``main_sheet``.

        With ``use_titles`` set, each field is appended under its title and
        fields without a title are skipped with a warning; otherwise the field
        path is appended. Whenever a title exists it is also recorded in
        ``main_sheet.titles`` as ``title -> field``.
        """
        fields = self.parse_schema_dict(self.main_sheet_name, self.root_schema_dict)
        for field, title in fields:
            if self.use_titles:
                if not title:
                    warn("Field {} does not have a title, skipping.".format(field))
                else:
                    self.main_sheet.append(title)
            else:
                self.main_sheet.append(field)
            if title:
                self.main_sheet.titles[title] = field

    def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
        """Yield ``(field, title)`` pairs for the properties of ``schema_dict``.

        Field names are built as follows:

        * string properties yield the bare property name;
        * number, integer and boolean properties get a ``:number``,
          ``:integer`` or ``:boolean`` suffix;
        * arrays of strings, and arrays of arrays of strings, get an
          ``:array`` suffix;
        * object properties are recursed into and yield
          ``property/child_field``;
        * arrays of objects are written to a sub-sheet, and only fields rolled
          up to the parent are yielded (as ``property[]/child_field``).

        Properties with unrecognised types, and schemas without a
        ``properties`` key, are skipped with a warning.

        :param parent_name: path of the schema being parsed, used to build id
            field names and sub-sheet mapping keys.
        :param schema_dict: the (sub)schema to flatten.
        :param parent_id_fields: id fields inherited from enclosing schemas.
        :raises ValueError: if an array property has item types that cannot be
            flattened.
        """
        parent_id_fields = parent_id_fields or []
        if "properties" not in schema_dict:
            warn('Skipping field "{}", because it has no properties.'.format(parent_name))
            return

        properties = schema_dict["properties"]
        if "id" in properties:
            id_fields = parent_id_fields + [parent_name + "/id"]
        else:
            id_fields = parent_id_fields

        for property_name, property_schema_dict in properties.items():
            property_type_set = get_property_type_set(property_schema_dict)

            title = property_schema_dict.get("title")

            if "object" in property_type_set:
                child_fields = self.parse_schema_dict(
                    parent_name + "/" + property_name, property_schema_dict, parent_id_fields=id_fields
                )
                for field, child_title in child_fields:
                    # TODO ambiguous use of "title"
                    yield property_name + "/" + field, self._join_titles(title, child_title)

            elif "array" in property_type_set:
                type_set = get_property_type_set(property_schema_dict["items"])
                if "string" in type_set:
                    yield property_name + ":array", title
                elif "array" in type_set:
                    nested_type_set = get_property_type_set(property_schema_dict["items"]["items"])
                    if "string" in nested_type_set:
                        yield property_name + ":array", title
                    else:
                        raise ValueError(
                            'Unsupported item types {} for nested array property "{}" with context "{}"; '
                            "only arrays of arrays of strings are supported.".format(
                                repr(nested_type_set), property_name, parent_name
                            )
                        )
                elif "object" in type_set:
                    rolled_up_fields = self._parse_object_array(
                        parent_name, property_name, property_schema_dict, title, id_fields
                    )
                    for rolled_up in rolled_up_fields:
                        yield rolled_up
                else:
                    raise ValueError(
                        'Unknown type_set: {}, did you forget to explicitly set the "type" key on "items"?'.format(
                            type_set
                        )
                    )
            elif "string" in property_type_set:
                yield property_name, title
            elif "number" in property_type_set:
                yield property_name + ":number", title
            elif "integer" in property_type_set:
                yield property_name + ":integer", title
            elif "boolean" in property_type_set:
                yield property_name + ":boolean", title
            else:
                warn(
                    'Unrecognised types {} for property "{}" with context "{}",'
                    "so this property has been ignored.".format(repr(property_type_set), property_name, parent_name)
                )

    def _parse_object_array(self, parent_name, property_name, property_schema_dict, title, id_fields):
        """Write an array-of-objects property to its sub-sheet.

        Yields ``(field, title)`` pairs only for the fields that are rolled up
        to the parent sheet.
        """
        items_schema = property_schema_dict["items"]

        # When the items schema exposes the reference it was resolved from,
        # name the sub-sheet after the last segment of that "$ref", so that
        # every use of the same definition lands on the same sheet.
        if hasattr(items_schema, "__reference__"):
            sub_sheet_name = items_schema.__reference__["$ref"].split("/")[-1]
        else:
            sub_sheet_name = property_name

        self.sub_sheet_mapping[parent_name + "/" + property_name] = sub_sheet_name

        if sub_sheet_name not in self.sub_sheets:
            self.sub_sheets[sub_sheet_name] = Sheet(root_id=self.root_id, name=sub_sheet_name)
        sub_sheet = self.sub_sheets[sub_sheet_name]

        for id_field in id_fields:
            sub_sheet.add_field(id_field + ":" + property_name, id_field=True)

        fields = self.parse_schema_dict(
            parent_name + "/" + property_name + "[]",
            items_schema,
            parent_id_fields=id_fields,
        )

        wants_rollup = bool(self.rollup and "rollUp" in property_schema_dict)
        rolled_up = set()

        for field, child_title in fields:
            if self.use_titles:
                if not child_title:
                    warn("Field {} does not have a title, skipping.".format(field))
                else:
                    sub_sheet.add_field(child_title)
            else:
                sub_sheet.add_field(field)
            if child_title:
                sub_sheet.titles[child_title] = field
            if wants_rollup and field in property_schema_dict["rollUp"]:
                rolled_up.add(field)
                yield property_name + "[]/" + field, self._join_titles(title, child_title)

        # Check that all items in rollUp are in the schema
        if wants_rollup:
            missed_roll_up = set(property_schema_dict["rollUp"]) - rolled_up
            if missed_roll_up:
                # Sorted so that the warning text is deterministic.
                warn("{} in rollUp but not in schema".format(", ".join(sorted(missed_roll_up))))