Code Example #1
def test_flatten_should_split_with_child(spec, releases, options):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    for _ in spec.process_items(releases):
        pass
    flattener = Flattener(options, spec.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)

    assert "tenders_items" in all_rows
    assert "tenders_items_addit" not in all_rows
    tenders = all_rows["tenders"]

    for tender, release in zip(tenders, releases):
        if release.get("tender", {}).get("items"):
            assert "/tender/items/0/id" not in tender
            assert "/tender/items/0/description" not in tender
            assert "/tender/items/1/id" not in tender
            assert "/tender/items/1/description" not in tender

    items = all_rows["tenders_items"]
    for item in items:
        assert "/tender/items/id" in item
        assert "/tender/items/description" in item
Code Example #2
def test_flatten_string_arrays(spec_analyzed, releases):
    options = FlattenOptions(
        **{
            "selection": {
                "tenders": {
                    "split": True
                },
                "parties": {
                    "split": True
                }
            },
            "exclude": ["tenders_items", "parties_ids", "tenders_tenderers"],
        })
    flattener = Flattener(options, spec_analyzed.tables)
    fields = ["submissionMethod", "roles"]
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            counters = defaultdict(int)
            for row in reversed(rows):
                for key, value in row.items():
                    if "/" in key:
                        key = key.replace("parties",
                                          f"parties/{counters['parties']}")
                        actual = resolve_pointer(releases[count], key)
                        if any(key.endswith(field) for field in fields):
                            actual = JOINABLE_SEPARATOR.join(actual)
                        assert actual == value
                counters[name] += 1
Code Example #3
def test_flatten_fields_compare(spec_analyzed, releases):
    options = FlattenOptions(**{
        "selection": {
            "tenders": {
                "split": True
            },
            "parties": {
                "split": False
            }
        },
    })
    flattener = Flattener(options, spec_analyzed.tables)
    fields = ["submissionMethod", "roles"]
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            counters = defaultdict(int)
            for row in reversed(rows):
                for key, value in row.items():
                    if "/" in key:
                        if "parties" in key:
                            key = key.replace(
                                "parties", f"parties/{counters['parties']}")
                        expected = resolve_pointer(releases[count], key)
                        if any(key.endswith(field) for field in fields):
                            expected = JOINABLE_SEPARATOR.join(expected)
                        assert expected == value
                counters[name] += 1
Code Example #4
def test_csv_writer(spec_analyzed, releases, flatten_options, tmpdir, schema):
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    flatten_options.selection["parties"].split = True
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)
    with CSVWriter(workdir, tables, flatten_options, schema) as writer:
        # Writing CSV files
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)

    # Reading CSV files
    counter = {}
    for _count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            if name not in counter:
                counter[name] = 0
            for row in rows:
                str_row = {k: str(v) for (k, v) in row.items()}
                file = name + ".csv"
                path = workdir / file
                with open(path, newline="", encoding="utf-8") as csv_file:
                    csv_reader = csv.DictReader(csv_file)
                    for num, line in enumerate(csv_reader):
                        if num == counter[name]:
                            clean_line = {
                                k: v
                                for (k, v) in line.items() if v != ""
                            }
                            assert dict(clean_line) == str_row
                counter[name] += 1
Code Example #5
def test_xlsx_writer(spec_analyzed, releases, flatten_options, tmpdir, schema):
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)
    with XlsxWriter(workdir, tables, flatten_options, schema) as writer:
        # Writing XLSX file
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)

    # Reading XLSX files
    counter = {}
    path = workdir / "result.xlsx"
    xlsx_reader = openpyxl.load_workbook(path)

    for _count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            if name not in counter:
                counter[name] = 2
            sheet = xlsx_reader[name]
            headers = {cell.column_letter: cell.value for cell in sheet[1]}
            for row in rows:
                line = {
                    headers[cell.column_letter]: cell.value
                    for cell in sheet[counter[name]]
                }
                row = row.as_dict()
                assert not set(row.keys()).difference(set(line.keys()))
                for k, v in row.items():
                    assert str(v) == str(line[k])
                counter[name] += 1
Code Example #6
def test_flatten_with_counters(spec, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    releases[0]["tender"]["items"][0]["additionalClassifications"] = (
        releases[0]["tender"]["items"][0]["additionalClassifications"] * 6)
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{
        "selection": {
            "tenders": {
                "split": True
            }
        },
        "count": True
    })
    flattener = Flattener(options, spec.tables)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            if name == "tenders":
                for row in rows:
                    items = search(f"[{count}].tender.items", releases)
                    if items:
                        assert "/tender/itemsCount" in row
                        assert len(items) == row["/tender/itemsCount"]
            elif name == "tenders_items":
                for index, row in enumerate(rows):
                    additional = search(
                        f"[{count}].tender.items[{index}].additionalClassifications",
                        releases,
                    )
                    if additional:
                        assert "/tender/items/additionalClassificationsCount" in row
                        assert len(additional) == row[
                            "/tender/items/additionalClassificationsCount"]
Code Example #7
File: __init__.py Project: lttga/test2
 def __init__(self, workdir, options, tables, root_key="releases", csv=None, xlsx="result.xlsx", language=LOCALE):
     self.flattener = Flattener(options, tables, language=language)
     self.workdir = Path(workdir)
     # TODO: detect package, where?
     self.root_key = root_key
     self.writers = []
     self.csv = csv
     self.xlsx = xlsx
Code Example #8
def test_flatten_only_no_default_columns(spec_analyzed, releases):
    options = FlattenOptions(
        **{"selection": {
            "tenders": {
                "split": False,
                "only": ["/tender/id"]
            }
        }})
    flattener = Flattener(options, spec_analyzed.tables)
    for _count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            for row in rows:
                assert not set(row.keys()).difference(set(["/tender/id"]))
Code Example #9
def test_less_five_arrays_csv(spec_analyzed, releases, flatten_options, tmpdir):
    test_arrays = ["tenders_items", "tenders_items_addit", "tenders_tende"]
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)
    with CSVWriter(workdir, tables, flatten_options) as writer:
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)

    for name in test_arrays:
        path = workdir / f"{name}.csv"
        assert not path.is_file()
Code Example #10
def test_less_five_arrays_xlsx(spec_analyzed, releases, flatten_options, tmpdir):
    test_arrays = ["tenders_items", "tenders_items_addit", "tenders_tende"]
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)
    with XlsxWriter(workdir, tables, flatten_options) as writer:
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)

    path = workdir / "result.xlsx"
    xlsx_reader = openpyxl.load_workbook(path)
    for name in test_arrays:
        assert name not in xlsx_reader
Code Example #11
def test_flatten_buyer(spec_analyzed, releases):
    options = FlattenOptions(**{
        "selection": {
            "parties": {
                "split": True
            }
        },
        "exclude": ["parties_ids"]
    })
    flattener = Flattener(options, spec_analyzed.tables)
    for count, flat in flattener.flatten(releases):
        buyer = search(f"[{count}].buyer", releases)
        for name, rows in flat.items():
            for row in rows:
                if buyer:
                    assert "/buyer/id" in row
                    assert "/buyer/name" in row
Code Example #12
def test_flatten_should_not_split(spec_analyzed, releases):
    options = FlattenOptions(**{"selection": {"tenders": {"split": False}}})
    flattener = Flattener(options, spec_analyzed.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert "tender_items" not in all_rows
    assert "tenders_items_addit" not in all_rows
    tenders = all_rows["tenders"]

    for tender, release in zip(tenders, releases):
        items = release.get("tender", {}).get("items")
        if release.get("tender", {}).get("items"):
            assert "/tender/items/0/id" in tender
            assert "/tender/items/0/description" in tender
            if len(items) > 1:
                assert "/tender/items/1/id" in tender
                assert "/tender/items/1/description" in tender
Code Example #13
def test_flatten_row_id_parent_id_relation(spec, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    releases[0]["tender"]["items"][0]["additionalClassifications"] = (
        releases[0]["tender"]["items"][0]["additionalClassifications"] * 6)
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{"selection": {"tenders": {"split": True}}})
    flattener = Flattener(options, spec.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)

    for row in all_rows["tenders_items_class"]:
        parent_id = row["parentID"]
        items = [
            i for i in all_rows["tenders_items"] if i["rowID"] == parent_id
        ]
        assert items
Code Example #14
File: __init__.py Project: lttga/test2
class FileFlattener:
    """Main utility for flattening files

    :param workdir: Working directory
    :param options: Flattening configuration
    :param tables: Analyzed tables data
    :param root_key: Field name to access records
    :param csv: If True, generate CSV files
    :param xlsx: Filename of the combined XLSX workbook
    """

    def __init__(self, workdir, options, tables, root_key="releases", csv=None, xlsx="result.xlsx", language=LOCALE):
        self.flattener = Flattener(options, tables, language=language)
        self.workdir = Path(workdir)
        # TODO: detect package, where?
        self.root_key = root_key
        self.writers = []
        self.csv = csv
        self.xlsx = xlsx

    def _flatten(self, filename, writers):
        path = self.workdir / filename
        with open(path, "rb") as fd:
            items = iter_file(fd, self.root_key)
            for count, data in self.flattener.flatten(items):
                for table, rows in data.items():
                    for row in rows:
                        for wr in writers:
                            wr.writerow(table, row)
                yield count

    def flatten_file(self, filename):
        """Flatten file

        :param filename: Input filename in working directory
        """
        workdir = self.workdir
        if isinstance(self.csv, Path):
            workdir = self.csv
        if not self.xlsx and self.csv:
            with CSVWriter(workdir, self.flattener.tables, self.flattener.options) as writer:
                for count in self._flatten(filename, [writer]):
                    yield count
        if self.xlsx and not self.csv:
            with XlsxWriter(self.workdir, self.flattener.tables, self.flattener.options, filename=self.xlsx) as writer:
                for count in self._flatten(filename, [writer]):
                    yield count

        if self.xlsx and self.csv:
            with XlsxWriter(
                self.workdir, self.flattener.tables, self.flattener.options, filename=self.xlsx
            ) as xlsx, CSVWriter(workdir, self.flattener.tables, self.flattener.options) as csv:
                for count in self._flatten(filename, [xlsx, csv]):
                    yield count
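
The FileFlattener above wires a Flattener to the CSV and XLSX writers and drives them from the flatten_file generator. The following is a minimal usage sketch, not part of the original project: the import paths, the spec fixture, and the "releases.json" filename are assumptions made for illustration.

# Hypothetical usage sketch for the FileFlattener in Code Example #14.
# Import paths, the `spec` fixture and the input filename are assumptions.
from pathlib import Path

options = FlattenOptions(**{"selection": {"tenders": {"split": True}}})
flattener = FileFlattener(
    Path("workdir"),     # directory containing the input file
    options,
    spec.tables,         # analyzed tables, as in the test fixtures above (assumption)
    csv=True,            # emit one CSV file per table
    xlsx="result.xlsx",  # also build the combined XLSX workbook
)
# flatten_file is a generator; iterating it drives the writers.
for count in flattener.flatten_file("releases.json"):
    pass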
Code Example #15
def test_flatten_with_repeat(spec_analyzed, releases):
    options = FlattenOptions(**{
        "selection": {
            "tenders": {
                "split": True,
                "repeat": ["/tender/id"]
            }
        },
    })
    flattener = Flattener(options, spec_analyzed.tables)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            if name == "tenders":
                continue
            for row in rows:
                assert "id" in row
                assert "ocid" in row
                assert "rowID" in row
                assert "/tender/id" in row
                assert row["/tender/id"] == search(f"[{count}].tender.id",
                                                   releases)
Code Example #16
def test_flatten_with_exclude(spec, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{
        "selection": {
            "tenders": {
                "split": True
            }
        },
        "exclude": ["tenders_items"]
    })
    flattener = Flattener(options, spec.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert "tenders" in all_rows
    assert "tenders_items" not in all_rows

    options = FlattenOptions(**{"selection": {"tenders": {"split": True}}})
    flattener = Flattener(options, spec.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert "tenders" in all_rows
    assert "tenders_items" in all_rows
Code Example #17
def test_flattener_generate_count_columns(spec, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{
        "selection": {
            "tenders": {
                "split": False
            }
        },
        "count": True
    })
    flattener = Flattener(options, spec.tables)
    tenders = flattener.tables["tenders"]
    assert "/tender/itemsCount" not in tenders
    for index in range(
            tenders.arrays["/tender/items/additionalClassifications"]):
        assert f"/tender/items/{index}/additionalClassificationsCount" not in tenders

    options = FlattenOptions(
        **{
            "selection": {
                "tenders": {
                    "split": True
                },
                "tenders_items": {
                    "split": False
                }
            },
            "count": True
        })
    flattener = Flattener(options, spec.tables)
    tenders = flattener.tables["tenders"]
    tenders_items = flattener.tables["tenders_items"]
    assert "/tender/itemsCount" in tenders
    for index in range(
            tenders.arrays["/tender/items/additionalClassifications"]):
        assert f"/tender/items/{index}/additionalClassificationsCount" not in tenders
    assert "/tender/items/additionalClassificationsCount" in tenders_items
Code Example #18
def test_flatten_with_unnest(spec_analyzed, releases):
    field = "/tender/items/0/id"
    options = FlattenOptions(**{
        "selection": {
            "tenders": {
                "split": True,
                "unnest": [field]
            }
        },
    })
    flattener = Flattener(options, spec_analyzed.tables)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            for row in rows:
                if name != "tenders":
                    assert field not in row
                    continue
                item_id = search(f"[{count}].tender.items[0].id", releases)
                if item_id:
                    assert field in row
                    assert search(f"[{count}].tender.items[0].id",
                                  releases) == row[field]
Code Example #19
 def __init__(
     self,
     workdir,
     options,
     analyzer=None,
     tables=None,
     pkg_type="releases",
     csv=None,
     xlsx="result.xlsx",
     language=LOCALE,
     multiple_values=False,
     schema=None,
 ):
     self.tables = tables if tables else analyzer.spec.tables
     self.flattener = Flattener(options, self.tables, language=language)
     self.workdir = Path(workdir)
     # TODO: detect package, where?
     self.writers = []
     self.csv = csv
     self.xlsx = xlsx
     self.multiple_values = multiple_values if multiple_values else analyzer.multiple_values if analyzer else False
     self.pkg_type = pkg_type if pkg_type else analyzer.pkg_type if analyzer else "releases"
     self.schema = schema or analyzer.spec.schema
Code Example #20
def test_xlsx_writer(spec_analyzed, releases, flatten_options, tmpdir):
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)
    with XlsxWriter(workdir, tables, flatten_options) as writer:
        # Writing XLSX file
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)

    # Reading XLSX files
    counter = {}
    path = workdir / "result.xlsx"
    for _count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            if name not in counter:
                counter[name] = 2
            xlsx_reader = openpyxl.load_workbook(path)
            sheet = xlsx_reader[name]
            header_values = [cell.value for cell in sheet[1]]
            header_columns = [cell.column_letter for cell in sheet[1]]
            headers = dict(zip(header_columns, header_values))
            for row in rows:
                line_values = [cell.value for cell in sheet[counter[name]]]
                line_columns = [headers[cell.column_letter] for cell in sheet[counter[name]]]
                line = dict(zip(line_columns, line_values))
                # Cleaning empty cells
                line = {k: v for (k, v) in line.items() if v}

                if "/tender/hasEnquiries" in row:
                    str_row = {k: v for (k, v) in row.items()}
                    str_row["/tender/hasEnquiries"] = str(row["/tender/hasEnquiries"])
                    assert line == str_row
                else:
                    assert line == row
                counter[name] += 1
Code Example #21
def test_xlsx_only_no_default_columns(spec_analyzed, releases, tmpdir, schema):
    flatten_options = FlattenOptions(
        **{"selection": {
            "tenders": {
                "split": True,
                "only": ["/tender/id"]
            }
        }})
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)
    with XlsxWriter(workdir, tables, flatten_options, schema) as writer:
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)

    path = workdir / "result.xlsx"
    xlsx_reader = openpyxl.load_workbook(path)
    column = []
    for row in xlsx_reader["tenders"].rows:
        column.append(row[0].value)
    assert column[0] == "/tender/id"
    assert xlsx_reader["tenders"].max_column == 1
Code Example #22
def test_flatten(spec_analyzed, releases):
    options = FlattenOptions(**{
        "selection": {
            "tenders": {
                "split": True
            },
            "parties": {
                "split": False
            }
        },
    })
    flattener = Flattener(options, spec_analyzed.tables)
    count = {"tenders": 0, "parties": 0}
    for _count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            for row in rows:
                assert "id" in row
                assert "ocid" in row
                assert "rowID" in row
                if name in ID_ITEMS:
                    key = "tender" if name == "tenders" else "parties"
                    path = f"/{key}/id"
                    assert ID_ITEMS[name][count[name]][path] == row.get(path)
                    count[name] += 1
Code Example #23
def test_flatten_with_only(spec_analyzed, releases):
    options = FlattenOptions(
        **{
            "selection": {
                "tenders": {
                    "split": True,
                    "only": ["/tender/id"]
                },
                "parties": {
                    "split": False
                }
            }
        })
    flattener = Flattener(options, spec_analyzed.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert all_rows["tenders"]

    for row in all_rows["tenders"]:
        assert not set(row).difference(
            ["/tender/id", "rowID", "ocid", "parentID", "id"])

    options = FlattenOptions(
        **{"selection": {
            "tenders": {
                "split": False,
                "only": ["/tender/id"]
            }
        }})
    flattener = Flattener(options, spec_analyzed.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)

    assert all_rows["tenders"]
    for row in all_rows["tenders"]:
        assert not set(row).difference(
            ["/tender/id", "rowID", "ocid", "parentID", "id"])
Code Example #24
class FileFlattener:
    """Main utility for flattening files
    :param workdir: Working directory
    :param options: Flattening configuration
    :param analyzer: Analyzed data object
    :param pkg_type: Field name to access records
    :param csv: If True, generate CSV files
    :param xlsx: Filename of the combined XLSX workbook
    :param language: Language to use for the human-readable headings
    """
    def __init__(
        self,
        workdir,
        options,
        analyzer=None,
        tables=None,
        pkg_type="releases",
        csv=None,
        xlsx="result.xlsx",
        language=LOCALE,
        multiple_values=False,
        schema=None,
    ):
        self.tables = tables if tables else analyzer.spec.tables
        self.flattener = Flattener(options, self.tables, language=language)
        self.workdir = Path(workdir)
        # TODO: detect package, where?
        self.writers = []
        self.csv = csv
        self.xlsx = xlsx
        self.multiple_values = multiple_values if multiple_values else analyzer.multiple_values if analyzer else False
        self.pkg_type = pkg_type if pkg_type else analyzer.pkg_type if analyzer else "releases"
        self.schema = schema or analyzer.spec.schema

    def _flatten(self, filenames, writers):
        if not isinstance(filenames, list):
            filenames = [filenames]
        for filename in filenames:
            path = self.workdir / filename
            reader = get_reader(path)
            with reader(path, "rb") as fd:
                items = iter_file(fd,
                                  self.pkg_type,
                                  multiple_values=self.multiple_values)
                for count, data in self.flattener.flatten(items):
                    for table, rows in data.items():
                        for row in rows:
                            for wr in writers:
                                wr.writerow(table, row)
                    yield count

    def flatten_file(self, filename):
        """Flatten file
        :param filename: Input filename in working directory
        """
        workdir = self.workdir

        if isinstance(self.csv, Path):
            workdir = self.csv
        if not self.xlsx and self.csv:
            with CSVWriter(
                    workdir,
                    self.flattener.tables,
                    self.flattener.options,
                    schema=self.schema,
            ) as writer:
                for count in self._flatten(filename, [writer]):
                    yield count
        if self.xlsx and not self.csv:
            with XlsxWriter(
                    self.workdir,
                    self.flattener.tables,
                    self.flattener.options,
                    filename=self.xlsx,
                    schema=self.schema,
            ) as writer:
                for count in self._flatten(filename, [writer]):
                    yield count

        if self.xlsx and self.csv:
            with XlsxWriter(
                    self.workdir,
                    self.flattener.tables,
                    self.flattener.options,
                    filename=self.xlsx,
                    schema=self.schema,
            ) as xlsx, CSVWriter(
                    workdir,
                    self.flattener.tables,
                    self.flattener.options,
                    schema=self.schema,
            ) as csv:
                for count in self._flatten(filename, [xlsx, csv]):
                    yield count
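
The later FileFlattener accepts either a pre-analyzed tables object or an analyzer, plus pkg_type, multiple_values and schema. Below is a hedged sketch of that constructor, not a definitive usage: the import paths, the spec_analyzed fixture, the output layout, and the schema attribute access are assumptions, and csv is given as a Path only to exercise the isinstance(self.csv, Path) redirect in flatten_file.

# Hypothetical sketch of the newer constructor (Code Example #24); names marked
# as assumptions are not taken from the original snippet.
from pathlib import Path

workdir = Path("workdir")                 # assumption: existing working directory
options = FlattenOptions(**{"selection": {"tenders": {"split": False}}})

flattener = FileFlattener(
    workdir,
    options,
    tables=spec_analyzed.tables,          # pre-analyzed tables (test fixture above)
    pkg_type="releases",                  # key used by iter_file to locate records
    csv=workdir / "csv",                  # a Path redirects CSV output (assumed to exist)
    xlsx="result.xlsx",
    multiple_values=False,                # assumption: True for line-delimited JSON input
    schema=spec_analyzed.schema,          # assumption: the analyzed spec exposes its schema
)
for count in flattener.flatten_file(["releases.json"]):
    pass                                  # _flatten also accepts a single filename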