def test_flatten_with_exclude(spec, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{"selection": {"tenders": {"split": True}}, "exclude": ["tenders_items"]})
    flattener = Flattener(options, spec.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert "tenders" in all_rows
    assert "tenders_items" not in all_rows

    options = FlattenOptions(**{"selection": {"tenders": {"split": True}}})
    flattener = Flattener(options, spec.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert "tenders" in all_rows
    assert "tenders_items" in all_rows
def test_flatten_should_split_with_child(spec, releases, options):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    for _ in spec.process_items(releases):
        pass
    flattener = Flattener(options, spec.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert "tenders_items" in all_rows
    assert "tenders_items_addit" not in all_rows

    tenders = all_rows["tenders"]
    for tender, release in zip(tenders, releases):
        if release.get("tender", {}).get("items"):
            assert "/tender/items/0/id" not in tender
            assert "/tender/items/0/description" not in tender
            assert "/tender/items/1/id" not in tender
            assert "/tender/items/1/description" not in tender

    items = all_rows["tenders_items"]
    for item in items:
        assert "/tender/items/id" in item
        assert "/tender/items/description" in item
def test_csv_writer(spec_analyzed, releases, flatten_options, tmpdir, schema):
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    flatten_options.selection["parties"].split = True
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)

    with CSVWriter(workdir, tables, flatten_options, schema) as writer:
        # Writing CSV files
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)

    # Reading CSV files
    counter = {}
    for _count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            if name not in counter:
                counter[name] = 0
            for row in rows:
                str_row = {k: str(v) for (k, v) in row.items()}
                path = workdir / (name + ".csv")
                with open(path, newline="", encoding="utf-8") as csv_file:
                    csv_reader = csv.DictReader(csv_file)
                    for num, line in enumerate(csv_reader):
                        if num == counter[name]:
                            clean_line = {k: v for (k, v) in line.items() if v != ""}
                            assert dict(clean_line) == str_row
                counter[name] += 1
def test_xlsx_writer(spec_analyzed, releases, flatten_options, tmpdir, schema):
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)

    with XlsxWriter(workdir, tables, flatten_options, schema) as writer:
        # Writing XLSX file
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)

    # Reading XLSX files
    counter = {}
    path = workdir / "result.xlsx"
    xlsx_reader = openpyxl.load_workbook(path)
    for _count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            if name not in counter:
                counter[name] = 2
            sheet = xlsx_reader[name]
            headers = {cell.column_letter: cell.value for cell in sheet[1]}
            for row in rows:
                line = {headers[cell.column_letter]: cell.value for cell in sheet[counter[name]]}
                row = row.as_dict()
                assert not set(row.keys()).difference(set(line.keys()))
                for k, v in row.items():
                    assert str(v) == str(line[k])
                counter[name] += 1
def test_flatten_with_counters(spec, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    releases[0]["tender"]["items"][0]["additionalClassifications"] = (
        releases[0]["tender"]["items"][0]["additionalClassifications"] * 6
    )
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{"selection": {"tenders": {"split": True}}, "count": True})
    flattener = Flattener(options, spec.tables)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            if name == "tenders":
                for row in rows:
                    items = search(f"[{count}].tender.items", releases)
                    if items:
                        assert "/tender/itemsCount" in row
                        assert len(items) == row["/tender/itemsCount"]
            elif name == "tenders_items":
                for index, row in enumerate(rows):
                    additional = search(
                        f"[{count}].tender.items[{index}].additionalClassifications",
                        releases,
                    )
                    if additional:
                        assert "/tender/items/additionalClassificationsCount" in row
                        assert len(additional) == row["/tender/items/additionalClassificationsCount"]
def test_flatten_string_arrays(spec_analyzed, releases):
    options = FlattenOptions(
        **{
            "selection": {"tenders": {"split": True}, "parties": {"split": True}},
            "exclude": ["tenders_items", "parties_ids", "tenders_tenderers"],
        }
    )
    flattener = Flattener(options, spec_analyzed.tables)
    fields = ["submissionMethod", "roles"]
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            counters = defaultdict(int)
            for row in reversed(rows):
                for key, value in row.items():
                    if "/" in key:
                        key = key.replace("parties", f"parties/{counters['parties']}")
                        actual = resolve_pointer(releases[count], key)
                        if any(key.endswith(field) for field in fields):
                            actual = JOINABLE_SEPARATOR.join(actual)
                        assert actual == value
                counters[name] += 1
def test_flatten_fields_compare(spec_analyzed, releases):
    options = FlattenOptions(
        **{
            "selection": {"tenders": {"split": True}, "parties": {"split": False}},
        }
    )
    flattener = Flattener(options, spec_analyzed.tables)
    fields = ["submissionMethod", "roles"]
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            counters = defaultdict(int)
            for row in reversed(rows):
                for key, value in row.items():
                    if "/" in key:
                        if "parties" in key:
                            key = key.replace("parties", f"parties/{counters['parties']}")
                        expected = resolve_pointer(releases[count], key)
                        if any(key.endswith(field) for field in fields):
                            expected = JOINABLE_SEPARATOR.join(expected)
                        assert expected == value
                counters[name] += 1
def __init__(self, workdir, options, tables, root_key="releases", csv=None, xlsx="result.xlsx", language=LOCALE): self.flattener = Flattener(options, tables, language=language) self.workdir = Path(workdir) # TODO: detect package, where? self.root_key = root_key self.writers = [] self.csv = csv self.xlsx = xlsx
def test_flatten_with_only(spec_analyzed, releases):
    options = FlattenOptions(
        **{
            "selection": {
                "tenders": {"split": True, "only": ["/tender/id"]},
                "parties": {"split": False},
            }
        }
    )
    flattener = Flattener(options, spec_analyzed.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert all_rows["tenders"]
    for row in all_rows["tenders"]:
        assert not set(row).difference(["/tender/id", "rowID", "ocid", "parentID", "id"])

    options = FlattenOptions(**{"selection": {"tenders": {"split": False, "only": ["/tender/id"]}}})
    flattener = Flattener(options, spec_analyzed.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert all_rows["tenders"]
    for row in all_rows["tenders"]:
        assert not set(row).difference(["/tender/id", "rowID", "ocid", "parentID", "id"])
def test_flattener_generate_count_columns(spec, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{"selection": {"tenders": {"split": False}}, "count": True})
    flattener = Flattener(options, spec.tables)
    tenders = flattener.tables["tenders"]
    assert "/tender/itemsCount" not in tenders
    for index in range(tenders.arrays["/tender/items/additionalClassifications"]):
        assert f"/tender/items/{index}/additionalClassificationsCount" not in tenders

    options = FlattenOptions(
        **{
            "selection": {"tenders": {"split": True}, "tenders_items": {"split": False}},
            "count": True,
        }
    )
    flattener = Flattener(options, spec.tables)
    tenders = flattener.tables["tenders"]
    tenders_items = flattener.tables["tenders_items"]
    assert "/tender/itemsCount" in tenders
    for index in range(tenders.arrays["/tender/items/additionalClassifications"]):
        assert f"/tender/items/{index}/additionalClassificationsCount" not in tenders
    assert "/tender/items/additionalClassificationsCount" in tenders_items
def test_flatten_only_no_default_columns(spec_analyzed, releases):
    options = FlattenOptions(**{"selection": {"tenders": {"split": False, "only": ["/tender/id"]}}})
    flattener = Flattener(options, spec_analyzed.tables)
    for _count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            for row in rows:
                assert not set(row.keys()).difference({"/tender/id"})
def test_less_five_arrays_csv(spec_analyzed, releases, flatten_options, tmpdir):
    test_arrays = ["tenders_items", "tenders_items_addit", "tenders_tende"]
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)
    with CSVWriter(workdir, tables, flatten_options) as writer:
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)
    for name in test_arrays:
        path = workdir / f"{name}.csv"
        assert not path.is_file()
def test_less_five_arrays_xlsx(spec_analyzed, releases, flatten_options, tmpdir):
    test_arrays = ["tenders_items", "tenders_items_addit", "tenders_tende"]
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)
    with XlsxWriter(workdir, tables, flatten_options) as writer:
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)
    path = workdir / "result.xlsx"
    xlsx_reader = openpyxl.load_workbook(path)
    for name in test_arrays:
        assert name not in xlsx_reader
def test_flatten_buyer(spec_analyzed, releases):
    options = FlattenOptions(**{"selection": {"parties": {"split": True}}, "exclude": ["parties_ids"]})
    flattener = Flattener(options, spec_analyzed.tables)
    for count, flat in flattener.flatten(releases):
        buyer = search(f"[{count}].buyer", releases)
        for name, rows in flat.items():
            for row in rows:
                if buyer:
                    assert "/buyer/id" in row
                    assert "/buyer/name" in row
def test_flatten_should_not_split(spec_analyzed, releases):
    options = FlattenOptions(**{"selection": {"tenders": {"split": False}}})
    flattener = Flattener(options, spec_analyzed.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert "tenders_items" not in all_rows
    assert "tenders_items_addit" not in all_rows

    tenders = all_rows["tenders"]
    for tender, release in zip(tenders, releases):
        items = release.get("tender", {}).get("items")
        if items:
            assert "/tender/items/0/id" in tender
            assert "/tender/items/0/description" in tender
            if len(items) > 1:
                assert "/tender/items/1/id" in tender
                assert "/tender/items/1/description" in tender
def test_flatten_row_id_parent_id_relation(spec, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    releases[0]["tender"]["items"][0]["additionalClassifications"] = (
        releases[0]["tender"]["items"][0]["additionalClassifications"] * 6
    )
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{"selection": {"tenders": {"split": True}}})
    flattener = Flattener(options, spec.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    for row in all_rows["tenders_items_class"]:
        parent_id = row["parentID"]
        items = [i for i in all_rows["tenders_items"] if i["rowID"] == parent_id]
        assert items
def test_flatten_with_repeat(spec_analyzed, releases):
    options = FlattenOptions(
        **{
            "selection": {"tenders": {"split": True, "repeat": ["/tender/id"]}},
        }
    )
    flattener = Flattener(options, spec_analyzed.tables)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            if name == "tenders":
                continue
            for row in rows:
                assert "id" in row
                assert "ocid" in row
                assert "rowID" in row
                assert "/tender/id" in row
                assert row["/tender/id"] == search(f"[{count}].tender.id", releases)
def test_flatten_with_unnest(spec_analyzed, releases):
    field = "/tender/items/0/id"
    options = FlattenOptions(
        **{
            "selection": {"tenders": {"split": True, "unnest": [field]}},
        }
    )
    flattener = Flattener(options, spec_analyzed.tables)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            for row in rows:
                if name != "tenders":
                    assert field not in row
                    continue
                item_id = search(f"[{count}].tender.items[0].id", releases)
                if item_id:
                    assert field in row
                    assert item_id == row[field]
def __init__(
    self,
    workdir,
    options,
    analyzer=None,
    tables=None,
    pkg_type="releases",
    csv=None,
    xlsx="result.xlsx",
    language=LOCALE,
    multiple_values=False,
    schema=None,
):
    # Explicit arguments win; otherwise fall back to values derived from the analyzer.
    self.tables = tables if tables else analyzer.spec.tables
    self.flattener = Flattener(options, self.tables, language=language)
    self.workdir = Path(workdir)
    # TODO: detect package, where?
    self.writers = []
    self.csv = csv
    self.xlsx = xlsx
    self.multiple_values = multiple_values if multiple_values else (analyzer.multiple_values if analyzer else False)
    self.pkg_type = pkg_type if pkg_type else (analyzer.pkg_type if analyzer else "releases")
    self.schema = schema or analyzer.spec.schema
def test_xlsx_writer(spec_analyzed, releases, flatten_options, tmpdir):
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)

    with XlsxWriter(workdir, tables, flatten_options) as writer:
        # Writing XLSX file
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)

    # Reading XLSX files
    counter = {}
    path = workdir / "result.xlsx"
    for _count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            if name not in counter:
                counter[name] = 2
            xlsx_reader = openpyxl.load_workbook(path)
            sheet = xlsx_reader[name]
            header_values = [cell.value for cell in sheet[1]]
            header_columns = [cell.column_letter for cell in sheet[1]]
            headers = dict(zip(header_columns, header_values))
            for row in rows:
                line_values = [cell.value for cell in sheet[counter[name]]]
                line_columns = [headers[cell.column_letter] for cell in sheet[counter[name]]]
                line = dict(zip(line_columns, line_values))
                # Cleaning empty cells
                line = {k: v for (k, v) in line.items() if v}
                if "/tender/hasEnquiries" in row:
                    str_row = {k: v for (k, v) in row.items()}
                    str_row["/tender/hasEnquiries"] = str(row["/tender/hasEnquiries"])
                    assert line == str_row
                else:
                    assert line == row
                counter[name] += 1
def test_xlsx_only_no_default_columns(spec_analyzed, releases, tmpdir, schema):
    flatten_options = FlattenOptions(**{"selection": {"tenders": {"split": True, "only": ["/tender/id"]}}})
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)
    with XlsxWriter(workdir, tables, flatten_options, schema) as writer:
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)
    path = workdir / "result.xlsx"
    xlsx_reader = openpyxl.load_workbook(path)
    column = []
    for row in xlsx_reader["tenders"].rows:
        column.append(row[0].value)
    assert column[0] == "/tender/id"
    assert xlsx_reader["tenders"].max_column == 1
def test_flatten(spec_analyzed, releases):
    options = FlattenOptions(
        **{
            "selection": {"tenders": {"split": True}, "parties": {"split": False}},
        }
    )
    flattener = Flattener(options, spec_analyzed.tables)
    count = {"tenders": 0, "parties": 0}
    for _count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            for row in rows:
                assert "id" in row
                assert "ocid" in row
                assert "rowID" in row
                if name in ID_ITEMS:
                    key = "tender" if name == "tenders" else "parties"
                    path = f"/{key}/id"
                    assert ID_ITEMS[name][count[name]][path] == row.get(path)
                    count[name] += 1