def test_flatten_with_exclude(spec, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{"selection": {"tenders": {"split": True}}, "exclude": ["tenders_items"]})
    flattener = Flattener(options, spec.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert "tenders" in all_rows
    assert "tenders_items" not in all_rows

    options = FlattenOptions(**{"selection": {"tenders": {"split": True}}})
    flattener = Flattener(options, spec.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert "tenders" in all_rows
    assert "tenders_items" in all_rows

def test_flatten_with_counters(spec, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    releases[0]["tender"]["items"][0]["additionalClassifications"] = (
        releases[0]["tender"]["items"][0]["additionalClassifications"] * 6
    )
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{"selection": {"tenders": {"split": True}}, "count": True})
    flattener = Flattener(options, spec.tables)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            if name == "tenders":
                for row in rows:
                    items = search(f"[{count}].tender.items", releases)
                    if items:
                        assert "/tender/itemsCount" in row
                        assert len(items) == row["/tender/itemsCount"]
            elif name == "tenders_items":
                for index, row in enumerate(rows):
                    additional = search(
                        f"[{count}].tender.items[{index}].additionalClassifications",
                        releases,
                    )
                    if additional:
                        assert "/tender/items/additionalClassificationsCount" in row
                        assert len(additional) == row["/tender/items/additionalClassificationsCount"]

def test_flatten_multiple_files(spec, tmpdir, releases, schema):
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{"selection": {"tenders": {"split": True}}})
    workdir = Path(tmpdir)
    analyzer = FileAnalyzer(workdir)
    flattener = FileFlattener(
        workdir=workdir, options=options, tables=spec.tables, csv=True, analyzer=analyzer, schema=schema
    )
    xlsx = workdir / "result.xlsx"
    sheet = "tenders"
    for _ in flattener.flatten_file(releases_path):
        pass
    wb = openpyxl.load_workbook(xlsx)
    ws = wb[sheet]
    line_number = ws.max_row - 1
    assert line_number == 4

    flattener = FileFlattener(
        workdir=workdir, options=options, tables=spec.tables, csv=True, analyzer=analyzer, schema=schema
    )
    for _ in flattener.flatten_file([releases_path, releases_path]):
        pass
    wb = openpyxl.load_workbook(xlsx)
    ws = wb[sheet]
    assert ws.max_row - 1 == line_number * 2

def test_flatten_fields_compare(spec_analyzed, releases):
    options = FlattenOptions(
        **{
            "selection": {"tenders": {"split": True}, "parties": {"split": False}},
        }
    )
    flattener = Flattener(options, spec_analyzed.tables)
    fields = ["submissionMethod", "roles"]
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            counters = defaultdict(int)
            for row in reversed(rows):
                for key, value in row.items():
                    if "/" in key:
                        if "parties" in key:
                            key = key.replace("parties", f"parties/{counters['parties']}")
                        expected = resolve_pointer(releases[count], key)
                        if any(key.endswith(field) for field in fields):
                            expected = JOINABLE_SEPARATOR.join(expected)
                        assert expected == value
                counters[name] += 1

def test_name_duplicate(spec, tmpdir, schema):
    duplicate_name = "test"
    options = FlattenOptions(
        **{
            "selection": {
                "parties": {"split": False, "pretty_headers": True, "name": duplicate_name},
                "tenders": {"split": True, "pretty_headers": True, "name": duplicate_name},
                "tenders_items": {"split": False, "pretty_headers": True, "name": duplicate_name},
            }
        }
    )
    tables = prepare_tables(spec, options)
    for name, table in tables.items():
        for col in table:
            table.inc_column(col, col)
    workdir = Path(tmpdir)
    get_writers(workdir, tables, options, schema)
    xlsx = workdir / "result.xlsx"
    for name in ("test", "test1", "test2"):
        path = workdir / f"{name}.csv"
        assert path.is_file()
        assert read_xlsx_headers(xlsx, name)
        assert read_csv_headers(path)

def test_flatten_string_arrays(spec_analyzed, releases):
    options = FlattenOptions(
        **{
            "selection": {"tenders": {"split": True}, "parties": {"split": True}},
            "exclude": ["tenders_items", "parties_ids", "tenders_tenderers"],
        }
    )
    flattener = Flattener(options, spec_analyzed.tables)
    fields = ["submissionMethod", "roles"]
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            counters = defaultdict(int)
            for row in reversed(rows):
                for key, value in row.items():
                    if "/" in key:
                        key = key.replace("parties", f"parties/{counters['parties']}")
                        actual = resolve_pointer(releases[count], key)
                        if any(key.endswith(field) for field in fields):
                            actual = JOINABLE_SEPARATOR.join(actual)
                        assert actual == value
                counters[name] += 1

def test_abbreviations(spec, tmpdir):
    options = FlattenOptions(
        **{
            "selection": {
                "tenders_items_class": {"split": False},
                "parties_ids": {"split": False},
                "transactions": {"split": False},
            }
        }
    )
    new_names = ["tenders_items_class", "parties_ids", "transactions"]
    tables = prepare_tables(spec, options)
    for name, table in tables.items():
        for col in table:
            table.inc_column(col, col)
    workdir = Path(tmpdir)
    get_writers(workdir, tables, options)
    xlsx = workdir / "result.xlsx"
    for name in new_names:
        path = workdir / f"{name}.csv"
        assert path.is_file()
        assert read_xlsx_headers(xlsx, name)
        assert read_csv_headers(path)

def test_writers_flatten_count(spec, tmpdir, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(
        **{
            "selection": {
                "tenders": {"split": True, "pretty_headers": True},
                "parties": {"split": True, "pretty_headers": True},
                "tenders_items": {"split": False, "pretty_headers": True},
            },
            "count": True,
        }
    )
    workdir = Path(tmpdir)
    flattener = FileFlattener(workdir, options, spec.tables, root_key="releases", csv=True)
    xlsx = workdir / "result.xlsx"
    for _ in flattener.flatten_file(releases_path):
        pass
    sheet = "tenders"
    path = workdir / f"{sheet}.csv"
    for headers in read_xlsx_headers(xlsx, sheet), read_csv_headers(path):
        assert "Items Count" in headers
        assert "Tenderers Count" in headers
    sheet = "parties"
    path = workdir / f"{sheet}.csv"
    for headers in read_xlsx_headers(xlsx, sheet), read_csv_headers(path):
        assert "Additional Identifiers Count" in headers

def test_writers_invalid_row(log, spec, tmpdir, schema):
    options = FlattenOptions(
        **{
            "selection": {
                "parties": {"split": False, "pretty_headers": True, "name": "testname"},
            }
        }
    )
    tables = prepare_tables(spec, options)
    for name, table in tables.items():
        for col in table:
            table.inc_column(col, col)
    workdir = Path(tmpdir)
    writers = get_writers(workdir, tables, options, schema)
    for writer in writers:
        writer.writerow("parties", {"/test/test": "test"})
    log.assert_has_calls(
        [
            call("Operation produced invalid path. This a software bug, please send issue to developers"),
            call("Failed to write row None with error dict contains fields not in fieldnames: '/test/test'"),
            call("Operation produced invalid path. This a software bug, please send issue to developers"),
            call("Failed to write column /test/test to xlsx sheet parties"),
        ]
    )

def test_flatten_with_only(spec_analyzed, releases):
    options = FlattenOptions(
        **{
            "selection": {
                "tenders": {"split": True, "only": ["/tender/id"]},
                "parties": {"split": False},
            }
        }
    )
    flattener = Flattener(options, spec_analyzed.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert all_rows["tenders"]
    for row in all_rows["tenders"]:
        assert not set(row).difference(["/tender/id", "rowID", "ocid", "parentID", "id"])

    options = FlattenOptions(**{"selection": {"tenders": {"split": False, "only": ["/tender/id"]}}})
    flattener = Flattener(options, spec_analyzed.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert all_rows["tenders"]
    for row in all_rows["tenders"]:
        assert not set(row).difference(["/tender/id", "rowID", "ocid", "parentID", "id"])

def test_writers_open_fail(open_, log, spec, tmpdir):
    options = FlattenOptions(
        **{
            "selection": {
                "parties": {"split": False, "pretty_headers": True, "name": "testname"},
            }
        }
    )
    workdir = Path(tmpdir)
    tables = prepare_tables(spec, options)
    get_writers(workdir, tables, options)
    log.assert_has_calls([call("Failed to open file {} with error {}".format(str(tmpdir / "testname.csv"), "test"))])

def test_flattener_generate_count_columns(spec, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{"selection": {"tenders": {"split": False}}, "count": True})
    flattener = Flattener(options, spec.tables)
    tenders = flattener.tables["tenders"]
    assert "/tender/itemsCount" not in tenders
    for index in range(tenders.arrays["/tender/items/additionalClassifications"]):
        assert f"/tender/items/{index}/additionalClassificationsCount" not in tenders

    options = FlattenOptions(
        **{"selection": {"tenders": {"split": True}, "tenders_items": {"split": False}}, "count": True}
    )
    flattener = Flattener(options, spec.tables)
    tenders = flattener.tables["tenders"]
    tenders_items = flattener.tables["tenders_items"]
    assert "/tender/itemsCount" in tenders
    for index in range(tenders.arrays["/tender/items/additionalClassifications"]):
        assert f"/tender/items/{index}/additionalClassificationsCount" not in tenders
    assert "/tender/items/additionalClassificationsCount" in tenders_items

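# A minimal sketch of the counter naming convention the test above relies on
# (inferred from its assertions, not from separate API docs): with "count"
# enabled, a split parent table gains one "<arrayPath>Count" column per nested
# array, while an unsplit table keeps its indexed columns and gets no counters.
#
#   options = FlattenOptions(
#       **{"selection": {"tenders": {"split": True}, "tenders_items": {"split": False}}, "count": True}
#   )
#   assert "/tender/itemsCount" in Flattener(options, spec.tables).tables["tenders"]
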
def test_flatten_only_no_default_columns(spec_analyzed, releases):
    options = FlattenOptions(**{"selection": {"tenders": {"split": False, "only": ["/tender/id"]}}})
    flattener = Flattener(options, spec_analyzed.tables)
    for _count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            for row in rows:
                assert not set(row).difference({"/tender/id"})

def test_flatten_buyer(spec_analyzed, releases):
    options = FlattenOptions(**{"selection": {"parties": {"split": True}}, "exclude": ["parties_ids"]})
    flattener = Flattener(options, spec_analyzed.tables)
    for count, flat in flattener.flatten(releases):
        buyer = search(f"[{count}].buyer", releases)
        for name, rows in flat.items():
            for row in rows:
                if buyer:
                    assert "/buyer/id" in row
                    assert "/buyer/name" in row

def test_writers_invalid_table(log, spec, tmpdir):
    options = FlattenOptions(
        **{
            "selection": {
                "parties": {"split": False, "pretty_headers": True, "name": "testname"},
            }
        }
    )
    tables = prepare_tables(spec, options)
    for name, table in tables.items():
        for col in table:
            table.inc_column(col, col)
    workdir = Path(tmpdir)
    writers = get_writers(workdir, tables, options)
    for writer in writers:
        writer.writerow("test", {})
    log.assert_has_calls([call("Invalid table test"), call("Invalid table test")])

def test_flatten_should_not_split(spec_analyzed, releases):
    options = FlattenOptions(**{"selection": {"tenders": {"split": False}}})
    flattener = Flattener(options, spec_analyzed.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    assert "tender_items" not in all_rows
    assert "tenders_items_addit" not in all_rows
    tenders = all_rows["tenders"]
    for tender, release in zip(tenders, releases):
        items = release.get("tender", {}).get("items")
        if items:
            assert "/tender/items/0/id" in tender
            assert "/tender/items/0/description" in tender
            if len(items) > 1:
                assert "/tender/items/1/id" in tender
                assert "/tender/items/1/description" in tender

def test_flatten_row_id_parent_id_relation(spec, releases):
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    releases[0]["tender"]["items"][0]["additionalClassifications"] = (
        releases[0]["tender"]["items"][0]["additionalClassifications"] * 6
    )
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(**{"selection": {"tenders": {"split": True}}})
    flattener = Flattener(options, spec.tables)
    all_rows = defaultdict(list)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            all_rows[name].extend(rows)
    for row in all_rows["tenders_items_class"]:
        parent_id = row["parentID"]
        items = [i for i in all_rows["tenders_items"] if i["rowID"] == parent_id]
        assert items

def test_flatten_with_repeat(spec_analyzed, releases):
    options = FlattenOptions(
        **{
            "selection": {"tenders": {"split": True, "repeat": ["/tender/id"]}},
        }
    )
    flattener = Flattener(options, spec_analyzed.tables)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            if name == "tenders":
                continue
            for row in rows:
                assert "id" in row
                assert "ocid" in row
                assert "rowID" in row
                assert "/tender/id" in row
                assert row["/tender/id"] == search(f"[{count}].tender.id", releases)

def test_flatten_with_unnest(spec_analyzed, releases):
    field = "/tender/items/0/id"
    options = FlattenOptions(
        **{
            "selection": {"tenders": {"split": True, "unnest": [field]}},
        }
    )
    flattener = Flattener(options, spec_analyzed.tables)
    for count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            for row in rows:
                if name != "tenders":
                    assert field not in row
                    continue
                item_id = search(f"[{count}].tender.items[0].id", releases)
                if item_id:
                    assert field in row
                    assert item_id == row[field]

def test_extension_export(spec, tmpdir, releases_extension, schema):
    for _ in spec.process_items(releases_extension):
        pass
    options = FlattenOptions(**{"selection": {"tenders": {"split": False}, "documents": {"split": False}}})
    workdir = Path(tmpdir)
    analyzer = FileAnalyzer(workdir)
    flattener = FileFlattener(
        workdir=workdir, options=options, tables=spec.tables, analyzer=analyzer, schema=schema
    )
    xlsx = workdir / "result.xlsx"
    sheet = "documents"
    extension_header = "/documents/test_extension"
    for _ in flattener.flatten_file(releases_extension_path):
        pass
    wb = openpyxl.load_workbook(xlsx)
    ws = wb[sheet]
    for column_cell in ws.iter_cols(1, ws.max_column):
        if column_cell[0].value == extension_header:
            extension_column = ws[column_cell[0].coordinate[:-1]]
            for cell in extension_column:
                if cell.value == extension_header:
                    continue
                assert cell.value == "test"
    for column_cell in wb["tenders"].iter_cols(1, ws.max_column):
        assert column_cell[0].value != extension_header

def test_xlsx_only_no_default_columns(spec_analyzed, releases, tmpdir, schema):
    flatten_options = FlattenOptions(**{"selection": {"tenders": {"split": True, "only": ["/tender/id"]}}})
    flattener = Flattener(flatten_options, spec_analyzed.tables)
    tables = prepare_tables(spec_analyzed, flatten_options)
    workdir = Path(tmpdir)
    with XlsxWriter(workdir, tables, flatten_options, schema) as writer:
        for _count, flat in flattener.flatten(releases):
            for name, rows in flat.items():
                for row in rows:
                    writer.writerow(name, row)
    path = workdir / "result.xlsx"
    xlsx_reader = openpyxl.load_workbook(path)
    column = [row[0].value for row in xlsx_reader["tenders"].rows]
    assert column[0] == "/tender/id"
    assert xlsx_reader["tenders"].max_column == 1

def test_flatten(spec_analyzed, releases):
    options = FlattenOptions(
        **{
            "selection": {"tenders": {"split": True}, "parties": {"split": False}},
        }
    )
    flattener = Flattener(options, spec_analyzed.tables)
    count = {"tenders": 0, "parties": 0}
    for _count, flat in flattener.flatten(releases):
        for name, rows in flat.items():
            for row in rows:
                assert "id" in row
                assert "ocid" in row
                assert "rowID" in row
                if name in ID_ITEMS:
                    key = "tender" if name == "tenders" else "parties"
                    path = f"/{key}/id"
                    assert ID_ITEMS[name][count[name]][path] == row.get(path)
                    count[name] += 1

def flatten_data(flatten_id, model=None, lang_code="en_US"):
    with internationalization(lang_code=lang_code):
        logger_context = {"FLATTEN_ID": flatten_id, "TASK": "flatten_data", "MODEL": model}
        channel_layer = get_channel_layer()
        if model not in getters:
            extra = {
                "MESSAGE_ID": "model_not_registered",
                "MODEL": model,
                "TASK": "flatten_data",
                "FLATTEN_ID": flatten_id,
            }
            logger.info("Model %s not registered in getters", model, extra=extra)
            return
        try:
            serializer = FlattenSerializer()
            flatten = Flatten.objects.get(id=flatten_id)
            selection = flatten.dataselection_set.all()[0]
            datasource = getattr(selection, f"{model.lower()}_set").all()[0]
            flatten.status = "processing"
            flatten.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
            spec = DataPreprocessor.restore(datasource.analyzed_file.path)
            total_rows = spec.total_items
            opt = get_flatten_options(selection)
            logger.debug(
                "Generate options for export",
                extra={
                    "MESSAGE_ID": "generate_flatten_options",
                    "DATASOURCE_ID": str(datasource.id),
                    "MODEL": model,
                    "SELECTION_ID": str(selection.id),
                    "FLATTEN_ID": str(flatten.id),
                    "OPTIONS": opt,
                },
            )
            options = FlattenOptions(**opt)
            workdir = pathlib.Path(datasource.file.path).parent
            formats = {"csv": None, "xlsx": None}
            if flatten.export_format == flatten.CSV:
                workdir = workdir / "export"
                if not workdir.exists():
                    os.makedirs(workdir)
                formats[flatten.export_format] = workdir
            else:
                formats[flatten.export_format] = "result.xlsx"
            flattener = FileFlattener(workdir, options, spec.tables, root_key=datasource.root_key, **formats)
            timestamp = time.time()
            for count in flattener.flatten_file(datasource.file.path):
                # Throttle websocket progress updates to at most one per second
                if (time.time() - timestamp) <= 1:
                    continue
                async_to_sync(channel_layer.group_send)(
                    f"datasource_{datasource.id}",
                    {
                        "type": "task.flatten",
                        "flatten": {"id": str(flatten.id)},
                        "progress": {
                            "total_rows": total_rows,
                            "processed": count,
                            "percentage": (count / total_rows) * 100 if total_rows else total_rows,
                        },
                    },
                )
                timestamp = time.time()
            if flatten.export_format == flatten.CSV:
                target_file = f"{workdir}/{datasource.id}.zip"
                zip_files(workdir, target_file, extension="csv")
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = f"{datasource.id}.zip"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                os.remove(fd.name)
            else:
                target_file = f"{workdir}/result.xlsx"
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = "result.xlsx"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                os.remove(fd.name)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
        except ObjectDoesNotExist:
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_not_found"
            logger.info("Flatten %s for %s model not found", flatten_id, model, extra=extra)
        except OSError as e:
            extra = deepcopy(logger_context)
            extra.update(
                {"MESSAGE_ID": "flatten_no_left_space", "DATASOURCE_ID": str(datasource.id), "ERROR_MSG": str(e)}
            )
            logger.info("Flatten %s for %s model failed: %s", flatten_id, model, e, extra=extra)
            flatten.status = "failed"
            flatten.error = _("Currently, the space limit was reached. Please try again later.")
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
        except Exception as e:
            error_message = str(e)
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_failed"
            extra["ERROR_MESSAGE"] = error_message
            logger.error(
                "Flatten %s for %s datasource %s failed",
                flatten_id,
                model,
                datasource.id,
                extra=extra,
                exc_info=True,
            )
            flatten.status = "failed"
            flatten.error = error_message
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )

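# A hedged sketch of the dict get_flatten_options(selection) is assumed to
# return in the task above; the shape simply mirrors the FlattenOptions(**opt)
# calls used throughout this file, and no keys beyond these are implied:
#
#   opt = {
#       "selection": {"tenders": {"split": True, "pretty_headers": True}},
#       "count": True,
#       "exclude": ["tenders_items"],
#   }
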
def test_writers_pretty_headers(spec, tmpdir, releases, schema):
    # increase items count to force split
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(
        **{
            "selection": {
                "tenders": {"split": True, "pretty_headers": True},
                "parties": {"split": False, "pretty_headers": True},
                "tenders_items": {
                    "split": True,
                    "headers": {"/tender/items/id": "item id"},
                    "pretty_headers": True,
                },
            }
        }
    )
    tables = {
        "tenders": spec.tables["tenders"],
        "parties": spec.tables["parties"],
        "tenders_items": spec.tables["tenders_items"],
    }
    tables_headers = {
        "tenders": [
            "Ocid", "Id", "Row Id", "Parent Id",
            "Tender: Award Criteria", "Tender: Award Criteria Details",
            "Tender: Tender Description", "Tender: Has Enquiries?", "Tender: Tender Id",
            "Tender: Main Procurement Category", "Tender: Number Of Tenderers",
            "Tender: Procurement Method", "Tender: Procurement Method Details",
            "Tender: Procurement Method Rationale", "Tender: Tender Status",
            "Tender: Submission Method", "Tender: Submission Method Details",
            "Tender: Tender Title", "Tender: Value: Amount", "Tender: Value: Currency",
            "Tender: Tender Period: Duration ( Days)", "Tender: Tender Period: End Date",
            "Tender: Tender Period: Start Date",
            "Tender: Tenderers: Organization Id", "Tender: Tenderers: Organization Name",
            "Tender: Value: Amount", "Tender: Value: Currency",
            "Tender: Tender Period: Duration ( Days)", "Tender: Tender Period: End Date",
            "Tender: Tender Period: Start Date",
            "Tender: Tender Period: Duration ( Days)", "Tender: Tender Period: End Date",
            "Tender: Tender Period: Start Date",
            "Tender: Tender Period: End Date", "Tender: Tender period: Start date",
        ],
        "parties": [
            "Ocid", "Id", "Row Id", "Parent Id",
            "Parties: Organization: Entity Id", "Parties: Organization: Common Name",
            "Parties: Organization: Party Roles",
            "Parties: Organization: Primary Identifier: Id",
            "Parties: Organization: Primary Identifier: Legal Name",
            "Parties: Organization: Primary Identifier: Scheme",
            "Parties: Organization: Contact Point: Email",
            "Parties: Organization: Contact Point: Fax Number",
            "Parties: Organization: Contact Point: Name",
            "Parties: Organization: Contact Point: Telephone",
            "Parties: Organization: Contact Point: Url",
            "Parties: Organization: Address: Country Name",
            "Parties: Organization: Address: Locality",
            "Parties: Organization: Address: Postal Code",
            "Parties: Organization: Address: Region",
            "Parties: Organization: Address: Street Address",
            "Buyer: Organization Id", "Buyer: Organization Name", "Parties: Test",
        ],
        "tenders_items": [
            "Ocid", "Id", "Row Id", "Parent Id", "Parent Table",
            "Tender: Items To Be Procured: Item: Description", "item id",
            "Tender: Items To Be Procured: Item: Quantity",
            "Tender: Items To Be Procured: Item: Unit: Id",
            "Tender: Items To Be Procured: Item: Unit: Name",
            "Tender: Items To Be Procured: Item: Unit: Scheme",
            "Tender: Value: Amount", "Tender: Value: Currency",
        ],
    }
    workdir = Path(tmpdir)
    get_writers(workdir, tables, options, schema)
    xlsx = workdir / "result.xlsx"
    for name, opts in options.selection.items():
        path = workdir / f"{name}.csv"
        xlsx_headers = read_xlsx_headers(xlsx, name)
        csv_headers = read_csv_headers(path)
        headers = tables_headers[name]
        assert not set(headers).difference(set(xlsx_headers))
        assert not set(headers).difference(set(csv_headers))

    options = FlattenOptions(
        **{
            "selection": {
                "tenders": {
                    "split": True,
                    "headers": {"/tender/id": "TENDER"},
                    "pretty_headers": True,
                },
                "tenders_items": {
                    "split": True,
                    "headers": {"/tender/items/id": "item id"},
                    "pretty_headers": True,
                },
                "parties": {
                    "split": False,
                    "headers": {"/parties/id": "PARTY"},
                    "pretty_headers": True,
                },
            }
        }
    )
    workdir = Path(tmpdir)
    get_writers(workdir, tables, options, schema)
    xlsx = workdir / "result.xlsx"
    sheet = "tenders"
    path = workdir / f"{sheet}.csv"
    xlsx_headers = read_xlsx_headers(xlsx, sheet)
    csv_headers = read_csv_headers(path)
    assert "TENDER" in xlsx_headers
    assert "TENDER" in csv_headers

    sheet = "tenders_items"
    path = workdir / f"{sheet}.csv"
    xlsx_headers = read_xlsx_headers(xlsx, sheet)
    csv_headers = read_csv_headers(path)
    assert "item id" in xlsx_headers
    assert "item id" in csv_headers

    sheet = "parties"
    path = workdir / f"{sheet}.csv"
    xlsx_headers = read_xlsx_headers(xlsx, sheet)
    csv_headers = read_csv_headers(path)
    assert "PARTY" in xlsx_headers
    assert "PARTY" in csv_headers

def test_writers_pretty_headers(spec, tmpdir, releases):
    # increase items count to force split
    releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6
    for _ in spec.process_items(releases):
        pass
    options = FlattenOptions(
        **{
            "selection": {
                "tenders": {"split": True, "pretty_headers": True},
                "parties": {"split": False, "pretty_headers": True},
                "tenders_items": {
                    "split": True,
                    "headers": {"/tender/items/id": "item id"},
                    "pretty_headers": True,
                },
            }
        }
    )
    tables = {
        "tenders": spec.tables["tenders"],
        "parties": spec.tables["parties"],
        "tenders_items": spec.tables["tenders_items"],
    }
    workdir = Path(tmpdir)
    get_writers(workdir, tables, options)
    xlsx = workdir / "result.xlsx"
    for name, opts in options.selection.items():
        path = workdir / f"{name}.csv"
        xlsx_headers = read_xlsx_headers(xlsx, name)
        csv_headers = read_csv_headers(path)
        table = tables[name]
        for col in table.available_rows(opts.split):
            title = table.titles.get(col)
            if col == "/tender/items/id":
                title = "item id"
            assert title in xlsx_headers
            assert title in csv_headers

    options = FlattenOptions(
        **{
            "selection": {
                "tenders": {
                    "split": True,
                    "headers": {"/tender/id": "TENDER"},
                    "pretty_headers": True,
                },
                "tenders_items": {
                    "split": True,
                    "headers": {"/tender/items/id": "item id"},
                    "pretty_headers": True,
                },
                "parties": {
                    "split": False,
                    "headers": {"/parties/id": "PARTY"},
                    "pretty_headers": True,
                },
            }
        }
    )
    workdir = Path(tmpdir)
    get_writers(workdir, tables, options)
    xlsx = workdir / "result.xlsx"
    sheet = "tenders"
    path = workdir / f"{sheet}.csv"
    xlsx_headers = read_xlsx_headers(xlsx, sheet)
    csv_headers = read_csv_headers(path)
    assert "TENDER" in xlsx_headers
    assert "TENDER" in csv_headers

    sheet = "tenders_items"
    path = workdir / f"{sheet}.csv"
    xlsx_headers = read_xlsx_headers(xlsx, sheet)
    csv_headers = read_csv_headers(path)
    assert "item id" in xlsx_headers
    assert "item id" in csv_headers

    sheet = "parties"
    path = workdir / f"{sheet}.csv"
    xlsx_headers = read_xlsx_headers(xlsx, sheet)
    csv_headers = read_csv_headers(path)
    assert "PARTY" in xlsx_headers
    assert "PARTY" in csv_headers

def cli(
    filename,
    schema,
    selection,
    threshold,
    state_file,
    xlsx,
    csv,
    combine,
    exclude,
    unnest,
    unnest_file,
    only,
    only_file,
    repeat,
    repeat_file,
    count,
    human,
    language,
):
    """Spoonbill cli entry point"""
    if csv:
        csv = pathlib.Path(csv).resolve()
        if not csv.exists():
            raise click.BadParameter(_("Desired location {} does not exist").format(csv))
    if xlsx:
        xlsx = pathlib.Path(xlsx).resolve()
        if not xlsx.parent.exists():
            raise click.BadParameter(_("Desired location {} does not exist").format(xlsx.parent))
    path = pathlib.Path(filename)
    workdir = path.parent
    filename = path.name
    selection = selection or ROOT_TABLES.keys()
    combine = combine or COMBINED_TABLES.keys()
    root_tables = get_selected_tables(ROOT_TABLES, selection)
    combined_tables = get_selected_tables(COMBINED_TABLES, combine)
    if state_file:
        click.secho(_("Restoring from provided state file"), bold=True)
        analyzer = FileAnalyzer(workdir, state_file=state_file)
    else:
        click.secho(_("State file not supplied, going to analyze input file first"), bold=True)
        analyzer = FileAnalyzer(
            workdir,
            schema=schema,
            root_tables=root_tables,
            combined_tables=combined_tables,
            language=language,
            table_threshold=threshold,
        )
        click.echo(_("Analyze options:"))
        for name, option in ("threshold", str(threshold)), ("language", language):
            click.echo(_(" - {:30} => {}").format(name, click.style(option, fg="cyan")))
        click.echo(_("Processing file: {}").format(click.style(str(path), fg="cyan")))
        total = path.stat().st_size
        progress = 0
        # The progress bar does not show with small files
        # https://github.com/pallets/click/pull/1296/files
        with click.progressbar(width=0, show_percent=True, show_pos=True, length=total) as bar:
            for read, number in analyzer.analyze_file(filename, with_preview=False):
                bar.label = ANALYZED_LABEL.format(click.style(str(number), fg="cyan"))
                bar.update(read - progress)
                progress = read
        click.secho(
            _("Done processing. Analyzed objects: {}").format(click.style(str(number + 1), fg="red")),
            fg="green",
        )
        if isinstance(filename, list):
            state_file = pathlib.Path(f"{filename[0]}.state")
        else:
            state_file = pathlib.Path(f"{filename}.state")
        state_file_path = workdir / state_file
        click.echo(
            _("Dumping analyzed data to '{}'").format(click.style(str(state_file_path.absolute()), fg="cyan"))
        )
        analyzer.dump_to_file(state_file)
    click.echo(_("Flattening file: {}").format(click.style(str(path), fg="cyan")))
    if unnest and unnest_file:
        raise click.UsageError(_("Conflicting options: unnest and unnest-file"))
    if repeat and repeat_file:
        raise click.UsageError(_("Conflicting options: repeat and repeat-file"))
    if only and only_file:
        raise click.UsageError(_("Conflicting options: only and only-file"))
    if exclude:
        click.echo(_("Ignoring tables (excluded by user): {}").format(click.style(",".join(exclude), fg="red")))
    options = {"selection": {}, "count": count, "exclude": exclude}
    unnest = read_option_file(unnest, unnest_file)
    repeat = read_option_file(repeat, repeat_file)
    only = read_option_file(only, only_file)
    for name in list(selection) + list(combine):
        table = analyzer.spec[name]
        if table.total_rows == 0:
            click.echo(_("Ignoring empty table {}").format(click.style(name, fg="red")))
            continue
        options["selection"][name] = {
            "split": analyzer.spec[name].splitted,
            "pretty_headers": human,
        }
        if not analyzer.spec[name].is_combined:
            unnest_in_table = [col for col in unnest if col in table.combined_columns]
            if unnest_in_table:
                click.echo(
                    _("Unnesting columns {} for table {}").format(
                        click.style(",".join(unnest_in_table), fg="cyan"), click.style(name, fg="cyan")
                    )
                )
            only_in_table = [col for col in only if col in table]
            if only_in_table:
                click.echo(
                    _("Using only columns {} for table {}").format(
                        click.style(",".join(only_in_table), fg="cyan"), click.style(name, fg="cyan")
                    )
                )
            repeat_in_table = [col for col in repeat if col in table]
            if repeat_in_table:
                click.echo(
                    _("Repeating columns {} in all child tables of {}").format(
                        click.style(",".join(repeat_in_table), fg="cyan"), click.style(name, fg="cyan")
                    )
                )
            options["selection"][name]["only"] = only_in_table
            options["selection"][name]["repeat"] = repeat_in_table
            options["selection"][name]["unnest"] = unnest_in_table
    options = FlattenOptions(**options)
    flattener = FileFlattener(
        workdir,
        options,
        analyzer,
        csv=csv,
        xlsx=xlsx,
        language=language,
    )
    click.echo(
        _("Going to export tables: {}").format(click.style(",".join(flattener.flattener.tables.keys()), fg="magenta"))
    )
    click.echo(_("Processed tables:"))
    for table_name, table in flattener.flattener.tables.items():
        msg = _(" - {:30} => {} rows") if table.is_root else _(" ---- {:27} => {} rows")
        click.echo(msg.format(table_name, click.style(str(table.total_rows), fg="cyan")))
    click.echo(_("Flattening input file"))
    with click.progressbar(
        flattener.flatten_file(filename),
        length=analyzer.spec.total_items + 1,
        width=0,
        show_percent=True,
        show_pos=True,
    ) as bar:
        for count in bar:
            bar.label = FLATTENED_LABEL.format(click.style(str(count + 1), fg="cyan"))
    click.secho(
        _("Done flattening. Flattened objects: {}").format(click.style(str(count + 1), fg="red")),
        fg="green",
    )

assert "/tender/items/0/description" in tender assert "/tender/items/0/additionalClassifications/0/description" if len(items) > 1: assert "/tender/items/1/id" in tender assert "/tender/items/1/description" in tender @pytest.mark.parametrize( "options", [ FlattenOptions( **{ "selection": { "tenders": { "split": True }, "tenders_items": { "split": False } } }), FlattenOptions(**{"selection": { "tenders": { "split": True } }}), ], ) def test_flatten_should_split_with_child(spec, releases, options): releases[0]["tender"]["items"] = releases[0]["tender"]["items"] * 6 for _ in spec.process_items(releases):
def cli(
    filename,
    schema,
    selection,
    split,
    threshold,
    state_file,
    xlsx,
    csv,
    combine,
    unnest,
    unnest_file,
    only,
    only_file,
    repeat,
    repeat_file,
    count,
    human,
    language,
):
    """Spoonbill cli entry point"""
    click.echo(_("Detecting input file format"))
    # TODO: handle line separated json
    # TODO: handle single release/record
    (
        input_format,
        _is_concatenated,
        _is_array,
    ) = detect_format(filename)
    if csv:
        csv = pathlib.Path(csv).resolve()
        if not csv.exists():
            raise click.BadParameter(_("Desired location {} does not exist").format(csv))
    if xlsx:
        xlsx = pathlib.Path(xlsx).resolve()
        if not xlsx.parent.exists():
            raise click.BadParameter(_("Desired location {} does not exist").format(xlsx.parent))
    click.echo(_("Input file is {}").format(click.style(input_format, fg="green")))
    is_package = "package" in input_format
    combine_choice = combine if combine else ""
    if not is_package:
        # TODO: fix this
        click.echo("Single releases are not supported by now")
        return
    if schema:
        schema = resolve_file_uri(schema)
    if "release" in input_format:
        root_key = "releases"
        if not schema:
            click.echo(_("No schema provided, using version {}").format(click.style(CURRENT_SCHEMA_TAG, fg="cyan")))
            profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {})
            schema = profile.release_package_schema()
    else:
        root_key = "records"
        if not schema:
            click.echo(_("No schema provided, using version {}").format(click.style(CURRENT_SCHEMA_TAG, fg="cyan")))
            profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {})
            schema = profile.record_package_schema()
    title = schema.get("title", "").lower()
    if not title:
        raise ValueError(_("Incomplete schema, please make sure your data is correct"))
    if "package" in title:
        # TODO: is this a good way to get the release/record schema?
        schema = schema["properties"][root_key]["items"]
    path = pathlib.Path(filename)
    workdir = path.parent
    filename = path.name
    selection = selection or ROOT_TABLES.keys()
    combine = combine or COMBINED_TABLES.keys()
    root_tables = get_selected_tables(ROOT_TABLES, selection)
    combined_tables = get_selected_tables(COMBINED_TABLES, combine)
    if state_file:
        click.secho(_("Restoring from provided state file"), bold=True)
        analyzer = FileAnalyzer(workdir, state_file=state_file)
    else:
        click.secho(_("State file not supplied, going to analyze input file first"), bold=True)
        analyzer = FileAnalyzer(
            workdir,
            schema=schema,
            root_key=root_key,
            root_tables=root_tables,
            combined_tables=combined_tables,
            language=language,
            table_threshold=threshold,
        )
        click.echo(_("Analyze options:"))
        click.echo(_(" - table threshold => {}").format(click.style(str(threshold), fg="cyan")))
        click.echo(_(" - language => {}").format(click.style(language, fg="cyan")))
        click.echo(_("Processing file: {}").format(click.style(str(path), fg="cyan")))
        total = path.stat().st_size
        progress = 0
        # The progress bar does not show with small files
        # https://github.com/pallets/click/pull/1296/files
        with click.progressbar(width=0, show_percent=True, show_pos=True, length=total) as bar:
            for read, number in analyzer.analyze_file(filename, with_preview=True):
                bar.label = ANALYZED_LABEL.format(click.style(str(number), fg="cyan"))
                bar.update(read - progress)
                progress = read
        click.secho(
            _("Done processing. Analyzed objects: {}").format(click.style(str(number + 1), fg="red")),
            fg="green",
        )
        state_file = pathlib.Path(f"{filename}.state")
        state_file_path = workdir / state_file
        click.echo(
            _("Dumping analyzed data to '{}'").format(click.style(str(state_file_path.absolute()), fg="cyan"))
        )
        analyzer.dump_to_file(state_file)
    click.echo(_("Flattening file: {}").format(click.style(str(path), fg="cyan")))
    if unnest and unnest_file:
        raise click.UsageError(_("Conflicting options: unnest and unnest-file"))
    if repeat and repeat_file:
        raise click.UsageError(_("Conflicting options: repeat and repeat-file"))
    if only and only_file:
        raise click.UsageError(_("Conflicting options: only and only-file"))
    options = {"selection": {}, "count": count}
    unnest = read_option_file(unnest, unnest_file)
    repeat = read_option_file(repeat, repeat_file)
    only = read_option_file(only, only_file)
    for name in selection:
        table = analyzer.spec[name]
        if table.total_rows == 0:
            click.echo(_("Ignoring empty table {}").format(click.style(name, fg="red")))
            continue
        unnest = [col for col in unnest if col in table.combined_columns]
        if unnest:
            click.echo(
                _("Unnesting columns {} for table {}").format(
                    click.style(",".join(unnest), fg="cyan"), click.style(name, fg="cyan")
                )
            )
        only = [col for col in only if col in table]
        if only:
            click.echo(
                _("Using only columns {} for table {}").format(
                    click.style(",".join(only), fg="cyan"), click.style(name, fg="cyan")
                )
            )
        repeat = [col for col in repeat if col in table]
        if repeat:
            click.echo(
                _("Repeating columns {} in all child tables of {}").format(
                    click.style(",".join(repeat), fg="cyan"), click.style(name, fg="cyan")
                )
            )
        options["selection"][name] = {
            "split": split or analyzer.spec[name].should_split,
            "pretty_headers": human,
            "unnest": unnest,
            "only": only,
            "repeat": repeat,
        }
    options = FlattenOptions(**options)
    flattener = FileFlattener(
        workdir,
        options,
        analyzer.spec.tables,
        root_key=root_key,
        csv=csv,
        xlsx=xlsx,
        language=language,
    )
    all_tables = chain([table for table in flattener.flattener.tables.keys()], combine_choice)
    click.echo(_("Going to export tables: {}").format(click.style(",".join(all_tables), fg="magenta")))
    click.echo(_("Processed tables:"))
    for table in flattener.flattener.tables.keys():
        message = _("{}: {} rows").format(table, flattener.flattener.tables[table].total_rows)
        if not flattener.flattener.tables[table].is_root:
            message = "└-----" + message
        click.echo(message)
    click.echo(_("Flattening input file"))
    with click.progressbar(
        flattener.flatten_file(filename),
        length=analyzer.spec.total_items + 1,
        width=0,
        show_percent=True,
        show_pos=True,
    ) as bar:
        for count in bar:
            bar.label = FLATTENED_LABEL.format(click.style(str(count + 1), fg="cyan"))
    click.secho(
        _("Done flattening. Flattened objects: {}").format(click.style(str(count + 1), fg="red")),
        fg="green",
    )

def flatten_options():
    return FlattenOptions(
        **{
            "selection": {"tenders": {"split": True}, "parties": {"split": False}},
        }
    )

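# Usage sketch for the fixture above (assuming the spec_analyzed and releases
# fixtures used elsewhere in this suite): it plugs straight into Flattener.
#
#   flattener = Flattener(flatten_options, spec_analyzed.tables)
#   for count, flat in flattener.flatten(releases):
#       ...
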
def flatten_data(flatten_id, model=None, lang_code="en_US"):
    with internationalization(lang_code=lang_code):
        logger_context = {"FLATTEN_ID": flatten_id, "TASK": "flatten_data", "MODEL": model}
        channel_layer = get_channel_layer()
        if model not in getters:
            extra = {
                "MESSAGE_ID": "model_not_registered",
                "MODEL": model,
                "TASK": "flatten_data",
                "FLATTEN_ID": flatten_id,
            }
            logger.info("Model %s not registered in getters", model, extra=extra)
            return
        try:
            serializer = FlattenSerializer()
            flatten = Flatten.objects.get(id=flatten_id)
            selection = flatten.dataselection_set.all()[0]
            datasource = getattr(selection, f"{model.lower()}_set").all()[0]
            flatten.status = "processing"
            flatten.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
            spec = DataPreprocessor.restore(datasource.analyzed_file.path)
            total_rows = spec.total_items
            opt = get_flatten_options(selection)
            # When child tables are excluded, 'split' on the root table must be set to False for proper export.
            # TODO: There should be a better way to handle this (probably on the library side).
            if "exclude" in opt:
                for _table in opt["exclude"]:
                    _parent = spec.tables[_table].parent
                    if _parent != "" and _parent.name in opt["selection"]:
                        opt["selection"][_parent.name]["split"] = False
            logger.debug(
                "Generate options for export",
                extra={
                    "MESSAGE_ID": "generate_flatten_options",
                    "DATASOURCE_ID": str(datasource.id),
                    "MODEL": model,
                    "SELECTION_ID": str(selection.id),
                    "FLATTEN_ID": str(flatten.id),
                    "OPTIONS": opt,
                },
            )
            options = FlattenOptions(**opt)
            files = [file.file.path for file in datasource.files.all()]
            workdir = pathlib.Path(files[0]).parent
            formats = {"csv": None, "xlsx": None}
            if flatten.export_format == flatten.CSV:
                workdir = workdir / "export"
                if not workdir.exists():
                    os.makedirs(workdir)
                formats[flatten.export_format] = workdir
            else:
                formats[flatten.export_format] = "result.xlsx"
            flattener = FileFlattener(
                workdir,
                options,
                tables=spec.tables,
                pkg_type=datasource.root_key,
                multiple_values=getattr(spec, "multiple_values", False),
                schema=spec.schema,
                **formats,
            )
            timestamp = time.time()
            for count in flattener.flatten_file(files):
                # Throttle websocket progress updates to at most one per second
                if (time.time() - timestamp) <= 1:
                    continue
                async_to_sync(channel_layer.group_send)(
                    f"datasource_{datasource.id}",
                    {
                        "type": "task.flatten",
                        "flatten": {"id": str(flatten.id)},
                        "progress": {
                            "total_rows": total_rows,
                            "processed": count,
                            "percentage": (count / total_rows) * 100 if total_rows else total_rows,
                        },
                    },
                )
                timestamp = time.time()
            if flatten.export_format == flatten.CSV:
                target_file = f"{workdir}/{datasource.id}.zip"
                zip_files(workdir, target_file, extension="csv")
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = f"{datasource.id}.zip"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                os.remove(fd.name)
            else:
                target_file = f"{workdir}/result.xlsx"
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = "result.xlsx"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                os.remove(fd.name)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
        except ObjectDoesNotExist:
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_not_found"
            logger.info("Flatten %s for %s model not found", flatten_id, model, extra=extra)
        except OSError as e:
            extra = deepcopy(logger_context)
            extra.update(
                {"MESSAGE_ID": "flatten_no_left_space", "DATASOURCE_ID": str(datasource.id), "ERROR_MSG": str(e)}
            )
            logger.info("Flatten %s for %s model failed: %s", flatten_id, model, e, extra=extra)
            flatten.status = "failed"
            flatten.error = (
                _("Currently, the space limit was reached. Please try again later.")
                if "[Errno 28]" in str(e)
                else _("Something went wrong during processing of your file, please contact support")
            )
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
        except Exception as e:
            error_message = str(e)
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_failed"
            extra["ERROR_MESSAGE"] = error_message
            logger.error(
                "Flatten %s for %s datasource %s failed",
                flatten_id,
                model,
                datasource.id,
                extra=extra,
                exc_info=True,
            )
            flatten.status = "failed"
            flatten.error = error_message
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )