def test_transform_tnds_empty(asserts, xslt, mock_ids):
    """A document marked Modification="Delete" yields an empty <Data/> tree."""
    document = et.parse(TNDS_RAW)
    # Flag the whole document as deleted so the XSLT excludes everything
    document.getroot().set("Modification", "Delete")
    result = xslt_transform(document, xslt, region="Y", file="SVRYSBO120.xml")
    asserts.xml_elements_equal(result.getroot(), et.XML("<Data/>"))
def test_transform_tnds_wrong_mode(asserts, xslt, mock_ids):
    """Services with a non-bus mode (e.g. ferry) are excluded from output."""
    document = et.parse(TNDS_RAW)
    namespaces = {"t": document.xpath("namespace-uri()")}
    mode_element = document.xpath("//t:Service/t:Mode",
                                  namespaces=namespaces)[0]
    # Ferry services should be rejected by the stylesheet
    mode_element.text = "ferry"
    result = xslt_transform(document, xslt, region="Y", file="SVRYSBO120.xml")
    asserts.xml_elements_equal(result.getroot(), et.XML("<Data/>"))
def test_transform_tnds_missing_mode(asserts, xslt, mock_ids):
    """A service without a <Mode> element defaults to bus (mode 1)."""
    document = et.parse(TNDS_RAW)
    namespaces = {"t": document.xpath("namespace-uri()")}
    mode_element = document.xpath("//t:Service/t:Mode",
                                  namespaces=namespaces)[0]
    # Drop the mode element entirely; the XSLT should fall back to bus
    mode_element.getparent().remove(mode_element)
    result = xslt_transform(document, xslt, region="Y", file="SVRYSBO120.xml")
    assert result.xpath("/Data/Service/mode")[0].text == "1"
def test_transform_tnds(asserts, xslt, mock_ids, service_codes):
    """The unmodified TNDS fixture transforms into the expected document.

    Fix: removed a leftover debug ``print(et.tostring(...))`` statement that
    dumped the whole transformed tree on every run.
    """
    output = xslt_transform(et.parse(TNDS_RAW), xslt, region="Y",
                            file="SVRYSBO120A.xml")
    expected = et.parse(TNDS_OUT, parser=et.XMLParser(remove_blank_text=True))
    asserts.xml_elements_equal(output.getroot(), expected.getroot())
def populate_tnds_data(connection, path=None, delete=True, warn=False):
    """ Commits TNDS data to database.

    Fixes: ``exc_info=1`` replaced with ``exc_info=True``; the transformed
    result no longer rebinds the ``data`` dict being iterated, and the log
    path no longer shadows the ``path`` parameter.

    :param connection: Connection for population.
    :param path: Path for zip files with TNDS XML documents and named after
    region codes. Global expansion is supported - all unique files matching
    region codes will be used. The archives will be downloaded if this is
    None.
    :param delete: Truncate all data from TNDS tables before populating.
    :param warn: Log warning if no FTP credentials exist. If False an error
    will be raised instead.
    """
    archives = _get_archives(connection, path, warn)
    if archives is None:
        return

    # Check if operators exist first
    operators_exist = connection.execute(
        db.exists(db.select([models.Operator.code])).select()).scalar()
    if not operators_exist:
        raise ValueError(
            "No operators were found. The TNDS dataset requires the database "
            "to be populated from NOC data first.")

    row_ids = setup_row_ids(connection, check_existing=not delete)
    setup_stop_exists(connection)
    setup_service_codes()

    # We don't want to delete any NOC data if they have been added
    excluded = models.Operator, models.LocalOperator
    metadata = utils.reflect_metadata(connection)
    with open_binary("nextbus.populate", "tnds.xslt") as file_:
        xslt = et.XSLT(et.parse(file_))

    del_ = delete
    for region, archive in archives.items():
        for file_ in file_ops.iter_archive(archive):
            # Path used for logging only, e.g. "Y.zip/SVRYSBO120.xml"
            log_path = os.path.join(os.path.basename(archive), file_.name)
            utils.logger.info(f"Parsing file {log_path!r}")
            try:
                transformed = utils.xslt_transform(file_, xslt, region=region,
                                                   file=file_.name)
            except RowIdError:
                # IDs do not match in XML file; log error and move on
                utils.logger.error(f"Invalid IDs in file {log_path!r}",
                                   exc_info=True)
            else:
                utils.populate_database(connection,
                                        utils.collect_xml_data(transformed),
                                        metadata=metadata,
                                        delete=del_,
                                        exclude=excluded)
                # Reset per-file row IDs and only truncate tables once,
                # before the first successful batch
                row_ids.clear()
                del_ = False
def test_transform_alt_description(asserts, xslt, mock_ids, service_codes):
    """With an empty <Description>, origin/destination from the standard
    service are used instead and the output matches the reference document.

    Fix: the original cleared the description on the parsed ``data`` tree but
    then transformed a freshly parsed copy of ``TNDS_RAW``, so the
    modification had no effect and the alt-description path was never tested.
    The transform now uses the modified tree.
    """
    data = et.parse(TNDS_RAW)
    ns = {"txc": data.xpath("namespace-uri()")}
    description = data.xpath(
        "/txc:TransXChange/txc:Services/txc:Service/txc:Description",
        namespaces=ns)[0]
    # Clear description text, output should be same as origin/destination from
    # standard service will be used instead
    description.text = ""
    output = xslt_transform(data, xslt, region="Y", file="SVRYSBO120A.xml")
    expected = et.parse(TNDS_OUT, parser=et.XMLParser(remove_blank_text=True))
    asserts.xml_elements_equal(output.getroot(), expected.getroot())
def populate_noc_data(connection, path=None):
    """ Convert NOC data (service operators) to database objects and commit
    them to the application database.

    Fix: the final "can be deleted" notice tested ``file_path is None``,
    which is never true at that point (``file_path`` is always assigned);
    it now checks ``path is None``, i.e. whether the file was downloaded.

    :param connection: Connection for population.
    :param path: Path to raw data in XML form
    """
    temp = current_app.config.get("TEMP_DIRECTORY")
    if not temp:
        raise ValueError("TEMP_DIRECTORY is not defined.")

    if path is None:
        file_path = file_ops.download(NOC_URL, directory=temp)
    else:
        file_path = path

    utils.logger.info(f"Opening NOC XML file {file_path!r}")
    try:
        data = et.parse(file_path)
    except (UnicodeDecodeError, et.XMLSyntaxError):
        # NOC data is encoded in Windows-1252 for some reason despite the XML
        # declaration specifying UTF-8 encoding
        utils.logger.warning(
            f"NOC XML file {file_path!r} cannot be parsed with UTF-8 - trying "
            f"again with CP1252"
        )
        data = et.parse(file_path, et.XMLParser(encoding="CP1252"))

    with open_binary("nextbus.populate", "noc.xslt") as file_:
        xslt = et.XSLT(et.parse(file_))

    utils.populate_database(
        connection,
        utils.collect_xml_data(utils.xslt_transform(data, xslt)),
        delete=True
    )

    if path is None:
        utils.logger.info(f"New file {file_path!r} downloaded; can be deleted")
    utils.logger.info("NOC population done")
def populate_nptg_data(connection, archive=None, list_files=None):
    """ Convert NPTG data (regions admin areas, districts and localities) to
    database objects and commit them to the application database.

    Fix: dropped the unused ``enumerate`` index from the file loop.

    :param connection: Connection & transaction for population
    :param archive: Path to zipped archive file for NPTG XML files.
    :param list_files: List of file paths for NPTG XML files.
    """
    temp = current_app.config.get("TEMP_DIRECTORY")
    if not temp:
        raise ValueError("TEMP_DIRECTORY is not defined.")

    if archive is not None and list_files is not None:
        raise ValueError("Can't specify both archive file and list of files.")
    elif archive is not None:
        iter_files = file_ops.iter_archive(archive)
    elif list_files is not None:
        iter_files = iter(list_files)
    else:
        downloaded = file_ops.download(NPTG_URL, directory=temp,
                                       params={"format": "xml"})
        iter_files = file_ops.iter_archive(downloaded)

    metadata = utils.reflect_metadata(connection)
    with open_binary("nextbus.populate", "nptg.xslt") as file_:
        xslt = et.XSLT(et.parse(file_))

    # Only the first file truncates the existing tables; later files append
    deleted = False
    for file_ in iter_files:
        # Archive members have a ``name`` attribute; plain paths do not
        file_name = file_.name if hasattr(file_, "name") else file_
        utils.logger.info(f"Parsing file {file_name!r}")
        utils.populate_database(
            connection,
            utils.collect_xml_data(utils.xslt_transform(file_, xslt)),
            metadata=metadata,
            delete=not deleted
        )
        deleted = True
def test_update_tnds_data(load_db):
    """Overwriting existing route data for a service with a newer TNDS file
    populates the expected service, pattern, journey and special-period rows.
    """
    with open_binary("nextbus.populate", "tnds.xslt") as file_:
        xslt = et.XSLT(et.parse(file_))
    setup_service_codes()

    # All relevant data already exists for Dagenham Sunday market shuttle;
    # just overwrite route data using a newer file
    file_name = "66-DSM-_-y05-1"
    with db.engine.begin() as connection:
        setup_stop_exists(connection)
        setup_row_ids(connection, check_existing=False)
        transformed = xslt_transform(TNDS_DSM, xslt, region="L",
                                     file=file_name)
        collected = collect_xml_data(transformed)
        populate_database(connection, collected, delete=True,
                          exclude=(models.Operator, models.LocalOperator))

    assert _as_dict(models.Service.query.one()) == {
        "id": 1,
        "code": "dagenham-sunday-market-shuttle",
        "line": "Dagenham Sunday Market Shuttle",
        "description": "Barking – Dagenham Sunday Market",
        "short_description": "Barking – Dagenham Sunday Market",
        "mode": 1,
        "filename": file_name
    }

    patterns = (models.JourneyPattern.query
                .order_by(models.JourneyPattern.id)
                .all())
    assert len(patterns) == 2
    assert _as_dict(patterns[0]) == {
        "id": 1,
        "origin": "Barking Station",
        "destination": "Dagenham Sunday Market",
        "service_ref": 1,
        "direction": False,
        "date_start": datetime.date(2019, 12, 8),
        "date_end": datetime.date(2020, 5, 31),
        "local_operator_ref": "ATC",
        "region_ref": "L",
    }

    journeys = models.Journey.query.order_by(models.Journey.id).all()
    assert len(journeys) == 26
    assert _as_dict(journeys[0]) == {
        "id": 1,
        "pattern_ref": 1,
        "start_run": None,
        "end_run": None,
        "departure": datetime.time(8, 30),
        "days": 0b10000000,
        "weeks": None,
        "include_holidays": 0b0000010001010010,
        "exclude_holidays": 0b0000001000101000,
        "note_code": None,
        "note_text": None,
    }

    special_days = (models.SpecialPeriod.query
                    .order_by(models.SpecialPeriod.id)
                    .all())
    assert len(special_days) == 26
    assert _as_dict(special_days[0]) == {
        "id": 1,
        "journey_ref": 1,
        "date_start": datetime.date(2020, 5, 8),
        "date_end": datetime.date(2020, 5, 8),
        "operational": True,
    }
def populate_naptan_data(connection, archive=None, list_files=None,
                         split=True):
    """ Convert NaPTAN data (stop points and areas) to database objects and
    commit them to the application database.

    Fixes: rejoined a comment broken across source lines and dropped the
    unused ``enumerate`` index from the file loop.

    :param connection: Connection for population
    :param archive: Path to zipped archive file for NaPTAN XML files.
    :param list_files: List of file paths for NaPTAN XML files.
    :param split: Splits NaPTAN XML files in archive by admin area code. Has
    no effect if list_files is used.
    """
    # Get complete list of ATCO admin areas and localities from NPTG data
    query_area = connection.execute(db.select([models.AdminArea.code]))
    query_local = connection.execute(db.select([models.Locality.code]))
    areas = [a[0] for a in query_area]
    localities = [local[0] for local in query_local]

    if not areas or not localities:
        raise ValueError("NPTG tables are not populated; stop point data "
                         "cannot be added without the required locality data. "
                         "Populate the database with NPTG data first.")

    temp = current_app.config.get("TEMP_DIRECTORY")
    if not temp:
        raise ValueError("TEMP_DIRECTORY is not defined.")

    if archive is not None and list_files is not None:
        raise ValueError("Can't specify both archive file and list of files.")
    elif archive is not None:
        path = archive
    elif list_files is not None:
        path = None
    else:
        downloaded = file_ops.download(NAPTAN_URL, directory=temp,
                                       params={"dataFormat": "XML"})
        utils.logger.info(f"Zipping {downloaded!r}")
        # The downloaded file is not zipped. Move it into an archive
        path = os.path.join(temp, "NaPTAN.zip")
        with zipfile.ZipFile(path, "w",
                             compression=zipfile.ZIP_DEFLATED) as zf:
            zf.write(downloaded)
        os.remove(downloaded)

    if path is not None and split:
        split_path = os.path.join(temp, "NaPTAN_split.zip")
        _split_naptan_data(areas, path, split_path)
        path = split_path

    if path is not None:
        iter_files = file_ops.iter_archive(path)
    else:
        iter_files = iter(list_files)

    # Go through data and create objects for committing to database
    _setup_naptan_functions()

    metadata = utils.reflect_metadata(connection)
    with open_binary("nextbus.populate", "naptan.xslt") as file_:
        xslt = et.XSLT(et.parse(file_))

    # Only the first file truncates the existing tables; later files append
    deleted = False
    for file_ in iter_files:
        # Archive members have a ``name`` attribute; plain paths do not
        file_name = file_.name if hasattr(file_, "name") else file_
        utils.logger.info(f"Parsing file {file_name!r}")
        utils.populate_database(
            connection,
            utils.collect_xml_data(utils.xslt_transform(file_, xslt)),
            metadata=metadata,
            delete=not deleted
        )
        deleted = True
def test_naptan_transform_all(asserts):
    """Transforming the full NaPTAN fixture matches the reference output."""
    _setup_naptan_functions()
    transformed = xslt_transform(NAPTAN_RAW, naptan_xslt())
    reference = et.parse(NAPTAN_ALL, PARSER)
    asserts.xml_elements_equal(transformed.getroot(), reference.getroot())
def test_nptg_transform_all(asserts):
    """Transforming the full NPTG fixture matches the reference output."""
    transformed = xslt_transform(NPTG_RAW, nptg_xslt())
    reference = et.parse(NPTG_ALL, et.XMLParser(remove_blank_text=True))
    asserts.xml_elements_equal(transformed.getroot(), reference.getroot())