def process_main_language_code_for_gtfs_metadata(gtfs_representation):
    """Process the main language code using the`agency` file from the GTFS dataset of the representation.
    Add the main language code to the representation metadata once processed.
    :param gtfs_representation: The representation of the GTFS dataset to process.
    :return: The representation of the GTFS dataset post-execution.
    """
    validate_gtfs_representation(gtfs_representation)
    dataset = gtfs_representation.dataset
    metadata = gtfs_representation.metadata

    # Agency must be present AND not empty because we are accessing the first index
    agency_is_present = (
        dataset.agency is not None
        and AGENCY_LANG in dataset.agency.columns
        and not dataset.agency.empty
    )

    if agency_is_present:
        # Extract the main language code from the first row in the dataset agency
        main_language_code = dataset.agency[AGENCY_LANG].iloc[AGENCY_LANG_IDX]

        # Set the main language code in the GTFS representation
        # if it is a non empty string
        if isinstance(main_language_code, str) and len(main_language_code) != 0:
            metadata.main_language_code = main_language_code

    return gtfs_representation
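
# Illustrative sketch (not part of the original module): how the first-row lookup
# above behaves on a minimal pandas frame. The column name "agency_lang" and the
# index 0 stand in for the AGENCY_LANG / AGENCY_LANG_IDX constants, whose exact
# values are assumptions here.
def _example_main_language_code():
    import pandas as pd

    agency = pd.DataFrame({"agency_lang": ["en", "fr"]})
    # .iloc[0] picks the first agency row, mirroring the function above
    return agency["agency_lang"].iloc[0]  # -> "en"
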
def process_timezones_for_gtfs_metadata(gtfs_representation):
    """Process all the timezones using the `stops` and the `agency` files from the GTFS dataset of the representation.
    Add the list of all the timezones to the representation metadata once processed.
    :param gtfs_representation: The representation of the GTFS dataset to process.
    :return: The representation of the GTFS dataset post-execution.
    """
    validate_gtfs_representation(gtfs_representation)
    dataset = gtfs_representation.dataset
    metadata = gtfs_representation.metadata

    # Agency must be present AND not empty because we are accessing the first index
    agency_is_present = (dataset.agency is not None
                         and AGENCY_TIMEZONE in dataset.agency.columns
                         and not dataset.agency.empty)
    stops_are_present = (dataset.stops is not None
                         and STOP_TIMEZONE in dataset.stops.columns)

    if agency_is_present or stops_are_present:
        if agency_is_present:
            # Extract main timezone
            main_timezone = dataset.agency[AGENCY_TIMEZONE].iloc[
                AGENCY_TIMEZONE_IDX]
        else:
            main_timezone = ""

        stop_timezones = set()
        if stops_are_present:
            # Extract the timezones using the stop_timezone in the dataset stops
            for index, row in dataset.stops.iterrows():
                # Keep the stop timezone only if the value exists, is a string and is not empty
                if isinstance(row[STOP_TIMEZONE],
                              str) and len(row[STOP_TIMEZONE]) != 0:
                    stop_timezones.add(row[STOP_TIMEZONE])

        # Remove the main_timezone from the set of the stop_timezones
        # to create the other_timezones
        other_timezones = set()
        if len(stop_timezones) != 0:
            other_timezones.update(stop_timezones)
            other_timezones.discard(main_timezone)
            # Convert the set of timezones to a list and sort it alphabetically
            other_timezones = sorted(list(other_timezones))

        # Set the main timezone in the GTFS representation
        # if it is a non empty string
        if isinstance(main_timezone, str) and len(main_timezone) != 0:
            metadata.main_timezone = main_timezone

        # Set the other timezones in the GTFS representation
        # if the list is not empty
        if len(other_timezones) != 0:
            metadata.other_timezones = other_timezones

    return gtfs_representation
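
# Illustrative sketch (not part of the original module): the set logic above reduces
# the stop_timezone values to the ones that differ from the agency timezone. The
# column name and sample values are assumptions mirroring the constants used above.
def _example_other_timezones():
    import pandas as pd

    agency_timezone = "America/Toronto"
    stops = pd.DataFrame(
        {"stop_timezone": ["America/Toronto", "America/Montreal", None, ""]}
    )
    # Keep only non-empty string timezones, as the function above does
    stop_timezones = {
        value for value in stops["stop_timezone"] if isinstance(value, str) and value
    }
    other_timezones = sorted(stop_timezones - {agency_timezone})
    return agency_timezone, other_timezones  # -> ("America/Toronto", ["America/Montreal"])
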
def process_stops_count_by_type_for_gtfs_metadata(gtfs_representation):
    """Process and count by type all the stops in the `stops` file from the GTFS dataset of the representation.
    Add the dictionary of the stops count to the representation metadata once processed.
    :param gtfs_representation: The representation of the GTFS dataset to process.
    :return: The representation of the GTFS dataset post-execution.
    """
    validate_gtfs_representation(gtfs_representation)
    dataset = gtfs_representation.dataset
    metadata = gtfs_representation.metadata

    stops_are_present = (dataset.stops is not None
                         and LOCATION_TYPE in dataset.stops.columns)

    if stops_are_present:
        # Transform the blank location types to 0
        # According to the GTFS specification, blank location type is a Stop
        dataset.stops[LOCATION_TYPE] = dataset.stops[LOCATION_TYPE].fillna(0)

        # Count stops by location type
        # Generic Node (3) and Boarding Area (4) are not considered
        # because they relate to an existing Stop or Station.
        stops_count = (dataset.stops[LOCATION_TYPE].loc[
            dataset.stops[LOCATION_TYPE] == STOP].size)
        stations_count = (dataset.stops[LOCATION_TYPE].loc[
            dataset.stops[LOCATION_TYPE] == STATION].size)
        entrances_count = (dataset.stops[LOCATION_TYPE].loc[
            dataset.stops[LOCATION_TYPE] == ENTRANCE].size)

        # Create the dictionary of stops count by type
        stops_count_by_type = {
            STOP_KEY: stops_count,
            STATION_KEY: stations_count,
            ENTRANCE_KEY: entrances_count,
        }

        # Clean the dictionary to keep only the stop type
        # where the stop count is one or more
        stops_count_by_type = {
            key: count
            for key, count in stops_count_by_type.items() if count > 0
        }

        # Set the stops count by type in the GTFS representation
        # if the dictionary is not empty
        if len(stops_count_by_type) != 0:
            metadata.stops_count_by_type = stops_count_by_type

    return gtfs_representation
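
# Illustrative sketch (not part of the original module): counting rows per
# location_type with a boolean mask, as done above. The numeric codes
# (0 = stop, 1 = station, 2 = entrance) come from the GTFS specification; the
# dictionary keys are assumptions standing in for STOP_KEY, STATION_KEY, ENTRANCE_KEY.
def _example_stops_count_by_type():
    import pandas as pd

    stops = pd.DataFrame({"location_type": [None, 0, 0, 1, 2, 4]})
    # Blank location types count as stops, per the GTFS specification
    location_type = stops["location_type"].fillna(0)
    counts = {
        "stops": (location_type == 0).sum(),
        "stations": (location_type == 1).sum(),
        "entrances": (location_type == 2).sum(),
    }
    # Keep only the types with at least one occurrence
    return {key: int(count) for key, count in counts.items() if count > 0}
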
def process_geographical_boundaries_for_gtfs_metadata(
    gtfs_representation, geo_boundaries_map
):
    """Process the geographical boundaries, according to the `geo_boundaries_map`,
    using the `stops` file from the GTFS dataset of the representation.
    Add the geographical boundaries to the representation metadata once processed.
    :param gtfs_representation: The representation of the GTFS dataset to process.
    :param geo_boundaries_map: Either BOUNDING_BOX_MAP or BOUNDING_OCTAGON_MAP.
    :return: The representation of the GTFS dataset post-execution.
    """
    validate_gtfs_representation(gtfs_representation)
    dataset = gtfs_representation.dataset
    metadata = gtfs_representation.metadata

    stops_required_columns = {STOP_LAT, STOP_LON}
    stops_are_present = dataset.stops is not None and stops_required_columns.issubset(
        dataset.stops.columns
    )

    if stops_are_present:
        # Extract the box corners coordinates in the dataset representation and
        # Order the corners inside a bounding box
        # The order is clockwise, from the South East to the North East corner
        # or
        # Extract the octagon corners coordinates in the dataset representation and
        # Order the corners inside a bounding octagon
        # The order is clockwise, from the right bottom to the right top corner
        # Documentation about dictionary comprehension can be found at:
        # https://docs.python.org/3/tutorial/datastructures.html
        geo_boundaries = {
            f"{index+1}": corner
            for index, corner in enumerate(
                geo_boundaries_map[GEO_BOUNDARIES_UTILS](dataset)
            )
        }

        # Set the bounding box in the GTFS representation
        # or
        # Set the bounding octagon in the GTFS representation
        # if the dictionary is of the correct size
        if len(geo_boundaries) == geo_boundaries_map[GEO_BOUNDARIES_SIZE]:
            setattr(metadata, geo_boundaries_map[GEO_BOUNDARIES_ATTR], geo_boundaries)

    return gtfs_representation
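
# Illustrative sketch (not part of the original module): how the dictionary
# comprehension above turns an ordered sequence of corners into 1-based string keys.
# The corner values here are made up; the real corners come from the
# geo_boundaries_map utility applied to the stops coordinates.
def _example_geo_boundaries_keys():
    corners = [(45.0, -74.0), (45.0, -73.0), (46.0, -73.0), (46.0, -74.0)]
    # Keys are "1", "2", ... in the clockwise order produced by the utility
    return {f"{index + 1}": corner for index, corner in enumerate(corners)}
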
def process_country_codes_for_gtfs_metadata(gtfs_representation):
    """Process the country codes of a GTFS dataset using the latitude and longitude pairs
    from `stops` file from the GTFS dataset of the representation.
    Add the country codes to the representation metadata once processed.
    :param gtfs_representation: The representation of the GTFS dataset to process.
    :return: The representation of the GTFS dataset post-execution.
    """
    validate_gtfs_representation(gtfs_representation)
    dataset = gtfs_representation.dataset
    metadata = gtfs_representation.metadata

    stops_required_columns = {STOP_LAT, STOP_LON}
    stops_are_present = dataset.stops is not None and stops_required_columns.issubset(
        dataset.stops.columns)

    # Make sure latitude and longitude columns are present in stops.txt before execution
    if stops_are_present:
        # Initialize the country codes set
        country_codes = set()

        # Zip the latitude and longitude pairs in stops.txt
        coordinates = [(lat, lon)
                       for lat, lon in zip(dataset.stops[STOP_LAT].tolist(),
                                           dataset.stops[STOP_LON].tolist())]

        # Compute the country codes from every latitude and longitude pair in stops.txt
        if len(coordinates) != 0:
            infos = rg.search(coordinates)
            for info in infos:
                country_code = info.get(RG_COUNTRY_CODE_KEY, None)
                # Keep the country code only if the value is not None and is not empty
                if country_code is not None and len(country_code) != 0:
                    country_codes.add(country_code)

        # Convert the country codes set to a list, and sort it alphabetically
        country_codes = sorted(list(country_codes))

        # Set the country codes in the GTFS representation
        # if the list is not empty
        if len(country_codes) != 0:
            metadata.country_codes = country_codes

    return gtfs_representation
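
# Illustrative sketch (not part of the original module): reverse_geocoder returns one
# record per (lat, lon) pair; the country code is read from each record, deduplicated
# with a set and sorted. The "cc" key is an assumption standing in for
# RG_COUNTRY_CODE_KEY, and the coordinates below are hypothetical.
def _example_country_codes(coordinates):
    import reverse_geocoder as rg

    country_codes = set()
    for info in rg.search(coordinates):
        country_code = info.get("cc")
        # Keep the country code only if it is a non-empty value
        if country_code:
            country_codes.add(country_code)
    return sorted(country_codes)


# Example call (hypothetical values): _example_country_codes([(45.5, -73.6), (43.7, -79.4)])
# would be expected to return ["CA"].
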
def process_agencies_count_for_gtfs_metadata(gtfs_representation):
    """Process and count all the agencies in the `agency` file from the GTFS dataset of the representation.
    Add the agencies count to the representation metadata once processed.
    :param gtfs_representation: The representation of the GTFS dataset to process.
    :return: The representation of the GTFS dataset post-execution.
    """
    validate_gtfs_representation(gtfs_representation)
    dataset = gtfs_representation.dataset
    metadata = gtfs_representation.metadata

    agency_is_present = (dataset.agency is not None
                         and AGENCY_NAME in dataset.agency.columns)

    if agency_is_present:
        # Count agencies
        agencies_count = dataset.agency[AGENCY_NAME].size

        # Set the agencies count in the GTFS representation
        # if there is one agency or more
        if agencies_count > 0:
            metadata.agencies_count = agencies_count

    return gtfs_representation
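
# Illustrative sketch (not part of the original module): the agencies count above is
# simply the number of values in the agency_name column. The column name is an
# assumption mirroring the AGENCY_NAME constant.
def _example_agencies_count():
    import pandas as pd

    agency = pd.DataFrame({"agency_name": ["Agency A", "Agency B"]})
    return agency["agency_name"].size  # -> 2
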
def process_routes_count_by_type_for_gtfs_metadata(gtfs_representation):
    """Process and count by type all the routes in the `routes` file from the GTFS dataset of the representation.
    Add the dictionary of the routes count to the representation metadata once processed.
    :param gtfs_representation: The representation of the GTFS dataset to process.
    :return: The representation of the GTFS dataset post-execution.
    """
    validate_gtfs_representation(gtfs_representation)
    dataset = gtfs_representation.dataset
    metadata = gtfs_representation.metadata

    routes_are_present = (
        dataset.routes is not None and ROUTE_TYPE in dataset.routes.columns
    )

    if routes_are_present:
        # Count routes by route type
        trams_count = (
            dataset.routes[ROUTE_TYPE].loc[dataset.routes[ROUTE_TYPE] == TRAM].size
        )
        subways_count = (
            dataset.routes[ROUTE_TYPE].loc[dataset.routes[ROUTE_TYPE] == SUBWAY].size
        )
        rails_count = (
            dataset.routes[ROUTE_TYPE].loc[dataset.routes[ROUTE_TYPE] == RAIL].size
        )
        buses_count = (
            dataset.routes[ROUTE_TYPE].loc[dataset.routes[ROUTE_TYPE] == BUS].size
        )
        ferries_count = (
            dataset.routes[ROUTE_TYPE].loc[dataset.routes[ROUTE_TYPE] == FERRY].size
        )
        cable_trams_count = (
            dataset.routes[ROUTE_TYPE]
            .loc[dataset.routes[ROUTE_TYPE] == CABLE_TRAM]
            .size
        )
        aerial_lifts_count = (
            dataset.routes[ROUTE_TYPE]
            .loc[dataset.routes[ROUTE_TYPE] == AERIAL_LIFT]
            .size
        )
        funiculars_count = (
            dataset.routes[ROUTE_TYPE].loc[dataset.routes[ROUTE_TYPE] == FUNICULAR].size
        )
        trolley_buses_count = (
            dataset.routes[ROUTE_TYPE]
            .loc[dataset.routes[ROUTE_TYPE] == TROLLEY_BUS]
            .size
        )
        monorails_count = (
            dataset.routes[ROUTE_TYPE].loc[dataset.routes[ROUTE_TYPE] == MONORAIL].size
        )

        # Create the dictionary of routes count by type
        routes_count_by_type = {
            os.environ[TRAM_CODE]: trams_count,
            os.environ[SUBWAY_CODE]: subways_count,
            os.environ[RAIL_CODE]: rails_count,
            os.environ[BUS_CODE]: buses_count,
            os.environ[FERRY_CODE]: ferries_count,
            os.environ[CABLE_TRAM_CODE]: cable_trams_count,
            os.environ[AERIAL_LIFT_CODE]: aerial_lifts_count,
            os.environ[FUNICULAR_CODE]: funiculars_count,
            os.environ[TROLLEY_BUS_CODE]: trolley_buses_count,
            os.environ[MONORAIL_CODE]: monorails_count,
        }

        # Clean the dictionary to keep only the route type
        # where the route count is one or more
        routes_count_by_type = {
            key: count for key, count in routes_count_by_type.items() if count > 0
        }

        # Set the routes count by type in the GTFS representation
        # if the dictionary is not empty
        if len(routes_count_by_type) != 0:
            metadata.routes_count_by_type = routes_count_by_type

    return gtfs_representation
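
# Illustrative sketch (not part of the original module): value_counts() is a compact
# way to get the same per-route-type tallies as the repeated boolean masks above. The
# mapping from numeric route_type to a label is an assumption here; the original
# resolves the dictionary keys through os.environ route-type codes.
def _example_routes_count_by_type():
    import pandas as pd

    routes = pd.DataFrame({"route_type": [3, 3, 3, 1, 0]})
    labels = {0: "tram", 1: "subway", 3: "bus"}  # subset of the GTFS route types
    counts = routes["route_type"].value_counts()
    # Keep only the route types with at least one route
    return {labels[code]: int(count) for code, count in counts.items() if count > 0}
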
def process_timestamp_for_gtfs_metadata(gtfs_representation, timestamp_map):
    """Process the start/end timestamp using the `agency`,
    `calendar`, `calendar_dates`, `trips` and `stop_times` files from the GTFS dataset
    of the representation, depending on which timestamp_map it receives.
    Add the start/end timestamp to the representation metadata once processed.
    :param gtfs_representation: The representation of the GTFS dataset to process.
    :param timestamp_map: Either START_TIMESTAMP_MAP or END_TIMESTAMP_MAP.
    :return: The representation of the GTFS dataset post-execution.
    """
    validate_gtfs_representation(gtfs_representation)
    dataset = gtfs_representation.dataset
    metadata = gtfs_representation.metadata

    calendar_required_columns = CALENDAR_REQUIRED_COLUMNS.union(
        {timestamp_map[CALENDAR_DATE_KEY]})
    stop_times_required_columns = {TRIP_ID, timestamp_map[STOP_TIME_KEY]}

    calendar_is_present = (dataset.calendar is not None
                           and calendar_required_columns.issubset(
                               dataset.calendar.columns))
    calendar_dates_are_present = (dataset.calendar_dates is not None
                                  and CALENDAR_DATES_REQUIRED_COLUMNS.issubset(
                                      dataset.calendar_dates.columns))
    trips_are_present = (dataset.trips is not None
                         and SERVICE_ID in dataset.trips.columns)
    stop_times_are_present = (dataset.stop_times is not None
                              and stop_times_required_columns.issubset(
                                  dataset.stop_times.columns))
    agency_is_present = (dataset.agency is not None
                         and AGENCY_TIMEZONE in dataset.agency.columns)

    if ((calendar_is_present or calendar_dates_are_present)
            and trips_are_present and stop_times_are_present
            and agency_is_present):
        # Extract the start dates in the dataset representation
        # or
        # Extract the end dates in the dataset representation
        dataset_dates = get_gtfs_dates_by_type(
            dataset, date_type=timestamp_map[DATASET_DATE_TYPE])
        dates = pd.to_datetime(dataset_dates[DATE_KEY], format=PD_DATE_FORMAT)

        # Get first start service date with min()
        # or
        # Get last end service date with max()
        service_date = getattr(dates, timestamp_map[MIN_MAX_ATTR])()

        # Continue only if the dataset dates are not an empty dataframe and the service date found is not null
        if not dataset_dates.empty and pd.notna(service_date):
            # Get every stop time of the dataset for the start service date
            # or
            # Get every stop time of the dataset for the end service date
            stop_times_for_date = get_gtfs_stop_times_for_date(
                dataset, dataset_dates, service_date,
                timestamp_map[STOP_TIME_KEY])

            # Get first arrival time of the first start service date with min()
            # or
            # Get last departure time of the last end service date with max()
            stop_time = getattr(
                stop_times_for_date[timestamp_map[STOP_TIME_KEY]],
                timestamp_map[MIN_MAX_ATTR],
            )()

            # Continue only if the stop time found is not null
            if pd.notna(stop_time):
                # Compute UTC offset for the GTFS dataset
                timezone_offset = get_gtfs_timezone_utc_offset(dataset)

                # Continue if the timezone offset is not empty
                if len(timezone_offset) != 0:
                    # Build and set timestamp string in ISO 8601 YYYY-MM-DDThh:mm:ss±hh:mm format
                    timestamp = f"{service_date.strftime(TIMESTAMP_FORMAT)}T{stop_time}{timezone_offset}"
                    # Set timestamp if the string is not empty
                    if len(timestamp) != 0:
                        setattr(metadata, timestamp_map[TIMESTAMP_ATTR],
                                timestamp)

    return gtfs_representation
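
# Illustrative sketch (not part of the original module): how a service date, a stop
# time string and a UTC offset are combined into the ISO 8601-style timestamp built
# above. The format string and sample values are assumptions mirroring
# TIMESTAMP_FORMAT and the offset returned by the timezone helper.
def _example_build_timestamp():
    import pandas as pd

    service_date = pd.Timestamp("2023-06-01")
    stop_time = "08:15:00"       # first arrival_time or last departure_time
    timezone_offset = "-04:00"   # UTC offset of the agency timezone
    return f"{service_date.strftime('%Y-%m-%d')}T{stop_time}{timezone_offset}"
    # -> "2023-06-01T08:15:00-04:00"
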
def process_service_date_for_gtfs_metadata(gtfs_representation,
                                           service_date_map):
    """Execute the ``ProcessStartServiceDateForGtfsMetadata`` or ``ProcessEndServiceDateForGtfsMetadata`` use case
    depending on which service_date it receives.
    Process the start service date using the `feed_info`, `calendar` and `calendar_dates` files
    from the GTFS dataset of the representation.
    Add the start service date to the representation metadata once processed.
    :param gtfs_representation: The representation of the GTFS dataset to process.
    :param service_date_map: Either START_DATE_MAP or END_DATE_MAP.
    :return: The representation of the GTFS dataset post-execution.
    """
    validate_gtfs_representation(gtfs_representation)
    dataset = gtfs_representation.dataset
    metadata = gtfs_representation.metadata

    calendar_required_columns = CALENDAR_REQUIRED_COLUMNS.union(
        {service_date_map[CALENDAR_DATE_KEY]})

    feed_info_is_present = (
        dataset.feed_info is not None
        and service_date_map[FEED_DATE_KEY] in dataset.feed_info.columns
        and not dataset.feed_info[
            service_date_map[FEED_DATE_KEY]].isnull().values.all())
    calendar_is_present = (dataset.calendar is not None
                           and calendar_required_columns.issubset(
                               dataset.calendar.columns))
    calendar_dates_are_present = (dataset.calendar_dates is not None
                                  and CALENDAR_DATES_REQUIRED_COLUMNS.issubset(
                                      dataset.calendar_dates.columns))

    if feed_info_is_present or calendar_is_present or calendar_dates_are_present:
        if feed_info_is_present:
            # Extract start service date from feed info if the file is provided
            # or
            # Extract end service date from feed info if the file is provided
            feed_dates = dataset.feed_info[service_date_map[FEED_DATE_KEY]]
            filtered_feed_info = dataset.feed_info.loc[feed_dates.notnull()]
            dates = pd.to_datetime(
                filtered_feed_info[service_date_map[FEED_DATE_KEY]],
                format=PD_DATE_FORMAT,
            )
        else:
            # Extract the start dates in the dataset representation
            # or
            # Extract the end dates in the dataset representation
            dataset_dates = get_gtfs_dates_by_type(
                dataset, date_type=service_date_map[DATASET_DATE_TYPE])
            dates = pd.to_datetime(dataset_dates[DATE_KEY],
                                   format=PD_DATE_FORMAT)

        # Get the first start service date with min() and convert the date into an ISO 8601 string
        # or
        # Get the last end service date with max() and convert the date into an ISO 8601 string
        service_date = getattr(dates, service_date_map[MIN_MAX_ATTR])()
        if pd.notna(service_date):
            service_date = service_date.strftime(SERVICE_DATE_FORMAT)

            # Set the start service date in the GTFS representation
            # or
            # Set the end service date in the GTFS representation
            # if the string is not empty
            if len(service_date) != 0:
                setattr(metadata, service_date_map[SERVICE_DATE_ATTR],
                        service_date)

    return gtfs_representation
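
# Illustrative sketch (not part of the original module): parsing GTFS YYYYMMDD dates
# with pandas and taking min()/max(), as done above for the start and end service
# dates. The format strings are assumptions mirroring PD_DATE_FORMAT and
# SERVICE_DATE_FORMAT.
def _example_service_date_bounds():
    import pandas as pd

    dates = pd.to_datetime(pd.Series(["20230601", "20231215"]), format="%Y%m%d")
    start = dates.min().strftime("%Y-%m-%d")
    end = dates.max().strftime("%Y-%m-%d")
    return start, end  # -> ("2023-06-01", "2023-12-15")
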
def create_dataset_entity_for_gtfs_metadata(gtfs_representation,
                                            api_url,
                                            username=None,
                                            password=None):
    """Create a dataset entity for a new dataset version on the Database.
    :param gtfs_representation: The representation of the GTFS dataset to process.
    :param api_url: API URL, either PRODUCTION_API_URL or STAGING_API_URL.
    :param username: Optional username for the database login; defaults to the USERNAME environment variable.
    :param password: Optional password for the database login; defaults to the PASSWORD environment variable.
    :return: The representation of the GTFS dataset post-execution.
    """
    validate_api_url(api_url)
    validate_gtfs_representation(gtfs_representation)
    metadata = gtfs_representation.metadata

    ###########################
    # 1. Process the core props
    ###########################

    # Begin with the core properties data
    # to verify if the dataset entity already exists
    core_props_data = []

    # SHA-1 hash property
    if is_valid_instance(metadata.sha1_hash, str):
        core_props_data.append(
            wbi_core.String(value=metadata.sha1_hash,
                            prop_nr=os.environ[SHA1_HASH_PROP]))

    # Archives URL, from the stable URL property
    if is_valid_instance(metadata.stable_urls, dict):
        archives_url = metadata.stable_urls.get(ARCHIVES_URL)
        try:
            core_props_data.append(
                wbi_core.Url(
                    value=archives_url,
                    prop_nr=os.environ[STABLE_URL_PROP],
                    rank=PREFERRED,
                ))
        except ValueError as ve:
            print(
                f"url {archives_url} for source {metadata.source_entity_code} caused {ve}"
            )
            raise ve

    # If the 2 core props values were NOT added to core_props_data,
    # then it is not possible to verify if the dataset entity already exists
    if len(core_props_data) != 2:
        raise MissingCorePropsException(core_props_data)

    # An existing dataset entity is considered the same as the one processed
    # if and only if the 2 core props values match: the SHA-1 hash and the Archives URL,
    # so the core properties threshold is 100%
    core_props_threshold = 1.0

    try:
        dataset_entity = wbi_core.ItemEngine(
            data=core_props_data,
            core_props={
                os.environ[STABLE_URL_PROP],
                os.environ[SHA1_HASH_PROP],
            },
            core_prop_match_thresh=core_props_threshold,
        )
    except ManualInterventionReqException as mi:
        print(
            "ManualInterventionReqException: a core property value exists for multiple dataset entities."
        )
        raise mi
    except CorePropIntegrityException as cp:
        print(
            "CorePropIntegrityException: a dataset entity exists with 1 of the 2 core props values."
        )
        raise cp
    except Exception as e:
        print(f"metadata : {metadata} raised {e}")
        raise e

    # If the dataset entity retrieved already has an item_id (entity id) value,
    # then we do nothing because the dataset already exists
    if dataset_entity.item_id != "":
        raise EntityAlreadyExistsException(dataset_entity.item_id)

    #################################################
    # 2. Add the other properties to the dataset data
    #################################################
    dataset_data = []

    # Add the core_props_data to the dataset_data
    dataset_data += core_props_data

    # Delete the archives_url from the metadata.stable_urls
    # Since it was part of the core_props_data
    del metadata.stable_urls[ARCHIVES_URL]

    # Stable urls property
    if is_valid_instance(metadata.stable_urls, dict):
        for url in metadata.stable_urls.values():
            try:
                dataset_data.append(
                    wbi_core.Url(value=url,
                                 prop_nr=os.environ[STABLE_URL_PROP],
                                 rank=NORMAL))
            except ValueError as ve:
                print(
                    f"url {url} for source {metadata.source_entity_code} caused {ve}"
                )
                raise ve

    # Instance property
    dataset_data.append(
        wbi_core.ItemID(
            value=os.environ[GTFS_SCHEDULE_DATA_FORMAT],
            prop_nr=os.environ[INSTANCE_PROP],
        ))

    # Source entity property
    dataset_data.append(
        wbi_core.ItemID(value=metadata.source_entity_code,
                        prop_nr=os.environ[SOURCE_ENTITY_PROP]))

    # Main timezone property
    if is_valid_instance(metadata.main_timezone, str):
        dataset_data.append(
            wbi_core.String(
                value=metadata.main_timezone,
                prop_nr=os.environ[TIMEZONE_PROP],
                rank=PREFERRED,
            ))

    # Other timezones property
    if is_valid_instance(metadata.other_timezones, list):
        for timezone in metadata.other_timezones:
            dataset_data.append(
                wbi_core.String(value=timezone,
                                prop_nr=os.environ[TIMEZONE_PROP],
                                rank=NORMAL))

    # Country code property
    if is_valid_instance(metadata.country_codes, list):
        for country_code in metadata.country_codes:
            dataset_data.append(
                wbi_core.String(
                    value=country_code,
                    prop_nr=os.environ[COUNTRY_CODE_PROP],
                    rank=NORMAL,
                ))

    # Main language code property
    if is_valid_instance(metadata.main_language_code, str):
        dataset_data.append(
            wbi_core.String(
                value=metadata.main_language_code,
                prop_nr=os.environ[MAIN_LANGUAGE_CODE_PROP],
                rank=PREFERRED,
            ))

    # Start service date property
    if is_valid_instance(metadata.start_service_date, str):
        dataset_data.append(
            wbi_core.String(
                value=metadata.start_service_date,
                prop_nr=os.environ[START_SERVICE_DATE_PROP],
            ))

    # End service date property
    if is_valid_instance(metadata.end_service_date, str):
        dataset_data.append(
            wbi_core.String(
                value=metadata.end_service_date,
                prop_nr=os.environ[END_SERVICE_DATE_PROP],
            ))

    # Start timestamp property
    if is_valid_instance(metadata.start_timestamp, str):
        dataset_data.append(
            wbi_core.String(value=metadata.start_timestamp,
                            prop_nr=os.environ[START_TIMESTAMP_PROP]))

    # End timestamp property
    if is_valid_instance(metadata.end_timestamp, str):
        dataset_data.append(
            wbi_core.String(value=metadata.end_timestamp,
                            prop_nr=os.environ[END_TIMESTAMP_PROP]))

    # Bounding box property
    if is_valid_instance(metadata.bounding_box, dict):
        for order_key, corner_value in metadata.bounding_box.items():
            dataset_data.append(
                create_geographical_property(order_key, corner_value,
                                             os.environ[BOUNDING_BOX_PROP]))

    # Bounding octagon property
    if is_valid_instance(metadata.bounding_octagon, dict):
        for order_key, corner_value in metadata.bounding_octagon.items():
            dataset_data.append(
                create_geographical_property(
                    order_key, corner_value,
                    os.environ[BOUNDING_OCTAGON_PROP]))

    # Stop counts
    if is_valid_instance(metadata.stops_count_by_type, dict):
        # Number of stops property
        stops_count = metadata.stops_count_by_type.get(STOP_KEY, None)
        if stops_count is not None:
            dataset_data.append(
                wbi_core.Quantity(
                    quantity=stops_count,
                    prop_nr=os.environ[NUM_OF_STOPS_PROP],
                ))

        # Number of stations property
        stations_count = metadata.stops_count_by_type.get(STATION_KEY, None)
        if stations_count is not None:
            dataset_data.append(
                wbi_core.Quantity(
                    quantity=stations_count,
                    prop_nr=os.environ[NUM_OF_STATIONS_PROP],
                ))

        # Number of entrances property
        entrances_count = metadata.stops_count_by_type.get(ENTRANCE_KEY, None)
        if entrances_count is not None:
            dataset_data.append(
                wbi_core.Quantity(
                    quantity=entrances_count,
                    prop_nr=os.environ[NUM_OF_ENTRANCES_PROP],
                ))

    if is_valid_instance(metadata.agencies_count, int):
        # Number of agencies property
        dataset_data.append(
            wbi_core.Quantity(
                quantity=metadata.agencies_count,
                prop_nr=os.environ[NUM_OF_AGENCIES_PROP],
            ))

    # Number of routes property
    if is_valid_instance(metadata.routes_count_by_type, dict):
        for route_key, route_value in metadata.routes_count_by_type.items():
            route_qualifier = [
                wbi_core.ItemID(
                    value=route_key,
                    prop_nr=os.environ[ROUTE_TYPE_PROP],
                    is_qualifier=True,
                )
            ]
            dataset_data.append(
                wbi_core.Quantity(
                    quantity=route_value,
                    prop_nr=os.environ[NUM_OF_ROUTES_PROP],
                    qualifiers=route_qualifier,
                ))

    # Download date
    if is_valid_instance(metadata.download_date, str):
        dataset_data.append(
            wbi_core.String(
                value=metadata.download_date,
                prop_nr=os.environ[DOWNLOAD_DATE_PROP],
            ))

    # Dataset version entity label
    version_name_label = metadata.dataset_version_name
    if not username:
        username = os.environ[USERNAME]
    if not password:
        password = os.environ[PASSWORD]
    login_instance = wbi_login.Login(user=username, pwd=password)

    #################################################
    # 3. Create the dataset entity on the database
    #################################################

    # Create the Dataset WITHOUT using the core_props.
    # For some reason, when using the core_props with all the data,
    # the WikibaseIntegrator library retrieves entities
    # that do not share data with the actual dataset entity,
    # which makes the process crash
    dataset_entity = wbi_core.ItemEngine(data=dataset_data)

    # Set the label (name)
    dataset_entity.set_label(version_name_label, ENGLISH)

    # Create the dataset entity on the database
    dataset_entity_id = dataset_entity.write(login_instance)
    metadata.dataset_version_entity_code = dataset_entity_id

    # Create the source data with the dataset entity code and property
    version_prop = wbi_core.ItemID(
        value=metadata.dataset_version_entity_code,
        prop_nr=os.environ[DATASET_PROP],
        if_exists=APPEND,
    )
    source_data = [version_prop]

    # Update the source entity
    # Try a maximum of 20 times in case there are edit conflicts
    try_count = 20
    has_succeeded = False

    while not has_succeeded and try_count > 0:
        source_entity = wbi_core.ItemEngine(
            item_id=metadata.source_entity_code)
        source_entity.update(source_data)
        try:
            source_entity.write(login_instance)
        except MWApiError as mwae:
            print(
                f"Failed to update: {source_entity.item_id} with data: {source_data} raised MWApiError. "
                f"{try_count} attempts left.")
            try_count -= 1
            # If the attempts have not succeeded, fail loudly
            if try_count == 0:
                print(
                    f"source_entity: {source_entity.get_json_representation()} with data: "
                    f"{source_data} raised MWApiError.")
                raise mwae
            # Wait 20 seconds before the next attempt so the database can settle,
            # preventing further edit conflicts
            # and avoiding overloading the database with requests
            time.sleep(20)
        else:
            has_succeeded = True
            metadata.source_entity_code = source_entity.item_id

    return gtfs_representation
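
# Illustrative sketch (not part of the original module): the retry loop above in a
# generic form. The writer callable stands in for the WikibaseIntegrator write() call
# (which in the original only retries on MWApiError); the attempt count and back-off
# delay mirror the values used above and are otherwise arbitrary.
def _example_write_with_retries(write, max_attempts=20, delay_seconds=20):
    import time

    for attempt in range(max_attempts):
        try:
            return write()
        except Exception:
            # On the last attempt, fail loudly instead of swallowing the error
            if attempt == max_attempts - 1:
                raise
            # Wait before retrying so the database settles and is not
            # overloaded with requests
            time.sleep(delay_seconds)
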