def get_docker_service_client():
    """
    Checks that the docker daemon service is running and returns the service client
    :return: the docker service client
    """
    # Check that the docker daemon service is up, and timeout after five minutes
    docker_check_alive_cmd = "docker info"
    docker_is_up = False
    timeout = time.time() + 60 * 5
    try:
        while not docker_is_up:
            if time.time() > timeout:
                raise TimeoutError
            # Check that the daemon is up and running
            docker_check_alive_process = subprocess.Popen(
                docker_check_alive_cmd, stdout=subprocess.PIPE, shell=True)
            output, error = docker_check_alive_process.communicate()
            docker_is_up = "Containers" in output.decode('utf-8')
            if not docker_is_up:
                # Avoid hammering the daemon - wait a few seconds between checks
                time.sleep(5)

        # Get the docker client
        client = docker.from_env()
        return client
    except BaseException as error:
        _log.error("Docker daemon service is not up")
        raise error
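# Typical use (a non-authoritative usage note; mirrors the module-level code further below):
#   docker_client = get_docker_service_client()
#   worker_containers = docker_client.containers.list(filters={"name": "worker"})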
def delete_file_from_container(container, file_name):
    """
    Delete a file from the Navitia worker container
    :param container: Navitia worker container
    :param file_name: the name of the file to be removed
    :return: whether the deletion was successful
    """
    delete_command = "/bin/sh -c \"rm " + file_name + "\""
    exit_code, output = container.exec_run(cmd=delete_command, stdout=True, workdir="/srv/ed/output/")
    if exit_code != 0:
        _log.error("Couldn't delete %s from container %s", file_name, container.name)
        return False
    _log.info("Finished deleting %s from container %s", file_name, container.name)
    return True
def get_file_from_url_http(url, file_name, file_path, _log):
    """
    Downloads a file over HTTP to file_path/file_name (relative to the parent of the working directory)
    :param url: HTTP URL to download from - not an FTP URL
    :param file_name: name to give the downloaded file
    :param file_path: directory to save the file in, relative to the parent of the working directory
    :param _log: logger instance
    :return: None; the downloaded content is saved to file_path/file_name
    """

    # Preparing file for fetching
    local_file_path_and_name = Path(os.getcwd()).parent / file_path / file_name
    _log.info("Going to download the latest osm from %s to %s", url,
              local_file_path_and_name)

    download_complete = False
    download_attempts = 1
    max_download_attempts = 24

    while not download_complete:
        if not download_complete and max_download_attempts > download_attempts > 1:
            _log.error(
                "%s is unreachable. Sleeping for 60 minutes and trying again. This is attempt %s out of "
                "%s attempts", url, download_attempts, max_download_attempts)
            time.sleep(60 * 60)
        if not download_complete and download_attempts > max_download_attempts:
            _log.error(
                "%s is unreachable for more than 24 hours. Aborting update",
                url)
            raise Exception
        download_attempts += 1

        try:
            r = requests.get(url, stream=True)

            # Creating a progress bar
            size = int(r.headers['Content-Length'])
            pbar = createProgressBar(size)

            # Fetching
            global size_iterator
            size_iterator = 0
            with open(local_file_path_and_name, 'wb') as file:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        file_write_update_progress_bar(chunk, file, pbar)
            pbar.finish()
            _log.info("Finished loading latest OSM to: %s",
                      local_file_path_and_name)
            download_complete = True
            return

        except Exception as e:
            _log.error("Download attempt failed: %s", e)
            continue
def move_current_to_past(container, source_cov_name, dest_cov_name):
    """
    Move the Navitia graph of the source coverage to the destination coverage so in next re-start changes are applied
    :param container: the worker container of Navitia
    :param source_cov_name: the name of the coverage to take the graph from (usually "default")
    :param dest_cov_name: the name of the coverage to move the graph to (e.g. "secondary-cov")
    :return: whether the move was successful; a RuntimeError is thrown if not
    """
    command_list = "/bin/sh -c \"mv " + source_cov_name + ".nav.lz4 " + dest_cov_name + ".nav.lz4\""
    exit_code, output = container.exec_run(cmd=command_list, stdout=True, workdir="/srv/ed/output/")
    if exit_code != 0:
        _log.error("Couldn't change %s to %s", source_cov_name, dest_cov_name)
        raise RuntimeError
    _log.info("Changed the name of %s.nav.lz4 to %s.nav.lz4", source_cov_name, dest_cov_name)
    return True
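# Example use (hedged; coverage names as mentioned in the docstring above):
#   move_current_to_past(worker_con, "default", "secondary-cov")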
def send_log_to_email(subject, message):
    """
    Send an e-mail with user-defined subject and message. the e-mail is attached with logs of this script
    :param subject:
    :param message:
    :return: Whether the e-mail was sent successfully
    """
    # Change to root before trying to send logs
    root_path = Path.home() / "TransitAnalystIsrael" / "root"
    os.chdir(root_path.as_posix())
    logs_path = root_path / "logs"
    if not os.path.isdir(logs_path ):
        _log.error("%s isn't the logs directory. Please fix log directory as in code")
    path = logs_path  / '*'
    list_of_files = glob.glob(str(path))  # * means all if need specific format then *.csv
    attached_file = max(list_of_files, key=os.path.getctime)
    return send_email.create_msg_and_send_email(subject, message, attached_file)
def start_navitia_w_default_and_custom_cov(secondary_custom_coverage_name, navitia_docker_compose_file_path,
                                           navitia_docker_compose_custom_file_path, navitia_docker_compose_file_name,
                                           extend_wait_time=False):
    """
    Start Navitia server with default and custom coverages (using custom docker-compose file)

    :param secondary_custom_coverage_name:
    :param navitia_docker_compose_file_path: path where docker-compose file exists
    :param navitia_docker_compose_file_name:  name of the custom docker-compose file
        :param extend_wait_time: whether an extended time of wait should be applied. Should be set to True when Navitia
    docker compose is started up the first time (images are being downloaded from the web)
    :return:  Whether Navitia was started successfully with default and secondary coverages
    """

    _log.error("This method isn't currently used because 2 corages require server with at least 10GB RAM available for "
               "docker.\nEach coverage requires about 3.5 RAM when running")
    raise Exception
def unzip_gtfs(gtfs_zip_file_name, gtfspath, _log):
    """
    Unzip gtfs to gtfspath
    """
    pardir = Path(os.getcwd()).parent
    gtfs_contets_folder = Path(
        os.getcwd()).parent / gtfspath / gtfs_zip_file_name
    if not os.path.isfile(gtfs_contets_folder):
        _log.error(
            "%s does not exist - please check correct GTFS date is configured",
            gtfs_zip_file_name)
        raise Exception
    _log.info("Going to unzip %s file to %s", gtfs_zip_file_name, gtfspath)
    dest_folder = pardir / gtfspath / gtfs_zip_file_name[:
                                                         -4]  # removing the .zip end
    if not os.path.exists(dest_folder):
        os.mkdir(dest_folder)
    shutil.unpack_archive(gtfs_contets_folder,
                          extract_dir=dest_folder,
                          format='zip')
    _log.info("Finished unzipping")
def validate_osm_gtfs_convertion_to_graph_is_completed(worker_con, time_to_wait, start_processing_time):
    """
    Validates that the following Navitia worker tasks were successfully completed:
    osm2ed, gtfs2ed and ed2nav
    :param worker_con: the Navitia worker container
    :param time_to_wait: time to wait before the validation takes place, in minutes
    :param start_processing_time: the UTC time processing started; log entries older than this are ignored
    :return: whether the conversion is completed or not
    """

    # Wait if needed
    _log.info("Waiting %s minutes to let OSM & GTFS conversions to lz4 graph takes place", time_to_wait)
    time.sleep(time_to_wait * 60)
    _log.info("I'm back! Verifying that the conversions took place")
    # Success status look like Task tyr.binarisation.ed2nav[feac06ca-51f7-4e39-bf1d-9541eaac0988] succeeded
    # and tyr.binarisation.gtfs2ed[feac06ca-51f7-4e39-bf1d-9541eaac0988] succeeded
    tyr_worker_outputname = "tyr_worker_output.txt"

    with open(tyr_worker_outputname, "w", encoding="UTF-8") as tyr_worker_output:
        tyr_worker_output.write(worker_con.logs().decode('utf-8'))

    ed2nav_completed = False
    with open(tyr_worker_outputname, "r", encoding="UTF-8") as tyr_worker_output:
        lines = tyr_worker_output.readlines()
        for line in reversed(lines):
            if re.compile(r'tyr\.binarisation\.ed2nav\[\S*\] succeeded').search(line):
                time_of_line = re.findall(r'\d{1,4}-\d{1,2}-\d{1,2}\b \d{1,2}:\d{1,2}:\d{1,2}', line)
                time_of_line = dt.strptime(time_of_line[0], '%Y-%m-%d %H:%M:%S')
                if start_processing_time < time_of_line:
                    ed2nav_completed = True
                    break
    os.remove(tyr_worker_outputname)
    if ed2nav_completed:
        _log.info("OSM conversion task ed2nav, GTFS conversion task gtfs2ed  and ed2nav are successful")
        return True
    else:
        _log.error("After %s minutes - tasks aren't completed", time_to_wait)
        return False
def validate_graph_changes_applied(coverage_name):
    """
    Validate that the coverage has a start-of-production date different from before
    """
    current_start_service_date = process_date.get_date_now()

    if cfg.ttm_server_on == "aws_ec2":
        time_map_server_url = cfg.time_map_server_aws_url
    else:
        time_map_server_url = cfg.time_map_server_local_url

    cov_sop_date = get_coverage_start_production_date(coverage_name)
    if cov_sop_date == "" or not check_prod_date_is_valid_using_heat_map(time_map_server_url, coverage_name,
                                                                     current_start_service_date):
        _log.error("The %s coverage seems not to be up-to-date following update attempts."
                   "\n A call for heat map data with %s date returned no results",
                   coverage_name, current_start_service_date)
        return False

    _log.info("%s coverage is now updated with new start-of-production date %s\n."
              "Can be accessed via %s%s", coverage_name, current_start_service_date, time_map_server_url,
              coverage_name)
    return True
def copy_file_into_docker(container, dest_path, file_path, file_name):
    """
    Copy a given file to a destination folder in a Docker container
    :param container: container object
    :param dest_path: destination folder path inside the container
    :param file_path: source path of the file on the host
    :param file_name: the file name to be copied
    """
    _log.info("Going to copy %s to %s at %s", file_name, container.name, dest_path)

    # Read the file
    with open(Path(os.getcwd()).parent / file_path / file_name, 'rb') as file_obj:
        file = file_obj.read()

    try:
        # Convert to tar file
        tar_stream = BytesIO()
        file_tar = tarfile.TarFile(fileobj=tar_stream, mode='w')
        tarinfo = tarfile.TarInfo(name=file_name)
        tarinfo.size = len(file)
        file_tar.addfile(tarinfo, BytesIO(file))
        file_tar.close()

        # Put in the container
        tar_stream.seek(0)
        success = container.put_archive(
            path=dest_path,
            data=tar_stream
        )
        if success:
            _log.info("Finished copying %s to %s at %s", file_name, container.name, dest_path)
        else:
            raise FileNotFoundError

    except FileNotFoundError as err:
        _log.error("Couldn't copy %s to %s at %s", file_name, container.name, dest_path)
        raise err
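# Example use (hedged sketch; the destination path inside the container is an assumption):
#   copy_file_into_docker(worker_con, "/srv/ed/input/default", gtfs_file_path, gtfs_file_name)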
def validate_auto_graph_changes_applied(coverage_name, default_coverage_name, default_cov_prev_sop_date, docker_client,
                                        navitia_docker_compose_file_path, navitia_docker_compose_file_name,
                                        navitia_docker_compose_default_file_name):
    """
    Validate that the new default coverage returns results for heat map query for current_start_service_date (as in dates file
    or gtfs date) and that secondary-cov has results for the previous production date of the default.

    :param default_coverage_name: The coverage that gets a new (usually more recent) start of production date
    :param secondary_custom_coverage_name: The coverage that gets a the original default_coverage start of production date
    :param default_cov_sop_date: start of production date of original default coverage (before changes applied)
    :return: whether the graph changes were applied
    """
    current_start_service_date = dt.strptime(process_date.get_date_now(), "%Y%m%d")

    if cfg.ttm_server_on == "aws_ec2":
        time_map_server_url = cfg.time_map_server_aws_url
    else:
        time_map_server_url = cfg.time_map_server_local_url

    # Check that the current default coverage is up-to-date by comparing sop dates
    stop_all_containers(docker_client)

    start_navitia_with_single_coverage(navitia_docker_compose_file_path, navitia_docker_compose_default_file_name,
                                       default_coverage_name, False)

    if not check_prod_date_is_valid_using_heat_map(time_map_server_url, default_coverage_name,
                                                   current_start_service_date.strftime("%Y%m%d")):
        _log.error("The %s coverage seems not to be up-to-date following update attempts.", default_coverage_name)
        return False
    else:
        _log.info("%s coverage is up-to-date with production date %s", default_coverage_name,
                  current_start_service_date.strftime("%Y%m%d"))

    # Check that the coverage_name (the previous one) is up-to-date by comparing sop dates
    stop_all_containers(docker_client)

    is_up = start_navitia_with_single_coverage(navitia_docker_compose_file_path, navitia_docker_compose_file_name,
                                       coverage_name, False)
    if not is_up:
        _log.error("The %s coverage seems not to be up", coverage_name)
    cov_sop_date = get_coverage_start_production_date(coverage_name)
    if cov_sop_date == "":
        _log.info("If this is the first time you're running Transit Analyst Israel data processing, you need to "
                  "copy the generated default.nav.lz4 graph to secondary-cov.nav.lz4 - See docs.")
        return True

    if not check_prod_date_is_valid_using_heat_map(time_map_server_url, coverage_name,
                                                   current_start_service_date.strftime("%Y%m%d")):
        _log.error("The %s coverage seems not to be up-to-date following update attempts.\nA call for heat map data with"
                   " %s date returned no results", coverage_name, current_start_service_date.strftime("%Y%m%d"))
        return False
    _log.info("%s coverage is now updated with new start-of-production date %s. "
              "Can be accessed via %s%s", coverage_name, current_start_service_date.strftime("%Y%m%d"), time_map_server_url,
              coverage_name)
    return True
def process_new_data_to_current_coverage(docker_client, navitia_docker_compose_file_path,
                                         navitia_docker_compose_file_name, navitia_docker_compose_default_file_name,
                                         coverage_name, default_coverage_name,
                                         cov_eos_date, osm_file_path, osm_file_name,
                                         gtfs_file_path, gtfs_file_name, _log):

    start_processing_time = datetime.datetime.utcnow()  # We take the time in UTC because docker time is in UTC
    # Re-start Navitia docker with default coverage only in order to process the OSM & GTFS
    # Later we will restart with the custom coverage as well
    utils.stop_all_containers(docker_client)
    if cfg.get_service_date == "auto":
        utils.start_navitia_with_single_coverage(navitia_docker_compose_file_path, navitia_docker_compose_default_file_name,
                                                 default_coverage_name)
    elif cfg.get_service_date == "on_demand":
        utils.start_navitia_with_single_coverage(navitia_docker_compose_file_path, navitia_docker_compose_file_name,
                                                 coverage_name)

    # Get the new worker container
    worker_con = docker_client.containers.list(filters={"name": "worker"})[0]

    # Copy OSM & GTFS to the default coverage input folder on the worker container
    if cfg.get_service_date == "auto":
        utils.copy_osm_and_gtfs_to_cov(worker_con, osm_file_path, osm_file_name, gtfs_file_path, gtfs_file_name,
                                       default_coverage_name)
    elif cfg.get_service_date == "on_demand":
        utils.copy_osm_and_gtfs_to_cov(worker_con, osm_file_path, osm_file_name, gtfs_file_path, gtfs_file_name,
                                       coverage_name)

    # Validate the conversion process takes place by ensuring tyr_beat is up
    if cfg.get_service_date == "auto":
        utils.validate_osm_gtfs_convertion_to_graph_is_running(docker_client, default_coverage_name,
                                                               navitia_docker_compose_default_file_name,
                                                               navitia_docker_compose_file_name)
    elif cfg.get_service_date == "on_demand":
        utils.validate_osm_gtfs_convertion_to_graph_is_running(docker_client, coverage_name,
                                                               navitia_docker_compose_file_path,
                                                               navitia_docker_compose_file_name)

    worker_con = docker_client.containers.list(filters={"name": "worker"})[0]
    # After 40 minutes - test that both osm and gtfs conversions are done
    success = utils.validate_osm_gtfs_convertion_to_graph_is_completed(worker_con, 40, start_processing_time)

    # If it didn't succeed, give it 30 more minutes
    if not success:
        success = utils.validate_osm_gtfs_convertion_to_graph_is_completed(worker_con, 30, start_processing_time)

    # If it didn't succeed, give it 30 more minutes
    if not success:
        success = utils.validate_osm_gtfs_convertion_to_graph_is_completed(worker_con, 30, start_processing_time)

    if not success:
        _log.error("After 90 minutes - tasks aren't completed - connect to server for manual inspection")
        raise Exception

    is_changes_applied = True
    # Validate that changes are applied
    if cfg.get_service_date == "auto":
        is_changes_applied = utils.validate_auto_graph_changes_applied(coverage_name, default_coverage_name,
                                cov_eos_date, docker_client, navitia_docker_compose_file_path,
                                        navitia_docker_compose_file_name, navitia_docker_compose_default_file_name)
        if not is_changes_applied:
            raise Exception

    elif cfg.get_service_date == "on_demand":
        is_changes_applied = utils.validate_graph_changes_applied(coverage_name)
        if not is_changes_applied:
            raise Exception

    # If it's up - delete the old gtfs and osm files - only from AWS machines
    if is_changes_applied and utils.is_aws_machine():
        utils.delete_file_from_host(Path(os.getcwd()).parent / osm_file_path / osm_file_name)
        utils.delete_file_from_host(Path(os.getcwd()).parent / gtfs_file_path / gtfs_file_name)


# config variables to be moved to config-file downstream
default_coverage_name, coverage_name, navitia_docker_compose_file_path, navitia_docker_compose_file_name, \
navitia_docker_compose_default_file_name, gtfs_file_path, gtfs_zip_file_name = utils.get_config_params()

try:

    # Get the docker service client
    docker_client = utils.get_docker_service_client()

    containers = docker_client.containers.list(filters={"name": "worker"})
    if len(containers) == 0:
        _log.error("Navitia docker containers are down, bringing them up with default coverage for processing")
        utils.start_navitia_with_single_coverage(navitia_docker_compose_file_path,
                                                 navitia_docker_compose_default_file_name,
                                                 default_coverage_name)
        containers = docker_client.containers.list(filters={"name": "worker"})
        
    # Get the worker container
    worker_con = containers[0]

    default_cov_sop_date = ""
    # For production env. we have default coverage and secondary-cov coverage so back up is needed
    if cfg.get_service_date == "auto":
        # Get the current start of production dates of default coverage for post-processing comparison
        if utils.is_cov_exists(worker_con, default_coverage_name):
            default_cov_sop_date = utils.get_coverage_start_production_date(default_coverage_name)
def main(gtfsdate, gtfsparentpath, gtfsdirbase, pathout):
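    """
    Cross-validate the GTFS files for gtfsdate (routes, trips, stop_times, stops, agency, shapes, calendar),
    report referential problems between them, and write patched copies of problem files to pathout.
    """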
    # input:
    parent_path = cwd.parent / gtfsparentpath
    gtfsdir = gtfsdirbase + gtfsdate
    txtfilein = ''

    # output:
    gtfspathout = cwd.parent / pathout / gtfsdir
    txtfileout = ''

    gtfspathin = parent_path / gtfsdir
    gtfspath = gtfspathin

    # >>> load routes file
    routes_count = 0
    txtfilein = 'routes.txt'
    routes_dict = {}
    with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f:
        reader = csv.reader(f)
        header = next(
            reader
        )  # [route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_color]
        #print(header)
        for row in reader:
            #print row
            routes_count += 1
            routes_dict[row[0]] = [row[1]]  # 'route_id' : ['agency_id']
    #print routes_dict[:4]
    print('routes_dict loaded. routes count ', len(routes_dict))

    # >>> load trips file
    trips_count = 0
    trips_header_trip_headsign_missing = False
    txtfilein = 'trips.txt'
    trips_dict = {}
    with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f:
        reader = csv.reader(f)
        header = next(
            reader
        )  # [route_id,service_id,trip_id,trip_headsign,direction_id,shape_id]
        if len(header
               ) == 5 and header[3] == 'direction_id':  # trip_headsign missing
            trips_header_trip_headsign_missing = True
            print('trip_headsign missing')
            print(header)
        #print(header)
        for row in reader:
            #print(row)
            trips_count += 1
            if trips_header_trip_headsign_missing:
                trips_dict[row[2]] = [
                    row[0], row[1], row[4]
                ]  # 'trip_id' : ['route_id','service_id','shape_id']
            else:
                trips_dict[row[2]] = [
                    row[0], row[1], row[5]
                ]  # 'trip_id' : ['route_id','service_id','shape_id']
    #print trips_dict[:4]
    print('trips_dict loaded. trips count ', len(trips_dict))

    # >>> load stop_times file
    stop_times_count = 0
    txtfilein = 'stop_times.txt'
    stop_times_trips_set = set([])
    stop_times_stops_set = set([])
    with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f:
        reader = csv.reader(f)
        header = next(
            reader
        )  # [trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled]
        #print(header)
        for row in reader:
            #print row
            stop_times_count += 1
            stop_times_trips_set.add(row[0])  # trip_id
            stop_times_stops_set.add(row[3])  # stop_id
    print('stop_times_trips loaded. trips count ', len(stop_times_trips_set))
    print('stop_times_stops loaded. stops count ', len(stop_times_stops_set))

    # >>> load stops file
    stops_count = 0
    txtfilein = 'stops.txt'
    stops_dict = {}
    with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f:
        reader = csv.reader(f)
        header = next(
            reader
        )  # ['stop_id', 'stop_code', 'stop_name', 'stop_desc', 'stop_lat', 'stop_lon', 'location_type', 'parent_station', 'zone_id']
        #print(header)
        for row in reader:
            #print row
            stops_count += 1
            stops_dict[row[0]] = [
                row[2], row[3], row[4], row[5]
            ]  # 'stop_id' : ['stop_name', 'stop_desc', 'stop_lat', 'stop_lon']
    #print stops_dict[row[0]] # last one
    print('stops_dict loaded. stop count ', len(stops_dict))

    # >>> load agency file
    agency_count = 0
    txtfilein = 'agency.txt'
    agency_dict = {}
    agency_name_problem_count = 0
    with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f:
        reader = csv.reader(f)
        header = next(
            reader
        )  # agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url
        print(header)
        for row in reader:
            print(row)
            agency_count += 1
            agency_name = row[1]
            agency_name_clean = agency_name.replace('\"', '').replace("\'", "")
            if agency_name != agency_name_clean:
                print('agency name problem: ', agency_name, agency_name_clean)
                agency_name_problem_count += 1
                row[1] = agency_name_clean  # patch agency name for dict, later it will be written to file
            agency_dict[row[0]] = [
                row[1], row[2], row[3], row[4], row[5], row[6]
            ]  # 'agency_id': ['agency_name','agency_url','agency_timezone','agency_lang','agency_phone','agency_fare_url']
    #print agency_dict[row[0]] # last one
    print('agency_dict loaded. agency count ', len(agency_dict))
    print('agency_name_problem_count : ', agency_name_problem_count)

    # >>> load shapes file. Actually loads only one point per shape!!! used only as a set of shape_ids
    shapes_count = 0
    txtfilein = 'shapes.txt'
    shapes_dict = {}
    with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f:
        reader = csv.reader(f)
        header = next(
            reader)  # shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence
        #print(header)
        for row in reader:
            #print row
            shapes_count += 1
            shapes_dict[row[0]] = [
                row[1], row[2]
            ]  # 'shape_id' : ['shape_pt_lat','shape_pt_lon']
    #print shapes_dict[row[0]] # last one
    print('shapes_dict loaded. shape count ', len(shapes_dict))

    # >>> load calendar file
    calendar_count = 0
    txtfilein = 'calendar.txt'
    calendar_dict = {}
    with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f:
        reader = csv.reader(f)
        header = next(
            reader
        )  # service_id,sunday,monday,tuesday,wednesday,thursday,friday,saturday,start_date,end_date
        #print(header)
        for row in reader:
            #print row
            calendar_count += 1
            calendar_dict[row[0]] = [
                row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8],
                row[9]
            ]  # ['service_id' : [ 'sunday','monday','tuesday','wednesday','thursday','friday','saturday','start_date','end_date']
    #print calendar_dict[row[0]] # last one
    print('calendar_dict loaded. calendar count ', len(calendar_dict))

    # >>> process loaded files

    # check MAX limits on file line count

    if stops_count > gtfs_config.MAX_STOPS_COUNT:
        print('need to abort')
        _log.error('MAX GTFS line count exceeded in stops.txt')
        raise Exception
    if stop_times_count > gtfs_config.MAX_STOP_TIMES_COUNT:
        print('need to abort')
        _log.error('MAX GTFS line count exceeded in stop_times.txt')
        raise Exception
    if trips_count > gtfs_config.MAX_TRIPS_COUNT:
        print('need to abort')
        _log.error('MAX GTFS line count exceeded in trips.txt')
        raise Exception
    if shapes_count > gtfs_config.MAX_SHAPES_COUNT:
        print('need to abort')
        _log.error('MAX GTFS line count exceeded in shapes.txt')
        raise Exception
    if routes_count > gtfs_config.MAX_ROUTES_COUNT:
        print('need to abort')
        _log.error('MAX GTFS line count exceeded in routes.txt')
        raise Exception
    if agency_count > gtfs_config.MAX_AGENCY_COUNT:
        print('need to abort')
        _log.error('MAX GTFS line count exceeded in agency.txt')
        raise Exception
    if calendar_count > gtfs_config.MAX_CALENDAR_COUNT:
        print('need to abort')
        _log.error('MAX GTFS line count exceeded in calendar.txt')
        raise Exception

    # >>> process calendar - check that GTFS start date in calendar.txt is as expected - gtfsdate
    service_ok_count = 0
    service_problem_count = 0
    service_problem_list = []
    min_service_date = '21190101'
    for service_id, [
            sunday, monday, tuesday, wednesday, thursday, friday, saturday,
            start_date, end_date
    ] in calendar_dict.items():
        min_service_date = min(min_service_date, start_date, end_date)
        if start_date >= gtfsdate and end_date >= gtfsdate:
            service_ok_count += 1
        else:
            service_problem_count += 1
            print(
                'service_problem date before expected start date: start_date, end_date, gtfsdate ',
                service_id, start_date, end_date, gtfsdate)
            service_problem_list.append(
                [service_id, start_date, end_date, gtfsdate])
    print('service_ok_count : ', service_ok_count)
    print('service_problem_count : ', service_problem_count)
    if min_service_date != gtfsdate:  # problem
        print(
            'GTFS file start date in calendar.txt is not the same as expected start date : ',
            min_service_date, gtfsdate)
        print(cfg.patch_calendar)

        if cfg.patch_calendar == 'yes':  # problem and patch
            print('patch_calendar')
            # >>> open and prep output txt file
            txtfileout = 'calendar.txt'
            print('open file ', gtfspathout / txtfileout)
            fileout = open(gtfspathout / txtfileout, 'w',
                           encoding="utf8")  # save results in file
            postsline = 'service_id,sunday,monday,tuesday,wednesday,thursday,friday,saturday,start_date,end_date\n'
            print(postsline)
            fileout.write(postsline)
            outfilelinecount = 0
            for service_id, [
                    sunday, monday, tuesday, wednesday, thursday, friday,
                    saturday, start_date, end_date
            ] in calendar_dict.items():
                if end_date >= gtfsdate:  # good
                    if start_date >= gtfsdate:  # good
                        pass
                    else:  # *** start date problem *** fix entry to start on gtfsdate
                        print(
                            'service_problem date before expected start date: start_date, end_date, gtfsdate ',
                            service_id, start_date, end_date, gtfsdate)
                        start_date = gtfsdate
                    # output entry
                    postsline = ','.join([
                        service_id, sunday, monday, tuesday, wednesday,
                        thursday, friday, saturday, start_date, end_date
                    ]) + '\n'
                    fileout.write(postsline)
                    outfilelinecount += 1

                else:  # *** end date problem *** skip entry
                    print(
                        'end date before GTFS date - erase (skip) this calendar entry',
                        end_date, gtfsdate)
            fileout.close()
            print('close file ', gtfspathout / txtfileout)
            print('lines in out file count ', outfilelinecount)

        else:  # problem and no patch
            print('need to abort')
            _log.error(
                'GTFS file start date in calendar.txt is not the same as expected start date : %s %s',
                min_service_date, gtfsdate)
            raise Exception
        raise Exception

    # >>> process routes
    routes_agency_id_ok_count = 0
    routes_agency_id_problem_count = 0
    routes_agency_id_problem_list = []
    agencies_referenced_set = set([])
    for route_id, [agency_id] in routes_dict.items():
        if agency_id in agency_dict:
            routes_agency_id_ok_count += 1
            agencies_referenced_set.add(agency_id)
        else:
            routes_agency_id_problem_count += 1
            print('routes_agency_id_problem : ', route_id, agency_id)
            routes_agency_id_problem_list.append(route_id)
    print('routes_agency_id_ok_count : ', routes_agency_id_ok_count)
    print('routes_agency_id_problem_count : ', routes_agency_id_problem_count)
    print('agencies_referenced_count : ', len(agencies_referenced_set))
    print('agencies_referenced_set : ', agencies_referenced_set)

    # >>> process trips
    trips_service_id_ok_count = 0
    trips_service_id_problem_count = 0
    trips_service_id_problem_list = []
    trips_shape_id_ok_count = 0
    trips_shape_id_problem_count = 0
    trips_shape_id_problem_list = []
    trips_route_id_ok_count = 0
    trips_route_id_problem_count = 0
    trips_route_id_problem_list = []
    for trip_id, [route_id, service_id, shape_id] in trips_dict.items():
        if route_id in routes_dict:
            trips_route_id_ok_count += 1
        else:
            trips_route_id_problem_count += 1
            print('trips_route_id_problem : ', trip_id, route_id)
            trips_route_id_problem_list.append(trip_id)
        if shape_id in shapes_dict:
            trips_shape_id_ok_count += 1
        else:
            trips_shape_id_problem_count += 1
            print('trips_shape_id_problem : ', trip_id, shape_id)
            #print(shapes_dict.keys())
            trips_shape_id_problem_list.append(trip_id)
        if service_id in calendar_dict:
            trips_service_id_ok_count += 1
        else:
            trips_service_id_problem_count += 1
            print('trips_service_id_problem : ', trip_id, service_id)
            trips_service_id_problem_list.append(trip_id)
    print('trips_service_id_ok_count : ', trips_service_id_ok_count)
    print('trips_service_id_problem_count : ', trips_service_id_problem_count)
    print('trips_shape_id_ok_count : ', trips_shape_id_ok_count)
    print('trips_shape_id_problem_count : ', trips_shape_id_problem_count)
    print('trips_route_id_ok_count : ', trips_route_id_ok_count)
    print('trips_route_id_problem_count : ', trips_route_id_problem_count)

    # >>> process stop_times
    stoptimes_trip_id_ok_count = 0
    stoptimes_trip_id_problem_count = 0
    stoptimes_trip_id_problem_list = []
    stoptimes_stop_id_ok_count = 0
    stoptimes_stop_id_problem_count = 0
    stoptimes_stop_id_problem_list = []
    for trip_id in stop_times_trips_set:
        if trip_id in trips_dict:
            stoptimes_trip_id_ok_count += 1
        else:
            stoptimes_trip_id_problem_count += 1
            print('stoptimes_trip_id_problem : ', trip_id)
            stoptimes_trip_id_problem_list.append(trip_id)
    for stop_id in stop_times_stops_set:
        if stop_id in stops_dict:
            stoptimes_stop_id_ok_count += 1
        else:
            stoptimes_stop_id_problem_count += 1
            print('stoptimes_stop_id_problem : ', stop_id)
            stoptimes_stop_id_problem_list.append(stop_id)
    print('stoptimes_trip_id_ok_count : ', stoptimes_trip_id_ok_count)
    print('stoptimes_trip_id_problem_count : ',
          stoptimes_trip_id_problem_count)
    print('stoptimes_stop_id_ok_count : ', stoptimes_stop_id_ok_count)
    print('stoptimes_stop_id_problem_count : ',
          stoptimes_stop_id_problem_count)

    # >>> patch problem files
    if agency_name_problem_count > 0:  # patch agency names, in case they include " or ' in the name (happened in GTFS file of 20190901)
        # >>> open and prep output txt file
        txtfileout = 'agency.txt'
        print('open file ', gtfspathout / txtfileout)
        fileout = open(gtfspathout / txtfileout, 'w',
                       encoding="utf8")  # save results in file
        postsline = 'agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url\n'
        print(postsline)
        fileout.write(postsline)
        outfilelinecount = 0
        for agency_id, [
                agency_name, agency_url, agency_timezone, agency_lang,
                agency_phone, agency_fare_url
        ] in agency_dict.items():
            postsline = ','.join([
                agency_id, agency_name, agency_url, agency_timezone,
                agency_lang, agency_phone, agency_fare_url
            ]) + '\n'
            fileout.write(postsline)
            outfilelinecount += 1
        fileout.close()
        print('close file ', gtfspathout / txtfileout)
        print('lines in out file count ', outfilelinecount)

    if trips_header_trip_headsign_missing:
        print('trips_header_trip_headsign_missing')
        # add dummy '' trip_headsign
        # load full trips.txt file  then apply the patch while writing back.

        # >>> load trips file
        txtfilein = 'trips.txt'
        trips_full_list = []
        with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f:
            reader = csv.reader(f)
            header = next(
                reader)  # [route_id,service_id,trip_id,direction_id,shape_id]
            #print(header)
            for row in reader:
                #print row
                trips_full_list.append([
                    row[0], row[1], row[2], row[3], row[4]
                ])  # [route_id,service_id,trip_id,direction_id,shape_id]
        print('trips_full_list loaded. trips count ', len(trips_full_list))

        # >>> open and prep output txt file
        txtfileout = 'trips.txt'
        print('open file ', gtfspathout / txtfileout)
        fileout = open(gtfspathout / txtfileout, 'w',
                       encoding="utf8")  # save results in file
        postsline = 'route_id,service_id,trip_id,trip_headsign,direction_id,shape_id\n'
        print(postsline)
        fileout.write(postsline)
        outfilelinecount = 0
        trip_headsign = ''
        for [route_id, service_id, trip_id, direction_id,
             shape_id] in trips_full_list:
            postsline = ','.join([
                route_id, service_id, trip_id, trip_headsign, direction_id,
                shape_id
            ]) + '\n'
            fileout.write(postsline)
            outfilelinecount += 1
        fileout.close()
        print('close file ', gtfspathout / txtfileout)
        print('lines in out file count ', outfilelinecount)

    if routes_agency_id_problem_count != 0:
        print('routes_agency_id_problem_count : ',
              routes_agency_id_problem_count)
        # erase routes if agency_id referenced is missing from agency.txt or add unknown agency to agency.txt with the missing id...
        # for now leaving as is
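        # A possible fix (hedged sketch, not applied here): rewrite routes.txt without the
        # offending routes, mirroring the trips.txt patches below. The column layout is
        # assumed to match the header loaded above.
        #   with open(gtfspathin / 'routes.txt', newline='', encoding="utf8") as fin, \
        #        open(gtfspathout / 'routes.txt', 'w', newline='', encoding="utf8") as fout:
        #       reader = csv.reader(fin)
        #       writer = csv.writer(fout)
        #       writer.writerow(next(reader))
        #       for row in reader:
        #           if row[0] not in routes_agency_id_problem_list:
        #               writer.writerow(row)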

    if trips_service_id_problem_count != 0:
        print('trips_service_id_problem_count : ',
              trips_service_id_problem_count)
        # erase trips if service_id referenced is missing from calendar.txt or add empty service record to calendar.txt with the missing id...
        # for now doing the first - but checking that the erased trip will not be missed
        # load full trips.txt file  then apply the patch while writing back.

        # >>> load trips file
        txtfilein = 'trips.txt'
        trips_full_list = []
        with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f:
            reader = csv.reader(f)
            header = next(
                reader
            )  # [route_id,service_id,trip_id,trip_headsign,direction_id,shape_id]
            #print(header)
            for row in reader:
                #print row
                trips_full_list.append(
                    [row[0], row[1], row[2], row[3], row[4], row[5]]
                )  # [route_id,service_id,trip_id,trip_headsign,direction_id,shape_id]
        print('trips_full_list loaded. trips count ', len(trips_full_list))

        # >>> open and prep output txt file
        txtfileout = 'trips.txt'
        print('open file ', gtfspathout / txtfileout)
        fileout = open(gtfspathout / txtfileout, 'w',
                       encoding="utf8")  # save results in file
        postsline = 'route_id,service_id,trip_id,trip_headsign,direction_id,shape_id\n'
        print(postsline)
        fileout.write(postsline)
        outfilelinecount = 0
        for [
                route_id, service_id, trip_id, trip_headsign, direction_id,
                shape_id
        ] in trips_full_list:
            if trip_id in trips_service_id_problem_list:
                print('trips_service_id_problem : ', trip_id, service_id)
                print('erasing trip_id from trips.txt')
                # check if this trip that we are erasing will be missed
                if trip_id in stop_times_trips_set:
                    print(
                        'ooops **************** erased a trip that is referenced in stoptimes.txt : ',
                        trip_id)
            else:
                postsline = ','.join([
                    route_id, service_id, trip_id, trip_headsign, direction_id,
                    shape_id
                ]) + '\n'
                fileout.write(postsline)
                outfilelinecount += 1
        fileout.close()
        print('close file ', gtfspathout / txtfileout)
        print('lines in out file count ', outfilelinecount)

    if trips_shape_id_problem_count != 0:
        print('trips_shape_id_problem_count : ', trips_shape_id_problem_count)
        # if shape_id == "" then create shape from sequence of stops and add to shapes.txt with the newly created id...
        # for now leaving as is

    if trips_route_id_problem_count != 0:
        print('trips_route_id_problem_count : ', trips_route_id_problem_count)
        # erase trips if route_id referenced is missing from routes.txt or add unknown route to route.txt with the missing id...
        # for now doing the first - but checking that the erased trip will not be missed
        # load full trips.txt file  then apply the patch while writing back.

        # >>> load trips file
        txtfilein = 'trips.txt'
        trips_full_list = []
        with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f:
            reader = csv.reader(f)
            header = next(
                reader
            )  # [route_id,service_id,trip_id,trip_headsign,direction_id,shape_id]
            #print(header)
            for row in reader:
                #print row
                trips_full_list.append(
                    [row[0], row[1], row[2], row[3], row[4], row[5]]
                )  # [route_id,service_id,trip_id,trip_headsign,direction_id,shape_id]
        print('trips_full_list loaded. trips count ', len(trips_full_list))

        # >>> open and prep output txt file
        txtfileout = 'trips.txt'
        print('open file ', gtfspathout / txtfileout)
        fileout = open(gtfspathout / txtfileout, 'w',
                       encoding="utf8")  # save results in file
        postsline = 'route_id,service_id,trip_id,trip_headsign,direction_id,shape_id\n'
        print(postsline)
        fileout.write(postsline)
        outfilelinecount = 0
        for [
                route_id, service_id, trip_id, trip_headsign, direction_id,
                shape_id
        ] in trips_full_list:
            if trip_id in trips_route_id_problem_list:
                print('trips_route_id_problem : ', trip_id, route_id)
                print('erasing trip_id from trips.txt')
                # check if this trip that we are erasing will be missed
                if trip_id in stop_times_trips_set:
                    print(
                        'ooops **************** erased a trip that is referenced in stoptimes.txt : ',
                        trip_id)
            else:
                postsline = ','.join([
                    route_id, service_id, trip_id, trip_headsign, direction_id,
                    shape_id
                ]) + '\n'
                fileout.write(postsline)
                outfilelinecount += 1
        fileout.close()
        print('close file ', gtfspathout / txtfileout)
        print('lines in out file count ', outfilelinecount)

    if (stoptimes_trip_id_problem_count != 0) and (patch_stoptimes_trip_id
                                                   == 'yes'):
        print('stoptimes_trip_id_problem_count : ',
              stoptimes_trip_id_problem_count)
        # erase stoptimes if trip_id referenced is missing from trips.txt or add dummy trip...
        # for now doing the first
        # load full stop_times.txt file  then apply the patch while writing back.
        # **** takes too long - replace with pandas code
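        # A possible pandas-based replacement (hedged sketch; assumes pandas is available and
        # stop_times.txt fits in memory) - kept as a comment, the csv-based flow below is what runs:
        #   import pandas as pd
        #   df = pd.read_csv(gtfspathin / 'stop_times.txt', dtype=str)
        #   df = df[~df['trip_id'].isin(set(stoptimes_trip_id_problem_list))]
        #   df.to_csv(gtfspathout / 'stop_times.txt', index=False)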

        # >>> load stop_times file
        txtfilein = 'stop_times.txt'
        stop_times_full_list = []
        with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f:
            reader = csv.reader(f)
            header = next(
                reader
            )  # [trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled]
            #print(header)
            for row in reader:
                #print row
                stop_times_full_list.append(row)
        print('stop_times_full_list loaded. count ', len(stop_times_full_list))

        # >>> open and prep output txt file
        txtfileout = 'stop_times.txt'
        print('open file ', gtfspathout / txtfileout)
        fileout = open(gtfspathout / txtfileout, 'w',
                       encoding="utf8")  # save results in file
        postsline = 'trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled\n'
        print(postsline)
        fileout.write(postsline)
        outfilelinecount = 0
        for [
                trip_id, arrival_time, departure_time, stop_id, stop_sequence,
                pickup_type, drop_off_type, shape_dist_traveled
        ] in stop_times_full_list:
            if trip_id in stoptimes_trip_id_problem_list:
                print(
                    'stoptimes_trip_id_problem, erasing stop_time from stop_times.txt : ',
                    trip_id)
            else:
                postsline = ','.join([
                    trip_id, arrival_time, departure_time, stop_id,
                    stop_sequence, pickup_type, drop_off_type,
                    shape_dist_traveled
                ]) + '\n'
                fileout.write(postsline)
                outfilelinecount += 1
        fileout.close()
        print('close file ', gtfspathout / txtfileout)
        print('lines in out file count ', outfilelinecount)

    if stoptimes_stop_id_problem_count != 0:
        print('stoptimes_stop_id_problem_count : ',
              stoptimes_stop_id_problem_count)
        # erase stoptimes if stop_id referenced is missing from stops.txt
        # for now leaving as is
    print(
        '============================================================================='
    )
def get_gtfs_file_from_url_ftp(url, file_name_on_server, _log):
    """
    Downloads a GTFS file from an FTP server to the GTFS directory (cfg.gtfspath)
    :param url: the FTP server URL that points to the file's containing folder
    :param file_name_on_server: the file name on the FTP server
    :param _log: logger instance
    :return: file name of the downloaded content in the GTFS directory
    """
    _log.info("Going to download the latest GTFS from %s ", url)
    download_complete = False
    download_attempts = 1
    max_download_attempts = 24

    while not download_complete:
        if not download_complete and max_download_attempts > download_attempts > 1:
            _log.error(
                "%s is unreachable. Sleeping for 60 minutes and trying again. This is attempt %s out of "
                "%s attempts", url, download_attempts, max_download_attempts)
            time.sleep(60 * 60)
        if not download_complete and download_attempts > max_download_attempts:
            _log.error(
                "%s is unreachable for more than 24 hours. Aborting update",
                url)
            raise Exception
        download_attempts += 1

        try:
            # Connect to FTP
            ftp = ftplib.FTP(url)
            ftp.login()
            # Get the GTFS time stamp and generate local file name, "israel20190225"
            file_lines = []
            size = 0

            local_file_name = cfg.gtfsdirbase
            processdate = process_date.get_date_now()
            ftp.dir("", file_lines.append)
            for line in file_lines:
                tokens = line.split(maxsplit=4)
                name = tokens[3]
                if name == file_name_on_server:
                    time_str = tokens[0]
                    actual_time = parser.parse(time_str)
                    local_file_name = local_file_name + processdate + ".zip"
                    size = float(tokens[2])

            pardir = Path(os.getcwd()).parent
            local_file_path_and_name = pardir / cfg.gtfspath / local_file_name
            # Generate a progress bar and download
            local_file = open(local_file_path_and_name, 'wb')
            pbar = createProgressBar(size)

            # Download
            global size_iterator
            size_iterator = 0
            ftp.retrbinary(
                "RETR " + file_name_on_server,
                lambda data: file_write_update_progress_bar(
                    data, local_file, pbar))

            # Finish
            local_file.close()
            ftp.quit()
            pbar.finish()
            sys.stdout.flush()
            download_complete = True
            _log.info("Finished loading latest GTFS to: %s",
                      local_file_path_and_name)
            return local_file_name

        except ftplib.all_errors as err:
            error_code = err.args[0]
            # file not found on server
            if error_code == 2:
                _log.error("%s is not found on %s", file_name_on_server, url)
                raise err
            # Invalid URL
            if error_code == 11001:
                _log.error("URL %s is not valid", url)
                continue