def get_docker_service_client():
    """
    Wait until the docker daemon service is running and return the docker service client.

    The daemon is probed by running ``docker info`` and looking for the word "Containers" in its output.
    Probing is repeated, with a short pause between attempts, for at most five minutes.

    :return: the docker service client created by ``docker.from_env()``
    :raises TimeoutError: if the daemon did not answer within five minutes
    """
    timeout_seconds = 60 * 5
    # Pause between probes so a daemon that is down is not hammered by a busy loop of sub-processes
    poll_interval_seconds = 2
    deadline = time.time() + timeout_seconds
    try:
        while True:
            if time.time() > deadline:
                raise TimeoutError("Docker daemon did not respond within %s seconds" % timeout_seconds)
            try:
                # Argument list instead of a shell string: no shell is needed for a constant command
                docker_info = subprocess.run(["docker", "info"], stdout=subprocess.PIPE)
                output = docker_info.stdout
            except OSError:
                # The docker executable could not be started - treat it as "not up yet" and keep polling
                output = b""
            if "Containers" in output.decode('utf-8', errors='replace'):
                break
            time.sleep(poll_interval_seconds)

        # Get the docker client
        return docker.from_env()
    except Exception:
        _log.error("Docker daemon service is not up")
        raise
def delete_file_from_container(container, file_name):
    """
    Delete a file from the Navitia worker container.

    The file is removed by running ``rm`` through ``/bin/sh -c`` with /srv/ed/output/ as the working directory.

    :param container: Navitia worker container
    :param file_name: the name of the file to be removed
    :return: True if the file was deleted, False if the remove command exited with a non-zero code
    """
    # NOTE(review): file_name is placed into the shell command unquoted - pass trusted names only
    delete_command = "/bin/sh -c \"rm " + file_name + "\""
    exit_code, _output = container.exec_run(cmd=delete_command, stdout=True, workdir="/srv/ed/output/")
    if exit_code != 0:
        _log.error("Couldn't delete %s graph", file_name)
        return False
    _log.info("Finished deleting %s from container %s", file_name, container.name)
    # Explicit success value, so that callers can tell success (True) from failure (False)
    return True
def get_file_from_url_http(url, file_name, file_path, _log):
    """
    Download a file over HTTP to <parent of the working directory>/file_path/file_name.

    A failed attempt is retried after sleeping for 60 minutes, up to 24 attempts in total.

    :param url: HTTP url to download from - not an FTP URL
    :param file_name: the name to give to the downloaded file
    :param file_path: the folder, relative to the parent of the working directory, to download the file to
    :param _log: the logger to report progress and errors to
    :return: None
    :raises Exception: if the file could not be downloaded in any of the attempts
    """
    # Module-level counter that is reset before every fetch
    # (presumably read by file_write_update_progress_bar - verify against that helper)
    global size_iterator

    # Preparing file for fetching
    local_file_path_and_name = Path(os.getcwd()).parent / file_path / file_name
    _log.info("Going to download the latest osm from %s to %s", url, local_file_path_and_name)
    max_download_attempts = 24
    retry_delay_seconds = 60 * 60

    for download_attempt in range(1, max_download_attempts + 1):
        # Every attempt but the first one is preceded by a sleep (the last attempt included)
        if download_attempt > 1:
            _log.error("%s is unreachable. Sleeping for 60 minutes and trying again. This is attempt %s out of "
                       "%s attempts", url, download_attempt, max_download_attempts)
            time.sleep(retry_delay_seconds)
        try:
            with requests.get(url, stream=True) as response:
                # An HTTP error page must not be saved as if it was the requested file
                response.raise_for_status()
                size = int(response.headers['Content-Length'])
                with open(local_file_path_and_name, 'wb') as file:
                    # Creating a progress bar
                    pbar = createProgressBar(size)
                    # Fetching
                    size_iterator = 0
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            file_write_update_progress_bar(chunk, file, pbar)
                    pbar.finish()
        except Exception as error:
            # Keep the retry-on-any-failure policy, but leave a trace of what went wrong
            _log.error("Download attempt %s from %s failed: %s", download_attempt, url, error)
            continue
        _log.info("Finished loading latest OSM to: %s", local_file_path_and_name)
        return

    _log.error("%s is unreachable for more than 24 hours. Aborting update", url)
    raise Exception("%s is unreachable for more than 24 hours. Aborting update" % url)
def move_current_to_past(container, source_cov_name, dest_cov_name):
    """
    Rename the Navitia graph of the source coverage to the graph name of the destination coverage, so the change
    is applied on the next re-start. The rename runs inside the container, in /srv/ed/output/.
    :param container: the worker container of Navitia
    :param source_cov_name: the name of the coverage to take the graph from (usually "default")
    :param dest_cov_name: the name of the coverage to move the graph to (e.g. "secondary-cov")
    :return: True when the graph was renamed
    :raises RuntimeError: when the rename command exits with a non-zero code
    """
    graph_suffix = ".nav.lz4"
    mv_command = "".join(["/bin/sh -c \"mv ", source_cov_name, graph_suffix, " ",
                          dest_cov_name, graph_suffix, "\""])
    exec_result = container.exec_run(cmd=mv_command, stdout=True, workdir="/srv/ed/output/")
    exit_code, _ = exec_result
    if exit_code == 0:
        _log.info("Changed the name of %s.nav.lz4 to %s.nav.lz4", source_cov_name, dest_cov_name)
        return True
    _log.error("Couldn't change %s to %s", source_cov_name, dest_cov_name)
    raise RuntimeError
def send_log_to_email(subject, message):
    """
    Send an e-mail with a user-defined subject and message, attaching the most recently created log file.

    As a side effect the working directory is changed to ~/TransitAnalystIsrael/root.

    :param subject: the subject of the e-mail
    :param message: the body of the e-mail
    :return: the result of send_email.create_msg_and_send_email (presumably whether the e-mail was sent - verify
    against that helper), or False when there is no log file to attach
    """
    # Change to root before trying to send logs
    root_path = Path.home() / "TransitAnalystIsrael" / "root"
    os.chdir(root_path.as_posix())
    logs_path = root_path / "logs"
    if not os.path.isdir(logs_path):
        _log.error("%s isn't the logs directory. Please fix log directory as in code", logs_path)
        return False

    # * means all if need specific format then *.csv
    list_of_files = glob.glob(str(logs_path / '*'))
    if not list_of_files:
        # max() of an empty list raises ValueError, so report the problem instead
        _log.error("No log files were found in %s - e-mail wasn't sent", logs_path)
        return False
    attached_file = max(list_of_files, key=os.path.getctime)
    return send_email.create_msg_and_send_email(subject, message, attached_file)
def start_navitia_w_default_and_custom_cov(secondary_custom_coverage_name, navitia_docker_compose_file_path,
                                           navitia_docker_compose_custom_file_path,
                                           navitia_docker_compose_file_name, extend_wait_time=False):
    """
    Disabled entry point for starting Navitia with both the default and a custom coverage.
    None of the arguments is used: the function only logs why it is disabled and raises.
    :param secondary_custom_coverage_name: not used
    :param navitia_docker_compose_file_path: not used
    :param navitia_docker_compose_custom_file_path: not used
    :param navitia_docker_compose_file_name: not used
    :param extend_wait_time: not used
    :raises Exception: always
    """
    disabled_reason = ("This method isn't currently used because 2 corages require server with at least 10GB RAM "
                       "available for docker.\n"
                       "Each coverage requires about 3.5 RAM when running")
    _log.error(disabled_reason)
    raise Exception
def unzip_gtfs(gtfs_zip_file_name, gtfspath, _log):
    """
    Unzip a GTFS zip file to a folder that is named after the zip file (without its extension).

    The zip file is expected at <parent of the working directory>/gtfspath/gtfs_zip_file_name and is extracted
    next to it. The destination folder is created when it doesn't exist yet.

    :param gtfs_zip_file_name: the name of the GTFS zip file, including its extension (e.g. ".zip")
    :param gtfspath: the folder of the zip file, relative to the parent of the working directory
    :param _log: the logger to report progress and errors to
    :raises FileNotFoundError: if the zip file doesn't exist
    """
    gtfs_zip_file = Path(os.getcwd()).parent / gtfspath / gtfs_zip_file_name
    if not gtfs_zip_file.is_file():
        _log.error("%s does not exist - please check correct GTFS date is configured", gtfs_zip_file_name)
        # FileNotFoundError is an Exception, so callers that catch Exception keep working
        raise FileNotFoundError("%s does not exist" % gtfs_zip_file)

    _log.info("Going to unzip %s file to %s", gtfs_zip_file_name, gtfspath)
    # Same folder as the zip file, named like the zip file with its extension removed
    dest_folder = gtfs_zip_file.with_suffix('')
    dest_folder.mkdir(parents=True, exist_ok=True)
    shutil.unpack_archive(str(gtfs_zip_file), extract_dir=str(dest_folder), format='zip')
    _log.info("Finished unzipping")
def validate_osm_gtfs_convertion_to_graph_is_completed(worker_con, time_to_wait, start_processing_time):
    """
    Wait, and then check in the logs of the Navitia worker container that the ed2nav task succeeded after the
    processing was started.

    Only the ed2nav task is looked for in the logs; osm2ed and gtfs2ed are not checked directly.

    :param worker_con: the Navitia worker container
    :param time_to_wait: time to wait before the logs are checked, in minutes
    :param start_processing_time: datetime of the processing start. It is compared with the time stamps of the
    log lines, so it must be comparable with the naive datetime parsed from them
    (presumably both are in UTC - verify against the caller)
    :return: Whether conversion is completed or not
    """
    # Wait if needed
    _log.info("Waiting %s minutes to let OSM & GTFS conversions to lz4 graph takes place", time_to_wait)
    time.sleep(time_to_wait * 60)
    _log.info("I'm back! Verifying that the conversions took place")

    # Success status look like Task tyr.binarisation.ed2nav[feac06ca-51f7-4e39-bf1d-9541eaac0988] succeeded
    success_pattern = re.compile(r'tyr\.binarisation\.ed2nav\[\S*\] succeeded')
    timestamp_pattern = re.compile(r'\d{1,4}-\d{1,2}-\d{1,2}\b \d{1,2}:\d{1,2}:\d{1,2}')

    # The logs are scanned in memory, so no scratch file is written to (and left in) the working directory
    worker_log_lines = worker_con.logs().decode('utf-8').split("\n")

    ed2nav_completed = False
    # Newest lines first
    for line in reversed(worker_log_lines):
        if not success_pattern.search(line):
            continue
        timestamp = timestamp_pattern.search(line)
        if timestamp is None:
            # A success line without a time stamp can't be dated - ignore it
            continue
        try:
            time_of_line = dt.strptime(timestamp.group(0), '%Y-%m-%d %H:%M:%S')
        except ValueError:
            # Digits that look like a time stamp but aren't a valid date
            continue
        if start_processing_time < time_of_line:
            ed2nav_completed = True
            break

    if ed2nav_completed:
        _log.info("OSM conversion task ed2nav, GTFS conversion task gtfs2ed and ed2nav are successful")
        return True
    _log.error("After %s minutes - tasks aren't completed", time_to_wait)
    return False
def validate_graph_changes_applied(coverage_name):
    """
    Check that the coverage is up-to-date: it must report a start of production date, and a heat map query
    against the time map server with the current service date must be valid.
    :param coverage_name: the name of the coverage to validate
    :return: True if the coverage is up-to-date, False otherwise
    """
    current_start_service_date = process_date.get_date_now()
    runs_on_aws = cfg.ttm_server_on == "aws_ec2"
    time_map_server_url = cfg.time_map_server_aws_url if runs_on_aws else cfg.time_map_server_local_url

    cov_sop_date = get_coverage_start_production_date(coverage_name)
    # The heat map check is made only when the coverage has a start of production date
    is_up_to_date = cov_sop_date != "" and check_prod_date_is_valid_using_heat_map(
        time_map_server_url, coverage_name, current_start_service_date)
    if is_up_to_date:
        _log.info("%s coverage is now updated with new start-of-production date %s\n."
                  "Can be accessed via %s%s",
                  coverage_name, current_start_service_date, time_map_server_url, coverage_name)
        return True

    _log.error("The %s coverage seems not to be up-to-date following update attempts."
               "\n A call for heat map data with %s date returned no results",
               coverage_name, current_start_service_date)
    return False
def copy_file_into_docker(container, dest_path, file_path, file_name):
    """
    Copy a given file to a destination folder in a Docker container.

    The file is read from <parent of the working directory>/file_path/file_name, wrapped in an in-memory tar
    archive and put into the container. The whole file is held in memory while it is copied.

    :param container: container object
    :param dest_path: destination folder path inside the container
    :param file_path: source path of the file on the host, relative to the parent of the working directory
    :param file_name: the file name to be copied
    :raises FileNotFoundError: if the source file doesn't exist or the container didn't accept the archive
    """
    _log.info("Going to copy %s to %s at %s", file_name, container.name, dest_path)

    # Read the file (the handle is closed as soon as the content is in memory)
    with open(Path(os.getcwd()).parent / file_path / file_name, 'rb') as source_file:
        file_content = source_file.read()

    try:
        # Convert to tar file
        tar_stream = BytesIO()
        with tarfile.TarFile(fileobj=tar_stream, mode='w') as file_tar:
            tarinfo = tarfile.TarInfo(name=file_name)
            tarinfo.size = len(file_content)
            file_tar.addfile(tarinfo, BytesIO(file_content))

        # Put in the container
        tar_stream.seek(0)
        success = container.put_archive(path=dest_path, data=tar_stream)
        if not success:
            raise FileNotFoundError("Couldn't put %s into %s at %s" % (file_name, container.name, dest_path))
        _log.info("Finished copying %s to %s at %s", file_name, container.name, dest_path)
    except FileNotFoundError:
        _log.error("Couldn't copy %s to %s at %s", file_name, container.name, dest_path)
        raise
def validate_auto_graph_changes_applied(coverage_name, default_coverage_name, default_cov_prev_sop_date, docker_client,
                                        navitia_docker_compose_file_path, navitia_docker_compose_file_name,
                                        navitia_docker_compose_default_file_name):
    """
    Validate the graphs of both coverages, one after the other: Navitia is re-started with the default coverage
    only and then with coverage_name only, and for each of them a heat map query with the current service date
    must be valid.
    :param coverage_name: the coverage that is checked second
    :param default_coverage_name: the coverage that is checked first
    :param default_cov_prev_sop_date: not used by this function
    :param docker_client: the docker client that is used for stopping the running containers
    :param navitia_docker_compose_file_path: path where the docker-compose files exist
    :param navitia_docker_compose_file_name: docker-compose file for starting Navitia with coverage_name
    :param navitia_docker_compose_default_file_name: docker-compose file for starting Navitia with the default
    coverage
    :return: False if one of the heat map checks failed. True if both passed, or if coverage_name has no start
    of production date (first run)
    """
    # Parsing and formatting back validates that the date is in the %Y%m%d format
    service_date = dt.strptime(process_date.get_date_now(), "%Y%m%d").strftime("%Y%m%d")
    runs_on_aws = cfg.ttm_server_on == "aws_ec2"
    time_map_server_url = cfg.time_map_server_aws_url if runs_on_aws else cfg.time_map_server_local_url

    # First check - the default coverage, running on its own
    stop_all_containers(docker_client)
    start_navitia_with_single_coverage(navitia_docker_compose_file_path, navitia_docker_compose_default_file_name,
                                       default_coverage_name, False)
    default_is_valid = check_prod_date_is_valid_using_heat_map(time_map_server_url, default_coverage_name,
                                                               service_date)
    if not default_is_valid:
        _log.error("The %s coverage seems not to be up-to-date following update attempts.", default_coverage_name)
        return False
    _log.info("%s coverage is up-to-date with production date %s", default_coverage_name, service_date)

    # Second check - coverage_name, running on its own
    stop_all_containers(docker_client)
    is_up = start_navitia_with_single_coverage(navitia_docker_compose_file_path, navitia_docker_compose_file_name,
                                               coverage_name, False)
    if not is_up:
        # Only reported - the checks below are still made
        _log.error("The %s coverage seems not to be up", coverage_name)

    if get_coverage_start_production_date(coverage_name) == "":
        _log.info("If this is the first time you're running Transit Analyst Israel data processing, you need to "
                  "copy the generated default.nav.lz4 graph to secondary-cov.nav.lz4 - See docs.")
        return True

    if check_prod_date_is_valid_using_heat_map(time_map_server_url, coverage_name, service_date):
        _log.info("%s coverage is now updated with new start-of-production date %s. "
                  "Can be accessed via %s%s", coverage_name, service_date, time_map_server_url, coverage_name)
        return True

    _log.error("The %s coverage seems not to be up-to-date following update attempts.\nA call for heat map data with"
               " %s date returned no results", coverage_name, service_date)
    return False
def process_new_data_to_current_coverage(docker_client, navitia_docker_compose_file_path,
                                         navitia_docker_compose_file_name, navitia_docker_compose_default_file_name,
                                         coverage_name, default_coverage_name, cov_eos_date,
                                         osm_file_path, osm_file_name, gtfs_file_path, gtfs_file_name, _log):
    """
    Re-start Navitia with a single coverage, copy the OSM & GTFS files to it, wait for the conversion to a graph
    to complete and validate that the changes were applied.

    The coverage that is processed depends on cfg.get_service_date: the default coverage for "auto" and
    coverage_name for "on_demand". For any other value nothing is started or copied.

    :param docker_client: the docker service client
    :param navitia_docker_compose_file_path: path where the docker-compose files exist
    :param navitia_docker_compose_file_name: docker-compose file that is used with coverage_name
    :param navitia_docker_compose_default_file_name: docker-compose file that is used with the default coverage
    :param coverage_name: the coverage that is processed in "on_demand" mode
    :param default_coverage_name: the coverage that is processed in "auto" mode
    :param cov_eos_date: passed as the third argument of utils.validate_auto_graph_changes_applied
    :param osm_file_path: folder of the OSM file, relative to the parent of the working directory
    :param osm_file_name: the name of the OSM file
    :param gtfs_file_path: folder of the GTFS file, relative to the parent of the working directory
    :param gtfs_file_name: the name of the GTFS file
    :param _log: the logger to report errors to
    :raises Exception: if the conversion isn't completed after all the waits, or the changes weren't applied
    """
    # We take the time in UTC because docker time is in UTC
    start_processing_time = datetime.datetime.utcnow()

    # Re-start Navitia docker with a single coverage only in order to process the OSM & GTFS
    utils.stop_all_containers(docker_client)
    service_mode = cfg.get_service_date
    is_auto = service_mode == "auto"
    is_on_demand = service_mode == "on_demand"
    mode_is_known = is_auto or is_on_demand
    if is_auto:
        compose_file_name, processed_coverage = navitia_docker_compose_default_file_name, default_coverage_name
    elif is_on_demand:
        compose_file_name, processed_coverage = navitia_docker_compose_file_name, coverage_name

    if mode_is_known:
        utils.start_navitia_with_single_coverage(navitia_docker_compose_file_path, compose_file_name,
                                                 processed_coverage)

    # Get the new worker container
    worker_con = docker_client.containers.list(filters={"name": "worker"})[0]

    # Copy OSM & GTFS to the input folder of the processed coverage on the worker container
    if mode_is_known:
        utils.copy_osm_and_gtfs_to_cov(worker_con, osm_file_path, osm_file_name, gtfs_file_path, gtfs_file_name,
                                       processed_coverage)

    # Validate the conversion process takes place by ensuring tyr_beat is up
    if is_auto:
        # NOTE(review): the third argument here is a file name, while the "on_demand" call passes the
        # docker-compose path at the same position - confirm against the signature of the callee
        utils.validate_osm_gtfs_convertion_to_graph_is_running(docker_client, default_coverage_name,
                                                               navitia_docker_compose_default_file_name,
                                                               navitia_docker_compose_file_name)
    elif is_on_demand:
        utils.validate_osm_gtfs_convertion_to_graph_is_running(docker_client, coverage_name,
                                                               navitia_docker_compose_file_path,
                                                               navitia_docker_compose_file_name)
    # The worker container is looked up again after the validation above
    worker_con = docker_client.containers.list(filters={"name": "worker"})[0]

    # Check for completion after waiting 40 minutes, and then up to two more times, 30 minutes apart
    success = False
    for minutes_to_wait in (40, 30, 30):
        success = utils.validate_osm_gtfs_convertion_to_graph_is_completed(worker_con, minutes_to_wait,
                                                                           start_processing_time)
        if success:
            break
    if not success:
        _log.error("After 90 minutes - tasks aren't completed - connect to server for manual inspection")
        raise Exception

    # Validate that changes are applied
    if is_auto:
        is_changes_applied = utils.validate_auto_graph_changes_applied(coverage_name, default_coverage_name,
                                                                       cov_eos_date, docker_client,
                                                                       navitia_docker_compose_file_path,
                                                                       navitia_docker_compose_file_name,
                                                                       navitia_docker_compose_default_file_name)
    elif is_on_demand:
        is_changes_applied = utils.validate_graph_changes_applied(coverage_name)
    else:
        is_changes_applied = True
    if not is_changes_applied:
        raise Exception

    # If it's up - delete the old gtfs and osm files - only from AWS machines
    if utils.is_aws_machine():
        for folder, name in ((osm_file_path, osm_file_name), (gtfs_file_path, gtfs_file_name)):
            utils.delete_file_from_host(Path(os.getcwd()).parent / folder / name)
utils.delete_file_from_host(Path(os.getcwd()).parent / osm_file_path / osm_file_name) utils.delete_file_from_host(Path(os.getcwd()).parent / gtfs_file_path / gtfs_file_name) # config variables to be moved to config-file downstrem default_coverage_name, coverage_name, navitia_docker_compose_file_path, navitia_docker_compose_file_name, \ navitia_docker_compose_default_file_name, gtfs_file_path, gtfs_zip_file_name = utils.get_config_params() try: # Get the docker service client docker_client = utils.get_docker_service_client() containers = docker_client.containers.list(filters={"name": "worker"}) if len(containers) == 0: _log.error("Navitia docker containers are down, bringing them up with default coverage for processing") utils.start_navitia_with_single_coverage(navitia_docker_compose_file_path, navitia_docker_compose_default_file_name, default_coverage_name) containers = docker_client.containers.list(filters={"name": "worker"}) # Get the worker container worker_con = containers[0] default_cov_sop_date = "" # For production env. we have default coverage and secondary-cov coverage so back up is needed if cfg.get_service_date == "auto": # Get the current start of production dates of default coverage for post-processing comparison if utils.is_cov_exists(worker_con, default_coverage_name): default_cov_sop_date = utils.get_coverage_start_production_date(default_coverage_name)
def main(gtfsdate, gtfsparentpath, gtfsdirbase, pathout): # input: parent_path = cwd.parent / gtfsparentpath gtfsdir = gtfsdirbase + gtfsdate txtfilein = '' # output: gtfspathout = cwd.parent / pathout / gtfsdir txtfileout = '' gtfspathin = parent_path / gtfsdir gtfspath = gtfspathin # >>> load routes file routes_count = 0 txtfilein = 'routes.txt' routes_dict = {} with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f: reader = csv.reader(f) header = next( reader ) # [route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_color] #print(header) for row in reader: #print row routes_count += 1 routes_dict[row[0]] = [row[1]] # 'route_id' : ['agency_id'] #print routes_dict[:4] print('routes_dict loaded. routes count ', len(routes_dict)) # >>> load trips file trips_count = 0 trips_header_trip_headsign_missing = False txtfilein = 'trips.txt' trips_dict = {} with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f: reader = csv.reader(f) header = next( reader ) # [route_id,service_id,trip_id,trip_headsign,direction_id,shape_id] if len(header ) == 5 and header[3] == 'direction_id': # trip_headsign missing trips_header_trip_headsign_missing = True print('trip_headsign missing') print(header) #print(header) for row in reader: #print(row) trips_count += 1 if trips_header_trip_headsign_missing: trips_dict[row[2]] = [ row[0], row[1], row[4] ] # 'trip_id' : ['route_id','service_id','shape_id'] else: trips_dict[row[2]] = [ row[0], row[1], row[5] ] # 'trip_id' : ['route_id','service_id','shape_id'] #print trips_dict[:4] print('trips_dict loaded. 
trips count ', len(trips_dict)) # >>> load stop_times file stop_times_count = 0 txtfilein = 'stop_times.txt' stop_times_trips_set = set([]) stop_times_stops_set = set([]) with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f: reader = csv.reader(f) header = next( reader ) # [trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled] #print(header) for row in reader: #print row stop_times_count += 1 stop_times_trips_set.add(row[0]) # trip_id stop_times_stops_set.add(row[3]) # stop_id print('stop_times_trips loaded. trips count ', len(stop_times_trips_set)) print('stop_times_stops loaded. stops count ', len(stop_times_stops_set)) # >>> load stops file stops_count = 0 txtfilein = 'stops.txt' stops_dict = {} with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f: reader = csv.reader(f) header = next( reader ) # ['stop_id', 'stop_code', 'stop_name', 'stop_desc', 'stop_lat', 'stop_lon', 'location_type', 'parent_station', 'zone_id'] #print(header) for row in reader: #print row stops_count += 1 stops_dict[row[0]] = [ row[2], row[3], row[4], row[5] ] # 'stop_id' : ['stop_name', 'stop_desc', 'stop_lat', 'stop_lon'] #print stops_dict[row[0]] # last one print('stops_dict loaded. 
stop count ', len(stops_dict)) # >>> load agency file agency_count = 0 txtfilein = 'agency.txt' agency_dict = {} agency_name_problem_count = 0 with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f: reader = csv.reader(f) header = next( reader ) # agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url print(header) for row in reader: print(row) agency_count += 1 agency_name = row[1] agency_name_clean = agency_name.replace('\"', '').replace("\'", "") if agency_name != agency_name_clean: print('agency name problem: ', agency_name, agency_name_clean) agency_name_problem_count += 1 row[1] = agency_name_clean # patch agency name for dict, later it will be written to file agency_dict[row[0]] = [ row[1], row[2], row[3], row[4], row[5], row[6] ] # 'agency_id': ['agency_name','agency_url','agency_timezone','agency_lang','agency_phone','agency_fare_url'] #print agency_dict[row[0]] # last one print('agency_dict loaded. agency count ', len(agency_dict)) print('agency_name_problem_count : ', agency_name_problem_count) # >>> load shapes file. Actually loads only one point per shape!!! used only as a set of shape_ids shapes_count = 0 txtfilein = 'shapes.txt' shapes_dict = {} with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f: reader = csv.reader(f) header = next( reader) # shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence #print(header) for row in reader: #print row shapes_count += 1 shapes_dict[row[0]] = [ row[1], row[2] ] # 'shape_id' : ['shape_pt_lat','shape_pt_lon'] #print shapes_dict[row[0]] # last one print('shapes_dict loaded. 
shape count ', len(shapes_dict)) # >>> load calendar file calendar_count = 0 txtfilein = 'calendar.txt' calendar_dict = {} with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f: reader = csv.reader(f) header = next( reader ) # service_id,sunday,monday,tuesday,wednesday,thursday,friday,saturday,start_date,end_date #print(header) for row in reader: #print row calendar_count += 1 calendar_dict[row[0]] = [ row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8], row[9] ] # ['service_id' : [ 'sunday','monday','tuesday','wednesday','thursday','friday','saturday','start_date','end_date'] #print calendar_dict[row[0]] # last one print('calendar_dict loaded. calendar count ', len(calendar_dict)) # >>> process loaded files # check MAX limits on file line count if stops_count > gtfs_config.MAX_STOPS_COUNT: print('need to abort') _log.error('MAX GTFS line count exceeded') raise Exception if stop_times_count > gtfs_config.MAX_STOP_TIMES_COUNT: print('need to abort') _log.error('MAX GTFS line count exceeded') raise Exception if trips_count > gtfs_config.MAX_TRIPS_COUNT: print('need to abort') _log.error('MAX GTFS line count exceeded') raise Exception if shapes_count > gtfs_config.MAX_SHAPES_COUNT: print('need to abort') _log.error('MAX GTFS line count exceeded') raise Exception if routes_count > gtfs_config.MAX_ROUTES_COUNT: print('need to abort') _log.error('MAX GTFS line count exceeded') raise Exception if agency_count > gtfs_config.MAX_AGENCY_COUNT: print('need to abort') _log.error('MAX GTFS line count exceeded') raise Exception if calendar_count > gtfs_config.MAX_CALENDAR_COUNT: print('need to abort') _log.error('MAX GTFS line count exceeded') raise Exception # >>> process calendar - check that GTFS start date in calendar.txt is as expected - gtfsdate service_ok_count = 0 service_problem_count = 0 service_problem_list = [] min_service_date = '21190101' for service_id, [ sunday, monday, tuesday, wednesday, thursday, friday, saturday, start_date, end_date 
] in calendar_dict.items(): min_service_date = min(min_service_date, start_date, end_date) if start_date >= gtfsdate and end_date >= gtfsdate: service_ok_count += 1 else: service_problem_count += 1 print( 'service_problem date before expected start date: start_date, end_date, gtfsdate ', service_id, start_date, end_date, gtfsdate) service_problem_list.append( [service_id, start_date, end_date, gtfsdate]) print('service_ok_count : ', service_ok_count) print('service_problem_count : ', service_problem_count) if min_service_date != gtfsdate: # problem print( 'GTFS file start date in calendar.txt is not the same as expected start date : ', min_service_date, gtfsdate) print(cfg.patch_calendar) if cfg.patch_calendar == 'yes': # problem and patch print('patch_calendar') # >>> open and prep output txt file txtfileout = 'calendar.txt' print('open file ', gtfspathout / txtfileout) fileout = open(gtfspathout / txtfileout, 'w', encoding="utf8") # save results in file postsline = 'service_id,sunday,monday,tuesday,wednesday,thursday,friday,saturday,start_date,end_date\n' print(postsline) fileout.write(postsline) outfilelinecount = 0 for service_id, [ sunday, monday, tuesday, wednesday, thursday, friday, saturday, start_date, end_date ] in calendar_dict.items(): if end_date >= gtfsdate: # good if start_date >= gtfsdate: # good pass else: # *** start date problem *** fix entry to start on gtfsdate print( 'service_problem date before expected start date: start_date, end_date, gtfsdate ', service_id, start_date, end_date, gtfsdate) start_date = gtfsdate # output entry postsline = ','.join([ service_id, sunday, monday, tuesday, wednesday, thursday, friday, saturday, start_date, end_date ]) + '\n' fileout.write(postsline) outfilelinecount += 1 else: # *** end date problem *** skip entry print( 'end date before GTFS date - erase (skip) this calendar entry', end_date, gtfsdate) fileout.close() print('close file ', gtfspathout / txtfileout) print('lines in out file count ', 
outfilelinecount) else: # problem and no patch print('need to abort') _log.error( 'GTFS file start date in calendar.txt is not the same as expected start date : %s %s', min_service_date, gtfsdate) raise Exception raise Exception # >>> process routes routes_agency_id_ok_count = 0 routes_agency_id_problem_count = 0 routes_agency_id_problem_list = [] agencies_referenced_set = set([]) for route_id, [agency_id] in routes_dict.items(): if agency_id in agency_dict: routes_agency_id_ok_count += 1 agencies_referenced_set.add(agency_id) else: routes_agency_id_problem_count += 1 print('routes_agency_id_problem : ', route_id, agency_id) routes_agency_id_problem_list.append(route_id) print('routes_agency_id_ok_count : ', routes_agency_id_ok_count) print('routes_agency_id_problem_count : ', routes_agency_id_problem_count) print('agencies_referenced_count : ', len(agencies_referenced_set)) print('agencies_referenced_set : ', agencies_referenced_set) # >>> process trips trips_service_id_ok_count = 0 trips_service_id_problem_count = 0 trips_service_id_problem_list = [] trips_shape_id_ok_count = 0 trips_shape_id_problem_count = 0 trips_shape_id_problem_list = [] trips_route_id_ok_count = 0 trips_route_id_problem_count = 0 trips_route_id_problem_list = [] for trip_id, [route_id, service_id, shape_id] in trips_dict.items(): if route_id in routes_dict: trips_route_id_ok_count += 1 else: trips_route_id_problem_count += 1 print('trips_route_id_problem : ', trip_id, route_id) trips_route_id_problem_list.append(trip_id) if shape_id in shapes_dict: trips_shape_id_ok_count += 1 else: trips_shape_id_problem_count += 1 print('trips_shape_id_problem : ', trip_id, shape_id) #print(shapes_dict.keys()) trips_shape_id_problem_list.append(trip_id) if service_id in calendar_dict: trips_service_id_ok_count += 1 else: trips_service_id_problem_count += 1 print('trips_service_id_problem : ', trip_id, service_id) trips_service_id_problem_list.append(trip_id) print('trips_service_id_ok_count : ', 
trips_service_id_ok_count) print('trips_service_id_problem_count : ', trips_service_id_problem_count) print('trips_shape_id_ok_count : ', trips_shape_id_ok_count) print('trips_shape_id_problem_count : ', trips_shape_id_problem_count) print('trips_route_id_ok_count : ', trips_route_id_ok_count) print('trips_route_id_problem_count : ', trips_route_id_problem_count) # >>> process stop_times stoptimes_trip_id_ok_count = 0 stoptimes_trip_id_problem_count = 0 stoptimes_trip_id_problem_list = [] stoptimes_stop_id_ok_count = 0 stoptimes_stop_id_problem_count = 0 stoptimes_stop_id_problem_list = [] for trip_id in stop_times_trips_set: if trip_id in trips_dict: stoptimes_trip_id_ok_count += 1 else: stoptimes_trip_id_problem_count += 1 print('stoptimes_trip_id_problem : ', trip_id) stoptimes_trip_id_problem_list.append(trip_id) for stop_id in stop_times_stops_set: if stop_id in stops_dict: stoptimes_stop_id_ok_count += 1 else: stoptimes_stop_id_problem_count += 1 print('stoptimes_stop_id_problem : ', stop_id) stoptimes_stop_id_problem_list.append(stop_id) print('stoptimes_trip_id_ok_count : ', stoptimes_trip_id_ok_count) print('stoptimes_trip_id_problem_count : ', stoptimes_trip_id_problem_count) print('stoptimes_stop_id_ok_count : ', stoptimes_stop_id_ok_count) print('stoptimes_stop_id_problem_count : ', stoptimes_stop_id_problem_count) # >>> patch problem files if agency_name_problem_count > 0: # patch agency names, in case they include " or ' in the name (happened in GTFS file of 20190901) # >>> open and prep output txt file txtfileout = 'agency.txt' print('open file ', gtfspathout / txtfileout) fileout = open(gtfspathout / txtfileout, 'w', encoding="utf8") # save results in file postsline = 'agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url\n' print(postsline) fileout.write(postsline) outfilelinecount = 0 for agency_id, [ agency_name, agency_url, agency_timezone, agency_lang, agency_phone, agency_fare_url ] in agency_dict.items(): 
postsline = ','.join([ agency_id, agency_name, agency_url, agency_timezone, agency_lang, agency_phone, agency_fare_url ]) + '\n' fileout.write(postsline) outfilelinecount += 1 fileout.close() print('close file ', gtfspathout / txtfileout) print('lines in out file count ', outfilelinecount) if trips_header_trip_headsign_missing: print('trips_header_trip_headsign_missing') # add dummy '' trip_headsign # load full trips.txt file then apply the patch while writing back. # >>> load trips file txtfilein = 'trips.txt' trips_full_list = [] with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f: reader = csv.reader(f) header = next( reader) # [route_id,service_id,trip_id,direction_id,shape_id] #print(header) for row in reader: #print row trips_full_list.append([ row[0], row[1], row[2], row[3], row[4] ]) # [route_id,service_id,trip_id,direction_id,shape_id] print('trips_full_list loaded. trips count ', len(trips_full_list)) # >>> open and prep output txt file txtfileout = 'trips.txt' print('open file ', gtfspathout / txtfileout) fileout = open(gtfspathout / txtfileout, 'w', encoding="utf8") # save results in file postsline = 'route_id,service_id,trip_id,trip_headsign,direction_id,shape_id\n' print(postsline) fileout.write(postsline) outfilelinecount = 0 trip_headsign = '' for [route_id, service_id, trip_id, direction_id, shape_id] in trips_full_list: postsline = ','.join([ route_id, service_id, trip_id, trip_headsign, direction_id, shape_id ]) + '\n' fileout.write(postsline) outfilelinecount += 1 fileout.close() print('close file ', gtfspathout / txtfileout) print('lines in out file count ', outfilelinecount) if routes_agency_id_problem_count != 0: print('routes_agency_id_problem_count : ', routes_agency_id_problem_count) # erase routes if agency_id referenced is missing from agency.txt or add unknown agency to agency.txt with the missing id... 
# for now leaving as is if trips_service_id_problem_count != 0: print('trips_service_id_problem_count : ', trips_service_id_problem_count) # erase trips if service_id referenced is missing from calendar.txt or add empty service record to calendar.txt with the missing id... # for now doing the first - but checking that the erased trip will not be missed # load full trips.txt file then apply the patch while writing back. # >>> load trips file txtfilein = 'trips.txt' trips_full_list = [] with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f: reader = csv.reader(f) header = next( reader ) # [route_id,service_id,trip_id,trip_headsign,direction_id,shape_id] #print(header) for row in reader: #print row trips_full_list.append( [row[0], row[1], row[2], row[3], row[4], row[5]] ) # [route_id,service_id,trip_id,trip_headsign,direction_id,shape_id] print('trips_full_list loaded. trips count ', len(trips_full_list)) # >>> open and prep output txt file txtfileout = 'trips.txt' print('open file ', gtfspathout / txtfileout) fileout = open(gtfspathout / txtfileout, 'w', encoding="utf8") # save results in file postsline = 'route_id,service_id,trip_id,trip_headsign,direction_id,shape_id\n' print(postsline) fileout.write(postsline) outfilelinecount = 0 for [ route_id, service_id, trip_id, trip_headsign, direction_id, shape_id ] in trips_full_list: if trip_id in trips_service_id_problem_list: print('trips_service_id_problem : ', trip_id, service_id) print('erasing trip_id from trips.txt') # check if this trip that we are erasing will be missed if trip_id in stop_times_trips_set: print( 'ooops **************** erased a trip that is referenced in stoptimes.txt : ', trip_id) else: postsline = ','.join([ route_id, service_id, trip_id, trip_headsign, direction_id, shape_id ]) + '\n' fileout.write(postsline) outfilelinecount += 1 fileout.close() print('close file ', gtfspathout / txtfileout) print('lines in out file count ', outfilelinecount) if trips_shape_id_problem_count != 
0: print('trips_shape_id_problem_count : ', trips_shape_id_problem_count) # if shape_id == "" then create shape from sequence of stops and add to shapes.txt with the newly created id... # for now leaving as is if trips_route_id_problem_count != 0: print('trips_route_id_problem_count : ', trips_route_id_problem_count) # erase trips if route_id referenced is missing from routes.txt or add unknown route to route.txt with the missing id... # for now doing the first - but checking that the erased trip will not be missed # load full trips.txt file then apply the patch while writing back. # >>> load trips file txtfilein = 'trips.txt' trips_full_list = [] with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f: reader = csv.reader(f) header = next( reader ) # [route_id,service_id,trip_id,trip_headsign,direction_id,shape_id] #print(header) for row in reader: #print row trips_full_list.append( [row[0], row[1], row[2], row[3], row[4], row[5]] ) # [route_id,service_id,trip_id,trip_headsign,direction_id,shape_id] print('trips_full_list loaded. 
trips count ', len(trips_full_list)) # >>> open and prep output txt file txtfileout = 'trips.txt' print('open file ', gtfspathout / txtfileout) fileout = open(gtfspathout / txtfileout, 'w', encoding="utf8") # save results in file postsline = 'route_id,service_id,trip_id,trip_headsign,direction_id,shape_id\n' print(postsline) fileout.write(postsline) outfilelinecount = 0 for [ route_id, service_id, trip_id, trip_headsign, direction_id, shape_id ] in trips_full_list: if trip_id in trips_route_id_problem_list: print('trips_route_id_problem : ', trip_id, route_id) print('erasing trip_id from trips.txt') # check if this trip that we are erasing will be missed if trip_id in stop_times_trips_set: print( 'ooops **************** erased a trip that is referenced in stoptimes.txt : ', trip_id) else: postsline = ','.join([ route_id, service_id, trip_id, trip_headsign, direction_id, shape_id ]) + '\n' fileout.write(postsline) outfilelinecount += 1 fileout.close() print('close file ', gtfspathout / txtfileout) print('lines in out file count ', outfilelinecount) if (stoptimes_trip_id_problem_count != 0) and (patch_stoptimes_trip_id == 'yes'): print('stoptimes_trip_id_problem_count : ', stoptimes_trip_id_problem_count) # erase stoptimes if trip_id referenced is missing from trips.txt or add dummy trip... # for now doing the first # load full stop_times.txt file then apply the patch while writing back. # **** takes too long - replace with pandas code # >>> load stop_times file txtfilein = 'stop_times.txt' stop_times_full_list = [] with open(gtfspathin / txtfilein, newline='', encoding="utf8") as f: reader = csv.reader(f) header = next( reader ) # [trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled] #print(header) for row in reader: #print row stop_times_full_list.append(row) print('stop_times_full_list loaded. 
count ', len(stop_times_full_list)) # >>> open and prep output txt file txtfileout = 'stop_times.txt' print('open file ', gtfspathout / txtfileout) fileout = open(gtfspathout / txtfileout, 'w', encoding="utf8") # save results in file postsline = 'trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled\n' print(postsline) fileout.write(postsline) outfilelinecount = 0 for [ trip_id, arrival_time, departure_time, stop_id, stop_sequence, pickup_type, drop_off_type, shape_dist_traveled ] in stop_times_full_list: if trip_id in stoptimes_trip_id_problem_list: print( 'stoptimes_trip_id_problem, erasing stop_time from stop_times.txt : ', trip_id) else: postsline = ','.join([ trip_id, arrival_time, departure_time, stop_id, stop_sequence, pickup_type, drop_off_type, shape_dist_traveled ]) + '\n' fileout.write(postsline) outfilelinecount += 1 fileout.close() print('close file ', gtfspathout / txtfileout) print('lines in out file count ', outfilelinecount) if stoptimes_stop_id_problem_count != 0: print('stoptimes_stop_id_problem_count : ', stoptimes_stop_id_problem_count) # erase stoptimes if stop_id referenced is missing from stops.txt # for now leaving as is print( '=============================================================================' )
def get_gtfs_file_from_url_ftp(url, file_name_on_server, _log):
    """
    Downloads a GTFS file from an FTP server.

    The file is written to <parent of the current working directory> / cfg.gtfspath.
    Up to 24 attempts are made, sleeping 60 minutes before every retry.

    :param url: the FTP server URL that points to file's containing folder
    :param file_name_on_server: The file name on the FTP server
    :param _log: logger-like object used for progress and error messages
    :return: file name (not the full path) of the downloaded content
    :raises Exception: when every download attempt has failed
    :raises ftplib.all_errors: the original error is re-raised when its first argument equals 2
    """
    # Module-level progress counter, reset before each download.
    # NOTE(review): presumably read by file_write_update_progress_bar - confirm.
    global size_iterator

    max_download_attempts = 24
    retry_delay_seconds = 60 * 60

    _log.info("Going to download the latest GTFS from %s ", url)

    for attempt in range(1, max_download_attempts + 1):
        if attempt > 1:
            # Every retry (including the last one) waits before hitting the server again
            _log.error(
                "%s is unreachable. Sleeping for 60 minutes and trying again. This is attempt %s out of "
                "%s attempts", url, attempt, max_download_attempts)
            time.sleep(retry_delay_seconds)

        try:
            # The context manager sends QUIT / closes the socket even when the download fails
            with ftplib.FTP(url) as ftp:
                ftp.login()

                listing = []
                ftp.dir("", listing.append)

                # Generate the local file name, e.g. "israel20190225.zip", and look up the
                # remote size for the progress bar.
                # NOTE(review): assumes the 3rd and 4th whitespace-separated columns of a
                # listing line are size and name - confirm against the server's LIST format.
                processdate = process_date.get_date_now()
                local_file_name = cfg.gtfsdirbase
                size = 0
                for line in listing:
                    tokens = line.split(maxsplit=4)
                    # Skip short lines (e.g. summary lines) instead of failing on tokens[3]
                    if len(tokens) > 3 and tokens[3] == file_name_on_server:
                        local_file_name = cfg.gtfsdirbase + processdate + ".zip"
                        size = float(tokens[2])
                        break

                local_file_path_and_name = Path(os.getcwd()).parent / cfg.gtfspath / local_file_name

                # Download; the file handle is released even if the transfer is interrupted
                with open(local_file_path_and_name, 'wb') as local_file:
                    pbar = createProgressBar(size)
                    size_iterator = 0
                    ftp.retrbinary(
                        "RETR " + file_name_on_server,
                        lambda data: file_write_update_progress_bar(data, local_file, pbar))

                # Finish
                pbar.finish()
                sys.stdout.flush()
                _log.info("Finished loading latest GTFS to: %s", local_file_path_and_name)
                return local_file_name

        except ftplib.all_errors as err:
            # Some errors (e.g. EOFError on a dropped connection) carry no arguments
            error_code = err.args[0] if err.args else None
            # file not found on server
            if error_code == 2:
                _log.error("%s is not found on %s", file_name_on_server, url)
                raise
            # Invalid URL
            if error_code == 11001:
                _log.error("URL %s is not valid", url)
            else:
                # Keep the cause visible instead of silently retrying
                _log.error("Download attempt %s from %s failed: %s", attempt, url, err)

    _log.error("%s is unreachable for more than 24 hours. Aborting update", url)
    raise Exception("%s is unreachable for more than 24 hours. Aborting update" % url)