def match_station_with_google_maps():
    # Geocode stops that have no coordinates yet via the Google Maps Geocoding API.
    key = getConfig('googleMapsKey')
    gmaps = googlemaps.Client(key=key)
    all_stops = get_stops_without_location()
    for stop in all_stops:
        geocoding = gmaps.geocode(stop.stop_name)
        if geocoding is None or len(geocoding) == 0:
            print(f"couldn't find {stop.stop_name} on google maps")
        else:
            location = geocoding[0]['geometry']['location']
            lat = location['lat']
            lng = location['lng']
            stop.stop_lat = lat
            stop.stop_lon = lng
            commit()
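# getConfig() is used throughout this module but is not defined in this excerpt.
# The sketch below is only an illustration of the contract the call sites imply:
# dotted key paths ('csv.begin'), a KeyError for missing keys, and a no-argument
# call returning the whole config mapping. The file name 'config.json' and the
# helper _load_config are assumptions, not the project's actual implementation.
import json
from functools import lru_cache

@lru_cache(maxsize=1)
def _load_config(path='config.json'):
    # Read and cache the configuration file once per process.
    with open(path) as f:
        return json.load(f)

def getConfig(key=None):
    config = _load_config()
    if key is None:
        return config              # e.g. 'crawlStopOptions' in getConfig()
    value = config
    for part in key.split('.'):
        value = value[part]        # raises KeyError if the key is absent
    return value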
def crawl():
    global stop_times_to_add, finishUp, update_stops_thread, date_arr
    # Determine which CSV rows (stops) to crawl; bounds are configurable via csv.begin / csv.end.
    with open('Data/bus_stops.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        row_count = sum(1 for row in csv_reader)
        try:
            begin = int(getConfig('csv.begin')) - 1
        except KeyError:
            begin = 1
        try:
            end = int(getConfig('csv.end')) - 1
        except KeyError:
            end = row_count - 1
        csv_file.seek(0)
        stop_set = set()
        for row in skip_stop(csv_reader, begin, end):
            stop_set.add(row[0])
    try:
        max_stops_to_crawl = getConfig('batchSize')
    except:
        max_stops_to_crawl = 3
    try:
        date_arr = getConfig('dates')
    except:
        date_arr = [date_w]
    stop_list = list(stop_set)
    stop_list_deleted = False
    commit()
    get_std_date()
    load_allg_feiertage()
    # Background thread that fills in missing stop locations while crawling.
    update_stops_thread = Thread(target=location_data_thread)
    update_stops_thread.daemon = True
    update_stops_thread.start()
    try:
        if getConfig('resetDBstatus'):
            for stop in get_from_table(Stop):
                stop.crawled = False
    except:
        pass
    commit()
    new_session()
    print("started crawling", flush=True)
    count12 = 0
    while True:
        if stop_list_deleted or len(stop_list) == 0:
            # The CSV list is exhausted: fall back to uncrawled stops from the database.
            if not stop_list_deleted:
                logging.debug('deleting stop_list, starting database crawl')
                del stop_list
                stop_list_deleted = True
            if not continuesCrawling:
                break
            stop_set = set()
            for stop in (uncrawled := load_all_uncrawled_stops(max_stops_to_crawl)):
                stop_set.add(stop.stop_name)
                stop.crawled = True
            commit()
            if uncrawled is None or len(uncrawled) == 0:
                break
            to_crawl = list(stop_set)
        else:
            to_crawl = stop_list[:max_stops_to_crawl]
            stop_list = stop_list[max_stops_to_crawl:]
        routes = load_all_stops_to_crawl(to_crawl)
        stop_times_to_add = []
        # Download pages asynchronously; the results are pushed onto the queue q.
        t = Thread(target=load_data_async, args=(routes,))
        t.daemon = True
        t.start()
        commit()
        # Drain the page queue while the download thread is still running.
        while t.is_alive() or len(q) > 0:
            if len(q) == 0:
                time.sleep(0.01)
                continue
            page = q.pop()
            try:
                process_page(page.url, page.data)
            except TripAlreadyPresentError:
                pass
            except Exception as e:
                logging.error(f'load_route {page.url} {repr(e)}')
        # Insert the collected stop_times concurrently.
        stop_times_executor = ThreadPoolExecutor()
        for tree, page, current_stops_dict, trip in stop_times_to_add:
            stop_times_executor.submit(add_stop_times_from_web_page, tree, page, current_stops_dict, trip)
        stop_times_executor.shutdown(wait=True)
        commit()
        new_session()
        count12 = count12 + 1
        logging.debug(f'finished batch {count12 * max_stops_to_crawl}')
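# skip_stop() is not defined in this excerpt. Given how crawl() calls it (a
# csv.reader plus begin/end indices derived from csv.begin / csv.end), one
# plausible minimal reading is a generator that yields only the rows inside
# that window; whether the end index is inclusive is an assumption here.
from itertools import islice

def skip_stop(csv_reader, begin, end):
    # islice is 0-based and its stop bound is exclusive, hence end + 1.
    yield from islice(csv_reader, begin, end + 1)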
def export_all_tables():
    tables = [Agency, Calendar, CalendarDate, Frequency, Route, Trip, StopTime, Shape, Stop, Transfer]
    file_names = []
    os.chdir('./db')
    # Remove any leftover export files from a previous run.
    try:
        os.remove('./Archiv.zip')
    except FileNotFoundError:
        pass
    try:
        excluded_routes = getConfig('exportOptions.excludeRouteTypes')
    except:
        excluded_routes = None
    for i in tables:
        try:
            os.remove(f'./{i.__table__.name}.txt')
        except FileNotFoundError:
            pass
    # Export each table to a CSV text file; StopTime is streamed in windows because of its size.
    for i in tables:
        file_names.append(f'./{i.__table__.name}.txt')
        new_session()
        if i is StopTime:
            q = query_element(i)
            with open(f'./{i.__table__.name}.txt', 'a') as outfile:
                outcsv = csv.writer(outfile, delimiter=',')
                outcsv.writerow(i.firstline())
                for dataset in windowed_query(q, StopTime.stop_times_id, 1000):
                    outcsv.writerow(dataset.tocsv())
        else:
            with open(f'./{i.__table__.name}.txt', 'a') as outfile:
                outcsv = csv.writer(outfile, delimiter=',')
                outcsv.writerow(i.firstline())
                records = get_from_table(i)
                for row in records:
                    outcsv.writerow(row.tocsv())
        end_session()
        print(f'finished {i.__table__.name}', flush=True)
    # Optionally drop routes with excluded route_types, together with their trips and stop_times.
    if excluded_routes is not None:
        print("removing routes")
        all_routes = []
        all_trips = []
        all_stop_times = []
        deleted_route_ids = []
        deleted_trip_ids = []
        with open(f'./{Route.__table__.name}.txt', 'r') as routes:
            first_line_routes = routes.readline()
            route_type_index = Route.firstline().index('route_type')
            route_id_index = Route.firstline().index('route_id')
            csv_reader = csv.reader(routes, delimiter=',')
            for route in csv_reader:
                if route[route_type_index] != '' and int(route[route_type_index]) in excluded_routes:
                    deleted_route_ids.append(int(route[route_id_index]))
                    continue
                all_routes.append(route)
        with open(f'./{Trip.__table__.name}.txt', 'r') as trips:
            first_line_trips = trips.readline()
            route_id_of_trip_index = Trip.firstline().index('route_id')
            trip_id_index = Trip.firstline().index('trip_id')
            csv_reader = csv.reader(trips, delimiter=',')
            for trip in csv_reader:
                if int(trip[route_id_of_trip_index]) in deleted_route_ids:
                    deleted_trip_ids.append(int(trip[trip_id_index]))
                    continue
                all_trips.append(trip)
        with open(f'./{StopTime.__table__.name}.txt', 'r') as stop_times:
            first_line_stop_times = stop_times.readline()
            trip_id_of_stop_time_index = StopTime.firstline().index("trip_id")
            csv_reader = csv.reader(stop_times, delimiter=',')
            for stop_time in csv_reader:
                if int(stop_time[trip_id_of_stop_time_index]) in deleted_trip_ids:
                    continue
                all_stop_times.append(stop_time)
        # Rewrite the filtered files.
        os.remove(f'./{Route.__table__.name}.txt')
        with open(f'./{Route.__table__.name}.txt', 'a') as routes:
            routes.writelines([first_line_routes])
            outcsv = csv.writer(routes, delimiter=',')
            for row in all_routes:
                outcsv.writerow(row)
        os.remove(f'./{Trip.__table__.name}.txt')
        with open(f'./{Trip.__table__.name}.txt', 'a') as trips:
            trips.writelines([first_line_trips])
            outcsv = csv.writer(trips, delimiter=',')
            for row in all_trips:
                outcsv.writerow(row)
        os.remove(f'./{StopTime.__table__.name}.txt')
        with open(f'./{StopTime.__table__.name}.txt', 'a') as stop_times:
            stop_times.writelines([first_line_stop_times])
            outcsv = csv.writer(stop_times, delimiter=',')
            for row in all_stop_times:
                outcsv.writerow(row)
        print(f"done removing routes with type {excluded_routes}")
    # Bundle all exported files into the GTFS archive.
    with ZipFile('./Archiv.zip', 'w') as zip:
        for file in file_names:
            zip.write(file)
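# windowed_query() is not shown in this excerpt. The call site above suggests
# the usual SQLAlchemy pattern for exporting a very large table (StopTime) in
# fixed-size chunks ordered by a unique key rather than loading every row at
# once. A minimal keyset-pagination sketch, not the project's actual helper:
def windowed_query(q, column, windowsize):
    """Yield ORM objects from query q in chunks, ordered by a unique column."""
    last_value = None
    while True:
        page = q.order_by(column.asc())
        if last_value is not None:
            # Resume after the last key seen in the previous chunk.
            page = page.filter(column > last_value)
        rows = page.limit(windowsize).all()
        if not rows:
            break
        for row in rows:
            yield row
        last_value = getattr(rows[-1], column.key)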
if __name__ == "__main__":
    try:
        continuesCrawling = getConfig('continues')
    except KeyError as e:
        continuesCrawling = False
    try:
        fiona_geometry = False
        crawlStopOptions = 'crawlStopOptions' in getConfig()
        try:
            shapefile = getConfig('crawlStopOptions.shapefile')
            fiona_shape = fiona.open(shapefile)
            fiona_iteration = iter(fiona_shape)
            fiona_geometry = []
            for r in fiona_iteration:
                fiona_geometry.append(shape(r['geometry']))
            del fiona_shape
            del fiona_iteration
        except KeyError:
from Models.stop import Stop
from Models.stop_times import StopTime
from Models.trip import Trip
from Models.calendar import Calendar
from Models.calendar_date import CalendarDate
from Models.transport_type_image import TransportTypeImage
from Models.stop_time_text import StopTimeText
import sqlalchemy
import pyhash

logger = logging.getLogger(__name__)
hasher = pyhash.fnv1a_64()
lock = threading.Lock()

try:
    DATABASE_URI = 'postgres+psycopg2://' + str(getConfig('postgres'))
except KeyError:
    DATABASE_URI = 'postgres+psycopg2://postgres:password@localhost:5432/postgres'

from sqlalchemy import create_engine, and_, or_, func, literal_column, Text
from sqlalchemy.orm import sessionmaker

engine = create_engine(DATABASE_URI, executemany_mode='values')
Session = sessionmaker(bind=engine, autoflush=False, autocommit=False, expire_on_commit=True)
s = Session()
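# The crawler calls commit(), new_session() and end_session() against the
# module-level session created above. Their definitions are not part of this
# excerpt; a minimal sketch consistent with how they are used (periodic commits,
# recycling the session between batches) might look like this:
from sqlalchemy.exc import SQLAlchemyError

def commit():
    global s
    try:
        s.commit()
    except SQLAlchemyError:
        # Roll back so the session stays usable after a failed flush.
        s.rollback()
        raise

def new_session():
    # Discard the current session and start a fresh one for the next batch.
    global s
    s.close()
    s = Session()

def end_session():
    global s
    s.close()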