def get_data(record_type, **kwargs):
    """
    Get provider data as in-memory objects.

    When a truthy `source` keyword is given, payloads are loaded from it via
    `mds.DataFile` and returned immediately. Otherwise the `client` keyword is
    required and the data is requested through `client.get()`.

    Keywords read for API requests: `client`, `start_time`, `end_time`,
    `no_paging`, `rate_limit`, `version` (defaults to DEFAULT_VERSION),
    and, for pre-0.4.0 trips only, `device_id` and `vehicle_id`.

    Raises KeyError when no `source` is given and `client` is missing.
    """
    # file source(s) win over the API
    source = kwargs.get("source")
    if source:
        print(f"Reading {record_type} from {source}")
        return mds.DataFile(record_type, source).load_payloads()

    # no default here: API requests cannot proceed without a client
    client = kwargs.pop("client")

    # the query parameters these feed depend on version and record_type
    start_time = kwargs.get("start_time")
    end_time = kwargs.get("end_time")
    version = kwargs.get("version", DEFAULT_VERSION)

    # options sent with every API request
    api_kwargs = {
        "paging": not kwargs.get("no_paging"),
        "rate_limit": kwargs.get("rate_limit"),
    }

    print(f"Requesting {record_type} from {client.provider.provider_name}")

    # nothing is reported when end_time is missing
    if end_time:
        if start_time:
            print(f"For time range: {start_time.isoformat()} to {end_time.isoformat()}")
        else:
            print(f"For time: {end_time.isoformat()}")

    if version < VERSION_040:
        # before 0.4.0: both record types are queried with a time range
        if record_type == mds.STATUS_CHANGES:
            api_kwargs.update(start_time=start_time, end_time=end_time)
        elif record_type == mds.TRIPS:
            api_kwargs.update(
                min_end_time=start_time,
                max_end_time=end_time,
                device_id=kwargs.get("device_id"),
                vehicle_id=kwargs.get("vehicle_id"),
            )
    elif record_type == mds.EVENTS:
        api_kwargs.update(start_time=start_time, end_time=end_time)
    elif record_type == mds.STATUS_CHANGES:
        # 0.4.0 and later: only end_time is sent, as event_time
        api_kwargs["event_time"] = end_time
    elif record_type == mds.TRIPS:
        api_kwargs["end_time"] = end_time
    elif record_type == mds.VEHICLES:
        # currently no special query params for vehicles
        pass

    return client.get(record_type, **api_kwargs)
def ingest(record_type, **kwargs):
    """
    Run the ingestion flow:

    1. acquire data from files or API
    2. optionally validate data, filtering invalid records
    3. optionally write data to output files
    4. optionally load valid records into the database

    Keywords consumed here (and therefore not forwarded to the database load):
    `version`, `no_validate`, `output`, `no_load`. Everything else in kwargs is
    passed to both `common.get_data` and `database.load`.
    """
    version = mds.Version(kwargs.pop("version", common.DEFAULT_VERSION))
    version.raise_if_unsupported()

    # step 1: acquire
    datasource = common.get_data(record_type, **kwargs, version=version)
    data_key = mds.Schema(record_type).data_key

    def count_records(payloads):
        # total number of records across all payloads
        return sum(len(payload["data"][data_key]) for payload in payloads)

    # step 2: validate and filter
    skip_validation = kwargs.pop("no_validate", False)
    if skip_validation:
        print("Skipping data validation")
        valid = datasource
        removed = None
    else:
        print(f"Validating {record_type} @ {version}")
        # the middle element (errors) is not used by this flow
        valid, _, removed = validation.validate(record_type, datasource, version=version)

        seen = count_records(datasource)
        passed = count_records(valid)
        failed = count_records(removed)
        print(f"{seen} records, {passed} passed, {failed} failed")

    # step 3: write to files if requested
    output = kwargs.pop("output", None)
    if output:
        data_file = mds.DataFile(record_type, output)
        data_file.dump_payloads(valid)
        if removed:
            data_file.dump_payloads(removed)

    # step 4: load to database unless disabled or there is nothing to load
    skip_load = kwargs.pop("no_load", False)
    if skip_load or len(valid) == 0:
        print("Skipping data load")
    else:
        database.load(valid, record_type, **kwargs, version=version)

    print(f"{record_type} complete")
def get_data(record_type, **kwargs):
    """
    Get provider data as in-memory objects.

    When a truthy `source` keyword is given, payloads are loaded from it via
    `mds.DataFile` and returned immediately. Otherwise `client`, `start_time`
    and `end_time` keywords are all required, and the data is requested
    through `client.get()`.

    Other keywords read: `no_paging`, `rate_limit`, `version`, and for trips
    `device_id` and `vehicle_id`.

    Raises KeyError when no `source` is given and `client`, `start_time` or
    `end_time` is missing.
    """
    source = kwargs.get("source")
    if source:
        print(f"Reading {record_type} from {source}")
        return mds.DataFile(record_type, source).load_payloads()

    # required for API calls (no defaults)
    client = kwargs.pop("client")
    start_time = kwargs.pop("start_time")
    end_time = kwargs.pop("end_time")

    version = kwargs.get("version")

    # options sent with every API request
    api_kwargs = {
        "paging": not kwargs.get("no_paging"),
        "rate_limit": kwargs.get("rate_limit"),
    }

    print(f"Requesting {record_type} from {client.provider.provider_name}")
    print(f"Time range: {start_time.isoformat()} to {end_time.isoformat()}")

    if record_type == mds.STATUS_CHANGES:
        api_kwargs.update(start_time=start_time, end_time=end_time)
    elif record_type == mds.TRIPS:
        api_kwargs.update(
            device_id=kwargs.get("device_id"),
            vehicle_id=kwargs.get("vehicle_id"),
        )
        # the names of the trips time-range parameters depend on the version
        if version < mds.Version("0.3.0"):
            lower_key, upper_key = "start_time", "end_time"
        else:
            lower_key, upper_key = "min_end_time", "max_end_time"
        api_kwargs[lower_key] = start_time
        api_kwargs[upper_key] = end_time

    return client.get(record_type, **api_kwargs)
day_status_changes, day_trips = gen.service_day( devices, date, hour_open, hour_closed, inactivity) status_changes.extend(day_status_changes) trips.extend(day_trips) date = date + datetime.timedelta(days=1) print(f"Finished day: {formatted_date} ({time.time() - t2} s)") print(f"Finished generating data ({time.time() - t1} s)") if len(status_changes) > 0 or len(trips) > 0: print("Generating data files") t1 = time.time() trips_file = mds.DataFile(mds.TRIPS, outputdir) print("Writing trips") t2 = time.time() payload = gen.make_payload(trips=trips) trips_file.dump_payloads(payload) print(f"Finished ({time.time() - t2} s)") sc_file = mds.DataFile(mds.STATUS_CHANGES, outputdir) print("Writing status_changes") t2 = time.time() payload = gen.make_payload(status_changes=status_changes)
if len(errors) > 0: print(f" Errors ({len(errors)} total)") for error in errors: print() try: for line in error.describe(): print(f" {line}") except: print(error) if args.output: print() print(f"Writing {record_type} to {args.output}") f = mds.DataFile(record_type, args.output) f.dump_payloads( original, file_name=f"{source}_{record_type}_original.json") f.dump_payloads(valid, file_name=f"{source}_{record_type}_valid.json") if len(invalid) > 0: f.dump_payloads( invalid, file_name=f"{source}_{record_type}_invalid.json") print() print(f"Finished validation ({common.count_seconds(now)}s)")