def plannerd_thread(sm=None, pm=None): config_realtime_process(5, Priority.CTRL_LOW) cloudlog.info("plannerd is waiting for CarParams") params = Params() CP = car.CarParams.from_bytes(params.get("CarParams", block=True)) cloudlog.info("plannerd got CarParams: %s", CP.carName) use_lanelines = False wide_camera = params.get_bool('WideCameraOnly') cloudlog.event("e2e mode", on=use_lanelines) longitudinal_planner = Planner(CP) lateral_planner = LateralPlanner(use_lanelines=use_lanelines, wide_camera=wide_camera) if sm is None: sm = messaging.SubMaster( ['carState', 'controlsState', 'radarState', 'modelV2'], poll=['radarState', 'modelV2'], ignore_avg_freq=['radarState']) if pm is None: pm = messaging.PubMaster(['longitudinalPlan', 'lateralPlan']) while True: sm.update() if sm.updated['modelV2']: lateral_planner.update(sm) lateral_planner.publish(sm, pm) longitudinal_planner.update(sm) longitudinal_planner.publish(sm, pm)
def finalize_update(wait_helper: WaitTimeHelper) -> None: """Take the current OverlayFS merged view and finalize a copy outside of OverlayFS, ready to be swapped-in at BASEDIR. Copy using shutil.copytree""" # Remove the update ready flag and any old updates cloudlog.info("creating finalized version of the overlay") set_consistent_flag(False) # Copy the merged overlay view and set the update ready flag if os.path.exists(FINALIZED): shutil.rmtree(FINALIZED) shutil.copytree(OVERLAY_MERGED, FINALIZED, symlinks=True) run(["git", "reset", "--hard"], FINALIZED) run(["git", "submodule", "foreach", "--recursive", "git", "reset"], FINALIZED) cloudlog.info("Starting git gc") t = time.monotonic() try: run(["git", "gc"], FINALIZED) cloudlog.event("Done git gc", duration=time.monotonic() - t) except subprocess.CalledProcessError: cloudlog.exception(f"Failed git gc, took {time.monotonic() - t:.3f} s") if wait_helper.shutdown: cloudlog.info("got interrupted finalizing overlay") else: set_consistent_flag(True) cloudlog.info("done finalizing overlay")
def _do_upload(upload_item, callback=None): path = upload_item.path compress = False # If file does not exist, but does exist without the .bz2 extension we will compress on the fly if not os.path.exists(path) and os.path.exists(strip_bz2_extension(path)): path = strip_bz2_extension(path) compress = True with open(path, "rb") as f: if compress: cloudlog.event("athena.upload_handler.compress", fn=path, fn_orig=upload_item.path) data = bz2.compress(f.read()) size = len(data) data = io.BytesIO(data) else: size = os.fstat(f.fileno()).st_size data = f if callback: data = CallbackReader(data, callback, size) return requests.put(upload_item.url, data=data, headers={ **upload_item.headers, 'Content-Length': str(size) }, timeout=30)
def upload(self, name, key, fn, network_type, metered): try: sz = os.path.getsize(fn) except OSError: cloudlog.exception("upload: getsize failed") return False cloudlog.event("upload_start", key=key, fn=fn, sz=sz, network_type=network_type, metered=metered) if sz == 0: # tag files of 0 size as uploaded success = True elif name in self.immediate_priority and sz > UPLOAD_QLOG_QCAM_MAX_SIZE: cloudlog.event("uploader_too_large", key=key, fn=fn, sz=sz) success = True else: start_time = time.monotonic() stat = self.normal_upload(key, fn) if stat is not None and stat.status_code in (200, 201, 401, 403, 412): self.last_filename = fn self.last_time = time.monotonic() - start_time self.last_speed = (sz / 1e6) / self.last_time success = True cloudlog.event("upload_success" if stat.status_code != 412 else "upload_ignored", key=key, fn=fn, sz=sz, network_type=network_type, metered=metered) else: success = False cloudlog.event("upload_failed", stat=stat, exc=self.last_exc, key=key, fn=fn, sz=sz, network_type=network_type, metered=metered) if success: # tag file as uploaded try: setxattr(fn, UPLOAD_ATTR_NAME, UPLOAD_ATTR_VALUE) except OSError: cloudlog.event("uploader_setxattr_failed", exc=self.last_exc, key=key, fn=fn, sz=sz) return success
def main(): try: set_core_affinity([0, 1, 2, 3]) except Exception: cloudlog.exception("failed to set core affinity") params = Params() dongle_id = params.get("DongleId", encoding='utf-8') UploadQueueCache.initialize(upload_queue) ws_uri = ATHENA_HOST + "/ws/v2/" + dongle_id api = Api(dongle_id) conn_retries = 0 while 1: try: cloudlog.event("athenad.main.connecting_ws", ws_uri=ws_uri) ws = create_connection(ws_uri, cookie="jwt=" + api.get_token(), enable_multithread=True, timeout=30.0) cloudlog.event("athenad.main.connected_ws", ws_uri=ws_uri) conn_retries = 0 cur_upload_items.clear() handle_long_poll(ws) except (KeyboardInterrupt, SystemExit): break except (ConnectionError, TimeoutError, WebSocketException): conn_retries += 1 params.delete("LastAthenaPingTime") except socket.timeout: params.delete("LastAthenaPingTime") except Exception: cloudlog.exception("athenad.main.exception") conn_retries += 1 params.delete("LastAthenaPingTime") time.sleep(backoff(conn_retries))
def list_upload_files(self): if not os.path.isdir(self.root): return self.immediate_size = 0 self.immediate_count = 0 for logname in listdir_by_creation(self.root): path = os.path.join(self.root, logname) try: names = os.listdir(path) except OSError: continue if any(name.endswith(".lock") for name in names): continue for name in sorted(names, key=self.get_upload_sort): key = os.path.join(logname, name) fn = os.path.join(path, name) # skip files already uploaded try: is_uploaded = getxattr(fn, UPLOAD_ATTR_NAME) except OSError: cloudlog.event("uploader_getxattr_failed", exc=self.last_exc, key=key, fn=fn) is_uploaded = True # deleter could have deleted if is_uploaded: continue try: if name in self.immediate_priority: self.immediate_count += 1 self.immediate_size += os.path.getsize(fn) except OSError: pass yield (name, key, fn)
def main() -> NoReturn: dongle_id = Params().get("DongleId", encoding='utf-8') def get_influxdb_line(measurement: str, value: Union[float, Dict[str, float]], timestamp: datetime, tags: dict) -> str: res = f"{measurement}" for k, v in tags.items(): res += f",{k}={str(v)}" res += " " if isinstance(value, float): value = {'value': value} for k, v in value.items(): res += f"{k}={v}," res += f"dongle_id=\"{dongle_id}\" {int(timestamp.timestamp() * 1e9)}\n" return res # open statistics socket ctx = zmq.Context().instance() sock = ctx.socket(zmq.PULL) sock.bind(STATS_SOCKET) # initialize stats directory Path(STATS_DIR).mkdir(parents=True, exist_ok=True) # initialize tags tags = { 'started': False, 'version': get_short_version(), 'branch': get_short_branch(), 'dirty': is_dirty(), 'origin': get_normalized_origin(), 'deviceType': HARDWARE.get_device_type(), } # subscribe to deviceState for started state sm = SubMaster(['deviceState']) idx = 0 last_flush_time = time.monotonic() gauges = {} samples: Dict[str, List[float]] = defaultdict(list) while True: started_prev = sm['deviceState'].started sm.update() # Update metrics while True: try: metric = sock.recv_string(zmq.NOBLOCK) try: metric_type = metric.split('|')[1] metric_name = metric.split(':')[0] metric_value = float(metric.split('|')[0].split(':')[1]) if metric_type == METRIC_TYPE.GAUGE: gauges[metric_name] = metric_value elif metric_type == METRIC_TYPE.SAMPLE: samples[metric_name].append(metric_value) else: cloudlog.event("unknown metric type", metric_type=metric_type) except Exception: cloudlog.event("malformed metric", metric=metric) except zmq.error.Again: break # flush when started state changes or after FLUSH_TIME_S if (time.monotonic() > last_flush_time + STATS_FLUSH_TIME_S) or (sm['deviceState'].started != started_prev): result = "" current_time = datetime.utcnow().replace(tzinfo=timezone.utc) tags['started'] = sm['deviceState'].started for key, value in gauges.items(): result += get_influxdb_line(f"gauge.{key}", value, current_time, tags) for key, values in samples.items(): values.sort() sample_count = len(values) sample_sum = sum(values) stats = { 'count': sample_count, 'min': values[0], 'max': values[-1], 'mean': sample_sum / sample_count, } for percentile in [0.05, 0.5, 0.95]: value = values[int(round(percentile * (sample_count - 1)))] stats[f"p{int(percentile * 100)}"] = value result += get_influxdb_line(f"sample.{key}", stats, current_time, tags) # clear intermediate data gauges.clear() samples.clear() last_flush_time = time.monotonic() # check that we aren't filling up the drive if len(os.listdir(STATS_DIR)) < STATS_DIR_FILE_LIMIT: if len(result) > 0: stats_path = os.path.join(STATS_DIR, f"{current_time.timestamp():.0f}_{idx}") with atomic_write_in_dir(stats_path) as f: f.write(result) idx += 1 else: cloudlog.error("stats dir full")
def update_events(self, CS): """Compute carEvents from carState""" self.events.clear() # Add startup event if self.startup_event is not None: self.events.add(self.startup_event) self.startup_event = None # Don't add any more events if not initialized if not self.initialized: self.events.add(EventName.controlsInitializing) return # Disable on rising edge of accelerator or brake. Also disable on brake when speed > 0 if (CS.gasPressed and not self.CS_prev.gasPressed and self.disengage_on_accelerator) or \ (CS.brakePressed and (not self.CS_prev.brakePressed or not CS.standstill)): self.events.add(EventName.pedalPressed) if CS.gasPressed: self.events.add( EventName.pedalPressedPreEnable if self. disengage_on_accelerator else EventName.gasPressedOverride) if not self.CP.notCar: self.events.add_from_msg(self.sm['driverMonitoringState'].events) # Handle car events. Ignore when CAN is invalid if CS.canTimeout: self.events.add(EventName.canBusMissing) elif not CS.canValid: self.events.add(EventName.canError) else: self.events.add_from_msg(CS.events) # Create events for temperature, disk space, and memory if self.sm['deviceState'].thermalStatus >= ThermalStatus.red: self.events.add(EventName.overheat) if self.sm['deviceState'].freeSpacePercent < 7 and not SIMULATION: # under 7% of space free no enable allowed self.events.add(EventName.outOfSpace) # TODO: make tici threshold the same if self.sm['deviceState'].memoryUsagePercent > 90 and not SIMULATION: self.events.add(EventName.lowMemory) # TODO: enable this once loggerd CPU usage is more reasonable #cpus = list(self.sm['deviceState'].cpuUsagePercent) #if max(cpus, default=0) > 95 and not SIMULATION: # self.events.add(EventName.highCpuUsage) # Alert if fan isn't spinning for 5 seconds if self.sm['peripheralState'].pandaType == PandaType.dos: if self.sm['peripheralState'].fanSpeedRpm == 0 and self.sm[ 'deviceState'].fanSpeedPercentDesired > 50: if (self.sm.frame - self.last_functional_fan_frame) * DT_CTRL > 5.0: self.events.add(EventName.fanMalfunction) else: self.last_functional_fan_frame = self.sm.frame # Handle calibration status cal_status = self.sm['liveCalibration'].calStatus if cal_status != Calibration.CALIBRATED: if cal_status == Calibration.UNCALIBRATED: self.events.add(EventName.calibrationIncomplete) else: self.events.add(EventName.calibrationInvalid) # Handle lane change if self.sm[ 'lateralPlan'].laneChangeState == LaneChangeState.preLaneChange: direction = self.sm['lateralPlan'].laneChangeDirection if (CS.leftBlindspot and direction == LaneChangeDirection.left) or \ (CS.rightBlindspot and direction == LaneChangeDirection.right): self.events.add(EventName.laneChangeBlocked) else: if direction == LaneChangeDirection.left: self.events.add(EventName.preLaneChangeLeft) else: self.events.add(EventName.preLaneChangeRight) elif self.sm['lateralPlan'].laneChangeState in ( LaneChangeState.laneChangeStarting, LaneChangeState.laneChangeFinishing): self.events.add(EventName.laneChange) for i, pandaState in enumerate(self.sm['pandaStates']): # All pandas must match the list of safetyConfigs, and if outside this list, must be silent or noOutput if i < len(self.CP.safetyConfigs): safety_mismatch = pandaState.safetyModel != self.CP.safetyConfigs[i].safetyModel or \ pandaState.safetyParam != self.CP.safetyConfigs[i].safetyParam or \ pandaState.alternativeExperience != self.CP.alternativeExperience else: safety_mismatch = pandaState.safetyModel not in IGNORED_SAFETY_MODES if safety_mismatch or self.mismatch_counter >= 200: self.events.add(EventName.controlsMismatch) if log.PandaState.FaultType.relayMalfunction in pandaState.faults: self.events.add(EventName.relayMalfunction) # Handle HW and system malfunctions # Order is very intentional here. Be careful when modifying this. # All events here should at least have NO_ENTRY and SOFT_DISABLE. num_events = len(self.events) not_running = { p.name for p in self.sm['managerState'].processes if not p.running and p.shouldBeRunning } if self.sm.rcv_frame['managerState'] and (not_running - IGNORE_PROCESSES): self.events.add(EventName.processNotRunning) else: if not SIMULATION and not self.rk.lagging: if not self.sm.all_alive(self.camera_packets): self.events.add(EventName.cameraMalfunction) elif not self.sm.all_freq_ok(self.camera_packets): self.events.add(EventName.cameraFrameRate) if self.rk.lagging: self.events.add(EventName.controlsdLagging) if len(self.sm['radarState'].radarErrors): self.events.add(EventName.radarFault) if not self.sm.valid['pandaStates']: self.events.add(EventName.usbError) # generic catch-all. ideally, a more specific event should be added above instead no_system_errors = len(self.events) != num_events if (not self.sm.all_checks() or self.can_rcv_error ) and no_system_errors and CS.canValid and not CS.canTimeout: if not self.sm.all_alive(): self.events.add(EventName.commIssue) elif not self.sm.all_freq_ok(): self.events.add(EventName.commIssueAvgFreq) else: # invalid or can_rcv_error. self.events.add(EventName.commIssue) logs = { 'invalid': [s for s, valid in self.sm.valid.items() if not valid], 'not_alive': [s for s, alive in self.sm.alive.items() if not alive], 'not_freq_ok': [s for s, freq_ok in self.sm.freq_ok.items() if not freq_ok], 'can_error': self.can_rcv_error, } if logs != self.logged_comm_issue: cloudlog.event("commIssue", error=True, **logs) self.logged_comm_issue = logs else: self.logged_comm_issue = None if not self.sm['liveParameters'].valid: self.events.add(EventName.vehicleModelInvalid) if not self.sm['lateralPlan'].mpcSolutionValid: self.events.add(EventName.plannerError) if not self.sm['liveLocationKalman'].sensorsOK and not NOSENSOR: if self.sm.frame > 5 / DT_CTRL: # Give locationd some time to receive all the inputs self.events.add(EventName.sensorDataInvalid) if not self.sm['liveLocationKalman'].posenetOK: self.events.add(EventName.posenetInvalid) if not self.sm['liveLocationKalman'].deviceStable: self.events.add(EventName.deviceFalling) if not REPLAY: # Check for mismatch between openpilot and car's PCM cruise_mismatch = CS.cruiseState.enabled and ( not self.enabled or not self.CP.pcmCruise) self.cruise_mismatch_counter = self.cruise_mismatch_counter + 1 if cruise_mismatch else 0 if self.cruise_mismatch_counter > int(6. / DT_CTRL): self.events.add(EventName.cruiseMismatch) # Check for FCW stock_long_is_braking = self.enabled and not self.CP.openpilotLongitudinalControl and CS.aEgo < -1.25 model_fcw = self.sm[ 'modelV2'].meta.hardBrakePredicted and not CS.brakePressed and not stock_long_is_braking planner_fcw = self.sm['longitudinalPlan'].fcw and self.enabled if planner_fcw or model_fcw: self.events.add(EventName.fcw) for m in messaging.drain_sock(self.log_sock, wait_for_one=False): try: msg = m.androidLog.message if any(err in msg for err in ("ERROR_CRC", "ERROR_ECC", "ERROR_STREAM_UNDERFLOW", "APPLY FAILED")): csid = msg.split("CSID:")[-1].split(" ")[0] evt = CSID_MAP.get(csid, None) if evt is not None: self.events.add(evt) except UnicodeDecodeError: pass # TODO: fix simulator if not SIMULATION: if not NOSENSOR: if not self.sm['liveLocationKalman'].gpsOK and ( self.distance_traveled > 1000): # Not show in first 1 km to allow for driving out of garage. This event shows after 5 minutes self.events.add(EventName.noGps) if self.sm['modelV2'].frameDropPerc > 20: self.events.add(EventName.modeldLagging) if self.sm['liveLocationKalman'].excessiveResets: self.events.add(EventName.localizerMalfunction) # Only allow engagement with brake pressed when stopped behind another stopped car speeds = self.sm['longitudinalPlan'].speeds if len(speeds) > 1: v_future = speeds[-1] else: v_future = 100.0 if CS.brakePressed and v_future >= self.CP.vEgoStarting \ and self.CP.openpilotLongitudinalControl and CS.vEgo < 0.3: self.events.add(EventName.noTarget)
def main() -> None: params = Params() if params.get_bool("DisableUpdates"): cloudlog.warning("updates are disabled by the DisableUpdates param") exit(0) ov_lock_fd = open(LOCK_FILE, 'w') try: fcntl.flock(ov_lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) except OSError as e: raise RuntimeError( "couldn't get overlay lock; is another instance running?") from e # Set low io priority proc = psutil.Process() if psutil.LINUX: proc.ionice(psutil.IOPRIO_CLASS_BE, value=7) # Check if we just performed an update if Path(os.path.join(STAGING_ROOT, "old_openpilot")).is_dir(): cloudlog.event("update installed") if not params.get("InstallDate"): t = datetime.datetime.utcnow().isoformat() params.put("InstallDate", t.encode('utf8')) overlay_init = Path(os.path.join(BASEDIR, ".overlay_init")) overlay_init.unlink(missing_ok=True) update_failed_count = 0 # TODO: Load from param? wait_helper = WaitTimeHelper(proc) # Run the update loop while not wait_helper.shutdown: wait_helper.ready_event.clear() # Attempt an update exception = None new_version = False update_failed_count += 1 try: init_overlay() # TODO: still needed? skip this and just fetch? # Lightweight internt check internet_ok, update_available = check_for_update() if internet_ok and not update_available: update_failed_count = 0 # Fetch update if internet_ok: new_version = fetch_update(wait_helper) update_failed_count = 0 except subprocess.CalledProcessError as e: cloudlog.event("update process failed", cmd=e.cmd, output=e.output, returncode=e.returncode) exception = f"command failed: {e.cmd}\n{e.output}" overlay_init.unlink(missing_ok=True) except Exception as e: cloudlog.exception("uncaught updated exception, shouldn't happen") exception = str(e) overlay_init.unlink(missing_ok=True) if not wait_helper.shutdown: try: set_params(new_version, update_failed_count, exception) except Exception: cloudlog.exception( "uncaught updated exception while setting params, shouldn't happen" ) # infrequent attempts if we successfully updated recently wait_helper.sleep(5 * 60 if update_failed_count > 0 else 90 * 60) dismount_overlay()
def fingerprint(logcan, sendcan): fixed_fingerprint = os.environ.get('FINGERPRINT', "") skip_fw_query = os.environ.get('SKIP_FW_QUERY', False) ecu_responses = set() if not fixed_fingerprint and not skip_fw_query: # Vin query only reliably works thorugh OBDII bus = 1 cached_params = Params().get("CarParamsCache") if cached_params is not None: cached_params = car.CarParams.from_bytes(cached_params) if cached_params.carName == "mock": cached_params = None if cached_params is not None and len(cached_params.carFw) > 0 and cached_params.carVin is not VIN_UNKNOWN: cloudlog.warning("Using cached CarParams") vin = cached_params.carVin car_fw = list(cached_params.carFw) else: cloudlog.warning("Getting VIN & FW versions") _, vin = get_vin(logcan, sendcan, bus) ecu_responses = get_present_ecus(logcan, sendcan) car_fw = get_fw_versions(logcan, sendcan) exact_fw_match, fw_candidates = match_fw_to_car(car_fw) else: vin = VIN_UNKNOWN exact_fw_match, fw_candidates, car_fw = True, set(), [] if len(vin) != 17: cloudlog.event("Malformed VIN", vin=vin, error=True) vin = VIN_UNKNOWN cloudlog.warning("VIN %s", vin) Params().put("CarVin", vin) finger = gen_empty_fingerprint() candidate_cars = {i: all_legacy_fingerprint_cars() for i in [0, 1]} # attempt fingerprint on both bus 0 and 1 frame = 0 frame_fingerprint = 25 # 0.25s car_fingerprint = None done = False # drain CAN socket so we always get the latest messages messaging.drain_sock_raw(logcan) while not done: a = get_one_can(logcan) for can in a.can: # The fingerprint dict is generated for all buses, this way the car interface # can use it to detect a (valid) multipanda setup and initialize accordingly if can.src < 128: if can.src not in finger: finger[can.src] = {} finger[can.src][can.address] = len(can.dat) for b in candidate_cars: # Ignore extended messages and VIN query response. if can.src == b and can.address < 0x800 and can.address not in (0x7df, 0x7e0, 0x7e8): candidate_cars[b] = eliminate_incompatible_cars(can, candidate_cars[b]) # if we only have one car choice and the time since we got our first # message has elapsed, exit for b in candidate_cars: if len(candidate_cars[b]) == 1 and frame > frame_fingerprint: # fingerprint done car_fingerprint = candidate_cars[b][0] # bail if no cars left or we've been waiting for more than 2s failed = (all(len(cc) == 0 for cc in candidate_cars.values()) and frame > frame_fingerprint) or frame > 200 succeeded = car_fingerprint is not None done = failed or succeeded frame += 1 exact_match = True source = car.CarParams.FingerprintSource.can # If FW query returns exactly 1 candidate, use it if len(fw_candidates) == 1: car_fingerprint = list(fw_candidates)[0] source = car.CarParams.FingerprintSource.fw exact_match = exact_fw_match if fixed_fingerprint: car_fingerprint = fixed_fingerprint source = car.CarParams.FingerprintSource.fixed cloudlog.event("fingerprinted", car_fingerprint=car_fingerprint, source=source, fuzzy=not exact_match, fw_count=len(car_fw), ecu_responses=ecu_responses, error=True) return car_fingerprint, finger, vin, car_fw, source, exact_match
def upload_handler(end_event: threading.Event) -> None: sm = messaging.SubMaster(['deviceState']) tid = threading.get_ident() while not end_event.is_set(): cur_upload_items[tid] = None try: cur_upload_items[tid] = upload_queue.get(timeout=1)._replace( current=True) if cur_upload_items[tid].id in cancelled_uploads: cancelled_uploads.remove(cur_upload_items[tid].id) continue # Remove item if too old age = datetime.now() - datetime.fromtimestamp( cur_upload_items[tid].created_at / 1000) if age.total_seconds() > MAX_AGE: cloudlog.event("athena.upload_handler.expired", item=cur_upload_items[tid], error=True) continue # Check if uploading over metered connection is allowed sm.update(0) metered = sm['deviceState'].networkMetered network_type = sm['deviceState'].networkType.raw if metered and (not cur_upload_items[tid].allow_cellular): retry_upload(tid, end_event, False) continue try: def cb(sz, cur): # Abort transfer if connection changed to metered after starting upload sm.update(0) metered = sm['deviceState'].networkMetered if metered and (not cur_upload_items[tid].allow_cellular): raise AbortTransferException cur_upload_items[tid] = cur_upload_items[tid]._replace( progress=cur / sz if sz else 1) fn = cur_upload_items[tid].path try: sz = os.path.getsize(fn) except OSError: sz = -1 cloudlog.event("athena.upload_handler.upload_start", fn=fn, sz=sz, network_type=network_type, metered=metered, retry_count=cur_upload_items[tid].retry_count) response = _do_upload(cur_upload_items[tid], cb) if response.status_code not in (200, 201, 401, 403, 412): cloudlog.event("athena.upload_handler.retry", status_code=response.status_code, fn=fn, sz=sz, network_type=network_type, metered=metered) retry_upload(tid, end_event) else: cloudlog.event("athena.upload_handler.success", fn=fn, sz=sz, network_type=network_type, metered=metered) UploadQueueCache.cache(upload_queue) except (requests.exceptions.Timeout, requests.exceptions.ConnectionError, requests.exceptions.SSLError): cloudlog.event("athena.upload_handler.timeout", fn=fn, sz=sz, network_type=network_type, metered=metered) retry_upload(tid, end_event) except AbortTransferException: cloudlog.event("athena.upload_handler.abort", fn=fn, sz=sz, network_type=network_type, metered=metered) retry_upload(tid, end_event, False) except queue.Empty: pass except Exception: cloudlog.exception("athena.upload_handler.exception")
def main() -> NoReturn: first_run = True params = Params() while True: try: params.delete("PandaSignatures") # Flash all Pandas in DFU mode for p in PandaDFU.list(): cloudlog.info( f"Panda in DFU mode found, flashing recovery {p}") PandaDFU(p).recover() time.sleep(1) panda_serials = Panda.list() if len(panda_serials) == 0: if first_run: cloudlog.info("Resetting internal panda") HARDWARE.reset_internal_panda() time.sleep(2) # wait to come back up continue cloudlog.info( f"{len(panda_serials)} panda(s) found, connecting - {panda_serials}" ) # Flash pandas pandas: List[Panda] = [] for serial in panda_serials: pandas.append(flash_panda(serial)) # check health for lost heartbeat for panda in pandas: health = panda.health() if health["heartbeat_lost"]: params.put_bool("PandaHeartbeatLost", True) cloudlog.event("heartbeat lost", deviceState=health, serial=panda.get_usb_serial()) if first_run: cloudlog.info(f"Resetting panda {panda.get_usb_serial()}") panda.reset() # sort pandas to have deterministic order pandas.sort(key=cmp_to_key(panda_sort_cmp)) panda_serials = list(map(lambda p: p.get_usb_serial(), pandas)) # type: ignore # log panda fw versions params.put("PandaSignatures", b','.join(p.get_signature() for p in pandas)) # close all pandas for p in pandas: p.close() except (usb1.USBErrorNoDevice, usb1.USBErrorPipe): # a panda was disconnected while setting everything up. let's try again cloudlog.exception("Panda USB exception while setting up") continue first_run = False # run boardd with all connected serials as arguments os.environ['MANAGER_DAEMON'] = 'boardd' os.chdir(os.path.join(BASEDIR, "selfdrive/boardd")) subprocess.run(["./boardd", *panda_serials], check=True)
def thermald_thread(end_event, hw_queue): pm = messaging.PubMaster(['deviceState']) sm = messaging.SubMaster([ "peripheralState", "gpsLocationExternal", "controlsState", "pandaStates" ], poll=["pandaStates"]) count = 0 onroad_conditions: Dict[str, bool] = { "ignition": False, } startup_conditions: Dict[str, bool] = {} startup_conditions_prev: Dict[str, bool] = {} off_ts = None started_ts = None started_seen = False thermal_status = ThermalStatus.green last_hw_state = HardwareState( network_type=NetworkType.none, network_metered=False, network_strength=NetworkStrength.unknown, network_info=None, nvme_temps=[], modem_temps=[], ) current_filter = FirstOrderFilter(0., CURRENT_TAU, DT_TRML) temp_filter = FirstOrderFilter(0., TEMP_TAU, DT_TRML) should_start_prev = False in_car = False engaged_prev = False params = Params() power_monitor = PowerMonitoring() HARDWARE.initialize_hardware() thermal_config = HARDWARE.get_thermal_config() fan_controller = None while not end_event.is_set(): sm.update(PANDA_STATES_TIMEOUT) pandaStates = sm['pandaStates'] peripheralState = sm['peripheralState'] msg = read_thermal(thermal_config) if sm.updated['pandaStates'] and len(pandaStates) > 0: # Set ignition based on any panda connected onroad_conditions["ignition"] = any( ps.ignitionLine or ps.ignitionCan for ps in pandaStates if ps.pandaType != log.PandaState.PandaType.unknown) pandaState = pandaStates[0] in_car = pandaState.harnessStatus != log.PandaState.HarnessStatus.notConnected # Setup fan handler on first connect to panda if fan_controller is None and peripheralState.pandaType != log.PandaState.PandaType.unknown: if TICI: fan_controller = TiciFanController() elif (sec_since_boot() - sm.rcv_time['pandaStates']) > DISCONNECT_TIMEOUT: if onroad_conditions["ignition"]: onroad_conditions["ignition"] = False cloudlog.error("panda timed out onroad") try: last_hw_state = hw_queue.get_nowait() except queue.Empty: pass msg.deviceState.freeSpacePercent = get_available_percent(default=100.0) msg.deviceState.memoryUsagePercent = int( round(psutil.virtual_memory().percent)) msg.deviceState.cpuUsagePercent = [ int(round(n)) for n in psutil.cpu_percent(percpu=True) ] msg.deviceState.gpuUsagePercent = int( round(HARDWARE.get_gpu_usage_percent())) msg.deviceState.networkType = last_hw_state.network_type msg.deviceState.networkMetered = last_hw_state.network_metered msg.deviceState.networkStrength = last_hw_state.network_strength if last_hw_state.network_info is not None: msg.deviceState.networkInfo = last_hw_state.network_info msg.deviceState.nvmeTempC = last_hw_state.nvme_temps msg.deviceState.modemTempC = last_hw_state.modem_temps msg.deviceState.screenBrightnessPercent = HARDWARE.get_screen_brightness( ) msg.deviceState.usbOnline = HARDWARE.get_usb_present() current_filter.update(msg.deviceState.batteryCurrent / 1e6) max_comp_temp = temp_filter.update( max(max(msg.deviceState.cpuTempC), msg.deviceState.memoryTempC, max(msg.deviceState.gpuTempC))) if fan_controller is not None: msg.deviceState.fanSpeedPercentDesired = fan_controller.update( max_comp_temp, onroad_conditions["ignition"]) is_offroad_for_5_min = (started_ts is None) and ( (not started_seen) or (off_ts is None) or (sec_since_boot() - off_ts > 60 * 5)) if is_offroad_for_5_min and max_comp_temp > OFFROAD_DANGER_TEMP: # If device is offroad we want to cool down before going onroad # since going onroad increases load and can make temps go over 107 thermal_status = ThermalStatus.danger else: current_band = THERMAL_BANDS[thermal_status] band_idx = list(THERMAL_BANDS.keys()).index(thermal_status) if current_band.min_temp is not None and max_comp_temp < current_band.min_temp: thermal_status = list(THERMAL_BANDS.keys())[band_idx - 1] elif current_band.max_temp is not None and max_comp_temp > current_band.max_temp: thermal_status = list(THERMAL_BANDS.keys())[band_idx + 1] # **** starting logic **** # Ensure date/time are valid now = datetime.datetime.utcnow() startup_conditions["time_valid"] = (now.year > 2020) or ( now.year == 2020 and now.month >= 10) set_offroad_alert_if_changed("Offroad_InvalidTime", (not startup_conditions["time_valid"])) startup_conditions["up_to_date"] = params.get( "Offroad_ConnectivityNeeded") is None or params.get_bool( "DisableUpdates") or params.get_bool("SnoozeUpdate") startup_conditions["not_uninstalling"] = not params.get_bool( "DoUninstall") startup_conditions["accepted_terms"] = params.get( "HasAcceptedTerms") == terms_version # with 2% left, we killall, otherwise the phone will take a long time to boot startup_conditions["free_space"] = msg.deviceState.freeSpacePercent > 2 startup_conditions["completed_training"] = params.get("CompletedTrainingVersion") == training_version or \ params.get_bool("Passive") startup_conditions["not_driver_view"] = not params.get_bool( "IsDriverViewEnabled") startup_conditions["not_taking_snapshot"] = not params.get_bool( "IsTakingSnapshot") # if any CPU gets above 107 or the battery gets above 63, kill all processes # controls will warn with CPU above 95 or battery above 60 onroad_conditions[ "device_temp_good"] = thermal_status < ThermalStatus.danger set_offroad_alert_if_changed( "Offroad_TemperatureTooHigh", (not onroad_conditions["device_temp_good"])) # TODO: this should move to TICI.initialize_hardware, but we currently can't import params there if TICI: if not os.path.isfile("/persist/comma/living-in-the-moment"): if not Path("/data/media").is_mount(): set_offroad_alert_if_changed("Offroad_StorageMissing", True) else: # check for bad NVMe try: with open("/sys/block/nvme0n1/device/model") as f: model = f.read().strip() if not model.startswith( "Samsung SSD 980") and params.get( "Offroad_BadNvme") is None: set_offroad_alert_if_changed( "Offroad_BadNvme", True) cloudlog.event("Unsupported NVMe", model=model, error=True) except Exception: pass # Handle offroad/onroad transition should_start = all(onroad_conditions.values()) if started_ts is None: should_start = should_start and all(startup_conditions.values()) if should_start != should_start_prev or (count == 0): params.put_bool("IsOnroad", should_start) params.put_bool("IsOffroad", not should_start) params.put_bool("IsEngaged", False) engaged_prev = False HARDWARE.set_power_save(not should_start) if sm.updated['controlsState']: engaged = sm['controlsState'].enabled if engaged != engaged_prev: params.put_bool("IsEngaged", engaged) engaged_prev = engaged try: with open('/dev/kmsg', 'w') as kmsg: kmsg.write(f"<3>[thermald] engaged: {engaged}\n") except Exception: pass if should_start: off_ts = None if started_ts is None: started_ts = sec_since_boot() started_seen = True else: if onroad_conditions["ignition"] and (startup_conditions != startup_conditions_prev): cloudlog.event("Startup blocked", startup_conditions=startup_conditions, onroad_conditions=onroad_conditions) started_ts = None if off_ts is None: off_ts = sec_since_boot() # Offroad power monitoring power_monitor.calculate(peripheralState, onroad_conditions["ignition"]) msg.deviceState.offroadPowerUsageUwh = power_monitor.get_power_used() msg.deviceState.carBatteryCapacityUwh = max( 0, power_monitor.get_car_battery_capacity()) current_power_draw = HARDWARE.get_current_power_draw() statlog.sample("power_draw", current_power_draw) msg.deviceState.powerDrawW = current_power_draw som_power_draw = HARDWARE.get_som_power_draw() statlog.sample("som_power_draw", som_power_draw) msg.deviceState.somPowerDrawW = som_power_draw # Check if we need to disable charging (handled by boardd) msg.deviceState.chargingDisabled = power_monitor.should_disable_charging( onroad_conditions["ignition"], in_car, off_ts) # Check if we need to shut down if power_monitor.should_shutdown(peripheralState, onroad_conditions["ignition"], in_car, off_ts, started_seen): cloudlog.warning(f"shutting device down, offroad since {off_ts}") params.put_bool("DoShutdown", True) msg.deviceState.chargingError = current_filter.x > 0. and msg.deviceState.batteryPercent < 90 # if current is positive, then battery is being discharged msg.deviceState.started = started_ts is not None msg.deviceState.startedMonoTime = int(1e9 * (started_ts or 0)) last_ping = params.get("LastAthenaPingTime") if last_ping is not None: msg.deviceState.lastAthenaPingTime = int(last_ping) msg.deviceState.thermalStatus = thermal_status pm.send("deviceState", msg) should_start_prev = should_start startup_conditions_prev = startup_conditions.copy() # Log to statsd statlog.gauge("free_space_percent", msg.deviceState.freeSpacePercent) statlog.gauge("gpu_usage_percent", msg.deviceState.gpuUsagePercent) statlog.gauge("memory_usage_percent", msg.deviceState.memoryUsagePercent) for i, usage in enumerate(msg.deviceState.cpuUsagePercent): statlog.gauge(f"cpu{i}_usage_percent", usage) for i, temp in enumerate(msg.deviceState.cpuTempC): statlog.gauge(f"cpu{i}_temperature", temp) for i, temp in enumerate(msg.deviceState.gpuTempC): statlog.gauge(f"gpu{i}_temperature", temp) statlog.gauge("memory_temperature", msg.deviceState.memoryTempC) statlog.gauge("ambient_temperature", msg.deviceState.ambientTempC) for i, temp in enumerate(msg.deviceState.pmicTempC): statlog.gauge(f"pmic{i}_temperature", temp) for i, temp in enumerate(last_hw_state.nvme_temps): statlog.gauge(f"nvme_temperature{i}", temp) for i, temp in enumerate(last_hw_state.modem_temps): statlog.gauge(f"modem_temperature{i}", temp) statlog.gauge("fan_speed_percent_desired", msg.deviceState.fanSpeedPercentDesired) statlog.gauge("screen_brightness_percent", msg.deviceState.screenBrightnessPercent) # report to server once every 10 minutes if (count % int(600. / DT_TRML)) == 0: cloudlog.event( "STATUS_PACKET", count=count, pandaStates=[ strip_deprecated_keys(p.to_dict()) for p in pandaStates ], peripheralState=strip_deprecated_keys( peripheralState.to_dict()), location=(strip_deprecated_keys( sm["gpsLocationExternal"].to_dict()) if sm.alive["gpsLocationExternal"] else None), deviceState=strip_deprecated_keys(msg.to_dict())) count += 1
def hw_state_thread(end_event, hw_queue): """Handles non critical hardware state, and sends over queue""" count = 0 registered_count = 0 prev_hw_state = None modem_version = None modem_nv = None modem_configured = False while not end_event.is_set(): # these are expensive calls. update every 10s if (count % int(10. / DT_TRML)) == 0: try: network_type = HARDWARE.get_network_type() modem_temps = HARDWARE.get_modem_temperatures() if len(modem_temps) == 0 and prev_hw_state is not None: modem_temps = prev_hw_state.modem_temps # Log modem version once if AGNOS and ((modem_version is None) or (modem_nv is None)): modem_version = HARDWARE.get_modem_version() # pylint: disable=assignment-from-none modem_nv = HARDWARE.get_modem_nv() # pylint: disable=assignment-from-none if (modem_version is not None) and (modem_nv is not None): cloudlog.event("modem version", version=modem_version, nv=modem_nv) hw_state = HardwareState( network_type=network_type, network_metered=HARDWARE.get_network_metered(network_type), network_strength=HARDWARE.get_network_strength( network_type), network_info=HARDWARE.get_network_info(), nvme_temps=HARDWARE.get_nvme_temperatures(), modem_temps=modem_temps, ) try: hw_queue.put_nowait(hw_state) except queue.Full: pass if AGNOS and (hw_state.network_info is not None) and (hw_state.network_info.get( 'state', None) == "REGISTERED"): registered_count += 1 else: registered_count = 0 if registered_count > 10: cloudlog.warning( f"Modem stuck in registered state {hw_state.network_info}. nmcli conn up lte" ) os.system("nmcli conn up lte") registered_count = 0 # TODO: remove this once the config is in AGNOS if not modem_configured and len(HARDWARE.get_sim_info().get( 'sim_id', '')) > 0: cloudlog.warning("configuring modem") HARDWARE.configure_modem() modem_configured = True prev_hw_state = hw_state except Exception: cloudlog.exception("Error getting hardware state") count += 1 time.sleep(DT_TRML)