class GCERouteState(ResourceState):
    """State of a GCE Route.

    Wraps a Google Compute Engine route resource: creation, destruction
    and drift detection against the locally stored deployment state.
    """

    route_name = attr_property("gce.route.name", None)
    description = attr_property("gce.route.description", None)
    network = attr_property("gce.route.network", None)
    priority = attr_property("gce.route.priority", None, int)
    nextHop = attr_property("gce.route.nextHop", None)
    destination = attr_property("gce.route.destination", None)
    tags = attr_property("gce.route.tags", None, "json")

    # Definition attributes copied into the state on deploy; the order
    # matters, since they are passed positionally to ex_create_route().
    defn_properties = [
        "route_name",
        "destination",
        "priority",
        "network",
        "tags",
        "nextHop",
        "description",
    ]

    nix_name = "gceRoutes"

    @classmethod
    def get_type(cls):
        return "gce-route"

    def __init__(self, depl, name, id):
        ResourceState.__init__(self, depl, name, id)

    def show_type(self):
        s = super(GCERouteState, self).show_type()
        if self.state == self.UP:
            s = "{0} [{1}]".format(s, self.name)
        return s

    def _destroy_route(self):
        # Best-effort destroy: a route that is already gone only warns.
        try:
            route = self.connect().ex_get_route(self.route_name)
            route.destroy()
        except libcloud.common.google.ResourceNotFoundError:
            self.warn("tried to destroy {0} which didn't exist".format(
                self.full_name))

    def create_after(self, resources, defn):
        # Machines must be deployed first: destination/nextHop may
        # reference a machine resource.
        return {r for r in resources if isinstance(r, backends.MachineState)}

    @property
    def full_name(self):
        return "GCE route '{0}'".format(self.name)

    def _route_is_missing(self):
        """Return True when the route does not exist in GCE."""
        try:
            self.connect().ex_get_route(self.route_name)
            return False
        except libcloud.common.google.ResourceNotFoundError:
            return True

    def _real_state_differ(self):
        """ Check If any of the route's properties has a different
        value than that in the state"""
        route = self.connect().ex_get_route(self.route_name)

        # libcloud only exposes these properties in the GCERoute class.
        # "description" and "nextHop" can't be checked.
        route_properties = {
            "name": "route_name",
            "dest_range": "destination",
            "tags": "tags",
            "priority": "priority",
        }

        # This shouldn't happen, unless you delete the route manually
        # and create another one with the same name, but different
        # properties.
        real_state_differ = any(
            getattr(route, route_attr) != getattr(self, self_attr)
            for route_attr, self_attr in route_properties.items()
        )

        # We need to check the network separately, since the GCE API
        # prefixes it with the project and the region.
        network_differ = route.network.split("/")[-1] != self.network

        return real_state_differ or network_differ

    def _get_machine_property(self, machine_name, property):
        """Get a property from the machine """
        machine = self.depl.get_machine(machine_name)
        return getattr(machine, property)

    def _check(self):
        if self._route_is_missing():
            self.state = self.MISSING
            return False
        if self._real_state_differ():
            if self.depl.logger.confirm(
                    "Route properties are different from those in the state, "
                    "destroy route {0}?".format(self.route_name)):
                self._destroy_route()
                self.state = self.MISSING
        return True

    def create(self, defn, check, allow_reboot, allow_recreate):
        self.copy_credentials(defn)
        if check:
            if self._route_is_missing():
                self.state = self.MISSING
            elif self._real_state_differ():
                if allow_recreate:
                    self._destroy_route()
                    self.state = self.MISSING
                else:
                    self.warn(
                        "Route properties are different from those in the state,"
                        " use --allow-recreate to delete the route and deploy it again."
                    )

        if defn.destination.startswith("res-"):
            # if a machine resource was used for the destination, get
            # the public IP of the instance into the definition of the
            # route
            machine_name = defn.destination[4:]
            defn.destination = "{ip}/32".format(
                ip=self._get_machine_property(machine_name, "public_ipv4"))

        if self.is_deployed() and self.properties_changed(defn):
            if allow_recreate:
                self.log("deleting route {0}...".format(self.route_name))
                self._destroy_route()
                self.state = self.MISSING
            else:
                raise Exception(
                    "GCE routes are immutable, you need to use --allow-recreate."
                )

        if self.state != self.UP:
            with self.depl._db:
                self.log("creating {0}...".format(self.full_name))
                self.copy_properties(defn)
                if defn.nextHop and defn.nextHop.startswith("res-"):
                    try:
                        nextHop_name = self._get_machine_property(
                            defn.nextHop[4:], "machine_name")
                        defn.nextHop = self.connect().ex_get_node(nextHop_name)
                    except AttributeError:
                        # the referenced resource has no machine_name,
                        # i.e. it is not a GCE machine state.
                        # BUG FIX: removed a dead, unreachable bare
                        # `raise` that followed this statement.
                        raise Exception("nextHop can only be a GCE machine.")
                    except libcloud.common.google.ResourceNotFoundError:
                        raise Exception(
                            "The machine {0} isn't deployed, it need to be before it's added as nextHop"
                            .format(nextHop_name))
                args = [getattr(defn, attr) for attr in self.defn_properties]
                try:
                    self.connect().ex_create_route(*args)
                except libcloud.common.google.ResourceExistsError:
                    raise Exception(
                        "tried creating a route that already exists.")
                self.state = self.UP

    def destroy(self, wipe=False):
        if self.state == self.UP:
            if not self.depl.logger.confirm(
                    "are you sure you want to destroy {0}?".format(
                        self.full_name)):
                return False
            self.log("destroying {0}...".format(self.full_name))
            self._destroy_route()
        return True
class NoneState(MachineState[NoneDefinition]):
    """State of a trivial machine."""

    @classmethod
    def get_type(cls):
        return "none"

    target_host = nixops.util.attr_property("targetHost", None)
    public_ipv4 = nixops.util.attr_property("publicIpv4", None)
    _ssh_private_key: Optional[str] = attr_property("none.sshPrivateKey", None)
    _ssh_public_key: Optional[str] = attr_property("none.sshPublicKey", None)
    _ssh_public_key_deployed = attr_property("none.sshPublicKeyDeployed", False, bool)

    def __init__(self, depl, name, id):
        MachineState.__init__(self, depl, name, id)

    @property
    def resource_id(self):
        return self.vm_id

    def get_physical_spec(self):
        # Once a client key exists, inject it as an authorized root key.
        if not self._ssh_public_key:
            return {}
        authorized_keys_path = (
            "config",
            "users",
            "extraUsers",
            "root",
            "openssh",
            "authorizedKeys",
            "keys",
        )
        return {authorized_keys_path: [self._ssh_public_key]}

    def create(
        self,
        defn: NoneDefinition,
        check: bool,
        allow_reboot: bool,
        allow_recreate: bool,
    ):
        assert isinstance(defn, NoneDefinition)
        self.set_common_state(defn)
        self.target_host = defn._target_host
        self.public_ipv4 = defn._public_ipv4

        # Already registered; nothing more to do.
        if self.vm_id:
            return

        if self.provision_ssh_key:
            self.log_start("generating new SSH keypair... ")
            label = "NixOps client key for {0}".format(self.name)
            keypair = create_key_pair(key_name=label)
            self._ssh_private_key, self._ssh_public_key = keypair
            self.log_end("done")
        self.vm_id = "nixops-{0}-{1}".format(self.depl.uuid, self.name)

    def switch_to_configuration(self, method, sync, command=None):
        outcome = super(NoneState, self).switch_to_configuration(method, sync, command)
        # Only mark the key as deployed once activation succeeded.
        if outcome == 0:
            self._ssh_public_key_deployed = True
        return outcome

    def get_ssh_name(self):
        assert self.target_host
        return self.target_host

    def get_ssh_private_key_file(self) -> Optional[str]:
        # Prefer an explicitly configured key file over the generated key.
        if self._ssh_private_key_file:
            return self._ssh_private_key_file
        if self._ssh_private_key:
            return self.write_ssh_private_key(self._ssh_private_key)
        return None

    def get_ssh_flags(self, *args, **kwargs):
        base_flags = super(NoneState, self).get_ssh_flags(*args, **kwargs)
        # Use the generated key only once it is known to be deployed.
        if not (self.vm_id and self.cur_toplevel and self._ssh_public_key_deployed):
            return base_flags
        extra = [
            "-o",
            "StrictHostKeyChecking=accept-new",
        ]
        identity_file = self.get_ssh_private_key_file()
        if identity_file:
            extra = extra + ["-i", identity_file]
        return base_flags + extra

    def _check(self, res):
        if not self.vm_id:
            res.exists = False
            return
        res.exists = True
        res.is_up = nixops.util.ping_tcp_port(self.target_host, self.ssh_port)
        if res.is_up:
            super()._check(res)

    def destroy(self, wipe=False):
        # No-op; just forget about the machine.
        return True
class AzureNetworkSecurityGroupState(ResourceState):
    """State of an Azure Network Security Group.

    Tracks the NSG resource plus a per-rule dictionary stored in
    ``security_rules`` and keeps both in sync with the real resource.
    """

    nsg_name = attr_property("azure.name", None)
    resource_group = attr_property("azure.resourceGroup", None)
    location = attr_property("azure.location", None)
    tags = attr_property("azure.tags", {}, 'json')
    security_rules = attr_property("azure.securityRules", {}, 'json')

    @classmethod
    def get_type(cls):
        return "azure-network-security-group"

    def show_type(self):
        s = super(AzureNetworkSecurityGroupState, self).show_type()
        if self.state == self.UP:
            s = "{0} [{1}]".format(s, self.location)
        return s

    @property
    def resource_id(self):
        return self.nsg_name

    @property
    def full_name(self):
        return "Azure network security group '{0}'".format(self.resource_id)

    def get_resource(self):
        # Returns None when the NSG does not exist.
        try:
            return self.nrpc().network_security_groups.get(
                self.resource_group, self.resource_id).network_security_group
        except azure.common.AzureMissingResourceHttpError:
            return None

    def destroy_resource(self):
        self.nrpc().network_security_groups.delete(self.resource_group,
                                                   self.resource_id)

    defn_properties = ['location', 'tags', 'security_rules']

    def _create_or_update(self, defn):
        """Issue a create_or_update request built from the definition and
        record the result in the local state."""
        self.nrpc().network_security_groups.create_or_update(
            defn.resource_group, defn.nsg_name,
            NetworkSecurityGroup(
                location=defn.location,
                security_rules=[
                    SecurityRule(
                        name=_name,
                        description=_r['description'],
                        protocol=_r['protocol'],
                        source_port_range=_r['source_port_range'],
                        destination_port_range=_r['destination_port_range'],
                        source_address_prefix=_r['source_address_prefix'],
                        destination_address_prefix=_r[
                            'destination_address_prefix'],
                        access=_r['access'],
                        priority=_r['priority'],
                        direction=_r['direction'],
                    )
                    # BUG FIX: dict.iteritems() is Python 2 only; this
                    # file already uses Python-3-only syntax elsewhere.
                    for _name, _r in defn.security_rules.items()
                ],
                tags=defn.tags))
        self.state = self.UP
        self.copy_properties(defn)

    def handle_changed_security_rules(self, rules):
        """Reconcile the locally stored rule dictionary with ``rules``,
        the security rules actually present on the Azure resource."""

        def update_rules(k, v):
            # attr_property values must be reassigned wholesale so the
            # change is persisted; v=None removes the entry.
            x = self.security_rules
            if v is None:
                x.pop(k, None)
            else:
                x[k] = v
            self.security_rules = x

        # Mark rules present on Azure but unknown to us.
        for _rule in rules:
            if _rule.name not in self.security_rules:
                self.warn("found unexpected security rule {0}".format(
                    _rule.name))
                update_rules(_rule.name, {"dummy": True})

        # Snapshot the items: update_rules() mutates the underlying dict
        # while we iterate.
        for _name, _s_rule in list(self.security_rules.items()):
            if _s_rule.get("dummy", False):
                continue
            rule_res_name = "security rule {0}".format(_name)
            rule = next((_r for _r in rules if _r.name == _name), None)
            if rule is None:
                self.warn("{0} has been deleted behind our back".format(
                    rule_res_name))
                update_rules(_name, None)
                continue
            # Compare every tracked rule property against the live value.
            for prop in ('description', 'protocol', 'source_port_range',
                         'destination_port_range', 'source_address_prefix',
                         'destination_address_prefix', 'access', 'priority',
                         'direction'):
                self.handle_changed_dict(_s_rule, prop, getattr(rule, prop),
                                         resource_name=rule_res_name)
            update_rules(_name, _s_rule)

    def create(self, defn, check, allow_reboot, allow_recreate):
        self.no_subscription_id_change(defn)
        self.no_location_change(defn)
        self.no_property_change(defn, 'resource_group')

        self.copy_mgmt_credentials(defn)
        self.nsg_name = defn.nsg_name
        self.resource_group = defn.resource_group

        if check:
            nsg = self.get_settled_resource()
            if not nsg:
                self.warn_missing_resource()
            elif self.state == self.UP:
                self.warn_if_failed(nsg)
                self.handle_changed_property('location',
                                             normalize_location(nsg.location),
                                             can_fix=False)
                self.handle_changed_property('tags', nsg.tags)
                self.handle_changed_security_rules(nsg.security_rules)
            else:
                self.warn_not_supposed_to_exist()
                self.confirm_destroy()

        if self.state != self.UP:
            if self.get_settled_resource():
                raise Exception(
                    "tried creating a network security group that already exists; "
                    "please run 'deploy --check' to fix this")
            self.log("creating {0} in {1}...".format(self.full_name,
                                                     defn.location))
            self._create_or_update(defn)

        if self.properties_changed(defn):
            self.log("updating properties of {0}...".format(self.full_name))
            self.get_settled_resource_assert_exists()
            self._create_or_update(defn)

    def create_after(self, resources, defn):
        # Local import avoids a module-level import cycle.
        from nixops.resources.azure_resource_group import AzureResourceGroupState
        return {r for r in resources
                if isinstance(r, AzureResourceGroupState)}
class GCEStaticIPState(ResourceState):
    """State of a GCE Static IP"""

    region = attr_property("gce.region", None)
    addr_name = attr_property("gce.name", None)
    ip_address = attr_property("gce.ipAddress", None)

    @classmethod
    def get_type(cls):
        return "gce-static-ip"

    def __init__(self, depl, name, id):
        ResourceState.__init__(self, depl, name, id)

    def show_type(self):
        # Append the region once the address has been reserved.
        base = super(GCEStaticIPState, self).show_type()
        if self.state != self.UP:
            return base
        return "{0} [{1}]".format(base, self.region)

    @property
    def resource_id(self):
        return self.addr_name

    nix_name = "gceStaticIPs"

    @property
    def full_name(self):
        return "GCE static IP address '{0}'".format(self.addr_name)

    def address(self):
        # Look up the live address object for this reservation.
        return self.connect().ex_get_address(self.addr_name, region=self.region)

    @property
    def public_ipv4(self):
        return self.ip_address

    def prefix_definition(self, attr):
        return {("resources", "gceStaticIPs"): attr}

    def get_physical_spec(self):
        return {"publicIPv4": self.public_ipv4}

    def create(self, defn, check, allow_reboot, allow_recreate):
        # A reserved address can never be changed in place.
        self.no_change(
            defn.ip_address and self.ip_address != defn.ip_address, "address"
        )
        self.no_project_change(defn)
        self.no_region_change(defn)

        self.copy_credentials(defn)
        self.addr_name = defn.addr_name

        if check:
            try:
                live = self.address()
                if self.state == self.UP:
                    self.handle_changed_property(
                        "ip_address", live.address, property_name=""
                    )
                    self.handle_changed_property(
                        "region", live.region.name, can_fix=False
                    )
                else:
                    # A reservation exists that we did not create.
                    self.warn_not_supposed_to_exist(valuable_resource=True)
                    self.confirm_destroy(live, self.full_name)
            except libcloud.common.google.ResourceNotFoundError:
                self.warn_missing_resource()

        if self.state == self.UP:
            return

        self.log("reserving {0} in {1}...".format(self.full_name, defn.region))
        try:
            reservation = self.connect().ex_create_address(
                defn.addr_name, region=defn.region, address=defn.ip_address
            )
        except libcloud.common.google.ResourceExistsError:
            raise Exception(
                "tried requesting a static IP that already exists; "
                "please run 'deploy --check' to fix this"
            )

        self.log("reserved IP address: {0}".format(reservation.address))
        self.state = self.UP
        self.region = defn.region
        self.ip_address = reservation.address

    def destroy(self, wipe=False):
        if self.state == self.UP:
            try:
                live = self.address()
                return self.confirm_destroy(
                    live,
                    "{0} ({1})".format(self.full_name, self.ip_address),
                    abort=False,
                )
            except libcloud.common.google.ResourceNotFoundError:
                self.warn(
                    "tried to destroy {0} which didn't exist".format(self.full_name)
                )
        return True
class GCETargetPoolState(ResourceState):
    """State of a GCE Target Pool.

    Keeps the pool's region, health check and member machine list in
    sync with the deployed resource.
    """

    targetpool_name = attr_property("gce.name", None)
    region = attr_property("gce.region", None)
    health_check = attr_property("gce.healthcheck", None)
    machines = attr_property("gce.machines", [], 'json')

    @classmethod
    def get_type(cls):
        return "gce-target-pool"

    def __init__(self, depl, name, id):
        ResourceState.__init__(self, depl, name, id)

    def show_type(self):
        s = super(GCETargetPoolState, self).show_type()
        if self.state == self.UP:
            s = "{0} [{1}]".format(s, self.region)
        return s

    @property
    def resource_id(self):
        return self.targetpool_name

    nix_name = "gceTargetPools"

    @property
    def full_name(self):
        return "GCE target pool '{0}'".format(self.targetpool_name)

    def targetpool(self):
        """Fetch the live target pool object from GCE."""
        return self.connect().ex_get_targetpool(self.targetpool_name)

    defn_properties = ['region', 'health_check']

    def create(self, defn, check, allow_reboot, allow_recreate):
        self.no_project_change(defn)
        self.no_region_change(defn)

        self.copy_credentials(defn)
        self.targetpool_name = defn.targetpool_name

        if check:
            try:
                tp = self.targetpool()
                if self.state == self.UP:
                    self.handle_changed_property('region', tp.region.name,
                                                 can_fix=False)
                    # We only ever attach a single health check, so
                    # compare against the first one on the pool.
                    normalized_hc = (tp.healthchecks[0].name
                                     if tp.healthchecks else None)
                    self.handle_changed_property('health_check', normalized_hc)

                    # Node entries may be full objects or bare URIs.
                    normalized_machines = {
                        n.extra['selfLink'] if hasattr(n, 'extra') else n
                        for n in tp.nodes
                    }
                    machines_state = set(self.machines)
                    if machines_state != normalized_machines:
                        if normalized_machines - machines_state:
                            self.warn("{0} contains unexpected machines: {1}".
                                      format(self.full_name,
                                             list(normalized_machines - machines_state)))
                        if machines_state - normalized_machines:
                            self.warn("{0} is missing machines: {1}".
                                      format(self.full_name,
                                             list(machines_state - normalized_machines)))
                        self.machines = list(normalized_machines)
                else:
                    self.warn_not_supposed_to_exist()
                    self.confirm_destroy(tp, self.full_name)
            except libcloud.common.google.ResourceNotFoundError:
                self.warn_missing_resource()

        if self.state != self.UP:
            self.log("creating {0}...".format(self.full_name))
            try:
                tp = self.connect().ex_create_targetpool(
                    defn.targetpool_name, region=defn.region,
                    healthchecks=([defn.health_check]
                                  if defn.health_check else None))
            except libcloud.common.google.ResourceExistsError:
                raise Exception("tried creating a target pool that already exists; "
                                "please run 'deploy --check' to fix this")
            self.state = self.UP
            self.copy_properties(defn)
            self.machines = []

        # update the target pool resource if its definition and state
        # are out of sync
        machines_state = set(self.machines)
        machines_defn = set(defn.machines)
        if self.health_check != defn.health_check or machines_state != machines_defn:
            try:
                tp = self.targetpool()
            except libcloud.common.google.ResourceNotFoundError:
                raise Exception("{0} has been deleted behind our back; "
                                "please run 'deploy --check' to fix this"
                                .format(self.full_name))

            if self.health_check != defn.health_check:
                # BUG FIX: log message typo "ppdating" -> "updating".
                self.log("updating the health check of {0}...".format(self.full_name))
                if self.health_check:
                    tp.remove_healthcheck(self.health_check)
                    self.health_check = None
                if defn.health_check:
                    tp.add_healthcheck(defn.health_check)
                    self.health_check = defn.health_check

            if machines_state != machines_defn:
                self.log("updating the machine list of {0}...".format(self.full_name))
                # Update the state incrementally so a partial failure
                # leaves an accurate machine list behind.
                for uri in (machines_state - machines_defn):
                    tp.remove_node(uri)
                    machines_state.remove(uri)
                for uri in (machines_defn - machines_state):
                    tp.add_node(uri)
                    machines_state.add(uri)
                self.machines = list(machines_state)

    def destroy(self, wipe=False):
        if self.state == self.UP:
            try:
                targetpool = self.targetpool()
                return self.confirm_destroy(targetpool, self.full_name,
                                            abort=False)
            except libcloud.common.google.ResourceNotFoundError:
                self.warn("tried to destroy {0} which didn't exist"
                          .format(self.full_name))
        return True

    def create_after(self, resources, defn):
        # The health check must exist before it can be attached.
        return {r for r in resources
                if isinstance(r, GCEHTTPHealthCheckState)}
class HetznerCloudState(MachineState[HetznerCloudDefinition]): """ State of a Hetzner Cloud machine. """ @classmethod def get_type(cls): return "hetznercloud" state = attr_property("state", MachineState.MISSING, int) # override vm_id = attr_property("vmId", None, int) # override type api_token = attr_property("hetznerCloud.apiToken", None) public_ipv4 = attr_property("publicIpv4", None) public_ipv6 = attr_property("publicIpv6", None) private_ipv4 = attr_property("privateIpv4", None) public_client_key = attr_property("hetznerCloud.publicClientKey", None) private_client_key = attr_property("hetznerCloud.privateClientKey", None) public_host_key = attr_property("hetznerCloud.publicHostKey", None) private_host_key = attr_property("hetznerCloud.privateHostKey", None) legacy_if_scheme = attr_property("legacyIfScheme", None, bool) labels = attr_property("hetznerCloud.labels", {}, "json") location = attr_property("hetznerCloud.location", None) server_name = attr_property("hetznerCloud.serverName", None) server_type = attr_property("hetznerCloud.serverType", None) server_networks = attr_property("hetznerCloud.serverNetworks", {}, "json") volumes = attr_property("hetznerCloud.volumes", {}, "json") ip_addresses = attr_property("hetznerCloud.ipAddresses", {}, "json") def __init__(self, depl: Deployment, name: str, id): MachineState.__init__(self, depl, name, id) self._client = None def cleanup_state(self) -> None: """ Discard all state pertaining to an instance. 
""" with self.depl._db: self.vm_id = None self.public_ipv4 = None self.public_ipv6 = None self.private_client_key = None self.public_client_key = None self.private_host_key = None self.public_host_key = None self.legacy_if_scheme = None self.location = None self.server_name = None self.server_type = None self.server_networks = {} self.labels = {} self.volumes = {} self.ip_addresses = {} def show_type(self): s = f"{super(HetznerCloudState, self).show_type()}" if self.location: s += f" [{self.location}; {self.server_type}]" return s @property def full_name(self) -> str: return f"Hetzner Cloud Server ‘{self.name}’" def get_instance(self) -> BoundServer: try: return self.get_client().servers.get_by_id(self.vm_id) except APIException as e: if e.code == "not_found": self.logger.warn( f"{self.full_name} was deleted from outside of nixops") return None else: raise def get_client(self) -> Client: """ Generic method to get or create a Hetzner Cloud client. """ if self._client: return self._client new_api_token = self.api_token or os.environ.get("HCLOUD_API_TOKEN") if new_api_token is not None: self.api_token = new_api_token if self.api_token is None: raise Exception("please set ‘apiToken’ or $HCLOUD_API_TOKEN") self._client = Client(token=self.api_token) return self._client def get_common_labels(self) -> Dict[str, str]: labels = { "CharonNetworkUUID": self.depl.uuid, "CharonInstanceName": self.name, "CharonStateFileHost": socket.gethostname(), "CharonStateFileUser": getpass.getuser(), } pattern = "^$|(?i)((?=^[a-z0-9])[a-z0-9._-]{0,63}[a-z0-9]$)" file_name = os.path.basename(self.depl._db.db_file) if re.match(pattern, file_name): labels["CharonStateFileName"] = file_name if self.depl.name: labels["CharonNetworkName"] = self.depl.name return labels def get_ssh_name(self) -> str: if not self.public_ipv4: raise Exception( f"{self.full_name} does not have a public IP address (yet)") return self.public_ipv4 def get_ssh_private_key_file(self) -> str: return 
self._ssh_private_key_file or self.write_ssh_private_key( self.private_client_key) def get_ssh_flags(self, *args, **kwargs) -> List[str]: super_flags = super(HetznerCloudState, self).get_ssh_flags(*args, **kwargs) return super_flags + ["-i", self.get_ssh_private_key_file()] def get_udev_name(self, volume_id: str) -> str: return f"/dev/disk/by-id/scsi-0HC_Volume_{volume_id}" def get_physical_spec(self) -> Dict[Any, Any]: ipv4 = [{"address": self.public_ipv4, "prefixLength": 32}] ipv6 = [{"address": self.public_ipv6[:-3], "prefixLength": 64}] for addr in self.ip_addresses.values(): try: socket.inet_pton(socket.AF_INET, addr) ipv4.append({"address": addr, "prefixLength": 32}) except socket.error: # not a valid address ipv4 ipv6.append({"address": addr, "prefixLength": 64}) def get_interface_name(i: int) -> str: return f"ens{10+i}" if self.legacy_if_scheme else f"enp{7+i}s0" spec = { "imports": [RawValue("<nixpkgs/nixos/modules/profiles/qemu-guest.nix>")], ("boot", "loader", "grub", "device"): "nodev", ("fileSystems", "/"): { "device": "/dev/sda1", "fsType": "ext4" }, **{("fileSystems", v["mountPoint"]): { "fsType": v["fsType"], "device": v["device"], } for k, v in self.volumes.items() if v["mountPoint"]}, # Hetzner Cloud networking defaults ("networking", "defaultGateway"): "172.31.1.1", ("networking", "nameservers"): [ "213.133.98.98", "213.133.99.99", "213.133.100.100", ], ( "networking", "interfaces", "ens3" if self.legacy_if_scheme else "enp1s0", ): { ("ipv4", "addresses"): ipv4, ("ipv6", "addresses"): ipv6, "useDHCP": True, }, ("users", "extraUsers", "root", "openssh", "authorizedKeys", "keys"): [self.public_client_key], } for i, v in enumerate(self.server_networks.values()): private_ipv4_addresses = [{ "address": addr, "prefixLength": 32 } for addr in [v["privateIpAddress"]] + v["aliasIpAddresses"]] spec[("networking", "interfaces", get_interface_name(i))] = { ("ipv4", "addresses"): private_ipv4_addresses, "useDHCP": True, } for v in self.volumes.values(): if 
v["fsType"] == "xfs": spec[("boot", "kernelModules")] = ["xfs"] break return spec def _update_attr(self, attr: str, k: str, v: Optional[Dict[str, Any]]) -> None: x = getattr(self, attr) if v is None: x.pop(k, None) else: x[k] = v setattr(self, attr, x) def _handle_changed_server_networks(self, defn: HetznerCloudDefinition, allow_recreate: bool) -> None: """ Detects and corrects any virtual network state desynchronisation. """ attached: Set[str] = { x.network.id for x in self.get_instance().private_net } # Detach server from networks for name in self.server_networks.keys(): nw: Optional[BoundNetwork] = self.get_client( ).networks.get_by_name(name) # Detect destroyed networks if nw is None: if name not in defn.server_networks: # we dont need it self.logger.warn( f"forgetting about network ‘{name}’ that no longer exists" " and is no longer needed by the deployment specification" ) self._update_attr("server_networks", name, None) else: # we do need it raise Exception( f"network ‘{name}’ (used by {self.full_name}) no longer exists;" " run ‘nixops deploy --check’ to update resource state" ) # Detect network detachment elif nw.id not in attached: self.logger.warn( f"instance was manually detached from network ‘{name}’ [{nw.id}]" ) if name in defn.server_networks: self._update_attr("server_networks", name, None) # Detach from existing networks if required. 
elif name not in defn.server_networks: self.logger.log(f"detaching from network ‘{name}’ [{nw.id}]") self.get_client().servers.detach_from_network( server=Server(self.vm_id), network=nw).wait_until_finished() self._update_attr("server_networks", name, None) # Attach server to networks for name, x in defn.server_networks.items(): if name not in self.server_networks: nw = self.get_client().networks.get_by_name(name) if nw is None: raise Exception( f"tried to attach instance to network ‘{name}’" " but it doesn't exist...") # NixOps will update machines in parallel, so retry # network attachment to deal with resource conflict. def attach_to_network() -> bool: try: self.wait_on_action( self.get_client().servers.attach_to_network( server=Server(self.vm_id), network=nw, ip=x["privateIpAddress"], alias_ips=x["aliasIpAddresses"], )) except APIException as e: if e.code == "conflict": return False else: raise else: self._update_attr("server_networks", x["network"], x) return True self.logger.log( f"attaching instance to network ‘{name}’ [{nw.id}]...") check_wait(attach_to_network) def _handle_changed_floating_ips(self, defn: HetznerCloudDefinition, allow_recreate: bool) -> None: """ Detects and corrects any floating IP state desynchronisation. 
""" assigned: Set[str] = { x.name for x in self.get_instance().public_net.floating_ips } for name in self.ip_addresses.keys(): fip: Optional[BoundFloatingIP] = self.get_client( ).floating_ips.get_by_name(name) # Detect manually destroyed floating IPs if fip is None: if name not in defn.ip_addresses: # we dont need it self.logger.warn( f"forgetting about floating IP ‘{name}’ that no longer" " exists and is no longer needed by the deployment" " specification") self._update_attr("ip_addresses", name, None) else: if name.startswith("nixops-" + self.depl.uuid): raise Exception( f"floating IP ‘{name}’ (used by {self.full_name})" " no longer exists; run ‘nixops deploy --check’" " to update resource state") else: raise Exception( f"floating IP ‘{name}’ (used by {self.full_name})" " was manually destroyed") # Detect unassigned floating IPs elif name not in assigned: if name not in defn.ip_addresses: # we dont need it self.logger.warn( f"forgetting about unassigned floating IP ‘{name}’ [{fip.id}]" " that is no longer needed by the deployment specification" ) else: # we do need it self.logger.warn( f"floating IP ‘{name}’ [{fip.id}] was manually unassigned;" " will reassign it.") self._update_attr("ip_addresses", name, None) # Assign missing floating IPs. for name in defn.ip_addresses: if name not in self.ip_addresses: fip = self.get_client().floating_ips.get_by_name(name) if fip is None: raise Exception(f"tried to assign floating IP ‘{name}’" " but it doesn't exist...") self.logger.log( f"assigning floating IP ‘{name}’ [{fip.id}]...") self.wait_on_action(fip.assign(Server(self.vm_id))) self._update_attr("ip_addresses", name, fip.ip) def _handle_changed_volumes(self, defn: HetznerCloudDefinition, allow_recreate: bool) -> None: """ Detects and corrects any volume state desynchronisation. 
""" attached: Set[str] = {x.name for x in self.get_instance().volumes} for name in self.volumes.keys(): volume: Optional[BoundVolume] = self.get_client( ).volumes.get_by_name(name) # Detect destroyed volumes. if volume is None: if name not in defn.volumes: # we dont need it self.logger.warn( f"forgetting about volume ‘{name}’ that no longer exists" " and is no longer needed by the deployment specification" ) else: if name.startswith("nixops-" + self.depl.uuid): raise Exception( f"volume ‘{name}’ (used by {self.full_name}) no longer exists;" " run ‘nixops deploy --check’ to update resource state" ) else: raise Exception( f"volume ‘{name}’ (used by {self.full_name}) was" " manually destroyed") # Detect detached volumes. elif name not in attached: if name not in defn.volumes: # we dont need it self.logger.warn( f"forgetting about detached volume ‘{name}’ [{volume.id}]" " that is no longer needed by the deployment specification" ) else: # we do need it self.logger.warn( f"volume ‘{name}’ [{volume.id}] was manually detached;" " will reattach it") self._update_attr("volumes", name, None) # Detach existing attached volumes if required. elif name not in defn.volumes: self.logger.warn( f"detaching volume ‘{name}’ [{volume.id}] that is no longer" " needed by the deployment specification") volume.detach().wait_until_finished() self._update_attr("volumes", name, None) # Attach missing volumes. resize filesystems if required, before mounting. for name, v in defn.volumes.items(): if name not in self.volumes: # Check if it exists. resources will have been created if user ran check, # but prexisting vols which got deleted may be gone (detected in code above) volume = self.get_client().volumes.get_by_name(name) if volume is None: self.logger.warn( f"tried to attach non-NixOps managed volume ‘{name}’," " but it doesn't exist... 
skipping") continue elif volume.location.name != self.location: raise Exception( f"volume ‘{name}’ [{volume.id}] is in a different location" " to {self.full_name}; attempting to attach it will fail." ) elif (volume.server and volume.server.id != self.vm_id and self.depl.logger.confirm( f"volume ‘{name}’ is in use by instance ‘{volume.server.id}’," " are you sure you want to attach this volume?") ): # noqa: E124 self.logger.log( f"detaching volume ‘{name}’ from instance ‘{volume.server.id}’..." ) volume.detach().wait_until_finished() volume.server = None # Attach volume. self.logger.log(f"attaching volume ‘{name}’ [{volume.id}]... ") volume.attach(Server(self.vm_id)).wait_until_finished() # Wait until the device is visible in the instance. v["device"] = self.get_udev_name(volume.id) def check_device() -> bool: return 0 == self.run_command(f"test -e {v['device']}", check=False) if not check_wait( check_device, initial=1, max_tries=10, exception=False): # If stopping times out, then do an unclean shutdown. self.logger.log_end("(timed out)") self.logger.log(f"can't find device ‘{v['device']}’...") self.logger.log("available devices:") self.run_command("lsblk") raise Exception("operation timed out") else: self._update_attr("volumes", name, v) self.logger.log_end("") # Grow filesystems on resource based volumes. # We want to grow the fs when its volume gets resized, but if the # volume isn't attached to any server at the time, thats not possible. # Blindly trying to grow all volumes when mounting them just in case # they got resized while they were orphaned is bad. Workaround: # the needsFSResize attribute of VolumeState is set when the volume # gets resized by NixOps. When attaching a volume NixOps will use this # flag to decide whether to grow the filesystem. 
if name.startswith("nixops-" + self.depl.uuid): res = self.depl.get_typed_resource(name[44:], "hetznercloud-volume", VolumeState) # get correct option definitions for volume resources v["size"] = res._state["size"] v["fsType"] = res._state["fsType"] v["device"] = self.get_udev_name(res._state["resourceId"]) question = ( f"volume {name} was resized, do you wish to grow its" " filesystem to fill the space?") op = (f"umount {v['device']} ;" f"e2fsck -fy {v['device']} &&" f"resize2fs {v['device']}") if (v["fsType"] == "ext4" and res.needsFSResize and self.depl.logger.confirm(question) and self.run_command(op, check=False) == 0): with res.depl._db: res.needsFSResize = False self._update_attr("volumes", name, v) if v["mountPoint"]: volume = self.get_client().volumes.get_by_name(name) v["device"] = self.get_udev_name(volume.id) self._update_attr("volumes", name, v) def after_activation(self, defn: HetznerCloudDefinition) -> None: # Unlike ext4, xfs filesystems must be resized while the underlying drive is mounted. # Thus this operation is delayed until after activation. 
for name, v in self.volumes.items(): if (name.startswith("nixops-" + self.depl.uuid) and v["mountPoint"] and v["fsType"] == "xfs"): res = self.depl.get_typed_resource(name[44:], "hetznercloud-volume", VolumeState) question = ( f"volume {name} was resized, do you wish to grow its" " filesystem to fill the space?") if (res.needsFSResize and self.depl.logger.confirm(question) and 0 == self.run_command( f"xfs_growfs {v['mountPoint']}", check=False)): with res.depl._db: res.needsFSResize = False def create_after( self, resources, defn: HetznerCloudDefinition) -> Set[HetznerCloudResourceState]: return { r for r in resources if isinstance(r, FloatingIPState) or isinstance(r, NetworkState) or isinstance(r, VolumeState) } def _create_ssh_key(self, public_key: str) -> BoundSSHKey: """Create or get a hetzner cloud ssh key.""" public_key = public_key.strip() hetzner_ssh_keys: List[BoundSSHKey] = self.get_client( ).ssh_keys.get_all() name: str = f"nixops-{self.depl.uuid}-{self.name}" for key in hetzner_ssh_keys: if key.public_key.strip() == public_key: return key elif key.name == name: self.get_client().ssh_keys.delete(key) ssh_key: BoundSSHKey = self.get_client().ssh_keys.create( name=name, public_key=public_key, ) return ssh_key def _create_instance(self, defn) -> None: if not self.public_client_key: (private, public) = create_key_pair(type="ed25519") self.public_client_key = public self.private_client_key = private if not self.public_host_key: (private, public) = create_key_pair(type="ed25519") self.public_host_key = public self.private_host_key = private location: BoundLocation = self.get_client().locations.get_by_name( defn.location) ssh_keys: List[BoundSSHKey] = [ self._create_ssh_key(self.public_client_key) ] # Ensure host keys get injected into the base OS user_data = ("#cloud-config\n" "ssh_keys:\n" " ed25519_public: {0}\n" " ed25519_private: |\n" " {1}").format( self.public_host_key, self.private_host_key.replace("\n", "\n ")) self.logger.log_start( f"creating 
{defn.server_type} server at {location.description}...") response = self.get_client().servers.create( name=defn.server_name, labels={ **self.get_common_labels(), **dict(defn.labels) }, location=location, server_type=ServerType(defn.server_type), ssh_keys=ssh_keys, user_data=user_data, image=Image(name="ubuntu-20.04"), # for lustration start_after_create=True, ) self.state = self.STARTING self.wait_on_action(response.action) with self.depl._db: self.vm_id = response.server.id self.public_ipv4 = response.server.public_net.ipv4.ip self.public_ipv6 = response.server.public_net.ipv6.ip self.server_name = defn.server_name self.server_type = defn.server_type self.legacy_if_scheme = defn.server_type.startswith("cx") self.location = defn.location self.labels = dict(defn.labels) self.private_host_key = None known_hosts.add(self.public_ipv4, self.public_host_key) self.logger.log_end(f"{self.public_ipv4}") def create( # noqa: C901 self, defn: HetznerCloudDefinition, check: bool, allow_reboot: bool, allow_recreate: bool, ) -> None: self.api_token = defn.api_token if self.state != self.UP: check = True self.set_common_state(defn) if self.api_token and self.api_token != defn.api_token: raise Exception("cannot change api token of an existing instance") # Destroy the instance (if allowed) to handle attribute changes which # require recreating i.e. location if (self.vm_id and allow_recreate and self.location != defn.location and self.depl.logger.confirm( "changing server location requires recreate, are you sure?" )): self._destroy() # Stop the instance (if allowed) to handle attribute changes which # require rebooting i.e. server_type if self.vm_id and allow_reboot and self.server_type != defn.server_type: self.stop() check = True # Check whether the instance hasn't been killed behind our backs. # Handle changed server type. # Restart stopped instances. 
if self.vm_id and check: instance = self.get_instance() if instance is None or instance.status in {"deleting"}: if not allow_recreate: raise Exception( f"{self.full_name} went away;" " use ‘--allow-recreate’ to create a new one") status = instance.status if instance else "gone" self.logger.log( f"{self.full_name} went away (state ‘{status}’), will recreate" ) self.cleanup_state() # Modify the server type, if desired. TODO store disk size # in state to enable option to later downsize server type. if instance.status == "off" and self.server_type != defn.server_type: self.logger.log_start( f"changing server type from ‘{self.server_type}’ to" f" ‘{defn.server_type}’; may take a few minutes...") instance.change_type(ServerType(defn.server_type), upgrade_disk=True).wait_until_finished() self.logger.log_end("done!") with self.depl._db: self.server_type = defn.server_type self.logger.log("instance was stopped, restarting...") self.start() # Provision the instance. if not self.vm_id: self._create_instance(defn) self.wait_for_ssh() self.state = self.RESCUE self.logger.log_start("running nixos-infect") self.run_command("bash </dev/stdin 2>&1", stdin=open(INFECT_PATH)) self.logger.log("rebooting into NixOS 😎") self.reboot_sync() self.state = self.UP if self.location != defn.location: raise Exception("cannot change location of an existing instance" f" (from ‘{self.location}‘ to ‘{defn.location}‘);" " use ‘--allow-recreate’") if self.server_type != defn.server_type: raise Exception( "cannot change server type of a running instance" f" (from ‘{self.server_type}‘ to ‘{defn.server_type}‘);" " use ‘--allow-reboot’") # Update name or labels if they have changed. 
if self.server_name != defn.server_name or self.labels != defn.labels: self.logger.log("updating trivial modified attributes") self.get_instance().update(defn.server_name, { **self.get_common_labels(), **dict(defn.labels) }) self._handle_changed_floating_ips(defn, allow_recreate) self._handle_changed_volumes(defn, allow_recreate) self._handle_changed_server_networks(defn, allow_recreate) def _destroy(self) -> None: if self.state != self.UP: return self.logger.log(f"destroying {self.full_name}") # Detach volumes for name, v in self.volumes.items(): self.logger.log(f"detaching volume {name}...") self.get_client().volumes.get_by_name( name).detach().wait_until_finished() if (instance := self.get_instance()) is not None: instance.delete() # Remove host ssh key. self.get_client().ssh_keys.get_by_name( f"nixops-{self.depl.uuid}-{self.name}").delete() known_hosts.remove(self.public_ipv4, self.public_host_key) self.cleanup_state()
class GCEHTTPHealthCheckState(ResourceState):
    """State of a GCE HTTP Health Check"""

    # Attributes persisted in the NixOps state database.
    healthcheck_name = attr_property("gce.name", None)
    host = attr_property("gce.host", None)
    path = attr_property("gce.path", None)
    port = attr_property("gce.port", None, int)
    description = attr_property("gce.description", None)
    check_interval = attr_property("gce.checkInterval", None, int)
    timeout = attr_property("gce.timeout", None, int)
    unhealthy_threshold = attr_property("gce.unhealthyThreshold", None, int)
    healthy_threshold = attr_property("gce.healthyThreshold", None, int)

    @classmethod
    def get_type(cls):
        # Resource type name as referenced in the deployment specification.
        return "gce-http-health-check"

    def __init__(self, depl, name, id):
        ResourceState.__init__(self, depl, name, id)

    def show_type(self):
        """Human-readable type tag; includes port/path once deployed."""
        s = super(GCEHTTPHealthCheckState, self).show_type()
        if self.state == self.UP:
            s = "{0} [:{1}{2}]".format(s, self.port, self.path)
        return s

    @property
    def resource_id(self):
        return self.healthcheck_name

    nix_name = "gceHTTPHealthChecks"

    @property
    def full_name(self):
        return "GCE HTTP health check '{0}'".format(self.healthcheck_name)

    def healthcheck(self):
        """Fetch the live health check object from the GCE API."""
        return self.connect().ex_get_healthcheck(self.healthcheck_name)

    # Properties copied verbatim from the definition; also used by
    # properties_changed() to detect state/definition drift.
    defn_properties = [
        "host",
        "path",
        "port",
        "description",
        "check_interval",
        "timeout",
        "unhealthy_threshold",
        "healthy_threshold",
    ]

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Create or update the health check to match the definition.

        When ``check`` is set, first reconcile the recorded state with the
        live GCE resource (or note that it is missing).
        """
        self.no_project_change(defn)
        self.copy_credentials(defn)
        self.healthcheck_name = defn.healthcheck_name

        if check:
            try:
                hc = self.healthcheck()
                if self.state == self.UP:
                    # Sync every tracked property with the live resource.
                    self.handle_changed_property("host", hc.extra["host"])
                    self.handle_changed_property("path", hc.path)
                    self.handle_changed_property("port", hc.port)
                    self.handle_changed_property("timeout", hc.timeout)
                    self.handle_changed_property("description",
                                                 hc.extra["description"])
                    self.handle_changed_property("check_interval", hc.interval)
                    self.handle_changed_property(
                        "healthy_threshold", hc.healthy_threshold
                    )
                    self.handle_changed_property(
                        "unhealthy_threshold", hc.unhealthy_threshold
                    )
                else:
                    # Exists remotely but not in state: likely a botched
                    # earlier creation; offer to delete it.
                    self.warn_not_supposed_to_exist()
                    self.confirm_destroy(hc, self.full_name)
            except libcloud.common.google.ResourceNotFoundError:
                self.warn_missing_resource()

        if self.state != self.UP:
            self.log("creating {0}...".format(self.full_name))
            try:
                healthcheck = self.connect().ex_create_healthcheck(
                    defn.healthcheck_name,
                    host=defn.host,
                    path=defn.path,
                    port=defn.port,
                    interval=defn.check_interval,
                    timeout=defn.timeout,
                    unhealthy_threshold=defn.unhealthy_threshold,
                    healthy_threshold=defn.healthy_threshold,
                    description=defn.description,
                )
            except libcloud.common.google.ResourceExistsError:
                raise Exception(
                    "tried creating a health check that already exists; "
                    "please run 'deploy --check' to fix this"
                )
            self.state = self.UP
            self.copy_properties(defn)

        # update the health check resource if its definition and state are out of sync
        if self.properties_changed(defn):
            self.log("updating properties of {0}...".format(self.full_name))
            try:
                hc = self.healthcheck()
                hc.path = defn.path
                hc.port = defn.port
                hc.interval = defn.check_interval
                hc.timeout = defn.timeout
                hc.unhealthy_threshold = defn.unhealthy_threshold
                hc.healthy_threshold = defn.healthy_threshold
                # host and description live in the 'extra' dict on the
                # libcloud GCEHealthCheck object.
                hc.extra["host"] = defn.host
                hc.extra["description"] = defn.description
                hc.update()
                self.copy_properties(defn)
            except libcloud.common.google.ResourceNotFoundError:
                raise Exception(
                    "{0} has been deleted behind our back; "
                    "please run 'deploy --check' to fix this".format(self.full_name)
                )

    def destroy(self, wipe=False):
        """Destroy the health check after confirmation; True on success."""
        if self.state == self.UP:
            try:
                healthcheck = self.healthcheck()
                return self.confirm_destroy(healthcheck, self.full_name, abort=False)
            except libcloud.common.google.ResourceNotFoundError:
                self.warn(
                    "tried to destroy {0} which didn't exist".format(self.full_name)
                )
        return True
class StorageResourceState(ResourceState):
    """Common base for Azure storage sub-resources (blobs, queues, tables,
    files) that are accessed through a storage-account access key."""

    access_key = attr_property("azure.accessKey", None)

    def __init__(self, depl, name, id):
        ResourceState.__init__(self, depl, name, id)
        # Lazily-constructed Azure service clients.
        self._bs = None
        self._qs = None
        self._ts = None
        self._fs = None

    def get_resource(self):
        """Return the underlying Azure resource, or None if it is missing.

        Connection errors are deliberately re-raised instead of being
        interpreted as "resource absent", to avoid destroying state (and
        possibly data) on a transient DNS/network failure.
        """
        try:
            return self.get_resource_allow_exceptions()
        except requests.exceptions.ConnectionError:
            self.warn(
                "connection error: either storage doesn't exist and thus this resource "
                "doesn't exist as well, the storage domain name is in negative DNS cache "
                "or your network connection is down; you must either re-deploy the storage, "
                "drop DNS cache or delete this resource manually; aborting to avoid data loss"
            )
            raise
        except azure.common.AzureMissingResourceHttpError:
            return None

    def bs(self):
        """Blob service client, created on first use."""
        if not self._bs:
            self._bs = BlobService(self.get_storage_name(), self.get_key())
        return self._bs

    def qs(self):
        """Queue service client, created on first use."""
        if not self._qs:
            self._qs = QueueService(self.get_storage_name(), self.get_key())
        return self._qs

    def ts(self):
        """Table service client, created on first use."""
        if not self._ts:
            self._ts = TableService(self.get_storage_name(), self.get_key())
        return self._ts

    def fs(self):
        """File service client, created on first use."""
        if not self._fs:
            self._fs = FileService(self.get_storage_name(), self.get_key())
        return self._fs

    # Signed Identifiers handling helpers

    def _signed_identifiers_to_dict(self, signed_identifiers):
        """Convert an Azure SignedIdentifiers object into a plain dict
        keyed by identifier id, suitable for storing in state."""
        return {
            s_id.id: {
                'start': s_id.access_policy.start,
                'expiry': s_id.access_policy.expiry,
                'permissions': s_id.access_policy.permission,
            }
            for s_id in signed_identifiers.signed_identifiers
        }

    def _dict_to_signed_identifiers(self, signed_identifiers):
        """Inverse of _signed_identifiers_to_dict: rebuild the Azure
        SignedIdentifiers object from the stored dict."""
        result = SignedIdentifiers()
        # FIX: dict.iteritems() is Python 2 only and raises AttributeError
        # on Python 3; use items() instead.
        for _id, policy in signed_identifiers.items():
            identifier = SignedIdentifier()
            identifier.id = _id
            identifier.access_policy = AccessPolicy(
                start=policy['start'],
                expiry=policy['expiry'],
                permission=policy['permissions'])
            result.signed_identifiers.append(identifier)
        return result

    def handle_changed_signed_identifiers(self, signed_identifiers):
        """Reconcile stored signed identifiers with the live resource's."""
        self.handle_changed_property(
            'signed_identifiers',
            self._signed_identifiers_to_dict(signed_identifiers))

    def handle_changed_metadata(self, resource_with_metadata):
        """Extract 'x-ms-meta-*' entries (prefix stripped) into a metadata
        dict and reconcile it with the stored state."""
        metadata = {
            k[10:]: v
            for k, v in resource_with_metadata.items()
            if k.startswith('x-ms-meta-')
        }
        self.handle_changed_property('metadata', metadata)
class GCENetworkState(ResourceState):
    """State of a GCE Network and the firewall rules attached to it."""

    address_range = attr_property("gce.addressRange", None)
    network_name = attr_property("gce.network_name", None)
    # Mapping of rule name -> rule definition, stored as JSON in state.
    firewall = attr_property("gce.firewall", {}, "json")

    @classmethod
    def get_type(cls):
        return "gce-network"

    def __init__(self, depl, name, id):
        ResourceState.__init__(self, depl, name, id)

    def show_type(self):
        """Human-readable type tag; includes the CIDR range once deployed."""
        s = super(GCENetworkState, self).show_type()
        if self.state == self.UP:
            s = "{0} [{1}]".format(s, self.address_range)
        return s

    @property
    def resource_id(self):
        return self.network_name

    nix_name = "gceNetworks"

    @property
    def full_name(self):
        return "GCE network '{0}'".format(self.network_name)

    def network(self):
        """Fetch the live network object from the GCE API."""
        return self.connect().ex_get_network(self.network_name)

    def update_firewall(self, k, v):
        """Set (or, when v is None, remove) rule k in the stored firewall
        mapping; reassignment is required for the JSON attr to persist."""
        x = self.firewall
        # FIX: idiomatic identity test (was ‘v == None’).
        if v is None:
            x.pop(k, None)
        else:
            x[k] = v
        self.firewall = x

    def firewall_name(self, name):
        """GCE-side name of rule ‘name’, prefixed with the network name."""
        return "{0}-{1}".format(self.network_name, name)

    def full_firewall_name(self, name):
        return "GCE firewall '{0}'".format(self.firewall_name(name))

    def warn_if_firewall_changed(
        self, fw_name, expected_state, actual_state, name, can_fix=True
    ):
        """warn_if_changed specialised for a named firewall rule."""
        return self.warn_if_changed(
            expected_state,
            actual_state,
            name,
            resource_name=self.full_firewall_name(fw_name),
            can_fix=can_fix,
        )

    def destroy_firewall(self, fwname):
        """Destroy rule ‘fwname’ remotely and drop it from state."""
        self.log("destroying {0}...".format(self.full_firewall_name(fwname)))
        try:
            fw_n = self.firewall_name(fwname)
            self.connect().ex_get_firewall(fw_n).destroy()
        except libcloud.common.google.ResourceNotFoundError:
            self.warn(
                "tried to destroy {0} which didn't exist".format(
                    self.full_firewall_name(fwname)
                )
            )
        self.update_firewall(fwname, None)

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Create the network if needed and reconcile its firewall rules
        (delete stray ones, update changed ones, add missing ones)."""
        self.no_property_change(defn, "address_range")
        self.no_project_change(defn)
        self.copy_credentials(defn)
        self.network_name = defn.network_name

        if check:
            try:
                network = self.network()
                if self.state == self.UP:
                    # The address range is immutable; only warn on drift.
                    self.handle_changed_property(
                        "address_range", network.cidr, can_fix=False
                    )
                else:
                    self.warn_not_supposed_to_exist()
                    self.confirm_destroy(network, self.full_name)
            except libcloud.common.google.ResourceNotFoundError:
                self.warn_missing_resource()

        if self.state != self.UP:
            self.log("creating {0}...".format(self.full_name))
            try:
                # Return value not needed; only creation success matters.
                self.connect().ex_create_network(
                    defn.network_name, defn.address_range
                )
            except libcloud.common.google.ResourceExistsError:
                raise Exception(
                    "tried creating a network that already exists; "
                    "please run 'deploy --check' to fix this"
                )
            self.state = self.UP
            self.address_range = defn.address_range

        # handle firewall rules
        def trans_allowed(attrs):
            # Convert {proto: ports} into the GCE 'allowed' list format;
            # 'ports' is omitted when the protocol has no port restriction.
            return [
                dict(
                    [("IPProtocol", proto)]
                    + ([("ports", ports)] if ports is not None else [])
                )
                for proto, ports in attrs.items()
            ]

        if check:
            firewalls = [
                f
                for f in self.connect().ex_list_firewalls()
                if f.network.name == defn.network_name
            ]

            # delete stray rules and mark changed ones for update
            for fw in firewalls:
                fw_name = next(
                    (
                        k
                        for (k, v) in self.firewall.items()
                        if fw.name == self.firewall_name(k)
                    ),
                    None,
                )
                if fw_name:
                    rule = self.firewall[fw_name]
                    rule["sourceRanges"] = self.warn_if_firewall_changed(
                        fw_name,
                        rule["sourceRanges"],
                        normalize_list(fw.source_ranges),
                        "source ranges",
                    )
                    rule["sourceTags"] = self.warn_if_firewall_changed(
                        fw_name,
                        rule["sourceTags"],
                        normalize_list(fw.source_tags),
                        "source tags",
                    )
                    rule["targetTags"] = self.warn_if_firewall_changed(
                        fw_name,
                        rule["targetTags"],
                        normalize_list(fw.target_tags),
                        "target tags",
                    )
                    if fw.allowed != trans_allowed(rule["allowed"]):
                        self.warn(
                            "{0} allowed ports and protocols have changed unexpectedly".format(
                                self.full_firewall_name(fw_name)
                            )
                        )
                        rule["allowed"] = {}  # mark for update
                    self.update_firewall(fw_name, rule)
                else:
                    # FIX: this branch is reached when fw_name is None, so
                    # the message must name the stray remote rule itself
                    # (previously it formatted firewall_name(None)).
                    self.warn(
                        "deleting {0} which isn't supposed to exist...".format(
                            fw.name
                        )
                    )
                    fw.destroy()

            # find missing firewall rules
            for k, v in self.firewall.items():
                if not any(fw.name == self.firewall_name(k) for fw in firewalls):
                    self.warn("firewall rule '{0}' has disappeared...".format(k))
                    self.update_firewall(k, None)

        # add new and update changed
        for k, v in defn.firewall.items():
            if k in self.firewall:
                if v == self.firewall[k]:
                    continue
                self.log("updating {0}...".format(self.firewall_name(k)))
                try:
                    firewall = self.connect().ex_get_firewall(
                        self.firewall_name(k)
                    )
                    firewall.allowed = trans_allowed(v["allowed"])
                    firewall.source_ranges = v["sourceRanges"]
                    firewall.source_tags = v["sourceTags"]
                    firewall.target_tags = v["targetTags"]
                    firewall.update()
                except libcloud.common.google.ResourceNotFoundError:
                    raise Exception(
                        "tried updating a firewall rule that doesn't exist; "
                        "please run 'deploy --check' to fix this"
                    )
            else:
                self.log("creating {0}...".format(self.full_firewall_name(k)))
                try:
                    self.connect().ex_create_firewall(
                        self.firewall_name(k),
                        trans_allowed(v["allowed"]),
                        network=self.network_name,
                        source_ranges=v["sourceRanges"],
                        source_tags=v["sourceTags"],
                        target_tags=v["targetTags"],
                    )
                except libcloud.common.google.ResourceExistsError:
                    raise Exception(
                        "tried creating a firewall rule that already exists; "
                        "please run 'deploy --check' to fix this"
                    )
            self.update_firewall(k, v)

        # delete unneeded
        for k in set(self.firewall.keys()) - set(defn.firewall.keys()):
            self.destroy_firewall(k)

    def destroy(self, wipe=False):
        """Destroy the firewall rules and the network after confirmation."""
        if self.state == self.UP:
            try:
                network = self.network()
                if not self.depl.logger.confirm(
                    "are you sure you want to destroy {0}?".format(self.full_name)
                ):
                    return False
                # Firewall rules must go before the network itself.
                for k in self.firewall.keys():
                    self.destroy_firewall(k)
                self.log("destroying {0}...".format(self.full_name))
                network.destroy()
            except libcloud.common.google.ResourceNotFoundError:
                self.warn(
                    "tried to destroy {0} which didn't exist".format(self.full_name)
                )
        return True
class GSEBucketState(ResourceState):
    """State of a GSE Bucket"""

    bucket_name = attr_property("gce.name", None)
    cors = attr_property("gce.cors", [], 'json')
    lifecycle = attr_property("gce.lifecycle", [], 'json')
    log_bucket = attr_property("gce.logBucket", None)
    log_object_prefix = attr_property("gce.logObjectPrefix", None)
    region = attr_property("gce.region", None)
    storage_class = attr_property("gce.storageClass", None)
    versioning_enabled = attr_property("gce.versioningEnabled", None, bool)
    website_main_page_suffix = attr_property("gce.websiteMainPageSuffix", None)
    website_not_found_page = attr_property("gce.websiteNotFoundPage", None)

    @classmethod
    def get_type(cls):
        return "gse-bucket"

    def __init__(self, depl, name, id):
        ResourceState.__init__(self, depl, name, id)

    def show_type(self):
        s = super(GSEBucketState, self).show_type()
        if self.state == self.UP:
            s = "{0}".format(s)
        return s

    @property
    def resource_id(self):
        return self.bucket_name

    nix_name = "gseBuckets"

    @property
    def full_name(self):
        return "GSE bucket '{0}'".format(self.bucket_name)

    def connect(self):
        """Lazily create the Google Storage connection."""
        if not self._conn:
            self._conn = GSEConnection(self.service_account,
                                       self.access_key_path, True)
        return self._conn

    # Properties copied from the definition and compared against it to
    # detect state/definition drift.
    defn_properties = [
        'cors', 'lifecycle', 'log_bucket', 'log_object_prefix', 'region',
        'storage_class', 'versioning_enabled', 'website_main_page_suffix',
        'website_not_found_page'
    ]

    def bucket_resource(self, defn):
        """Build the JSON bucket resource body for insert/patch calls."""
        return {
            'name': defn.bucket_name,
            'cors': [{
                'origin': c['origins'],
                'method': c['methods'],
                'responseHeader': c['response_headers'],
                'maxAgeSeconds': c['max_age_seconds']
            } for c in defn.cors],
            'lifecycle': {
                'rule': [{
                    'action': {
                        'type': r['action']
                    },
                    'condition': {
                        'age': r['age'],
                        'isLive': r['is_live'],
                        'createdBefore': r['created_before'],
                        'numNewerVersions': r['number_of_newer_versions']
                    }
                } for r in defn.lifecycle]
            },
            'location': defn.region,
            'logging': {
                'logBucket': defn.log_bucket,
                'logObjectPrefix': defn.log_object_prefix
            } if defn.log_bucket is not None else {},
            'storageClass': defn.storage_class,
            'versioning': {
                'enabled': defn.versioning_enabled
            },
            'website': {
                'mainPageSuffix': defn.website_main_page_suffix,
                'notFoundPage': defn.website_not_found_page
            }
        }

    def bucket(self):
        """Fetch the live bucket (full projection) from the API."""
        return self.connect().request("/{0}?projection=full".format(
            self.bucket_name), method="GET").object

    def delete_bucket(self):
        return self.connect().request("/{0}".format(self.bucket_name),
                                      method='DELETE')

    def create_bucket(self, defn):
        return self.connect().request("?project={0}".format(self.project),
                                      method='POST',
                                      data=self.bucket_resource(defn))

    def update_bucket(self, defn):
        return self.connect().request("/{0}".format(self.bucket_name),
                                      method='PATCH',
                                      data=self.bucket_resource(defn))

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Create or update the bucket to match the definition."""
        self.no_property_change(defn, 'storage_class')
        self.no_project_change(defn)
        self.no_region_change(defn)
        self.copy_credentials(defn)
        self.bucket_name = defn.bucket_name

        if check:
            try:
                b = self.bucket()
                if self.state == self.UP:
                    self.handle_changed_property('region',
                                                 b['location'],
                                                 can_fix=False)
                    self.handle_changed_property('storage_class',
                                                 b['storageClass'],
                                                 can_fix=False)
                    self.handle_changed_property(
                        'log_bucket',
                        b.get('logging', {}).get('logBucket', None))
                    self.handle_changed_property(
                        'log_object_prefix',
                        b.get('logging', {}).get('logObjectPrefix', None))
                    self.handle_changed_property('versioning_enabled',
                                                 b['versioning']['enabled'])
                    self.handle_changed_property(
                        'website_main_page_suffix',
                        b.get('website', {}).get('mainPageSuffix', None))
                    self.handle_changed_property(
                        'website_not_found_page',
                        b.get('website', {}).get('notFoundPage', None))

                    # FIX: dicts are unorderable on Python 3, so a bare
                    # sorted() over a list of dicts raises TypeError as soon
                    # as there is more than one entry. Sort by repr() purely
                    # for a deterministic order; the dicts are built with a
                    # fixed key order so repr() is stable.
                    # NOTE(review): assumes the stored 'cors'/'lifecycle'
                    # state uses the same ordering — confirm against the
                    # definition-side normalization.
                    actual_cors = sorted(
                        [{
                            'origins': sorted(c.get('origin', [])),
                            'methods': sorted(c.get('method', [])),
                            'response_headers':
                            sorted(c.get('responseHeader', [])),
                            'max_age_seconds': int(c.get('maxAgeSeconds'))
                        } for c in b.get('cors', {})],
                        key=repr)
                    self.handle_changed_property('cors',
                                                 actual_cors,
                                                 property_name='CORS config')

                    actual_lifecycle = sorted(
                        [{
                            'action':
                            r.get('action', {}).get('type', None),
                            'age':
                            r.get('condition', {}).get('age', None),
                            'is_live':
                            r.get('condition', {}).get('isLive', None),
                            'created_before':
                            r.get('condition', {}).get('createdBefore', None),
                            'number_of_newer_versions':
                            r.get('condition', {}).get('numNewerVersions',
                                                       None),
                        } for r in b.get('lifecycle', {}).get('rule', [])],
                        key=repr)
                    self.handle_changed_property(
                        'lifecycle',
                        actual_lifecycle,
                        property_name='lifecycle config')
                else:
                    # Buckets can hold user data: require confirmation.
                    self.warn_not_supposed_to_exist(valuable_resource=True,
                                                    valuable_data=True)
                    if self.depl.logger.confirm(
                            "are you sure you want to destroy the existing {0}?"
                            .format(self.full_name)):
                        self.log("destroying...")
                        self.delete_bucket()
                    else:
                        raise Exception("can't proceed further")
            except libcloud.common.google.ResourceNotFoundError:
                self.warn_missing_resource()

        if self.state != self.UP:
            self.log("Creating {0}...".format(self.full_name))
            try:
                self.create_bucket(defn)
            except libcloud.common.google.GoogleBaseError as e:
                if e.value.get(
                        'message', None
                ) == 'You already own this bucket. Please select another name.':
                    raise Exception(
                        "tried creating a GSE bucket that already exists; "
                        "please run 'deploy --check' to fix this")
                else:
                    raise
            self.state = self.UP
            self.copy_properties(defn)

        if self.properties_changed(defn):
            self.log("updating {0}...".format(self.full_name))
            self.update_bucket(defn)
            self.copy_properties(defn)

    def destroy(self, wipe=False):
        """Delete the bucket after confirmation; True on success."""
        if self.state == self.UP:
            try:
                bucket = self.bucket()
                if not self.depl.logger.confirm(
                        "are you sure you want to destroy {0}?".format(
                            self.full_name)):
                    return False
                self.log("destroying {0}...".format(self.full_name))
                self.delete_bucket()
            except libcloud.common.google.ResourceNotFoundError:
                self.warn("tried to destroy {0} which didn't exist".format(
                    self.full_name))
        return True
class ResourceState(nixops.resources.ResourceState):
    """Base state class for Azure resources: holds management credentials,
    lazily-created management clients and common reconcile helpers."""

    # Credentials persisted in the NixOps state database.
    subscription_id = attr_property("azure.subscriptionId", None)
    authority_url = attr_property("azure.authorityUrl", None)
    identifier_uri = attr_property("azure.identifierUri", None)
    app_id = attr_property("azure.appId", None)
    app_key = attr_property("azure.appKey", None)

    # Process-wide token cache shared by all resources, guarded by a lock.
    # NOTE(review): entries are never evicted, so token expiry appears to be
    # unhandled — confirm whether adal tokens can outlive a deployment run.
    tokens_lock = threading.Lock()
    tokens = {}  # type: Dict[str, Dict]

    def __init__(self, depl, name, id):
        nixops.resources.ResourceState.__init__(self, depl, name, id)
        # Lazily-created management clients (resource/compute/network/storage).
        self._rmc = None
        self._cmc = None
        self._nrpc = None
        self._smc = None

    def get_mgmt_credentials(self):
        """Acquire (or reuse a cached) AAD token and wrap it in
        SubscriptionCloudCredentials for the management clients."""
        with self.tokens_lock:
            # Cache key combines everything that influences the token.
            token_id = "{0}|||{1}|||{2}".format(self.authority_url,
                                                self.app_id, self.app_key)
            if token_id in self.tokens:
                token = self.tokens[token_id]
            else:
                try:
                    context = adal.AuthenticationContext(self.authority_url)
                    token = context.acquire_token_with_client_credentials(
                        str(self.identifier_uri), str(self.app_id),
                        str(self.app_key))
                except Exception as e:
                    # Prefix the original message so auth failures are
                    # recognizable; preserve the remaining args.
                    e.args = ("Auth failure: {0}".format(
                        e.args[0]), ) + e.args[1:]
                    raise
                self.tokens[token_id] = token
        return SubscriptionCloudCredentials(self.subscription_id,
                                            token['accessToken'])

    def rmc(self):
        """Resource management client, created on first use."""
        if not self._rmc:
            self._rmc = ResourceManagementClient(self.get_mgmt_credentials())
        return self._rmc

    def cmc(self):
        """Compute management client; registers the provider on first use."""
        if not self._cmc:
            self.rmc().providers.register('Microsoft.Compute')
            self._cmc = ComputeManagementClient(self.get_mgmt_credentials())
            # Shorten the long-running-operation polling intervals.
            self._cmc.long_running_operation_initial_timeout = 3
            self._cmc.long_running_operation_retry_timeout = 5
        return self._cmc

    def nrpc(self):
        """Network provider client; registers the provider on first use."""
        if not self._nrpc:
            self.rmc().providers.register('Microsoft.Network')
            self._nrpc = NetworkResourceProviderClient(
                self.get_mgmt_credentials())
        return self._nrpc

    def smc(self):
        """Storage management client; registers the provider on first use."""
        if not self._smc:
            self.rmc().providers.register('Microsoft.Storage')
            self._smc = StorageManagementClient(self.get_mgmt_credentials())
        return self._smc

    def copy_mgmt_credentials(self, defn):
        """Copy management credentials from the definition into state."""
        self.subscription_id = defn.get_subscription_id()
        self.authority_url = defn.get_authority_url()
        self.identifier_uri = defn.get_identifier_uri()
        self.app_id = defn.get_app_id()
        self.app_key = defn.get_app_key()

    def is_deployed(self):
        return (self.state == self.UP)

    def is_failed(self, resource):
        """True if the Azure resource reports a failed provisioning state."""
        return resource.provisioning_state == 'Failed'

    def warn_if_failed(self, resource):
        if self.is_failed(resource):
            self.warn("resource exists, but is in a failed state")

    def no_change(self, condition, property_name):
        """Raise if a deployed resource would need an immutable change."""
        if self.is_deployed() and condition:
            raise Exception("cannot change the {0} of a deployed {1}".format(
                property_name, self.full_name))

    def no_property_change(self, defn, name):
        # Compare attribute ‘name’ between state and definition.
        self.no_change(
            getattr(self, name) != getattr(defn, name),
            name.replace('_', ' '))

    def no_subscription_id_change(self, defn):
        self.no_change(self.subscription_id != defn.get_subscription_id(),
                       'subscription ID')

    def no_location_change(self, defn):
        # Locations are compared in normalized form.
        self.no_change(
            normalize_location(self.location) != normalize_location(
                defn.location), 'location')

    def warn_missing_resource(self):
        """Mark a resource that should exist but is gone as MISSING."""
        if self.state == self.UP:
            self.warn(
                "{0} is supposed to exist, but is missing; recreating...".
                format(self.full_name))
            self.state = self.MISSING

    def warn_if_changed(self,
                        expected_state,
                        actual_state,
                        name,
                        resource_name=None,
                        can_fix=True):
        """Warn when the actual value differs from the expected one and
        return the actual value (always, changed or not)."""
        if expected_state != actual_state:
            self.warn(
                "{0} {1} has changed to '{2}'; expected it to be '{3}'{4}".
                format(resource_name or self.full_name, name, actual_state,
                       expected_state,
                       "" if can_fix else "; cannot fix this automatically"))
        return actual_state

    # use warn_if_changed for a very typical use case of dealing
    # with changed properties which are stored in attributes
    # with user-friendly names
    def handle_changed_property(self,
                                name,
                                actual_state,
                                property_name=None,
                                can_fix=True):
        self.warn_if_changed(getattr(self, name),
                             actual_state,
                             property_name or name.replace('_', ' '),
                             can_fix=can_fix)
        if can_fix:
            # Adopt the live value into state.
            setattr(self, name, actual_state)

    # use warn_if_changed for a very typical use case of dealing
    # with changed properties which are stored in dictionaries
    # with user-friendly names
    def handle_changed_dict(self,
                            resource,
                            name,
                            actual_state,
                            property_name=None,
                            resource_name=None,
                            can_fix=True):
        self.warn_if_changed(resource[name],
                             actual_state,
                             property_name or name.replace('_', ' '),
                             resource_name=resource_name,
                             can_fix=can_fix)
        if can_fix:
            resource[name] = actual_state

    def warn_not_supposed_to_exist(self,
                                   resource_name=None,
                                   valuable_data=False,
                                   valuable_resource=False):
        """Warn about a remote resource that exists without being tracked;
        mentions possible data/resource loss when flagged as valuable."""
        valuables = " or ".join(
            filter(
                None,
                [valuable_data and "data", valuable_resource and "resource"]))
        valuable_msg = (
            "; however, this also could be a resource name collision, "
            "and valuable {0} could be lost; before proceeding, "
            "please ensure that this isn't so".format(valuables)
            if valuables else "")
        self.warn(
            "{0} exists, but isn't supposed to; probably, this is the result "
            "of a botched creation attempt and can be fixed by deletion{1}".
            format(resource_name or self.full_name, valuable_msg))

    def confirm_destroy(self, res_name=None, abort=True):
        """Ask the user, then destroy; with abort=True a refusal raises."""
        if self.depl.logger.confirm(
                "are you sure you want to destroy {0}?".format(
                    res_name or self.full_name)):
            self.log("destroying...")
            self.destroy_resource()
            return True
        else:
            if abort:
                raise Exception("can't proceed further")
            else:
                return False

    def destroy(self, wipe=False):
        """Destroy the resource if it still exists; True unless refused."""
        if self.state == self.UP:
            try:
                resource = self.get_settled_resource()
                if resource is None:
                    self.warn("tried to destroy {0} which didn't exist".format(
                        self.full_name))
                else:
                    return self.confirm_destroy(abort=False)
            except azure.common.AzureMissingResourceHttpError:
                self.warn("tried to destroy {0} which didn't exist".format(
                    self.full_name))
            except azure.common.AzureHttpError as e:
                # 204 No Content is how some Azure APIs report "not found".
                if e.status_code == 204:
                    self.warn("tried to destroy {0} which didn't exist".format(
                        self.full_name))
                else:
                    raise
        return True

    # API to handle copying properties from definition to state
    # after resource is created or updated and checking that
    # the state is out of sync with the definition
    def copy_properties(self, defn):
        for attr in self.defn_properties:
            setattr(self, attr, getattr(defn, attr))

    def properties_changed(self, defn):
        return any(
            getattr(self, attr) != getattr(defn, attr)
            for attr in self.defn_properties)

    # Certain resources are provisioned and destroyed asynchronously.
    # While resource is being created or destroyed, attempts at
    # creating, updating or destroying a resource with the same name may fail.
    # Thus we need to wait for certain resource states to settle.
    def is_settled(self, resource):
        # A missing resource counts as settled.
        return resource is None or (resource.provisioning_state
                                    in ['Succeeded', 'Failed'])

    def ensure_settled(self):
        """Block until the resource reaches a settled provisioning state."""
        def check_settled():
            resource = self.get_resource()
            return self.is_settled(resource)

        check_wait(check_settled, initial=1, max_tries=100, exception=True)

    def get_settled_resource(self, initial=1, factor=1, max_tries=60):
        """Poll until the resource settles and return it.

        Raises after max_tries attempts; the wait between polls is scaled
        by ‘factor’ each round (constant with the default factor of 1).
        """
        def _get_resource():
            try:
                return self.get_resource()
            except Exception as e:
                self.log("Failed getting access to {0}".format(self.full_name))
                raise

        wait = initial
        tries = 0
        resource = _get_resource()
        while tries < max_tries and not self.is_settled(resource):
            wait = wait * factor
            tries = tries + 1
            if tries == max_tries:
                raise Exception("resource failed to settle")
            time.sleep(wait)
            resource = _get_resource()
        return resource

    def get_resource_state(self, cls, name):
        """Find the deployment resource of type ‘cls’ whose resource_id is
        ‘name’, or None when cls/name is unset or nothing matches."""
        if cls is None:
            return None
        if not name:
            return None
        return next((r for r in self.depl.resources.values()
                     if isinstance(r, cls)
                     and getattr(r, 'resource_id', None) == name), None)

    # retrieve the resource and complain to the user if it doesn't exist
    def get_settled_resource_assert_exists(self):
        res = self.get_settled_resource()
        if res is None:
            raise Exception("{0} has been deleted behind our back; "
                            "please run 'deploy --check' to fix this".format(
                                self.full_name))
        return res
class HetznerState(MachineState):
    """State of a Hetzner machine."""

    @classmethod
    def get_type(cls):
        return "hetzner"

    state = attr_property("state", MachineState.UNKNOWN, int)
    main_ipv4 = attr_property("hetzner.mainIPv4", None)
    robot_admin_user = attr_property("hetzner.robotUser", None)
    robot_admin_pass = attr_property("hetzner.robotPass", None)
    partitions = attr_property("hetzner.partitions", None)
    just_installed = attr_property("hetzner.justInstalled", False, bool)
    rescue_passwd = attr_property("hetzner.rescuePasswd", None)
    fs_info = attr_property("hetzner.fsInfo", None)
    net_info = attr_property("hetzner.networkInfo", None, "json")
    hw_info = attr_property("hetzner.hardwareInfo", None)
    main_ssh_private_key = attr_property("hetzner.sshPrivateKey", None)
    main_ssh_public_key = attr_property("hetzner.sshPublicKey", None)

    def __init__(self, depl, name, id):
        MachineState.__init__(self, depl, name, id)
        self._robot = None  # lazily-created Robot connection, see connect()

    @property
    def resource_id(self) -> Optional[str]:
        return self.vm_id

    @property
    def public_ipv4(self) -> Optional[str]:
        return self.main_ipv4

    def connect(self) -> Robot:
        """
        Connect to the Hetzner robot by using the admin credentials in
        'self.robot_admin_user' and 'self.robot_admin_pass'.
        """
        if self._robot is not None:
            return self._robot
        self._robot = Robot(self.robot_admin_user, self.robot_admin_pass)
        return self._robot

    def _get_robot_user_and_pass(self, defn=None, default_user=None,
                                 default_pass=None):
        """
        Return (user, pass) for the main robot account, taken from the
        MachineDefinition passed by 'defn'. If the definition does not
        contain these credentials or is None, they are fetched from the
        HETZNER_ROBOT_USER/HETZNER_ROBOT_PASS environment variables,
        falling back to the given defaults. Raises if either is missing.
        """
        if defn is not None and len(defn.robot_user) > 0:
            robot_user = defn.robot_user
        else:
            robot_user = os.environ.get("HETZNER_ROBOT_USER", default_user)
        if defn is not None and len(defn.robot_pass) > 0:
            robot_pass = defn.robot_pass
        else:
            robot_pass = os.environ.get("HETZNER_ROBOT_PASS", default_pass)
        if robot_user is None:
            raise Exception(
                "please either set ‘deployment.hetzner.robotUser’"
                " or $HETZNER_ROBOT_USER for machine"
                " ‘{0}’".format(self.name)
            )
        elif robot_pass is None:
            raise Exception(
                "please either set ‘deployment.hetzner.robotPass’"
                " or $HETZNER_ROBOT_PASS for machine"
                " ‘{0}’".format(self.name)
            )
        return (robot_user, robot_pass)

    def _get_server_from_main_robot(self, ip, defn=None):
        """Fetch the server for 'ip' using the *main* robot account
        (not the per-machine sub-account)."""
        (robot_user, robot_pass) = self._get_robot_user_and_pass(defn=defn)
        if TEST_MODE:
            return TestModeServer()
        robot = Robot(robot_user, robot_pass)
        return robot.servers.get(ip)

    def _get_server_by_ip(self, ip):
        """
        Queries the robot for the given ip address and returns the Server
        instance if it was found.
        """
        if TEST_MODE:
            return TestModeServer()
        robot = self.connect()
        return robot.servers.get(ip)

    def get_ssh_private_key_file(self):
        if self._ssh_private_key_file:
            return self._ssh_private_key_file
        else:
            return self.write_ssh_private_key(self.main_ssh_private_key)

    def get_ssh_flags(self, *args, **kwargs):
        # In rescue mode the host key changes on every reboot, so host-key
        # checking is effectively disabled there.
        return super(HetznerState, self).get_ssh_flags(*args, **kwargs) + (
            [
                "-o", "LogLevel=quiet",
                "-o", "UserKnownHostsFile=/dev/null",
                "-o", "GlobalKnownHostsFile=/dev/null",
                "-o", "StrictHostKeyChecking=accept-new",
            ]
            if self.state == self.RESCUE
            else
            # XXX: Disabling strict host key checking will only impact the
            # behaviour on *new* keys, so it should be "reasonably" safe to do
            # this until we have a better way of managing host keys in
            # ssh_util. So far this at least avoids to accept every damn host
            # key on a large deployment.
            [
                "-o", "StrictHostKeyChecking=accept-new",
                "-i", self.get_ssh_private_key_file(),
            ]
        )

    def _wait_for_rescue(self, ip):
        """Wait for the machine to reboot into the rescue system and mark
        the state as RESCUE."""
        if not TEST_MODE:
            # In test mode, the target machine really doesn't go down at all,
            # so only wait for the reboot to finish when deploying real
            # systems.
            self.log_start("waiting for rescue system...")
            dotlog = lambda: self.log_continue(".")  # NOQA
            wait_for_tcp_port(ip, 22, open=False, callback=dotlog)
            self.log_continue("[down]")
            wait_for_tcp_port(ip, 22, callback=dotlog)
            self.log_end("[up]")
        self.state = self.RESCUE

    def _bootstrap_rescue_for_existing_system(self):
        """
        Make sure that an existing system is easy to work on and set
        everything up properly to enter a chrooted shell on the target
        system.
        """
        self.log_start("mounting /mnt/run... ")
        self.run_command("mkdir -m 0755 -p /mnt/run")
        self.run_command("mount -t tmpfs -o mode=0755 none /mnt/run")
        self.log_end("done.")

        self.log_start("symlinking /mnt/run/current-system... ")
        self.run_command(
            "ln -s /nix/var/nix/profiles/system "
            "/mnt/run/current-system"
        )
        self.log_end("done.")

        self.log_start("adding note on ‘nixos-enter’ to motd... ")
        cmd = "nixos-enter"
        msg = "Use {} to enter a shell on the target system"
        msglen = len(msg.format(cmd))
        csimsg = msg.format("\033[1;32m{}\033[37m".format(cmd))
        hborder = "-" * (msglen + 2)
        fullmsg = "\033[1;30m{}\033[m\n\n".format(
            "\n".join(
                [
                    "+{}+".format(hborder),
                    "| \033[37;1m{}\033[30m |".format(csimsg),
                    "+{}+".format(hborder),
                ]
            )
        )
        self.run_command("cat >> /etc/motd", stdin_string=fullmsg)
        self.log_end("done.")

    def _bootstrap_rescue(self, install, partitions):
        """
        Bootstrap everything needed in order to get Nix and the partitioner
        usable in the rescue system. The keyword arguments are only for
        partitioning, see reboot_rescue() for description, if not given we
        will only mount based on information provided in self.partitions.
        """
        self.log_start("building Nix bootstrap installer... ")
        expr = os.path.realpath(
            os.path.dirname(__file__) + "/../nix/hetzner-bootstrap.nix"
        )
        bootstrap_out = subprocess.check_output(
            ["nix-build", expr, "--no-out-link"]
        ).rstrip()
        bootstrap = os.path.join(bootstrap_out.decode("utf-8"),
                                 "bin/hetzner-bootstrap")
        self.log_end("done. ({0})".format(bootstrap))

        self.log_start("creating nixbld group in rescue system... ")
        self.run_command(
            "getent group nixbld > /dev/null || "
            "groupadd -g 30000 nixbld"
        )
        self.log_end("done.")

        self.log_start("checking if tmpfs in rescue system is large enough... ")
        dfstat = self.run_command("stat -f -c '%a:%S' /", capture_stdout=True)
        df, bs = dfstat.split(":")
        free_mb = (int(df) * int(bs)) // 1024 // 1024
        if free_mb > 300:
            self.log_end("yes: {0} MB".format(free_mb))
            tarcmd = "tar x -C /"
        else:
            self.log_end("no: {0} MB".format(free_mb))
            # Not enough space: strip everything non-essential while unpacking.
            tarexcludes = [
                "*/include", "*/man", "*/info", "*/locale", "*/locales",
                "*/share/doc", "*/share/aclocal", "*/example", "*/terminfo",
                "*/pkgconfig", "*/nix-support", "*/etc", "*/bash-completion",
                "*.a", "*.la", "*.pc", "*.lisp", "*.pod", "*.html", "*.pyc",
                "*.pyo", "*-kbd-*/share", "*-gcc-*/bin", "*-gcc-*/libexec",
                "*-systemd-*/bin", "*-boehm-gc-*/share",
            ]
            tarcmd = "tar x -C / " + " ".join(
                ["--exclude='{0}'".format(glob) for glob in tarexcludes]
            )

        # The command to retrieve our split TAR archive on the other side.
        recv = 'read -d: tarsize; head -c "$tarsize" | {0}; {0}'.format(tarcmd)

        self.log_start("copying bootstrap files to rescue system... ")
        tarstream = subprocess.Popen([bootstrap], stdout=subprocess.PIPE)
        if not self.has_fast_connection:
            # Compress on the wire for slow links.
            stream = subprocess.Popen(
                ["gzip", "-c"], stdin=tarstream.stdout, stdout=subprocess.PIPE
            )
            self.run_command("gzip -d | ({0})".format(recv), stdin=stream.stdout)
            stream.wait()
        else:
            self.run_command(recv, stdin=tarstream.stdout)
        tarstream.wait()
        self.log_end("done.")

        if install:
            self.log_start("partitioning disks... ")
            try:
                out = self.run_command(
                    "nixpart -p -", capture_stdout=True, stdin_string=partitions
                )
            except SSHCommandFailed as failed_command:
                # Exit code 100 is when the partitioner requires a reboot.
                if failed_command.exitcode == 100:
                    self.log(failed_command.message)
                    self.reboot_rescue(install, partitions)
                    return
                else:
                    raise

            # This is the *only* place to set self.partitions unless we have
            # implemented a way to repartition the system!
            self.partitions = partitions
            self.fs_info = out
        else:
            self.log_start("mounting filesystems... ")
            self.run_command("nixpart -m -", stdin_string=self.partitions)
        self.log_end("done.")

        if not install:
            self.log_start("checking if system in /mnt is NixOS... ")
            res = self.run_command("test -e /mnt/etc/NIXOS", check=False)
            if res == 0:
                self.log_end("yes.")
                self._bootstrap_rescue_for_existing_system()
            else:
                self.log_end("NO! Not mounting special filesystems.")
                return

        self.log_start("bind-mounting special filesystems... ")
        for mountpoint in ("/proc", "/dev", "/dev/shm", "/sys"):
            self.log_continue("{0}...".format(mountpoint))
            cmd = "mkdir -m 0755 -p /mnt{0} && ".format(mountpoint)
            cmd += "mount --bind {0} /mnt{0}".format(mountpoint)
            self.run_command(cmd)
        self.log_end("done.")

    def reboot(self, hard=False):
        if hard:
            self.log_start("sending hard reset to robot... ")
            server = self._get_server_by_ip(self.main_ipv4)
            server.reboot("hard")
            self.log_end("done.")
            self.state = self.STARTING
            self.ssh.reset()
        else:
            MachineState.reboot(self, hard=hard)

    def reboot_rescue(self, install=False, partitions=None, bootstrap=True,
                      hard=False):
        """
        Use the Robot to activate the rescue system and reboot the system.
        By default, only mount partitions and do not partition or wipe
        anything.

        On installation, both 'install' has to be set to True and partitions
        should contain a Kickstart configuration, otherwise it's read from
        self.partitions if available (which it shouldn't if you're not doing
        something nasty).
        """
        # BUG FIX: this used to be `if self.RESCUE:` — RESCUE is a truthy
        # state constant, so the "already in rescue" branch always fired.
        if self.state == self.RESCUE:
            self.log("machine already in rescue mode..")
        else:
            self.log(
                "rebooting machine ‘{0}’ ({1}) into rescue system".format(
                    self.name, self.main_ipv4
                )
            )
        server = self._get_server_by_ip(self.main_ipv4)
        server.rescue.activate()
        rescue_passwd = server.rescue.password
        if hard or (install and self.state not in (self.UP, self.RESCUE)):
            self.log_start("sending hard reset to robot... ")
            server.reboot("hard")
        else:
            self.log_start("sending reboot command... ")
            if self.state == self.RESCUE:
                self.run_command("(sleep 2; reboot) &", check=False)
            else:
                self.run_command("systemctl reboot", check=False)
        self.log_end("done.")
        self._wait_for_rescue(self.main_ipv4)
        self.rescue_passwd = rescue_passwd
        self.state = self.RESCUE
        self.ssh.reset()
        if bootstrap:
            self._bootstrap_rescue(install, partitions)

    def _install_base_system(self):
        """Prepare the freshly-partitioned system under /mnt and activate
        the remote NixOps helper; also generates the SSH key pair and the
        network spec."""
        self.log_start("creating missing directories... ")
        cmds = ["mkdir -m 1777 -p /mnt/tmp /mnt/nix/store"]
        mntdirs = [
            "var", "etc", "bin",
            "nix/var/nix/gcroots",
            "nix/var/nix/temproots",
            "nix/var/nix/manifests",
            "nix/var/nix/userpool",
            "nix/var/nix/profiles",
            "nix/var/nix/db",
            "nix/var/log/nix/drvs",
        ]
        to_create = " ".join(map(lambda d: os.path.join("/mnt", d), mntdirs))
        cmds.append("mkdir -m 0755 -p {0}".format(to_create))
        self.run_command(" && ".join(cmds))
        self.log_end("done.")

        self.log_start("bind-mounting files in /etc... ")
        for etcfile in ("resolv.conf", "passwd", "group"):
            self.log_continue("{0}...".format(etcfile))
            cmd = (
                "if ! test -e /mnt/etc/{0}; then"
                " touch /mnt/etc/{0} && mount --bind /etc/{0} /mnt/etc/{0};"
                " fi"
            ).format(etcfile)
            self.run_command(cmd)
        self.log_end("done.")

        self.run_command("touch /mnt/etc/NIXOS")
        self.run_command("activate-remote")
        self.main_ssh_private_key, self.main_ssh_public_key = create_key_pair(
            key_name="NixOps client key of {0}".format(self.name)
        )
        self._gen_network_spec()

    def _detect_hardware(self):
        """Run nixos-generate-config on the target and store the hardware
        config (comment lines stripped) into self.hw_info."""
        self.log_start("detecting hardware... ")
        cmd = "nixos-generate-config --no-filesystems --show-hardware-config"
        hardware = self.run_command(cmd, capture_stdout=True)
        self.hw_info = "\n".join(
            [
                line
                for line in hardware.splitlines()
                if not line.lstrip().startswith("#")
            ]
        )
        self.log_end("done.")

    def switch_to_configuration(self, method, sync, command=None):
        if self.state == self.RESCUE:
            # We cannot use the mountpoint command here, because it's unable to
            # detect bind mounts on files, so we just go ahead and try to
            # unmount.
            umount = 'if umount "{0}" 2> /dev/null; then rm -f "{0}"; fi'
            cmd = "; ".join(
                [
                    umount.format(os.path.join("/mnt/etc", mnt))
                    for mnt in ("resolv.conf", "passwd", "group")
                ]
            )
            self.run_command(cmd)

            command = "chroot /mnt /nix/var/nix/profiles/system/bin/"
            command += "switch-to-configuration"

        res = MachineState.switch_to_configuration(self, method, sync, command)
        if res not in (0, 100):
            return res
        if self.state == self.RESCUE and self.just_installed:
            self.reboot_sync()
            self.just_installed = False
        return res

    def _get_ethernet_interfaces(self):
        """
        Return a list of all the ethernet interfaces active on the machine.
        """
        # We don't use \(\) here to ensure this works even without GNU sed.
        cmd = "ip addr show | sed -n -e 's/^[0-9]*: *//p' | cut -d: -f1"
        return self.run_command(cmd, capture_stdout=True).splitlines()

    def _get_udev_rule_for(self, interface):
        """
        Get lines suitable for services.udev.extraRules for 'interface',
        and thus essentially map the device name to a hardware address.
        """
        cmd = "ip addr show \"{0}\" | sed -n -e 's|^.*link/ether *||p'"
        cmd += " | cut -d' ' -f1"
        mac_addr = self.run_command(cmd.format(interface),
                                    capture_stdout=True).strip()

        rule = 'ACTION=="add", SUBSYSTEM=="net", ATTR{{address}}=="{0}", '
        rule += 'NAME="{1}"'
        return rule.format(mac_addr, interface)

    def _get_ipv4_addr_and_prefix_for(self, interface):
        """
        Return a tuple of (ipv4_address, prefix_length) for the specified
        interface, or None if the interface has no address.
        """
        cmd = "ip addr show \"{0}\" | sed -n -e 's/^.*inet *//p'"
        cmd += " | cut -d' ' -f1"
        ipv4_addr_prefix = self.run_command(
            cmd.format(interface), capture_stdout=True
        ).strip()
        if "/" not in ipv4_addr_prefix:
            # No IP address set for this interface.
            return None
        else:
            return ipv4_addr_prefix.split("/", 1)

    def _get_default_gw(self):
        """
        Return the default gateway of the currently running machine as a
        (gateway_ip, gateway_device) tuple.
        """
        default_gw_cmd = "ip route list | sed -n -e 's/^default *via *//p'"
        default_gw_output = self.run_command(
            default_gw_cmd, capture_stdout=True
        ).strip()
        default_gw_output_split = default_gw_output.split(" ")
        gw_ip = default_gw_output_split[0]
        # Output looks like "<ip> dev <device> ...", so index 2 is the device.
        gw_dev = default_gw_output_split[2]
        return (gw_ip, gw_dev)

    def _get_nameservers(self):
        """
        Return a list of all nameservers defined on the currently running
        machine.
        """
        cmd = "cat /etc/resolv.conf | sed -n -e 's/^nameserver *//p'"
        return self.run_command(cmd, capture_stdout=True).splitlines()

    def _indent(self, lines, level=1):
        """
        Indent list of lines by the specified level (one level = two spaces).
        """
        return map(lambda line: "  " + line, lines)

    def _calculate_ipv4_subnet(self, ipv4, prefix_len):
        """
        Returns the address of the subnet for the given 'ipv4' and
        'prefix_len'.
        """
        bits = struct.unpack("!L", socket.inet_aton(ipv4))[0]
        mask = 0xFFFFFFFF >> (32 - prefix_len) << (32 - prefix_len)
        return socket.inet_ntoa(struct.pack("!L", bits & mask))

    def _gen_network_spec(self):
        """
        Generate Nix expressions related to networking configuration based on
        the currently running machine (most likely in RESCUE state) and set
        the resulting string to self.net_info.
        """
        udev_rules = []
        iface_attrs = {}

        server = self._get_server_by_ip(self.main_ipv4)

        # Global networking options
        defgw_ip, defgw_dev = self._get_default_gw()
        v6defgw = None

        # Interface-specific networking options
        for iface in self._get_ethernet_interfaces():
            if iface == "lo":
                continue

            result = self._get_ipv4_addr_and_prefix_for(iface)
            if result is None:
                continue

            udev_rules.append(self._get_udev_rule_for(iface))

            ipv4addr, prefix = result
            iface_attrs[iface] = {
                "ipv4": {
                    "addresses": [
                        {"address": ipv4addr, "prefixLength": int(prefix)},
                    ],
                },
            }

            # We can't handle Hetzner-specific networking info in test mode.
            if TEST_MODE:
                continue

            # Extra route for accessing own subnet for this interface
            # (see https://wiki.hetzner.de/index.php/Netzkonfiguration_Debian/en#IPv4),
            # but only if it's not the interface for the default gateway,
            # because that one will already get such a route generated
            # by NixOS's `network-setup.service`. See also:
            # https://github.com/NixOS/nixops/pull/1032#issuecomment-433741624
            if iface != defgw_dev:
                net = self._calculate_ipv4_subnet(ipv4addr, int(prefix))
                # BUG FIX: previously the whole "ipv4" dict was replaced
                # here, which discarded the "addresses" entry built above;
                # only the "routes" key should be added.
                iface_attrs[iface]["ipv4"]["routes"] = [
                    {
                        "address": net,
                        "prefixLength": int(prefix),
                        "via": defgw_ip,
                    }
                ]

            # IPv6 subnets only for eth0
            v6subnets = []
            for subnet in server.subnets:
                if "." in subnet.net_ip:
                    # skip IPv4 addresses
                    continue
                v6subnets.append(
                    {"address": subnet.net_ip, "prefixLength": int(subnet.mask)}
                )
                assert v6defgw is None or v6defgw.get("address") == subnet.gateway
                v6defgw = {
                    "address": subnet.gateway,
                    "interface": defgw_dev,
                }
            iface_attrs[iface]["ipv6"] = {"addresses": v6subnets}

        self.net_info = {
            "services": {
                "udev": {"extraRules": "\n".join(udev_rules) + "\n"},
            },
            "networking": {
                "interfaces": iface_attrs,
                "defaultGateway": {
                    "address": defgw_ip,
                    "interface": defgw_dev,
                },
                "defaultGateway6": v6defgw,
                "nameservers": self._get_nameservers(),
            },
        }

    def get_physical_spec(self):
        # Only emit a spec once everything needed has been gathered during
        # installation; otherwise return an empty spec.
        if all([self.net_info, self.fs_info, self.hw_info,
                self.main_ssh_public_key]):
            return {
                "config": {
                    **self.net_info,
                    **{
                        (
                            "users",
                            "extraUsers",
                            "root",
                            "openssh",
                            "authorizedKeys",
                            "keys",
                        ): [self.main_ssh_public_key]
                    },
                },
                "imports": [nix2py(self.fs_info), nix2py(self.hw_info)],
            }
        else:
            return {}

    def create(self, defn, check, allow_reboot, allow_recreate):
        assert isinstance(defn, HetznerDefinition)

        if self.state not in (self.RESCUE, self.UP) or check:
            self.check()

        self.set_common_state(defn)
        self.main_ipv4 = defn.main_ipv4

        if defn.create_sub_account:
            if not self.robot_admin_user or not self.robot_admin_pass:
                self.log_start(
                    "creating an exclusive robot admin sub-account "
                    "for ‘{0}’... ".format(self.name)
                )
                server = self._get_server_from_main_robot(self.main_ipv4, defn)
                with self.depl._db:
                    (
                        self.robot_admin_user,
                        self.robot_admin_pass,
                    ) = server.admin.create()
                self.log_end("done. ({0})".format(self.robot_admin_user))
        else:
            # If available, assign user and password even if they are already
            # in the DB, so that changes to them are immediately reflected.
            # If not available, we use the ones from the DB.
            (robot_user, robot_pass) = self._get_robot_user_and_pass(
                defn=defn,
                default_user=self.robot_admin_user,
                default_pass=self.robot_admin_pass,
            )
            if (
                robot_user != self.robot_admin_user
                or robot_pass != self.robot_admin_pass
            ):
                with self.depl._db:
                    (self.robot_admin_user, self.robot_admin_pass) = (
                        robot_user,
                        robot_pass,
                    )

        if not self.vm_id:
            self.log("installing machine...")
            self.reboot_rescue(install=True, partitions=defn.partitions)
            self._install_base_system()
            self._detect_hardware()
            server = self._get_server_by_ip(self.main_ipv4)
            vm_id = "nixops-{0}-{1}".format(self.depl.uuid, self.name)
            # Server names at Hetzner are limited to 100 characters.
            server.set_name(vm_id[:100])
            self.vm_id = vm_id
            known_hosts.remove(self.main_ipv4, None)
            self.just_installed = True
            self.state_version = defn.config.nixosRelease

    def start(self):
        """
        Start the server into the normal system (a reboot is done if the
        rescue system is active).
        """
        if self.state == self.UP:
            return
        elif self.state == self.RESCUE:
            self.reboot()
        elif self.state in (self.STOPPED, self.UNREACHABLE):
            self.log_start("server was shut down, sending hard reset... ")
            server = self._get_server_by_ip(self.main_ipv4)
            server.reboot("hard")
            self.log_end("done.")
            self.state = self.STARTING
        self.wait_for_ssh(check=True)
        self.send_keys()

    def _wait_stop(self):
        """Wait for the system to shutdown and set state STOPPED afterwards."""
        self.log_start("waiting for system to shutdown... ")
        dotlog = lambda: self.log_continue(".")  # NOQA
        wait_for_tcp_port(self.main_ipv4, 22, open=False, callback=dotlog)
        self.log_continue("[down]")
        self.state = self.STOPPED

    def stop(self):
        """Stops the server by shutting it down without powering it off."""
        if self.state not in (self.RESCUE, self.UP):
            return
        self.log_start("shutting down system... ")
        self.run_command("systemctl halt", check=False)
        self.log_end("done.")
        self.state = self.STOPPING
        self._wait_stop()

    def get_ssh_name(self):
        assert self.main_ipv4
        return self.main_ipv4

    def get_ssh_password(self):
        # Password authentication is only used for the rescue system.
        if self.state == self.RESCUE:
            return self.rescue_passwd
        else:
            return None

    def _check(self, res):
        if not self.vm_id:
            res.exists = False
            return
        if self.state in (self.STOPPED, self.STOPPING):
            res.is_up = ping_tcp_port(self.main_ipv4, 22)
            if not res.is_up:
                self.state = self.STOPPED
                res.is_reachable = False
                return
        res.exists = True
        avg = self.get_load_avg()
        if avg is None:
            if self.state in (self.UP, self.RESCUE):
                self.state = self.UNREACHABLE
            res.is_reachable = False
            res.is_up = False
        elif self.run_command("test -f /etc/NIXOS", check=False) != 0:
            # SSH works but it's not a NixOS root: we are in the rescue system.
            self.state = self.RESCUE
            self.ssh_pinged = True
            self._ssh_pinged_this_time = True
            res.is_reachable = True
            res.is_up = False
        else:
            res.is_up = True
            MachineState._check(self, res)

    def _destroy(self, server, wipe):
        if self.state != self.RESCUE:
            self.reboot_rescue(bootstrap=False, hard=True)
        if wipe:
            self.log_start("erasing all data on disk... ")
            # Let it run in the background because it will take a long time.
            cmd = "nohup shred /dev/[sh]d? &> /dev/null < /dev/null &"
            self.run_command(cmd)
            self.log_end("done. (backgrounded)")
        self.log_start("unsetting server name... ")
        server.set_name("")
        self.log_end("done.")
        self.log_start("removing admin account... ")
        server.admin.delete()
        # BUG FIX: this was `self.log_start("done.")`, leaving the log line
        # unterminated; it must pair with the log_start above.
        self.log_end("done.")
        # NOTE(review): this literal was redacted in the source we received
        # ("password: "******"..."); restored to the conventional quoted form.
        self.log("machine left in rescue, password: "
                 "‘{0}’".format(self.rescue_passwd))
        return True

    def destroy(self, wipe=False):
        if not self.vm_id:
            return True

        # Create the instance as early as possible so if we don't have the
        # needed credentials, we can avoid to ask for confirmation.
        server = self._get_server_from_main_robot(self.main_ipv4)

        if wipe:
            question = "are you sure you want to completely erase {0}?"
        else:
            question = "are you sure you want to destroy {0}?"
        question_target = "Hetzner machine ‘{0}’".format(self.name)
        if not self.depl.logger.confirm(question.format(question_target)):
            return False

        return self._destroy(server, wipe)
class AzureBLOBContainerState(StorageResourceState):
    """State of an Azure BLOB Container"""

    container_name = attr_property("azure.name", None)
    blob_public_access = attr_property("azure.blobPublicAccess", None)
    storage = attr_property("azure.storage", None)
    signed_identifiers = attr_property("azure.signedIdentifiers", {}, 'json')
    metadata = attr_property("azure.metadata", {}, 'json')

    @classmethod
    def get_type(cls):
        return "azure-blob-container"

    def show_type(self):
        s = super(AzureBLOBContainerState, self).show_type()
        if self.state == self.UP:
            s = "{0}".format(s)
        return s

    @property
    def resource_id(self):
        return self.container_name

    @property
    def full_name(self):
        return "Azure BLOB container '{0}'".format(self.resource_id)

    def get_storage_name(self):
        return self.storage

    def get_key(self):
        """Return the storage access key, preferring an explicitly-set key
        over the one held by the storage resource; raises if neither exists."""
        storage = self.get_resource_state(AzureStorageState, self.storage)
        access_key = self.access_key or (storage and storage.access_key)
        if not access_key:
            raise Exception(
                "Can't obtain the access key needed to manage {0}".format(
                    self.full_name))
        return access_key

    def is_settled(self, resource):
        # BLOB containers have no intermediate provisioning states.
        return True

    def get_resource_allow_exceptions(self):
        return self.bs().get_container_properties(self.resource_id)

    def destroy_resource(self):
        self.bs().delete_container(self.resource_id, fail_not_exist=True)

    defn_properties = ['blob_public_access', 'metadata']

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Create or update the container to match 'defn'.

        The storage account cannot be changed in place; metadata and ACL
        updates are applied incrementally after creation.
        """
        self.no_property_change(defn, 'storage')

        self.container_name = defn.container_name
        self.access_key = defn.access_key
        self.storage = defn.storage

        if check:
            container = self.get_settled_resource()
            # Consistency fix: use an explicit None check like the sibling
            # AzureTableState.create does, instead of truthiness.
            if container is None:
                self.warn_missing_resource()
            elif self.state == self.UP:
                # FIXME: currently there's no way to get acl.blobPublicAccess value
                self.handle_changed_metadata(container)
                self.handle_changed_signed_identifiers(
                    self.bs().get_container_acl(self.container_name))
            else:
                self.warn_not_supposed_to_exist()
                self.confirm_destroy()

        if self.state != self.UP:
            if self.get_settled_resource() is not None:
                raise Exception(
                    "tried creating a container that already exists; "
                    "please run 'deploy --check' to fix this")
            self.log("creating {0} in {1}...".format(self.full_name, defn.storage))
            self.bs().create_container(
                defn.container_name,
                x_ms_meta_name_values=defn.metadata,
                x_ms_blob_public_access=defn.blob_public_access,
                fail_on_exist=True)
            self.state = self.UP
            self.copy_properties(defn)

        if self.metadata != defn.metadata:
            self.log("updating the metadata of {0}...".format(self.full_name))
            self.get_settled_resource_assert_exists()
            self.bs().set_container_metadata(
                self.container_name, x_ms_meta_name_values=defn.metadata)
            self.metadata = defn.metadata

        if (self.signed_identifiers != defn.signed_identifiers or
                self.blob_public_access != defn.blob_public_access):
            self.log("updating the ACL of {0}...".format(self.full_name))
            self.get_settled_resource_assert_exists()
            signed_identifiers = self._dict_to_signed_identifiers(
                defn.signed_identifiers)
            self.bs().set_container_acl(
                self.container_name,
                x_ms_blob_public_access=defn.blob_public_access,
                signed_identifiers=signed_identifiers)
            self.blob_public_access = defn.blob_public_access
            self.signed_identifiers = defn.signed_identifiers

    def create_after(self, resources, defn):
        # The container depends on its resource group and storage account.
        return {
            r for r in resources
            if isinstance(r, AzureResourceGroupState) or
               isinstance(r, AzureStorageState)
        }
class AzureTableState(StorageResourceState):
    """State of an Azure Table"""

    table_name = attr_property("azure.name", None)
    storage = attr_property("azure.storage", None)
    signed_identifiers = attr_property("azure.signedIdentifiers", {}, 'json')

    @classmethod
    def get_type(cls):
        return "azure-table"

    def show_type(self):
        shown = super(AzureTableState, self).show_type()
        return "{0}".format(shown) if self.state == self.UP else shown

    @property
    def resource_id(self):
        return self.table_name

    @property
    def full_name(self):
        return "Azure table '{0}'".format(self.resource_id)

    def get_storage_name(self):
        return self.storage

    def get_key(self):
        # Prefer an explicitly-configured key, then fall back to the key of
        # the owning storage account resource.
        account = self.get_resource_state(AzureStorageState, self.storage)
        key = self.access_key or (account and account.access_key)
        if not key:
            raise Exception(
                "Can't obtain the access key needed to manage {0}".format(
                    self.full_name))
        return key

    def is_settled(self, resource):
        # Tables never linger in an intermediate provisioning state.
        return True

    def get_resource_allow_exceptions(self):
        return self.ts().get_table_acl(self.resource_id)

    def destroy_resource(self):
        self.ts().delete_table(self.resource_id, fail_not_exist=True)

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Create the table if needed and reconcile its ACL with 'defn'."""
        self.no_property_change(defn, 'storage')

        self.table_name = defn.table_name
        self.access_key = defn.access_key
        self.storage = defn.storage

        if check:
            existing = self.get_settled_resource()
            if existing is None:
                self.warn_missing_resource()
            elif self.state == self.UP:
                self.handle_changed_signed_identifiers(
                    self.ts().get_table_acl(self.table_name))
            else:
                self.warn_not_supposed_to_exist()
                self.confirm_destroy()

        if self.state != self.UP:
            if self.get_settled_resource() is not None:
                raise Exception("tried creating a table that already exists; "
                                "please run 'deploy --check' to fix this")
            self.log("creating {0} in {1}...".format(self.full_name, defn.storage))
            self.ts().create_table(defn.table_name, fail_on_exist=True)
            self.state = self.UP

        if self.signed_identifiers != defn.signed_identifiers:
            self.log("updating the ACL of {0}...".format(self.full_name))
            self.get_settled_resource_assert_exists()
            acl = self._dict_to_signed_identifiers(defn.signed_identifiers)
            self.ts().set_table_acl(self.table_name, signed_identifiers=acl)
            self.signed_identifiers = defn.signed_identifiers

    def create_after(self, resources, defn):
        # Depends on both the resource group and the storage account.
        return {r for r in resources
                if isinstance(r, (AzureResourceGroupState, AzureStorageState))}
class GCEForwardingRuleState(ResourceState):
    """State of a GCE Forwarding Rule"""

    forwarding_rule_name = attr_property("gce.name", None)
    target_pool = attr_property("gce.targetPool", None)
    region = attr_property("gce.region", None)
    protocol = attr_property("gce.protocol", None)
    port_range = attr_property("gce.portRange", None)
    ip_address = attr_property("gce.ipAddress", None)
    description = attr_property("gce.description", None)
    public_ipv4 = attr_property("gce.public_ipv4", None)

    @classmethod
    def get_type(cls):
        return "gce-forwarding-rule"

    def __init__(self, depl, name, id):
        ResourceState.__init__(self, depl, name, id)

    def show_type(self):
        s = super(GCEForwardingRuleState, self).show_type()
        if self.state == self.UP:
            s = "{0} [{1}]".format(s, self.region)
        return s

    def prefix_definition(self, attr):
        return {('resources', 'gceForwardingRules'): attr}

    def get_physical_spec(self):
        return {'publicIPv4': self.public_ipv4}

    @property
    def resource_id(self):
        return self.forwarding_rule_name

    nix_name = "gceForwardingRules"

    @property
    def full_name(self):
        return "GCE forwarding rule '{0}'".format(self.forwarding_rule_name)

    def forwarding_rule(self):
        """Fetch the live forwarding rule object from the GCE API."""
        return self.connect().ex_get_forwarding_rule(self.forwarding_rule_name)

    defn_properties = ['target_pool', 'region', 'protocol',
                       'port_range', 'ip_address', 'description']

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Create the forwarding rule; GCE forwarding rules are immutable,
        so every property change is rejected up front."""
        self.no_property_change(defn, 'target_pool')
        self.no_property_change(defn, 'protocol')
        self.no_property_change(defn, 'port_range')
        self.no_property_change(defn, 'ip_address')
        self.no_property_change(defn, 'description')
        self.no_project_change(defn)
        self.no_region_change(defn)

        self.copy_credentials(defn)
        self.forwarding_rule_name = defn.forwarding_rule_name

        if check:
            try:
                fwr = self.forwarding_rule()
                if self.state == self.UP:
                    self.handle_changed_property('public_ipv4', fwr.address,
                                                 property_name='IP address')
                    self.handle_changed_property('region', fwr.region.name,
                                                 can_fix=False)
                    self.handle_changed_property('target_pool',
                                                 fwr.targetpool.name,
                                                 can_fix=False)
                    self.handle_changed_property('protocol', fwr.protocol,
                                                 can_fix=False)
                    self.handle_changed_property('description',
                                                 fwr.extra['description'],
                                                 can_fix=False)
                    # An unset port range means "all ports" on the GCE side.
                    self.warn_if_changed(self.port_range or '1-65535',
                                         fwr.extra['portRange'],
                                         'port range', can_fix=False)
                    if self.ip_address:
                        try:
                            address = self.connect().ex_get_address(self.ip_address)
                            if self.public_ipv4 and self.public_ipv4 != address.address:
                                # Typo fix: "unexpectely" -> "unexpectedly".
                                self.warn("static IP Address {0} assigned to this machine has unexpectedly "
                                          "changed from {1} to {2} most likely due to being redeployed"
                                          "; cannot fix this automatically"
                                          .format(self.ip_address, self.public_ipv4, address.address))
                        except libcloud.common.google.ResourceNotFoundError:
                            self.warn("static IP Address resource {0} used by this forwarding rule has been destroyed; "
                                      "it is likely that the forwarding rule is still holding the address itself ({1}) "
                                      "and this is your last chance to reclaim it before it gets lost"
                                      .format(self.ip_address, self.public_ipv4))
                else:
                    self.warn_not_supposed_to_exist()
                    self.confirm_destroy(fwr, self.full_name)
            except libcloud.common.google.ResourceNotFoundError:
                self.warn_missing_resource()

        if self.state != self.UP:
            self.log("creating {0}...".format(self.full_name))
            try:
                fwr = self.connect().ex_create_forwarding_rule(
                    defn.forwarding_rule_name, defn.target_pool,
                    region=defn.region,
                    protocol=defn.protocol,
                    port_range=defn.port_range,
                    address=defn.ip_address,
                    description=defn.description)
            except libcloud.common.google.ResourceExistsError:
                raise Exception("tried creating a forwarding rule that already exists; "
                                "please run 'deploy --check' to fix this")
            self.state = self.UP
            self.copy_properties(defn)
            self.public_ipv4 = fwr.address
            self.log("got IP: {0}".format(self.public_ipv4))

        # only changing of target pool is supported by GCE, but not libcloud
        # FIXME: implement

    def destroy(self, wipe=False):
        if self.state == self.UP:
            try:
                fwr = self.forwarding_rule()
                return self.confirm_destroy(fwr, self.full_name, abort=False)
            except libcloud.common.google.ResourceNotFoundError:
                self.warn("tried to destroy {0} which didn't exist".format(
                    self.full_name))
        return True

    def create_after(self, resources, defn):
        # Needs the target pool (and any static IP) created first.
        return {r for r in resources
                if isinstance(r, GCETargetPoolState) or
                   isinstance(r, GCEStaticIPState)}
class GSEBucketState(ResourceState):
    """State of a GSE Bucket

    Tracks a Google Storage bucket: identity, CORS/lifecycle/logging/website
    configuration and versioning, and reconciles the real bucket against the
    recorded state via the GSE JSON REST API.
    """

    # Bucket identity and configuration, persisted in the NixOps state file.
    bucket_name = attr_property("gce.name", None)
    cors = attr_property("gce.cors", [], "json")
    lifecycle = attr_property("gce.lifecycle", [], "json")
    log_bucket = attr_property("gce.logBucket", None)
    log_object_prefix = attr_property("gce.logObjectPrefix", None)
    region = attr_property("gce.region", None)
    storage_class = attr_property("gce.storageClass", None)
    versioning_enabled = attr_property("gce.versioningEnabled", None, bool)
    website_main_page_suffix = attr_property("gce.websiteMainPageSuffix", None)
    website_not_found_page = attr_property("gce.websiteNotFoundPage", None)

    @classmethod
    def get_type(cls):
        """Return the resource type name used in Nix expressions."""
        return "gse-bucket"

    def __init__(self, depl, name, id):
        ResourceState.__init__(self, depl, name, id)

    def show_type(self):
        """Return a human-readable description of this resource's type."""
        s = super(GSEBucketState, self).show_type()
        if self.state == self.UP:
            s = "{0}".format(s)
        return s

    @property
    def resource_id(self):
        return self.bucket_name

    nix_name = "gseBuckets"

    @property
    def full_name(self):
        return "GSE bucket '{0}'".format(self.bucket_name)

    def connect(self):
        # Lazily create and cache the Google Storage connection.
        # NOTE(review): the trailing True argument presumably selects the
        # JSON API flavour of GSEConnection -- confirm against its signature.
        if not self._conn:
            self._conn = GSEConnection(self.service_account,
                                       self.access_key_path, True)
        return self._conn

    defn_properties = [
        "cors",
        "lifecycle",
        "log_bucket",
        "log_object_prefix",
        "region",
        "storage_class",
        "versioning_enabled",
        "website_main_page_suffix",
        "website_not_found_page",
    ]

    def bucket_resource(self, defn):
        """Build the JSON bucket resource body for the REST API from *defn*."""
        return {
            "name": defn.bucket_name,
            "cors": [{
                "origin": c["origins"],
                "method": c["methods"],
                "responseHeader": c["response_headers"],
                "maxAgeSeconds": c["max_age_seconds"],
            } for c in defn.cors],
            "lifecycle": {
                "rule": [{
                    "action": {
                        "type": r["action"]
                    },
                    "condition": {
                        "age": r["age"],
                        "isLive": r["is_live"],
                        "createdBefore": r["created_before"],
                        "numNewerVersions": r["number_of_newer_versions"],
                    },
                } for r in defn.lifecycle]
            },
            "location": defn.region,
            # Only send a logging config when a log bucket was requested.
            "logging": {
                "logBucket": defn.log_bucket,
                "logObjectPrefix": defn.log_object_prefix,
            } if defn.log_bucket is not None else {},
            "storageClass": defn.storage_class,
            "versioning": {
                "enabled": defn.versioning_enabled
            },
            "website": {
                "mainPageSuffix": defn.website_main_page_suffix,
                "notFoundPage": defn.website_not_found_page,
            },
        }

    def bucket(self):
        """GET the full bucket resource; raises ResourceNotFoundError if absent."""
        return (self.connect().request("/{0}?projection=full".format(
            self.bucket_name), method="GET").object)

    def delete_bucket(self):
        """Issue a DELETE for the bucket."""
        return self.connect().request("/{0}".format(self.bucket_name),
                                      method="DELETE")

    def create_bucket(self, defn):
        """POST a new bucket into the configured project."""
        return self.connect().request(
            "?project={0}".format(self.project),
            method="POST",
            data=self.bucket_resource(defn),
        )

    def update_bucket(self, defn):
        """PATCH the bucket with the full resource body built from *defn*."""
        return self.connect().request(
            "/{0}".format(self.bucket_name),
            method="PATCH",
            data=self.bucket_resource(defn),
        )

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Create or update the bucket to match *defn*.

        Storage class, project and region are immutable while deployed.
        With check=True, first reconcile the real bucket against the state.
        """
        self.no_property_change(defn, "storage_class")
        self.no_project_change(defn)
        self.no_region_change(defn)

        self.copy_credentials(defn)
        self.bucket_name = defn.bucket_name

        if check:
            try:
                b = self.bucket()
                if self.state == self.UP:
                    # Location and storage class cannot be fixed in place.
                    self.handle_changed_property("region", b["location"],
                                                 can_fix=False)
                    self.handle_changed_property("storage_class",
                                                 b["storageClass"],
                                                 can_fix=False)
                    self.handle_changed_property(
                        "log_bucket",
                        b.get("logging", {}).get("logBucket", None))
                    self.handle_changed_property(
                        "log_object_prefix",
                        b.get("logging", {}).get("logObjectPrefix", None),
                    )
                    self.handle_changed_property("versioning_enabled",
                                                 b["versioning"]["enabled"])
                    self.handle_changed_property(
                        "website_main_page_suffix",
                        b.get("website", {}).get("mainPageSuffix", None),
                    )
                    self.handle_changed_property(
                        "website_not_found_page",
                        b.get("website", {}).get("notFoundPage", None),
                    )
                    # Normalize the API's CORS list into the definition's
                    # shape so it compares equal to self.cors.
                    # NOTE(review): sorted() over a list of dicts raises
                    # TypeError on Python 3 when more than one entry is
                    # present; the matching defn-side sort lives outside this
                    # file, so both sides need a key= fix together -- verify.
                    actual_cors = sorted([{
                        "origins": sorted(c.get("origin", [])),
                        "methods": sorted(c.get("method", [])),
                        "response_headers": sorted(c.get("responseHeader", [])),
                        "max_age_seconds": int(c.get("maxAgeSeconds")),
                    } for c in b.get("cors", {})])
                    self.handle_changed_property("cors", actual_cors,
                                                 property_name="CORS config")
                    # Same normalization (and same NOTE) for lifecycle rules.
                    actual_lifecycle = sorted([{
                        "action": r.get("action", {}).get("type", None),
                        "age": r.get("condition", {}).get("age", None),
                        "is_live": r.get("condition", {}).get("isLive", None),
                        "created_before": r.get("condition",
                                                {}).get("createdBefore", None),
                        "number_of_newer_versions": r.get("condition", {}).get(
                            "numNewerVersions", None),
                    } for r in b.get("lifecycle", {}).get("rule", [])])
                    self.handle_changed_property(
                        "lifecycle",
                        actual_lifecycle,
                        property_name="lifecycle config")
                else:
                    # Bucket exists but the state says it shouldn't: buckets
                    # hold data, so require explicit confirmation to delete.
                    self.warn_not_supposed_to_exist(valuable_resource=True,
                                                    valuable_data=True)
                    if self.depl.logger.confirm(
                            "are you sure you want to destroy the existing {0}?"
                            .format(self.full_name)):
                        self.log("destroying...")
                        self.delete_bucket()
                    else:
                        raise Exception("can't proceed further")

            except libcloud.common.google.ResourceNotFoundError:
                self.warn_missing_resource()

        if self.state != self.UP:
            self.log("creating {0}...".format(self.full_name))
            try:
                bucket = self.create_bucket(defn)
            except libcloud.common.google.GoogleBaseError as e:
                # The API reports a name clash via this specific message.
                if (e.value.get(
                        "message", None
                ) == "You already own this bucket. Please select another name."
                    ):
                    raise Exception(
                        "tried creating a GSE bucket that already exists; "
                        "please run 'deploy --check' to fix this")
                else:
                    raise
            self.state = self.UP
            self.copy_properties(defn)

        if self.properties_changed(defn):
            self.log("updating {0}...".format(self.full_name))
            self.update_bucket(defn)
            self.copy_properties(defn)

    def destroy(self, wipe=False):
        """Destroy the bucket after confirmation; tolerate it being gone."""
        if self.state == self.UP:
            try:
                # Probe existence first; raises ResourceNotFoundError if gone.
                bucket = self.bucket()
                if not self.depl.logger.confirm(
                        "are you sure you want to destroy {0}?".format(
                            self.full_name)):
                    return False
                self.log("destroying {0}...".format(self.full_name))
                self.delete_bucket()
            except libcloud.common.google.ResourceNotFoundError:
                self.warn("tried to destroy {0} which didn't exist".format(
                    self.full_name))
        return True
class ResourceState(nixops.resources.ResourceState):
    """Common base state for GCE-backed resources.

    Provides the shared credential attributes plus the helpers every concrete
    GCE resource state uses: a cached libcloud driver connection, credential
    resolution from definition or environment, immutability guards, and
    user-facing warnings for drift between real and recorded state.
    """

    project = attr_property("gce.project", None)
    service_account = attr_property("gce.serviceAccount", None)
    access_key_path = attr_property("gce.accessKey", None)

    def __init__(self, depl, name, id):
        nixops.resources.ResourceState.__init__(self, depl, name, id)
        self._conn = None

    def connect(self):
        """Return the cached libcloud GCE driver, creating it on first use."""
        if not self._conn:
            driver_cls = get_driver(Provider.GCE)
            self._conn = driver_cls(self.service_account,
                                    self.access_key_path,
                                    project=self.project)
        return self._conn

    @property
    def credentials_prefix(self):
        """Nix option prefix used in credential error messages."""
        return "resources.{0}.$NAME".format(self.nix_name)

    def defn_project(self, defn):
        """Resolve the GCE project from the definition or $GCE_PROJECT."""
        chosen = defn.project or os.environ.get("GCE_PROJECT")
        if chosen:
            return chosen
        raise Exception("please set '{0}.project' or $GCE_PROJECT".format(
            self.credentials_prefix))

    def defn_service_account(self, defn):
        """Resolve the service account from the definition or $GCE_SERVICE_ACCOUNT."""
        chosen = defn.service_account or os.environ.get("GCE_SERVICE_ACCOUNT")
        if chosen:
            return chosen
        raise Exception(
            "please set '{0}.serviceAccount' or $GCE_SERVICE_ACCOUNT".format(
                self.credentials_prefix))

    def defn_access_key_path(self, defn):
        """Resolve the access key path from the definition or $ACCESS_KEY_PATH."""
        chosen = defn.access_key_path or os.environ.get("ACCESS_KEY_PATH")
        if chosen:
            return chosen
        raise Exception(
            "please set '{0}.accessKey' or $ACCESS_KEY_PATH".format(
                self.credentials_prefix))

    def copy_credentials(self, defn):
        """Copy the resolved credentials from *defn* into the state."""
        self.project = self.defn_project(defn)
        self.service_account = self.defn_service_account(defn)
        self.access_key_path = self.defn_access_key_path(defn)

    def is_deployed(self):
        return self.state == self.UP

    def no_change(self, condition, property_name):
        """Raise when *condition* holds for an already-deployed resource."""
        if not self.is_deployed():
            return
        if condition:
            raise Exception("cannot change the {0} of a deployed {1}".format(
                property_name, self.full_name))

    def no_property_change(self, defn, name):
        """Forbid changing attribute *name* once deployed."""
        differs = getattr(self, name) != getattr(defn, name)
        self.no_change(differs, name.replace("_", " "))

    def no_project_change(self, defn):
        """Forbid changing the project once deployed."""
        self.no_change(self.project != self.defn_project(defn), "project")

    def no_region_change(self, defn):
        """Forbid changing the region once deployed (subclasses define .region)."""
        self.no_change(self.region != defn.region, "region")

    def warn_missing_resource(self):
        """Warn and flag the resource MISSING if the state says it should exist."""
        if self.state != self.UP:
            return
        self.warn("{0} is supposed to exist, but is missing; recreating...".
                  format(self.full_name))
        self.state = self.MISSING

    def confirm_destroy(self, resource, res_name, abort=True):
        """Ask before destroying *resource*; abort or return False on refusal."""
        question = "are you sure you want to destroy {0}?".format(res_name)
        if self.depl.logger.confirm(question):
            self.log("destroying...")
            resource.destroy()
            return True
        if abort:
            raise Exception("can't proceed further")
        return False

    def warn_if_changed(self, expected_state, actual_state, name,
                        resource_name=None, can_fix=True):
        """Warn if the real value differs from the expected; return the real value."""
        if expected_state != actual_state:
            fix_note = "" if can_fix else "; cannot fix this automatically"
            self.warn(
                "{0} {1} has changed to '{2}'; expected it to be '{3}'{4}".
                format(resource_name or self.full_name, name, actual_state,
                       expected_state, fix_note))
        return actual_state

    # use warn_if_changed for a very typical use case of dealing
    # with changed properties which are stored in attributes
    # with user-friendly names
    def handle_changed_property(self, name, actual_state,
                                property_name=None, can_fix=True):
        """Warn about a drifted attribute and, when fixable, adopt the real value."""
        shown_name = property_name or name.replace("_", " ")
        self.warn_if_changed(getattr(self, name), actual_state, shown_name,
                             can_fix=can_fix)
        if can_fix:
            setattr(self, name, actual_state)

    def warn_not_supposed_to_exist(self, resource_name=None,
                                   valuable_data=False,
                                   valuable_resource=False):
        """Warn that a resource exists which the state says should not."""
        at_risk = []
        if valuable_data:
            at_risk.append("data")
        if valuable_resource:
            at_risk.append("resource")
        valuables = " or ".join(at_risk)
        if valuables:
            valuable_msg = (
                "; however, this also could be a resource name collision, "
                "and valuable {0} could be lost; before proceeding, "
                "please ensure that this isn't so".format(valuables))
        else:
            valuable_msg = ""
        self.warn(
            "{0} exists, but isn't supposed to; probably, this is the result "
            "of a botched creation attempt and can be fixed by deletion{1}".
            format(resource_name or self.full_name, valuable_msg))

    # API to handle copying properties from definition to state
    # after resource is created or updated and checking that
    # the state is out of sync with the definition
    def copy_properties(self, defn):
        """Copy every attribute listed in defn_properties from *defn*."""
        for prop in self.defn_properties:
            setattr(self, prop, getattr(defn, prop))

    def properties_changed(self, defn):
        """Return True if any tracked property differs between state and *defn*."""
        for prop in self.defn_properties:
            if getattr(self, prop) != getattr(defn, prop):
                return True
        return False
class HcloudVolumeState(
        ResourceState[HcloudVolumeDefinition],
        EntityResource[HcloudVolumeDefinition, BoundVolume],
):
    """State of a Hetzner Cloud volume.

    Creation/check/destroy are delegated to the shared EntityResource helpers
    (entity_create/entity_check/entity_destroy); this class supplies the
    volume-specific API client and reconciliation logic.
    """

    definition_type = HcloudVolumeDefinition

    state = attr_property("state", ResourceState.MISSING, int)
    # Hetzner Cloud API token used to build the client.
    token = attr_property("hcloud.token", None, str)
    # Server-side numeric volume id; None until the volume is created.
    hcloud_id = attr_property("hcloud.id", None, int)
    hcloud_name = attr_property("hcloud.name", None, str)
    # Volume size; presumably in GB as per the hcloud API -- TODO confirm.
    size = attr_property("hcloud.size", None, int)
    location = attr_property("hcloud.location", None, str)

    _cached_client: Optional[hcloud.Client] = None

    @classmethod
    def get_type(cls) -> str:
        return "hcloud-volume"

    def prefix_definition(self, attr):
        return {("resources", "hcloudVolumes"): attr}

    @property
    def resource_id(self) -> Optional[int]:
        # FIX: hcloud_id is an int attr_property that is None until the
        # volume exists, so the previous "-> str" annotation was wrong.
        return self.hcloud_id

    def create(
        self,
        defn: HcloudVolumeDefinition,
        check: bool,
        allow_reboot: bool,
        allow_recreate: bool,
    ):
        """Create or reconcile the volume via the EntityResource machinery."""
        return entity_create(self, defn, check)

    def destroy(self, wipe=False) -> bool:
        return entity_destroy(self)

    def _check(self) -> bool:
        return entity_check(self)

    def entity_client(self) -> VolumesClient:
        """Return (and lazily cache) the hcloud volumes API client."""
        if self._cached_client is None:
            self._cached_client = hcloud.Client(self.token)
        return self._cached_client.volumes

    def do_create_new(self, defn: HcloudVolumeDefinition) -> BoundVolume:
        """Create a new volume and wait for the creation action to finish."""
        self.size = defn.config.size
        self.location = defn.config.location
        resp = self.entity_client().create(
            name=self.hcloud_name,
            size=self.size,
            location=Location(name=self.location))
        resp.action.wait_until_finished()
        return resp.volume

    def update(self, defn: HcloudVolumeDefinition, model: BoundVolume) -> None:
        """Reconcile an existing volume with the definition.

        Location changes and shrinking are unsupported and only logged;
        growing the volume requires interactive confirmation.
        """
        if defn.config.location != model.location.name:
            self.logger.error(
                "Cannot update the location of a Hetzner Cloud volume")
        if defn.config.size < model.size:
            self.logger.error("Cannot shrink volume")
        elif defn.config.size > model.size:
            if not self.depl.logger.confirm(f"Resize volume {self.name!r}?"):
                return
            model.resize(defn.config.size).wait_until_finished()
            self.size = defn.config.size

    def should_update(self, defn: HcloudVolumeDefinition) -> bool:
        """Return True if the recorded state diverges from the definition."""
        return (self.location != defn.config.location
                or self.size != defn.config.size)

    def update_unchecked(self, defn: HcloudVolumeDefinition) -> None:
        """Like update(), but fetches the live volume by name first.

        Used when only the recorded state (not a live model) is at hand.
        """
        if defn.config.location != self.location:
            self.logger.error(
                "Cannot update the location of a Hetzner Cloud volume")
        if defn.config.size < self.size:
            self.logger.error("Cannot shrink volume")
        elif defn.config.size > self.size:
            if not self.depl.logger.confirm(f"Resize volume {self.name!r}?"):
                return
            model = get_by_name(self)
            if model is None:
                self.logger.error("Volume missing")
                return
            model.resize(defn.config.size).wait_until_finished()
            self.size = defn.config.size

    def check_model(self, model: BoundVolume) -> None:
        """Sync the recorded location and size from the live volume."""
        self.location = model.location.name
        self.size = model.size
class AzureDNSRecordSetState(ResourceState):
    """State of an Azure DNS Record Set

    Manages a record set inside an existing DNS zone through the raw Azure
    management REST API (no SDK wrapper for this resource type).
    """

    dns_record_set_name = attr_property("azure.name", None)
    # Full resource-id path of the parent DNS zone.
    dns_zone = attr_property("azure.dnsZone", None)
    record_type = attr_property("azure.recordType", None)
    tags = attr_property("azure.tags", {}, 'json')
    properties = attr_property("azure.properties", {}, 'json')

    @classmethod
    def get_type(cls):
        """Return the resource type name used in Nix expressions."""
        return "azure-dns-record-set"

    @property
    def resource_id(self):
        return self.dns_record_set_name

    @property
    def full_name(self):
        return "Azure DNS record set '{0}'".format(self.resource_id)

    def is_settled(self, resource):
        # Record sets have no provisioning state to wait on.
        return True

    def get_resource_url(self):
        """Build the management REST URL for this record set.

        self.dns_zone already contains the zone's full resource-id path,
        so it is appended directly after the API host.
        """
        return ("https://management.azure.com"
                "{0}/{1}/{2}?api-version=2015-05-04-preview"
                .format(quote(self.dns_zone),
                        quote(self.record_type),
                        quote(self.dns_record_set_name)))

    def mk_request(self, method):
        """Build a bare JSON REST request for this record set's URL."""
        http_request = Request()
        http_request.url = self.get_resource_url()
        http_request.method = method
        http_request.headers['Content-Type'] = 'application/json'
        return http_request

    def get_resource(self):
        """GET the record set; return parsed JSON or None if not found."""
        response = self.nrpc().send_request(self.mk_request('GET'))
        if response.status_code == 200:
            return json.loads(response.content.decode())
        else:
            return None

    def destroy_resource(self):
        """DELETE the record set; raise on any non-200 response.

        NOTE(review): the ExpressRoute circuit class in this file accepts
        200/202/204 for DELETE; confirm whether this endpoint can also
        return 202/204, in which case this check is too strict.
        """
        response = self.nrpc().send_request(self.mk_request('DELETE'))
        if response.status_code != 200:
            raise AzureHttpError(response.content, response.status_code)

    defn_properties = [ 'tags', 'properties' ]

    def _create_or_update(self, defn):
        """PUT the record set body built from *defn* and mark the state UP."""
        info = {
            "location": "global",
            "tags": defn.tags,
            "properties": defn.properties
        }
        http_request = self.mk_request('PUT')
        http_request.data = json.dumps(info)
        http_request.headers['Content-Length'] = len(http_request.data)
        response = self.nrpc().send_request(http_request)

        if response.status_code not in [200, 201]:
            raise AzureHttpError(response.content, response.status_code)

        self.state = self.UP
        self.copy_properties(defn)

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Create or reconcile the record set according to *defn*.

        Zone and record type are immutable while deployed.
        """
        self.no_subscription_id_change(defn)
        self.no_property_change(defn, 'dns_zone')
        self.no_property_change(defn, 'record_type')

        self.copy_mgmt_credentials(defn)
        self.dns_record_set_name = defn.dns_record_set_name
        self.dns_zone = defn.dns_zone
        self.record_type = defn.record_type

        if check:
            rset = self.get_settled_resource()
            if not rset:
                self.warn_missing_resource()
            elif self.state == self.UP:
                # Adopt any drifted tags/properties from the real resource.
                self.handle_changed_property('tags', rset['tags'])
                self.handle_changed_property('properties', rset['properties'])
            else:
                self.warn_not_supposed_to_exist()
                self.confirm_destroy()

        if self.state != self.UP:
            if self.get_settled_resource():
                raise Exception("tried creating a DNS record set that already exists; "
                                "please run 'deploy --check' to fix this")
            self.log("creating {0}...".format(self.full_name))
            self._create_or_update(defn)

        if self.properties_changed(defn):
            self.log("updating properties of {0}...".format(self.full_name))
            self.get_settled_resource_assert_exists()
            self._create_or_update(defn)

    def create_after(self, resources, defn):
        # Deploy after the resource group and parent DNS zone exist.
        from nixops.resources.azure_resource_group import AzureResourceGroupState
        from nixops.resources.azure_dns_zone import AzureDNSZoneState
        return {r for r in resources
                  if isinstance(r, AzureResourceGroupState) or
                     isinstance(r, AzureDNSZoneState)
        }
class AzureGatewayConnectionState(ResourceState):
    """State of an Azure Virtual Network Gateway Connection

    Reads via the Azure SDK management client; writes via the raw
    management REST API (PUT), since the SDK lacks full write support.
    """

    connection_name = attr_property("azure.name", None)
    resource_group = attr_property("azure.resourceGroup", None)
    location = attr_property("azure.location", None)
    tags = attr_property("azure.tags", {}, 'json')
    # Gateway/connection endpoints, stored as Azure resource ids.
    virtual_network_gateway1 = attr_property("azure.virtualNetworkGateway1", None)
    virtual_network_gateway2 = attr_property("azure.virtualNetworkGateway2", None)
    local_network_gateway2 = attr_property("azure.localNetworkGateway2", None)
    connection_type = attr_property("azure.connectionType", None)
    routing_weight = attr_property("azure.routingWeight", None, int)
    shared_key = attr_property("azure.sharedKey", None)

    @classmethod
    def get_type(cls):
        """Return the resource type name used in Nix expressions."""
        return "azure-gateway-connection"

    def show_type(self):
        """Return a human-readable description including the location."""
        s = super(AzureGatewayConnectionState, self).show_type()
        if self.state == self.UP:
            s = "{0} [{1}]".format(s, self.location)
        return s

    @property
    def resource_id(self):
        return self.connection_name

    @property
    def full_name(self):
        return "Azure virtual network gateway connection '{0}'".format(self.resource_id)

    def get_resource(self):
        """Fetch the connection via the SDK; return None if it doesn't exist."""
        try:
            return self.nrpc().virtual_network_gateway_connections.get(
                self.resource_group,
                self.resource_id).virtual_network_gateway_connection
        except azure.common.AzureMissingResourceHttpError:
            return None

    def destroy_resource(self):
        """Delete the connection via the SDK."""
        self.nrpc().virtual_network_gateway_connections.delete(
            self.resource_group, self.resource_id)

    defn_properties = [ 'location', 'tags',
                        'virtual_network_gateway1', 'virtual_network_gateway2',
                        'local_network_gateway2', 'connection_type',
                        'routing_weight', 'shared_key' ]

    def get_resource_url(self):
        """Build the management REST URL for this connection."""
        return ("https://management.azure.com/subscriptions/{0}"
                "/resourceGroups/{1}/providers/Microsoft.Network"
                "/connections/{2}?api-version=2015-05-01-preview"
                .format(quote(self.subscription_id),
                        quote(self.resource_group),
                        quote(self.connection_name)))

    def mk_request(self, method):
        """Build a bare JSON REST request for this connection's URL."""
        http_request = Request()
        http_request.url = self.get_resource_url()
        http_request.method = method
        http_request.headers['Content-Type'] = 'application/json'
        return http_request

    def _create_or_update(self, defn):
        """PUT the full connection body built from *defn*; wait until settled."""
        info = {
            'location': defn.location,
            'tags': defn.tags,
            'properties': {
                'connectionType': defn.connection_type,
                'routingWeight': defn.routing_weight,
                'sharedKey': defn.shared_key,
            }
        }
        # Gateways are optional depending on the connection type; only
        # include the ones that are set.
        if defn.virtual_network_gateway1:
            info['properties']['virtualNetworkGateway1'] = {
                'id': defn.virtual_network_gateway1
            }
        if defn.virtual_network_gateway2:
            info['properties']['virtualNetworkGateway2'] = {
                'id': defn.virtual_network_gateway2
            }
        if defn.local_network_gateway2:
            info['properties']['localNetworkGateway2'] = {
                'id': defn.local_network_gateway2
            }

        http_request = self.mk_request('PUT')
        http_request.data = json.dumps(info)
        http_request.headers['Content-Length'] = len(http_request.data)
        response = self.nrpc().send_request(http_request)

        if response.status_code not in [200, 201]:
            raise AzureHttpError(response.content, response.status_code)

        self.state = self.UP
        self.copy_properties(defn)
        # Block until the resource reaches a settled provisioning state.
        self.get_settled_resource()

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Create or reconcile the connection according to *defn*.

        Resource group, connection type and all gateway endpoints are
        immutable while deployed.
        """
        self.no_subscription_id_change(defn)
        self.no_location_change(defn)
        self.no_property_change(defn, 'resource_group')
        self.no_property_change(defn, 'connection_type')
        self.no_property_change(defn, 'virtual_network_gateway1')
        self.no_property_change(defn, 'virtual_network_gateway2')
        self.no_property_change(defn, 'local_network_gateway2')

        self.copy_mgmt_credentials(defn)
        self.connection_name = defn.connection_name
        self.resource_group = defn.resource_group

        if check:
            connection = self.get_settled_resource()
            if not connection:
                self.warn_missing_resource()
            elif self.state == self.UP:
                self.warn_if_failed(connection)
                self.handle_changed_property('location',
                                             normalize_location(connection.location),
                                             can_fix = False)
                self.handle_changed_property('tags', connection.tags)
                self.handle_changed_property('connection_type',
                                             connection.connection_type,
                                             can_fix = False)
                self.handle_changed_property('routing_weight',
                                             connection.routing_weight)
                # check key only if the user wants to manage it
                if defn.shared_key:
                    self.handle_changed_property('shared_key',
                                                 connection.shared_key)
                self.handle_changed_property('virtual_network_gateway1',
                                             connection.virtual_network_gateway1
                                             and connection.virtual_network_gateway1.id,
                                             can_fix = False)
                self.handle_changed_property('virtual_network_gateway2',
                                             connection.virtual_network_gateway2
                                             and connection.virtual_network_gateway2.id,
                                             can_fix = False)
                self.handle_changed_property('local_network_gateway2',
                                             connection.local_network_gateway2
                                             and connection.local_network_gateway2.id,
                                             can_fix = False)
            else:
                self.warn_not_supposed_to_exist()
                self.confirm_destroy()

        if self.state != self.UP:
            if self.get_settled_resource():
                raise Exception("tried creating a virtual network gateway connection that already exists; "
                                "please run 'deploy --check' to fix this")
            self.log("creating {0}...".format(self.full_name))
            self._create_or_update(defn)

        if self.properties_changed(defn):
            self.log("updating properties of {0}...".format(self.full_name))
            self.get_settled_resource_assert_exists()
            self._create_or_update(defn)

    def create_after(self, resources, defn):
        # Deploy after the resource group and both kinds of gateways exist.
        from nixops.resources.azure_resource_group import AzureResourceGroupState
        from nixops.resources.azure_local_network_gateway import AzureLocalNetworkGatewayState
        from nixops.resources.azure_virtual_network_gateway import AzureVirtualNetworkGatewayState
        return {r for r in resources
                  if isinstance(r, AzureResourceGroupState) or
                     isinstance(r, AzureLocalNetworkGatewayState) or
                     isinstance(r, AzureVirtualNetworkGatewayState)
        }
class GCEImageState(ResourceState): """State of a GCE Image""" image_name = attr_property("gce.name", None) source_uri = attr_property("gce.sourceUri", None) description = attr_property("gce.description", None) @classmethod def get_type(cls): return "gce-image" def __init__(self, depl, name, id): ResourceState.__init__(self, depl, name, id) def show_type(self): return super(GCEImageState, self).show_type() @property def resource_id(self): return self.image_name nix_name = "gceImages" @property def full_name(self): return "GCE image '{0}'".format(self.image_name) def image(self): img = self.connect().ex_get_image(self.image_name) if img: img.destroy = img.delete return img defn_properties = [ 'description', 'source_uri' ] def create(self, defn, check, allow_reboot, allow_recreate): if defn.name != "bootstrap": self.no_property_change(defn, 'source_uri') self.no_property_change(defn, 'description') self.no_project_change(defn) self.copy_credentials(defn) self.image_name = defn.image_name if check: image = self.image() if image: if self.state == self.UP: self.handle_changed_property('description', image.extra['description'], can_fix = False) else: self.warn_not_supposed_to_exist(valuable_data = True) self.confirm_destroy(image, self.full_name) else: self.warn_missing_resource() if self.state != self.UP: self.log("creating {0}...".format(self.full_name)) try: image = self.connect().ex_copy_image(defn.image_name, defn.source_uri, description = defn.description) except libcloud.common.google.ResourceExistsError: raise Exception("tried creating an image that already exists; " "please run 'deploy --check' to fix this") self.state = self.UP self.copy_properties(defn) def destroy(self, wipe=False): if self.state == self.UP: image = self.image() if image: return self.confirm_destroy(image, self.full_name, abort = False) else: self.warn("tried to destroy {0} which didn't exist".format(self.full_name)) return True
class HcloudSshKeyState( ResourceState[HcloudSshKeyDefinition], EntityResource[HcloudSshKeyDefinition, BoundSSHKey], ): definition_type = HcloudSshKeyDefinition state = attr_property("state", ResourceState.MISSING, int) token = attr_property("hcloud.token", None, str) hcloud_id = attr_property("hcloud.id", None, int) hcloud_name = attr_property("hcloud.name", None, str) public_key = attr_property("hcloud.publicKey", None, str) _cached_client: Optional[hcloud.Client] = None @classmethod def get_type(cls) -> str: return "hcloud-sshkey" def prefix_definition(self, attr): return {("resources", "hcloudSshKeys"): attr} @property def resource_id(self) -> str: return self.hcloud_id def create( self, defn: HcloudSshKeyDefinition, check: bool, allow_reboot: bool, allow_recreate: bool, ): return entity_create(self, defn, check) def destroy(self, wipe=False) -> bool: return entity_destroy(self) def _check(self) -> bool: return entity_check(self) def entity_client(self) -> SSHKeysClient: if self._cached_client is None: self._cached_client = hcloud.Client(self.token) return self._cached_client.ssh_keys def do_create_new(self, defn: HcloudSshKeyDefinition) -> BoundSSHKey: self.public_key = defn.config.publicKey resp = self.entity_client().create(name=self.hcloud_name, public_key=self.public_key) return resp def update(self, defn: HcloudSshKeyDefinition, model: BoundSSHKey) -> None: self.check_model(model) def should_update(self, defn: HcloudSshKeyDefinition) -> bool: return self.public_key != defn.config.publicKey def update_unchecked(self, defn: HcloudSshKeyDefinition) -> None: if self.public_key != defn.config.publicKey: self.logger.error( "Cannot update the public key of a Hetzner Cloud SSH key") def check_model(self, model: BoundSSHKey) -> None: self.public_key = model.public_key
class AzureExpressRouteCircuitState(ResourceState):
    """State of an Azure ExpressRoute Circuit

    Managed entirely through the raw Azure management REST API.
    """

    circuit_name = attr_property("azure.name", None)
    resource_group = attr_property("azure.resourceGroup", None)
    location = attr_property("azure.location", None)
    tags = attr_property("azure.tags", {}, 'json')
    tier = attr_property("azure.tier", None)
    family = attr_property("azure.family", None)
    service_provider_name = attr_property("azure.serviceProviderName", None)
    peering_location = attr_property("azure.peeringLocation", None)
    bandwidth = attr_property("azure.bandwidth", None, int)
    # Mapping of peering name -> user-specified peering properties.
    peerings = attr_property("azure.peerings", {}, 'json')

    @classmethod
    def get_type(cls):
        """Return the resource type name used in Nix expressions."""
        return "azure-express-route-circuit"

    @property
    def resource_id(self):
        return self.circuit_name

    @property
    def full_name(self):
        return "Azure ExpressRoute circuit '{0}'".format(self.resource_id)

    def is_settled(self, resource):
        # A circuit is settled once provisioning has succeeded or failed
        # (or the resource is gone entirely).
        return resource is None or (resource.get('properties', {})
                                    .get('provisioningState', None)
                                    in ['Succeeded', 'Failed'])

    def is_failed(self, resource):
        return resource.get('properties', {}).get('provisioningState',
                                                  None) == 'Failed'

    def get_resource_url(self):
        """Build the management REST URL for this circuit."""
        return ("https://management.azure.com/subscriptions/{0}"
                "/resourceGroups/{1}/providers/Microsoft.Network"
                "/expressRouteCircuits/{2}?api-version=2015-06-15"
                .format(quote(self.subscription_id),
                        quote(self.resource_group),
                        quote(self.circuit_name)))

    def mk_request(self, method):
        """Build a bare JSON REST request for this circuit's URL."""
        http_request = Request()
        http_request.url = self.get_resource_url()
        http_request.method = method
        http_request.headers['Content-Type'] = 'application/json'
        return http_request

    def get_resource(self):
        """GET the circuit; return parsed JSON or None if not found."""
        response = self.nrpc().send_request(self.mk_request('GET'))
        if response.status_code == 200:
            return json.loads(response.content.decode())
        else:
            return None

    def destroy_resource(self):
        """DELETE the circuit and wait for the operation to finish."""
        response = self.nrpc().send_request(self.mk_request('DELETE'))
        if response.status_code not in [200, 202, 204]:
            raise AzureHttpError(response.content, response.status_code)
        self.get_settled_resource()  # wait for the delete operation to finish

    defn_properties = [ 'tags', 'location', 'tier', 'family',
                        'service_provider_name', 'peering_location',
                        'bandwidth', 'peerings' ]

    def _create_or_update(self, defn):
        """PUT the full circuit body built from *defn*; wait until settled."""
        info = {
            'location': defn.location,
            'tags': defn.tags,
            'sku': {
                'name': "{0}_{1}".format(defn.tier, defn.family),
                'tier': defn.tier,
                'family': defn.family,
            },
            'properties': {
                'serviceProviderProperties': {
                    'serviceProviderName': defn.service_provider_name,
                    'peeringLocation': defn.peering_location,
                    'bandwidthInMbps': defn.bandwidth,
                },
                # FIX: dict.iteritems() is Python 2-only and raises
                # AttributeError on Python 3; use items().
                'peerings': [
                    { 'name': _n,
                      'properties': _p,
                    } for _n, _p in defn.peerings.items() ],
            },
        }

        http_request = self.mk_request('PUT')
        http_request.data = json.dumps(info)
        http_request.headers['Content-Length'] = len(http_request.data)
        response = self.nrpc().send_request(http_request)

        if response.status_code not in [200, 201, 202]:
            raise AzureHttpError(response.content, response.status_code)

        self.get_settled_resource()
        self.state = self.UP
        self.copy_properties(defn)

    def handle_changed_peerings(self, peerings):
        """Reconcile the locally-recorded peerings with the live list *peerings*."""

        def update_peerings(k, v):
            # Reassign self.peerings so the 'json' attr_property persists
            # the change; v=None removes the entry.
            x = self.peerings
            if v is None:
                x.pop(k, None)
            else:
                x[k] = v
            self.peerings = x

        # Record any peering that exists remotely but not in our state as a
        # dummy entry so we notice it on subsequent runs.
        for _p in peerings:
            _p_name = next((_n for _n, _x in self.peerings.items()
                            if _n == _p.get('name', None)), None)
            if _p_name is None:
                self.warn("found unexpected peering {0}".format(_p.get('name', None)))
                update_peerings(_p.get('name', None), {"dummy": True})

        # Iterate over a snapshot: update_peerings() mutates self.peerings
        # (including deletions) while we loop.
        for _name, _s_p in list(self.peerings.items()):
            if _s_p.get("dummy", False):
                continue
            p_res_name = "peering {0}".format(_name)
            p = next((_r for _r in peerings
                      if _r.get('name', None) == _name), None)
            if p is None:
                self.warn("{0} has been deleted behind our back".format(p_res_name))
                update_peerings(_name, None)
                continue
            properties = p.get('properties', {})
            # only check the properties that the user has specified explicitly
            for prop_name in list(_s_p.keys()):
                self.handle_changed_dict(_s_p, prop_name,
                                         properties.get(prop_name, None),
                                         resource_name = p_res_name)
            update_peerings(_name, _s_p)

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Create or reconcile the circuit according to *defn*.

        Resource group and location are immutable while deployed.
        """
        self.no_subscription_id_change(defn)
        self.no_property_change(defn, 'resource_group')
        self.no_location_change(defn)

        self.copy_mgmt_credentials(defn)
        self.circuit_name = defn.circuit_name
        self.resource_group = defn.resource_group

        if check:
            circuit = self.get_settled_resource()
            if not circuit:
                self.warn_missing_resource()
            elif self.state == self.UP:
                self.warn_if_failed(circuit)
                self.handle_changed_property('tags', circuit.get('tags', {}))
                self.handle_changed_property(
                    'location',
                    normalize_location(circuit.get('location', None)),
                    can_fix = False)
                sku = circuit.get('sku', {})
                self.handle_changed_property('tier', sku.get('tier', None))
                self.handle_changed_property('family', sku.get('family', None))
                properties = circuit.get('properties', {})
                provider = properties.get('serviceProviderProperties', {})
                self.handle_changed_property(
                    'service_provider_name',
                    provider.get('serviceProviderName', None))
                self.handle_changed_property(
                    'peering_location',
                    provider.get('peeringLocation', None))
                self.handle_changed_property(
                    'bandwidth',
                    provider.get('bandwidthInMbps', None))
                self.handle_changed_peerings(properties.get('peerings', []))
            else:
                self.warn_not_supposed_to_exist()
                self.confirm_destroy()

        if self.state != self.UP:
            if self.get_settled_resource():
                raise Exception("tried creating an express route circuit that already exists; "
                                "please run 'deploy --check' to fix this")
            self.log("creating {0}...".format(self.full_name))
            self._create_or_update(defn)

        if self.properties_changed(defn):
            self.log("updating properties of {0}...".format(self.full_name))
            self.get_settled_resource_assert_exists()
            self._create_or_update(defn)

    def create_after(self, resources, defn):
        # Deploy after the resource group exists.
        from nixops.resources.azure_resource_group import AzureResourceGroupState
        return {r for r in resources
                  if isinstance(r, AzureResourceGroupState)
        }
class FloatingIPState(HetznerCloudResourceState):
    """
    State of a Hetzner Cloud Floating IP.
    """

    definition_type = FloatingIPDefinition

    _resource_type = "floating_ips"
    # "address" is state-managed, not diffed from the definition.
    _reserved_keys = HetznerCloudResourceState.COMMON_HCLOUD_RESERVED + [
        "address",
    ]

    # The allocated IP address, known only after creation.
    address = attr_property("address", None)

    @classmethod
    def get_type(cls):
        """Return the resource type name used in Nix expressions."""
        return "hetznercloud-floating-ip"

    def __init__(self, depl, name, id):
        # NOTE(review): super(HetznerCloudResourceState, self) skips
        # HetznerCloudResourceState.__init__ in the MRO -- confirm this is
        # deliberate rather than a typo for super(FloatingIPState, self).
        super(HetznerCloudResourceState, self).__init__(depl, name, id)
        # Handlers form a dependency chain: create -> description -> labels.
        self.handle_create_floating_ip = Handler(
            ["location", "type"],
            handle=self.realise_create_floating_ip,
        )
        self.handle_modify_description = Handler(
            ["description"],
            after=[self.handle_create_floating_ip],
            handle=self.realise_modify_description,
        )
        self.handle_modify_labels = Handler(
            ["labels"],
            after=[self.handle_modify_description],
            handle=super().realise_modify_labels,
        )

    def show_type(self):
        """Return a human-readable description including the location."""
        s = f"{super(FloatingIPState, self).show_type()}"
        if self.state == self.UP:
            s += f" [{self._state.get('location', None)}]"
        return s

    @property
    def full_name(self) -> str:
        address = self._state.get("address", None)
        return f"Hetzner Cloud Floating IP {self.resource_id} [{address}]"

    def prefix_definition(self, attr: Any) -> Dict[Sequence[str], Any]:
        return {("resources", "hetznerCloudFloatingIPs"): attr}

    def get_definition_prefix(self) -> str:
        return "resources.hetznerCloudFloatingIPs."

    def get_physical_spec(self) -> Dict[str, Any]:
        """Expose the allocated address in the physical specification."""
        s = super(FloatingIPState, self).get_physical_spec()
        s["address"] = self._state.get("address", None)
        return s

    def cleanup_state(self) -> None:
        """Reset all recorded state after the resource is destroyed."""
        with self.depl._db:
            self.state = self.MISSING
            self.resource_id = None
            self._state["address"] = None
            self._state["description"] = None
            self._state["labels"] = None
            self._state["location"] = None
            self._state["ipType"] = None

    def realise_create_floating_ip(self, allow_recreate: bool) -> None:
        """Create the floating IP (recreating it if the definition changed).

        Raises if the location changed (unsupported) or if recreation is
        needed but --allow-recreate was not given.
        """
        defn: FloatingIPOptions = self.get_defn().config

        if self.state == self.UP:
            if self._state["location"] != defn.location:
                raise Exception(
                    "changing a floating IP's location isn't supported.")
            if not allow_recreate:
                raise Exception(
                    f"{self.full_name} definition changed and it needs to be "
                    "recreated use --allow-recreate if you want to create a new one"
                )
            self.warn("floating IP definition changed, recreating...")
            self._destroy()
            # Drop the cached client so a fresh one is built after destroy.
            self._client = None

        location: BoundLocation = self.get_client().locations.get_by_name(
            defn.location)
        self.logger.log(f"creating floating IP at {location.description}...")
        response: CreateFloatingIPResponse = self.get_client(
        ).floating_ips.create(
            name=self.get_default_name(),
            type=defn.ipType,
            home_location=location,
        )
        # The create action may be absent; wait only when one was returned.
        response.action and response.action.wait_until_finished()
        self.resource_id = response.floating_ip.id
        self.address = response.floating_ip.ip
        self.logger.log(f"IP address is {self.address}")

        # Persist the new state atomically.
        with self.depl._db:
            self.state = self.STARTING
            self._state["location"] = defn.location
            self._state["ipType"] = defn.ipType

        self.wait_for_resource_available(self.resource_id)

    def realise_modify_description(self, allow_recreate: bool) -> None:
        """Push the definition's description to the live floating IP."""
        defn: FloatingIPOptions = self.get_defn().config

        self.logger.log("updating floating IP description")
        self.get_client().floating_ips.update(
            floating_ip=FloatingIP(self.resource_id),
            description=defn.description,
        )

        with self.depl._db:
            self._state["description"] = defn.description
class GCEState(MachineState, ResourceState):
    """
    State of a Google Compute Engine machine.
    """

    @classmethod
    def get_type(cls):
        return "gce"

    machine_name = attr_property("gce.name", None)

    public_ipv4 = attr_property("publicIpv4", None)
    private_ipv4 = attr_property("privateIpv4", None)

    region = attr_property("gce.region", None)
    instance_type = attr_property("gce.instanceType", None)

    public_client_key = attr_property("gce.publicClientKey", None)
    private_client_key = attr_property("gce.privateClientKey", None)

    public_host_key = attr_property("gce.publicHostKey", None)
    private_host_key = attr_property("gce.privateHostKey", None)

    tags = attr_property("gce.tags", None, 'json')
    metadata = attr_property("gce.metadata", {}, 'json')
    email = attr_property("gce.serviceAccountEmail", 'default')
    scopes = attr_property("gce.serviceAccountScopes", [], 'json')
    automatic_restart = attr_property("gce.scheduling.automaticRestart", None,
                                      bool)
    on_host_maintenance = attr_property("gce.scheduling.onHostMaintenance",
                                        None)
    ipAddress = attr_property("gce.ipAddress", None)
    network = attr_property("gce.network", None)

    # Maps device paths to disk dicts ({'disk', 'disk_name', 'region',
    # 'needsAttach', 'bootDisk', ...}); persisted as JSON.
    block_device_mapping = attr_property("gce.blockDeviceMapping", {}, 'json')

    # Maps backup ids to {disk_name: snapshot_name} dicts.
    backups = nixops.util.attr_property("gce.backups", {}, 'json')

    def __init__(self, depl, name, id):
        MachineState.__init__(self, depl, name, id)
        self._conn = None

    @property
    def resource_id(self):
        return self.machine_name

    def show_type(self):
        s = super(GCEState, self).show_type()
        if self.region:
            s = "{0} [{1}; {2}]".format(s, self.region, self.instance_type)
        return s

    credentials_prefix = "deployment.gce"

    @property
    def full_name(self):
        return "GCE machine '{0}'".format(self.machine_name)

    def node(self):
        """Fetch the live libcloud node object for this machine."""
        return self.connect().ex_get_node(self.machine_name, self.region)

    def address_to(self, resource):
        """Return the IP address to be used to access "resource" from this machine."""
        if isinstance(resource, GCEState) and resource.network == self.network:
            # Same network: the private address is reachable and cheaper.
            return resource.private_ipv4
        else:
            return MachineState.address_to(self, resource)

    def full_metadata(self, metadata):
        """Return *metadata* extended with the SSH client/host key entries."""
        result = metadata.copy()
        result.update({
            'sshKeys':
            "root:{0}".format(self.public_client_key),
            'ssh_host_{0}_key'.format(self.host_key_type):
            self.private_host_key,
            'ssh_host_{0}_key_pub'.format(self.host_key_type):
            self.public_host_key
        })
        return result

    def gen_metadata(self, metadata):
        """Wrap a flat metadata dict into the GCE API's items structure."""
        return {
            'kind': 'compute#metadata',
            'items': [{
                'key': k,
                'value': v
            } for k, v in metadata.items()]
        }

    def update_block_device_mapping(self, k, v):
        """Set (or, when v is None, remove) entry *k* and persist the mapping."""
        x = self.block_device_mapping
        if v is None:
            x.pop(k, None)
        else:
            x[k] = v
        self.block_device_mapping = x

    def _delete_volume(self, volume_id, region, allow_keep=False):
        """Destroy a GCE disk after user confirmation.

        Raises unless the user confirms or allow_keep is set.
        """
        if not self.depl.logger.confirm(
                "are you sure you want to destroy GCE disk '{0}'?".format(
                    volume_id)):
            if allow_keep:
                return
            else:
                raise Exception(
                    "not destroying GCE disk '{0}'".format(volume_id))
        self.log("destroying GCE disk '{0}'...".format(volume_id))
        try:
            disk = self.connect().ex_get_volume(volume_id, region)
            disk.destroy()
        except libcloud.common.google.ResourceNotFoundError:
            self.warn("seems to have been destroyed already")

    def _node_deleted(self):
        """Reset state after the instance disappeared; disks must re-attach."""
        self.vm_id = None
        self.state = self.STOPPED
        for k, v in self.block_device_mapping.items():
            v['needsAttach'] = True
            self.update_block_device_mapping(k, v)

    defn_properties = [
        'tags', 'region', 'instance_type', 'email', 'scopes', 'metadata',
        'ipAddress', 'network'
    ]

    def is_deployed(self):
        return (self.vm_id or self.block_device_mapping)

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Deploy or update the machine to match *defn*.

        Optionally reconciles real-world state (check), creates missing
        disks, and recreates the instance when immutable properties changed.
        """
        assert isinstance(defn, GCEDefinition)

        self.no_project_change(defn)
        self.no_region_change(defn)
        self.no_change(self.machine_name != defn.machine_name, "instance name")

        self.set_common_state(defn)
        self.copy_credentials(defn)
        self.machine_name = defn.machine_name
        self.region = defn.region

        if not self.public_client_key:
            (private, public) = create_key_pair()
            self.public_client_key = public
            self.private_client_key = private

        # ed25519 host keys need NixOS >= 15.09 and a non-legacy state file.
        self.host_key_type = ("ed25519" if self.state_version != "14.12"
                              and nixops.util.parse_nixos_version(
                                  defn.config["nixosRelease"]) >= ["15", "09"]
                              else "ecdsa")

        if not self.public_host_key:
            (private, public) = create_key_pair(type=self.host_key_type)
            self.public_host_key = public
            self.private_host_key = private

        recreate = False

        if check:
            try:
                node = self.node()
                if self.vm_id:
                    if node.state == NodeState.TERMINATED:
                        recreate = True
                        self.warn(
                            "the instance is terminated and needs a reboot")
                        self.state = self.STOPPED

                    self.handle_changed_property('region',
                                                 node.extra['zone'].name,
                                                 can_fix=False)

                    # a bit hacky but should work
                    network_name = node.extra['networkInterfaces'][0][
                        'network'].split('/')[-1]
                    if network_name == 'default':
                        network_name = None
                    self.handle_changed_property('network', network_name)

                    self.handle_changed_property('instance_type', node.size)
                    self.handle_changed_property(
                        'public_ipv4',
                        node.public_ips[0] if node.public_ips else None,
                        property_name='public IP address')
                    if self.public_ipv4:
                        known_hosts.add(self.public_ipv4,
                                        self.public_host_key)

                    self.handle_changed_property(
                        'private_ipv4',
                        node.private_ips[0] if node.private_ips else None,
                        property_name='private IP address')

                    if self.ipAddress:
                        try:
                            address = self.connect().ex_get_address(
                                self.ipAddress)
                            if self.public_ipv4 and self.public_ipv4 != address.address:
                                self.warn(
                                    "static IP Address {0} assigned to this machine has unexpectedly "
                                    "changed from {1} to {2} most likely due to being redeployed"
                                    .format(self.ipAddress, self.public_ipv4,
                                            address.address))
                                self.ipAddress = None
                        except libcloud.common.google.ResourceNotFoundError:
                            self.warn(
                                "static IP Address resource {0} used by this machine has been destroyed; "
                                "it is likely that the machine is still holding the address itself ({1}) "
                                "and this is your last chance to reclaim it before it gets "
                                "lost in a reboot".format(
                                    self.ipAddress, self.public_ipv4))

                    self.handle_changed_property('tags',
                                                 sorted(node.extra['tags']))

                    # SSH key entries are managed separately; exclude them
                    # from the user-visible metadata comparison.
                    actual_metadata = {
                        i['key']: i['value']
                        for i in node.extra['metadata'].get('items', [])
                        if i['key'] not in [
                            'ssh_host_{0}_key'.format(self.host_key_type),
                            'sshKeys', 'ssh_host_{0}_key_pub'.format(
                                self.host_key_type)
                        ]
                    }
                    self.handle_changed_property('metadata', actual_metadata)

                    self.handle_changed_property(
                        'automatic_restart',
                        node.extra['scheduling']["automaticRestart"])
                    self.handle_changed_property(
                        'on_host_maintenance',
                        node.extra['scheduling']["onHostMaintenance"])

                    attached_disk_names = [
                        d.get("deviceName", None) for d in node.extra['disks']
                    ]
                    # check that all disks are attached
                    for k, v in self.block_device_mapping.items():
                        disk_name = v['disk_name'] or v['disk']
                        is_attached = disk_name in attached_disk_names
                        if not is_attached and not v.get('needsAttach', False):
                            self.warn(
                                "disk {0} seems to have been detached behind our back; will reattach..."
                                .format(disk_name))
                            v['needsAttach'] = True
                            self.update_block_device_mapping(k, v)
                        if is_attached and v.get('needsAttach', False):
                            self.warn(
                                "disk {0} seems to have been attached for us; thank you, mr. Elusive Bug!"
                                .format(disk_name))
                            del v['needsAttach']
                            self.update_block_device_mapping(k, v)

                    # check that no extra disks are attached
                    defn_disk_names = [
                        v['disk_name'] or v['disk']
                        for k, v in defn.block_device_mapping.items()
                    ]
                    state_disk_names = [
                        v['disk_name'] or v['disk']
                        for k, v in self.block_device_mapping.items()
                    ]
                    unexpected_disks = list(
                        set(attached_disk_names) - set(defn_disk_names) -
                        set(state_disk_names))
                    if unexpected_disks:
                        self.warn(
                            "unexpected disk(s) {0} are attached to this instance; "
                            "not fixing this just in case".format(
                                unexpected_disks))
                else:
                    self.warn_not_supposed_to_exist(valuable_data=True)
                    self.confirm_destroy(node, self.full_name)

            except libcloud.common.google.ResourceNotFoundError:
                if self.vm_id:
                    self.warn(
                        "the instance seems to have been destroyed behind our back"
                    )
                    if not allow_recreate:
                        raise Exception("use --allow-recreate to fix")
                    self._node_deleted()

            # check that the disks that should exist do exist
            # and that the disks we expected to create don't exist yet
            for k, v in defn.block_device_mapping.items():
                disk_name = v['disk_name'] or v['disk']
                try:
                    disk = self.connect().ex_get_volume(
                        disk_name, v.get('region', None))
                    if k not in self.block_device_mapping and v['disk_name']:
                        self.warn_not_supposed_to_exist(
                            resource_name=disk_name, valuable_data=True)
                        self.confirm_destroy(disk, disk_name)
                except libcloud.common.google.ResourceNotFoundError:
                    if v['disk']:
                        raise Exception(
                            "external disk '{0}' is required but doesn't exist"
                            .format(disk_name))
                    if k in self.block_device_mapping and v['disk_name']:
                        self.warn(
                            "disk '{0}' is supposed to exist, but is missing; will recreate..."
                            .format(disk_name))
                        self.update_block_device_mapping(k, None)

        # create missing disks
        for k, v in defn.block_device_mapping.items():
            if k in self.block_device_mapping:
                continue
            if v['disk'] is None:
                extra_msg = (" from snapshot '{0}'".format(v['snapshot'])
                             if v['snapshot'] else " from image '{0}'".format(
                                 v['image']) if v['image'] else "")
                self.log("creating GCE disk of {0} GiB{1}...".format(
                    v['size'] if v['size'] else "auto", extra_msg))
                v['region'] = defn.region
                try:
                    self.connect().create_volume(v['size'],
                                                 v['disk_name'],
                                                 v['region'],
                                                 snapshot=v['snapshot'],
                                                 image=v['image'],
                                                 ex_disk_type="pd-" +
                                                 v.get('type', 'standard'),
                                                 use_existing=False)
                except libcloud.common.google.ResourceExistsError:
                    raise Exception(
                        "tried creating a disk that already exists; "
                        "please run 'deploy --check' to fix this")
            v['needsAttach'] = True
            self.update_block_device_mapping(k, v)

        if self.vm_id:
            # These properties are immutable on a live instance.
            if self.instance_type != defn.instance_type:
                recreate = True
                self.warn("change of the instance type requires a reboot")

            if self.network != defn.network:
                recreate = True
                self.warn("change of the network requires a reboot")

            if self.email != defn.email or self.scopes != defn.scopes:
                recreate = True
                self.warn('change of service account requires a reboot')

            for k, v in self.block_device_mapping.items():
                defn_v = defn.block_device_mapping.get(k, None)
                if defn_v and not v.get('needsAttach', False):
                    if v['bootDisk'] != defn_v['bootDisk']:
                        recreate = True
                        self.warn("change of the boot disk requires a reboot")
                    if v['readOnly'] != defn_v['readOnly']:
                        recreate = True
                        self.warn("remounting disk as ro/rw requires a reboot")

        if recreate:
            if not allow_reboot:
                raise Exception(
                    "reboot is required for the requested changes; please run with --allow-reboot"
                )
            self.stop()
        self.create_node(defn)
        if self.node().state == NodeState.STOPPED:
            self.start()

    def create_node(self, defn):
        """Create the instance if needed, then converge mutable properties
        (service account, volumes, metadata, tags, IP, scheduling)."""
        if not self.vm_id:
            self.log("creating {0}...".format(self.full_name))
            boot_disk = next((v for k, v in defn.block_device_mapping.items()
                              if v.get('bootDisk', False)), None)
            if not boot_disk:
                raise Exception("no boot disk found for {0}".format(
                    self.full_name))
            try:
                service_accounts = []
                account = {'email': defn.email}
                if defn.scopes != []:
                    account['scopes'] = defn.scopes
                service_accounts.append(account)
                # keeping a gcloud like behavior, if nothing was specified
                # i.e service account is default get the default scopes as well
                if defn.email == 'default' and defn.scopes == []:
                    service_accounts = None
                node = self.connect().create_node(
                    self.machine_name,
                    defn.instance_type,
                    "",
                    location=self.connect().ex_get_zone(defn.region),
                    ex_boot_disk=self.connect().ex_get_volume(
                        boot_disk['disk_name'] or boot_disk['disk'],
                        boot_disk.get('region', None)),
                    ex_metadata=self.full_metadata(defn.metadata),
                    ex_tags=defn.tags,
                    ex_service_accounts=service_accounts,
                    external_ip=(self.connect().ex_get_address(defn.ipAddress)
                                 if defn.ipAddress else 'ephemeral'),
                    ex_network=(defn.network if defn.network else 'default'))
            except libcloud.common.google.ResourceExistsError:
                raise Exception(
                    "tried creating an instance that already exists; "
                    "please run 'deploy --check' to fix this")

            self.vm_id = self.machine_name
            self.state = self.STARTING
            self.ssh_pinged = False
            self.copy_properties(defn)

            self.public_ipv4 = node.public_ips[0]
            self.log("got public IP: {0}".format(self.public_ipv4))
            known_hosts.add(self.public_ipv4, self.public_host_key)
            self.private_ipv4 = node.private_ips[0]
            for k, v in self.block_device_mapping.items():
                v['needsAttach'] = True
                self.update_block_device_mapping(k, v)

            # set scheduling config here instead of triggering an update using None values
            # because we might be called with defn = self, thus modifying self would ruin defn
            self.connect().ex_set_node_scheduling(
                node,
                automatic_restart=defn.automatic_restart,
                on_host_maintenance=defn.on_host_maintenance)
            self.automatic_restart = defn.automatic_restart
            self.on_host_maintenance = defn.on_host_maintenance

        # Update service account
        if self.email != defn.email or self.scopes != defn.scopes:
            self.log('updating the service account')
            node = self.node()
            request = '/zones/%s/instances/%s/setServiceAccount' % (
                node.extra['zone'].name, node.name)
            service_account = {}
            service_account["email"] = defn.email
            if defn.scopes != []:
                service_account["scopes"] = defn.scopes
            self.connect().connection.async_request(request,
                                                    method='POST',
                                                    data=service_account)
            self.email = defn.email
            self.scopes = defn.scopes

        # Attach missing volumes
        for k, v in self.block_device_mapping.items():
            defn_v = defn.block_device_mapping.get(k, None)
            if v.get('needsAttach', False) and defn_v:
                disk_name = v['disk_name'] or v['disk']
                disk_region = v.get('region', None)
                v['readOnly'] = defn_v['readOnly']
                v['bootDisk'] = defn_v['bootDisk']
                v['deleteOnTermination'] = defn_v['deleteOnTermination']
                v['passphrase'] = defn_v['passphrase']
                self.log("attaching GCE disk '{0}'...".format(disk_name))
                if not v.get('bootDisk', False):
                    self.connect().attach_volume(
                        self.node(),
                        self.connect().ex_get_volume(disk_name, disk_region),
                        device=disk_name,
                        ex_mode=('READ_ONLY'
                                 if v['readOnly'] else 'READ_WRITE'))
                del v['needsAttach']
                self.update_block_device_mapping(k, v)

            # generate LUKS key if the model didn't specify one
            if v.get('encrypt', False) and v.get(
                    'passphrase', "") == "" and v.get('generatedKey',
                                                      "") == "":
                v['generatedKey'] = generate_random_string(length=256)
                self.update_block_device_mapping(k, v)

        if self.metadata != defn.metadata:
            self.log('setting new metadata values')
            node = self.node()
            meta = self.gen_metadata(self.full_metadata(defn.metadata))
            request = '/zones/%s/instances/%s/setMetadata' % (
                node.extra['zone'].name, node.name)
            metadata_data = {}
            metadata_data['items'] = meta['items']
            metadata_data['kind'] = meta['kind']
            # the fingerprint guards against concurrent metadata updates
            metadata_data['fingerprint'] = node.extra['metadata'][
                'fingerprint']
            self.connect().connection.async_request(request,
                                                    method='POST',
                                                    data=metadata_data)
            self.metadata = defn.metadata

        if self.tags != defn.tags:
            self.log('updating tags')
            self.connect().ex_set_node_tags(self.node(), defn.tags)
            self.tags = defn.tags

        if self.public_ipv4 and self.ipAddress != defn.ipAddress:
            self.log("detaching old public IP address {0}".format(
                self.public_ipv4))
            self.connect().connection.async_request(
                "/zones/{0}/instances/{1}/deleteAccessConfig?accessConfig=External+NAT&networkInterface=nic0"
                .format(self.region, self.machine_name),
                method='POST')
            self.public_ipv4 = None
            self.ipAddress = None

        if self.public_ipv4 is None:
            self.log("attaching public IP address {0}".format(
                defn.ipAddress or "[Ephemeral]"))
            self.connect().connection.async_request(
                "/zones/{0}/instances/{1}/addAccessConfig?networkInterface=nic0"
                .format(self.region, self.machine_name),
                method='POST',
                data={
                    'kind':
                    'compute#accessConfig',
                    'type':
                    'ONE_TO_ONE_NAT',
                    'name':
                    'External NAT',
                    'natIP':
                    self.connect().ex_get_address(defn.ipAddress).address
                    if defn.ipAddress else None
                })
            self.ipAddress = defn.ipAddress
            self.public_ipv4 = self.node().public_ips[0]
            self.log("got public IP: {0}".format(self.public_ipv4))
            known_hosts.add(self.public_ipv4, self.public_host_key)
            self.ssh.reset()
            self.ssh_pinged = False

        if self.automatic_restart != defn.automatic_restart or self.on_host_maintenance != defn.on_host_maintenance:
            self.log("setting scheduling configuration")
            self.connect().ex_set_node_scheduling(
                self.node(),
                automatic_restart=defn.automatic_restart,
                on_host_maintenance=defn.on_host_maintenance)
            self.automatic_restart = defn.automatic_restart
            self.on_host_maintenance = defn.on_host_maintenance

    def reboot(self, hard=False):
        if hard:
            self.log("sending hard reset to GCE machine...")
            self.node().reboot()
            self.state = self.STARTING
        else:
            MachineState.reboot(self, hard=hard)

    def start(self):
        """Start the instance, or recreate it from its disks if it is gone."""
        if self.vm_id:
            try:
                node = self.node()
            except libcloud.common.google.ResourceNotFoundError:
                self.warn("seems to have been destroyed already")
                self._node_deleted()
                node = None

            if node and (node.state == NodeState.TERMINATED):
                self.stop()

            if node and (node.state == NodeState.STOPPED):
                self.log("starting GCE machine")
                self.connect().ex_start_node(node)
                self.public_ipv4 = self.node().public_ips[0]
                self.private_ipv4 = self.node().private_ips[0]
                known_hosts.add(self.public_ipv4, self.public_host_key)
                self.wait_for_ssh(check=True)
                self.send_keys()

        if not self.vm_id and self.block_device_mapping:
            prev_public_ipv4 = self.public_ipv4
            prev_private_ipv4 = self.private_ipv4

            self.create_node(self)

            if prev_public_ipv4 != self.public_ipv4:
                self.warn("Public IP address has changed from {0} to {1}, "
                          "you may need to run 'nixops deploy'".format(
                              prev_public_ipv4, self.public_ipv4))
            if prev_private_ipv4 != self.private_ipv4:
                self.warn("Private IP address has changed from {0} to {1}, "
                          "you may need to run 'nixops deploy'".format(
                              prev_private_ipv4, self.private_ipv4))
            self.wait_for_ssh(check=True)
            self.send_keys()

    def stop(self):
        """Stop the instance and wait (up to ~5 min) until it is stopped."""
        if not self.vm_id:
            return
        try:
            node = self.node()
        except libcloud.common.google.ResourceNotFoundError:
            self.warn("seems to have been destroyed already")
            self._node_deleted()
            return

        if node.state != NodeState.TERMINATED:
            self.log_start("stopping GCE machine... ")
            self.connect().ex_stop_node(node)
            self.state = self.STOPPING

            def check_stopped():
                return self.node().state == NodeState.STOPPED

            if nixops.util.check_wait(check_stopped,
                                      initial=3,
                                      max_tries=100,
                                      exception=False):  # = 5 min
                self.log_end("stopped")
            else:
                self.log_end("(timed out)")

        self.state = self.STOPPED
        self.ssh.reset()

    def destroy(self, wipe=False):
        """Destroy the instance (after confirmation) and its owned volumes."""
        if wipe:
            # FIX: was `log.warn(...)`, but no `log` name exists in this
            # scope, which raised NameError whenever wipe was requested.
            self.warn("wipe is not supported")
        try:
            node = self.node()
            question = "are you sure you want to destroy {0}?"
            if not self.depl.logger.confirm(question.format(self.full_name)):
                return False
            known_hosts.remove(self.public_ipv4, self.public_host_key)
            self.log("destroying the GCE machine...")
            node.destroy()
        except libcloud.common.google.ResourceNotFoundError:
            self.warn("seems to have been destroyed already")
        self._node_deleted()

        # Destroy volumes created for this instance.
        for k, v in self.block_device_mapping.items():
            if v.get('deleteOnTermination', False):
                self._delete_volume(v['disk_name'], v['region'])
            self.update_block_device_mapping(k, None)

        return True

    def after_activation(self, defn):
        """Detach (and possibly delete) volumes dropped from the deployment spec."""
        for k, v in self.block_device_mapping.items():
            if k not in defn.block_device_mapping:
                disk_name = v['disk'] or v['disk_name']

                self.log("unmounting device '{0}'...".format(disk_name))
                if v.get('encrypt', False):
                    dm = "/dev/mapper/{0}".format(disk_name)
                    self.run_command("umount -l {0}".format(dm), check=False)
                    self.run_command("cryptsetup luksClose {0}".format(dm),
                                     check=False)
                else:
                    self.run_command("umount -l {0}".format(k), check=False)

                node = self.node()
                try:
                    if not v.get('needsAttach', False):
                        self.log(
                            "detaching GCE disk '{0}'...".format(disk_name))
                        volume = self.connect().ex_get_volume(
                            disk_name, v.get('region', None))
                        self.connect().detach_volume(volume, node)
                        v['needsAttach'] = True
                        self.update_block_device_mapping(k, v)

                    if v.get('deleteOnTermination', False):
                        self._delete_volume(disk_name, v['region'])
                except libcloud.common.google.ResourceNotFoundError:
                    self.warn(
                        "GCE disk '{0}' seems to have been already destroyed".
                        format(disk_name))

                self.update_block_device_mapping(k, None)

    def get_console_output(self):
        node = self.node()
        if node.state == NodeState.TERMINATED:
            raise Exception(
                "cannot get console output of a state=TERMINATED machine '{0}'"
                .format(self.name))
        request = '/zones/%s/instances/%s/serialPort' % (
            node.extra['zone'].name, node.name)
        return self.connect().connection.request(
            request, method='GET').object['contents']

    def _check(self, res):
        """Fill *res* with the instance's live status and reconcile IPs."""
        try:
            node = self.node()
            res.exists = True
            res.is_up = node.state == NodeState.RUNNING or node.state == NodeState.REBOOTING
            if node.state == NodeState.REBOOTING or node.state == NodeState.PENDING:
                self.state = self.STARTING
            if node.state == NodeState.STOPPED or node.state == NodeState.TERMINATED:
                self.state = self.STOPPED
            if node.state == NodeState.UNKNOWN:
                self.state = self.UNKNOWN
            if node.state == NodeState.RUNNING:
                # check that all disks are attached
                res.disks_ok = True
                for k, v in self.block_device_mapping.items():
                    disk_name = v['disk_name'] or v['disk']
                    if all(
                            d.get("deviceName", None) != disk_name
                            for d in node.extra['disks']):
                        res.disks_ok = False
                        res.messages.append(
                            "disk {0} is detached".format(disk_name))
                        try:
                            self.connect().ex_get_volume(
                                disk_name, v.get('region', None))
                        except libcloud.common.google.ResourceNotFoundError:
                            res.messages.append(
                                "disk {0} is destroyed".format(disk_name))

                self.handle_changed_property(
                    'public_ipv4',
                    node.public_ips[0] if node.public_ips else None,
                    property_name='public IP address')
                if self.public_ipv4:
                    known_hosts.add(self.public_ipv4, self.public_host_key)

                self.handle_changed_property(
                    'private_ipv4',
                    node.private_ips[0] if node.private_ips else None,
                    property_name='private IP address')

                MachineState._check(self, res)
        except libcloud.common.google.ResourceNotFoundError:
            res.exists = False
            res.is_up = False
            self.state = self.MISSING

    def create_after(self, resources, defn):
        # Just a check for all GCE resource classes
        return {
            r
            for r in resources
            if isinstance(r, nixops.resources.gce_static_ip.GCEStaticIPState)
            or isinstance(r, nixops.resources.gce_disk.GCEDiskState)
            or isinstance(r, nixops.resources.gce_image.GCEImageState)
            or isinstance(r, nixops.resources.gce_network.GCENetworkState)
        }

    def backup(self, defn, backup_id):
        """Snapshot every deployed disk under the given backup id."""
        self.log("backing up {0} using ID '{1}'".format(
            self.full_name, backup_id))

        if sorted(defn.block_device_mapping.keys()) != sorted(
                self.block_device_mapping.keys()):
            self.warn(
                "the list of disks currently deployed doesn't match the current deployment"
                " specification; consider running 'deploy' first; the backup may be incomplete"
            )

        backup = {}
        _backups = self.backups
        for k, v in self.block_device_mapping.items():
            disk_name = v['disk_name'] or v['disk']
            volume = self.connect().ex_get_volume(disk_name,
                                                  v.get('region', None))
            # Snapshot names must be unique and <= 63 chars; keep the tail
            # of the disk name to stay within the limit.
            snapshot_name = "backup-{0}-{1}".format(backup_id, disk_name[-32:])
            self.log("initiating snapshotting of disk '{0}': '{1}'".format(
                disk_name, snapshot_name))
            self.connect().connection.request(
                '/zones/%s/disks/%s/createSnapshot' %
                (volume.extra['zone'].name, volume.name),
                method='POST',
                data={
                    'name':
                    snapshot_name,
                    'description':
                    "backup of disk {0} attached to {1}".format(
                        volume.name, self.machine_name)
                })

            backup[disk_name] = snapshot_name
            _backups[backup_id] = backup
            self.backups = _backups

    def restore(self, defn, backup_id, devices=None):
        """Recreate disks from the snapshots recorded under *backup_id*.

        devices: optional list of device paths or disk names to restrict
        the restore to (None/empty means all).
        """
        # Avoid the shared-mutable-default pitfall of `devices=[]`.
        devices = devices or []
        self.log("restoring {0} to backup '{1}'".format(
            self.full_name, backup_id))

        self.stop()

        for k, v in self.block_device_mapping.items():
            disk_name = v['disk_name'] or v['disk']
            s_id = self.backups[backup_id].get(disk_name, None)
            if s_id and (devices == [] or k in devices
                         or disk_name in devices):
                try:
                    snapshot = self.connect().ex_get_snapshot(s_id)
                except libcloud.common.google.ResourceNotFoundError:
                    self.warn("snapshot {0} for disk {1} is missing; skipping".
                              format(s_id, disk_name))
                    continue

                try:
                    self.log("destroying disk {0}".format(disk_name))
                    self.connect().ex_get_volume(disk_name,
                                                 v.get('region',
                                                       None)).destroy()
                except libcloud.common.google.ResourceNotFoundError:
                    self.warn(
                        "disk {0} seems to have been destroyed already".format(
                            disk_name))

                self.log("creating disk {0} from snapshot '{1}'".format(
                    disk_name, s_id))
                self.connect().create_volume(None,
                                             disk_name,
                                             v.get('region', None),
                                             ex_disk_type="pd-" +
                                             v.get('type', 'standard'),
                                             snapshot=snapshot,
                                             use_existing=False)

    def remove_backup(self, backup_id, keep_physical=False):
        """Delete the snapshots of *backup_id* and forget the backup entry."""
        self.log('removing backup {0}'.format(backup_id))
        _backups = self.backups
        if backup_id not in _backups:
            self.warn('backup {0} not found; skipping'.format(backup_id))
        else:
            for d_name, snapshot_id in _backups[backup_id].items():
                try:
                    self.log('removing snapshot {0}'.format(snapshot_id))
                    self.connect().ex_get_snapshot(snapshot_id).destroy()
                except libcloud.common.google.ResourceNotFoundError:
                    self.warn(
                        'snapshot {0} not found; skipping'.format(snapshot_id))

            _backups.pop(backup_id)
            self.backups = _backups

    def get_backups(self):
        """Return {backup_id: {'status': ..., 'info': [...]}} for all backups."""
        self.connect()
        backups = {}
        for b_id, snapshots in self.backups.items():
            backups[b_id] = {}
            backup_status = "complete"
            info = []
            for k, v in self.block_device_mapping.items():
                disk_name = v['disk_name'] or v['disk']
                if disk_name not in snapshots:
                    backup_status = "incomplete"
                    info.append("{0} - {1} - not available in backup".format(
                        self.name, disk_name))
                else:
                    snapshot_id = snapshots[disk_name]
                    try:
                        snapshot = self.connect().ex_get_snapshot(snapshot_id)
                        if snapshot.status != 'READY':
                            backup_status = "running"
                    except libcloud.common.google.ResourceNotFoundError:
                        info.append(
                            "{0} - {1} - {2} - snapshot has disappeared".
                            format(self.name, disk_name, snapshot_id))
                        backup_status = "unavailable"
            for d_name, s_id in snapshots.items():
                if not any(d_name == v['disk_name'] or d_name == v['disk']
                           for k, v in self.block_device_mapping.items()):
                    info.append(
                        "{0} - {1} - {2} - a snapshot of a disk that is not or no longer deployed"
                        .format(self.name, d_name, s_id))
            backups[b_id]['status'] = backup_status
            backups[b_id]['info'] = info

        return backups

    def get_physical_spec(self):
        """Emit the NixOS module config (GCE image import + LUKS passphrases)."""
        block_device_mapping = {}
        for k, v in self.block_device_mapping.items():
            if (v.get('encrypt', False) and v.get('passphrase', "") == ""
                    and v.get('generatedKey', "") != ""):
                block_device_mapping[k] = {
                    'passphrase':
                    Call(RawValue("pkgs.lib.mkOverride 10"),
                         v['generatedKey']),
                }

        return {
            'imports': [
                RawValue(
                    "<nixpkgs/nixos/modules/virtualisation/google-compute-config.nix>"
                )
            ],
            ('deployment', 'gce', 'blockDeviceMapping'): block_device_mapping,
        }

    def get_keys(self):
        keys = MachineState.get_keys(self)
        # Ugly: we have to add the generated keys because they're not
        # there in the first evaluation (though they are present in
        # the final nix-build).
        for k, v in self.block_device_mapping.items():
            if v.get('encrypt', False) and v.get(
                    'passphrase', "") == "" and v.get('generatedKey',
                                                      "") != "":
                keys["luks-" + (v['disk_name'] or v['disk'])] = {
                    'text': v['generatedKey'],
                    'group': 'root',
                    'permissions': '0600',
                    # NOTE(review): this value looks redacted; it was most
                    # likely 'root' originally — confirm against history.
                    'user': '******'
                }
        return keys

    def get_ssh_name(self):
        if not self.public_ipv4:
            raise Exception(
                "{0} does not have a public IPv4 address (yet)".format(
                    self.full_name))
        return self.public_ipv4

    def get_ssh_private_key_file(self):
        return self._ssh_private_key_file or self.write_ssh_private_key(
            self.private_client_key)

    def get_ssh_flags(self, *args, **kwargs):
        super_flags = super(GCEState, self).get_ssh_flags(*args, **kwargs)
        return super_flags + ["-i", self.get_ssh_private_key_file()]
class AzureBLOBState(StorageResourceState):
    """State of an Azure BLOB.

    Manages a single blob inside an Azure storage container: uploading it
    from a local file, copying it from another blob, and keeping its HTTP
    properties and metadata in sync with the definition.
    """

    # Blob identity and location.
    blob_name = attr_property("azure.name", None)
    blob_type = attr_property("azure.blobType", None)
    md5 = attr_property("azure.md5", None)
    container = attr_property("azure.container", None)
    storage = attr_property("azure.storage", None)
    # Standard HTTP/content properties mirrored from the blob service.
    content_encoding = attr_property("azure.contentEncoding", None)
    content_language = attr_property("azure.contentLanguage", None)
    content_type = attr_property("azure.contentType", None)
    content_length = attr_property("azure.contentLength", None)
    cache_control = attr_property("azure.cacheControl", None)
    content_disposition = attr_property("azure.contentDisposition", None)
    metadata = attr_property("azure.metadata", {}, 'json')
    # Bookkeeping for the copy-from-blob path.
    last_modified = attr_property("azure.lastModified", None)
    copied_from = attr_property("azure.copiedFrom", None)

    @classmethod
    def get_type(cls):
        return "azure-blob"

    def show_type(self):
        s = super(AzureBLOBState, self).show_type()
        if self.state == self.UP:
            s = "{0}".format(s)
        return s

    @property
    def resource_id(self):
        return self.blob_name

    @property
    def full_name(self):
        return "Azure BLOB '{0}'".format(self.resource_id)

    def get_storage_name(self, defn=None):
        """Resolve the storage account name, for this state or for ``defn``.

        Falls back to the storage of the referenced container resource when
        no storage is named directly.
        """
        container_resource = self.get_resource_state(AzureBLOBContainerState,
                                                     (defn or self).container)
        return (defn or self).storage or (container_resource
                                          and container_resource.storage)

    def get_key(self):
        """Return the access key for the storage account.

        Tries, in order: an explicitly configured key, the key of the
        storage resource, and the key of the container resource.  Raises if
        none is available.
        """
        storage = self.get_resource_state(AzureStorageState, self.storage)
        container = self.get_resource_state(AzureBLOBContainerState,
                                            self.container)
        access_key = self.access_key or (storage and storage.access_key) or (
            container and container.get_key())
        if not access_key:
            raise Exception(
                "Can't obtain the access key needed to manage {0}".format(
                    self.full_name))
        return access_key

    def is_settled(self, resource):
        # A blob is settled once any server-side copy has finished
        # (x-ms-copy-status == 'success'); a missing resource is also
        # considered settled.
        return resource is None or (resource.get('x-ms-copy-status',
                                                 'success') == 'success')

    def get_resource_allow_exceptions(self):
        # Fetch blob properties; service errors propagate to the caller.
        return self.bs().get_blob_properties(self.container,
                                             self.resource_id)

    def destroy_resource(self):
        """Delete the blob (including its snapshots) and reset local state."""
        self.bs().delete_blob(self.container,
                              self.resource_id,
                              x_ms_delete_snapshots='include')
        self.copied_from = None
        self.last_modified = None
        self.state = self.MISSING

    # Properties compared/copied between the definition and this state.
    defn_properties = [
        'content_encoding', 'content_language', 'cache_control',
        'content_type', 'content_disposition'
    ]

    def upload_file(self, defn):
        """Upload ``defn.file_path`` into the blob if needed.

        Re-uploads when the blob is missing, its MD5 differs from the local
        file, or the requested blob type changed (the latter requires
        deleting the destination blob first, since the type cannot be
        changed in place).
        """
        md5 = md5sum(defn.file_path)
        if self.state != self.UP or md5 != self.md5 or self.blob_type != defn.blob_type:
            blob = self.get_settled_resource()
            if self.state == self.UP:
                self.log("updating the contents of {0} in {1}...".format(
                    self.full_name, defn.container))
                if blob is not None and self.blob_type != defn.blob_type:
                    self.log(
                        "blob type change requested; deleting the destination BLOB first..."
                    )
                    self.destroy_resource()
            else:
                self.log("creating {0} in {1}...".format(
                    self.full_name, defn.container))

            if defn.blob_type == 'BlockBlob':
                self.bs().put_block_blob_from_path(
                    defn.container,
                    defn.blob_name,
                    defn.file_path,
                    content_encoding=defn.content_encoding,
                    content_language=defn.content_language,
                    content_md5=md5,
                    cache_control=defn.cache_control,
                    x_ms_blob_content_type=defn.content_type,
                    x_ms_blob_content_encoding=defn.content_encoding,
                    x_ms_blob_content_language=defn.content_language,
                    x_ms_blob_content_md5=md5,
                    x_ms_blob_cache_control=defn.cache_control,
                    x_ms_meta_name_values=defn.metadata,
                    max_connections=8)
            else:
                # Anything other than BlockBlob is uploaded as a page blob.
                self.bs().put_page_blob_from_path(
                    defn.container,
                    defn.blob_name,
                    defn.file_path,
                    content_encoding=defn.content_encoding,
                    content_language=defn.content_language,
                    content_md5=md5,
                    cache_control=defn.cache_control,
                    x_ms_blob_content_type=defn.content_type,
                    x_ms_blob_content_encoding=defn.content_encoding,
                    x_ms_blob_content_language=defn.content_language,
                    x_ms_blob_content_md5=md5,
                    x_ms_blob_cache_control=defn.cache_control,
                    x_ms_meta_name_values=defn.metadata,
                    max_connections=8)

            # Record what was uploaded so later runs can detect drift.
            self.state = self.UP
            self.copy_properties(defn)
            self.metadata = defn.metadata
            self.blob_type = defn.blob_type
            self.md5 = md5
            self.last_modified = None
            self.content_disposition = None
            self.copied_from = defn.file_path
            self.content_length = defn.content_length or os.stat(
                defn.file_path).st_size

    def copy_blob(self, defn):
        """Copy ``defn.copy_from_blob`` into this blob.

        If the source location changed, the destination is deleted first; a
        blob-type change cannot be honored when copying (the copy inherits
        the source's type) and only produces warnings.  Uses
        ``x_ms_source_if_modified_since`` with the recorded last-modified
        time so an unchanged source yields a 304/412 no-op.
        """
        if self.state == self.UP:
            self.log("updating the contents of {0} in {1}...".format(
                self.full_name, defn.container))
            if self.copied_from != defn.copy_from_blob:
                self.log(
                    "source BLOB location has changed; deleting {0} first..".
                    format(self.full_name))
                self.destroy_resource()
            elif self.blob_type != defn.blob_type:
                self.warn(
                    "when copying, cannot change the BLOB type from {0} to {1}"
                    .format(self.blob_type, defn.blob_type))
        else:
            self.log("creating {0} in {1}...".format(self.full_name,
                                                     defn.container))
            # No previous copy: don't condition the copy on a timestamp.
            self.last_modified = None

        try:
            self.bs().copy_blob(
                defn.container,
                defn.blob_name,
                defn.copy_from_blob,
                x_ms_meta_name_values=defn.metadata,
                x_ms_source_if_modified_since=self.last_modified)
            # Server-side copies are asynchronous; wait (up to 600 tries)
            # for the copy status to settle, then mirror the resulting
            # properties into local state.
            res = self.get_settled_resource(max_tries=600)
            self.copy_properties(defn)
            self.last_modified = res.get('last-modified', None)
            self.copied_from = defn.copy_from_blob
            self.md5 = res.get('content-md5', None)
            self.content_encoding = res.get('content-encoding', None)
            self.content_language = res.get('content-language', None)
            self.content_length = res.get('content-length', None)
            self.content_type = res.get('content-type', None)
            self.cache_control = res.get('cache-control', None)
            self.blob_type = res.get('x-ms-blob-type', None)
            self.content_disposition = res.get('content-disposition', None)
            # workaround for API bug
            self.metadata = None if defn.metadata == {} else defn.metadata
            self.state = self.UP
            if self.blob_type != defn.blob_type:
                self.warn("cannot change blob type when copying; "
                          "BLOB of type {0} has been created instead "
                          "of the requested {1}".format(
                              self.blob_type, defn.blob_type))
        except azure.common.AzureHttpError as e:
            # 304 Not Modified / 412 Precondition Failed: the conditional
            # copy was skipped because the source hasn't changed.
            if e.status_code == 304 or e.status_code == 412:
                self.log(
                    "update is not necessary, the source BLOB has not been modified"
                )
            else:
                raise

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Deploy the blob according to ``defn``.

        Storage and container are immutable for an existing blob.  With
        ``check``, the remote blob's properties are reconciled into state
        first.  Content then comes from either a local file upload or a
        blob-to-blob copy, after which any remaining property/metadata
        drift is pushed to the service.
        """
        self.no_change(
            self.get_storage_name(defn=self) != self.get_storage_name(
                defn=defn), 'storage')
        self.no_property_change(defn, 'container')

        self.blob_name = defn.blob_name
        self.access_key = defn.access_key
        self.storage = defn.storage
        self.container = defn.container

        if check:
            blob = self.get_settled_resource()
            if not blob:
                self.warn_missing_resource()
            elif self.state == self.UP:
                # Reconcile every tracked property with the live blob.
                self.handle_changed_property('blob_type',
                                             blob.get('x-ms-blob-type', None))
                self.handle_changed_property('md5',
                                             blob.get('content-md5', None))
                self.handle_changed_property(
                    'content_encoding', blob.get('content-encoding', None))
                self.handle_changed_property(
                    'content_language', blob.get('content-language', None))
                # content_length can't be fixed in place: it is determined
                # by the blob contents.
                self.handle_changed_property('content_length',
                                             blob.get('content-length', None),
                                             can_fix=False)
                self.handle_changed_property('content_type',
                                             blob.get('content-type', None))
                self.handle_changed_property('cache_control',
                                             blob.get('cache-control', None))
                self.handle_changed_property(
                    'content_disposition',
                    blob.get('content-disposition', None))
                self.handle_changed_metadata(blob)
            else:
                self.warn_not_supposed_to_exist()
                self.confirm_destroy()

        if defn.file_path:
            self.upload_file(defn)
        if defn.copy_from_blob:
            self.copy_blob(defn)

        if self.properties_changed(defn) or self.metadata != defn.metadata:
            self.log("updating properties of {0}...".format(self.full_name))
            self.get_settled_resource_assert_exists()
            self.bs().set_blob_properties(
                self.container,
                self.blob_name,
                x_ms_blob_cache_control=defn.cache_control,
                x_ms_blob_content_type=defn.content_type,
                x_ms_blob_content_md5=self.md5,
                x_ms_blob_content_encoding=defn.content_encoding,
                x_ms_blob_content_language=defn.content_language,
                x_ms_blob_content_disposition=defn.content_disposition)
            self.copy_properties(defn)
            self.bs().set_blob_metadata(self.container,
                                        self.blob_name,
                                        x_ms_meta_name_values=defn.metadata)
            self.metadata = defn.metadata

    def create_after(self, resources, defn):
        # The blob depends on its container, storage account and resource
        # group existing first.
        return {
            r
            for r in resources
            if isinstance(r, AzureBLOBContainerState) or isinstance(
                r, AzureStorageState) or isinstance(r, AzureResourceGroupState)
        }
class AzureDirectoryState(StorageResourceState):
    """State of an Azure Directory.

    Manages a directory inside an Azure file share.  A directory may be
    nested: its share/storage and its path can be resolved transitively
    through a parent directory resource.
    """

    directory_name = attr_property("azure.name", None)
    # Either an explicit path, or resolved via the parent directory chain.
    parent_directory_path = attr_property("azure.parentDirectoryPath", None)
    parent_directory = attr_property("azure.parentDirectory", None)
    share = attr_property("azure.share", None)
    storage = attr_property("azure.storage", None)

    @classmethod
    def get_type(cls):
        return "azure-directory"

    def show_type(self):
        s = super(AzureDirectoryState, self).show_type()
        if self.state == self.UP:
            s = "{0}".format(s)
        return s

    @property
    def resource_id(self):
        return self.directory_name

    @property
    def full_name(self):
        return "Azure directory '{0}'".format(self.resource_id)

    def get_share_name(self, defn=None):
        """Resolve the file-share name, directly or via the parent chain."""
        parent_resource = self.get_resource_state(AzureDirectoryState,
                                                  (defn
                                                   or self).parent_directory)
        return ((defn or self).share
                or (parent_resource and parent_resource.get_share_name()))

    def get_storage_name(self, defn=None):
        """Resolve the storage account name.

        Precedence: explicit storage, then the share's storage, then the
        parent directory's storage (recursively).
        """
        parent_resource = self.get_resource_state(AzureDirectoryState,
                                                  (defn
                                                   or self).parent_directory)
        share_resource = self.get_resource_state(AzureShareState,
                                                 (defn or self).share)
        return ((defn or self).storage
                or (share_resource and share_resource.get_storage_name())
                or (parent_resource and parent_resource.get_storage_name()))

    def get_key(self):
        """Return the storage access key.

        Tries, in order: an explicitly configured key, the storage
        resource's key, the share's key and the parent directory's key.
        Raises if none is available.
        """
        parent = self.get_resource_state(AzureDirectoryState,
                                         self.parent_directory)
        storage = self.get_resource_state(AzureStorageState,
                                          self.get_storage_name())
        share = self.get_resource_state(AzureShareState, self.share)
        access_key = (self.access_key or (storage and storage.access_key)
                      or (share and share.get_key())
                      or (parent and parent.get_key()))
        if not access_key:
            raise Exception(
                "Can't obtain the access key needed to manage {0}".format(
                    self.full_name))
        return access_key

    def is_settled(self, resource):
        # Directories have no asynchronous server-side state to wait for.
        return True

    def get_parent_directory_path(self, defn=None):
        """Return the parent path, explicit or derived from the parent
        directory resource's own full path."""
        parent = self.get_resource_state(AzureDirectoryState,
                                         (defn or self).parent_directory)
        return (defn or self).parent_directory_path or (
            parent and parent.get_directory_path())

    def get_directory_path(self):
        """Return this directory's full path within the share."""
        parent = self.get_parent_directory_path()
        return "{0}/{1}".format(
            parent, self.directory_name) if parent else self.directory_name

    def get_resource_allow_exceptions(self):
        # Probe the directory; service errors propagate to the caller.
        return self.fs().get_directory_properties(self.get_share_name(),
                                                  self.get_directory_path())

    def destroy_resource(self):
        self.fs().delete_directory(self.get_share_name(),
                                   self.get_directory_path(),
                                   fail_not_exist=True)

    def create(self, defn, check, allow_reboot, allow_recreate):
        """Deploy the directory according to ``defn``.

        Storage, share and parent path are immutable.  With ``check``, the
        remote directory's existence is verified first; the directory is
        then created if the state says it should not yet exist.
        """
        self.no_change(
            self.get_storage_name(defn=self) != self.get_storage_name(
                defn=defn), 'storage')
        self.no_change(
            self.get_share_name(defn=self) != self.get_share_name(defn=defn),
            'share')
        self.no_change(
            self.get_parent_directory_path(defn=self) !=
            self.get_parent_directory_path(defn=defn),
            'parent directory path')

        self.directory_name = defn.directory_name
        self.access_key = defn.access_key
        self.storage = defn.storage
        self.share = defn.share
        self.parent_directory = defn.parent_directory
        self.parent_directory_path = defn.parent_directory_path

        if check:
            directory = self.get_settled_resource()
            if not directory:
                self.warn_missing_resource()
            elif self.state == self.UP:
                # bindings as of 05.01.2016 don't allow getting/setting metadata
                self.is_settled(directory)  # placeholder
            else:
                self.warn_not_supposed_to_exist()
                self.confirm_destroy()

        if self.state != self.UP:
            if self.get_settled_resource() is not None:
                raise Exception(
                    "tried creating a directory that already exists; "
                    "please run 'deploy --check' to fix this")
            self.log("creating {0} in {1}...".format(self.directory_name,
                                                     self.get_storage_name()))
            self.fs().create_directory(self.get_share_name(),
                                       self.get_directory_path(),
                                       fail_on_exist=True)
            self.state = self.UP

    def create_after(self, resources, defn):
        # Depend on share/storage/resource-group, plus the parent directory
        # resource (matched by comparing its defined directory_name against
        # our parent_directory).
        return {
            r
            for r in resources
            if isinstance(r, AzureShareState) or isinstance(
                r, AzureStorageState) or isinstance(r, AzureResourceGroupState)
            or (isinstance(r, AzureDirectoryState) and defn.parent_directory
                and (getattr(self.depl.definitions[r.name], 'directory_name',
                             None) == defn.parent_directory))
        }

    def destroy_before(self, resources):
        # Mirror of create_after: this directory must be destroyed before
        # its share/storage/resource group and before its parent directory.
        return {
            r
            for r in resources
            if isinstance(r, AzureShareState) or isinstance(
                r, AzureStorageState) or isinstance(r, AzureResourceGroupState)
            or (isinstance(r, AzureDirectoryState) and self.parent_directory
                and getattr(r, 'directory_name', None) ==
                self.parent_directory)
        }
class DropletState(MachineState[DropletDefinition]):
    """State of a DigitalOcean droplet.

    Drives the droplet's lifecycle: creation from a stock Ubuntu image
    (which is then converted to NixOS by running nixos-infect over SSH),
    start/stop/reboot, and destruction.  Network facts (public IPv4/IPv6,
    gateway, netmask) discovered at creation time are stored in state and
    rendered into the physical spec.
    """

    @classmethod
    def get_type(cls) -> str:
        return "droplet"

    # generic options
    # state: int= attr_property("state", MachineState.MISSING, int)  # override
    public_ipv4: Optional[str] = attr_property("publicIpv4", None)
    public_ipv6: dict = attr_property("publicIpv6", {}, "json")
    default_gateway: Optional[str] = attr_property("defaultGateway", None)
    netmask: Optional[str] = attr_property("netmask", None)

    # droplet options
    enable_ipv6: Optional[bool] = attr_property("droplet.enableIpv6", False,
                                                bool)
    default_gateway6: Optional[str] = attr_property("defaultGateway6", None)
    region: Optional[str] = attr_property("droplet.region", None)
    size: Optional[str] = attr_property("droplet.size", None)
    auth_token: Optional[str] = attr_property("droplet.authToken", None)
    droplet_id: Optional[str] = attr_property("droplet.dropletId", None)
    key_pair: Optional[str] = attr_property("droplet.keyPair", None)

    def __init__(self, depl: Deployment, name: str, id: RecordId) -> None:
        MachineState.__init__(self, depl, name, id)
        self.name: str = name

    def _get_droplet(self) -> digitalocean.Droplet:
        """Return a Droplet API handle bound to the stored droplet id."""
        return digitalocean.Droplet(id=self.droplet_id,
                                    token=self.get_auth_token())

    def get_ssh_name(self) -> Optional[str]:
        return self.public_ipv4

    def get_ssh_flags(self, *args, **kwargs) -> List[str]:
        """Extend base SSH flags: skip known-hosts tracking (the host key
        changes when the droplet is re-created) and use our keypair."""
        super_flags = super(DropletState, self).get_ssh_flags(*args, **kwargs)
        return super_flags + [
            "-o",
            "UserKnownHostsFile=/dev/null",
            "-o",
            "StrictHostKeyChecking=accept-new",
            "-i",
            self.get_ssh_private_key_file(),
        ]

    def get_physical_spec(self) -> Function:
        """Render the NixOS physical spec from the stored network facts."""

        def prefix_len(netmask):
            # Count the set bits of the dotted-quad netmask to get the
            # CIDR prefix length.
            return bin(int(codecs.encode(socket.inet_aton(netmask), "hex"),
                           16)).count("1")

        networking = {
            "defaultGateway": self.default_gateway,
            "nameservers": ["8.8.8.8"],  # default provided by DO
            ("interfaces", "ens3", "ipv4", "addresses"): [{
                "address":
                self.public_ipv4,
                "prefixLength":
                prefix_len(self.netmask)
            }],
        }
        if self.public_ipv6:
            networking[("interfaces", "ens3", "ipv6", "addresses")] = [{
                "address":
                self.public_ipv6["address"],
                "prefixLength":
                self.public_ipv6["prefixLength"],
            }]
        if self.default_gateway6:
            networking["defaultGateway6"] = self.default_gateway6

        return Function(
            "{ ... }",
            {
                "imports":
                [RawValue("<nixpkgs/nixos/modules/profiles/qemu-guest.nix>")],
                "networking": networking,
                (
                    "boot",
                    "loader",
                    "grub",
                    "device",
                ): "nodev",  # keep ubuntu bootloader?
                ("fileSystems", "/"): {
                    "device": "/dev/vda1",
                    "fsType": "ext4"
                },
                ("users", "extraUsers", "root", "openssh", "authorizedKeys",
                 "keys"): [self.get_ssh_key_resource().public_key],
            },
        )

    def get_ssh_private_key_file(self) -> str:
        return self.write_ssh_private_key(
            self.get_ssh_key_resource().private_key)

    def get_ssh_key_resource(self) -> ssh_keypair.SSHKeyPairState:
        """Return the deployment's 'ssh-key' keypair resource."""
        return cast(ssh_keypair.SSHKeyPairState,
                    self.depl.active_resources["ssh-key"])

    def create_after(self, resources, defn) -> Set:
        # make sure the ssh key exists before we do anything else
        return {
            r
            for r in resources if isinstance(r, ssh_keypair.SSHKeyPairState)
        }

    def set_common_state(self, defn: DropletDefinition) -> None:
        super().set_common_state(defn)
        self.auth_token = defn.auth_token

    def get_auth_token(self) -> Optional[str]:
        # The environment variable takes precedence over the stored token.
        return os.environ.get("DIGITAL_OCEAN_AUTH_TOKEN", self.auth_token)

    def destroy(self, wipe: bool = False) -> bool:
        """Destroy the droplet; treat an already-missing droplet as done."""
        self.log("destroying droplet {}".format(self.droplet_id))
        try:
            droplet = self._get_droplet()
            droplet.destroy()
        except digitalocean.baseapi.NotFoundError:
            self.log(
                "droplet not found - assuming it's been destroyed already")
        self.public_ipv4 = None
        self.droplet_id = None

        return True

    def create(self, defn, check, allow_reboot: bool,
               allow_recreate: bool) -> None:
        """Create the droplet and convert it to NixOS.

        Idempotent: if a droplet id is already recorded, only the common
        state is refreshed.  Otherwise the droplet is created, its network
        facts are captured, and nixos-infect is run over SSH followed by a
        synchronous reboot.
        """
        try:
            ssh_key = self.get_ssh_key_resource()
        except KeyError:
            raise Exception(
                "Please specify a ssh-key resource (resources.sshKeyPairs.ssh-key = {})."
            )

        self.set_common_state(defn)

        if self.droplet_id is not None:
            return

        self.manager = digitalocean.Manager(token=self.get_auth_token())
        droplet = digitalocean.Droplet(
            token=self.get_auth_token(),
            name=self.name,
            region=defn.region,
            ipv6=defn.enable_ipv6,
            ssh_keys=[ssh_key.public_key],
            # Bootstrap image only: nixos-infect below replaces it with NixOS.
            image="ubuntu-16-04-x64",
            size_slug=defn.size,
        )

        self.log_start("creating droplet ...")
        droplet.create()

        # Poll the droplet's actions until the create action leaves the
        # "in-progress" state.
        status = "in-progress"
        while status == "in-progress":
            actions = droplet.get_actions()
            for action in actions:
                action.load()
                if action.status != "in-progress":
                    status = action.status
            time.sleep(1)
            self.log_continue("[{}] ".format(status))

        if status != "completed":
            raise Exception("unexpected status: {}".format(status))

        droplet.load()
        self.droplet_id = droplet.id
        self.public_ipv4 = droplet.ip_address
        self.log_end("{}".format(droplet.ip_address))

        # Record the gateway and netmask of the network that actually
        # carries the public IPv4 address.  (FIX: previously the netmask
        # was always taken from networks["v4"][0], which may be a different
        # — e.g. private — network; keep [0] only as a fallback when no
        # entry matches.)
        if droplet.networks["v4"]:
            self.netmask = droplet.networks["v4"][0]["netmask"]
        for n in droplet.networks["v4"]:
            if n["ip_address"] == self.public_ipv4:
                self.default_gateway = n["gateway"]
                self.netmask = n["netmask"]

        first_ipv6 = {}
        first_gw6 = None
        if "v6" in droplet.networks:
            public_ipv6_networks = [
                n for n in droplet.networks["v6"] if n["type"] == "public"
            ]
            if len(public_ipv6_networks) > 0:
                # The DigitalOcean API does not expose an explicit
                # default interface or gateway, so assume this is it.
                first_ipv6["address"] = public_ipv6_networks[0]["ip_address"]
                # For v6 the API's "netmask" field is the prefix length.
                first_ipv6["prefixLength"] = public_ipv6_networks[0]["netmask"]
                first_gw6 = public_ipv6_networks[0]["gateway"]
        self.public_ipv6 = first_ipv6
        self.default_gateway6 = first_gw6

        # run modified nixos-infect
        # - no reboot
        # - predictable network interface naming (ens3 etc)
        self.wait_for_ssh()
        self.log_start("running nixos-infect")
        self.run_command("bash </dev/stdin 2>&1", stdin=open(infect_path))
        self.reboot_sync()

    def start(self) -> None:
        """Bring a stopped droplet back up and wait for SSH."""
        if self.state == self.UP:
            return
        self.log("starting droplet... ")
        droplet = self._get_droplet()
        self.state = self.STARTING
        # NOTE(review): a reboot is issued rather than power_on — confirm
        # this is intentional for powered-off droplets.
        droplet.reboot()

        if not nixops.util.check_wait(
                self.check_started, initial=3, max_tries=100, exception=False):
            raise Exception(
                "Droplet '{0}' failed to start. (state is '{1}')".format(
                    self.droplet_id, droplet.status))

        self.wait_for_ssh(check=True)

    def check_started(self) -> bool:
        return self.check_status("active")

    def check_stopped(self) -> bool:
        return self.check_status("off")

    def check_status(self, status: str) -> bool:
        """Poll the droplet and return True iff its status equals ``status``."""
        droplet = self._get_droplet()
        droplet.load()
        self.log_continue("[{0}] ".format(droplet.status))
        if droplet.status == status:
            return True
        return False

    def stop(self) -> None:
        """Gracefully shut the droplet down, forcing power-off on timeout."""
        self.log_start("stopping droplet...")
        droplet = self._get_droplet()
        droplet.shutdown()
        self.state = self.STOPPING

        if not nixops.util.check_wait(
                self.check_stopped, initial=3, max_tries=100, exception=False):
            self.log_end("(time out)")
            self.log_start("forcing power off... ")
            droplet.power_off()
            if not nixops.util.check_wait(self.check_stopped,
                                          initial=3,
                                          max_tries=100,
                                          exception=False):
                raise Exception(
                    "Droplet '{0}' failed to stop (state is '{1}')".format(
                        self.droplet_id, droplet.status))

        self.log_end("")
        self.state = self.STOPPED

    def reboot(self, hard: bool = False) -> None:
        if hard:
            # Hard reset through the API; a soft reboot is delegated to the
            # generic SSH-based implementation.
            self.log("sending hard reset to droplet...")
            droplet = self._get_droplet()
            droplet.reboot()
            self.state = self.STARTING
            self.wait_for_ssh()
        else:
            MachineState.reboot(self, hard=hard)
class HetznerState(MachineState): """ State of a Hetzner machine. """ @classmethod def get_type(cls): return "hetzner" state = attr_property("state", MachineState.UNKNOWN, int) main_ipv4 = attr_property("hetzner.mainIPv4", None) robot_admin_user = attr_property("hetzner.robotUser", None) robot_admin_pass = attr_property("hetzner.robotPass", None) partitions = attr_property("hetzner.partitions", None) just_installed = attr_property("hetzner.justInstalled", False, bool) rescue_passwd = attr_property("hetzner.rescuePasswd", None) fs_info = attr_property("hetzner.fsInfo", None) net_info = attr_property("hetzner.networkInfo", None, 'json') hw_info = attr_property("hetzner.hardwareInfo", None) main_ssh_private_key = attr_property("hetzner.sshPrivateKey", None) main_ssh_public_key = attr_property("hetzner.sshPublicKey", None) def __init__(self, depl, name, id): MachineState.__init__(self, depl, name, id) self._robot = None @property def resource_id(self): return self.vm_id @property def public_ipv4(self): return self.main_ipv4 def connect(self): """ Connect to the Hetzner robot by using the admin credetials in 'self.robot_admin_user' and 'self.robot_admin_pass'. """ if self._robot is not None: return self._robot self._robot = Robot(self.robot_admin_user, self.robot_admin_pass) return self._robot def _get_server_from_main_robot(self, ip, defn=None): """ Fetch the server instance using the main robot user and passwords from the MachineDefinition passed by 'defn'. If the definition does not contain these credentials or is None, it is tried to fetch it from environment variables. 
""" if defn is not None and len(defn.robot_user) > 0: robot_user = defn.robot_user else: robot_user = os.environ.get('HETZNER_ROBOT_USER', None) if defn is not None and len(defn.robot_pass) > 0: robot_pass = defn.robot_pass else: robot_pass = os.environ.get('HETZNER_ROBOT_PASS', None) if robot_user is None: raise Exception("please either set ‘deployment.hetzner.robotUser’" " or $HETZNER_ROBOT_USER for machine" " ‘{0}’".format(self.name)) elif robot_pass is None: raise Exception("please either set ‘deployment.hetzner.robotPass’" " or $HETZNER_ROBOT_PASS for machine" " ‘{0}’".format(self.name)) if TEST_MODE: return TestModeServer() robot = Robot(robot_user, robot_pass) return robot.servers.get(ip) def _get_server_by_ip(self, ip): """ Queries the robot for the given ip address and returns the Server instance if it was found. """ if TEST_MODE: return TestModeServer() robot = self.connect() return robot.servers.get(ip) def get_ssh_private_key_file(self): if self._ssh_private_key_file: return self._ssh_private_key_file else: return self.write_ssh_private_key(self.main_ssh_private_key) def get_ssh_flags(self, scp=False): return super(HetznerState, self).get_ssh_flags(scp) + ( ["-o", "LogLevel=quiet"] if self.state == self.RESCUE else # XXX: Disabling strict host key checking will only impact the # behaviour on *new* keys, so it should be "reasonably" safe to do # this until we have a better way of managing host keys in # ssh_util. So far this at least avoids to accept every damn host # key on a large deployment. [ "-o", "StrictHostKeyChecking=no", "-i", self.get_ssh_private_key_file() ]) def _wait_for_rescue(self, ip): if not TEST_MODE: # In test mode, the target machine really doesn't go down at all, # so only wait for the reboot to finish when deploying real # systems. 
self.log_start("waiting for rescue system...") dotlog = lambda: self.log_continue(".") wait_for_tcp_port(ip, 22, open=False, callback=dotlog) self.log_continue("[down]") wait_for_tcp_port(ip, 22, callback=dotlog) self.log_end("[up]") self.state = self.RESCUE def _bootstrap_rescue(self, install, partitions): """ Bootstrap everything needed in order to get Nix and the partitioner usable in the rescue system. The keyword arguments are only for partitioning, see reboot_rescue() for description, if not given we will only mount based on information provided in self.partitions. """ self.log_start("building Nix bootstrap installer... ") expr = os.path.join(self.depl.expr_path, "hetzner-bootstrap.nix") bootstrap_out = subprocess.check_output( ["nix-build", expr, "--no-out-link"]).rstrip() bootstrap = os.path.join(bootstrap_out, 'bin/hetzner-bootstrap') self.log_end("done. ({0})".format(bootstrap)) self.log_start("creating nixbld group in rescue system... ") self.run_command("getent group nixbld > /dev/null || " "groupadd -g 30000 nixbld") self.log_end("done.") self.log_start( "checking if tmpfs in rescue system is large enough... ") dfstat = self.run_command("stat -f -c '%a:%S' /", capture_stdout=True) df, bs = dfstat.split(':') free_mb = (int(df) * int(bs)) // 1024 // 1024 if free_mb > 300: self.log_end("yes: {0} MB".format(free_mb)) tarcmd = 'tar x -C /' else: self.log_end("no: {0} MB".format(free_mb)) tarexcludes = [ '*/include', '*/man', '*/info', '*/locale', '*/locales', '*/share/doc', '*/share/aclocal', '*/example', '*/terminfo', '*/pkgconfig', '*/nix-support', '*/etc', '*/bash-completion', '*.a', '*.la', '*.pc', '*.lisp', '*.pod', '*.html', '*.pyc', '*.pyo', '*-kbd-*/share', '*-gcc-*/bin', '*-gcc-*/libexec', '*-systemd-*/bin', '*-boehm-gc-*/share' ] tarcmd = 'tar x -C / ' + ' '.join( ["--exclude='{0}'".format(glob) for glob in tarexcludes]) # The command to retrieve our split TAR archive on the other side. 
recv = 'read -d: tarsize; head -c "$tarsize" | {0}; {0}'.format(tarcmd) self.log_start("copying bootstrap files to rescue system... ") tarstream = subprocess.Popen([bootstrap], stdout=subprocess.PIPE) if not self.has_really_fast_connection(): stream = subprocess.Popen(["gzip", "-c"], stdin=tarstream.stdout, stdout=subprocess.PIPE) self.run_command("gzip -d | ({0})".format(recv), stdin=stream.stdout) stream.wait() else: self.run_command(recv, stdin=tarstream.stdout) tarstream.wait() self.log_end("done.") if install: self.log_start("partitioning disks... ") try: out = self.run_command("nixpart -p -", capture_stdout=True, stdin_string=partitions) except SSHCommandFailed as cmd: # Exit code 100 is when the partitioner requires a reboot. if cmd.exitcode == 100: self.log(cmd.message) self.reboot_rescue(install, partitions) return else: raise # This is the *only* place to set self.partitions unless we have # implemented a way to repartition the system! self.partitions = partitions self.fs_info = out else: self.log_start("mounting filesystems... ") self.run_command("nixpart -m -", stdin_string=self.partitions) self.log_end("done.") if not install: self.log_start("checking if system in /mnt is NixOS... ") res = self.run_command("test -e /mnt/etc/NIXOS", check=False) if res == 0: self.log_end("yes.") else: self.log_end("NO! Not mounting special filesystems.") return self.log_start("bind-mounting special filesystems... ") for mountpoint in ("/proc", "/dev", "/dev/shm", "/sys"): self.log_continue("{0}...".format(mountpoint)) cmd = "mkdir -m 0755 -p /mnt{0} && ".format(mountpoint) cmd += "mount --bind {0} /mnt{0}".format(mountpoint) self.run_command(cmd) self.log_end("done.") def reboot(self, hard=False): if hard: self.log_start("sending hard reset to robot... 
") server = self._get_server_by_ip(self.main_ipv4) server.reboot('hard') self.log_end("done.") self.state = self.STARTING self.ssh.reset() else: MachineState.reboot(self, hard=hard) def reboot_rescue(self, install=False, partitions=None, bootstrap=True, hard=False): """ Use the Robot to activate the rescue system and reboot the system. By default, only mount partitions and do not partition or wipe anything. On installation, both 'installed' has to be set to True and partitions should contain a Kickstart configuration, otherwise it's read from self.partitions if available (which it shouldn't if you're not doing something nasty). """ self.log("rebooting machine ‘{0}’ ({1}) into rescue system".format( self.name, self.main_ipv4)) server = self._get_server_by_ip(self.main_ipv4) server.rescue.activate() rescue_passwd = server.rescue.password if hard or (install and self.state not in (self.UP, self.RESCUE)): self.log_start("sending hard reset to robot... ") server.reboot('hard') else: self.log_start("sending reboot command... ") if self.state == self.RESCUE: self.run_command("(sleep 2; reboot) &", check=False) else: self.run_command("systemctl reboot", check=False) self.log_end("done.") self._wait_for_rescue(self.main_ipv4) self.rescue_passwd = rescue_passwd self.state = self.RESCUE self.ssh.reset() if bootstrap: self._bootstrap_rescue(install, partitions) def _install_main_ssh_keys(self): """ Create a SSH private/public keypair and put the public key into the chroot. """ private, public = create_key_pair( key_name="NixOps client key of {0}".format(self.name)) self.main_ssh_private_key, self.main_ssh_public_key = private, public res = self.run_command( "umask 077 && mkdir -p /mnt/root/.ssh &&" " cat > /mnt/root/.ssh/authorized_keys", stdin_string=public) def _install_base_system(self): self.log_start("creating missing directories... 
") cmds = ["mkdir -m 1777 -p /mnt/tmp /mnt/nix/store"] mntdirs = [ "var", "etc", "bin", "nix/var/nix/gcroots", "nix/var/nix/temproots", "nix/var/nix/manifests", "nix/var/nix/userpool", "nix/var/nix/profiles", "nix/var/nix/db", "nix/var/log/nix/drvs" ] to_create = ' '.join(map(lambda d: os.path.join("/mnt", d), mntdirs)) cmds.append("mkdir -m 0755 -p {0}".format(to_create)) self.run_command(' && '.join(cmds)) self.log_end("done.") self.log_start("bind-mounting files in /etc... ") for etcfile in ("resolv.conf", "passwd", "group"): self.log_continue("{0}...".format(etcfile)) cmd = ("if ! test -e /mnt/etc/{0}; then" " touch /mnt/etc/{0} && mount --bind /etc/{0} /mnt/etc/{0};" " fi").format(etcfile) self.run_command(cmd) self.log_end("done.") self.run_command("touch /mnt/etc/NIXOS") self.run_command("activate-remote") self._install_main_ssh_keys() self._gen_network_spec() def _detect_hardware(self): self.log_start("detecting hardware... ") cmd = "nixos-generate-config --no-filesystems --show-hardware-config" hardware = self.run_command(cmd, capture_stdout=True) self.hw_info = '\n'.join([ line for line in hardware.splitlines() if not line.rstrip().startswith('#') ]) self.log_end("done.") def switch_to_configuration(self, method, sync, command=None): if self.state == self.RESCUE: # We cannot use the mountpoint command here, because it's unable to # detect bind mounts on files, so we just go ahead and try to # unmount. 
umount = 'if umount "{0}" 2> /dev/null; then rm -f "{0}"; fi' cmd = '; '.join([ umount.format(os.path.join("/mnt/etc", mnt)) for mnt in ("resolv.conf", "passwd", "group") ]) self.run_command(cmd) command = "chroot /mnt /nix/var/nix/profiles/system/bin/" command += "switch-to-configuration" res = MachineState.switch_to_configuration(self, method, sync, command) if res not in (0, 100): return res if self.state == self.RESCUE and self.just_installed: self.reboot_sync() self.just_installed = False return res def _get_ethernet_interfaces(self): """ Return a list of all the ethernet interfaces active on the machine. """ # We don't use \(\) here to ensure this works even without GNU sed. cmd = "ip addr show | sed -n -e 's/^[0-9]*: *//p' | cut -d: -f1" return self.run_command(cmd, capture_stdout=True).splitlines() def _get_udev_rule_for(self, interface): """ Get lines suitable for services.udev.extraRules for 'interface', and thus essentially map the device name to a hardware address. """ cmd = "ip addr show \"{0}\" | sed -n -e 's|^.*link/ether *||p'" cmd += " | cut -d' ' -f1" mac_addr = self.run_command(cmd.format(interface), capture_stdout=True).strip() rule = 'ACTION=="add", SUBSYSTEM=="net", ATTR{{address}}=="{0}", ' rule += 'NAME="{1}"' return rule.format(mac_addr, interface) def _get_ipv4_addr_and_prefix_for(self, interface): """ Return a tuple of (ipv4_address, prefix_length) for the specified interface. """ cmd = "ip addr show \"{0}\" | sed -n -e 's/^.*inet *//p'" cmd += " | cut -d' ' -f1" ipv4_addr_prefix = self.run_command(cmd.format(interface), capture_stdout=True).strip() if "/" not in ipv4_addr_prefix: # No IP address set for this interface. return None else: return ipv4_addr_prefix.split('/', 1) def _get_default_gw(self): """ Return the default gateway of the currently running machine. 
""" cmd = "ip route list | sed -n -e 's/^default *via *//p'" cmd += " | cut -d' ' -f1" return self.run_command(cmd, capture_stdout=True).strip() def _get_nameservers(self): """ Return a list of all nameservers defined on the currently running machine. """ cmd = "cat /etc/resolv.conf | sed -n -e 's/^nameserver *//p'" return self.run_command(cmd, capture_stdout=True).splitlines() def _indent(self, lines, level=1): """ Indent list of lines by the specified level (one level = two spaces). """ return map(lambda line: " " + line, lines) def _calculate_ipv4_subnet(self, ipv4, prefix_len): """ Returns the address of the subnet for the given 'ipv4' and 'prefix_len'. """ bits = struct.unpack('!L', socket.inet_aton(ipv4))[0] mask = 0xffffffff >> (32 - prefix_len) << (32 - prefix_len) return socket.inet_ntoa(struct.pack('!L', bits & mask)) def _gen_network_spec(self): """ Generate Nix expressions related to networking configuration based on the currently running machine (most likely in RESCUE state) and set the resulting string to self.net_info. """ udev_rules = [] iface_attrs = {} extra_routes = [] ipv6_commands = [] server = self._get_server_by_ip(self.main_ipv4) # Global networking options defgw = self._get_default_gw() v6defgw = None # Interface-specific networking options for iface in self._get_ethernet_interfaces(): if iface == "lo": continue result = self._get_ipv4_addr_and_prefix_for(iface) if result is None: continue udev_rules.append(self._get_udev_rule_for(iface)) ipv4, prefix = result iface_attrs[iface] = { 'ipAddress': ipv4, 'prefixLength': int(prefix), } # We can't handle Hetzner-specific networking info in test mode. if TEST_MODE: continue # Extra route for accessing own subnet net = self._calculate_ipv4_subnet(ipv4, int(prefix)) extra_routes.append(("{0}/{1}".format(net, prefix), defgw, iface)) # IPv6 subnets only for eth0 (XXX: more flexibility here?) v6addr_command = "ip -6 addr add '{0}' dev '{1}' || true" for subnet in server.subnets: if "." 
in subnet.net_ip: # skip IPv4 addresses continue v6addr = "{0}/{1}".format(subnet.net_ip, subnet.mask) ipv6_commands.append(v6addr_command.format(v6addr, iface)) assert v6defgw is None or v6defgw == subnet.gateway v6defgw = subnet.gateway # Extra routes route4_cmd = "ip -4 route change '{0}' via '{1}' dev '{2}' || true" route_commands = [ route4_cmd.format(net, gw, iface) for net, gw, iface in extra_routes ] # IPv6 configuration route6_cmd = "ip -6 route add default via '{0}' dev eth0 || true" route_commands.append(route6_cmd.format(v6defgw)) local_commands = '\n'.join(ipv6_commands + route_commands) + '\n' self.net_info = { 'services': { 'udev': { 'extraRules': '\n'.join(udev_rules) + '\n' }, }, 'networking': { 'interfaces': iface_attrs, 'defaultGateway': defgw, 'nameservers': self._get_nameservers(), 'localCommands': local_commands, } } def get_physical_spec(self): if all([self.net_info, self.fs_info, self.hw_info]): return { 'config': self.net_info, 'imports': [nix2py(self.fs_info), nix2py(self.hw_info)], } else: return {} def create(self, defn, check, allow_reboot, allow_recreate): assert isinstance(defn, HetznerDefinition) if self.state not in (self.RESCUE, self.UP) or check: self.check() self.set_common_state(defn) self.main_ipv4 = defn.main_ipv4 if not self.robot_admin_user or not self.robot_admin_pass: self.log_start("creating an exclusive robot admin account for " "‘{0}’... ".format(self.name)) # Create a new Admin account exclusively for this machine. server = self._get_server_from_main_robot(self.main_ipv4, defn) with self.depl._db: (self.robot_admin_user, self.robot_admin_pass) = server.admin.create() self.log_end("done. 
({0})".format(self.robot_admin_user)) if not self.vm_id: self.log("installing machine...") self.reboot_rescue(install=True, partitions=defn.partitions) self._install_base_system() self._detect_hardware() server = self._get_server_by_ip(self.main_ipv4) vm_id = "nixops-{0}-{1}".format(self.depl.uuid, self.name) server.set_name(vm_id[:100]) self.vm_id = vm_id known_hosts.remove(self.main_ipv4, None) self.just_installed = True def start(self): """ Start the server into the normal system (a reboot is done if the rescue system is active). """ if self.state == self.UP: return elif self.state == self.RESCUE: self.reboot() elif self.state in (self.STOPPED, self.UNREACHABLE): self.log_start("server was shut down, sending hard reset... ") server = self._get_server_by_ip(self.main_ipv4) server.reboot("hard") self.log_end("done.") self.state = self.STARTING self.wait_for_ssh(check=True) self.send_keys() def _wait_stop(self): """ Wait for the system to shutdown and set state STOPPED afterwards. """ self.log_start("waiting for system to shutdown... ") dotlog = lambda: self.log_continue(".") wait_for_tcp_port(self.main_ipv4, 22, open=False, callback=dotlog) self.log_continue("[down]") self.state = self.STOPPED def stop(self): """ Stops the server by shutting it down without powering it off. """ if self.state not in (self.RESCUE, self.UP): return self.log_start("shutting down system... 
") self.run_command("systemctl halt", check=False) self.log_end("done.") self.state = self.STOPPING self._wait_stop() def get_ssh_name(self): assert self.main_ipv4 return self.main_ipv4 def get_ssh_password(self): if self.state == self.RESCUE: return self.rescue_passwd else: return None def _check(self, res): if not self.vm_id: res.exists = False return if self.state in (self.STOPPED, self.STOPPING): res.is_up = ping_tcp_port(self.main_ipv4, 22) if not res.is_up: self.state = self.STOPPED res.is_reachable = False return res.exists = True avg = self.get_load_avg() if avg is None: if self.state in (self.UP, self.RESCUE): self.state = self.UNREACHABLE res.is_reachable = False res.is_up = False elif self.run_command("test -f /etc/NIXOS", check=False) != 0: self.state = self.RESCUE self.ssh_pinged = True self._ssh_pinged_this_time = True res.is_reachable = True res.is_up = False else: res.is_up = True MachineState._check(self, res) def _destroy(self, server, wipe): if self.state != self.RESCUE: self.reboot_rescue(bootstrap=False, hard=True) if wipe: self.log_start("erasing all data on disk... ") # Let it run in the background because it will take a long time. cmd = "nohup shred /dev/[sh]d? &> /dev/null < /dev/null &" self.run_command(cmd) self.log_end("done. (backgrounded)") self.log_start("unsetting server name... ") server.set_name("") self.log_end("done.") self.log_start("removing admin account... ") server.admin.delete() self.log_start("done.") self.log("machine left in rescue, password: "******"{0}".format(self.rescue_passwd)) return True def destroy(self, wipe=False): if not self.vm_id: return True # Create the instance as early as possible because if we don't have the # needed credentials, we really don't have to even ask for confirmation. server = self._get_server_from_main_robot(self.main_ipv4) if wipe: question = "are you sure you want to completely erase {0}?" else: question = "are you sure you want to destroy {0}?" 
question_target = "Hetzner machine ‘{0}’".format(self.name) if not self.depl.logger.confirm(question.format(question_target)): return False return self._destroy(server, wipe)
class HcloudState(MachineState[HcloudDefinition]):
    """NixOps machine state for a Hetzner Cloud (hcloud) server.

    Tracks the server's identity, image, location, type, attached volumes
    and the SSH key material generated for it; talks to the Hetzner Cloud
    API through the ``hcloud`` client.
    """

    definition_type = HcloudDefinition

    state = attr_property("state", MachineState.MISSING, int)  # override
    public_ipv4 = attr_property("publicIpv4", None, str)
    token = attr_property("hcloud.token", None, str)
    image_id = attr_property("hcloud.image", None, int)
    location = attr_property("hcloud.location", None, str)
    server_type = attr_property("hcloud.serverType", None, str)
    upgrade_disk = attr_property("hcloud.upgradeDisk", False, bool)
    hw_info = attr_property("hcloud.hardwareInfo", None, str)
    ssh_keys = attr_property("hcloud.sshKeys", None, "json")
    volume_ids = attr_property("hcloud.volumeIds", None, "json")
    filesystems = attr_property("hcloud.filesystems", None, "json")
    _ssh_private_key = attr_property("hcloud.sshPrivateKey", None, str)
    _ssh_public_key = attr_property("hcloud.sshPublicKey", None, str)
    _public_host_key = attr_property("hcloud.publicHostKey", None, str)

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Lazily-created API client and last-fetched server, cached per
        # instance (see _client and _server below).
        self._cached_client: Optional[hcloud.Client] = None
        self._cached_server: Optional[BoundServer] = None

    @classmethod
    def get_type(cls) -> str:
        return "hcloud"

    @property
    def resource_id(self):
        return self.vm_id

    @property
    def _client(self) -> hcloud.Client:
        """Return a cached hcloud API client; requires a token to be set."""
        assert self.token
        if self._cached_client is None:
            self._cached_client = hcloud.Client(self.token)
        return self._cached_client

    @property
    def _server(self) -> BoundServer:
        """Return the bound server object, refetching if vm_id changed."""
        if self.vm_id is None:
            raise Exception("Server not created yet")
        if self._cached_server is None or self._cached_server.id != self.vm_id:
            self._cached_server = self._client.servers.get_by_id(self.vm_id)
        return cast(BoundServer, self._cached_server)

    def create(self, defn: HcloudDefinition, check, allow_reboot,
               allow_recreate):
        """Create or reconcile the VM to match 'defn'.

        Handles (in order): image/location drift warnings, in-place server
        type changes, SSH-key drift warnings, volume resolution and
        attachment, SSH keypair generation, and finally VM creation when
        no vm_id is recorded yet.
        """
        assert isinstance(defn, HcloudDefinition)
        hetzner = defn.config.hcloud
        self.token = get_access_token(hetzner)
        if self.state not in (MachineState.RESCUE, MachineState.UP) or check:
            self.check()

        self.set_common_state(defn)
        self.upgrade_disk = hetzner.upgradeDisk

        # TODO maybe bootstrap can be automated with vncdotool

        # Image and location cannot be changed after creation; warn on drift.
        image_id = self._fetch_image_id(hetzner.image, hetzner.image_selector)
        if self.image_id is None:
            self.image_id = image_id
        elif self.image_id != image_id:
            self.warn(
                f"image_id changed from {self.image_id} to {image_id} but can't update image of a VM."
            )
        if self.location is None:
            self.location = hetzner.location
        elif self.location != hetzner.location:
            self.warn(
                f"location changed from {self.location} to {hetzner.location} but can't update location of a VM."
            )

        if self.vm_id is not None and hetzner.serverType != self.server_type:
            # TODO Check if server can be upgraded before hitting the Hetzner API
            # https://docs.hetzner.cloud/#server-actions-change-the-type-of-a-server
            do_upgrade = True
            # Only confirm if upgrade_disk is True because then the upgrade can't be undone
            if self.upgrade_disk:
                do_upgrade = self.depl.logger.confirm(
                    f"are you sure you want to change Hetzner server {self.name} type from "
                    + f"{self.server_type} to {hetzner.serverType}?")
            if do_upgrade:
                # Type change requires the server to be powered off.
                self.log_start("Changing Hetzner server type...")
                self._server.shutdown().wait_until_finished()
                self.wait_for_down(callback=lambda: self.log_continue("."))
                self._server.change_type(
                    ServerType(name=hetzner.serverType),
                    upgrade_disk=self.upgrade_disk).wait_until_finished()
                self._server.power_on()
                self.wait_for_up(callback=lambda: self.log_continue("."))
                self.log_end("")
                self.server_type = hetzner.serverType

        # Keys may be given either as hcloud-sshkey resources or plain names.
        ssh_keys = [
            k.name if isinstance(k, ResourceEval) else k
            for k in hetzner.sshKeys
        ]
        if self.state != MachineState.MISSING and ssh_keys != self.ssh_keys:
            self.logger.warn(
                f"SSH keys cannot be changed after the server is created.")

        # Resolve each declared volume (by name or by resource) and build
        # the fileSystems entries for mounted ones.
        volume_ids = []
        filesystems = {}
        for volumeopts in hetzner.volumes:
            volume = volumeopts.volume
            if isinstance(volume, str):
                volume_model = self._client.volumes.get_by_name(volume)
                volume_name = volume
                volume_id = volume_model.id
                volume_loc = volume_model.location.name
            else:
                volume_res = self.depl.get_typed_resource(
                    volume._name, "hcloud-volume", HcloudVolumeState)
                volume_name = volume_res.name
                volume_id = volume_res.hcloud_id
                assert volume_id is not None
                volume_loc = volume_res.location
            if volume_loc != self.location:
                raise Exception(
                    f"Volume {volume_name!r} is in a different location from server {self.name!r}"
                )
            volume_ids.append(volume_id)
            if volumeopts.mountPoint is not None:
                fs = dict(volumeopts.fileSystem)
                # Device path follows Hetzner's stable by-id naming scheme.
                fs["device"] = f"/dev/disk/by-id/scsi-0HC_Volume_{volume_id}"
                filesystems[volumeopts.mountPoint] = fs

        # The keypair is generated once and stored; private and public key
        # must either both exist or both be absent.
        has_priv = self._ssh_private_key is not None
        has_pub = self._ssh_public_key is not None
        assert has_priv == has_pub
        if not has_priv:
            self.log("Generating SSH keypair...")
            (self._ssh_private_key, self._ssh_public_key) = create_key_pair()

        if self.vm_id:
            # Server already exists: reconcile volume attachments only.
            if self.volume_ids != volume_ids:
                current = set(self.volume_ids)
                new = set(volume_ids)
                volumes_client = self._client.volumes
                self.log_start("Updating volumes...")
                for v in current - new:
                    volumes_client.detach(Volume(id=v))
                    self.log_continue(".")
                for v in new - current:
                    volumes_client.attach(
                        Volume(id=v), self._server,
                        automount=False).wait_until_finished()
                    self.log_continue(".")
                self.log_end("")
                self.volume_ids = volume_ids
        else:
            self.log_start(
                "Creating Hetzner Cloud VM (" +
                f"image '{image_id}', type '{hetzner.serverType}', location '{hetzner.location}'"
                + ")...")
            response = self._client.servers.create(
                name=self.name,
                ssh_keys=[SSHKey(name=k) for k in ssh_keys],
                volumes=[Volume(id=v) for v in volume_ids],
                # NOTE(review): self.server_type may still be None on first
                # creation — within this class it is only assigned in the
                # type-change branch above and in _check. Presumably this
                # was meant to be hetzner.serverType (as the log message
                # suggests); confirm against the hcloud API behavior.
                server_type=ServerType(self.server_type),
                image=Image(id=self.image_id),
                # Set labels so we can find the instance if nixops crashes before writing vm_id
                labels=dict(self._server_labels()),
                user_data=None if self._ssh_public_key is None else yaml.dump(
                    {"public-keys": [self._ssh_public_key]}),
            )
            self.log_end("")
            self.public_ipv4 = response.server.public_net.ipv4.ip
            self.log_start("waiting for SSH...")
            self.wait_for_up(callback=lambda: self.log_continue("."))
            self.log_end("")
            with self.depl._db:
                self.vm_id = response.server.id
                # TODO get state from creation response
                self.state = MachineState.STARTING
                self.ssh_keys = ssh_keys
                self.volume_ids = volume_ids
            self._detect_hardware()
            self._update_host_keys()

        self.filesystems = filesystems

    def destroy(self, wipe=False):
        """Delete the VM after confirmation; wipe is not supported."""
        if self.vm_id is None:
            return True
        if wipe:
            self.warn("Wipe is not supported")
        if not self.depl.logger.confirm(
                f"are you sure you want to destroy Hetzner server {self.name}?"
        ):
            return False
        self.log_start("destroying Hetzner Cloud VM...")
        self._client.servers.delete(Server(id=self.vm_id))
        self.log_end("")
        self._reset()
        return True

    def get_ssh_flags(self, *args, **kwargs) -> List[str]:
        """Return SSH flags including the generated identity file."""
        key_file = self.get_ssh_private_key_file()
        assert key_file is not None
        flags = super().get_ssh_flags(*args, **kwargs) + [
            "-i",
            key_file,
        ]
        # TODO set host keys with cloud-init so we don't need to disable
        # host key checking on first deploy
        if self._public_host_key is None:
            flags.extend([
                "-o",
                "UserKnownHostsFile=/dev/null",
                "-o",
                "GlobalKnownHostsFile=/dev/null",
                "-o",
                "StrictHostKeyChecking=accept-new",
            ])
        return flags

    def get_ssh_private_key_file(self) -> Optional[str]:
        """Return (writing it to disk if needed) the private key file path."""
        if self._ssh_private_key_file:
            return self._ssh_private_key_file
        if self._ssh_private_key:
            return self.write_ssh_private_key(self._ssh_private_key)
        return None

    def get_ssh_name(self):
        """Return the address used for SSH (the public IPv4 address)."""
        assert self.public_ipv4
        return self.public_ipv4

    def get_physical_spec(self):
        """Extend the base spec with detected hardware and fileSystems."""
        spec = super().get_physical_spec()
        if self.hw_info:
            spec.setdefault("imports", []).append(nix2py(self.hw_info))
        if self.filesystems is not None:
            fs = spec.setdefault("config", {}).setdefault("fileSystems", {})
            fs.update(self.filesystems)
        return spec

    def _check(self, res):
        """Look up the server (by id, or by labels if vm_id is unknown)
        and sync the local state from the API response."""
        self.log_start("Looking up server...")
        if self.vm_id is None:
            # Recover a server created before vm_id was persisted, using
            # the labels set at creation time.
            label_selector = ",".join(f"{k}={v}"
                                      for k, v in self._server_labels())
            servers, _ = self._client.servers.get_list(
                label_selector=label_selector,
            )
            if len(servers) > 1:
                self.warn(f"Multiple servers matching {self.name} by labels")
            if len(servers) == 0:
                self.log_end("not found")
                res.exists = False
                return
            server: BoundServer = servers[0]
            self.vm_id = server.id
        else:
            try:
                server = self._client.servers.get_by_id(self.vm_id)
            except hcloud.APIException as e:
                if e.code == "not_found":
                    self.log_end("not found")
                    self._reset()
                    res.exists = False
                    return
                raise
        self.log_end("found")
        res.exists = True
        self._cached_server = server
        with self.depl._db:
            if self._public_host_key is None:
                self._update_host_keys()
            self.state = self._hcloud_status_to_machine_status(server.status)
            self.image_id = server.image.id
            self.volume_ids = [v.id for v in server.volumes]
            self.location = server.datacenter.location.name
            self.public_ipv4 = server.public_net.ipv4.ip
            self.server_type = server.server_type.name
        res.is_up = self.state == MachineState.UP
        if res.is_up:
            super()._check(res)

    def create_after(self, resources, defn):
        # SSH keys and volumes must exist before the server is created.
        return {
            r
            for r in resources
            if isinstance(r, (HcloudSshKeyState, HcloudVolumeState))
        }

    def _detect_hardware(self) -> None:
        """Run nixos-generate-config on the machine and store the result
        (with comment lines stripped) in hw_info."""
        self.log_start("detecting hardware...")
        cmd = "nixos-generate-config --show-hardware-config"
        hardware = self.run_command(cmd, capture_stdout=True)
        self.hw_info = "\n".join([
            line for line in hardware.splitlines()
            if not line.lstrip().startswith("#")
        ])
        self.log_end("")

    def _update_host_keys(self) -> None:
        """Fetch the machine's SSH host key and record it in known_hosts."""
        self.log_start("updating host keys...")
        cmd = f"cat /etc/ssh/ssh_host_{HOST_KEY_TYPE}_key.pub"
        self._public_host_key = str(self.run_command(
            cmd, capture_stdout=True)).strip()
        known_hosts.add(self.public_ipv4, self._public_host_key)
        self.log_end("")

    @staticmethod
    def _hcloud_status_to_machine_status(status: str) -> int:
        """Map an hcloud server status string to a MachineState constant."""
        # TODO check for rescue and unreachable
        try:
            return {
                Server.STATUS_OFF: MachineState.STOPPED,
                Server.STATUS_STOPPING: MachineState.STOPPING,
                Server.STATUS_STARTING: MachineState.STARTING,
                Server.STATUS_INIT: MachineState.STARTING,
                Server.STATUS_RUNNING: MachineState.UP,
                Server.STATUS_UNKNOWN: MachineState.UNKNOWN,
                Server.STATUS_DELETING: MachineState.STOPPING,
                Server.STATUS_MIGRATING: MachineState.STARTING,
                Server.STATUS_REBUILDING: MachineState.STARTING,
            }[status]
        except KeyError as e:
            raise Exception(f"Invalid server status {status!r}") from e

    def _fetch_image_id(self, image: Optional[int],
                        image_selector: str) -> int:
        """Return 'image' if given, else the newest image matching the
        label selector; raises if no image matches."""
        if image is None:
            self.log(f"Finding image matching {image_selector}...")
            matches, _ = self._client.images.get_list(
                label_selector=image_selector,
                sort="created:desc",
            )
            if len(matches) == 0:
                raise Exception(f"No images found matching {image_selector}")
            return matches[0].id
        else:
            return image

    def _server_labels(self) -> Iterable[Tuple[str, str]]:
        """Yield the labels that identify this server within the deployment."""
        assert self.depl
        yield "nixops/name", self.name
        yield "nixops/deployment", self.depl.uuid

    def _reset(self) -> None:
        """Forget all server-related state (after the VM is gone)."""
        assert self.depl
        if all((self.public_ipv4, self._public_host_key)):
            known_hosts.remove(self.public_ipv4, self._public_host_key)
        with self.depl._db:
            self.state = self.MISSING
            self.vm_id = None
            self.image_id = None
            self.location = None
            self.public_ipv4 = None
            self.server_type = None
            self.hw_info = None
            self._ssh_public_key = None
            self._ssh_private_key = None
            self._public_host_key = None