def build_cell_stems(self, in_shapes:TensorShapesList, conf_cell:Config,
                     cell_index:int)\
                        ->Tuple[TensorShapes, List[OpDesc]]:

    # expect two stems, both should have same channels
    # TODO: support multiple stems
    assert len(in_shapes) >= 2, "we must have outputs from at least two previous modules"

    # Get channels for previous two layers.
    # At start we have only one layer, i.e., model stems.
    # Typically model stems should have same channel count but for imagenet we do
    # reduction at model stem so stem1 will have twice channels as stem0
    p_ch_out = self.get_ch(in_shapes[-1][0])
    pp_ch_out = self.get_ch(in_shapes[-2][0])

    # was the previous layer a reduction layer?
    reduction_p = p_ch_out == pp_ch_out*2 or in_shapes[-2][0][2] == in_shapes[-1][0][2]*2

    # find out the node channels for this cell
    node_ch_out = self.node_channels[cell_index][0] # init with first node in cell

    # Cell stems will take prev channels and output same channels as nodes would.
    # If prev cell was reduction then we need to increase channels of prev-prev
    # by 2X. This is done by prepr_reduce stem.
    s0_op = OpDesc('prepr_reduce' if reduction_p else 'prepr_normal',
                   params={
                       'conv': ConvMacroParams(pp_ch_out, node_ch_out)
                   }, in_len=1, trainables=None)

    s1_op = OpDesc('prepr_normal',
                   params={
                       'conv': ConvMacroParams(p_ch_out, node_ch_out)
                   }, in_len=1, trainables=None)

    # output two shapes with proper channels setup
    # for default model desc, cell stems have same shapes and channels
    out_shape0 = copy.deepcopy(in_shapes[-1][0])
    # set channels and reset shapes to -1 to indicate unknown
    # for imagenet HxW would be floating point numbers due to one input reduced
    out_shape0[0], out_shape0[2], out_shape0[3] = node_ch_out, -1, -1

    out_shape1 = copy.deepcopy(out_shape0)

    return [out_shape0, out_shape1], [s0_op, s1_op]
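
# Illustrative sketch (not part of the original builder): how the reduction_p
# check in build_cell_stems behaves on concrete shapes. Shapes follow the
# 4-element [ch, -1, h, w] layout used above, where h/w are stored by
# build_model_stems as negative reduction fractions. The helper name and the
# values below are assumptions made up for this example only.
def _prev_was_reduction(pp_shape, p_shape):
    # previous cell counts as reduction if it doubled channels or halved HxW
    return p_shape[0] == pp_shape[0]*2 or pp_shape[2] == p_shape[2]*2

assert _prev_was_reduction([64, -1, -1.0, -1.0], [128, -1, -0.5, -0.5])
assert not _prev_was_reduction([64, -1, -1.0, -1.0], [64, -1, -1.0, -1.0])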
def _build_cell(self, cell_desc: CellDesc, rand_ops: RandOps) -> None:
    # sanity check: we have random ops for each node
    assert len(cell_desc.nodes()) == len(rand_ops.ops_and_ins)

    reduction = (cell_desc.cell_type == CellType.Reduction)

    # Add random op for each edge
    for node, (op_names, to_states) in zip(cell_desc.nodes(), rand_ops.ops_and_ins):
        # as we want cell to be completely random, remove previous edges
        node.edges.clear()

        # add random edges
        for op_name, to_state in zip(op_names, to_states):
            op_desc = OpDesc(op_name,
                             params={
                                 'conv': cell_desc.conv_params,
                                 'stride': 2 if reduction and to_state < 2 else 1
                             }, in_len=1, trainables=None, children=None)
            edge = EdgeDesc(op_desc, input_ids=[to_state])
            node.edges.append(edge)
def build_nodes(self, stem_shapes:TensorShapes, conf_cell:Config,
                cell_index:int, cell_type:CellType, node_count:int,
                in_shape:TensorShape, out_shape:TensorShape) \
                    ->Tuple[TensorShapes, List[NodeDesc]]:

    assert in_shape[0]==out_shape[0]

    reduction = (cell_type==CellType.Reduction)

    nodes:List[NodeDesc] = []
    conv_params = ConvMacroParams(in_shape[0], out_shape[0])

    # add xnas op for each edge
    for i in range(node_count):
        edges=[]
        for j in range(i+2):
            op_desc = OpDesc('xnas_op',
                             params={
                                 'conv': conv_params,
                                 'stride': 2 if reduction and j < 2 else 1
                             }, in_len=1, trainables=None, children=None)
            edge = EdgeDesc(op_desc, input_ids=[j])
            edges.append(edge)
        nodes.append(NodeDesc(edges=edges, conv_params=conv_params))

    out_shapes = [copy.deepcopy(out_shape) for _ in range(node_count)]

    return out_shapes, nodes
def build_nodes(self, stem_shapes:TensorShapes, conf_cell:Config,
                cell_index:int, cell_type:CellType, node_count:int,
                in_shape:TensorShape, out_shape:TensorShape) \
                    ->Tuple[TensorShapes, List[NodeDesc]]:

    assert in_shape[0]==out_shape[0]

    nodes:List[NodeDesc] = []
    conv_params = ConvMacroParams(in_shape[0], out_shape[0])

    for i in range(node_count):
        edges = []
        input_ids = []
        first_proj = False # if input node is connected then it needs projection
        if self._cell_matrix[0, i+1]: # nasbench internal node starts at 1
            input_ids.append(0) # connect to s0
            first_proj = True
        for j in range(i): # look at all internal vertices before us
            if self._cell_matrix[j+1, i+1]: # if there is connection
                input_ids.append(j+2) # offset because of s0, s1

        op_desc = OpDesc('nasbench101_op',
                         params={
                             'conv': conv_params,
                             'stride': 1,
                             'vertex_op': self._vertex_ops[i+1], # offset because of input node
                             'first_proj': first_proj
                         }, in_len=len(input_ids), trainables=None, children=None) # TODO: should we pass children here?
        edge = EdgeDesc(op_desc, input_ids=input_ids)
        edges.append(edge)
        nodes.append(NodeDesc(edges=edges, conv_params=conv_params))

    out_shapes = [copy.deepcopy(out_shape) for _ in range(node_count)]

    return out_shapes, nodes
def build_logits_op(self, in_shapes:TensorShapesList, conf_model_desc:Config)->OpDesc:
    n_classes = self.get_conf_dataset()['n_classes']

    return OpDesc('linear',
                  params={'n_ch': in_shapes[-1][0][0],
                          'n_classes': n_classes},
                  in_len=1, trainables=None)
def finalize(self) -> Tuple[OpDesc, Optional[float]]:
    with torch.no_grad(): # probably this is not needed
        l = self._flatten_ops_alphas()

        # select 3 largest ops by alpha
        sel = heapq.nlargest(3, l, key=lambda t: t[0]) # TODO: add config

    # multi_op needs to know each input and associated primitive
    final_op_desc = OpDesc(name='multi_op',
                           params={
                               # copy convolution parameters
                               'conv': self.desc.params['conv']
                           },
                           # Number of inputs remains same although only 3 of
                           # them will be used.
                           in_len=self.desc.in_len,
                           trainables=None,
                           # primitive's finalize call also records its
                           # weights in description. finalize call returns
                           # (desc, rank) where rank for primitive is None
                           children=[op.finalize()[0] for a, i, op in sel],
                           children_ins=[i for a, i, op in sel])

    # rank=None to indicate no further selection needed as in darts
    return final_op_desc, None
def __init__(self, op_desc: OpDesc, arch_params: Optional[ArchParams],
             affine: bool):
    super().__init__()

    # assume last PRIMITIVE is 'none'
    assert DivOp.PRIMITIVES[-1] == 'none'

    conf = get_conf()
    trainer = conf['nas']['search']['divnas']['archtrainer']
    finalizer = conf['nas']['search']['finalizer']

    if trainer == 'noalpha' and finalizer == 'default':
        raise NotImplementedError('noalpha trainer is not implemented for the default finalizer')

    if trainer != 'noalpha':
        self._setup_arch_params(arch_params)
    else:
        self._alphas = None

    self._ops = nn.ModuleList()
    for primitive in DivOp.PRIMITIVES:
        op = Op.create(
            OpDesc(primitive, op_desc.params, in_len=1, trainables=None),
            affine=affine, arch_params=None)
        self._ops.append(op)

    # various state variables for diversity
    self._collect_activations = False
    self._forward_counter = 0
    self._batch_activs = None
def build_nodes(self, stem_shapes:TensorShapes, conf_cell:Config,
                cell_index:int, cell_type:CellType, node_count:int,
                in_shape:TensorShape, out_shape:TensorShape) \
                    ->Tuple[TensorShapes, List[NodeDesc]]:

    assert in_shape[0] == out_shape[0]

    reduction = (cell_type == CellType.Reduction)
    ops = self._reduction_ops if reduction else self._normal_ops
    assert node_count == len(ops.ops_and_ins)

    nodes: List[NodeDesc] = []
    conv_params = ConvMacroParams(in_shape[0], out_shape[0])

    for op_names, to_states in ops.ops_and_ins:
        edges = []

        # add random edges
        for op_name, to_state in zip(op_names, to_states):
            op_desc = OpDesc(op_name,
                             params={
                                 'conv': conv_params,
                                 'stride': 2 if reduction and to_state < 2 else 1
                             }, in_len=1, trainables=None, children=None)
            edge = EdgeDesc(op_desc, input_ids=[to_state])
            edges.append(edge)

        nodes.append(NodeDesc(edges=edges, conv_params=conv_params))

    out_shapes = [copy.deepcopy(out_shape) for _ in range(node_count)]

    return out_shapes, nodes
def build_model_stems(self, in_shapes:TensorShapesList,
                      conf_model_desc:Config)->List[OpDesc]:
    # TODO: why do we need stem_multiplier?
    # TODO: in original paper stems are always affine
    conf_model_stems = self.get_conf_model_stems()

    init_node_ch:int = conf_model_stems['init_node_ch']
    stem_multiplier:int = conf_model_stems['stem_multiplier']
    ops:List[str] = conf_model_stems['ops']

    out_channels = init_node_ch*stem_multiplier

    conv_params = ConvMacroParams(self.get_ch(in_shapes[-1][0]), # channels of first input tensor
                                  init_node_ch*stem_multiplier)

    stems = [OpDesc(name=op_name, params={'conv': conv_params},
                    in_len=1, trainables=None) \
             for op_name in ops]

    # get reduction factors done by each stem, typically they should be same but for
    # imagenet they can differ
    stem_reductions = ModelDescBuilder._stem_reductions(stems)

    # Each cell takes input from previous and 2nd previous cells.
    # To be consistent we create two outputs for model stems: [[s1, s0], [s0, s1]]
    # This way when we access first element of each output we get s1, s0.
    # Normally s0==s1 but for networks like imagenet, s0 will have twice the channels
    # of s1.
    for stem_reduction in stem_reductions:
        in_shapes.append([[out_channels, -1, -1.0/stem_reduction, -1.0/stem_reduction]])

    return stems
def _build_cell(self, cell_desc: CellDesc) -> None:
    for i, node in enumerate(cell_desc.nodes()):
        input_ids = []
        first_proj = False # if input node is connected then it needs projection
        if self._cell_matrix[0, i+1]: # nasbench internal node starts at 1
            input_ids.append(0) # connect to s0
            first_proj = True
        for j in range(i): # look at all internal vertices before us
            if self._cell_matrix[j+1, i+1]: # if there is connection
                input_ids.append(j+2) # offset because of s0, s1

        op_desc = OpDesc('nasbench101_op',
                         params={
                             'conv': cell_desc.conv_params,
                             'stride': 1,
                             'vertex_op': self._vertex_ops[i+1], # offset because of input node
                             'first_proj': first_proj
                         }, in_len=len(input_ids), trainables=None, children=None) # TODO: should we pass children here?
        edge = EdgeDesc(op_desc, input_ids=input_ids)
        node.edges.append(edge)
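
# Illustrative sketch (not from the original source): how the adjacency-matrix
# indexing in the nasbench101 builders above maps to edge input_ids. Row/column
# 0 of the matrix is the cell input; internal vertices start at row/column 1,
# and internal vertex j feeds state id j+2 because ids 0 and 1 belong to the
# two cell stems s0, s1. The matrix and helper below are made up for the example.
import numpy as np

_example_matrix = np.array([[0, 1, 1],   # cell input -> vertex 1, vertex 2
                            [0, 0, 1],   # vertex 1 -> vertex 2
                            [0, 0, 0]])

def _example_input_ids(i:int)->list:
    ids = [0] if _example_matrix[0, i+1] else []                 # connection from cell input (s0)
    ids += [j+2 for j in range(i) if _example_matrix[j+1, i+1]]  # earlier internal vertices
    return ids

print(_example_input_ids(0))  # [0]    vertex 1 takes only the cell input
print(_example_input_ids(1))  # [0, 2] vertex 2 takes the cell input and vertex 1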
def build_nodes(self, stem_shapes:TensorShapes, conf_cell:Config,
                cell_index:int, cell_type:CellType, node_count:int,
                in_shape:TensorShape, out_shape:TensorShape) \
                    ->Tuple[TensorShapes, List[NodeDesc]]:
    # For petridish we add one node with identity to s1.
    # This will be our seed model to start with.
    # Later in PetridishSearcher, we will add one more node in parent after each sampling.

    assert in_shape[0] == out_shape[0]

    reduction = (cell_type == CellType.Reduction)

    # channels for conv filters
    conv_params = ConvMacroParams(in_shape[0], out_shape[0])

    # identity op to connect S1 to the node
    op_desc = OpDesc('skip_connect',
                     params={
                         'conv': conv_params,
                         'stride': 2 if reduction else 1
                     }, in_len=1, trainables=None, children=None)
    edge = EdgeDesc(op_desc, input_ids=[1])
    new_node = NodeDesc(edges=[edge], conv_params=conv_params)
    nodes = [new_node]

    # each node has same out channels as in channels
    out_shapes = [copy.deepcopy(out_shape) for _ in nodes]

    return out_shapes, nodes
def _build_cell(self, cell_desc: CellDesc) -> None:
    reduction = (cell_desc.cell_type == CellType.Reduction)

    self._ensure_nonempty_nodes(cell_desc)

    # we operate on last node, inserting another node before it
    new_nodes = [n.clone() for n in cell_desc.nodes()]

    petridish_node = NodeDesc(edges=[])
    new_nodes.insert(len(new_nodes)-1, petridish_node)

    input_ids = list(range(len(new_nodes))) # 2 + len-2
    assert len(input_ids) >= 2

    op_desc = OpDesc('petridish_reduction_op' if reduction else 'petridish_normal_op',
                     params={
                         'conv': cell_desc.conv_params,
                         # specify strides for each input, later we will
                         # give this to each primitive
                         '_strides': [2 if reduction and j < 2 else 1 \
                                      for j in input_ids],
                     }, in_len=len(input_ids), trainables=None, children=None)
    edge = EdgeDesc(op_desc, input_ids=input_ids)
    petridish_node.edges.append(edge)

    # note that post op will be recreated which means there is no
    # warm start for post op when number of nodes changes
    cell_desc.reset_nodes(new_nodes, cell_desc.node_ch_out,
                          cell_desc.post_op.name)
def seed(self, model_desc: ModelDesc) -> None:
    # for petridish we add one node with identity to s1
    # this will be our seed model
    for cell_desc in model_desc.cell_descs():
        node_count = len(cell_desc.nodes())
        assert node_count >= 1
        first_node = cell_desc.nodes()[0]
        # if there are no edges for 1st node, add identity to s1
        if len(first_node.edges) == 0:
            op_desc = OpDesc('skip_connect',
                             params={
                                 'conv': cell_desc.conv_params,
                                 'stride': 2 if cell_desc.cell_type == CellType.Reduction else 1
                             }, in_len=1, trainables=None, children=None)
            edge = EdgeDesc(op_desc, input_ids=[1])
            first_node.edges.append(edge)

        # remove empty nodes
        new_nodes = [n.clone() for n in cell_desc.nodes()
                     if len(n.edges) > 0]
        if len(new_nodes) != len(cell_desc.nodes()):
            cell_desc.reset_nodes(new_nodes, cell_desc.node_ch_out,
                                  cell_desc.post_op.name)

        self._ensure_nonempty_nodes(cell_desc)
def build_nodes(self, stem_shapes:TensorShapes, conf_cell:Config,
                cell_index:int, cell_type:CellType, node_count:int,
                in_shape:TensorShape, out_shape:TensorShape) \
                    ->Tuple[TensorShapes, List[NodeDesc]]:

    assert in_shape[0] == out_shape[0]

    reduction = (cell_type == CellType.Reduction)

    nodes: List[NodeDesc] = []
    conv_params = ConvMacroParams(in_shape[0], out_shape[0])

    # add div op for each edge in each node
    # How does stride work? For all ops connected to s0 and s1, we apply
    # reduction in WxH. All ops connected elsewhere automatically get
    # reduced WxH (because all subsequent states are derived from s0 and s1).
    # Note that channels are increased via conv_params for the cell
    for i in range(node_count):
        edges = []
        for j in range(i+2):
            op_desc = OpDesc('div_op',
                             params={
                                 'conv': conv_params,
                                 'stride': 2 if reduction and j < 2 else 1
                             }, in_len=1, trainables=None, children=None)
            edge = EdgeDesc(op_desc, input_ids=[j])
            edges.append(edge)
        nodes.append(NodeDesc(edges=edges, conv_params=conv_params))

    out_shapes = [copy.deepcopy(out_shape) for _ in range(node_count)]

    return out_shapes, nodes
def build_model_pool(self, in_shapes:TensorShapesList, conf_model_desc:Config)\
        ->OpDesc:
    model_post_op = conf_model_desc['model_post_op']
    last_shape = in_shapes[-1][0]

    in_shapes.append([copy.deepcopy(last_shape)])

    return OpDesc(model_post_op,
                  params={'conv': ConvMacroParams(last_shape[0], last_shape[0])},
                  in_len=1, trainables=None)
def _build_cell(self, cell_desc:CellDesc)->None:
    reduction = (cell_desc.cell_type==CellType.Reduction)

    # add xnas op for each edge
    for i, node in enumerate(cell_desc.nodes()):
        for j in range(i+2):
            op_desc = OpDesc('xnas_op',
                             params={
                                 'conv': cell_desc.conv_params,
                                 'stride': 2 if reduction and j < 2 else 1
                             }, in_len=1, trainables=None, children=None)
            edge = EdgeDesc(op_desc, input_ids=[j])
            node.edges.append(edge)
def __init__(self, op_desc: OpDesc, arch_params: Optional[ArchParams],
             affine: bool):
    super().__init__()

    vertex_op_name = op_desc.params['vertex_op']
    proj_first = op_desc.params['first_proj'] # first input needs projection

    self._vertex_op = Op.create(OpDesc(vertex_op_name, params=op_desc.params,
                                       in_len=1, trainables=None),
                                affine=affine, arch_params=None)

    self._in_len = op_desc.in_len

    self._proj_op = Op.create(OpDesc('convbnrelu_1x1', params=op_desc.params,
                                     in_len=1, trainables=None),
                              affine=affine, arch_params=None) \
                    if proj_first else None
def __init__(self, op_desc:OpDesc, arch_params:Optional[ArchParams],
             affine:bool):
    super().__init__()

    # assume last PRIMITIVE is 'none'
    assert GsOp.PRIMITIVES[-1] == 'none'

    self._gs_num_sample = op_desc.params['gs_num_sample']

    self._ops = nn.ModuleList()
    for primitive in GsOp.PRIMITIVES:
        op = Op.create(
            OpDesc(primitive, op_desc.params, in_len=1, trainables=None),
            affine=affine, arch_params=None)
        self._ops.append(op)

    # we do this at the end so that we can capture all arch params registered by
    # any previous child modules
    self._setup_arch_params(arch_params)
def __init__(self, op_desc: OpDesc, arch_params: Optional[ArchParams],
             reduction: bool, affine: bool):
    super().__init__()

    # assume last PRIMITIVE is 'none' (this is used for finalize)
    assert PetridishOp.PRIMITIVES[-1] == 'none'

    # create edges for the op, each edge connects an input state,
    # within each edge we will have all N primitives
    self._edges = nn.ModuleList()

    for i in range(op_desc.in_len):
        # edge contains all primitives with alphas
        edge = nn.ModuleList()
        self._edges.append(edge)

        # for each input stride could be different,
        # so we will make copy of our params and then set stride for this input
        params = deepcopy(op_desc.params)
        params['stride'] = op_desc.params['_strides'][i]

        # create primitives for the edge
        for primitive in PetridishOp.PRIMITIVES:
            primitive_op = Op.create(OpDesc(primitive, params=params,
                                            in_len=1, trainables=None),
                                     affine=affine, arch_params=None)
            # wrap primitive with stop gradient
            op = nn.Sequential(StopGradient(), primitive_op)
            edge.append(op)

    # TODO: check with Dey: Do we really need StopForwardReductionOp
    # or StopGradientReductionOp because these two will only make sense
    # for cell stems.
    # NOTE: Consider the case where prev_prev is normal and prev is reduction.
    # Then s_0 is twice as big in each dimension as s_1 and the number of channels
    # won't match. So you have to use StopGradientReductionOp on s_1 to make it match.
    self._sf = StopForward()

    # we do this at the end so that we can capture all arch params registered by
    # any previous child modules
    self._setup_arch_params(arch_params, op_desc.in_len)
def build_cell_post_op(self, stem_shapes:TensorShapes,
                       node_shapes:TensorShapes, conf_cell:Config, cell_index:int)\
        -> Tuple[TensorShape, OpDesc]:

    post_op_name = conf_cell['cell_post_op']
    op_ch_in, cell_ch_out, out_states = self._post_op_ch(post_op_name,
                                                         node_shapes)

    post_op_desc = OpDesc(post_op_name,
                          {
                              'conv': ConvMacroParams(op_ch_in, cell_ch_out),
                              'out_states': out_states
                          },
                          in_len=1, trainables=None, children=None)

    out_shape = copy.deepcopy(node_shapes[-1])
    out_shape[0] = cell_ch_out

    return out_shape, post_op_desc
def _build_cell(self, cell_desc: CellDesc) -> None:
    reduction = (cell_desc.cell_type == CellType.Reduction)

    # add mixed op for each edge in each node
    # How does stride work? For all ops connected to s0 and s1, we apply
    # reduction in WxH. All ops connected elsewhere automatically get
    # reduced WxH (because all subsequent states are derived from s0 and s1).
    # Note that channels are increased via conv_params for the cell
    for i, node in enumerate(cell_desc.nodes()):
        for j in range(i+2):
            op_desc = OpDesc('mixed_op',
                             params={
                                 'conv': cell_desc.conv_params,
                                 'stride': 2 if reduction and j < 2 else 1
                             }, in_len=1, trainables=None, children=None)
            edge = EdgeDesc(op_desc, input_ids=[j])
            node.edges.append(edge)
def finalize(self, sampled_weights) -> Tuple[OpDesc, Optional[float]]:
    # finalization where each edge gets to keep as many
    # unique operations as are sampled at the node level
    assert sampled_weights.shape[0] == len(GsOp.PRIMITIVES)

    # we can't handle empty op
    assert sampled_weights.bool().any()

    greater_than_0 = sampled_weights > 0

    children = []
    children_ins = []
    selected_alphas = []
    for i in range(greater_than_0.size()[0]):
        if greater_than_0[i]:
            children.append(self._ops[i].finalize()[0])
            selected_alphas.append(self._alphas[0][i].item())
            # all the ops will operate on the single node input
            children_ins.append(0)

    final_op_desc = OpDesc(name='multi_op',
                           params={
                               # copy convolution parameters
                               'conv': self.desc.params['conv']
                           },
                           # number of inputs remains same and in this
                           # case should be 1
                           in_len=self.desc.in_len,
                           trainables=None,
                           # primitive's finalize call also records its
                           # weights in description. finalize call returns
                           # (desc, rank) where rank for primitive is None
                           children=children,
                           children_ins=children_ins)

    max_alpha = 0.0
    if selected_alphas:
        max_alpha = max(selected_alphas)

    return final_op_desc, max_alpha
def __init__(self, op_desc: OpDesc, affine: bool) -> None:
    """MultiOp combines multiple ops into one op. The set of ops to combine
       is passed through op_desc.children and each child's input index is
       passed through op_desc.children_ins. This op receives a list of inputs
       in forward(); each child works on one of these inputs and generates an
       output. All outputs of children are then combined using a projection
       operation to produce the final output of the overall op.
    """
    super().__init__()

    # get list of inputs and associated primitives
    iop_descs = op_desc.children
    ins = op_desc.children_ins
    assert iop_descs is not None and ins is not None and \
           len(iop_descs) == len(ins)

    # conv params typically specified by macro builder
    conv_params: ConvMacroParams = op_desc.params['conv']

    self._ops = nn.ModuleList()
    self._ins: List[int] = []

    for i, iop_desc in zip(ins, iop_descs):
        iop_desc.params['conv'] = conv_params
        self._ops.append(Op.create(iop_desc, affine=affine))
        self._ins.append(i)

    # number of channels as we will concatenate output of ops
    ch_out_sum = conv_params.ch_out * len(self._ins)
    ch_adj_desc = OpDesc('proj_channels',
                         {
                             'conv': ConvMacroParams(ch_out_sum, conv_params.ch_out),
                             'out_states': len(self._ins)
                         },
                         in_len=1, trainables=None, children=None)
    self._ch_adj = Op.create(ch_adj_desc, affine=affine)
def _add_node(self, model_desc: ModelDesc,
              model_desc_builder: ModelDescBuilder) -> None:
    for ci, cell_desc in enumerate(model_desc.cell_descs()):
        reduction = (cell_desc.cell_type == CellType.Reduction)

        nodes = cell_desc.nodes()

        # petridish must seed with one node
        assert len(nodes) > 0
        # input/output channels for all nodes are same
        conv_params = nodes[0].conv_params

        # assign input IDs for the new node: s0 and s1 have IDs 0 and 1;
        # since we insert the new node just before the last one, it can take
        # input from s0, s1 and every node before it, i.e. len(nodes)+1 inputs
        input_ids = list(range(len(nodes) + 1))
        assert len(input_ids) >= 2 # 2 stem inputs

        op_desc = OpDesc('petridish_reduction_op' if reduction else 'petridish_normal_op',
                         params={
                             'conv': conv_params,
                             # specify strides for each input, later we will
                             # give this to each primitive
                             '_strides': [2 if reduction and j < 2 else 1 \
                                          for j in input_ids],
                         }, in_len=len(input_ids), trainables=None, children=None)
        edge = EdgeDesc(op_desc, input_ids=input_ids)
        new_node = NodeDesc(edges=[edge], conv_params=conv_params)
        nodes.insert(len(nodes)-1, new_node)

        # output shapes of all nodes are same
        node_shapes = cell_desc.node_shapes
        new_node_shape = copy.deepcopy(node_shapes[-1])
        node_shapes.insert(len(node_shapes)-1, new_node_shape)

        # post op needs rebuilding because number of inputs to it has changed
        # so input/output channels may be different
        post_op_shape, post_op_desc = model_desc_builder.build_cell_post_op(
            cell_desc.stem_shapes, node_shapes, cell_desc.conf_cell, ci)

        cell_desc.reset_nodes(nodes, node_shapes,
                              post_op_desc, post_op_shape)
def finalize(self) -> Tuple[OpDesc, Optional[float]]:
    # finalization where each edge gets to keep as many
    # unique operations as are sampled
    sample_storage = []
    for i in range(self._gs_num_sample):
        sampled = F.gumbel_softmax(self._alphas[0], tau=1, hard=True,
                                   eps=1e-10, dim=-1)
        sample_storage.append(sampled)

    samples_summed = torch.sum(torch.stack(sample_storage, dim=0), dim=0)
    greater_than_0 = samples_summed > 0

    children = []
    children_ins = []
    for i in range(greater_than_0.size()[0]):
        if greater_than_0[i]:
            children.append(self._ops[i].finalize()[0])
            # all the ops will operate on the single node input
            children_ins.append(0)

    final_op_desc = OpDesc(name='multi_op',
                           params={
                               # copy convolution parameters
                               'conv': self.desc.params['conv']
                           },
                           # number of inputs remains same and in this
                           # case should be 1
                           in_len=self.desc.in_len,
                           trainables=None,
                           # primitive's finalize call also records its
                           # weights in description. finalize call returns
                           # (desc, rank) where rank for primitive is None
                           children=children,
                           children_ins=children_ins)

    return final_op_desc, None
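
# Illustrative sketch (not from the original source): summing several hard
# Gumbel-softmax samples and thresholding at zero, as finalize() does above,
# keeps the union of all primitives selected in at least one sample. The alpha
# values and sample count below are made up for the example.
import torch
import torch.nn.functional as F

_alphas = torch.tensor([1.0, 0.1, 0.5, -2.0])   # one logit per primitive
_samples = [F.gumbel_softmax(_alphas, tau=1, hard=True, dim=-1) for _ in range(4)]
_summed = torch.sum(torch.stack(_samples, dim=0), dim=0)
print(_summed > 0)  # True for every primitive sampled at least once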
def __init__(self, op_desc: OpDesc, arch_params: Optional[ArchParams],
             affine: bool):
    super().__init__()

    # assume last PRIMITIVE is 'none'
    assert XnasOp.PRIMITIVES[-1] == 'none'

    self._ops = nn.ModuleList()
    for primitive in XnasOp.PRIMITIVES:
        op = Op.create(
            OpDesc(primitive, op_desc.params, in_len=1, trainables=None),
            affine=affine, arch_params=None)
        self._ops.append(op)

    # for getting gradients to non-leaf node
    self._is_first_call = True
    self._avg_grad_meter = AverageMeter()

    # we do this at the end so that we can capture all arch params registered by
    # any previous child modules
    self._setup_arch_params(arch_params)