def run(self, x, y=None):
    """
    Runs the model for a batch of examples.

    The correct labels are known during training, but not at test time.
    When correct labels are available, `y` is a (batch_size x 10) numpy
    array. Each row in the array is a one-hot vector encoding the correct
    class.

    Your model should predict a (batch_size x 10) numpy array of scores,
    where higher scores correspond to greater probability of the image
    belonging to a particular class. You should use `nn.SoftmaxLoss` as your
    training loss.

    Inputs:
        x: a (batch_size x 784) numpy array
        y: a (batch_size x 10) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 10) numpy array of scores (aka logits)
    """
    "*** YOUR CODE HERE ***"
    graph = nn.Graph([self.W1, self.W2, self.W3, self.W4, self.W5, self.W6])
    input_x = nn.Input(graph, x)

    # First branch: ReLU(x*W1 + x*W2) * W3
    xW1mult = nn.MatrixMultiply(graph, input_x, self.W1)
    xW2mult = nn.MatrixMultiply(graph, input_x, self.W2)
    addW1W2 = nn.Add(graph, xW1mult, xW2mult)
    relu1 = nn.ReLU(graph, addW1W2)
    reluMult = nn.MatrixMultiply(graph, relu1, self.W3)

    # Second branch: x*W4*W5
    xW4mult = nn.MatrixMultiply(graph, input_x, self.W4)
    W4W5mult = nn.MatrixMultiply(graph, xW4mult, self.W5)

    # Combine the branches and project to the 10 class scores.
    per2Add = nn.Add(graph, reluMult, W4W5mult)
    totalMult = nn.MatrixMultiply(graph, per2Add, self.W6)

    if y is not None:
        "*** YOUR CODE HERE ***"
        input_y = nn.Input(graph, y)
        loss_node = nn.SoftmaxLoss(graph, totalMult, input_y)
        return graph
    else:
        "*** YOUR CODE HERE ***"
        return graph.get_output(totalMult)
def run(self, x, y=None):
    """
    Runs the model for a batch of examples.

    The correct labels are known during training, but not at test time.
    When correct labels are available, `y` is a (batch_size x 10) numpy
    array. Each row in the array is a one-hot vector encoding the correct
    class.

    Your model should predict a (batch_size x 10) numpy array of scores,
    where higher scores correspond to greater probability of the image
    belonging to a particular class. You should use `nn.SoftmaxLoss` as your
    training loss.

    Inputs:
        x: a (batch_size x 784) numpy array
        y: a (batch_size x 10) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 10) numpy array of scores (aka logits)
    """
    "*** YOUR CODE HERE ***"
    graph = nn.Graph(self.w1_list + self.b1_list + self.w2_list +
                     self.b2_list + self.w3_list + self.b3_list)
    digit_losses = []
    for digit in range(0, 10):
        # One three-layer network per digit, producing a single score column.
        xInput = nn.Input(graph, x)
        layer1 = nn.MatrixMultiply(graph, xInput, self.w1_list[digit])
        layer2 = nn.MatrixVectorAdd(graph, layer1, self.b1_list[digit])
        layer3 = nn.ReLU(graph, layer2)
        layer4 = nn.MatrixMultiply(graph, layer3, self.w2_list[digit])
        layer5 = nn.MatrixVectorAdd(graph, layer4, self.b2_list[digit])
        layer6 = nn.ReLU(graph, layer5)
        layer7 = nn.MatrixMultiply(graph, layer6, self.w3_list[digit])
        layer8 = nn.MatrixVectorAdd(graph, layer7, self.b3_list[digit])

        # Place this digit's score into column `digit` of a (batch_size x 10)
        # matrix by multiplying with a one-hot basis row vector.
        basis_vector = np.zeros((1, 10))
        basis_vector[0][digit] = 1
        basis_vector_input = nn.Input(graph, basis_vector)
        digit_losses.append(
            nn.MatrixMultiply(graph, layer8, basis_vector_input))

        # Accumulate the per-digit columns into one score matrix.
        if digit == 1:
            digit_losses_matrix = nn.Add(graph, digit_losses[0],
                                         digit_losses[1])
        if digit > 1:
            digit_losses_matrix = nn.Add(graph, digit_losses_matrix,
                                         digit_losses[digit])

    if y is not None:
        "*** YOUR CODE HERE ***"
        yInput = nn.Input(graph, y)
        soft_max_layer = nn.SoftmaxLoss(graph, digit_losses_matrix, yInput)
        return graph
    else:
        "*** YOUR CODE HERE ***"
        return graph.get_output(digit_losses_matrix)
def run(self, xs):
    """
    Runs the model for a batch of examples.

    Although words have different lengths, our data processing guarantees
    that within a single batch, all words will be of the same length (L).

    Here `xs` will be a list of length L. Each element of `xs` will be a
    node with shape (batch_size x self.num_chars), where every row in the
    array is a one-hot vector encoding of a character. For example, if we
    have a batch of 8 three-letter words where the last word is "cat", then
    xs[1] will be a node that contains a 1 at position (7, 0). Here the
    index 7 reflects the fact that "cat" is the last word in the batch, and
    the index 0 reflects the fact that the letter "a" is the initial (0th)
    letter of our combined alphabet for this task.

    Your model should use a Recurrent Neural Network to summarize the list
    `xs` into a single node of shape (batch_size x hidden_size), for your
    choice of hidden_size. It should then calculate a node of shape
    (batch_size x 5) containing scores, where higher scores correspond to
    greater probability of the word originating from a particular language.

    Inputs:
        xs: a list with L elements (one per character), where each element
            is a node with shape (batch_size x self.num_chars)
    Returns:
        A node with shape (batch_size x 5) containing predicted scores
            (also called logits)
    """
    "*** YOUR CODE HERE ***"
    # Initial hidden state from the first character.
    h = nn.AddBias(nn.Linear(xs[0], self.w), self.b)
    h = nn.ReLU(h)
    # Fold the remaining characters into the hidden state.
    for i in range(1, len(xs)):
        h = nn.ReLU(nn.Add(nn.AddBias(nn.Linear(xs[i], self.w), self.b),
                           nn.AddBias(nn.Linear(h, self.w_hidden),
                                      self.b_hidden)))
    # Project the summary to the five language scores.
    return nn.AddBias(nn.Linear(h, self.w_last), self.b_last)
def f(h, x):
    if h is None:
        result = nn.Linear(x, self.w)
        temp = nn.Constant(numpy.ones([x.data.shape[0], result.data.shape[0]]))
        return nn.Linear(temp, result)
    else:
        return nn.Add(nn.Linear(x, self.w), nn.Linear(h, self.w_hidden))
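# A possible driver for the step function `f` above (a sketch under assumptions,
# not the author's code): it assumes `f` is defined inside run(self, xs) as shown,
# that `self.w_hidden` is square so the hidden state keeps its width, and that a
# hypothetical output parameter `self.w_output` of shape (hidden_size x 5) exists.
h = None
for x in xs:
    h = nn.ReLU(f(h, x))                  # fold each character into the hidden state
scores = nn.Linear(h, self.w_output)      # (batch_size x 5) logits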
def run(self, x, y=None):
    """
    Runs the model for a batch of examples.

    The correct outputs `y` are known during training, but not at test
    time. If correct outputs `y` are provided, this method must construct
    and return a nn.Graph for computing the training loss. If `y` is None,
    this method must instead return predicted y-values.

    Inputs:
        x: a (batch_size x 1) numpy array
        y: a (batch_size x 1) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 1) numpy array of predicted y-values

    Note: DO NOT call backprop() or step() inside this method!
    """
    # Set up the graph for f(x) = g(x) - g(-x), where
    # g(z) = ReLU(z*W1 + b1)*W2 + b2.
    oddRegressionGraph = nn.Graph([self.W1, self.b1, self.W2, self.b2])
    input_x = nn.Input(oddRegressionGraph, x)
    xW1 = nn.MatrixMultiply(oddRegressionGraph, input_x, self.W1)
    xW1_plus_b1 = nn.MatrixVectorAdd(oddRegressionGraph, xW1, self.b1)
    ReLU_1 = nn.ReLU(oddRegressionGraph, xW1_plus_b1)
    R1W2 = nn.MatrixMultiply(oddRegressionGraph, ReLU_1, self.W2)
    R1W2_plus_b2 = nn.MatrixVectorAdd(oddRegressionGraph, R1W2, self.b2)

    negx = nn.Input(oddRegressionGraph, x * -1)
    negxW1 = nn.MatrixMultiply(oddRegressionGraph, negx, self.W1)
    negxW1_plus_b1 = nn.MatrixVectorAdd(oddRegressionGraph, negxW1, self.b1)
    ReLU_2 = nn.ReLU(oddRegressionGraph, negxW1_plus_b1)
    R2W2 = nn.MatrixMultiply(oddRegressionGraph, ReLU_2, self.W2)
    R2W2_plus_b2 = nn.MatrixVectorAdd(oddRegressionGraph, R2W2, self.b2)

    # Negate g(-x) inside the graph (multiplying by a constant -1 input) so
    # gradients can flow back through this branch during training.
    neg_one = nn.Input(oddRegressionGraph, np.array([[-1.0]]))
    negR2W2_plus_b2 = nn.MatrixMultiply(oddRegressionGraph, R2W2_plus_b2,
                                        neg_one)
    sumMatrix = nn.Add(oddRegressionGraph, R1W2_plus_b2, negR2W2_plus_b2)

    if y is not None:
        # At training time, the correct output `y` is known. Construct a loss
        # node and return the graph; the loss must be the last node added.
        input_y = nn.Input(oddRegressionGraph, y)
        sumMatrix_SL_y = nn.SquareLoss(oddRegressionGraph, sumMatrix, input_y)
        return oddRegressionGraph
    else:
        # At test time, return the model's prediction as a numpy array.
        return oddRegressionGraph.get_output(sumMatrix)
def run(self, xs, y=None):
    """
    Runs the model for a batch of examples.

    Although words have different lengths, our data processing guarantees
    that within a single batch, all words will be of the same length (L).

    Here `xs` will be a list of length L. Each element of `xs` will be a
    (batch_size x self.num_chars) numpy array, where every row in the array
    is a one-hot vector encoding of a character. For example, if we have a
    batch of 8 three-letter words where the last word is "cat", we will
    have xs[1][7,0] == 1. Here the index 0 reflects the fact that the
    letter "a" is the initial (0th) letter of our combined alphabet for
    this task.

    The correct labels are known during training, but not at test time.
    When correct labels are available, `y` is a (batch_size x 5) numpy
    array. Each row in the array is a one-hot vector encoding the correct
    class.

    Your model should use a Recurrent Neural Network to summarize the list
    `xs` into a single node that represents a (batch_size x hidden_size)
    array, for your choice of hidden_size. It should then calculate a
    (batch_size x 5) numpy array of scores, where higher scores correspond
    to greater probability of the word originating from a particular
    language. You should use `nn.SoftmaxLoss` as your training loss.

    Inputs:
        xs: a list with L elements (one per character), where each element
            is a (batch_size x self.num_chars) numpy array
        y: a (batch_size x 5) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 5) numpy array of scores (aka logits)

    Hint: you may use the batch_size variable in your code
    """
    batch_size = xs[0].shape[0]
    "*** YOUR CODE HERE ***"
    # Initial hidden state: an all-zero (batch_size x hidden) variable.
    h = nn.Variable(batch_size, self.dimensionality)
    h.data = np.zeros((batch_size, self.dimensionality))
    g = nn.Graph([h, self.w1, self.w2, self.w3, self.b])

    # Recurrence: h <- ReLU(h*w1 + x*w2 + b) for each character.
    for x in xs:
        h1 = nn.MatrixMultiply(g, h, self.w1)
        x2 = nn.MatrixMultiply(g, nn.Input(g, x), self.w2)
        h1_add_x2 = nn.Add(g, h1, x2)
        add_b = nn.MatrixVectorAdd(g, h1_add_x2, self.b)
        relu = nn.ReLU(g, add_b)
        h = relu

    # Project the summary to the 5 language scores.
    result = nn.MatrixMultiply(g, h, self.w3)

    if y is not None:
        "*** YOUR CODE HERE ***"
        nn.SoftmaxLoss(g, result, nn.Input(g, y))
        return g
    else:
        "*** YOUR CODE HERE ***"
        return g.get_output(result)
def f(h, c):
    cw = nn.MatrixMultiply(graph, c, self.w_one)           # (batch_size x hidden_size)
    hw = nn.MatrixMultiply(graph, h, self.w_four)
    add = nn.Add(graph, cw, hw)                            # (batch_size x hidden_size)
    relu = nn.ReLU(graph, add)                             # (batch_size x hidden_size)
    reluw = nn.MatrixMultiply(graph, relu, self.w_three)   # (batch_size x 5)
    return reluw
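# A possible driver for `f` above (a sketch under assumptions, not the author's
# run method): it assumes the enclosing run(self, xs, ...) built `graph` with
# these parameters and that self.w_four has shape (5 x hidden_size), so each
# step's (batch_size x 5) output can be fed back in as `h`.
graph = nn.Graph([self.w_one, self.w_three, self.w_four])
batch_size = xs[0].shape[0]
h = nn.Input(graph, np.zeros((batch_size, 5)))   # initial summary, all zeros
for c in xs:
    h = f(h, nn.Input(graph, c))                 # one recurrence step per character
# `h` is now a (batch_size x 5) node of language scores.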
def run(self, xs):
    # Seed the recurrent summary from the first character.
    layer = nn.Linear(nn.DataNode(xs[0].data), self.weight[0])
    # Fold every character into the summary.
    for x in xs:
        layer = nn.ReLU(
            nn.AddBias(
                nn.Linear(nn.Add(nn.Linear(x, self.weight[0]), layer),
                          self.weight[1]),
                self.bias[1]))
    # Project the summary to the 5 language scores.
    return nn.AddBias(nn.Linear(layer, self.weight[2]), self.bias[2])
def run(self, xs):
    """
    Runs the model for a batch of examples.

    Although words have different lengths, our data processing guarantees
    that within a single batch, all words will be of the same length (L).

    Here `xs` will be a list of length L. Each element of `xs` will be a
    node with shape (batch_size x self.num_chars), where every row in the
    array is a one-hot vector encoding of a character. For example, if we
    have a batch of 8 three-letter words where the last word is "cat", then
    xs[1] will be a node that contains a 1 at position (7, 0). Here the
    index 7 reflects the fact that "cat" is the last word in the batch, and
    the index 0 reflects the fact that the letter "a" is the initial (0th)
    letter of our combined alphabet for this task.

    Your model should use a Recurrent Neural Network to summarize the list
    `xs` into a single node of shape (batch_size x hidden_size), for your
    choice of hidden_size. It should then calculate a node of shape
    (batch_size x 5) containing scores, where higher scores correspond to
    greater probability of the word originating from a particular language.

    Inputs:
        xs: a list with L elements (one per character), where each element
            is a node with shape (batch_size x self.num_chars)
    Returns:
        A node with shape (batch_size x 5) containing predicted scores
            (also called logits)
    """
    "*** YOUR CODE HERE ***"
    # Initial hidden state from the first character.
    xw1 = nn.Linear(xs[0], self.w1)        # 1x47 * 47x100 == 1x100
    xw1b1 = nn.AddBias(xw1, self.b1)       # 1x100 + 1x100
    reluxw1b1 = nn.ReLU(xw1b1)
    last_node = reluxw1b1

    # Fold the remaining characters into the hidden state.
    for i in range(1, len(xs)):
        hw = nn.Linear(last_node, self.w2)
        loop_xw1 = nn.Linear(xs[i], self.w1)
        loop_xw1b1 = nn.AddBias(loop_xw1, self.b1)
        loop_reluxw1b1 = nn.ReLU(loop_xw1b1)
        hw_plus_loop_reluxw1b1 = nn.Add(hw, loop_reluxw1b1)
        last_node = hw_plus_loop_reluxw1b1

    # Two-layer output head, then a final projection to the 5 scores.
    end_xw1 = nn.Linear(last_node, self.end_w1)
    end_xw1b1 = nn.AddBias(end_xw1, self.end_b1)       # 1x100 + 1x100
    end_reluxw1b1 = nn.ReLU(end_xw1b1)
    end_reluxw1b1w2 = nn.Linear(end_reluxw1b1, self.end_w2)
    end_reluxw1b1w2b2 = nn.AddBias(end_reluxw1b1w2, self.end_b2)  # 1x100 + 1x100
    end_reluxw1b1w2b2last = nn.ReLU(end_reluxw1b1w2b2)
    shrunkyclunk = nn.Linear(end_reluxw1b1w2b2last, self.shrinker)
    return shrunkyclunk
def run(self, xs, y=None):
    """
    Runs the model for a batch of examples.

    Although words have different lengths, our data processing guarantees
    that within a single batch, all words will be of the same length (L).

    Here `xs` will be a list of length L. Each element of `xs` will be a
    (batch_size x self.num_chars) numpy array, where every row in the array
    is a one-hot vector encoding of a character. For example, if we have a
    batch of 8 three-letter words where the last word is "cat", we will
    have xs[1][7,0] == 1. Here the index 0 reflects the fact that the
    letter "a" is the initial (0th) letter of our combined alphabet for
    this task.

    The correct labels are known during training, but not at test time.
    When correct labels are available, `y` is a (batch_size x 5) numpy
    array. Each row in the array is a one-hot vector encoding the correct
    class.

    Your model should use a Recurrent Neural Network to summarize the list
    `xs` into a single node that represents a (batch_size x hidden_size)
    array, for your choice of hidden_size. It should then calculate a
    (batch_size x 5) numpy array of scores, where higher scores correspond
    to greater probability of the word originating from a particular
    language. You should use `nn.SoftmaxLoss` as your training loss.

    Inputs:
        xs: a list with L elements (one per character), where each element
            is a (batch_size x self.num_chars) numpy array
        y: a (batch_size x 5) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 5) numpy array of scores (aka logits)

    Hint: you may use the batch_size variable in your code
    """
    batch_size = xs[0].shape[0]
    graph = nn.Graph([self.C_training, self.H_traing, self.m, self.b])

    # Initial hidden state: all zeros, shape (batch_size x hidden_size).
    H = np.zeros((batch_size, self.hidden_size))
    inputH = nn.Input(graph, H)

    # Recurrence: h <- ReLU(x*Wc + h*Wh) for each character.
    for X in xs:
        inputX = nn.Input(graph, X)
        CWx = nn.MatrixMultiply(graph, inputX, self.C_training)
        HWh = nn.MatrixMultiply(graph, inputH, self.H_traing)
        inputH = nn.ReLU(graph, nn.Add(graph, CWx, HWh))

    # Project the summary to the 5 language scores.
    xm = nn.MatrixMultiply(graph, inputH, self.m)
    xm_plus_b = nn.MatrixVectorAdd(graph, xm, self.b)

    if y is not None:
        # The docstring requires nn.SoftmaxLoss as the training loss.
        input_y = nn.Input(graph, y)
        nn.SoftmaxLoss(graph, xm_plus_b, input_y)
        return graph
    else:
        return graph.get_output(xm_plus_b)
def run(self, x, y=None):
    """
    Runs the model for a batch of examples.

    The correct outputs `y` are known during training, but not at test
    time. If correct outputs `y` are provided, this method must construct
    and return a nn.Graph for computing the training loss. If `y` is None,
    this method must instead return predicted y-values.

    Inputs:
        x: a (batch_size x 1) numpy array
        y: a (batch_size x 1) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 1) numpy array of predicted y-values

    Note: DO NOT call backprop() or step() inside this method!
    """
    "*** YOUR CODE HERE ***"
    graph = nn.Graph(self.variables)
    negative1 = -1 * np.ones((1, 1))
    input_x = nn.Input(graph, x)
    neg_1 = nn.Input(graph, negative1)

    # Positive branch: g(x) = ReLU(x*W1 + b1)*W2
    xw1 = nn.MatrixMultiply(graph, input_x, self.variables[0])
    sumxw1b1 = nn.MatrixVectorAdd(graph, xw1, self.variables[1])
    relu = nn.ReLU(graph, sumxw1b1)
    reluW2 = nn.MatrixMultiply(graph, relu, self.variables[2])

    # Negative branch: g(-x)
    negx = nn.MatrixMultiply(graph, input_x, neg_1)
    nxw1 = nn.MatrixMultiply(graph, negx, self.variables[0])
    sumnxw1 = nn.MatrixVectorAdd(graph, nxw1, self.variables[1])
    nrelu = nn.ReLU(graph, sumnxw1)
    nreluW2 = nn.MatrixMultiply(graph, nrelu, self.variables[2])

    # Negate g(-x).
    nsumNRW2b2 = nn.MatrixMultiply(graph, nreluW2, neg_1)

    # Add the two terms together: f(x) = g(x) - g(-x)
    totalSum = nn.Add(graph, reluW2, nsumNRW2b2)

    if y is not None:
        # At training time, the correct output `y` is known. Construct a loss
        # node and return the graph; the loss must be the last node added.
        "*** YOUR CODE HERE ***"
        input_y = nn.Input(graph, y)
        loss = nn.SquareLoss(graph, totalSum, input_y)
        return graph
    else:
        # At test time, return the model's prediction as a numpy array.
        "*** YOUR CODE HERE ***"
        return graph.get_output(totalSum)
def run(self, x, y=None):
    """
    TODO: Question 5 - [Application] OddRegression

    Runs the model for a batch of examples.

    The correct outputs `y` are known during training, but not at test
    time. If correct outputs `y` are provided, this method must construct
    and return a nn.Graph for computing the training loss. If `y` is None,
    this method must instead return predicted y-values.

    Inputs:
        x: a (batch_size x 1) numpy array
        y: a (batch_size x 1) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 1) numpy array of predicted y-values

    Note: DO NOT call backprop() or step() inside this method!
    """
    "*** YOUR CODE HERE ***"
    # Calculates g(x)
    graph = nn.Graph([self.w_one, self.b_one, self.w_two, self.b_two])
    input_x = nn.Input(graph, x)
    xw = nn.MatrixMultiply(graph, input_x, self.w_one)
    xw_plus_b = nn.MatrixVectorAdd(graph, xw, self.b_one)
    relu = nn.ReLU(graph, xw_plus_b)
    reluw = nn.MatrixMultiply(graph, relu, self.w_two)
    reluw_plus_b = nn.MatrixVectorAdd(graph, reluw, self.b_two)

    # Calculates g(-x)
    negone = nn.Input(graph, np.array([[-1.0]]))
    neg_x = nn.MatrixMultiply(graph, input_x, negone)
    negxw = nn.MatrixMultiply(graph, neg_x, self.w_one)
    negxw_plus_b = nn.MatrixVectorAdd(graph, negxw, self.b_one)
    negrelu = nn.ReLU(graph, negxw_plus_b)
    negreluw = nn.MatrixMultiply(graph, negrelu, self.w_two)
    negreluw_plus_b = nn.MatrixVectorAdd(graph, negreluw, self.b_two)

    # f(x) = g(x) - g(-x)
    negG = nn.MatrixMultiply(graph, negreluw_plus_b, negone)
    oddFunc = nn.Add(graph, reluw_plus_b, negG)

    if y is not None:
        # At training time, the correct output `y` is known. Construct a loss
        # node and return the graph; the loss must be the last node added.
        "*** YOUR CODE HERE ***"
        input_y = nn.Input(graph, y)
        loss = nn.SquareLoss(graph, oddFunc, input_y)
        return graph
    else:
        # At test time, return the model's prediction as a numpy array.
        "*** YOUR CODE HERE ***"
        return graph.get_output(oddFunc)
def run(self, x, y=None):
    """
    Runs the model for a batch of examples.

    The correct outputs `y` are known during training, but not at test
    time. If correct outputs `y` are provided, this method must construct
    and return a nn.Graph for computing the training loss. If `y` is None,
    this method must instead return predicted y-values.

    Inputs:
        x: a (batch_size x 1) numpy array
        y: a (batch_size x 1) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 1) numpy array of predicted y-values

    Note: DO NOT call backprop() or step() inside this method!
    """
    "*** YOUR CODE HERE ***"
    graph = nn.Graph([self.W1, self.W2, self.b1, self.b2])
    input_x = nn.Input(graph, x)
    xW1mult = nn.MatrixMultiply(graph, input_x, self.W1)
    b1add = nn.Add(graph, xW1mult, self.b1)
    relu = nn.ReLU(graph, b1add)
    W2reluMult = nn.MatrixMultiply(graph, relu, self.W2)
    lastAdd = nn.Add(graph, W2reluMult, self.b2)

    if y is not None:
        # At training time, the correct output `y` is known.
        # Here, you should construct a loss node, and return the nn.Graph
        # that the node belongs to. The loss node must be the last node
        # added to the graph.
        "*** YOUR CODE HERE ***"
        input_y = nn.Input(graph, y)
        loss_node = nn.SquareLoss(graph, lastAdd, input_y)
        return graph
    else:
        # At test time, the correct output is unknown.
        # You should instead return your model's prediction as a numpy array
        "*** YOUR CODE HERE ***"
        return graph.get_output(lastAdd)
def run(self, x, y=None):
    """
    Runs the model for a batch of examples.

    The correct outputs `y` are known during training, but not at test
    time. If correct outputs `y` are provided, this method must construct
    and return a nn.Graph for computing the training loss. If `y` is None,
    this method must instead return predicted y-values.

    Inputs:
        x: a (batch_size x 1) numpy array
        y: a (batch_size x 1) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 1) numpy array of predicted y-values

    Note: DO NOT call backprop() or step() inside this method!
    """
    graph = nn.Graph([self.w1, self.w2, self.b1, self.b2])

    # Positive branch: g(x) = ReLU(x*w1 + b1)*w2 + b2
    a_input_x = nn.Input(graph, x)
    a_mult1 = nn.MatrixMultiply(graph, a_input_x, self.w1)
    a_add1 = nn.MatrixVectorAdd(graph, a_mult1, self.b1)
    a_relu1 = nn.ReLU(graph, a_add1)
    a_mult2 = nn.MatrixMultiply(graph, a_relu1, self.w2)
    a_add2 = nn.MatrixVectorAdd(graph, a_mult2, self.b2)

    # Negative branch: g(-x)
    b_input_x = nn.Input(graph, np.dot(-1, x))
    b_mult1 = nn.MatrixMultiply(graph, b_input_x, self.w1)
    b_add1 = nn.MatrixVectorAdd(graph, b_mult1, self.b1)
    b_relu1 = nn.ReLU(graph, b_add1)
    b_mult2 = nn.MatrixMultiply(graph, b_relu1, self.w2)
    b_add2 = nn.MatrixVectorAdd(graph, b_mult2, self.b2)

    # Negate g(-x) by multiplying with -I, then form f(x) = g(x) - g(-x).
    b_output = graph.get_output(b_add2)
    neg = np.negative(np.identity(np.shape(b_output)[1]))
    neg_b_add2 = nn.Input(graph, neg)
    b_mult3 = nn.MatrixMultiply(graph, b_add2, neg_b_add2)
    result = nn.Add(graph, a_add2, b_mult3)

    if y is not None:
        input_y = nn.Input(graph, y)
        loss = nn.SquareLoss(graph, result, input_y)
        return graph
    else:
        # At test time, the correct output is unknown.
        # You should instead return your model's prediction as a numpy array
        return graph.get_output(result)
def run(self, x, y=None):
    """
    Runs the model for a batch of examples.

    The correct outputs `y` are known during training, but not at test
    time. If correct outputs `y` are provided, this method must construct
    and return a nn.Graph for computing the training loss. If `y` is None,
    this method must instead return predicted y-values.

    Inputs:
        x: a (batch_size x 1) numpy array
        y: a (batch_size x 1) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 1) numpy array of predicted y-values

    Note: DO NOT call backprop() or step() inside this method!
    """
    "*** YOUR CODE HERE ***"
    graph = nn.Graph([self.m1, self.b1, self.m2, self.b2])

    # g(x) = ReLU(x*m1 + b1)*m2 + b2
    input_x = nn.Input(graph, x)
    xm1 = nn.MatrixMultiply(graph, input_x, self.m1)
    xm_plus_b1 = nn.MatrixVectorAdd(graph, xm1, self.b1)
    rel = nn.ReLU(graph, xm_plus_b1)
    xm2 = nn.MatrixMultiply(graph, rel, self.m2)
    f = nn.MatrixVectorAdd(graph, xm2, self.b2)

    # g(-x)
    input_x_neg = nn.Input(graph, -x)
    xm1_neg = nn.MatrixMultiply(graph, input_x_neg, self.m1)
    xm_plus_b1_neg = nn.MatrixVectorAdd(graph, xm1_neg, self.b1)
    rel_neg = nn.ReLU(graph, xm_plus_b1_neg)
    xm2_neg = nn.MatrixMultiply(graph, rel_neg, self.m2)
    xm_plus_b2_neg = nn.MatrixVectorAdd(graph, xm2_neg, self.b2)

    # f(x) = g(x) - g(-x)
    minus_one = nn.Input(graph, np.array([[-1.0]]))
    minus_f = nn.MatrixMultiply(graph, xm_plus_b2_neg, minus_one)
    lastone = nn.Add(graph, f, minus_f)

    if y is not None:
        # At training time, the correct output `y` is known. Construct a loss
        # node and return the graph; the loss must be the last node added.
        "*** YOUR CODE HERE ***"
        input_y = nn.Input(graph, y)
        nn.SquareLoss(graph, lastone, input_y)
        return graph
    else:
        # At test time, return the model's prediction as a numpy array.
        "*** YOUR CODE HERE ***"
        return graph.get_output(lastone)
def run(self, xs):
    """
    Runs the model for a batch of examples.

    Although words have different lengths, our data processing guarantees
    that within a single batch, all words will be of the same length (L).

    Here `xs` will be a list of length L. Each element of `xs` will be a
    node with shape (batch_size x self.num_chars), where every row in the
    array is a one-hot vector encoding of a character. For example, if we
    have a batch of 8 three-letter words where the last word is "cat", then
    xs[1] will be a node that contains a 1 at position (7, 0). Here the
    index 7 reflects the fact that "cat" is the last word in the batch, and
    the index 0 reflects the fact that the letter "a" is the initial (0th)
    letter of our combined alphabet for this task.

    Your model should use a Recurrent Neural Network to summarize the list
    `xs` into a single node of shape (batch_size x hidden_size), for your
    choice of hidden_size. It should then calculate a node of shape
    (batch_size x 5) containing scores, where higher scores correspond to
    greater probability of the word originating from a particular language.

    Inputs:
        xs: a list with L elements (one per character), where each element
            is a node with shape (batch_size x self.num_chars)
    Returns:
        A node with shape (batch_size x 5) containing predicted scores
            (also called logits)
    """
    "*** YOUR CODE HERE ***"
    # batch_size x feature_size * feature_size x layer_size = batch_size x layer_size
    latest_prediction = self.getInitialPrediction(xs[0])
    latest_prediction = nn.ReLU(latest_prediction)

    for x in xs[1:]:
        # (batch_size x feature_size * feature_size x layer_size) +
        #   (batch_size x layer_size * self.network_weights[1]) = batch_size x layer_size,
        # because both operands of Add must have the same dimensions. From that we can
        # see that (batch_size x layer_size * self.network_weights[1]) must also be
        # batch_size x layer_size, so self.network_weights[1] is layer_size x layer_size.
        #
        # After that there are two ways to return batch_size x 5. The first is for the
        # first- and second-layer weights to be 47x5 and 5x5, but that forces exactly
        # 5 units in each layer, which isn't optimal. The second is to add one more
        # layer that converts the previous matrices to batch_size x 5; then the first
        # layer can be 47xN, the second NxN, and the third Nx5.
        initial_prediction = self.getInitialPrediction(x)
        latest_prediction = nn.Add(
            initial_prediction,
            nn.Linear(latest_prediction, self.network_weights[1]))
        latest_prediction = nn.ReLU(latest_prediction)

    final_prediction = nn.Linear(latest_prediction, self.network_weights[2])
    return final_prediction
def run(self, x, y=None):
    """
    Runs the model for a batch of examples.

    The correct outputs `y` are known during training, but not at test
    time. If correct outputs `y` are provided, this method must construct
    and return a nn.Graph for computing the training loss. If `y` is None,
    this method must instead return predicted y-values.

    Inputs:
        x: a (batch_size x 1) numpy array
        y: a (batch_size x 1) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 1) numpy array of predicted y-values

    Note: DO NOT call backprop() or step() inside this method!
    """
    "*** YOUR CODE HERE ***"
    # set the trainable variables here
    # in the first try, we will do 2 layers
    # f(x) = W2 * ReLU(W1 * x + b1) + b2
    # size of each variable:
    #   x:  i * 1
    #   W1: h * i, b1: h * 1
    #   W2: i * h, b2: i * 1
    graph = nn.Graph([self.W1, self.W2, self.b1, self.b2])
    input_x = nn.Input(graph, x)
    mul_1 = nn.MatrixMultiply(graph, self.W1, input_x)
    add_1 = nn.MatrixVectorAdd(graph, mul_1, self.b1)
    # add_1 = nn.Add(graph, mul_1, self.b1)
    relu_1 = nn.ReLU(graph, add_1)
    mul_2 = nn.MatrixMultiply(graph, self.W2, relu_1)
    add_2 = nn.Add(graph, mul_2, self.b2)

    if y is not None:
        # At training time, the correct output `y` is known.
        # Here, you should construct a loss node, and return the nn.Graph
        # that the node belongs to. The loss node must be the last node
        # added to the graph.
        "*** YOUR CODE HERE ***"
        input_y = nn.Input(graph, y)
        loss = nn.SquareLoss(graph, add_2, input_y)
        return graph
    else:
        # At test time, the correct output is unknown.
        # You should instead return your model's prediction as a numpy array
        "*** YOUR CODE HERE ***"
        return graph.get_output(add_2)
def run(self, xs):
    current_output = nn.AddBias(nn.Linear(xs[0], self.W), self.b)
    current_output = nn.ReLU(current_output)
    for i in range(1, len(xs)):
        current_output = nn.AddBias(
            nn.Add(nn.Linear(xs[i], self.W),
                   nn.Linear(current_output, self.W_hidden)),
            self.b)
        current_output = nn.ReLU(current_output)
    output = nn.AddBias(nn.Linear(current_output, self.W_last), self.b_last)
    return output
def run(self, x, y=None):
    """
    TODO: Question 4 - [Application] Regression

    Runs the model for a batch of examples.

    The correct outputs `y` are known during training, but not at test
    time. If correct outputs `y` are provided, this method must construct
    and return a nn.Graph for computing the training loss. If `y` is None,
    this method must instead return predicted y-values.

    Inputs:
        x: a (batch_size x 1) numpy array
        y: a (batch_size x 1) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 1) numpy array of predicted y-values

    Note: DO NOT call backprop() or step() inside this method!
    """
    "*** YOUR CODE HERE ***"
    if y is not None:
        "*** YOUR CODE HERE ***"
        w1 = nn.Variable(len(x), len(x))
        w2 = nn.Variable(len(x), len(x))
        b1 = nn.Variable(len(x), 1)
        b2 = nn.Variable(len(x), 1)
        self.nodes = nn.Graph([w1, w2, b1, b2])
        input_x = nn.Input(self.nodes, x)
        xw1 = nn.MatrixMultiply(self.nodes, w1, input_x)
        xw1_plus_b1 = nn.MatrixVectorAdd(self.nodes, xw1, b1)
        relu_xw1b1 = nn.ReLU(self.nodes, xw1_plus_b1)
        input_y = nn.Input(self.nodes, y)
        loss1 = nn.SquareLoss(self.nodes, relu_xw1b1, input_y)
        xw2 = nn.MatrixMultiply(self.nodes, w2, input_x)
        xw2_plus_b2 = nn.MatrixVectorAdd(self.nodes, xw2, b2)
        relu_xw2b2 = nn.ReLU(self.nodes, xw2_plus_b2)
        loss2 = nn.SquareLoss(self.nodes, relu_xw2b2, input_y)
        nn.Add(self.nodes, loss1, loss2)
        return self.nodes
    else:
        "*** YOUR CODE HERE ***"
        pred1 = self.nodes.get_output(self.nodes.get_nodes()[-7])
        return pred1
def run(self, xs, y=None):
    batch_size = xs[0].shape[0]
    graph = nn.Graph([
        self.w1, self.b1, self.w2, self.b2, self.w3, self.w3_f, self.b2_f
    ])
    # Tile the learned initial hidden state across the batch.
    h = nn.Input(graph, np.tile(self.h, (batch_size, 1)))
    "*** YOUR CODE HERE ***"
    for element in xs[:len(xs) - 1]:
        x_graph = nn.Input(graph, element)
        mm_x = nn.MatrixMultiply(graph, x_graph, self.w1)
        hm = nn.MatrixMultiply(graph, h, self.w2)
        hm_x = nn.Add(graph, mm_x, hm)
        mva_b1 = nn.MatrixVectorAdd(graph, hm_x, self.b1)
        rl_b1 = nn.ReLU(graph, mva_b1)
        mm_w3 = nn.MatrixMultiply(graph, rl_b1, self.w3)
        xm_plus_b_w2_b2 = nn.MatrixVectorAdd(graph, mm_w3, self.b2)
        h = nn.ReLU(graph, xm_plus_b_w2_b2)

    # For the last element, use w3_f and b2_f to converge to (batch_size x 5).
    x_graph = nn.Input(graph, xs[len(xs) - 1])
    mm_x = nn.MatrixMultiply(graph, x_graph, self.w1)
    hm = nn.MatrixMultiply(graph, h, self.w2)
    hm_x = nn.Add(graph, mm_x, hm)
    mva_b1 = nn.MatrixVectorAdd(graph, hm_x, self.b1)
    rl_b1 = nn.ReLU(graph, mva_b1)
    mm_w3 = nn.MatrixMultiply(graph, rl_b1, self.w3_f)
    xm_plus_b_w2_b2 = nn.MatrixVectorAdd(graph, mm_w3, self.b2_f)
    h = xm_plus_b_w2_b2

    if y is not None:
        input_y = nn.Input(graph, y)
        loss = nn.SquareLoss(graph, h, input_y)
        return graph
    else:
        "*** YOUR CODE HERE ***"
        return graph.get_output(h)
def run(self, x, y=None):
    """
    Runs the model for a batch of examples.

    The correct outputs `y` are known during training, but not at test
    time. If correct outputs `y` are provided, this method must construct
    and return a nn.Graph for computing the training loss. If `y` is None,
    this method must instead return predicted y-values.

    Inputs:
        x: a (batch_size x 1) numpy array
        y: a (batch_size x 1) numpy array, or None
    Output:
        (if y is not None) A nn.Graph instance, where the last added node is
            the loss
        (if y is None) A (batch_size x 1) numpy array of predicted y-values

    Note: DO NOT call backprop() or step() inside this method!
    """
    "*** YOUR CODE HERE ***"
    g = nn.Graph([self.w1, self.b1, self.w2, self.b2])
    # Constant -1, provided as an Input so it is not treated as a trainable
    # variable of the graph.
    ide = nn.Input(g, -np.identity(1))

    # g(x)
    x1 = nn.MatrixMultiply(g, nn.Input(g, x), self.w1)
    x1_add_b1 = nn.MatrixVectorAdd(g, x1, self.b1)
    relu = nn.ReLU(g, x1_add_b1)
    x2 = nn.MatrixMultiply(g, relu, self.w2)
    x2_add_b2 = nn.MatrixVectorAdd(g, x2, self.b2)

    # g(-x)
    n_x1 = nn.MatrixMultiply(g, nn.Input(g, -x), self.w1)
    n_x1_add_b1 = nn.MatrixVectorAdd(g, n_x1, self.b1)
    n_relu = nn.ReLU(g, n_x1_add_b1)
    n_x2 = nn.MatrixMultiply(g, n_relu, self.w2)
    n_x2_add_b2 = nn.MatrixVectorAdd(g, n_x2, self.b2)

    # f(x) = g(x) - g(-x)
    n_x2_add_b2 = nn.MatrixMultiply(g, n_x2_add_b2, ide)
    f = nn.Add(g, x2_add_b2, n_x2_add_b2)

    if y is not None:
        # At training time, the correct output `y` is known. Construct a loss
        # node and return the graph; the loss must be the last node added.
        "*** YOUR CODE HERE ***"
        nn.SquareLoss(g, f, nn.Input(g, y))
        return g
    else:
        # At test time, return the model's prediction as a numpy array.
        "*** YOUR CODE HERE ***"
        return g.get_output(f)
def run(self, xs):
    """
    Runs the model for a batch of examples.

    Although words have different lengths, our data processing guarantees
    that within a single batch, all words will be of the same length (L).

    Here `xs` will be a list of length L. Each element of `xs` will be a
    node with shape (batch_size x self.num_chars), where every row in the
    array is a one-hot vector encoding of a character. For example, if we
    have a batch of 8 three-letter words where the last word is "cat", then
    xs[1] will be a node that contains a 1 at position (7, 0). Here the
    index 7 reflects the fact that "cat" is the last word in the batch, and
    the index 0 reflects the fact that the letter "a" is the initial (0th)
    letter of our combined alphabet for this task.

    Your model should use a Recurrent Neural Network to summarize the list
    `xs` into a single node of shape (batch_size x hidden_size), for your
    choice of hidden_size. It should then calculate a node of shape
    (batch_size x 5) containing scores, where higher scores correspond to
    greater probability of the word originating from a particular language.

    Inputs:
        xs: a list with L elements (one per character), where each element
            is a node with shape (batch_size x self.num_chars)
    Returns:
        A node with shape (batch_size x 5) containing predicted scores
            (also called logits)
    """
    "*** YOUR CODE HERE ***"
    initialx = xs[0]
    rest = xs[1:]

    def firstNet(xi):
        xw = nn.Linear(xi, self.w)
        xwb0 = nn.AddBias(xw, self.b0)
        return nn.ReLU(xwb0)

    def secondNet(hi):
        hWhidden = nn.Linear(hi, self.wHidden)
        return nn.ReLU(nn.AddBias(hWhidden, self.b1))

    hi = firstNet(initialx)
    for xi in rest:
        # hi is of size (batch_size x hidden_size)
        # also add ReLU(nn.Add...)? plus a bias of size (1 x h)?
        hi = nn.Add(firstNet(xi), secondNet(hi))

    return nn.ReLU(nn.AddBias(nn.Linear(hi, self.wOutput), self.b2))
def run(self, xs):
    """
    Runs the model for a batch of examples.

    Although words have different lengths, our data processing guarantees
    that within a single batch, all words will be of the same length (L).

    Here `xs` will be a list of length L. Each element of `xs` will be a
    node with shape (batch_size x self.num_chars), where every row in the
    array is a one-hot vector encoding of a character. For example, if we
    have a batch of 8 three-letter words where the last word is "cat", then
    xs[1] will be a node that contains a 1 at position (7, 0). Here the
    index 7 reflects the fact that "cat" is the last word in the batch, and
    the index 0 reflects the fact that the letter "a" is the initial (0th)
    letter of our combined alphabet for this task.

    Your model should use a Recurrent Neural Network to summarize the list
    `xs` into a single node of shape (batch_size x hidden_size), for your
    choice of hidden_size. It should then calculate a node of shape
    (batch_size x 5) containing scores, where higher scores correspond to
    greater probability of the word originating from a particular language.

    Inputs:
        xs: a list with L elements (one per character), where each element
            is a node with shape (batch_size x self.num_chars)
    Returns:
        A node with shape (batch_size x 5) containing predicted scores
            (also called logits)
    """
    "*** YOUR CODE HERE ***"
    x = xs[0]
    x_w1 = nn.Linear(x, self.weights1)        # (bs x numchars) * (numchars x hidden) = (bs x hidden)
    xw1_b1_sum = nn.AddBias(x_w1, self.bias1) # (bs x hidden)
    relu = nn.ReLU(xw1_b1_sum)                # (bs x hs)
    h_n = nn.AddBias(nn.Linear(relu, self.weights2),
                     self.bias2)              # (bs x hs) * (hs x 5) = (bs x 5), plus (1 x 5) bias

    for x in xs[1:]:
        x_w1 = nn.Add(nn.Linear(x, self.weights1),
                      nn.Linear(h_n, self.weights3))  # (bs x hs) + (bs x 5)*(5 x hs) = (bs x hs)
        xw1_b1_sum = nn.AddBias(x_w1, self.bias1)     # (bs x hs)
        relu = nn.ReLU(xw1_b1_sum)
        h_n = nn.AddBias(nn.Linear(relu, self.weights2), self.bias2)

    return h_n
def helper_function(self, graph, h, c):
    # This function calculates the feature f(h, c):
    #   f = ReLU(h * W1 + c * W2 + b1) * W3 + b2
    # Sizes: h is (batch_size x h1), c is (batch_size x 47),
    #   W1: h1 x h2, W2: 47 x h2, b1: 1 x h2, W3: h2 x h1, b2: 1 x h1
    mul_1 = nn.MatrixMultiply(graph, h, self.W1)
    mul_2 = nn.MatrixMultiply(graph, c, self.W2)
    add_1 = nn.Add(graph, mul_1, mul_2)
    add_2 = nn.MatrixVectorAdd(graph, add_1, self.b1)
    relu_1 = nn.ReLU(graph, add_2)   # apply ReLU after adding the bias b1
    mul_3 = nn.MatrixMultiply(graph, relu_1, self.W3)
    add_3 = nn.MatrixVectorAdd(graph, mul_3, self.b2)
    return add_3
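# A possible driver for helper_function above (a sketch under assumptions, not
# the author's run method). It assumes `xs` is the list of per-character arrays,
# a zero initial hidden state of width self.h1 (assumed attribute), and a
# hypothetical output parameter self.W_out of shape (h1 x 5).
graph = nn.Graph([self.W1, self.W2, self.b1, self.W3, self.b2, self.W_out])
batch_size = xs[0].shape[0]
h = nn.Input(graph, np.zeros((batch_size, self.h1)))       # initial hidden state
for c in xs:
    h = self.helper_function(graph, h, nn.Input(graph, c)) # one step per character
scores = nn.MatrixMultiply(graph, h, self.W_out)           # (batch_size x 5) logits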
def f(h, c):
    if h is None:
        # First step: tile the learned initial hidden state h0 across the batch.
        ones = nn.Input(graph, np.ones([batch_size, 1]))
        in_h = nn.MatrixMultiply(graph, ones, self.h0)
    else:
        in_h = h
    input_c = nn.Input(graph, c)
    c_mul_w = nn.MatrixMultiply(graph, input_c, self.w)   # batch_size x d
    h_mul_v = nn.MatrixMultiply(graph, in_h, self.v)
    relu1 = nn.ReLU(graph, nn.Add(graph, c_mul_w, h_mul_v))
    return relu1
def run(self, xs):
    """
    Runs the model for a batch of examples.

    Although words have different lengths, our data processing guarantees
    that within a single batch, all words will be of the same length (L).

    Here `xs` will be a list of length L. Each element of `xs` will be a
    node with shape (batch_size x self.num_chars), where every row in the
    array is a one-hot vector encoding of a character. For example, if we
    have a batch of 8 three-letter words where the last word is "cat", then
    xs[1] will be a node that contains a 1 at position (7, 0). Here the
    index 7 reflects the fact that "cat" is the last word in the batch, and
    the index 0 reflects the fact that the letter "a" is the initial (0th)
    letter of our combined alphabet for this task.

    Your model should use a Recurrent Neural Network to summarize the list
    `xs` into a single node of shape (batch_size x hidden_size), for your
    choice of hidden_size. It should then calculate a node of shape
    (batch_size x 5) containing scores, where higher scores correspond to
    greater probability of the word originating from a particular language.

    Inputs:
        xs: a list with L elements (one per character), where each element
            is a node with shape (batch_size x self.num_chars)
    Returns:
        A node with shape (batch_size x 5) containing predicted scores
            (also called logits)
    """
    "*** YOUR CODE HERE ***"
    word_length = len(xs)
    hidden_output = None
    current_index = 0
    while current_index < word_length:
        initial_output = self.initial_network.predict(xs[current_index])
        if hidden_output is None:
            hidden_output = initial_output
        else:
            intermediate_output = self.hidden_network.predict(hidden_output)
            hidden_output = nn.Add(initial_output, intermediate_output)
        current_index += 1
    final_prediction = self.final_network.predict(hidden_output)
    return final_prediction
def run(self, xs):
    """
    Runs the model for a batch of examples.

    Although words have different lengths, our data processing guarantees
    that within a single batch, all words will be of the same length (L).

    Here `xs` will be a list of length L. Each element of `xs` will be a
    node with shape (batch_size x self.num_chars), where every row in the
    array is a one-hot vector encoding of a character. For example, if we
    have a batch of 8 three-letter words where the last word is "cat", then
    xs[1] will be a node that contains a 1 at position (7, 0). Here the
    index 7 reflects the fact that "cat" is the last word in the batch, and
    the index 0 reflects the fact that the letter "a" is the initial (0th)
    letter of our combined alphabet for this task.

    Your model should use a Recurrent Neural Network to summarize the list
    `xs` into a single node of shape (batch_size x hidden_size), for your
    choice of hidden_size. It should then calculate a node of shape
    (batch_size x 5) containing scores, where higher scores correspond to
    greater probability of the word originating from a particular language.

    Inputs:
        xs: a list with L elements (one per character), where each element
            is a node with shape (batch_size x self.num_chars)
    Returns:
        A node with shape (batch_size x 5) containing predicted scores
            (also called logits)
    """
    "*** YOUR CODE HERE ***"
    # One layer: based on the architecture, f(x) = relu(x*w1 + b1), except
    # z_i = x_i*w + h_i*w_hidden, so it is more like f(z) = relu(z + b1).

    # z0 = x0 * w
    z_0 = nn.Linear(xs[0], self.w)
    z_0PlusB1 = nn.AddBias(z_0, self.b)
    relu = nn.ReLU(z_0PlusB1)

    # compute first h
    h_i = relu
    for x in xs[1:]:
        xw = nn.Linear(x, self.w)
        hw = nn.Linear(h_i, self.w_hidden)
        # z_i = x_i*w + h_i * w_hidden
        z_i = nn.Add(xw, hw)
        addBias = nn.AddBias(z_i, self.b)
        h_i = nn.ReLU(addBias)

    return nn.Linear(h_i, self.w_final)
def check_graph_accumulator(tracker):
    # A more thorough test that now requires gradient accumulators to be working
    import nn

    v1 = nn.Variable(1, 5)
    v1_data = np.ones_like(v1.data) / 10
    v1.data = v1_data

    graph = nn.Graph([v1])
    adder = nn.Add(graph, v1, v1)

    assert graph.get_nodes() == [v1, adder], \
        "Not all nodes are present after adding a node."
    assert graph.get_inputs(v1) == [], \
        "Graph.get_inputs should return no inputs for a Variable node"
    assert np.allclose(graph.get_output(v1), v1_data), \
        "Graph.get_output for a Variable should be its data:\n{}\n" \
        "Student returned:\n{}".format(v1_data, graph.get_output(v1))

    expected = [graph.get_output(v1)] * 2
    student = graph.get_inputs(adder)
    for a, b in zip(student, expected):
        assert np.allclose(a, b), \
            "Graph.get_inputs returned incorrect value for an Add node\n" \
            "Student returned:\n{}\nExpected:\n{}".format(a, b)
    assert np.allclose(graph.get_output(adder), 2 * graph.get_output(v1)), \
        "Graph.get_output returned incorrect value for an Add node\n" \
        "Student returned:\n{}\nExpected:\n{}".format(
            graph.get_output(adder), 2 * graph.get_output(v1))

    loss = nn.SoftmaxLoss(graph, adder, adder)
    for node in [v1, adder]:
        output_shape = graph.get_output(node).shape
        node_grad = graph.get_gradient(node)
        assert node_grad is not None, \
            "Graph.get_gradient returned None, instead of an all-zero value"
        assert np.shape(node_grad) == output_shape, \
            "Graph.get_gradient returned gradient of wrong shape, {0}; " \
            "expected, {1}".format(np.shape(node_grad), output_shape)
        assert np.allclose(node_grad, np.zeros_like(node_grad)), \
            "Graph.get_gradient should return all-zero values before backprop " \
            "is called, instead returned:\n{}".format(node_grad)

    expected_loss = 1.60943791243
    graph.backprop()
    v1_grad = graph.get_gradient(v1)
    assert np.allclose(v1_grad, np.ones_like(v1_grad) * expected_loss * 2), \
        "Incorrect gradient after running Graph.backprop().\n" \
        "Student returned:\n{}\nExpected:\n{}\n" \
        "Make sure you are correctly accumulating your gradients.".format(
            v1_grad, np.ones_like(v1_grad) * expected_loss * 2)

    tracker.add_points(3)
def run(self, xs):
    """
    Runs the model for a batch of examples.

    Although words have different lengths, our data processing guarantees
    that within a single batch, all words will be of the same length (L).

    Here `xs` will be a list of length L. Each element of `xs` will be a
    node with shape (batch_size x self.num_chars), where every row in the
    array is a one-hot vector encoding of a character. For example, if we
    have a batch of 8 three-letter words where the last word is "cat", then
    xs[1] will be a node that contains a 1 at position (7, 0). Here the
    index 7 reflects the fact that "cat" is the last word in the batch, and
    the index 0 reflects the fact that the letter "a" is the initial (0th)
    letter of our combined alphabet for this task.

    Inputs:
        xs: a list with L elements (one per character), where each element
            is a node with shape (batch_size x self.num_chars)
    Returns:
        A node with shape (batch_size x 5) containing predicted scores
            (also called logits)
    """
    "*** YOUR CODE HERE ***"
    h = nn.Linear(xs[0], self.w)
    z = h
    for x in xs[1:]:
        z = nn.Add(nn.Linear(x, self.w), nn.Linear(z, self.wh))
    return nn.Linear(z, self.wf)
def hidden(self, h, x):
    return nn.Add(nn.Linear(x, self.w1), nn.Linear(h, self.w3))
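# A possible way the hidden-step helper above gets used (a sketch under
# assumptions, not the author's run method): self.w1 maps characters to the
# hidden size, self.w3 is square, and self.w_out (hidden_size x 5) is a
# hypothetical output parameter.
def run(self, xs):
    h = nn.ReLU(nn.Linear(xs[0], self.w1))    # initial hidden state
    for x in xs[1:]:
        h = nn.ReLU(self.hidden(h, x))        # fold each character in
    return nn.Linear(h, self.w_out)           # (batch_size x 5) logits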