def get_confusion(labels, preds):
    """
    Get the binary confusion matrix values for two equal-length lists
    """
    try:
        assert len(labels) == len(preds)
    except:
        err([labels, preds, len(labels), len(preds)])
        raise

    TP = TN = FP = FN = 0

    ##  y=label, x=pred  ##
    for i, y in enumerate(labels):
        x = preds[i]
        if y == 1 and x == 1:
            TP += 1
        elif y == 0 and x == 0:
            TN += 1
        elif y == 0 and x == 1:
            FP += 1
        elif y == 1 and x == 0:
            FN += 1

    return TP, TN, FP, FN
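# A minimal, hypothetical usage sketch for get_confusion().  The label/prediction values below are
# invented for illustration, and the precision/recall lines are not part of this module:
#
#     labels = [1, 0, 1, 1, 0]
#     preds  = [1, 0, 0, 1, 1]
#     TP, TN, FP, FN = get_confusion(labels, preds)               # TP=2, TN=1, FP=1, FN=1
#     precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0        # 2/3
#     recall    = TP / (TP + FN) if (TP + FN) > 0 else 0.0        # 2/3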
def forward(self):
    """
    Runs the data forward through the network to generate outputs.

    First, iterate over the input vectors for each sample.  Each sample comprises a list of N (mostly) Chinese
    characters that have been vectorized.  These will be processed through a BiLSTM to produce an output list of
    N-1 floats, each representing the likelihood (between 0 and 1) of a split.  'N' is not specified because the
    batch is an inhomogeneous list, i.e. each sample is a different length.
    """
    self.preds = None
    for vectors in self.vector_list:          # Each 'vectors' var holds the list of (tensor) vectors from an individual sample
        pred = self.bilstm.forward(vectors)   # Returns the list of N-1 pairs associated with an individual sample
        if self.preds is None:
            self.preds = pred
        else:
            self.preds = pu.tensor_cat(self.preds, pred, 0)   # Build a stack of tensors from the 'pred' of every sample in the batch

    # The final number of 'preds' is not known in advance, but it must equal the number of 'labels'
    try:
        assert len(self.labels) == len(self.preds)
    except:
        err([len(self.labels), len(self.preds)])
        raise

    return self.preds, self.labels
def length_of_binary_rep(i, verbose=False):
    """
    For an int 'i', find the length of its binary representation (with the '0b' prefix and leading zeros stripped)
    """
    s = str(bin(i))
    if verbose:
        err([i, s])
    s = re.sub(r'^0b0*', '', s)
    if verbose:
        err([s, len(s)])
    return len(s)
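# A quick sketch of the expected behavior (values worked out by hand):
#
#     length_of_binary_rep(5)    # bin(5)  -> '0b101'     -> '101'     -> 3
#     length_of_binary_rep(64)   # bin(64) -> '0b1000000' -> '1000000' -> 7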
def i_to_binary_tensor(i, width, verbose=False):
    """
    Convert an ordinal int 'i' to a binary vector for encoding.  Used to encode the ordinal number of a character.

    A fixed length of 'width' binary digits means a max of 2**width values (e.g. if width=6, the range is 0 to 63)
    """
    s = str(bin(i))
    if verbose:
        err([i, width, s])
    s = re.sub(r'^0b', '0' * width, s)   # Replace the '0b' prefix with 'width' zeros to left-pad
    S = list(s)[-width:]                 # Keep only the rightmost 'width' digits
    S = map(float, S)
    S = list(S)
    if verbose:
        err([S, len(S)])
    return S
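# A hedged sketch of the encoding for a couple of ordinals (values worked out by hand).  Note that
# if i needs more than 'width' binary digits, the high-order bits are silently truncated:
#
#     i_to_binary_tensor(5, 6)   # '0b101' -> '000000101' -> last 6 digits -> [0.0, 0.0, 0.0, 1.0, 0.0, 1.0]
#     i_to_binary_tensor(0, 4)   # '0b0'   -> '00000'     -> last 4 digits -> [0.0, 0.0, 0.0, 0.0]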
def forward(self, X, C0, H0):
    """
    The 'forward()' for this network.

    The mathematics of the gates will follow this scheme, where C0 and H0 are the values coming in from previous
    iterations of this cell and are used to compute H1 (the next hidden state) and C1, C2 (cell state tensors
    within this iteration -- sometimes called Ct~ and Ct).

    Parameters
    ----------
    X : Tensor, the current input vector

    C0 : Tensor, the previous cell state

    H0 : Tensor, the previous hidden state
    """
    if pu.has_improper_values(X):
        err([X])
        raise NaN_Exeption

    # Step 1 (Forget Gate):  G = sigma(X * Uf + H0 * Wf)
    G = self.Uf(X) + self.Wf(H0)
    if pu.has_improper_values(G):
        err([G])
        raise NaN_Exeption
    G = torch.sigmoid(G)

    # Step 2 (for updating C):  C1 = tanh(X * Uc + H0 * Wc)
    C1 = self.Uc(X) + self.Wc(H0)
    C1 = torch.tanh(C1)

    # Step 3 (for updating C):  I = sigma(X * Ui + H0 * Wi)
    I = self.Ui(X) + self.Wi(H0)
    I = torch.sigmoid(I)

    # Step 4 (output gate):  O = sigma(X * Uo + H0 * Wo)
    O = self.Uo(X) + self.Wo(H0)
    O = torch.sigmoid(O)

    # Step 5:  C2 = G * C0 + I * C1
    C2 = G * C0 + I * C1

    # Step 6:  H1 = O * tanh(C2)
    H1 = O * torch.tanh(C2)

    return C2, H1
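# For reference, the same gate arithmetic sketched with throwaway tensors.  This is only an illustration of the
# update rules above, not the module's actual layers; Uf/Wf etc. are assumed here to be nn.Linear layers, and
# 'dim' is an arbitrary size chosen for the demo:
#
#     import torch
#     import torch.nn as nn
#
#     dim = 8
#     Uf, Wf = nn.Linear(dim, dim), nn.Linear(dim, dim)   # stand-ins for the forget-gate weights
#     Uc, Wc = nn.Linear(dim, dim), nn.Linear(dim, dim)
#     Ui, Wi = nn.Linear(dim, dim), nn.Linear(dim, dim)
#     Uo, Wo = nn.Linear(dim, dim), nn.Linear(dim, dim)
#     X, C0, H0 = torch.randn(dim), torch.zeros(dim), torch.zeros(dim)
#
#     G  = torch.sigmoid(Uf(X) + Wf(H0))   # forget gate
#     C1 = torch.tanh(Uc(X) + Wc(H0))      # candidate cell state (Ct~)
#     I  = torch.sigmoid(Ui(X) + Wi(H0))   # input gate
#     O  = torch.sigmoid(Uo(X) + Wo(H0))   # output gate
#     C2 = G * C0 + I * C1                 # new cell state (Ct)
#     H1 = O * torch.tanh(C2)              # new hidden state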
def forward(self, L, R):
    """
    The 'forward()' for this network

    Parameters
    ----------
    L, R : Tensors (hidden states from the Left-to-Right and Right-to-Left LSTMs)
    """
    if pu.has_improper_values(L) or pu.has_improper_values(R):
        err([L, R])
        raise NaN_Exeption

    x = torch.cat((L, R))
    x = self.fc1(x)
    x = self.activ(x)
    x = self.fc2(x)
    x = self.activ(x)
    x = self.fc_final(x)
    x = torch.sigmoid(x)

    return x
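# A hedged sketch of the shape of this split layer: the two hidden states are concatenated and pushed through a
# small feed-forward head.  The layer sizes, activation, and the output size of 2 (no-split / split, per the
# BiLSTM docstring below) are assumptions for illustration, not the module's actual configuration:
#
#     import torch
#     import torch.nn as nn
#
#     hidden   = 16                               # assumed hidden-state size
#     fc1      = nn.Linear(2 * hidden, hidden)    # takes the concatenation of L and R
#     fc2      = nn.Linear(hidden, hidden)
#     fc_final = nn.Linear(hidden, 2)             # assumed: one value per outcome (no split / split)
#     activ    = nn.ReLU()
#
#     L, R = torch.randn(hidden), torch.randn(hidden)
#     x = torch.cat((L, R))
#     x = activ(fc1(x))
#     x = activ(fc2(x))
#     pred = torch.sigmoid(fc_final(x))           # the pair of scores for this interstice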
def forward(self, line):
    """
    The 'forward()' for this network.  This function takes one sample at a time, where each sample is a line of text.

    Each LSTM, as it proceeds character by character, makes a prediction *after* having consumed the character on
    both sides of an interstice.

    Parameters
    ----------
    line : (tensor) list of N vectors (where each vector represents one character)

    Returns
    -------
    (tensor) list of N-1 pairs (each pair of floats represents one interstice -- both the probability of no split
        and of yes split)
    """
    inputs = []   # List of list of tensor.  Will hold the cell and hidden states at each possible split location,
                  # i.e. the value associated with the first interstice will be at index 0, etc.

    ###  Run line through the left2right LSTM  ###
    LCA = self.Lzeros    # left2right cell state
    LHA = self.Lhzeros   # left2right hidden state
    LCB = self.Lzeros    # same for the second (stacked) left2right LSTM ...
    LHB = self.Lhzeros
    num = 0
    loc = 0   # First interstice is number 0

    for x in line:
        x = self.embedder(x)

        # Get the states for interstice number 'loc'
        LCA, LHA = self.left2rightA(x, LCA, LHA)     # Run this character through the first Left-to-Right LSTM
        LCB, LHB = self.left2rightB(LHA, LCB, LHB)   # Run this character through the second Left-to-Right LSTM

        # Once we have consumed at least two characters, store our next output, which applies to the interstice
        # between the last two characters
        if num >= 1:
            inputs.append([LCB, LHB])
            assert inputs[loc] == [LCB, LHB]
            assert len(inputs[loc]) == 2
            loc += 1   # For instance, loc=0 represents the interstice between the first and second char
        num += 1

    ###  Run line through the right2left LSTM  ###
    RCA = self.Rzeros
    RHA = self.Rhzeros
    RCB = self.Rzeros
    RHB = self.Rhzeros
    num = 0

    for x in reversed(line):   # Iterate backwards through the same list of vectors
        x = self.embedder(x)

        # Get the states for interstice number 'loc'
        RCA, RHA = self.right2leftA(x, RCA, RHA)     # Run this character through the first Right-to-Left LSTM
        RCB, RHB = self.right2leftB(RHA, RCB, RHB)   # Run this character through the second Right-to-Left LSTM

        if num >= 1:
            loc -= 1   # Keep this process in reverse alignment with the other
            try:
                inputs[loc].extend([RCB, RHB])
                assert len(inputs[loc]) == 4
            except:
                raise
        num += 1   # Just to keep track of how many characters we've consumed

    ###  Combine output from both LSTMs and run through the SplitLayer  ###
    # 'preds' will have one fewer element than 'line' because each prediction applies to the interstice between
    # two elements of 'line'
    preds = None
    for item in inputs:
        LC, LH, RC, RH = item             # Just the output from the second layer of each stacking
        pred = self.splitLayer(LH, RH)    # Splitter layer takes as input the hidden state from both directions
        pred = torch.unsqueeze(pred, 0)   # Add a dimension so that 'tensor_cat()' will work correctly
        if preds is None:
            preds = pred
        else:
            preds = pu.tensor_cat(preds, pred, 0)   # Build a stack of tensors

    try:
        assert len(line) - 1 == len(preds)
    except:
        err([preds, len(line), len(preds)])
        raise

    return preds
def fit(self, batch, _monitor, options={}, closure=None, verbose=False):
    """
    Train the classifier on this batch

    Parameters
    ----------
    batch : list of list

    _monitor : dict
    """
    if verbose:
        err()
    if self.get('fake'):
        batch = self.fake_batch()

    self.training_mode()   # Set all layers to training mode

    epoch = _monitor.get('epoch')
    start_lr = self.get('learning_rate')
    start_lt = self.get('learning_threshold')
    lr = pu.learning_rate_by_epoch(epoch, start_lr)
    lt = pu.loss_threshold_by_epoch(epoch, start_lt)   # If loss drops below this, move on to the next batch
    grad_max = 2.0
    grad_norm = 2
    last_loss = 999

    self.refresh_optimizer(lr)
    self.optimizer.zero_grad()
    graph = BatchGraph(self.bilstm, self.embedder, batch)   # This object holds all of the tensors and their connections to each other

    # Execute a number of steps, stopping early if necessary
    for step in range(self.get('steps')):

        if step > self.get('min_steps'):
            if last_loss < lt:
                break   # If loss is below this threshold, we've learned enough for now; move on (RARE)

        # FORWARD
        try:
            preds, labels = graph.forward()
        except NaN_Exeption:
            break
        except:
            raise
        loss = self.loss(preds, labels)

        # BACKWARD
        loss.backward(retain_graph=True)
        self.optimizer.step(closure=closure)

        this_loss = loss.item()
        if this_loss >= last_loss:
            if options.get('adaptive_learning_rate'):
                lr = 0.95 * lr
                self.refresh_optimizer(lr)
                if lr < 1e-12:
                    break

        if not options.get('silent'):
            if step == 0:
                print()
            line_no = _monitor['i']
            print('[e:%d l:%d s:%d] (lr %0.1E lt %0.1E) loss: %.16f' % (epoch, line_no, step, lr, lt, this_loss))

        last_loss = this_loss

    return last_loss
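# A hedged sketch of how fit() might be driven from a training loop, based only on the keys this method reads
# from '_monitor' ('epoch' and 'i').  'model', 'num_epochs', and 'batches' are hypothetical stand-ins, not names
# from this project:
#
#     num_epochs = 10
#     batches = []    # would be filled with batches (lists of samples)
#     model = None    # would be the classifier object that exposes fit()
#
#     for epoch in range(num_epochs):
#         for i, batch in enumerate(batches):
#             _monitor = {'epoch': epoch, 'i': i}   # the only keys fit() reads
#             last_loss = model.fit(batch, _monitor, options={'adaptive_learning_rate': True})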
def forward(self, line):
    """
    The 'forward()' for this network.  This function takes one sample at a time, where each sample is a line of text.

    Each LSTM, as it proceeds character by character, makes a prediction *after* having consumed the character on
    both sides of an interstice.

    Parameters
    ----------
    line : (tensor) list of N vectors (where each vector represents one character)

    Returns
    -------
    (tensor) list of N-1 pairs (each pair of floats represents one interstice -- both the probability of no split
        and of yes split)
    """
    reverse = []
    inputs = []   # List of list of tensor.  Will hold the cell and hidden states at each possible split location,
                  # i.e. the value associated with the first interstice will be at index 0, etc.
    LCA = self.get_zeros()   # Initialize left2right cell state
    LHA = self.get_zeros()   # Initialize left2right hidden state
    RCA = self.get_zeros()
    RHA = self.get_zeros()

    ###  LEFT-to-RIGHT PASS  ###
    """
    Iterate over the characters in a line.  After each two characters AB are read, a prediction will be made about splitting A B

        char       : number of characters read
        interstice : the interstice number associated with the next prediction

    For instance, in a line "ABC":
        char=0, interstice=-1 : Process first character (A),  No prediction
        char=1, interstice=0  : Process second character (B), Make prediction (on AB)
        char=2, interstice=1  : Process third character (C),  Make prediction (on BC)
    """
    if self.vectorizer:
        print("\n\nForward: ")

    for char, x in enumerate(line):
        if pu.has_improper_values(x):
            err([x])
            raise NaN_Exeption
        reverse = reverse + [x]
        if self.vectorizer:
            actual = self.vectorizer.decode(x)
        interstice = char - 1   # First value of 'interstice' will be negative, and have no prediction

        if pu.has_improper_values(LHA):
            err([LHA])
            raise NaN_Exeption
        LCA, LHA = self.left2rightA(x, LCA, LHA)   # Run this character through the Left-to-Right LSTM
        if pu.has_improper_values(LHA):
            err([LHA])
            raise NaN_Exeption

        if interstice >= 0:
            if self.vectorizer:
                sys.stdout.write("(%d)" % interstice)
            inputs.append([LCA, LHA])                 # For this interstice, start collecting tensors to be used for a prediction
            assert inputs[interstice] == [LCA, LHA]   # Confirm that they're in the right place in the 'inputs' array
        if self.vectorizer:
            sys.stdout.write(actual)

    ###  RIGHT-to-LEFT PASS  ###
    """
    To ensure that we are always looking at the right interstice, we will:
        - ignore the interstice until we have processed two characters
        - keep interstice as it was from the left-to-right pass
        - and decrement it after each prediction

    For instance, in a line "ABC":
        backchar=0, interstice=1 (because it was already 1)                  : Process third character (C),  No prediction
        backchar=1, interstice=1 (because it was ignored in first iteration) : Process second character (B), Make prediction (on BC)
        backchar=2, interstice=0                                             : Process first character (A),  Make prediction (on AB)
    """
    if self.vectorizer:
        print("\nBackward: ")

    for backchar, x in enumerate(reversed(line)):   # Iterate backwards through the same list of vectors
        if self.vectorizer:
            actual = self.vectorizer.decode(x)
        RCA, RHA = self.right2leftA(x, RCA, RHA)   # Run this character through the Right-to-Left LSTM
        if pu.has_improper_values(RHA):
            err([RHA])
            raise NaN_Exeption

        if backchar >= 1:
            if self.vectorizer:
                sys.stdout.write("(%d)" % interstice)
            inputs[interstice].extend([RCA, RHA])
            assert len(inputs[interstice]) == 4
            interstice -= 1
        if self.vectorizer:
            sys.stdout.write(actual)

    # Confirm that we have the right number of predictions:
    assert len(line) == len(inputs) + 1

    ###  Combine output from both LSTMs and run through the SplitLayer  ###
    # 'preds' will have one fewer element than 'line' because each prediction applies to the interstice between
    # two elements of 'line'
    preds = None
    for item in inputs:
        LC, LH, RC, RH = item             # The cell and hidden states from each direction
        pred = self.splitLayer(LH, RH)    # Splitter layer takes as input the hidden state from both directions
        pred = torch.unsqueeze(pred, 0)   # Add a dimension so that 'tensor_cat()' will work correctly
        if preds is None:
            preds = pred
        else:
            preds = pu.tensor_cat(preds, pred, 0)   # Build a stack of tensors

    return preds
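# As a hedged illustration of how the N-1 interstice predictions could be turned back into a segmentation.  This
# decoding step is not part of the method above; the example line, the prediction values, and the argmax-style
# rule are all invented for the sketch:
#
#     import torch
#
#     line_text = "ABC"                          # hypothetical 3-character line
#     preds = torch.tensor([[0.9, 0.1],          # interstice 0 (between A and B): no split
#                           [0.2, 0.8]])         # interstice 1 (between B and C): split
#
#     words, current = [], line_text[0]
#     for i in range(1, len(line_text)):
#         if preds[i - 1][1] > preds[i - 1][0]:  # the 'yes split' score wins
#             words.append(current)
#             current = ""
#         current += line_text[i]
#     words.append(current)
#     # words == ['AB', 'C']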
def pass_two(self):
    """
    Make sense of what has been seen and devise a sparse vectorization.

    This first attempt at vectorization uses the one-hot method merely so that each unique character presents
    itself to the network as an equally-disambiguated signal from which the network may learn.  Yes, it would be
    more compact and possibly more efficient to map this embedding into a lower-dimensional space, but it is not
    yet clear that optimizing for efficiency in this case would retain sufficiently-discernible signals for the
    network to easily learn.  This fact comes directly from my ignorance of the distribution of Chinese characters
    across the possibilities of being in two- or three-character words vs single-character words.

    A one-hot (binary) encoding such as this assumes that each word is equidistant.  Such is obviously not true in
    meaning-space, but meaning-space is not our concern at this time.  Here we are only concerned with
    "splitting-space," as it were.  The set of characters is likely not equidistant in this space either, but each
    layer of the network (see model.py) represents a spatial transformation which is not likely to be
    distance-preserving.

    A better embedding will be left for future work.  Casting this space to a smaller number of dimensions could
    clearly help with rare characters.
    """
    self.all_characters = sorted(self.all_characters)   # Sort to maintain a specified order
    self.size = len(self.all_characters)                # This will determine the max number and width of the vector
    special = set([])

    if self.get('embedding_matrix'):
        self.width = self.dim
        # self.size is the number of words in the train, val and test sets
        # self.dim is the dimension of the word vectors
        self.embed_matrix = nn.Embedding(self.size, self.dim)
        if torch.cuda.is_available():
            self.embed_matrix = self.embed_matrix.cuda()
        for i, char in enumerate(self.all_characters):
            self.vocab[char] = i

    else:
        basic_width = length_of_binary_rep(self.size)   # The original length of the binary representation
        self.width = basic_width + 1                    # Add a vector element for "specialness"

        for i, char in enumerate(self.all_characters):
            vec = i_to_binary_tensor(i, basic_width)    # Binary representation of where the character sits in the sorted list

            # Add an element to represent specialness
            if is_foreign_or_punct(char):
                vec.append(1)      # Extra element of the vector flags a char that is foreign or punctuation, which deserves its own feature
                special.add(char)  # Such characters are worth tracking separately because they tend to behave differently
            else:
                vec.append(0)      # Future Work: subdivide classes further and use as features.

            try:
                assert len(vec) == self.width   # Each vector must have this length
            except:
                err([vec, len(vec), self.width])
                raise

            self.vectors[char] = vec                  # Store in dict for quick access
            self.characters[vec_to_str(vec)] = char

    print("Fitted Vectorizer understands %d characters, %d of which are special characters (foreign or punctuation)." % (self.size, len(special)))
    print("\tFinal vector width:", self.width)
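# A worked, hypothetical example of the non-embedding branch for a vocabulary of 5 characters, so
# basic_width = length_of_binary_rep(5) = 3 and the final width is 4 (3 binary digits + 1 "specialness" bit).
# The index-to-vector mapping would look like this, with index 2 imagined to be a punctuation mark:
#
#     i=0 -> [0, 0, 0] + [0]
#     i=1 -> [0, 0, 1] + [0]
#     i=2 -> [0, 1, 0] + [1]   (flagged as special)
#     i=3 -> [0, 1, 1] + [0]
#     i=4 -> [1, 0, 0] + [0]
#
# The last entry, built with the helpers defined above:
#
#     vec = i_to_binary_tensor(4, 3)   # -> [1.0, 0.0, 0.0]
#     vec.append(0)                    # ordinary character (not foreign or punctuation)
#     assert len(vec) == 4             # basic_width + 1 specialness element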