def get_confusion(labels, preds):
    """
    Compute binary confusion-matrix counts (TP, TN, FP, FN) for two equal-length lists of 0/1 values
    """
    try:
        assert (len(labels) == len(preds))
    except AssertionError:
        err([labels, preds, len(labels), len(preds)])
        raise

    TP = TN = FP = FN = 0

    ##  y=label, x=pred  ##
    for i, y in enumerate(labels):
        x = preds[i]

        if y == 1 and x == 1:
            TP += 1
        elif y == 0 and x == 0:
            TN += 1
        elif y == 0 and x == 1:
            FP += 1
        elif y == 1 and x == 0:
            FN += 1

    return TP, TN, FP, FN
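# A brief usage sketch (assuming equal-length 0/1 lists): derive precision,
# recall, and F1 from the returned counts.
labels = [1, 0, 1, 1, 0, 0]
preds  = [1, 0, 0, 1, 1, 0]
TP, TN, FP, FN = get_confusion(labels, preds)
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall    = TP / (TP + FN) if (TP + FN) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print(TP, TN, FP, FN)          # 2 2 1 1
print(precision, recall, f1)   # 0.666... 0.666... 0.666...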
Example #2
    def forward(self):
        """
        Runs the data forward through the network to generate outputs.

            First, iterate over the input vectors for each sample.

            Each sample comprises a list of N (mostly) Chinese characters that have been vectorized.  These will be processed through a BiLSTM
            to produce an output list of N-1 floats, each representing the likelihood (between 0-1) of a split.

            'N' is not specified because the batch is an inhomogeneous list, i.e. each sample is a different length.
        """
        self.preds = None
        for vectors in self.vector_list:  # Each 'vectors' entry holds the list of (tensor) vectors for an individual sample
            pred = self.bilstm.forward(
                vectors
            )  # Returns the list of N-1 pairs associated with an individual sample

            if self.preds is None:
                self.preds = pred
            else:
                self.preds = pu.tensor_cat(
                    self.preds, pred, 0
                )  # Build a stack of tensors comprising all 'pred' values from all vectors in this sample
                # The final number of 'preds' is not determined in advance, but it is the same as the number of 'labels'

        try:
            assert (len(self.labels) == len(self.preds))
        except:
            err([len(self.labels), len(self.preds)])
            raise

        return self.preds, self.labels
Example #3
def length_of_binary_rep(i, verbose=False):
    """
    Return the number of binary digits needed to represent the integer 'i' (ignoring the '0b' prefix and any leading zeros)
    """
    s = bin(i)                      # e.g. 5 -> '0b101'
    if verbose: err([i, s])
    s = re.sub(r'^0b0*', '', s)     # Strip the prefix and any leading zeros
    if verbose: err([s, len(s)])
    return len(s)
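# A small sanity check (assuming 're' is imported as in the surrounding module):
# for positive integers this should agree with int.bit_length().
for i in (1, 2, 5, 63, 64):
    assert length_of_binary_rep(i) == i.bit_length()
print(length_of_binary_rep(5))    # 3  ('101')
print(length_of_binary_rep(64))   # 7  ('1000000')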
Example #4
def i_to_binary_tensor(i, width, verbose=False):
    """
    Convert an ordinal int i to a fixed-width binary vector for encoding.  Used to encode the ordinal number of a character.

    A fixed length of 'width' binary digits allows for 2**width distinct values (e.g. if 'width=6', the range is 0 to 63)
    """
    s = bin(i)                          # e.g. 5 -> '0b101'
    if verbose: err([i, width, s])

    s = re.sub(r'^0b', '0' * width, s)  # Replace the '0b' prefix with 'width' padding zeros
    S = list(s)[-width:]                # Keep only the last 'width' digits
    S = list(map(float, S))

    if verbose: err([S, len(S)])
    return S
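# Example encodings (a quick sketch; width=6 covers ordinals 0..63):
print(i_to_binary_tensor(5, 6))    # [0.0, 0.0, 0.0, 1.0, 0.0, 1.0]
print(i_to_binary_tensor(0, 6))    # [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]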
Example #5
    def forward(self, X, C0, H0):
        """
        The 'forward()' for this network.  The gate mathematics follow the standard LSTM scheme: C0 and H0 are the values carried in
        from the previous iteration of this cell and are used to compute H1 (the next hidden state) and C1, C2 (cell-state tensors computed
        within this iteration -- sometimes called Ct~ and Ct).

        Parameters
        ----------
        X : Tensor, the current input vector

        C0 : Tensor, the previous cell state

        H0 : Tensor, the previous hidden state
        """
        if pu.has_improper_values(X):
            err([X])
            raise (NaN_Exeption)

        # Step 1 (Forget Gate): G = sigma(X * Uf + H0 * Wf)
        G = self.Uf(X) + self.Wf(H0)
        if pu.has_improper_values(G):
            err([G])
            raise (NaN_Exeption)
        G = torch.sigmoid(G)

        # Step 2 (for updating C): C1 = tanh(X * Uc + H0 * Wc)
        C1 = self.Uc(X) + self.Wc(H0)
        C1 = torch.tanh(C1)

        # Step 3 (for updating C): I = sigma(X * Ui + H0 * Wi)
        I = self.Ui(X) + self.Wi(H0)
        I = torch.sigmoid(I)

        # Step 4 (output gate): O = sigma(X * Uo + H * Wo)
        O = self.Uo(X) + self.Wo(H0)
        O = torch.sigmoid(O)

        # Step 5: C2 = G * C0 + I * C1
        C2 = G * C0 + I * C1

        # Step 6: H1 = O * tanh(C2)
        H1 = O * torch.tanh(C2)

        return C2, H1
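# A minimal functional sketch of the same gate arithmetic, with plain
# nn.Linear layers standing in for the Uf/Wf/... attributes above (their
# exact types are an assumption; this is illustrative, not the class's code).
import torch
import torch.nn as nn

def lstm_cell_step(X, C0, H0, U, W):
    """U and W are dicts of nn.Linear layers keyed by 'f', 'c', 'i', 'o'."""
    G  = torch.sigmoid(U['f'](X) + W['f'](H0))   # forget gate
    C1 = torch.tanh(U['c'](X) + W['c'](H0))      # candidate cell state (Ct~)
    I  = torch.sigmoid(U['i'](X) + W['i'](H0))   # input gate
    O  = torch.sigmoid(U['o'](X) + W['o'](H0))   # output gate
    C2 = G * C0 + I * C1                         # new cell state (Ct)
    H1 = O * torch.tanh(C2)                      # new hidden state
    return C2, H1

d = 8
U = {k: nn.Linear(d, d) for k in 'fcio'}
W = {k: nn.Linear(d, d) for k in 'fcio'}
X, C0, H0 = torch.randn(d), torch.zeros(d), torch.zeros(d)
C2, H1 = lstm_cell_step(X, C0, H0, U, W)
print(C2.shape, H1.shape)   # torch.Size([8]) torch.Size([8])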
Example #6
    def forward(self, L, R):
        """
        the 'forward()' for this network

        Parameters
        ----------
        L, R : Tensors (hidden states from the left-to-right and right-to-left LSTMs)
        """
        if pu.has_improper_values(L) or pu.has_improper_values(R):
            err([L, R])
            raise (NaN_Exeption)
        x = torch.cat((L, R))
        x = self.fc1(x)
        x = self.activ(x)
        x = self.fc2(x)
        x = self.activ(x)
        x = self.fc_final(x)
        x = torch.sigmoid(x)

        return x
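# A minimal sketch of the split-scoring head above: concatenate the two
# hidden states and map them to a probability.  The layer sizes and the
# single-value output are assumptions for illustration.
import torch
import torch.nn as nn

hidden = 8
fc1 = nn.Linear(2 * hidden, 16)
fc2 = nn.Linear(16, 16)
fc_final = nn.Linear(16, 1)
activ = nn.ReLU()

L, R = torch.randn(hidden), torch.randn(hidden)
x = torch.cat((L, R))            # shape: (2 * hidden,)
x = activ(fc1(x))
x = activ(fc2(x))
p_split = torch.sigmoid(fc_final(x))
print(p_split)                   # a single value in (0, 1)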
Example #7
    def forward(self, line):
        """
        The 'forward()' for this network.  This function takes one sample at a time, where each sample is a line of text.

        As each LSTM proceeds character by character, it makes a prediction *after* having consumed the characters on both sides of an interstice.

        Parameters
        ----------
        line : (tensor) list of N vectors (where each vector represents one character)

        Returns
        -------
        (tensor) list of N-1 pairs (each pair of floats represents one interstice -- the probability of no split and the probability of a split)
        """
        inputs = []    # List of list of tensor.  Will hold the cell and hidden states at each possible split location
        # i.e. the value associated with the first interstice will be at index 0, etc.
        
        ###  Run line through the left2right LSTM  ###
        LCA  = self.Lzeros     # left2right cell state (first LSTM layer)
        LHA  = self.Lhzeros    # left2right hidden state (first LSTM layer)
        LCB  = self.Lzeros     # left2right cell state (second, stacked LSTM layer)
        LHB  = self.Lhzeros    # left2right hidden state (second, stacked LSTM layer)
        num = 0
        loc = 0   # First interstice is number 0
        
        for x in line:
            x = self.embedder(x)
        
            # Get the states for interstice number 'loc'
            LCA, LHA = self.left2rightA(x, LCA, LHA)      # Run this character through the first Left->-Right LSTM
            LCB, LHB = self.left2rightB(LHA, LCB, LHB)    # Run its hidden state through the second (stacked) Left->-Right LSTM

            # Once we have consumed at least two characters, store an output, which applies to the interstice between the last two characters consumed
            if num >= 1:
                inputs.append( [LCB, LHB] )
                assert(inputs[loc] == [LCB, LHB])
                assert(len(inputs[loc]) == 2)
                loc += 1                                  # For instance, loc=0 represents the interstice between the first and second characters
            num += 1
            
        ###  Run line through the right2left LSTM  ###
        RCA  = self.Rzeros
        RHA  = self.Rhzeros
        RCB  = self.Rzeros
        RHB  = self.Rhzeros
        num = 0
        
        for x in reversed(line):                          # Iterate backwards through the same list of vectors
            x = self.embedder(x)
            
            # Get the states for interstice number 'loc'
            RCA, RHA = self.right2leftA(x, RCA, RHA)      # Run this character through the first Right->-Left LSTM
            RCB, RHB = self.right2leftB(RHA, RCB, RHB)    # Run its hidden state through the second (stacked) Right->-Left LSTM
            if num >= 1:
                loc -= 1                                  # Keep this process in reverse alignment with the other
                inputs[loc].extend([RCB, RHB])
                assert(len(inputs[loc]) == 4)
            num += 1   # Just to keep track of how many characters we've consumed
            
        ###  Combine output from both LSTMs and run through the SplitLayer  ###
        #   'preds' will have one fewer element than 'line' because each prediction applies to the interstice between two elements of 'line'
        preds = None
        for item in inputs:
            LC, LH, RC, RH = item               # Just the output from the second layer of each stacking
            pred = self.splitLayer(LH, RH)      # Splitter layer takes as input the hidden state from both directions
            pred = torch.unsqueeze(pred, 0)     # Add a dimension so that 'tensor_cat()' will work correctly
            if preds is None:
                preds = pred
            else:
                preds = pu.tensor_cat(preds,  pred, 0)   # Build a stack of tensors
                
        try:
            assert(len(line) - 1 == len(preds))
        except:
            err([preds, len(line), len(preds)])
            raise

        return preds
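# A sketch of the interstice bookkeeping above, with string tags standing in
# for LSTM states: the left-to-right pass fills slots 0, 1, ...; the
# right-to-left pass decrements 'loc' so both passes land on the same slot.
line = list("ABCD")
inputs, loc = [], 0
for num, ch in enumerate(line):             # left-to-right pass
    if num >= 1:
        inputs.append(['L' + line[num - 1] + ch])   # slot for interstice num-1
        loc += 1
for num, ch in enumerate(reversed(line)):   # right-to-left pass
    if num >= 1:
        loc -= 1                                    # realign with the same interstice
        inputs[loc].append('R' + line[len(line) - num] + ch)
print(inputs)   # [['LAB', 'RBA'], ['LBC', 'RCB'], ['LCD', 'RDC']]
assert len(inputs) == len(line) - 1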
Example #8
    def fit(self, batch, _monitor, options=None, closure=None, verbose=False):
        """
        Train the classifier on this batch

        Parameters
        ----------
        batch : list of list

        _monitor : dict

        options : dict of optional flags (e.g. 'adaptive_learning_rate', 'silent')
        """
        if options is None:
            options = {}  # Avoid the mutable-default-argument pitfall
        if verbose: err()

        if self.get('fake'):
            batch = self.fake_batch()

        self.training_mode()  # Set all layers to training mode
        epoch = _monitor.get('epoch')
        start_lr = self.get('learning_rate')
        start_lt = self.get('learning_threshold')
        lr = pu.learning_rate_by_epoch(epoch, start_lr)
        lt = pu.loss_threshold_by_epoch(
            epoch, start_lt)  # If loss drops below this, move on to next batch
        grad_max = 2.0
        grad_norm = 2
        last_loss = 999

        self.refresh_optimizer(lr)
        self.optimizer.zero_grad()
        graph = BatchGraph(
            self.bilstm, self.embedder, batch
        )  # This object holds all of the tensors and their connections to each other

        # Execute a number of steps, stopping if necessary
        for step in range(self.get('steps')):
            if step > self.get('min_steps'):
                if last_loss < lt:
                    break  # If loss is below this threshold, we've learned enough for now; move on (RARE)

            # FORWARD
            try:
                preds, labels = graph.forward()
            except NaN_Exeption:
                break
            loss = self.loss(preds, labels)

            # BACKWARD
            loss.backward(retain_graph=True)
            self.optimizer.step(closure=closure)
            this_loss = loss.item()
            if this_loss >= last_loss:
                if options.get('adaptive_learning_rate'):
                    lr = 0.95 * lr
                    self.refresh_optimizer(lr)
                if lr < 1e-12:
                    break

            if not options.get('silent'):
                if step == 0: print()
                line_no = _monitor['i']
                print('[e:%d l:%d s:%d] (lr %0.1E  lt %0.1E) loss: %.16f' %
                      (epoch, line_no, step, lr, lt, this_loss))

            last_loss = this_loss

        return last_loss
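# A toy sketch of the step loop's control flow above (loss-threshold early
# stop plus multiplicative learning-rate decay when the loss stops improving);
# the model, data, and optimizer here are stand-ins, not the classifier's own.
import torch
import torch.nn as nn

model = nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.MSELoss()
X, y = torch.randn(16, 4), torch.randn(16, 1)

lr, lt, last_loss = 0.1, 1e-4, float('inf')
for step in range(200):
    if step > 10 and last_loss < lt:
        break                              # learned enough for this batch
    optimizer.zero_grad()
    loss = loss_fn(model(X), y)
    loss.backward()
    optimizer.step()
    this_loss = loss.item()
    if this_loss >= last_loss:             # loss not improving: decay the LR
        lr *= 0.95
        for group in optimizer.param_groups:
            group['lr'] = lr
        if lr < 1e-12:
            break
    last_loss = this_loss
print(step, last_loss)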
Example #9
    def forward(self, line):
        """
        The 'forward()' for this network.  This function takes one sample at a time, where each sample is a line of text.

        As each LSTM proceeds character by character, it makes a prediction *after* having consumed the characters on both sides of an interstice.

        Parameters
        ----------
        line : (tensor) list of N vectors (where each vector represents one character)

        Returns
        -------
        (tensor) list of N-1 pairs (each pair of floats represents one interstice -- the probability of no split and the probability of a split)
        """
        reverse = []
        inputs = []  # List of lists of tensors.  Will hold the cell and hidden states at each possible split location,
        #              i.e. the value associated with the first interstice will be at index 0, etc.

        LCA = self.get_zeros()  # Initialize left2right cell state
        LHA = self.get_zeros()  # Initialize left2right hidden state
        RCA = self.get_zeros()
        RHA = self.get_zeros()

        ###  LEFT-to-RIGHT PASS  ###
        """ Iterate over the characters in a line.  After each two characters AB are read, a prediction will be made about splitting A B
            
            char : number of characters read
            interstice : the interstice number associated with the next prediction

            For instance, in a line "ABC":
              char=0, interstice=-1 : Process first character (A), No prediction
              char=1, interstice=0  : Process second character (B), Make prediction (on AB)
              char=2, interstice=1  : Process third character (C), Make prediction (on BC)
        """
        if self.vectorizer:
            print("\n\nForward: ")
        for char, x in enumerate(line):
            if pu.has_improper_values(x):
                err([x])
                raise (NaN_Exeption)

            reverse = reverse + [x]
            # if self.vectorizer:
            #    actual = self.vectorizer.decode(x)

            interstice = char - 1  # First value of 'interstice' will be negative, and have no prediction

            if pu.has_improper_values(LHA):
                err([LHA])
                raise (NaN_Exeption)
            LCA, LHA = self.left2rightA(
                x, LCA,
                LHA)  # Run this character through the Left->-Right LSTM
            if pu.has_improper_values(LHA):
                err([LHA])
                raise (NaN_Exeption)

            if interstice >= 0:
                if self.vectorizer:
                    sys.stdout.write("(%d)" % interstice)
                inputs.append(
                    [LCA, LHA]
                )  # For this interstice, start collecting tensors to be used for a prediction
                assert (
                    inputs[interstice] == [LCA, LHA]
                )  # Confirm that they're in the right place in the 'inputs' array

            # if self.vectorizer:
            #    sys.stdout.write(actual)    # 'actual' is only defined when the decode above is re-enabled

        ###  RIGHT-to-LEFT PASS  ###
        """
            To ensure that we are always looking at the right interstice, we will:
              - ignore looking at the interstice until we have processed two characters
              - keep interstice as it was from the left-to-right pass
              - and decrement it after each prediction

            For instance, in a line "ABC":
              backchar=0, interstice=1 (because it was already 1) : Process third character (C), No prediction
              backchar=1, interstice=1 (because it was ignored in first iteration) : Process second character (B), Make prediction (on BC)
              backchar=2, interstice=0 : Process first character (A), Make prediction (on AB)
        """
        if self.vectorizer:
            print("\nBackward: ")
        for backchar, x in enumerate(reversed(
                line)):  # Iterate backwards through the same list of vectors
            # if self.vectorizer:
            #    actual = self.vectorizer.decode(x)

            RCA, RHA = self.right2leftA(
                x, RCA,
                RHA)  # Run this character through the Right->-Left LSTM
            if pu.has_improper_values(RHA):
                err([RHA])
                raise (NaN_Exeption)

            if backchar >= 1:
                if self.vectorizer:
                    sys.stdout.write("(%d)" % interstice)
                inputs[interstice].extend([RCA, RHA])
                assert (len(inputs[interstice]) == 4)
                interstice -= 1

            # if self.vectorizer:
            #    sys.stdout.write(actual)    # 'actual' is only defined when the decode above is re-enabled

        # Confirm that we have one input slot per interstice:
        assert (len(line) == len(inputs) + 1)

        ###  Combine output from both LSTMs and run through the SplitLayer  ###
        #   'preds' will have one fewer element than 'line' because each prediction applies to the interstice between two elements of 'line'
        preds = None
        for item in inputs:
            LC, LH, RC, RH = item  # Just the output from the second layer of each stacking
            pred = self.splitLayer(
                LH, RH
            )  # Splitter layer takes as input the hidden state from both directions
            pred = torch.unsqueeze(
                pred, 0
            )  # Add a dimension so that 'tensor_cat()' will work correctly
            if preds is None:
                preds = pred
            else:
                preds = pu.tensor_cat(preds, pred,
                                      0)  # Build a stack of tensors

        return preds
Example #10
    def pass_two(self):
        """
        Make sense of what has been seen and devise a sparse vectorization.  This first attempt at vectorization uses the one-hot method merely so that each
        unique character presents itself to the network as an equally-disambiguated signal from which the network may learn.  Yes, it would be more compact and
        possibly more efficient to map this embedding into a lower-dimensional space, but it is not yet clear that optimizing for efficiency in this case would retain
        sufficiently-discernible signals for the network to easily learn.  This caution comes directly from my ignorance of the distribution of Chinese characters
        across the possibilities of appearing in two- or three-character words vs single-character words.

        A one-hot (binary) encoding such as this assumes that each word is equidistant.  Such is obviously not true in meaning-space, but meaning-space is not our
        concern at this time.  Here we are only concerned with "splitting-space," as it were.  The set of characters is likely not equidistant in this space either,
        but each layer of the network (see model.py) represents a spatial transformation which is not likely to be distance-preserving.

        A better embedding is left for future work.  Casting this space to a smaller number of dimensions could clearly help with rare characters.
        """
        self.all_characters = sorted(
            self.all_characters)  # Sort to maintain a specified order
        self.size = len(
            self.all_characters
        )  # This will determine the max number and width of the vector

        if self.get('embedding_matrix'):
            self.width = self.dim

            # self.size is the number of unique characters seen across the train, val and test sets
            # self.dim is the dimension of the character embedding vectors
            self.embed_matrix = nn.Embedding(self.size, self.dim)
            if torch.cuda.is_available():
                self.embed_matrix = self.embed_matrix.cuda()

            for i, char in enumerate(self.all_characters):
                self.vocab[char] = i

        else:
            basic_width = length_of_binary_rep(
                self.size)  # The original length of binary representation
            self.width = basic_width + 1  # Add a vector element for "specialness"
            special = set([])
            for i, char in enumerate(self.all_characters):
                vec = i_to_binary_tensor(
                    i, basic_width
                )  # Binary representation of where character sits in the sorted list

                # Add an element to represent specialness
                if is_foreign_or_punct(char):
                    vec.append(
                        1
                    )  # The extra vector element indicates that the char is foreign or punctuation, which deserves its own field
                    special.add(
                        char
                    )  # For such characters it is worth creating a separate field because they tend to behave differently
                else:
                    vec.append(
                        0
                    )  # Future Work: subdivide classes further and use as features.

                try:
                    assert (len(vec) == self.width
                            )  # Each vector must have this length
                except:
                    err([vec, len(vec), self.width])
                    raise

                self.vectors[char] = vec  # Store in dict for quick access
                self.characters[vec_to_str(vec)] = char
            print(
                "Fitted Vectorizer understands %d characters, %d of which are special (foreign or punctuation)."
                % (self.size, len(special)))

        print("\tFinal vector width:", self.width)