Example #1
class Encoder(object):
    """Abstract base class for feature encoders.

    Inputs
    ------
    n_left : int, optional
        Number of tokens of left context to include.
        (Default: 2)

    n_right : int, optional
        Number of tokens of right context to include.
        (Default: 2)

    Attributes
    ----------
    chunker : chunk.ChunkEncoder
        ChunkEncoder instance used to generate tags.
    """
    def __init__(self, n_left=2, n_right=2):
        self.chunker = BILOUChunkEncoder()
        self.n_left = n_left
        self.n_right = n_right

    def get_feats_for_token(self, token):
        """Return features for token.

        Inputs
        ------
        token : str
            Token.

        Outputs
        -------
        feats : tuple of str
            Feature vector.
        """
        raise NotImplementedError

    def get_feats(self,
                  tokens,
                  token_nums,
                  token_As=None,
                  token_Bs=None,
                  token_Gs=None,
                  token_Fs=None,
                  token_Js=None,
                  A_vals=None,
                  B_vals=None,
                  G_vals=None):
        """Return features corresponding to token sequence.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        Outputs
        -------
        feats : lsit of tuples
            Feature vector sequence.
        """

        #        feats = [self.get_feats_for_token(token) for token in tokens];
        feats = []

        for ii, token in enumerate(tokens):

            ######################################################################################################
            ###### Changes to inclusion of features in feature sets can be made here #############################

            token_feats = []
            """ Add prefix, suffix feats """
            #            token_feats = self.get_feats_for_token(token)
            """ Add word feats """
            #            token_feats.extend(word_type(token))
            """ Add A-B-G triple as non-binary feature """
            #            if token_As != None and token_Bs != None and token_Gs != None:
            #                token_feats.append("{}-{}-{}".format(str(token_As[ii]), str(token_Bs[ii]), str(token_Gs[ii])))
            """ Add A-B double as non-binary feature """
            #            if token_As != None and token_Bs != None:
            #                token_feats.append("{}-{}".format(str(token_As[ii]), str(token_Bs[ii])))
            """ Add A as non-binary feature """
            #            if token_As != None:
            #                token_feats.append(token_As[ii])
            """ Add B as non-binary feature """
            #            if token_Bs != None:
            #                token_feats.append(token_Bs[ii])
            """ Add G as non-binary feature """
            #            if token_Gs != None:
            #                token_feats.append(token_Gs[ii])
            """ Add random A values as features (use in order to check for performance at chance) """
            #            if A_vals != None:
            #                pseudo = random.choice(list(A_vals))
            #                for v in A_vals:
            #                    token_feats.append(pseudo == v)
            """ Add random B values as features (use in order to check for performance at chance) """
            #            if B_vals != None:
            #                pseudo = random.choice(list(B_vals))
            #                for v in B_vals:
            #                    token_feats.append(pseudo == v)
            """ Add random G values as features (use in order to check for performance at chance) """
            #            if G_vals != None:
            #                pseudo = random.choice(list(G_vals))
            #                for v in G_vals:
            #                    token_feats.append(pseudo == v)
            """ Add random F values as features (use in order to check for performance at chance) """
            #            pseudo = random.choice([-1, 0, 1])
            #            for v in [-1, 0, 1]:
            #                token_feats.append(pseudo == v)
            """ Add random J values as features (use in order to check for performance at chance) """
            #            pseudo = random.choice([-1, 0, 1])
            #            for v in [-1, 0, 1]:
            #                token_feats.append(pseudo == v)
            """ Add A as binary feature (True or False for each possible value) """
            if token_As is not None and A_vals is not None:
                for v in A_vals:
                    token_feats.append(token_As[ii] == v)
            """ Add B as binary feature (True or False for each possible value) """
            if token_Bs is not None and B_vals is not None:
                for v in B_vals:
                    token_feats.append(token_Bs[ii] == v)
            """ Add G as binary feature (True or False for each possible value) """
            if token_Gs is not None and G_vals is not None:
                for v in G_vals:
                    token_feats.append(token_Gs[ii] == v)
            """ Add F as binary feature (True or False for each possible value) """
            if token_Fs is not None:
                for v in [-1, 0, 1]:
                    token_feats.append(token_Fs[ii] == v)
            """ Add J as binary feature (True or False for each possible value) """
            if token_Js is not None:
                for v in [-1, 0, 1]:
                    token_feats.append(token_Js[ii] == v)
            """ Add whether token is first token as feature (may be useful for case where f = j = -1) """
            token_feats.append(token_nums[ii] == 0)
            """ Add whether token is second token as feature (may be useful for case where f = j = -1) """
            token_feats.append(token_nums[ii] == 1)

            ######################################################################################################

            feats.append(token_feats)
        # Transpose so each entry is one feature column across all tokens.
        feats = zip(*feats)
        new_feats = []
        for ii, feats_ in enumerate(feats):
            # Emit a shifted copy of each feature column for every relative
            # position in the [-n_left, n_right] context window.
            for pos in xrange(-self.n_left, self.n_right + 1):
                feat_id = 'F%d[%d]' % (ii, pos)
                k = -pos
                new_feats.append([
                    '%s=%s' % (feat_id, val) if val is not None else val
                    for val in roll(feats_, k)
                ])
        # Transpose back so there is one feature list per token.
        new_feats = zip(*new_feats)

        # Filter out None vals in rows where they occur.
        for ii, row in enumerate(new_feats):
            new_row = [v for v in row if v is not None]
            new_feats[ii] = new_row
        return new_feats

    def get_targets(self, tokens, mentions):
        """Return tag sequence to train against.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        mentions : list of list
            List of mention tuples, each of the form (tag, start_token_index,
            end_token_index).

        Outputs
        -------
        targets : list of str
            Target label sequence.
        """
        tags = ['O' for token in tokens]
        for tag, bi, ei in mentions:
            chunk = tokens[bi:ei + 1]
            tags[bi:ei + 1] = self.chunker.chunk_to_tags(chunk, tag)
        return tags

    def get_feats_targets(self,
                          tokens,
                          mentions,
                          token_nums,
                          token_As=None,
                          token_Bs=None,
                          token_Gs=None,
                          token_Fs=None,
                          token_Js=None,
                          A_vals=None,
                          B_vals=None,
                          G_vals=None):
        """Return features/tag sequence to train against.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        mentions : list of list
            List of mention tuples, each of the form (tag, start_token_index,
            end_token_index).

        Remaining arguments are forwarded unchanged to `get_feats`; see that
        method for their meaning.

        Outputs
        -------
        feats : list of tuples
            Feature vector sequence.

        targets : list of str
            Target label sequence.
        """
        feats = self.get_feats(tokens, token_nums, token_As, token_Bs,
                               token_Gs, token_Fs, token_Js, A_vals, B_vals,
                               G_vals)
        targets = self.get_targets(tokens, mentions)
        return feats, targets
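
The encoder above leans on two helpers that are not shown in the listing: the project's roll() function and BILOUChunkEncoder. The windowing step at the end of get_feats is easiest to see in isolation, so below is a self-contained sketch of just that step. The roll() defined here is an assumption about the project's own helper (a shift that pads with None rather than wrapping, which is what the later None filtering implies), and the feature values are made up.

def roll(seq, k):
    # Assumed behaviour of the project's roll(): shift a sequence by k
    # positions, padding vacated slots with None instead of wrapping.
    seq = list(seq)
    out = [None] * len(seq)
    for i, val in enumerate(seq):
        j = i + k
        if 0 <= j < len(seq):
            out[j] = val
    return out

# One feature column ("F0") for a 4-token sequence, window of +/-1 token.
col = ['jo', 'li', 'in', 'ne']
n_left, n_right = 1, 1

new_feats = []
for pos in range(-n_left, n_right + 1):
    feat_id = 'F0[%d]' % pos
    new_feats.append(['%s=%s' % (feat_id, v) if v is not None else v
                      for v in roll(col, -pos)])

# Transpose back to per-token rows and drop None; tokens near the edges
# simply end up with fewer context features.
rows = [[v for v in row if v is not None] for row in zip(*new_feats)]
for r in rows:
    print(r)
# First token, for example: ['F0[0]=jo', 'F0[1]=li'] -- no left context.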
Example #2
class Encoder(object):
    """Abstract base class for feature encoders.

    Inputs
    ------
    n_left : int, optional
        Number of tokens of left context to include.
        (Default: 2)

    n_right : int, optional
        Number of tokens of right context to include.
        (Default: 2)

    Attributes
    ----------
    chunker : chunk.ChunkEncoder
        ChunkEncoder instance used to generate tags.
    """
    def __init__(self, n_left=2, n_right=2):
        self.chunker = BILOUChunkEncoder()
        self.n_left = n_left
        self.n_right = n_right

    def get_feats_for_token(self, token):
        """Return features for token.

        Inputs
        ------
        token : str
            Token.

        Outputs
        -------
        feats : tuple of str
            Feature vector.
        """
        raise NotImplementedError

    def get_feats(self, tokens):
        """Return features corresponding to token sequence.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        Outputs
        -------
        feats : list of tuples
            Feature vector sequence.
        """
        feats = [self.get_feats_for_token(token) for token in tokens]
        feats = zip(*feats)
        new_feats = []
        for ii, feats_ in enumerate(feats):
            for pos in xrange(-self.n_left, self.n_right + 1):
                feat_id = 'F%d[%d]' % (ii, pos)
                k = -pos
                new_feats.append([
                    '%s=%s' % (feat_id, val) if val is not None else val
                    for val in roll(feats_, k)
                ])

        new_feats = zip(*new_feats)

        # Filter out None vals in rows where they occur.
        for ii, row in enumerate(new_feats):
            new_row = [v for v in row if v is not None]
            new_feats[ii] = new_row
        return new_feats

    def get_targets(self, tokens, mentions):
        """Return tag sequence to train against.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        mentions : list of list
            List of mention tuples, each of the form (tag, start_token_index,
            end_token_index).

        Outputs
        -------
        targets : list of str
            Target label sequence.
        """
        tags = ['O' for token in tokens]
        for tag, bi, ei in mentions:
            chunk = tokens[bi:ei + 1]
            tags[bi:ei + 1] = self.chunker.chunk_to_tags(chunk, tag)
        return tags

    def get_feats_targets(self, tokens, mentions):
        """Return features/tag sequence to train against.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        mentions : list of list
            List of mention tuples, each of the form (tag, start_token_index,
            end_token_index).

        Outputs
        -------
        feats : list of tuples
            Feature vector sequence.

        targets : list of str
            Target label sequence.
        """
        feats = self.get_feats(tokens)
        targets = self.get_targets(tokens, mentions)
        return feats, targets
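
The get_targets method defers the actual BILOU labelling to the project's BILOUChunkEncoder, which these examples do not show. Under the usual BILOU convention (Begin / Inside / Last / Outside / Unit), chunk_to_tags presumably maps a mention's tokens and its tag to one label per token. The sketch below reproduces the tagging loop with a stand-in chunk_to_tags written to that assumed convention; the tokens and mentions are made up.

def chunk_to_tags(chunk, tag):
    # Assumed BILOU behaviour: single-token chunks get 'U-', longer chunks
    # get 'B-' ... 'I-' ... 'L-'.
    if len(chunk) == 1:
        return ['U-%s' % tag]
    return (['B-%s' % tag]
            + ['I-%s' % tag] * (len(chunk) - 2)
            + ['L-%s' % tag])

tokens = ['John', 'lives', 'in', 'New', 'York', 'City']
mentions = [('PER', 0, 0), ('LOC', 3, 5)]  # (tag, start_token_index, end_token_index)

# Same loop as get_targets above, minus the chunker attribute.
tags = ['O' for _ in tokens]
for tag, bi, ei in mentions:
    tags[bi:ei + 1] = chunk_to_tags(tokens[bi:ei + 1], tag)

print(tags)
# ['U-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC']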