def __init__(self, n_left=2, n_right=2):
    """Create the tag chunker and record the context window sizes.

    Inputs
    ------
    n_left : int, optional
        Number of tokens of left context to include.
        (Default: 2)

    n_right : int, optional
        Number of tokens of right context to include.
        (Default: 2)
    """
    # The chunker is built first, exactly as before, so a failure in its
    # constructor leaves the instance without any attributes set.
    tag_chunker = BILOUChunkEncoder()
    self.chunker = tag_chunker
    self.n_left, self.n_right = n_left, n_right
class Encoder(object):
    """Abstract base class for feature encoders.

    Subclasses supply per-token features by overriding
    ``get_feats_for_token``; the other methods turn those into
    context-windowed feature vectors and into target tag sequences.

    Inputs
    ------
    n_left : int, optional
        Number of tokens of left context to include.
        (Default: 2)

    n_right : int, optional
        Number of tokens of right context to include.
        (Default: 2)

    Attributes
    ----------
    chunker : chunk.ChunkEncoder
        ChunkEncoder instance used to generate tags.

    n_left : int
        Size of the left context window, as passed to the constructor.

    n_right : int
        Size of the right context window, as passed to the constructor.
    """
    def __init__(self, n_left=2, n_right=2):
        self.chunker = BILOUChunkEncoder()
        self.n_left = n_left
        self.n_right = n_right

    def get_feats_for_token(self, token):
        """Return features for token.

        Must be overridden by subclasses; the base implementation always
        raises.

        Inputs
        ------
        token : str
            Token.

        Outputs
        -------
        feats : tuple of str
            Features vector.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError

    def get_feats(self, tokens):
        """Return features corresponding to token sequence.

        Every per-token feature is replicated once for each offset in
        ``[-n_left, n_right]`` and rendered as ``'F<i>[<offset>]=<value>'``
        where ``<i>`` is the index of the feature within the vector
        returned by ``get_feats_for_token``. Entries that are ``None``
        after shifting are dropped from the token's row.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        Outputs
        -------
        feats : list of list of str
            Feature vector sequence, one row per token. An empty token
            sequence yields an empty list.
        """
        token_feats = [self.get_feats_for_token(token) for token in tokens]

        # Transpose to one column per feature type. zip() is lazy on
        # Python 3, so materialize it to behave the same on 2 and 3.
        feat_columns = list(zip(*token_feats))

        windowed = []
        for ii, column in enumerate(feat_columns):
            for pos in range(-self.n_left, self.n_right + 1):
                feat_id = 'F%d[%d]' % (ii, pos)
                # roll() is defined outside this class; presumably it
                # shifts the column by -pos positions and pads with None
                # -- TODO confirm against its definition.
                shifted = roll(column, -pos)
                windowed.append(
                    ['%s=%s' % (feat_id, val) if val is not None else val
                     for val in shifted])

        # Transpose back to one row per token and filter out the None
        # placeholders in rows where they occur.
        return [[val for val in row if val is not None]
                for row in zip(*windowed)]

    def get_targets(self, tokens, mentions):
        """Return tag sequence to train against.

        Every token starts out tagged ``'O'``; each mention then
        overwrites the tags of the tokens it covers with the tags
        produced by ``self.chunker.chunk_to_tags``. Mentions are applied
        in order, so a later mention overwrites an earlier overlapping
        one.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        mentions : list of list
            List of mention tuples, each of the form
            (tag, start_token_index, end_token_index). The end index is
            inclusive.

        Outputs
        -------
        targets : list of str
            Target label sequence.
        """
        tags = ['O' for _ in tokens]
        for tag, bi, ei in mentions:
            chunk = tokens[bi:ei + 1]
            # Slice assignment: chunk_to_tags is expected to return one
            # tag per token in the chunk (not verified here).
            tags[bi:ei + 1] = self.chunker.chunk_to_tags(chunk, tag)
        return tags

    def get_feats_targets(self, tokens, mentions):
        """Return features/tag sequence to train against.

        Inputs
        ------
        tokens : list of str
            Token sequence.

        mentions : list of list
            List of mention tuples, each of the form
            (tag, start_token_index, end_token_index). The end index is
            inclusive.

        Outputs
        -------
        feats : list of list of str
            Feature vector sequence, as returned by ``get_feats``.

        targets : list of str
            Target label sequence, as returned by ``get_targets``.
        """
        feats = self.get_feats(tokens)
        targets = self.get_targets(tokens, mentions)
        return feats, targets