class CharTokenizer:
    """Convert words into fixed-length lists of character ids.

    The character vocabulary is registered in this order: PAD_TOKEN,
    UNK_TOKEN, the lowercase letters ``a``-``z``, then the space character.
    """

    # Ids of the two special tokens. They mirror the registration order used
    # in __init__ (PAD first, UNK second) and assume the Indexer hands out
    # consecutive ids starting at 0 -- the same assumption the padding logic
    # has always relied on.
    PAD_ID = 0
    UNK_ID = 1

    def __init__(self, max_word_length):
        """Build the character vocabulary.

        Args:
            max_word_length: Number of character ids produced per word;
                longer words are truncated and shorter ones are padded.
        """
        self.char_vocab_index = Indexer()
        self.char_vocab_index.add_and_get_index(PAD_TOKEN)  # PAD is 0
        self.char_vocab_index.add_and_get_index(UNK_TOKEN)  # Unknown token is 1
        for char in "abcdefghijklmnopqrstuvwxyz ":
            self.char_vocab_index.add_and_get_index(char)
        self.max_word_length = max_word_length

    def _char_id(self, char):
        """Return the vocabulary id of ``char``, or UNK_ID when it is unknown."""
        index = self.char_vocab_index.index_of(char)
        # NOTE(review): index_of is presumed to report a missing entry with a
        # negative value -- confirm against Indexer. Without this guard that
        # sentinel leaked into the output and UNK_TOKEN was never produced.
        if index < 0:
            return self.UNK_ID
        return index

    def convert_words_to_charids(self, words):
        """Map every word to exactly ``max_word_length`` character ids.

        Args:
            words: Iterable of words, each an iterable of characters.

        Returns:
            A list with one list of ids per word. Words longer than
            ``max_word_length`` are truncated; shorter ones are right-padded
            with PAD_ID. Characters outside the vocabulary become UNK_ID.
        """
        limit = self.max_word_length
        word_charids = []
        for word in words:
            charids = [self._char_id(char) for char in word][:limit]
            # Multiplying by a non-positive count yields [], so words that
            # already fill the limit are left untouched.
            charids.extend([self.PAD_ID] * (limit - len(charids)))
            word_charids.append(charids)
        return word_charids
class FeatureExtractor():
    """Turn an example into a vector of per-word counts.

    Each position of the vector corresponds to one index of the Indexer held
    by this extractor.
    """

    def __init__(self):
        """Create the extractor with an empty Indexer."""
        self.indexer = Indexer()

    def get_indexer(self):
        """Return the Indexer that maps words to feature positions."""
        return self.indexer

    def extract_features(self, ex):
        """Count how often each indexed word occurs in ``ex.text``.

        Args:
            ex: Object whose ``text`` attribute is iterated item by item;
                presumably a sequence of word tokens -- verify against caller.

        Returns:
            A 1-D numpy array of length ``len(self.indexer)`` holding the
            count of every indexed word. Words the indexer does not know are
            ignored.
        """
        feature_vector = np.zeros(len(self.indexer))
        for word in ex.text:
            index = self.indexer.index_of(word)
            # NOTE(review): index_of is presumed to report a missing word with
            # a negative value -- confirm against Indexer. A negative index
            # would otherwise wrap around and be counted against the last
            # feature (or raise IndexError when the indexer is empty).
            if index < 0:
                continue
            feature_vector[index] += 1
        return feature_vector