class CharTokenizer:
    """
    Convert words into fixed-length lists of character ids.

    The character vocabulary is filled, in this order, with the padding
    token, the unknown token, the lowercase letters 'a'-'z' and the space
    character.
    """

    def __init__(self, max_word_length):
        """
        Build the character vocabulary.

        Args:
            max_word_length: number of ids produced for every word; longer
                words are truncated and shorter ones are padded.
        """
        vocab = [chr(ord('a') + i) for i in range(26)] + [' ']
        self.char_vocab_index = Indexer()
        # Special tokens are registered first; PAD is expected to get id 0
        # and the unknown token id 1.
        self.char_vocab_index.add_and_get_index(PAD_TOKEN)
        self.char_vocab_index.add_and_get_index(UNK_TOKEN)
        for char in vocab:
            self.char_vocab_index.add_and_get_index(char)

        # Look the special ids up once so the conversion code does not rely
        # on hard-coded numbers.
        self.pad_index = self.char_vocab_index.index_of(PAD_TOKEN)
        self.unk_index = self.char_vocab_index.index_of(UNK_TOKEN)

        self.max_word_length = max_word_length

    def convert_words_to_charids(self, words):
        """
        Map each word to a list of exactly ``max_word_length`` character ids.

        Characters for which the indexer returns ``None`` or a negative
        index are replaced by the unknown-token id, so a "not found"
        sentinel never leaks into the output.

        Args:
            words: iterable of words, each an iterable of characters.

        Returns:
            A list with one list of ids per word, truncated or padded with
            the padding id to ``max_word_length`` entries.
        """
        lookup = self.char_vocab_index.index_of
        limit = self.max_word_length

        word_charids = []
        for word in words:
            charids = []
            for char in word:
                idx = lookup(char)
                if idx is None or idx < 0:
                    # Character is not in the vocabulary.
                    idx = self.unk_index
                charids.append(idx)
            charids = charids[:limit]
            # Multiplying a list by a non-positive count yields an empty
            # list, so words already at the limit are left unchanged.
            charids.extend([self.pad_index] * (limit - len(charids)))
            word_charids.append(charids)

        return word_charids
# Example #2
# 0
class FeatureExtractor:
    """Build count vectors over the entries of an indexer."""

    def __init__(self):
        # Vocabulary that maps each known word to its feature position.
        self.indexer = Indexer()

    def get_indexer(self):
        """Return the indexer used to map words to feature positions."""
        return self.indexer

    def extract_features(self, ex):
        """
        Count how often each indexed word occurs in ``ex.text``.

        Words for which the indexer returns ``None`` or a negative index
        are skipped. Without this check a negative "not found" result would
        wrap around under NumPy indexing and be counted in the last feature.

        Args:
            ex: object whose ``text`` attribute is iterated word by word
                (presumably a list of tokens; verify against the caller).

        Returns:
            A 1-D float array of length ``len(self.indexer)`` with the
            per-word counts.
        """
        feature_vector = np.zeros(len(self.indexer))
        for word in ex.text:
            index = self.indexer.index_of(word)
            if index is None or index < 0:
                # Word is not in the vocabulary: it has no feature slot.
                continue
            feature_vector[index] += 1
        return feature_vector