class SIFFeaturizer(IFeaturizer):

    def __init__(self, config=dict()):
        super(SIFFeaturizer, self).__init__()

        self.num_words = config.get('num_words', MAX_NUM_WORDS)
        self.tokenize_fn = word_tokenize
        self.use_tokenizer = config.get('use_tokenizer', False)
        self.tokenizer = Tokenizer(num_words=self.num_words)

    def get_output_shape(self):
        return (300,)  # size of the SIF sentence embedding

    def fit(self, data):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        tokens = [self.tokenize_fn(sent) for sent in data]
        self.tokenizer.fit_on_texts(tokens)

    def transform(self, data):
        raw_tokens = [self.tokenize_fn(sent) for sent in data]
        tokens = self.tokenizer.texts_to_sequences(raw_tokens)
        tfidf_matrix = self.tokenizer.sequences_to_matrix(tokens, mode='tfidf')

        # The weight matrix is indexed by raw token positions, so size it by the
        # longest raw sequence: texts_to_sequences may drop OOV words, which
        # would otherwise make the matrix too narrow.
        maxlen = max(len(sent) for sent in raw_tokens)
        tfidf_weights = np.zeros((len(raw_tokens), maxlen))

        for i, seq in enumerate(raw_tokens):
            for j, raw_token in enumerate(seq):
                token = -1
                if raw_token in self.tokenizer.word_index:
                    token = self.tokenizer.word_index[raw_token]
                # A fallback that maps OOV words to a similar in-vocabulary word
                # (via most_similar) was prototyped here but is currently disabled.
                if token > -1:
                    tfidf_weights[i][j] = tfidf_matrix[i][token]
                else:
                    tfidf_weights[i][j] = 1  # default weight for OOV tokens

        # Embed the raw tokens so the embeddings stay aligned with the TF-IDF
        # weights even when the tokenizer omits OOV words.
        embs = word_to_vec(raw_tokens)

        if embs is None:
            return None

        sif_emb = SIF_embedding(embs, tfidf_weights, rmpc=0)
        return torch.from_numpy(sif_emb).float()
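# Usage sketch (illustrative only): assumes MAX_NUM_WORDS, word_tokenize,
# Tokenizer, word_to_vec and SIF_embedding are provided elsewhere in this
# package, and that fit() is called before transform(). The `corpus` below
# is a hypothetical toy dataset.
if __name__ == '__main__':
    corpus = [
        'book me a flight to hanoi',
        'what is the weather like tomorrow',
    ]
    featurizer = SIFFeaturizer()
    featurizer.fit(corpus)
    sentence_vecs = featurizer.transform(corpus)
    if sentence_vecs is not None:
        # One SIF embedding per input sentence; expected: torch.Size([2, 300])
        print(sentence_vecs.size())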
class StarspaceClassifierWrapper(IModel):  # enclosing class restored; base class assumed to mirror OvrClassifierWrapper

    def __init__(self, config={}):
        super(StarspaceClassifierWrapper, self).__init__(
            model_class=StarSpaceClassifier,
            config=config
        )
        self.config = config
        self.num_words = config.get('num_words', MAX_NUM_WORDS)
        self.topk = config.get('top_k', 5)
        self.tokenizer = Tokenizer(num_words=self.num_words)
        self.loss_margin = config.get('loss_margin', .8)
        self.tokenize_fn = wordpunct_tokenize
        self.label_encoder = LabelEncoder()
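# Configuration sketch (illustrative only): the keys below are exactly the
# ones read in __init__ above; the values shown are hypothetical.
if __name__ == '__main__':
    starspace_config = {
        'num_words': 10000,   # vocabulary size passed to the Tokenizer
        'top_k': 5,           # number of labels returned at inference time
        'loss_margin': 0.8,   # margin hyperparameter stored for the loss
    }
    clf = StarspaceClassifierWrapper(config=starspace_config)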
class OvrClassifierWrapper(IModel):

    def __init__(self, config={}, *args, **kwargs):
        super(OvrClassifierWrapper, self).__init__(
            model_class=OvrClassifier,
            config=config,
            *args, **kwargs
        )
        self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
        self.num_words = config.get('num_words', MAX_NUM_WORDS)
        self.n_classes = config.get('num_classes', 10)
        self.tokenize_fn = wordpunct_tokenize
        self.label_encoder = LabelEncoder()

    def get_state_dict(self):
        return {
            'tokenizer': self.tokenizer,
            'config': self.model.config,
            'label_encoder': self.label_encoder,
            'state_dict': self.model.get_params(),
        }

    def load_state_dict(self, state_dict):
        config = state_dict['config']

        # Re-initialize the model with the loaded config
        self.model = self.init_model()
        self.model.set_params(state_dict['state_dict'])

        # Restore the tokenizer and the label encoder
        self.tokenizer = state_dict['tokenizer']
        self.label_encoder = state_dict['label_encoder']

    def preprocess_input(self, X):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        tokens = [self.tokenize_fn(sent) for sent in X]
        tokens = self.tokenizer.texts_to_sequences(tokens)
        tfidf_matrix = self.tokenizer.sequences_to_matrix(tokens, mode='tfidf')

        maxlen = max(len(sent) for sent in tokens)
        tfidf_weights = np.zeros((len(tokens), maxlen))
        for i, seq in enumerate(tokens):
            for j, token in enumerate(seq):
                if token < self.tokenizer.num_words:
                    tfidf_weights[i][j] = tfidf_matrix[i][token]

        # Convert the token ids back to texts before embedding; this guarantees
        # that the TF-IDF weight matrix and the embedded sentences have the same
        # length (OOV words are omitted in both).
        embs = word_to_vec(self.tokenizer.sequences_to_texts(tokens))

        sif_emb = SIF_embedding(embs, tfidf_weights, rmpc=0)
        return torch.from_numpy(sif_emb).float()

    def preprocess_output(self, y):
        # Label-encode the outputs as a LongTensor of class indices.
        # One-hot encoding (e.g. np.eye / torch.eye lookups) was considered
        # but is not used here.
        return torch.from_numpy(self.label_encoder.transform(y)).long()

    def infer_predict(self, logits, topk=None):
        return infer_classification_output(self, logits, topk)
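# Save/load sketch (illustrative only): shows the round trip through
# get_state_dict()/load_state_dict(). The pickle-based persistence, the file
# name and the training step are assumptions, not part of this module;
# X_train/y_train are hypothetical.
if __name__ == '__main__':
    import pickle

    wrapper = OvrClassifierWrapper(config={'num_classes': 3})
    # ... fit the wrapper on (X_train, y_train) via the surrounding
    # framework's training entry point ...

    with open('ovr_classifier.pkl', 'wb') as f:
        pickle.dump(wrapper.get_state_dict(), f)

    with open('ovr_classifier.pkl', 'rb') as f:
        restored = OvrClassifierWrapper(config={'num_classes': 3})
        restored.load_state_dict(pickle.load(f))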
class FastTextFeaturizer(IFeaturizer):

    def __init__(self, config=dict()):
        super(FastTextFeaturizer, self).__init__()

        self.num_words = config.get('num_words', MAX_NUM_WORDS)
        self.tokenize_fn = wordpunct_tokenize
        self.tokenizer = Tokenizer(num_words=self.num_words)
        self.token_indice = None
        self.indice_token = None
        self.max_features = MAX_NUM_WORDS
        self.max_len = MAX_SEQUENCE_LENGTH
        self.ngrams = 3

    def get_output_shape(self):
        return (300,)  # size of embedding

    def create_ngram_set(self, input_list, ngram_value=2):
        """
        Extract a set of n-grams from a list of integers.

        >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
        {(4, 9), (4, 1), (1, 4), (9, 4)}

        >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
        {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
        """
        return set(zip(*[input_list[i:] for i in range(ngram_value)]))

    def add_ngram(self, sequences, token_indice, ngram_range=2):
        """
        Augment the input list of lists (sequences) by appending n-gram values.

        Example: adding bi-grams
        >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
        >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
        >>> add_ngram(sequences, token_indice, ngram_range=2)
        [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

        Example: adding tri-grams
        >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
        >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
        >>> add_ngram(sequences, token_indice, ngram_range=3)
        [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
        """
        new_sequences = []
        for input_list in sequences:
            new_list = input_list[:]
            for ngram_value in range(2, ngram_range + 1):
                for i in range(len(new_list) - ngram_value + 1):
                    ngram = tuple(new_list[i:i + ngram_value])
                    if ngram in token_indice:
                        new_list.append(token_indice[ngram])
            new_sequences.append(new_list)
        return new_sequences

    def fit(self, data):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        tokens = [self.tokenize_fn(sent) for sent in data]
        self.tokenizer.fit_on_texts(tokens)

        if self.ngrams > 1:
            # Build the set of unique n-grams from the training set. The n-grams
            # are built over integer token sequences so that they match the
            # tuples looked up by add_ngram() at transform time.
            sequences = self.tokenizer.texts_to_sequences(tokens)
            ngram_set = set()
            for input_list in sequences:
                for i in range(2, self.ngrams + 1):
                    set_of_ngram = self.create_ngram_set(input_list, ngram_value=i)
                    ngram_set.update(set_of_ngram)

            # Map each n-gram to an integer id placed after the word vocabulary
            # to avoid collisions with existing token ids.
            start_index = self.num_words + 1
            token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
            indice_token = {token_indice[k]: k for k in token_indice}

            self.token_indice = token_indice
            self.indice_token = indice_token

            max_features = np.max(list(indice_token.keys())) + 1
            self.max_features = max_features

    def transform(self, data):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        tokens = [self.tokenize_fn(sent) for sent in data]
        tokens = self.tokenizer.texts_to_sequences(tokens)
        tokens = self.add_ngram(tokens, self.token_indice, self.ngrams)

        max_len = max(len(seq) for seq in tokens)
        if max_len > self.max_len:
            warnings.warn(
                'Max sequence length in this batch is %s, which exceeds the configured max length %s' %
                (max_len, self.max_len),
                UserWarning
            )

        tokens = pad_sequences(tokens, maxlen=self.max_len)
        return to_gpu(torch.LongTensor(tokens))
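# Pipeline sketch (illustrative only): fit() builds the word index plus the
# n-gram vocabulary, transform() returns padded LongTensors of word and
# n-gram ids suitable for a fastText-style bag-of-ngrams model. `corpus` is
# hypothetical; MAX_NUM_WORDS, MAX_SEQUENCE_LENGTH, Tokenizer, pad_sequences
# and to_gpu are assumed to be provided by this package.
if __name__ == '__main__':
    corpus = [
        'turn off the living room lights',
        'turn on the kitchen lights',
    ]
    featurizer = FastTextFeaturizer()
    featurizer.fit(corpus)
    batch = featurizer.transform(corpus)
    # batch: (num_sentences, MAX_SEQUENCE_LENGTH) tensor of word and n-gram ids
    print(batch.size())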
class BasicFeaturizer(IFeaturizer):

    def __init__(self, config=dict()):
        super(BasicFeaturizer, self).__init__()

        self.lower = config.get('lower', True)
        self.char_level = config.get('char_level', False)
        self.num_words = config.get(
            'num_words',
            n_letters + LM_CHAR_RESERVED if self.char_level else LM_VOCAB_SIZE
        )
        self.append_sos_eos = config.get('append_sos_eos', False)
        self.featurizer_seq_len = config.get('featurizer_seq_len', MAX_SEQUENCE_LENGTH)
        self.reserved_tokens = config.get('featurizer_reserved_tokens', [START_TAG, STOP_TAG, UNK_TAG])
        self.to_tensor = config.get('to_tensor', True)  # pad sequences and return tensors
        self.return_mask = config.get('return_mask', False)
        self.tokenize_fn = config.get('tokenize_fn', wordpunct_tokenize)

        self.tokenizer = Tokenizer(
            num_words=self.num_words,
            lower=self.lower,
            char_level=self.char_level,
            reserved_tokens=self.reserved_tokens
        )

    def get_output_shape(self):
        return (None,)

    def tokenize(self, data):
        if self.char_level:
            return data
        if self.append_sos_eos:
            return [[START_TAG] + self.tokenize_fn(sent) + [STOP_TAG] for sent in data]
        if isinstance(data[0], list):
            return data  # already tokenized
        return [self.tokenize_fn(sent) for sent in data]

    def fit(self, data):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(
                num_words=self.num_words,
                lower=self.lower,
                char_level=self.char_level,
                reserved_tokens=self.reserved_tokens
            )

        if self.char_level:
            print('Using char-level tokenizer')

        try:
            _ = (it for it in data)
            if len(data) < 1:
                return  # must have at least 1 item
        except:
            pass  # data is not an iterable

        self.tokenizer.fit_on_texts(self.tokenize(data))

    def transform(self, data, to_tensor=None, return_mask=None):
        try:
            _ = (it for it in data)
            if len(data) < 1:
                return  # must have at least 1 item
        except:
            return  # data is not an iterable

        tokens = self.tokenizer.texts_to_sequences(self.tokenize(data))

        _return_mask = (
            return_mask if return_mask is not None
            else (hasattr(self, 'return_mask') and self.return_mask)
        )
        _to_tensor = to_tensor if to_tensor is not None else self.to_tensor

        if not _to_tensor:
            return tokens

        # Pad or truncate every sequence to a common length and build the tensors
        lengths = [len(seq) for seq in tokens]
        max_seq_len = max(lengths)
        if self.featurizer_seq_len > 0:
            max_seq_len = min(max_seq_len, self.featurizer_seq_len)

        res = torch.zeros(len(tokens), max_seq_len).long()
        if _return_mask:
            mask = torch.zeros(len(tokens), max_seq_len).long()

        for idx, seq in enumerate(tokens):
            seq_len = min(max_seq_len, len(seq))
            res[idx, :seq_len] = torch.LongTensor(seq[:seq_len])
            if _return_mask:
                mask[idx, :seq_len] = 1

        if _return_mask:
            return res, mask
        return res

    def inverse_transform(self, data):
        retval = []
        batch_size = data.size(0)
        max_len = data.size(1)
        for ix in range(batch_size):
            retval.append([
                self.tokenizer.ix_to_word.get(int(data[ix, word_ix]), '')
                for word_ix in range(max_len)
            ])
        return retval
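# Usage sketch (illustrative only): shows padded id tensors plus mask, then the
# inverse mapping back to tokens. START_TAG/STOP_TAG/UNK_TAG, Tokenizer and the
# config keys are the ones referenced above; the sentences are hypothetical.
if __name__ == '__main__':
    featurizer = BasicFeaturizer({'append_sos_eos': True, 'return_mask': True})
    sentences = ['hello there', 'how are you today']
    featurizer.fit(sentences)
    ids, mask = featurizer.transform(sentences)
    print(ids.size(), mask.size())            # both (2, max_seq_len)
    print(featurizer.inverse_transform(ids))  # token lists, '' where padded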