import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge

# NOTE: flatten() and data_from_idxs() are helper functions assumed to be
# defined elsewhere in this repo.


def tokenize_from_optimized_dict(tokenization_dict, vecs):
    """Tokenize each input using the learned character/pair values.

    Every character except the last contributes its own value plus the
    value of the pair it forms with the following character; the final
    character contributes its own value only.
    """
    tokenized_vecs = []
    for vec in vecs:
        vec = flatten(vec)
        tokenized_vec = []
        for idx in range(len(vec) - 1):
            tokenized_vec.append(tokenization_dict[vec[idx]]
                                 + tokenization_dict[vec[idx] + vec[idx + 1]])
        tokenized_vec.append(tokenization_dict[vec[-1]])
        tokenized_vecs.append(tokenized_vec)
    return np.array(tokenized_vecs)

def find_idx_dict(inputs):
    """Assign a unique feature index to every character and to every
    adjacent character pair (coupling) found in the inputs."""
    char_set = set()
    coupling_set = set()
    for vec in inputs:
        vec = flatten(vec)
        for idx in range(len(vec)):
            char_set.add(vec[idx])
            # The last character has no right-hand neighbour.
            if idx + 1 < len(vec):
                coupling_set.add(vec[idx] + vec[idx + 1])
    idx_dict = {}
    # Characters occupy indices 0..len(char_set)-1; pairs come after.
    for idx, char in enumerate(char_set):
        idx_dict[char] = idx
    for idx, chars in enumerate(coupling_set):
        idx_dict[chars] = idx + len(char_set)
    return idx_dict

def find_costume_dict(inputs, labels, idx_dict):
    """Optimize parameters for the custom tokenization dict."""
    # Build the design matrix: each row counts how often every character
    # and every adjacent character pair occurs in one input.
    X = []
    for vec in inputs:
        x = np.zeros(len(idx_dict))
        vec = flatten(vec)
        for idx in range(len(vec)):
            x[idx_dict[vec[idx]]] += 1
            if idx + 1 < len(vec):
                x[idx_dict[vec[idx] + vec[idx + 1]]] += 1
        X.append(x)
    # 'LinearModel' in the original appears to be a ridge-style regressor;
    # sklearn's Ridge is assumed here. With no intercept, every learned
    # coefficient maps directly to one character or pair.
    reg = Ridge(alpha=0.05, fit_intercept=False)
    reg.fit(X, labels)
    # idx_dict values equal the insertion order of its keys, so zipping
    # the keys with coef_ pairs each key with its own coefficient.
    return {key: c for key, c in zip(idx_dict.keys(), reg.coef_)}

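# A minimal usage sketch of the custom-tokenization pipeline above. The
# inputs and labels are hypothetical, and the sketch assumes the repo's
# flatten() helper passes already-flat strings through unchanged.
def _demo_custom_tokenization():
    inputs = ['abc', 'bca', 'cab', 'abc']
    labels = [1.0, 2.0, 3.0, 1.0]
    idx_dict = find_idx_dict(inputs)                        # chars + pairs -> indices
    tok_dict = find_costume_dict(inputs, labels, idx_dict)  # keys -> learned values
    tokens = tokenize_from_optimized_dict(tok_dict, inputs)
    print(tokens.shape)  # (4, 3): one learned value per character position
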
def predict(data, idxs, model):
    inputs = np.array(data_from_idxs(data, idxs, 'inputs'))
    # KernelRidge expects 2-D input, so flatten any higher-rank samples.
    if isinstance(model, KernelRidge) and len(inputs[0].shape) > 1:
        inputs = [flatten(x) for x in inputs]
    return model.predict(inputs)

def gen_word_tokenization_dict(vecs):
    """Generate the standard word tokenization dictionary: map each
    character in the inputs to a unique integer index."""
    char_set = set(flatten(vecs))
    return {char: i for i, char in enumerate(char_set)}

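# Minimal sketch of the standard dictionary (hypothetical inputs; the
# exact indices depend on set iteration order):
#
#   >>> gen_word_tokenization_dict(['ab', 'bc'])
#   {'a': 0, 'b': 1, 'c': 2}
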
def KRR_model(train_inputs, train_labels, model_alpha=0.01, kernel='rbf'):
    """Fit a kernel ridge regression model on the (flattened) inputs."""
    model = KernelRidge(alpha=model_alpha, kernel=kernel)
    # KernelRidge expects 2-D training data, so flatten higher-rank samples.
    if len(train_inputs[0].shape) > 1:
        train_inputs = [flatten(s) for s in train_inputs]
    # sklearn estimators are trained with fit(); KernelRidge has no train().
    model.fit(train_inputs, train_labels)
    return model

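# Hypothetical end-to-end sketch: fit kernel ridge regression on the custom
# tokens and predict a held-out input. model.predict is called directly
# rather than through predict(), since data_from_idxs (a repo helper) is
# not shown here; inputs and labels are illustrative only.
def _demo_krr():
    inputs = ['abc', 'bca', 'cab', 'abc']
    labels = [1.0, 2.0, 3.0, 1.0]
    idx_dict = find_idx_dict(inputs)
    tok_dict = find_costume_dict(inputs, labels, idx_dict)
    tokens = tokenize_from_optimized_dict(tok_dict, inputs)
    model = KRR_model(tokens[:3], np.array(labels[:3]), model_alpha=0.01)
    print(model.predict(tokens[3:]))  # prediction for the held-out input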