Example #1
0
def tokenize_from_optimized_dict(tokenization_dict, vecs):
    '''Convert each vector into a list of values looked up in a dictionary.

    Every vector is first passed through ``flatten``. Each element except
    the last contributes ``tokenization_dict[element]`` plus the entry stored
    under ``element + next_element``; the last element contributes only its
    own entry.

    Args:
        tokenization_dict: mapping indexed both by single elements and by
            the ``+``-combination of two adjacent elements.
        vecs: iterable of vectors accepted by ``flatten``.

    Returns:
        The result of ``np.array`` applied to the list of per-vector lists.
    '''
    encoded = []
    for vec in vecs:
        tokens = flatten(vec)
        row = [
            tokenization_dict[tokens[pos]]
            + tokenization_dict[tokens[pos] + tokens[pos + 1]]
            for pos in range(len(tokens) - 1)
        ]
        # The last element has no right-hand neighbour, so only its own
        # entry is used (an empty sequence fails here on tokens[-1]).
        row.append(tokenization_dict[tokens[-1]])
        encoded.append(row)
    return np.array(encoded)
Example #2
0
def find_idx_dict(inputs):
    '''Assign an integer index to every element and adjacent-pair combination.

    Each vector in ``inputs`` is passed through ``flatten``. Two sets are
    collected: the individual elements, and ``element + next_element`` for
    every pair of neighbours. Individual elements are numbered from 0 in the
    set's iteration order; pair combinations are numbered after them,
    starting at the number of distinct individual elements.

    Args:
        inputs: iterable of vectors accepted by ``flatten``.

    Returns:
        dict mapping each element / pair combination to its integer index.
        If a pair combination equals an individual element, the pair's index
        replaces the element's index under that key.
    '''
    singles = set()
    pairs = set()
    for vec in inputs:
        tokens = flatten(vec)
        final_pos = len(tokens) - 1
        for pos, token in enumerate(tokens):
            singles.add(token)
            # The final element has no successor to combine with.
            if pos < final_pos:
                pairs.add(token + tokens[pos + 1])

    index_of = {token: number for number, token in enumerate(singles)}
    offset = len(singles)
    for number, combined in enumerate(pairs):
        index_of[combined] = offset + number
    return index_of
Example #3
0
def find_costume_dict(inputs, labels, idx_dict):
    '''Fit a linear model on token counts and return one weight per key.

    For every vector in ``inputs`` (after ``flatten``) a count vector is
    built: position ``idx_dict[element]`` is incremented for each element and
    position ``idx_dict[element + next_element]`` for each pair of
    neighbours. ``LinearModel(fit_intercept=False, alpha=0.05)`` is then
    fitted on these count vectors against ``labels``.

    Args:
        inputs: iterable of vectors accepted by ``flatten``.
        labels: regression targets, passed unchanged to ``reg.fit``.
        idx_dict: mapping from element / pair combination to the column
            index of its count (e.g. the output of ``find_idx_dict``).

    Returns:
        dict mapping every key of ``idx_dict`` to ``reg.coef_`` at that key's
        column index.
        NOTE(review): assumes ``reg.coef_`` is indexable per feature column
        (1-D targets) -- confirm against ``LinearModel``.
    '''
    # Size the count vector by the largest stored index rather than by
    # len(idx_dict): a mapping whose index range has gaps would otherwise
    # address a column past the end of the vector.
    n_features = max(idx_dict.values()) + 1 if idx_dict else 0

    # generating X data
    X = []
    for vec in inputs:
        tokens = flatten(vec)
        n_tokens = len(tokens)
        counts = np.zeros(n_features)
        for pos in range(n_tokens):
            counts[idx_dict[tokens[pos]]] += 1
            # Only the end of the vector is skipped; the explicit check keeps
            # genuine indexing errors from being silently swallowed.
            if pos + 1 < n_tokens:
                counts[idx_dict[tokens[pos] + tokens[pos + 1]]] += 1
        X.append(counts)

    reg = LinearModel(fit_intercept=False, alpha=0.05)
    reg.fit(X, labels)

    # Look each coefficient up by the key's own column instead of relying on
    # dict iteration order matching the stored indices.
    return {key: reg.coef_[column] for key, column in idx_dict.items()}
Example #4
0
def predict(data, idxs, model):
    '''Run ``model.predict`` on the inputs selected from ``data`` by ``idxs``.

    The inputs are fetched with ``data_from_idxs(data, idxs, 'inputs')`` and
    wrapped in ``np.array``. When ``model`` is a ``KernalRidge`` instance and
    the first sample has more than one dimension, every sample is passed
    through ``flatten`` before prediction.

    Args:
        data: data container understood by ``data_from_idxs``.
        idxs: selection of samples, forwarded to ``data_from_idxs``.
        model: object exposing ``predict``.

    Returns:
        Whatever ``model.predict`` returns for the prepared inputs.
    '''
    inputs = np.array(data_from_idxs(data, idxs, 'inputs'))
    # Short-circuit keeps the shape probe from running for other model types.
    needs_flattening = (isinstance(model, KernalRidge)
                        and len(inputs[0].shape) > 1)
    if needs_flattening:
        inputs = [flatten(sample) for sample in inputs]
    return model.predict(inputs)
Example #5
0
def gen_word_tokenization_dict(vecs):
    '''Build a dictionary giving each distinct element an integer index.

    The distinct items yielded by ``flatten(vecs)`` are collected in a set
    and numbered from 0 in the set's iteration order.

    Args:
        vecs: collection of vectors accepted by ``flatten``.

    Returns:
        dict mapping each distinct element to its index.
    '''
    vocabulary = set()
    for token in flatten(vecs):
        vocabulary.add(token)
    return {token: position for position, token in enumerate(vocabulary)}
Example #6
0
def KRR_model(train_inputs, train_labels, model_alpha=0.01, kernel='rbf'):
    '''Create a ``KernalRidge`` model and train it on the given data.

    Args:
        train_inputs: indexable collection of samples; the first sample must
            expose ``.shape``. If that sample has more than one dimension,
            every sample is passed through ``flatten`` before training.
        train_labels: training targets, forwarded to ``model.train``.
        model_alpha: first positional argument of ``KernalRidge``.
        kernel: forwarded to ``KernalRidge`` as the ``kernel`` keyword.

    Returns:
        The trained ``KernalRidge`` instance.
    '''
    regressor = KernalRidge(model_alpha, kernel=kernel)
    first_sample_rank = len(train_inputs[0].shape)
    if first_sample_rank > 1:
        train_inputs = [flatten(sample) for sample in train_inputs]
    regressor.train(train_inputs, train_labels)
    return regressor