Example #1
# Shared imports for the examples below; the remaining helpers
# (check_input_language, LanguageTokenizer, cos_sim, get_model and
# get_sentence_similarity) are defined elsewhere in the same package,
# and get_embedding_vectors appears as Example #4 below.
import random
from math import ceil
from pathlib import Path

import numpy as np
import torch
from torch import Tensor
from fastai.text import defaults, load_learner  # fastai v1 API
# assumed source of the cosine_similarity used in Example #3:
from sklearn.metrics.pairwise import cosine_similarity


def get_similar_sentences(sen: str, no_of_variations: int, language_code: str):
    check_input_language(language_code)
    # get embedding vectors for sen
    tok = LanguageTokenizer(language_code)
    token_ids = tok.numericalize(sen)
    embedding_vectors = get_embedding_vectors(sen, language_code)
    # get learner
    defaults.device = torch.device('cpu')
    path = Path(__file__).parent
    learn = load_learner(path / 'models' / f'{language_code}')
    encoder = get_model(learn.model)[0]
    embeddings = encoder.state_dict()['encoder.weight']
    embeddings = np.array(embeddings)
    # cos similarity of vectors
    scores = []
    for word_vec in embedding_vectors:
        scores.append([cos_sim(word_vec, embdg) for embdg in embeddings])
    word_ids = [np.argpartition(-np.array(score), no_of_variations)[:no_of_variations] for score in scores]
    new_token_ids = []
    # generating more variations than required so that we can then filter out the best ones
    no_of_vars_per_token = ceil(no_of_variations/len(token_ids))*3
    for i in range(len(token_ids)):
        word_ids_list = word_ids[i].tolist()
        # the token itself scores highest against its own embedding, so drop
        # it from the candidate list if present
        if token_ids[i] in word_ids_list:
            word_ids_list.remove(token_ids[i])
        for j in range(no_of_vars_per_token):
            new_token_ids.append(token_ids[:i] + word_ids_list[j:j+1] + token_ids[i+1:])
    new_sens = [tok.textify(tok_id) for tok_id in new_token_ids]
    while sen in new_sens:
        new_sens.remove(sen)
    sen_with_sim_score = [(new_sen, get_sentence_similarity(sen, new_sen, language_code)) for new_sen in new_sens]
    sen_with_sim_score.sort(key=lambda x: x[1], reverse=True)
    new_sens = [sen for sen, _ in sen_with_sim_score]
    return new_sens[:no_of_variations]
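A minimal usage sketch for the function above; the language code 'hi' (Hindi) and the input sentence are assumptions, and the corresponding pretrained model must already be present under models/:

# Hypothetical call: ask for 3 paraphrase-like variations of a Hindi sentence.
similar = get_similar_sentences('मुझे किताबें पढ़ना पसंद है', 3, 'hi')
for s in similar:
    print(s)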
Example #2
def get_sentence_encoding(input: str, language_code: str):
    check_input_language(language_code)
    tok = LanguageTokenizer(language_code)
    token_ids = tok.numericalize(input)
    # get learner
    defaults.device = torch.device('cpu')
    path = Path(__file__).parent
    learn = load_learner(path / 'models' / f'{language_code}')
    model = learn.model
    # run the encoder (model[0]) over the token ids; its output holds the
    # hidden states of every LSTM layer at every timestep
    encoder_output = model[0](Tensor([token_ids]).to(torch.int64))
    # last layer -> first (only) batch element -> hidden state of the last token
    return np.array(encoder_output[0][-1][0][-1])
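A usage sketch for get_sentence_encoding under the same assumptions (language code 'hi', model already downloaded); it returns a single NumPy vector for the whole sentence, namely the final layer's hidden state at the last token:

encoding = get_sentence_encoding('यह एक वाक्य है', 'hi')
print(encoding.shape)  # (hidden_size,); the exact width depends on the trained model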
Example #3
def get_similar_sentences(sen: str,
                          no_of_variations: int,
                          language_code: str,
                          degree_of_aug: float = 0.1):
    check_input_language(language_code)
    # get embedding vectors for sen
    tok = LanguageTokenizer(language_code)
    token_ids = tok.numericalize(sen)
    embedding_vectors = get_embedding_vectors(sen, language_code)
    # get learner
    defaults.device = torch.device('cpu')
    path = Path(__file__).parent
    learn = load_learner(path / 'models' / f'{language_code}')
    encoder = get_model(learn.model)[0]
    encoder.reset()
    embeddings = encoder.state_dict()['encoder.weight']
    embeddings = np.array(embeddings)
    # cosine similarity of every token vector against the full vocabulary,
    # computed in one vectorised call
    scores = cosine_similarity(embedding_vectors, embeddings)
    word_ids = [
        np.argpartition(-np.array(score),
                        no_of_variations + 1)[:no_of_variations + 1]
        for score in scores
    ]
    word_ids = [ids.tolist() for ids in word_ids]
    for i, ids in enumerate(word_ids):
        word_ids[i] = [wid for wid in word_ids[i] if wid != token_ids[i]]
    # generating more variations than required so that we can then filter out the best ones
    buffer_multiplicity = 2
    new_sen_tokens = []
    for i in range(no_of_variations):
        for k in range(buffer_multiplicity):
            new_token_ids = []
            ids = sorted(
                random.sample(range(len(token_ids)),
                              max(1, int(degree_of_aug * len(token_ids)))))
            for j in range(len(token_ids)):
                if j in ids:
                    new_token_ids.append(word_ids[j][(i + k) %
                                                     len(word_ids[j])])
                else:
                    new_token_ids.append(token_ids[j])
            new_token_ids = [int(tok) for tok in new_token_ids]
            new_sen_tokens.append(new_token_ids)
    new_sens = [tok.textify(sen_tokens) for sen_tokens in new_sen_tokens]
    while sen in new_sens:
        new_sens.remove(sen)
    sen_with_sim_score = [(new_sen,
                           get_sentence_similarity(sen, new_sen,
                                                   language_code))
                          for new_sen in new_sens]
    sen_with_sim_score.sort(key=lambda x: x[1], reverse=True)
    new_sens = [sen for sen, _ in sen_with_sim_score]
    return new_sens[:no_of_variations]
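This variant differs from Example #1 by scoring all tokens against the vocabulary in one vectorised cosine_similarity call and by replacing a random subset of tokens whose size is set by degree_of_aug. A hedged usage sketch (sentence and language code are again assumptions):

# Replace roughly 30% of the tokens per variation instead of the default 10%.
similar = get_similar_sentences('मुझे किताबें पढ़ना पसंद है', 5, 'hi',
                                degree_of_aug=0.3)
print(similar)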
Example #4
def get_embedding_vectors(input: str, language_code: str):
    check_input_language(language_code)
    tok = LanguageTokenizer(language_code)
    token_ids = tok.numericalize(input)
    # get learner
    defaults.device = torch.device('cpu')
    path = Path(__file__).parent
    learn = load_learner(path / 'models' / f'{language_code}')
    encoder = get_model(learn.model)[0]
    embeddings = encoder.state_dict()['encoder.weight']
    embeddings = np.array(embeddings)
    # look up one embedding row per token id
    embedding_vectors = [embeddings[token] for token in token_ids]
    return embedding_vectors
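A usage sketch for get_embedding_vectors (same assumptions as above); it returns one embedding vector per token:

vectors = get_embedding_vectors('यह एक वाक्य है', 'hi')
print(len(vectors))      # number of tokens in the input
print(vectors[0].shape)  # embedding width of the trained model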
Example #5
def tokenize(input: str, language_code: str):
    check_input_language(language_code)
    tok = LanguageTokenizer(language_code)
    output = tok.tokenizer(input)
    return output
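A usage sketch for tokenize (assumed language code 'hi'); the output is whatever token list the language-specific tokenizer produces:

tokens = tokenize('यह एक वाक्य है', 'hi')
print(tokens)  # e.g. subword tokens, depending on how the tokenizer was trained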
Example #6
def remove_foreign_languages(input: str, host_language_code: str):
    check_input_language(host_language_code)
    tok = LanguageTokenizer(host_language_code)
    output = tok.remove_foreign_tokens(input)
    return output
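A usage sketch for remove_foreign_languages; the mixed-language input is an assumption, and exactly how foreign tokens are marked or dropped depends on the tokenizer's remove_foreign_tokens implementation:

cleaned = remove_foreign_languages('यह एक sentence है', 'hi')
print(cleaned)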