import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tqdm import tqdm

import config
import eda
import utils_bert
import utils_common

# `tokenizer`, `model`, and `get_embedding` are not defined in this listing;
# see the hedged sketch below `get_split_train_embedding_dict`.


def get_augmented_data(train_txt_path, augmentation, alpha, n_aug=1):

    output_pkl_path = train_txt_path.parent.joinpath(f"train_aug_{augmentation}_data_{alpha}.pkl")

    if not output_pkl_path.exists():

        print(f"creating {output_pkl_path}")

        with open(train_txt_path, 'r') as f:
            lines = f.readlines()
        sentence_to_augmented_sentences = {}

        for line in lines:
            # each line is '<label>\t<sentence>'; line[:-1] strips the newline
            parts = line[:-1].split('\t')
            sentence = parts[1]
            if augmentation == 'swap':
                augmented_sentences = eda.get_swap_sentences(sentence, n_aug, alpha)
            elif augmentation == 'insert':
                augmented_sentences = eda.get_insert_sentences(sentence, n_aug, alpha)
            elif augmentation == 'delete':
                augmented_sentences = eda.get_delete_sentences(sentence, n_aug, alpha)
            else:
                raise ValueError(f"unknown augmentation: {augmentation}")
            sentence_to_augmented_sentences[sentence] = augmented_sentences

        utils_common.save_pickle(output_pkl_path, sentence_to_augmented_sentences)
    
    return utils_common.load_pickle(output_pkl_path)
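
# The `eda` helpers used above (random swap / insertion / deletion, in the
# style of Wei & Zou's EDA paper) are defined elsewhere. A minimal sketch of
# the swap variant, assuming `alpha` is the fraction of words to move and
# `n_aug` the number of augmented copies returned; not the original code:
#
#   import random
#
#   def get_swap_sentences(sentence, n_aug, alpha):
#       words = sentence.split()
#       n_swaps = max(1, int(alpha * len(words)))
#       augmented = []
#       for _ in range(n_aug):
#           new_words = words[:]
#           for _ in range(n_swaps):
#               i = random.randrange(len(new_words))
#               j = random.randrange(len(new_words))
#               new_words[i], new_words[j] = new_words[j], new_words[i]
#           augmented.append(' '.join(new_words))
#       return augmented

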
def get_x_y(txt_path, embedding_path):
    with open(txt_path) as f:
        lines = f.readlines()
    string_to_embedding = utils_common.load_pickle(embedding_path)

    # one row per example; 768 is the BERT-base hidden size
    x = np.zeros((len(lines), 768))
    y = np.zeros(len(lines))

    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = int(parts[0])
        string = parts[1]
        assert string in string_to_embedding
        embedding = string_to_embedding[string]
        x[i, :] = embedding
        y[i] = label

    x, y = shuffle(x, y, random_state=0)
    return x, y
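
# Example usage (hypothetical paths), fitting a simple probe on the embedding
# features returned above:
#
#   from sklearn.linear_model import LogisticRegression
#
#   x, y = get_x_y(txt_path, embedding_path)
#   clf = LogisticRegression(max_iter=1000).fit(x, y)

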
def get_split_train_embedding_dict(sentence_to_augmented_sentences,
                                   train_txt_path, augmentation, alpha):

    embeddings_dict_path = train_txt_path.parent.joinpath(
        f"train_aug_{augmentation}_embeddings_{alpha}.pkl")

    if not embeddings_dict_path.exists():

        print(f"creating {embeddings_dict_path}")

        string_to_embedding = {}

        for sentence, augmented_sentences in tqdm(
                sentence_to_augmented_sentences.items()):
            embedding = get_embedding(sentence, tokenizer, model)
            string_to_embedding[sentence] = embedding
            for augmented_sentence in augmented_sentences:
                aug_embedding = get_embedding(augmented_sentence, tokenizer,
                                              model)
                string_to_embedding[augmented_sentence] = aug_embedding

        utils_common.save_pickle(embeddings_dict_path, string_to_embedding)

    return utils_common.load_pickle(embeddings_dict_path)
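

# The definitions of `get_embedding`, `tokenizer`, and `model` were truncated
# in this listing (only a stray `return last_hidden_states` survived). Below is
# a minimal sketch, assuming a Hugging Face BERT-base encoder; the 768-dim
# vectors used elsewhere match its hidden size. Not the original implementation.
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()


def get_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state  # shape: (1, seq_len, 768)
    # mean-pool over tokens for a single fixed-size sentence vector
    return last_hidden_states.mean(dim=1).squeeze(0).numpy()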


def save_word_to_embedding_pickle(word_to_aoa, output_path):

    bert_vocab = tokenizer.get_vocab()
    print(
        f"{len(bert_vocab)} in bert vocab, {len(word_to_aoa)} in aoa or abstractness"
    )

    word_to_embedding = {}

    for word in tqdm(list(word_to_aoa.keys())):
        if word in bert_vocab:
            embedding = get_embedding(word, tokenizer, model)
            word_to_embedding[word] = embedding

    # path first, then object, matching the other save_pickle call sites
    utils_common.save_pickle(output_path, word_to_embedding)
    print(f"{len(word_to_embedding)} words saved")


if __name__ == "__main__":
    # word_to_aoa = utils_common.load_pickle(config.aoa_dict_path)
    # save_word_to_embedding_pickle(word_to_aoa, config.aoa_embedding_path)
    # word_to_embedding = utils_common.load_pickle(config.aoa_embedding_path)
    word_to_abstractness = utils_common.load_pickle(
        config.abstractness_dict_path)
    save_word_to_embedding_pickle(word_to_abstractness,
                                  config.abstractness_embedding_path)
    word_to_embedding = utils_common.load_pickle(
        config.abstractness_embedding_path)


def get_split_train_x_y(train_txt_path, train_subset, seed_num, setup, alpha):

    # map each training setup to its augmentation(s); '-mtl' marks the
    # multi-task variants, which use the auxiliary labels returned below
    setup_to_augmentations = {
        'swap': ['swap'],
        'delete': ['delete'],
        'insert': ['insert'],
        'swap-mtl': ['swap'],
        'delete-mtl': ['delete'],
        'insert-mtl': ['insert'],
        'three_aug': ['delete', 'insert', 'swap'],
        'three_aug-mtl': ['delete', 'insert', 'swap'],
        'vanilla': [],
    }
    augmentations = setup_to_augmentations[setup]

    big_dict_aug_sentences = {}
    big_dict_embeddings = {}
    for augmentation in augmentations:
        sentence_to_augmented_sentences = get_augmented_data(train_txt_path, augmentation, alpha)
        string_to_embedding = utils_bert.get_split_train_embedding_dict(sentence_to_augmented_sentences, train_txt_path, augmentation, alpha)
        big_dict_aug_sentences[augmentation] = sentence_to_augmented_sentences
        big_dict_embeddings[augmentation] = string_to_embedding

    sentence_to_label = get_sentence_to_label(train_txt_path)
    sentences = list(sentence_to_label.keys())
    labels = [sentence_to_label[sentence] for sentence in sentences]
    original_sentence_to_embedding = utils_common.load_pickle(
        train_txt_path.parent.joinpath("train_embeddings.pkl"))

    train_sentences, _, train_labels, _ = train_test_split(
        sentences, labels, train_size=train_subset, random_state=seed_num,
        stratify=labels)

    # get train_x_np
    train_x = []
    aug_train_x_dict = {augmentation: [] for augmentation in augmentations}

    for train_sentence in train_sentences:

        embedding = original_sentence_to_embedding[train_sentence]
        train_x.append(embedding)

        for augmentation in augmentations:
            sentence_to_augmented_sentences = big_dict_aug_sentences[augmentation]
            string_to_embedding = big_dict_embeddings[augmentation]
            # use the first augmented copy (n_aug defaults to 1)
            augmented_sentence = sentence_to_augmented_sentences[train_sentence][0]
            aug_embedding = string_to_embedding[augmented_sentence]
            aug_train_x_dict[augmentation].append(aug_embedding)
    
    for augmentation in augmentations:
        train_x += aug_train_x_dict[augmentation]

    train_x_np = np.asarray(train_x)

    # get train_y_np: augmented rows keep the label of their source sentence,
    # appended in the same order as train_x above
    train_labels_dup = list(train_labels)
    for _ in augmentations:
        train_labels_dup += train_labels
    train_y_np = np.asarray(train_labels_dup)

    # get train_y_aux: one auxiliary class per source (original + each augmentation)
    num_classes_aux = 1 + len(augmentations)
    train_labels_aux = []
    for y_aux in range(num_classes_aux):
        for _ in range(len(train_sentences)):
            train_labels_aux.append(y_aux)
    train_y_aux_np = np.asarray(train_labels_aux)

    return train_x_np, train_y_np, train_y_aux_np, num_classes_aux
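

# `get_sentence_to_label` is not shown in this listing; given the tab-separated
# '<label>\t<sentence>' format parsed above, presumably something like:
#
#   def get_sentence_to_label(txt_path):
#       sentence_to_label = {}
#       with open(txt_path) as f:
#           for line in f:
#               parts = line[:-1].split('\t')
#               sentence_to_label[parts[1]] = int(parts[0])
#       return sentence_to_label
#
# Example usage (hypothetical values): train on 100 original sentences plus one
# augmented copy per augmentation; `train_y_aux` labels each row by its source
# (0 = original, 1..n = each augmentation) for an auxiliary multi-task head.
#
#   train_x, train_y, train_y_aux, n_aux = get_split_train_x_y(
#       train_txt_path, train_subset=100, seed_num=0,
#       setup='three_aug-mtl', alpha=0.1)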