def evaluate_vectors(vector_file, output_dir, prototype_config=True):
    output_dir = output_dir.rstrip("/")  # normalize away any trailing slash
    if prototype_config:
        senteval_config = None  # fall back to SentEval's defaults
    else:
        senteval_config = PRODUCTION_CONFIG
    # Standard Glove vectors first
    for dim in [
            50,
            100,
            #150,
            200
    ]:
        glove(output_dir, dim, senteval_config)
    summary_file_name = f"{output_dir}/glove_300.json"
    WE = WordEmbeddings(vector_file=vector_file)
    WE.evaluate(tasks=CLASSIFICATION_TASKS,
                save_summary=True,
                summary_file_name=summary_file_name,
                overwrite_file=True,
                senteval_config=senteval_config)
    for dim in [50, 100, 150, 200]:
        WE = algo_n(WE, output_dir, dim, senteval_config)
        WE = shap_algo(WE, output_dir, dim, senteval_config)
        WE = shap_ppe(WE, output_dir, dim, senteval_config)
        WE = shap_(WE, output_dir, dim, senteval_config)
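
# Hypothetical invocation (sketch): both paths below are assumed from
# elsewhere in this file, not taken from the original call site.
# evaluate_vectors("embeds/glove.6B.300d.txt",
#                  "summary/SHAP/production",
#                  prototype_config=False)
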
def glove(output_dir, dims, senteval_config):
    summary_file_name = f"{output_dir}/glove_{dims}.json"
    WE = WordEmbeddings(vector_file=f"embeds/glove.6B.{dims}d.txt")
    # Default Glove
    WE.evaluate(tasks=CLASSIFICATION_TASKS,
                save_summary=True,
                summary_file_name=summary_file_name,
                overwrite_file=True,
                senteval_config=senteval_config)
def get_top_shap_grams(task, k, n):
    WE = WordEmbeddings(vector_file='embeds/glove.6B.300d.txt',
                        is_word2vec=False)
    # return the result instead of silently discarding it
    return WE.top_ngrams_per_class(task=task, k=k, n=n)
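
# Hypothetical usage (task/k/n values assumed, not from the source):
# top-10 SHAP bigrams per class for one of the CLASSIFICATION_TASKS used above.
# grams = get_top_shap_grams(task="SST5", k=10, n=2)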
Example #4
from lib.ProcessEmbeddings import WordEmbeddings
from tools.Blogger import Blogger
import os

logger = Blogger()
CLASSIFICATION_TASKS = ["MR", "CR", "SUBJ", "MPQA", "SST5", "TREC", "MRPC"]
SIMILARITY_TASKS = [
    "SICKRelatedness", "STS12", "STS13", "STS14", "STS15", "STS16"
]

if __name__ == "__main__":
    WE = WordEmbeddings(vector_file="embeds/glove.6B.300d.txt")
    WE.sparsify("ckpt/glove3000/ckpt-8000")
    WE.subract_mean()
    logger.status_update("Running SentEval tasks...")
    WE.SentEval(
        tasks=CLASSIFICATION_TASKS,
        save_summary=True,
        summary_file_name="glove_wta_3000.json",
    )
Example #5
# NOTE: the head of this import statement is truncated in the source; the
# names below come from an unnamed data-preparation module.
    raw_X_test,
    raw_y_test,
    raw_X_val,
    raw_y_val,
    LABELS,
    create_dataset,
    LABEL_TO_IX,
    get_vocab,
    IX_TO_LABEL,
    train_dataset,
    val_dataset,
    tokenized_sentence,
)
import numpy as np  # needed by to_one_hot below

from lib.ProcessEmbeddings import WordEmbeddings

WE = WordEmbeddings(vector_file="embeds/glove.6B/glove.6B.300d.txt")


def predict(clf, text):
    return clf.predict(
        WE.get_sentence_vector(text.lower().split(), vector_dict).reshape(1, -1)
    )[0]
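
# Hypothetical usage, assuming clf is a scikit-learn classifier already
# fitted on sentence vectors built the same way:
# label = predict(clf, "An utterly charming film")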


raw_X = raw_X_test + raw_X_train


def to_one_hot(raw_y, LABELS):
    y = np.zeros((len(raw_y), len(LABELS)))
    for ix, datapoint in enumerate(raw_y):
        for label in datapoint:
            # multi-hot: flag every label attached to this datapoint
            y[ix][LABEL_TO_IX[label]] = 1
    return y
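
# Shape sketch with made-up labels, assuming LABEL_TO_IX = {"pos": 0, "neg": 1}:
# to_one_hot([["pos"], ["pos", "neg"]], ["pos", "neg"])
# -> array([[1., 0.],
#           [1., 1.]])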
Example #6
from lib.ProcessEmbeddings import WordEmbeddings
from tools.Blogger import Blogger
import os

logger = Blogger()
CLASSIFICATION_TASKS = ["MR", "CR", "SUBJ", "MPQA", "SST5", "TREC", "MRPC"]
SIMILARITY_TASKS = [
    "SICKRelatedness", "STS12", "STS13", "STS14", "STS15", "STS16"
]
ALL_TASKS = CLASSIFICATION_TASKS + SIMILARITY_TASKS

if __name__ == "__main__":
    if not os.path.exists("embeds/glove_algo150.txt"):
        WE = WordEmbeddings(vector_file="embeds/glove.6B.300d.txt")
        # PPE
        WE.subract_mean()
        WE.pca_fit()
        WE.remove_top_components(k=7)

        # PCA dim reduction
        WE.subract_mean()
        WE.pca_fit_transform(output_dims=150)

        # PPE
        WE.subract_mean()
        WE.pca_fit()
        WE.remove_top_components(k=7)
        WE.save_vectors("embeds/glove_algo150.txt")
        logger.status_update("Running SentEval tasks...")

        WE.evaluate(tasks=ALL_TASKS,  # kwarg matches the other evaluate() calls
                    save_summary=True,
                    # summary filename is assumed; the call is truncated in the source
                    summary_file_name="glove_algo150.json")
Example #7
if __name__ == "__main__":
    get_avg_ranks(summary_dir="summary/SHAP/production")
    get_score_table(summary_dir="summary/SHAP/production")




### Averaging Random Data Results
import statistics
from collections import defaultdict
import pickle
with open("summary/rand_data.pkl", "rb") as f:
    rand_dims = pickle.load(f)
mean_scores = defaultdict(lambda: defaultdict(float))  # fix: lambda needs a colon
for dim, tasks in rand_dims.items():
    for task_name, scores in tasks.items():
        # record the mean inside the inner loop, once per (dim, task) pair
        mean_scores[dim][task_name] = statistics.mean(scores)
mean_scores[50]["task"]  # inspect one entry; "task" stands in for a real task name
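# Assumed structure of rand_dims (the pickle's contents are not shown here):
#   {50: {"MR": [score_run1, score_run2, ...], "CR": [...]}, 100: {...}, ...}
# so mean_scores[50]["MR"] is the mean score of 50-dim random vectors on MR.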





### Analyzing specific examples
from lib.ProcessEmbeddings import WordEmbeddings
WE = WordEmbeddings(vector_file="embeds/glove.6B.300d.txt", normalize_on_load=True)
out1 = WE.analyze_sentence("CR", "and supply those stupid white headphones .")
out2 = WE.analyze_sentence("SUBJ", "The movie is about children who learn to fly. It is honestly quite ridiculous.", k=50)
Example #8
import numpy as np
import torch
import torch.nn as nn

from lib.ProcessEmbeddings import WordEmbeddings
# get_vocab and train_dataset come from a data module truncated out of this snippet

BATCH_SIZE = 1
DEVICE = "cpu"
vocab, id2tok, tok2id = get_vocab(train_dataset)
VOCAB_SIZE = len(vocab)
EMBED_DIM = 300
HIDDEN_SIZE = 32
ALPHA = 0.003
NUM_EPOCHS = 15
THRESHOLD = 0.4
POS_LOSS_WEIGHT = 1.5
DROPOUT = 0.6
USE_GLOVE = True
#################################################

if USE_GLOVE:
    WE = WordEmbeddings(vector_file="embeds/glove.6B/glove.6B.300d.txt")


def get_embed_weights(vocab, tok2id):
    # start from random-normal vectors, then overwrite the rows GloVe covers
    embeddings_matrix = np.asarray(
        np.random.normal(0, 0.9, (len(vocab), 300)), dtype="float32"
    )
    vector_dict = WE.get_vector_dict()
    for tok in vocab:
        i = tok2id[tok]
        if tok in vector_dict:
            embeddings_matrix[i] = vector_dict[tok]
    return torch.from_numpy(embeddings_matrix)
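
# A minimal sketch of plugging the matrix into the network; freeze=False
# keeps the GloVe-initialized rows trainable during fine-tuning:
# weights = get_embed_weights(vocab, tok2id)
# embedding = nn.Embedding.from_pretrained(weights, freeze=False)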


class LSTMClassifier(nn.Module):