Example #1
"""
Adapted from SentEval/examples/bow.py
https://github.com/facebookresearch/SentEval/blob/master/examples/bow.py
"""
import sys
import io
import numpy as np
import argparse
import logging
from tools.Blogger import Blogger

logger = Blogger()

# Set PATHs
PATH_TO_SENTEVAL = "./SentEval"
PATH_TO_DATA = "./SentEval/data"

# import SentEval
sys.path.insert(0, PATH_TO_SENTEVAL)
import senteval


# Create dictionary
def create_dictionary(sentences, threshold=0):
    words = {}
    for s in sentences:
        for word in s:
            words[word] = words.get(word, 0) + 1

    if threshold > 0:
        # Drop words that appear fewer than `threshold` times.
        newwords = {}
        for word in words:
            if words[word] >= threshold:
                newwords[word] = words[word]
        words = newwords

    # Reserve the highest ids for the special tokens used by SentEval.
    words['<s>'] = 1e9 + 4
    words['</s>'] = 1e9 + 3
    words['<p>'] = 1e9 + 2

    # Sort by frequency (descending) and assign ids.
    sorted_words = sorted(words.items(), key=lambda x: -x[1])
    id2word = []
    word2id = {}
    for i, (w, _) in enumerate(sorted_words):
        id2word.append(w)
        word2id[w] = i

    return id2word, word2id
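

# In the original bow.py this dictionary feeds a `prepare`/`batcher` callback
# pair that SentEval calls for every task. A minimal sketch of the batcher,
# assuming `params.word_vec` (token -> vector) and `params.wvec_dim` are set
# by a `prepare` function not shown in this excerpt:
def batcher(params, batch):
    # Replace empty sentences so every entry yields a vector.
    batch = [sent if sent != [] else ["."] for sent in batch]
    embeddings = []
    for sent in batch:
        sentvec = [params.word_vec[w] for w in sent if w in params.word_vec]
        if not sentvec:
            sentvec.append(np.zeros(params.wvec_dim))
        # Bag-of-words sentence embedding: mean of its word vectors.
        embeddings.append(np.mean(sentvec, 0))
    return np.vstack(embeddings)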
Example #2
from lib.ProcessEmbeddings import WordEmbeddings
from tools.Blogger import Blogger
import os

logger = Blogger()
CLASSIFICATION_TASKS = ["MR", "CR", "SUBJ", "MPQA", "SST5", "TREC", "MRPC"]
SIMILARITY_TASKS = [
    "SICKRelatedness", "STS12", "STS13", "STS14", "STS15", "STS16"
]

if __name__ == "__main__":
    WE = WordEmbeddings(vector_file="embeds/glove.6B.300d.txt")
    WE.sparsify("ckpt/glove3000/ckpt-8000")
    WE.subtract_mean()
    logger.status_update("Running SentEval tasks...")
    WE.SentEval(
        tasks=CLASSIFICATION_TASKS,
        save_summary=True,
        summary_file_name="glove_wta_3000.json",
    )
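
    # SIMILARITY_TASKS is declared above but unused; assuming WE.SentEval
    # accepts the same keyword arguments for the similarity benchmarks, they
    # could be run the same way (the summary file name here is illustrative):
    WE.SentEval(
        tasks=SIMILARITY_TASKS,
        save_summary=True,
        summary_file_name="glove_wta_3000_similarity.json",
    )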
"""
Run in Google Colab.
"""
from lib.ProcessEmbeddings import WordEmbeddings
from tools.Blogger import Blogger
from collections import defaultdict
logger = Blogger()
BINARY_CLASSIFICATION_TASKS = ["MR", "CR", "SUBJ", "MPQA", "SST2"]
MULTICLASS_CLASSIFICATION_TASKS = ["SST5", "TREC"]
CLASSIFICATION_TASKS = ["MR", "CR", "SUBJ", "MPQA", "SST2", "SST5", "TREC"]
SIMILARITY_TASKS = ['STS12', 'STS13', 'STS14']
ALL_TASKS = BINARY_CLASSIFICATION_TASKS + MULTICLASS_CLASSIFICATION_TASKS + SIMILARITY_TASKS
# Classifier settings passed straight to SentEval: nhid=0 trains a logistic
# regression on top of the sentence embeddings, tenacity is the early-stopping
# patience, and epoch_size controls how many passes are made per epoch.
PRODUCTION_CONFIG = {
    "usepytorch": True,
    "kfold": 5,
    "nhid": 0,
    "optim": "rmsprop",
    "batch_size": 128,
    "tenacity": 3,
    "epoch_size": 2,
}


def glove(output_dir, dims, senteval_config):
    summary_file_name = f"{output_dir}/glove_{dims}.json"
    WE = WordEmbeddings(vector_file=f"embeds/glove.6B.{dims}d.txt")
    # Default GloVe baseline
    WE.evaluate(tasks=CLASSIFICATION_TASKS,
                save_summary=True,
                summary_file_name=summary_file_name,
                overwrite_file=True,
                # The original listing is truncated here; presumably the
                # SentEval config is forwarded as well, e.g.
                # senteval_config=senteval_config.
                )
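

# A hedged usage sketch (not part of the original listing): driving glove()
# with the config above; the output directory name is illustrative.
if __name__ == "__main__":
    for dims in (100, 300):
        glove(output_dir="results", dims=dims, senteval_config=PRODUCTION_CONFIG)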