""" Adapted from SentEval/examples/bow.py https://github.com/facebookresearch/SentEval/blob/master/examples/bow.py """ import sys import io import numpy as np import argparse import logging from tools.Blogger import Blogger logger = Blogger() # Set PATHs PATH_TO_SENTEVAL = "./SentEval" PATH_TO_DATA = "./SentEval/data" # import SentEval sys.path.insert(0, PATH_TO_SENTEVAL) import senteval # Create dictionary def create_dictionary(sentences, threshold=0): words = {} for s in sentences: for word in s: words[word] = words.get(word, 0) + 1 if threshold > 0: newwords = {}
"""Run SentEval classification tasks on GloVe vectors after a sparsify step.

When executed as a script this loads ``embeds/glove.6B.300d.txt`` into a
``WordEmbeddings`` object, calls ``sparsify`` with a checkpoint path, calls
``subract_mean``, and then runs ``SentEval`` over ``CLASSIFICATION_TASKS``,
requesting that a summary be saved under ``glove_wta_3000.json``.
"""
import os

from lib.ProcessEmbeddings import WordEmbeddings
from tools.Blogger import Blogger

logger = Blogger()

# Task names handed to WordEmbeddings.SentEval by the driver below.
CLASSIFICATION_TASKS = [
    "MR",
    "CR",
    "SUBJ",
    "MPQA",
    "SST5",
    "TREC",
    "MRPC",
]

# Declared for convenience; the driver below does not pass this list.
SIMILARITY_TASKS = [
    "SICKRelatedness",
    "STS12",
    "STS13",
    "STS14",
    "STS15",
    "STS16",
]

if __name__ == "__main__":
    WE = WordEmbeddings(vector_file="embeds/glove.6B.300d.txt")

    # Order matters: sparsify from the checkpoint first, then mean handling.
    WE.sparsify("ckpt/glove3000/ckpt-8000")
    # NOTE(review): method name is spelled "subract_mean" here; it must match
    # the definition in lib.ProcessEmbeddings -- confirm before renaming.
    WE.subract_mean()

    logger.status_update("Running SentEval tasks...")
    WE.SentEval(
        tasks=CLASSIFICATION_TASKS,
        save_summary=True,
        summary_file_name="glove_wta_3000.json",
    )
""" Run in Google Colab. """ from lib.ProcessEmbeddings import WordEmbeddings from tools.Blogger import Blogger from collections import defaultdict logger = Blogger() BINARY_CLASSIFICATION_TASKS = ["MR", "CR", "SUBJ", "MPQA", "SST2"] MULTICLASS_CLASSIFICATION_TASKS = ["SST5", "TREC"] CLASSIFICATION_TASKS = ["MR", "CR", "SUBJ", "MPQA", "SST2", "SST5", "TREC"] SIMILARITY_TASKS = ['STS12', 'STS13', 'STS14'] ALL_TASKS = BINARY_CLASSIFICATION_TASKS + MULTICLASS_CLASSIFICATION_TASKS + SIMILARITY_TASKS PRODUCTION_CONFIG = { "usepytorch": True, "kfold": 5, "nhid": 0, "optim": "rmsprop", "batch_size": 128, "tenacity": 3, "epoch_size": 2 } def glove(output_dir, dims, senteval_config): summary_file_name = f"{output_dir}/glove_{dims}.json" WE = WordEmbeddings(vector_file=f"embeds/glove.6B.{dims}d.txt") # Default Glove WE.evaluate(tasks=CLASSIFICATION_TASKS, save_summary=True, summary_file_name=summary_file_name, overwrite_file=True,