def test_dsd(self):
    """Check the DSD dataset's row count, column names and word count."""
    dataset = DSD()
    frame = dataset.load_with_pandas()

    # The loaded frame must hold 99 rows with exactly these columns, in order.
    expected_columns = ['word1', 'word2', 'similarity']
    self.assertEqual(len(frame), 99)
    self.assertListEqual(list(frame.columns), expected_columns)

    # words() must report 197 entries.
    self.assertEqual(len(dataset.words()), 197)
from danlp.datasets import WordSim353Da, DSD
from danlp.models.embeddings import AVAILABLE_EMBEDDINGS, load_wv_with_gensim
import tabulate


def load_wv_models():
    """Yield a ``(name, vectors)`` pair for each entry of AVAILABLE_EMBEDDINGS.

    This is a generator: each embedding is loaded with
    ``load_wv_with_gensim`` only when the consumer asks for the next item.
    """
    for embedding_name in AVAILABLE_EMBEDDINGS:
        yield embedding_name, load_wv_with_gensim(embedding_name)


# The two similarity datasets every embedding is evaluated against.
ws353 = WordSim353Da()
dsd = DSD()

data = []

for model_name, wv in load_wv_models():
    # --- DSD -------------------------------------------------------------
    # Report dataset words whose lower-cased form is absent from the
    # model's vocabulary.
    missing_dsd = [w for w in dsd.words() if w.lower() not in wv.vocab]
    print("DSD words not in vocab of {}: {}".format(model_name, missing_dsd))

    # The DSD file is read as tab-separated. Element 1 of the result exposes
    # ``.correlation`` (kept as the Spearman rho); element 2 is kept as the
    # out-of-vocabulary figure.
    correlation_on_dsd = wv.evaluate_word_pairs(dsd.file_path, delimiter="\t")
    spearman_rho_dsd = correlation_on_dsd[1].correlation
    oov_dsd = correlation_on_dsd[2]

    # --- WordSim-353 -----------------------------------------------------
    # Same procedure as above; this file is comma-separated.
    missing_ws353 = [w for w in ws353.words() if w.lower() not in wv.vocab]
    print("WS353 words not in vocab of {}: {}".format(
        model_name, missing_ws353))

    correlation_on_ws353 = wv.evaluate_word_pairs(ws353.file_path,
                                                  delimiter=',')
    spearman_rho_ws353 = correlation_on_ws353[1].correlation
    oov_ws353 = correlation_on_ws353[2]