Beispiel #1
0
    def test_dsd(self):
        """Check the size, column layout and vocabulary size of the DSD dataset.

        Loads the dataset through ``DSD.load_with_pandas`` and verifies that it
        has 99 rows, the three expected columns, and that ``DSD.words`` reports
        197 entries.
        """
        expected_columns = ['word1', 'word2', 'similarity']

        dataset = DSD()
        frame = dataset.load_with_pandas()

        # Row count first, then schema, then the vocabulary size.
        self.assertEqual(len(frame), 99)
        self.assertListEqual(list(frame.columns), expected_columns)

        vocabulary = dataset.words()
        self.assertEqual(len(vocabulary), 197)
from danlp.datasets import WordSim353Da, DSD
from danlp.models.embeddings import AVAILABLE_EMBEDDINGS, load_wv_with_gensim
import tabulate


def load_wv_models():
    """Lazily yield ``(name, word_vectors)`` for every available embedding.

    Iterates over ``AVAILABLE_EMBEDDINGS`` and loads each model with
    ``load_wv_with_gensim`` only when the consumer advances the generator
    to that entry.
    """
    yield from (
        (embedding_name, load_wv_with_gensim(embedding_name))
        for embedding_name in AVAILABLE_EMBEDDINGS
    )


# Word-similarity datasets used to evaluate each embedding model below.
ws353 = WordSim353Da()
dsd = DSD()

# Result accumulator; not appended to in the visible part of the script —
# presumably filled further down (TODO confirm).
data = []

# Evaluate every available embedding model on both datasets.
for model_name, wv in load_wv_models():

    # Report DSD words (lower-cased) that the model's vocabulary lacks.
    # NOTE(review): `wv.vocab` is the gensim < 4 KeyedVectors attribute;
    # gensim 4 replaced it with `key_to_index` — confirm the pinned version.
    print("DSD words not in vocab of {}: {}".format(
        model_name, [w for w in dsd.words() if w.lower() not in wv.vocab]))

    # The DSD file is parsed with a tab delimiter.
    correlation_on_dsd = wv.evaluate_word_pairs(dsd.file_path, delimiter="\t")
    # Element 1 is read as the Spearman result and element 2 as the
    # out-of-vocabulary figure — matches gensim's documented return order
    # (pearson, spearman, oov_ratio); verify against the installed gensim.
    spearman_rho_dsd = correlation_on_dsd[1].correlation
    oov_dsd = correlation_on_dsd[2]

    # Same procedure for WordSim-353 (Danish), whose file is comma-delimited.
    print("WS353 words not in vocab of {}: {}".format(
        model_name, [w for w in ws353.words() if w.lower() not in wv.vocab]))
    correlation_on_ws353 = wv.evaluate_word_pairs(ws353.file_path,
                                                  delimiter=',')
    spearman_rho_ws353 = correlation_on_ws353[1].correlation
    oov_ws353 = correlation_on_ws353[2]