def evaluate(embeddings):
    X, y = fetch_dataset_WS353(WS353)
    print(embeddings.shape, X.shape, y.shape)
    print("Spearman correlation of scores on {} {}".format(
        'WS353', evaluate_similarity(embeddings, X, y)))
    X, y = fetch_dataset_MEN(MEN)
    print("Spearman correlation of scores on {} {}".format(
        'MEN', evaluate_similarity(embeddings, X, y)))
    X, y = fetch_dataset_SIM999(SIM999)
    print("Spearman correlation of scores on {} {}".format(
        'SIM999', evaluate_similarity(embeddings, X, y)))
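All of these examples lean on evaluate_similarity from the web (word-embeddings-benchmarks) package. Roughly, it scores an embedding by taking the cosine similarity of each word pair and correlating those similarities with the human judgements using Spearman's rho. A minimal sketch of that idea (an illustration only, not the library's implementation, which additionally handles missing words and, in the forked versions used further below, extra keyword arguments):

import numpy as np
from scipy.stats import spearmanr

def similarity_score(word_vectors, pairs, gold_scores):
    # word_vectors: dict mapping word -> 1-D numpy array
    # pairs: iterable of (w1, w2); gold_scores: human similarity ratings
    cosines = []
    for w1, w2 in pairs:
        v1, v2 = word_vectors[w1], word_vectors[w2]
        cosines.append(v1.dot(v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
    return spearmanr(cosines, gold_scores).correlation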
Example 2
def test_similarity():
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_SimLex999()

    result_1 = evaluate_similarity(w, data.X, data.y)
    result_2 = evaluate_similarity(dict(zip(w.vocabulary.words, w.vectors)), data.X, data.y)

    assert result_2 > 0
    assert result_1 == result_2, "evaluate_similarity should return same result for dict and Embedding instance"
Example 3
def test_similarity_norm():
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    w_norm = w.normalize_words()
    data = fetch_SimLex999()

    result_1 = evaluate_similarity(w, data.X, data.y)
    result_2 = evaluate_similarity(w_norm, data.X, data.y)

    assert result_2 > 0
    assert result_1 == result_2, "evaluate_similarity should return same result for normalized and unnormalized words"
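The equality asserted above holds because cosine similarity is unchanged when each vector is rescaled individually, so per-word L2 normalization (normalize_words) cannot change the ranking of pairs or the Spearman correlation. A tiny self-contained check of that property:

import numpy as np

a, b = np.array([1.0, 2.0, 3.0]), np.array([2.0, 0.5, 1.0])
cos = lambda x, y: x.dot(y) / (np.linalg.norm(x) * np.linalg.norm(y))
# Normalizing each vector to unit length leaves the cosine untouched.
assert np.isclose(cos(a, b), cos(a / np.linalg.norm(a), b / np.linalg.norm(b)))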
Example 5
def evaluate_simi(wv, w2i, vocab):
    # Map each vocabulary word to its row in the embedding matrix.
    wv_dict = {w: wv[w2i[w], :] for w in vocab}
    w = Embedding.from_dict(wv_dict)

    # Calculate results on similarity
    print("Calculating similarity benchmarks")
    similarity_tasks = {
        "WS353": fetch_WS353(),
        "RG65": fetch_RG65(),
        #         "WS353R": fetch_WS353(which="relatedness"),
        #         "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "MTurk": fetch_MTurk(),
        "RW": fetch_RW(),
        "MEN": fetch_MEN(),
    }

    #     similarity_results = {}

    for name, data in iteritems(similarity_tasks):
        print(
            "Sample data from {}, num of samples: {} : pair \"{}\" and \"{}\" is assigned score {}"
            .format(name, len(data.X), data.X[0][0], data.X[0][1], data.y[0]))
        score = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, score))
Example 6
    def _evaluate_structed(self, data: Bunch, struct_info: str,
                           tokenize_oov_with_deepcut: bool,
                           filter_not_found: bool):
        results = []
        # for coef in np.arange(0.00, 1.05, 0.05):
        for coef in np.arange(0.00, 1.1, 0.1):
            if struct_info == self.CONCEPTNET and self.numberbatch is None:
                self.numberbatch = ConceptNetNumberbatch()
            result = evaluate_similarity(
                self.w,
                data.X,
                data.y,
                tokenize_oov_words_with_deepcut=tokenize_oov_with_deepcut,
                filter_not_found=filter_not_found,
                include_structured_sources=struct_info,
                structed_sources_coef=coef,
                numberbatch=self.numberbatch)
            result['coef'] = coef
            try:
                result['hm'] = scipy.stats.hmean(
                    [result['spearmanr'], result['pearsonr']])
            except:
                result['hm'] = -999  ## undefined
            results.append(result)

        pprint(results)
        result = max(results, key=lambda x: x['hm'])
        hm = result['hm']
        print('BEST COEF: {}'.format(result['coef']))
        print('STRUCTED OOV : {}'.format(result['structed_oov_pairs']))
        return result, hm
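The try/except around scipy.stats.hmean above exists because the harmonic mean is only defined for positive values: a zero or negative Spearman/Pearson correlation (entirely possible for a weak embedding) makes hmean raise, and the code falls back to the -999 sentinel. A quick demonstration:

import scipy.stats

print(scipy.stats.hmean([0.6, 0.4]))   # 0.48 -- fine for positive correlations
try:
    scipy.stats.hmean([0.6, -0.1])     # a negative value raises ValueError
except ValueError as exc:
    print("hmean undefined:", exc)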
Example 7
def evaluate_w2v(data, current_model, similarity_pairs):

    general_similarity = evaluate_similarity(current_model, data.X, data.y)
    low_similarity = evaluate_similarity(
        current_model, np.asarray(similarity_pairs['low'][0]),
        np.asarray(similarity_pairs['low'][1]))
    mid_similarity = evaluate_similarity(
        current_model, np.asarray(similarity_pairs['mid'][0]),
        np.asarray(similarity_pairs['mid'][1]))
    high_similarity = evaluate_similarity(
        current_model, np.asarray(similarity_pairs['high'][0]),
        np.asarray(similarity_pairs['high'][1]))
    mixed_similarity = evaluate_similarity(
        current_model, np.asarray(similarity_pairs['mixed'][0]),
        np.asarray(similarity_pairs['mixed'][1]))
    return general_similarity, low_similarity, mid_similarity, high_similarity, mixed_similarity
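evaluate_w2v assumes similarity_pairs is a dict with 'low', 'mid', 'high' and 'mixed' buckets, each holding a pair of parallel lists: word pairs at index 0 and gold scores at index 1. A hypothetical way to build such buckets from SimLex-999 (the thresholds and the choice of dataset are assumptions, not taken from the original project):

from web.datasets.similarity import fetch_SimLex999

data = fetch_SimLex999()
similarity_pairs = {k: ([], []) for k in ("low", "mid", "high", "mixed")}
for (w1, w2), score in zip(data.X, data.y):
    bucket = "low" if score < 3 else "mid" if score < 7 else "high"
    for name in (bucket, "mixed"):
        similarity_pairs[name][0].append([w1, w2])
        similarity_pairs[name][1].append(score)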
Example 8
def web_tests(emb):
    """
    :param emb: dict of words and their corresponding embeddings
    :return: dict of word-embeddings-benchmarks tests and scores received
    """
    similarity_tasks = {
        'WS353': fetch_WS353(),
        'RG65': fetch_RG65(),
        'RW': fetch_RW(),
        'MTurk': fetch_MTurk(),
        'MEN': fetch_MEN(),
        'SimLex999': fetch_SimLex999()
    }

    web_emb = Embedding(Vocabulary(list(emb.keys())), list(emb.values()))
    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(web_emb, data.X, data.y)
        logging.info("Spearman correlation of scores on {} {}".format(
            name, similarity_results[name]))
    return similarity_results
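A hypothetical call, to show the expected input: any plain dict of word to numpy vector will do (the two-word toy dict below is only an illustration; a real vocabulary is needed for the scores to mean anything, since missing benchmark words fall back to the embedding's mean vector).

import numpy as np

emb = {"king": np.random.rand(300), "queen": np.random.rand(300)}  # toy embeddings
scores = web_tests(emb)
print(scores)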
Example 9
    def eval_emb_on_tasks(self, emb_dict, file=sys.stdout):
        results = {}
        print('*' * 30)
        for name, data in self.tasks.items():
            score = evaluate_similarity(emb_dict, data.X, data.y)
            print("Spearman correlation of scores on {} {}".format(name, score),
                  file=file)
            results[name] = score
        self.cur_score = sum(results.values())
        self.cur_results = results
        self.cur_emb = emb_dict
        return self.cur_score, self.cur_results
Example 10
    def _evaluate_unstructed(self, data: Bunch, struct_info: str,
                             tokenize_oov_with_deepcut: bool,
                             cut_letters_for_oov: bool,
                             filter_not_found: bool):
        result = evaluate_similarity(
            self.w,
            data.X,
            data.y,
            tokenize_oov_words_with_deepcut=tokenize_oov_with_deepcut,
            filter_not_found=filter_not_found,
            include_structured_sources=struct_info,
            cut_letters_for_oov=cut_letters_for_oov,
            structed_sources_coef=None)

        try:
            hm = scipy.stats.hmean([result['spearmanr'], result['pearsonr']])
        except:
            hm = -999  ## undefined

        return result, hm
Example 11
def call_module(g_filename):
  # Configure logging
  logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

  # Fetch GloVe embedding (warning: it might take few minutes)
  #w_glove = fetch_GloVe(corpus="wiki-6B", dim=300)
  kargs = {'vocab_size': 200000, 'dim': 400}
  fname = g_filename
  w_custom = load_embedding(fname, format="glove", normalize=True,
                            lower=True, clean_words=False, load_kwargs=kargs)
  # Define tasks
  tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
  }

  # Print sample data
  for name, data in iteritems(tasks):
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(name, data.X[0][0], data.X[0][1], data.y[0]))

  # Calculate results using helper function
  for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(w_custom, data.X, data.y)))
Example 12
def evaluateOnAll(w):
  similarity_tasks = {
      "MTurk": fetch_MTurk(),
      "MEN": fetch_MEN(),
      "WS353": fetch_WS353(),
      "RubensteinAndGoodenough": fetch_RG65(),
      "Rare Words": fetch_RW(),
      "SIMLEX999": fetch_SimLex999(),
      "TR9856": fetch_TR9856()
    }

  similarity_results = {}
  
  for name, data in iteritems(similarity_tasks):
    similarity_results[name] = evaluate_similarity(w, data.X, data.y)
    print("Spearman correlation of scores on {} {}".format(name, similarity_results[name]))
  
  # Calculate results on analogy
  print("Calculating analogy benchmarks")
  analogy_tasks = {
        "Google": fetch_google_analogy(),
        "MSR": fetch_msr_analogy()
  }
  analogy_results = {}
  for name, data in iteritems(analogy_tasks):
    analogy_results[name] = evaluate_analogy(w, data.X, data.y)
    print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))
  
  analogy_results["SemEval2012_2"] = calAnswersonSemEval(w)['all']
  print("Analogy prediction accuracy on {} {}".format("SemEval2012", analogy_results["SemEval2012_2"]))

  analogy = pd.DataFrame([analogy_results])
  sim = pd.DataFrame([similarity_results])
  results = sim.join(analogy)

  return results
Example 13
def eval_sim(model):
    d = {word: model.wv[word] for word in model.wv.vocab}
    data = fetch_WS353(which="similarity")
    return evaluate_similarity(d, data.X, data.y)
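A hypothetical call with a freshly trained gensim model (the two-sentence corpus is only an illustration). Note that model.wv.vocab is the gensim < 4.0 API; gensim 4.x renamed it to model.wv.key_to_index, so eval_sim as written targets the older versions.

from gensim.models import Word2Vec

sentences = [["the", "cat", "sat"], ["the", "dog", "ran"]]
model = Word2Vec(sentences, size=50, min_count=1)  # gensim 4.x uses vector_size=
print(eval_sim(model))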
Example 14
    "TR9856": fetch_TR9856()
}

# In[20]:

# Print sample data
for name, data in iteritems(tasks):
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".
          format(name, data.X[0][0], data.X[0][1], data.y[0]))

# In[31]:

# Calculate results using helper function - FastText
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(
        name, evaluate_similarity(FastText, data.X, data.y)))
#     print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(t_glove, data.X, data.y)))

# In[21]:

# Calculate results using helper function - wiki glove
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(
        name, evaluate_similarity(w_glove, data.X, data.y)))
#     print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(t_glove, data.X, data.y)))

# In[22]:

# Calculate results using helper function - twitter glove
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(
        name, evaluate_similarity(t_glove, data.X, data.y)))

Example 15
def calSimilarityResult(tasks, w_data):
  # Calculate results using helper function
  for name, data in iteritems(tasks):
    print ("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(w_data, data.X, data.y)))
Example 16
# Print sample data
for name, data in iteritems(tasks):
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(name, data.X[0][0], data.X[0][1],
                                                                                    data.y[0]))

# Calculate results using helper function for the various word similarity datasets
latex1, latex2 = '', ''
for name, data in iteritems(tasks):
    print("\n", "NEW TASK:", name)
    if INCLUDE_STRUCTED_INFO:
        results = []
        for coef in np.arange(0.00, 1.05, 0.05):
            result = evaluate_similarity(w, data.X, data.y,
                                         tokenize_oov_words_with_deepcut=TOKENIZE_OOV_WORDS_WITH_DEEPCUT,
                                         filter_not_found=FILTER_NOT_FOUND,
                                         include_structured_sources=INCLUDE_STRUCTED_INFO,
                                         structed_sources_coef=coef)
            result['coef'] = coef
            try:
                result['hm'] = scipy.stats.hmean([result['spearmanr'], result['pearsonr']])
            except:
                result['hm'] = -999  ## undefined
            results.append(result)

        pprint(results)
        result = max(results, key=lambda x: x['hm'])
        hm = result['hm']
        print('BEST COEF: {}'.format(result['coef']))
        print('WORDNET OOV : {}'.format(result['structed_oov_pairs']))
Example 17
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                    level=logging.DEBUG,
                    datefmt='%I:%M:%S')

# Fetch GloVe embedding (warning: it might take few minutes)
#w_glove = fetch_GloVe(corpus="wiki-6B", dim=300)
kargs = {'vocab_size': 200000, 'dim': 400}
fname = '/home/student/Desktop/paper_1/hadoop-1.2.1/1_sparse_matrix/' + str(
    sys.argv[2]) + '_' + str(sys.argv[1]) + '_' + 'embeddings'
w_custom = load_embedding(fname,
                          format="glove",
                          normalize=True,
                          lower=True,
                          clean_words=False,
                          load_kwargs=kargs)
# Define tasks
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
}

# Print sample data
for name, data in iteritems(tasks):
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".
          format(name, data.X[0][0], data.X[0][1], data.y[0]))

# Calculate results using helper function
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(
        name, evaluate_similarity(w_custom, data.X, data.y)))
Example 18
        similarity_results = {}
        similarity_tasks = {
            "RG65": fetch_RG65(),
            #"MEN": fetch_MEN(),    
            #"WS353": fetch_WS353(),
            #"WS353R": fetch_WS353(which="relatedness"),
            #"WS353S": fetch_WS353(which="similarity"),
            #"SimLex999": fetch_SimLex999(),
            #"MTurk": fetch_MTurk(),

            #"multilingual_SimLex999": fetch_multilingual_SimLex999(),
            #"RW": fetch_RW(),
        }
        
        for name, data in similarity_tasks.items():
            similarity_results[name] = evaluate_similarity(w_embedding, data.X, data.y)
            print ("Spearman correlation of scores on {} {}".format(name, similarity_results[name]))
            res[name] = similarity_results[name]

        
        
        print('{}'.format(' '.join(['-' for x in range(30)])))

        # ANALOGY
        analogy_tasks = {
            #Google": fetch_google_analogy(),
            #"MSR": fetch_msr_analogy()
        }
        analogy_results = {}
        for name, data in analogy_tasks.items():
            print(len(data.X))
Example 19
# Define tasks
tasks = {
    "MTurk": fetch_MTurk(),
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "RG65":fetch_RG65(),
    "RW":fetch_RW(),
    "SIMLEX999": fetch_SimLex999(),
    "TR9856":fetch_TR9856()
}

result =   np.zeros((7,7))
# Print sample data
#for name, data in iteritems(tasks):
#    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(name, data.X[0][0], data.X[0][1], data.y[0]))

# Calculate results using helper function
i = 0
for m_name, m_fun in iteritems(models):   
    j = 0
    try:
        model = eval(m_fun)
    except: 
        i += 1
        continue
    for name, data in iteritems(tasks):
        eval_result = evaluate_similarity(model, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, eval_result))
        result[i][j] = eval_result
        j += 1
Example 20
import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity

# Configure logging
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                    level=logging.DEBUG,
                    datefmt='%I:%M:%S')

# Fetch GloVe embedding (warning: it might take few minutes)
w_glove = fetch_GloVe(corpus="wiki-6B", dim=300)

# Define tasks
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
}

# Print sample data
for name, data in iteritems(tasks):
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".
          format(name, data.X[0][0], data.X[0][1], data.y[0]))

# Calculate results using helper function
for name, data in iteritems(tasks):
    print "Spearman correlation of scores on {} {}".format(
        name, evaluate_similarity(w_glove, data.X, data.y))
Example 21
pickle.dump(vectors_context_word, open("/content/drive/My Drive/CS565_Assignment_2/vectors_context_word_200_iterations", "wb"))
pickle.dump(biases_main_word, open("/content/drive/My Drive/CS565_Assignment_2/biases_main_word_200_iterations", "wb"))
pickle.dump(biases_context_word, open("/content/drive/My Drive/CS565_Assignment_2/biases_context_word_200_iterations", "wb"))

! rm -rf web
! cp -r '/content/drive/My Drive/CS565_Assignment_2/web/' .
# ! cp -r '/content/drive/My Drive/web/datasets/similarity.py' .

import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.evaluate import evaluate_similarity

tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
}

for name, data in iteritems(tasks):
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(name, data.X[0][0], data.X[0][1], data.y[0]))

for name, data in iteritems(tasks):
    print ("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(vectors_main_word, data.X, data.y)))

w_glove = fetch_GloVe(corpus="wiki-6B", dim=300)
for name, data in iteritems(tasks):
    print ("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(w_glove, data.X, data.y)))

Example 22
import argparse
import pickle

from web.datasets.similarity import fetch_WS353
from web.embeddings import load_embedding
from web.evaluate import evaluate_similarity

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--word-vectors')
    args = parser.parse_args()

    ws353 = fetch_WS353()

    embedding = load_embedding(args.word_vectors,
                               lower=True,
                               clean_words=True,
                               format='dict')
    print("Spearman's rank on WS353",
          evaluate_similarity(embedding, ws353.X, ws353.y))
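A hypothetical way to prepare the --word-vectors input: the 'dict' format of load_embedding appears to expect a pickled {word: numpy vector} mapping (an assumption worth verifying against the web package), so something like the following should produce a usable file. The file and script names are illustrative only.

import pickle
import numpy as np

vectors = {"cat": np.random.rand(100), "dog": np.random.rand(100)}  # toy vectors
with open("my_vectors.pkl", "wb") as f:
    pickle.dump(vectors, f)

# then:  python evaluate_ws353.py --word-vectors my_vectors.pkl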
    "TH-SimLex999": fetch_thai_simlex999(),
    "TWS65": fetch_TWS65()
}

# Print sample data
for name, data in iteritems(tasks):
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".
          format(name, data.X[0][0], data.X[0][1], data.y[0]))

# Calculate results using helper function for the various word similarity datasets
latex1, latex2 = '', ''
for name, data in iteritems(tasks):
    print("\n", "NEW TASK:", name)
    result = evaluate_similarity(
        w,
        data.X,
        data.y,
        tokenize_oov_words_with_deepcut=TOKENIZE_OOV_WORDS_WITH_DEEPCUT,
        filter_not_found=FILTER_NOT_FOUND)

    try:
        hm = scipy.stats.hmean([result['spearmanr'], result['pearsonr']])
    except:
        hm = -999  ## undefined
    perc_oov_words = 100 * (
        result['num_missing_words'] /
        (result['num_found_words'] + float(result['num_missing_words'])))

    print(
        'num_found_words and num_missing_words are just the plain counts in the datasets'
    )
    print(