Example #1
def test_analogy_solver():
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")

    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_google_analogy()
    ids = np.random.RandomState(777).choice(range(data.X.shape[0]), 1000, replace=False)
    X, y = data.X[ids], data.y[ids]
    category = data.category_high_level[ids]

    results = evaluate_analogy(w=w, X=X, y=y, category=category)
    assert results['accuracy']['all'] >= 0.65
    assert results['accuracy']['semantic'] >= 0.7
    assert results['accuracy']['syntactic'] >= 0.63

    results = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul")
    assert results['accuracy']['all'] >= 0.7
    assert results['accuracy']['semantic'] >= 0.75
    assert results['accuracy']['syntactic'] >= 0.64

    results_mul = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul", k=400)
    results_add = evaluate_analogy(w=w, X=X, y=y, category=category, method="add", k=400)
    assert results_mul['accuracy']['all'] >= results_add['accuracy']['all']
    assert results_mul['accuracy']['syntactic'] >= results_add['accuracy']['syntactic']
    assert results_mul['accuracy']['semantic'] >= results_add['accuracy']['semantic']
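
For reference, a brief illustrative sketch (not part of the original test) of the two scoring rules that method="add" and method="mul" roughly correspond to, i.e. 3CosAdd and 3CosMul in the sense of Levy and Goldberg (2014). The function below is hypothetical and assumes unit-normalized word vectors; the web package's actual implementation may differ in details.

import numpy as np

def solve_analogy_sketch(W, a, b, c, method="add", eps=1e-3):
    # W: (vocab_size, dim) matrix of unit-normalized word vectors;
    # a, b, c: row indices of the cue words in "a is to b as c is to ?".
    cos = W.dot(np.vstack([W[a], W[b], W[c]]).T)  # cosine similarity to a, b, c
    if method == "add":
        # 3CosAdd: argmax_d cos(d, b) - cos(d, a) + cos(d, c)
        scores = cos[:, 1] - cos[:, 0] + cos[:, 2]
    else:
        # 3CosMul: argmax_d cos(d, b) * cos(d, c) / (cos(d, a) + eps),
        # with cosines shifted into [0, 1] to keep them positive.
        pos = (cos + 1.0) / 2.0
        scores = pos[:, 1] * pos[:, 2] / (pos[:, 0] + eps)
    scores[[a, b, c]] = -np.inf  # never return one of the cue words
    return int(np.argmax(scores))
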
Example #2
def test_analogy_fetchers():
    data = fetch_msr_analogy()
    assert len(set(data.category)) == 16

    data = fetch_google_analogy()
    assert len(set(data.category)) == 14
    assert len(set(data.category_high_level)) == 2

    data = fetch_semeval_2012_2()
    assert len(data.X) == len(data.y) == 79
    for k, val in iteritems(data.X_prot):
        assert len(val.shape) == 2, "Failed parsing prototypes for " + k

    data = fetch_wordrep(subsample=0.7)
    assert len(set(data.category)) == 25
    assert len(data.X[0]) == 2
    assert "all-capital-cities" in set(data.category)
    assert len(set(data.category_high_level)) == 2
Example #3
def test_analogy_solver():
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")

    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_google_analogy()
    ids = np.random.RandomState(777).choice(range(data.X.shape[0]),
                                            1000,
                                            replace=False)
    X, y = data.X[ids], data.y[ids]
    category = data.category_high_level[ids]

    results = evaluate_analogy(w=w, X=X, y=y, category=category)
    assert results['accuracy']['all'] >= 0.65
    assert results['accuracy']['semantic'] >= 0.7
    assert results['accuracy']['syntactic'] >= 0.63

    results = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul")
    assert results['accuracy']['all'] >= 0.7
    assert results['accuracy']['semantic'] >= 0.75
    assert results['accuracy']['syntactic'] >= 0.64

    results_mul = evaluate_analogy(w=w,
                                   X=X,
                                   y=y,
                                   category=category,
                                   method="mul",
                                   k=400)
    results_add = evaluate_analogy(w=w,
                                   X=X,
                                   y=y,
                                   category=category,
                                   method="add",
                                   k=400)
    assert results_mul['accuracy']['all'] >= results_add['accuracy']['all']
    assert results_mul['accuracy']['syntactic'] >= results_add['accuracy']['syntactic']
    assert results_mul['accuracy']['semantic'] >= results_add['accuracy']['semantic']
def get_dataset(dataset_name):
    if dataset_name == "WS353":
        return fetch_WS353("similarity")
    elif dataset_name == "MEN":
        return fetch_MEN("all")
    elif dataset_name == "SimLex-999":
        return fetch_SimLex999()
    elif dataset_name == "MTurk":
        return fetch_MTurk()
    elif dataset_name == "WS353":
        return fetch_WS353('all')
    elif dataset_name == "RG65":
        return fetch_RG65()
    elif dataset_name == "RW":
        return fetch_RW()
    elif dataset_name == "TR9856":
        return fetch_TR9856()
    elif dataset_name == "MSR":
        return fetch_msr_analogy()
    elif dataset_name == "Google":
        return fetch_google_analogy()
    else:
        raise Exception("{}: dataset not supported".format(dataset_name))
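
A minimal usage sketch for get_dataset (assuming an Embedding w has already been loaded, as in the snippets further below, and that evaluate_similarity is imported from web.evaluate):

# Hedged example: score one similarity benchmark fetched via get_dataset.
data = get_dataset("MEN")
spearman = evaluate_similarity(w, data.X, data.y)
print("Spearman correlation on MEN: {}".format(spearman))
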
def evaluateOnAll(w):
  similarity_tasks = {
      "MTurk": fetch_MTurk(),
      "MEN": fetch_MEN(),
      "WS353": fetch_WS353(),
      "RubensteinAndGoodenough": fetch_RG65(),
      "Rare Words": fetch_RW(),
      "SIMLEX999": fetch_SimLex999(),
      "TR9856": fetch_TR9856()
    }

  similarity_results = {}
  
  for name, data in iteritems(similarity_tasks):
    similarity_results[name] = evaluate_similarity(w, data.X, data.y)
    print("Spearman correlation of scores on {} {}".format(name, similarity_results[name]))
  
  # Calculate results on analogy
  print("Calculating analogy benchmarks")
  analogy_tasks = {
        "Google": fetch_google_analogy(),
        "MSR": fetch_msr_analogy()
  }
  analogy_results = {}
  for name, data in iteritems(analogy_tasks):
    analogy_results[name] = evaluate_analogy(w, data.X, data.y)
    print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))
  
  analogy_results["SemEval2012_2"] = calAnswersonSemEval(w)['all']
  print("Analogy prediction accuracy on {} {}".format("SemEval2012", analogy_results["SemEval2012_2"]))

  analogy = pd.DataFrame([analogy_results])
  sim = pd.DataFrame([similarity_results])
  results = sim.join(analogy)

  return results
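
A short usage sketch for evaluateOnAll; the vector file path and dimensions below are placeholders, not values from the original code, and load_embedding is called the same way as in the script that follows.

from web.embeddings import load_embedding

w = load_embedding("path/to/vectors.txt", format="glove",
                   normalize=True, lower=True, clean_words=False,
                   load_kwargs={'vocab_size': 200000, 'dim': 400})
print(evaluateOnAll(w))
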
import logging

from web.datasets.analogy import fetch_google_analogy
from web.embeddings import load_embedding

logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                    level=logging.DEBUG,
                    datefmt='%I:%M:%S')

# Fetch skip-gram trained on GoogleNews corpus and clean it slightly
#w = fetch_SG_GoogleNews(lower=True, clean_words=True)
kargs = {'vocab_size': 200000, 'dim': 400}
fname = '/home/student/Desktop/paper_1/hadoop-1.2.1/1_sparse_matrix/pmi_tfidf_span_embeddings'
w = load_embedding(fname,
                   format="glove",
                   normalize=True,
                   lower=True,
                   clean_words=False,
                   load_kwargs=kargs)
# Fetch analogy dataset
data = fetch_google_analogy()

for cat in (set(data.category)):
    print(cat)

# Pick a sample of data and calculate answers
'''subset = [50, 1000, 4000, 10000, 14000]
for id in subset:
    w1, w2, w3 = data.X[id][0], data.X[id][1], data.X[id][2]
    print("Question: {} is to {} as {} is to ?".format(w1, w2, w3))
    print("Answer: " + data.y[id])
    print("Predicted: " + " ".join(w.nearest_neighbors(w[w2] - w[w1] + w[w3], exclude=[w1, w2, w3])))'''
score = 0.0
total = 0.0
for i, d in enumerate(data.X):
    try:
        # The original snippet is truncated here; a plausible completion that
        # scores vector-offset predictions against the gold answers (assumed).
        pred = w.nearest_neighbors(w[d[1]] - w[d[0]] + w[d[2]], exclude=list(d))[0]
        score += float(pred == data.y[i])
        total += 1.0
    except KeyError:
        # Skip questions containing out-of-vocabulary words.
        continue
print("Accuracy: {}".format(score / total if total else 0.0))

# -*- coding: utf-8 -*-

"""
 Simple example showing answering analogy questions
"""
import logging
from web.datasets.analogy import fetch_google_analogy
from web.embeddings import fetch_SG_GoogleNews

# Configure logging
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

# Fetch skip-gram trained on GoogleNews corpus and clean it slightly
w = fetch_SG_GoogleNews(lower=True, clean_words=True)

# Fetch analogy dataset
data = fetch_google_analogy()

for cat in (set(data.category)):
    print(cat)

# Pick a sample of data and calculate answers
subset = [50, 1000, 4000, 10000, 14000]
for id in subset:
    w1, w2, w3 = data.X[id][0], data.X[id][1], data.X[id][2]
    print("Question: {} is to {} as {} is to ?".format(w1, w2, w3))
    print("Answer: " + data.y[id])
    print("Predicted: " + " ".join(w.nearest_neighbors(w[w2] - w[w1] + w[w3], exclude=[w1, w2, w3])))

    print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))
  
  analogy_results["SemEval2012_2"] = calAnswersonSemEval(w)['all']
  print("Analogy prediction accuracy on {} {}".format("SemEval2012", analogy_results["SemEval2012_2"]))

  analogy = pd.DataFrame([analogy_results])
  sim = pd.DataFrame([similarity_results])
  results = sim.join(analogy)

  return results

"""#### Fetching benchmark datasets"""

# Fetch analogy dataset
data_wordrep = fetch_wordrep()
data_google = fetch_google_analogy()
data_msr = fetch_msr_analogy()
data_semeval = fetch_semeval_2012_2()

"""##### Print categories from benchmark datasets"""

printCategoriesForData(data_wordrep, True)

printCategoriesForData(data_google, True)

printCategoriesForData(data_msr, True)

printCategoriesForData(data_semeval, False)
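
printCategoriesForData is not defined in this excerpt; a minimal sketch of what such a helper might look like, assuming the second argument controls whether high-level categories are also printed (the name is reused from the calls above, the implementation is an assumption):

def printCategoriesForData(data, print_high_level):
    # Hypothetical helper: print whatever category labels the benchmark exposes.
    if hasattr(data, "category"):
        print("Categories: {}".format(sorted(set(data.category))))
    else:
        # SemEval 2012 Task 2 names its categories differently (attribute assumed).
        print("Categories: {}".format(sorted(data.categories_names)))
    if print_high_level and hasattr(data, "category_high_level"):
        print("High-level categories: {}".format(sorted(set(data.category_high_level))))
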

"""##### WE:1 Analysis on Glove dataset, Wiki corpus"""