# -*- coding: utf-8 -*-
import numpy as np
from six import iteritems

from web.datasets.analogy import (fetch_google_analogy, fetch_msr_analogy,
                                  fetch_semeval_2012_2, fetch_wordrep)
from web.datasets.utils import _fetch_file
from web.embedding import Embedding
from web.evaluate import evaluate_analogy


def test_analogy_solver():
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_google_analogy()
    ids = np.random.RandomState(777).choice(range(data.X.shape[0]), 1000, replace=False)
    X, y = data.X[ids], data.y[ids]
    category = data.category_high_level[ids]

    results = evaluate_analogy(w=w, X=X, y=y, category=category)
    assert results['accuracy']['all'] >= 0.65
    assert results['accuracy']['semantic'] >= 0.7
    assert results['accuracy']['syntactic'] >= 0.63

    results = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul")
    assert results['accuracy']['all'] >= 0.7
    assert results['accuracy']['semantic'] >= 0.75
    assert results['accuracy']['syntactic'] >= 0.64

    results_mul = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul", k=400)
    results_add = evaluate_analogy(w=w, X=X, y=y, category=category, method="add", k=400)
    assert results_mul['accuracy']['all'] >= results_add['accuracy']['all']
    assert results_mul['accuracy']['syntactic'] >= results_add['accuracy']['syntactic']
    assert results_mul['accuracy']['semantic'] >= results_add['accuracy']['semantic']
def test_analogy_fetchers():
    data = fetch_msr_analogy()
    assert len(set(data.category)) == 16

    data = fetch_google_analogy()
    assert len(set(data.category)) == 14
    assert len(set(data.category_high_level)) == 2

    data = fetch_semeval_2012_2()
    assert len(data.X) == len(data.y) == 79
    for k, val in iteritems(data.X_prot):
        assert len(val.shape) == 2, "Failed parsing prototypes for " + k

    data = fetch_wordrep(subsample=0.7)
    assert len(set(data.category)) == 25
    assert len(data.X[0]) == 2
    assert "all-capital-cities" in set(data.category)
    assert len(set(data.category_high_level)) == 2
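# A minimal usage sketch (not one of the original tests): the fetchers
# return sklearn-style Bunch objects, with the question words in `X`,
# the gold answers in `y`, and per-question category labels.
def example_inspect_google_analogy():
    data = fetch_google_analogy()
    # Each row of X holds (a, b, c) for the question "a is to b as c is to ?"
    print(data.X[0], "->", data.y[0])
    print("category: {}, high-level: {}".format(
        data.category[0], data.category_high_level[0]))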
def get_dataset(dataset_name):
    if dataset_name == "WS353":
        return fetch_WS353("similarity")
    elif dataset_name == "MEN":
        return fetch_MEN("all")
    elif dataset_name == "SimLex-999":
        return fetch_SimLex999()
    elif dataset_name == "MTurk":
        return fetch_MTurk()
    elif dataset_name == "RG65":
        return fetch_RG65()
    elif dataset_name == "RW":
        return fetch_RW()
    elif dataset_name == "TR9856":
        return fetch_TR9856()
    elif dataset_name == "MSR":
        return fetch_msr_analogy()
    elif dataset_name == "Google":
        return fetch_google_analogy()
    else:
        raise ValueError("{}: dataset not supported".format(dataset_name))
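# Usage sketch (hypothetical helper, not part of the original script):
# route a dataset name to the matching evaluation function. Assumes
# evaluate_similarity and evaluate_analogy from web.evaluate are imported
# alongside the fetchers used above.
def evaluate_on_dataset(w, dataset_name):
    data = get_dataset(dataset_name)
    if dataset_name in ("MSR", "Google"):
        # Analogy benchmarks are scored by prediction accuracy
        return evaluate_analogy(w, data.X, data.y)
    # Similarity benchmarks are scored by Spearman correlation
    return evaluate_similarity(w, data.X, data.y)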
def evaluateOnAll(w):
    # Calculate results on word similarity benchmarks
    similarity_tasks = {
        "MTurk": fetch_MTurk(),
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "RubensteinAndGoodenough": fetch_RG65(),
        "Rare Words": fetch_RW(),
        "SIMLEX999": fetch_SimLex999(),
        "TR9856": fetch_TR9856()
    }

    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, similarity_results[name]))

    # Calculate results on analogy benchmarks
    print("Calculating analogy benchmarks")
    analogy_tasks = {
        "Google": fetch_google_analogy(),
        "MSR": fetch_msr_analogy()
    }

    analogy_results = {}
    for name, data in iteritems(analogy_tasks):
        analogy_results[name] = evaluate_analogy(w, data.X, data.y)
        print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))

    analogy_results["SemEval2012_2"] = calAnswersonSemEval(w)['all']
    print("Analogy prediction accuracy on {} {}".format("SemEval2012", analogy_results["SemEval2012_2"]))

    analogy = pd.DataFrame([analogy_results])
    sim = pd.DataFrame([similarity_results])
    results = sim.join(analogy)
    return results
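# Usage sketch: run the full benchmark sweep on a pretrained embedding.
# Shown as a comment because fetch_SG_GoogleNews triggers a large download.
#
#     w = fetch_SG_GoogleNews(lower=True, clean_words=True)
#     results = evaluateOnAll(w)
#     print(results.T)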
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                    level=logging.DEBUG,
                    datefmt='%I:%M:%S')

# Fetch skip-gram trained on GoogleNews corpus and clean it slightly
# w = fetch_SG_GoogleNews(lower=True, clean_words=True)

kargs = {'vocab_size': 200000, 'dim': 400}
fname = '/home/student/Desktop/paper_1/hadoop-1.2.1/1_sparse_matrix/pmi_tfidf_span_embeddings'
w = load_embedding(fname, format="glove", normalize=True, lower=True,
                   clean_words=False, load_kwargs=kargs)

# Fetch analogy dataset
data = fetch_google_analogy()

for cat in set(data.category):
    print(cat)

# Pick a sample of data and calculate answers
'''subset = [50, 1000, 4000, 10000, 14000]
for id in subset:
    w1, w2, w3 = data.X[id][0], data.X[id][1], data.X[id][2]
    print("Question: {} is to {} as {} is to ?".format(w1, w2, w3))
    print("Answer: " + data.y[id])
    print("Predicted: " + " ".join(w.nearest_neighbors(w[w2] - w[w1] + w[w3], exclude=[w1, w2, w3])))'''

# Answer every question by vector offset and measure accuracy
score = 0.0
total = 0.0
for i, d in enumerate(data.X):
    try:
        w1, w2, w3 = d[0], d[1], d[2]
        predicted = w.nearest_neighbors(w[w2] - w[w1] + w[w3],
                                        exclude=[w1, w2, w3])
        # Count the question as correct if the gold answer is the top prediction
        if data.y[i] in predicted:
            score += 1.0
        total += 1.0
    except KeyError:
        # Skip questions containing out-of-vocabulary words
        continue

print("Analogy accuracy: {}".format(score / total))
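# The manual loop above mirrors what the library's evaluator does; a sketch
# of the equivalent call (evaluate_analogy handles out-of-vocabulary words
# internally instead of skipping those questions):
#
#     from web.evaluate import evaluate_analogy
#     results = evaluate_analogy(w, data.X, data.y,
#                                category=data.category_high_level)
#     print(results['accuracy']['all'])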
# -*- coding: utf-8 -*-
"""
Simple example showing answering analogy questions
"""
import logging

from web.datasets.analogy import fetch_google_analogy
from web.embeddings import fetch_SG_GoogleNews

# Configure logging
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                    level=logging.DEBUG,
                    datefmt='%I:%M:%S')

# Fetch skip-gram trained on GoogleNews corpus and clean it slightly
w = fetch_SG_GoogleNews(lower=True, clean_words=True)

# Fetch analogy dataset
data = fetch_google_analogy()

for cat in set(data.category):
    print(cat)

# Pick a sample of data and calculate answers
subset = [50, 1000, 4000, 10000, 14000]
for id in subset:
    w1, w2, w3 = data.X[id][0], data.X[id][1], data.X[id][2]
    print("Question: {} is to {} as {} is to ?".format(w1, w2, w3))
    print("Answer: " + data.y[id])
    print("Predicted: " + " ".join(
        w.nearest_neighbors(w[w2] - w[w1] + w[w3], exclude=[w1, w2, w3])))
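# The vector-offset prediction above corresponds to evaluate_analogy's
# additive objective (method="add"); the multiplicative 3CosMul objective
# (method="mul") often scores slightly higher, as the thresholds in
# test_analogy_solver reflect. A sketch of scoring the dataset both ways:
#
#     from web.evaluate import evaluate_analogy
#     for method in ("add", "mul"):
#         res = evaluate_analogy(w, data.X, data.y, method=method,
#                                category=data.category_high_level)
#         print(method, res['accuracy']['all'])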
print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name])) analogy_results["SemEval2012_2"] = calAnswersonSemEval(w)['all'] print("Analogy prediction accuracy on {} {}".format("SemEval2012", analogy_results["SemEval2012_2"])) analogy = pd.DataFrame([analogy_results]) sim = pd.DataFrame([similarity_results]) results = sim.join(analogy) return results """#### Fetching benchmark datasets""" # Fetch analogy dataset data_wordrep = fetch_wordrep() data_google = fetch_google_analogy() data_msr = fetch_msr_analogy() data_semeval = fetch_semeval_2012_2() """##### Print categories from benchmark datasets""" printCategoriesForData(data_wordrep, True) printCategoriesForData(data_google, True) printCategoriesForData(data_msr, True) printCategoriesForData(data_semeval, False) """##### WE:1 Analysis on Glove dataset, Wiki corpus"""