Beispiel #1
0
def evaluate(eval_data_dir, embeddings_filename):
    test_treebank_filename = '{}/{}'.format(eval_data_dir,
                                            get_test_treebank_filename())
    train_treebank_filename = '{}/{}'.format(eval_data_dir,
                                             get_train_treebank_filename())
    test_arcstd_filename = '{}/{}'.format(eval_data_dir,
                                          get_test_arcstd_filename())
    train_arcstd_filename = '{}/{}'.format(eval_data_dir,
                                           get_train_arcstd_filename())
    relevant_embeddings_filename = get_relevant_embeddings_filename(
        test_treebank_filename, train_treebank_filename, embeddings_filename)
    word_vecs = read_word_vectors(relevant_embeddings_filename)
    assert len(word_vecs) > 0
    embeddings_dimensionality = len(word_vecs.itervalues().next())
    coverage = compute_coverage(test_treebank_filename, word_vecs)
    if coverage == 0:
        print 'coverage = 0!!'
        return (0.0, 0.0)
    score = parsing_wrapper(train_arcstd_filename, test_arcstd_filename,
                            relevant_embeddings_filename,
                            embeddings_dimensionality)
    os.remove(relevant_embeddings_filename)
    return (
        score,
        coverage,
    )
Beispiel #2
0
def all_word_sim(word_vec_file, word_sim_dir):

    word_vecs = read_word_vectors(word_vec_file)
    print '================================================================================='
    print "%6s" % "Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho"
    print '================================================================================='

    total_rho = 0
    for i, filename in enumerate(os.listdir(word_sim_dir)):
        manual_dict, auto_dict = ({}, {})
        not_found, total_size = (0, 0)
        for line in open(os.path.join(word_sim_dir, filename), 'r'):
            line = line.strip().lower()
            word1, word2, val = line.split()
            if word1 in word_vecs and word2 in word_vecs:
                manual_dict[(word1, word2)] = float(val)
                auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1],
                                                       word_vecs[word2])
            else:
                not_found += 1
            total_size += 1
        rho = spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto_dict))
        total_rho += rho
        print "%6s" % str(i + 1), "%20s" % filename, "%15s" % str(total_size),
        print "%15s" % str(not_found),
        print "%15.4f" % rho
    print "Sum of scores: %15.4f" % total_rho
def evaluate(eval_data_dir, embeddings_filename):
  """Run the QVEC-CCA intrinsic evaluation for the given embeddings.

  Returns a (score, coverage) tuple.
  """
  gold_path = '{}/{}'.format(eval_data_dir, get_qvec_gold_filename())
  trimmed_embeddings = get_relevant_embeddings_filename(gold_path,
                                                        embeddings_filename)
  vectors = read_word_vectors(trimmed_embeddings)
  vocab_coverage = compute_coverage(gold_path, vectors)
  cca_score = qvec_cca_wrapper(gold_path, trimmed_embeddings)
  os.remove(trimmed_embeddings)
  return (cca_score, vocab_coverage)
def evaluate(eval_data_dir, embeddings_filename):
  """Word-translation evaluation: precision@1 over gold translation pairs.

  Returns a (score, coverage) tuple.
  """
  gold_path = '{}/{}'.format(eval_data_dir,
                             get_word_translation_gold_filename())
  word_types, word_pairs = get_relevant_word_types(gold_path)
  trimmed_embeddings = get_relevant_embeddings_filename(word_types,
                                                        embeddings_filename)
  vectors = read_word_vectors(trimmed_embeddings)
  vocab_coverage = compute_coverage(gold_path, vectors)
  precision = compute_precision_at_k(word_pairs, vectors, 1)
  os.remove(trimmed_embeddings)
  return (precision, vocab_coverage)
Beispiel #5
0
def evaluate(eval_data_dir, embeddings_filename):
  """QVEC evaluation, normalised by the embedding dimensionality.

  Returns a (score / dimensionality, coverage) tuple.
  """
  gold_path = '{}/{}'.format(eval_data_dir, get_qvec_gold_filename())
  trimmed_embeddings = get_relevant_embeddings_filename(gold_path,
                                                        embeddings_filename)
  vectors = read_word_vectors(trimmed_embeddings)
  vocab_coverage = compute_coverage(gold_path, vectors)
  raw_score = qvec_wrapper(gold_path, trimmed_embeddings)
  os.remove(trimmed_embeddings)
  # Length of an arbitrary vector (the vectors stay in memory after the
  # temporary file is removed).
  dimensionality = len(vectors[next(iter(vectors))])
  return (raw_score / dimensionality, vocab_coverage)
def evaluate(eval_data_dir, embeddings_filename):
    """QVEC evaluation normalised by embedding size; returns (score, coverage)."""
    qvec_gold = "{}/{}".format(eval_data_dir, get_qvec_gold_filename())
    pruned_vectors_file = get_relevant_embeddings_filename(qvec_gold, embeddings_filename)
    vecs = read_word_vectors(pruned_vectors_file)
    vocab_coverage = compute_coverage(qvec_gold, vecs)
    qvec_score = qvec_wrapper(qvec_gold, pruned_vectors_file)
    os.remove(pruned_vectors_file)
    # Every vector has the same length; sample one to get it.
    dim = len(vecs[next(iter(vecs))])
    return (qvec_score / dim, vocab_coverage)
def evaluate(eval_data_dir, embeddings_filename):
  eval_data_filename = '{}/{}'.format(eval_data_dir, get_wordsim_gold_filename()) 
  relevant_embeddings_filename = get_relevant_embeddings_filename(eval_data_filename, embeddings_filename)
  word_vecs = read_word_vectors(relevant_embeddings_filename)
  manual_dict, auto_dict, coverage = compute_similarities_and_coverage(eval_data_filename, word_vecs)
  print 'size of manual/auto dicts: ', len(manual_dict), len(auto_dict)
  if coverage == 0 or min(len(manual_dict), len(auto_dict)) < 2: return (0.0, 0.0)
  ranked_manual_dict, ranked_auto_dict = assign_ranks(manual_dict), assign_ranks(auto_dict)
  score = spearmans_rho(ranked_manual_dict, ranked_auto_dict)
  os.remove(relevant_embeddings_filename)
  return (score, coverage,)
def evaluate(eval_data_dir, embeddings_filename):
  test_treebank_filename = '{}/{}'.format(eval_data_dir, get_test_treebank_filename())
  train_treebank_filename = '{}/{}'.format(eval_data_dir, get_train_treebank_filename())
  test_arcstd_filename = '{}/{}'.format(eval_data_dir, get_test_arcstd_filename())
  train_arcstd_filename = '{}/{}'.format(eval_data_dir, get_train_arcstd_filename())
  relevant_embeddings_filename = get_relevant_embeddings_filename(test_treebank_filename, train_treebank_filename, embeddings_filename)
  word_vecs = read_word_vectors(relevant_embeddings_filename)
  assert len(word_vecs) > 0; embeddings_dimensionality = len(word_vecs.itervalues().next())
  coverage = compute_coverage(test_treebank_filename, word_vecs)
  if coverage == 0: 
    print 'coverage = 0!!'
    return (0.0, 0.0)
  score = parsing_wrapper(train_arcstd_filename, test_arcstd_filename, relevant_embeddings_filename, embeddings_dimensionality)
  os.remove(relevant_embeddings_filename)
  return (score, coverage,)
Beispiel #9
0
def evaluate(eval_data_dir, embeddings_filename):
    """Word-translation evaluation: precision@1 on the gold translation pairs.

    Returns a (score, coverage) tuple.
    """
    gold_file = '{}/{}'.format(eval_data_dir,
                               get_word_translation_gold_filename())
    types, pairs = get_relevant_word_types(gold_file)
    pruned_embeddings = get_relevant_embeddings_filename(
        types, embeddings_filename)
    vecs = read_word_vectors(pruned_embeddings)
    vocab_coverage = compute_coverage(gold_file, vecs)
    p_at_1 = compute_precision_at_k(pairs, vecs, 1)
    os.remove(pruned_embeddings)
    return (p_at_1, vocab_coverage)
Beispiel #10
0
def evaluate(eval_data_dir, embeddings_filename):
    eval_data_filename = '{}/{}'.format(eval_data_dir,
                                        get_wordsim_gold_filename())
    relevant_embeddings_filename = get_relevant_embeddings_filename(
        eval_data_filename, embeddings_filename)
    word_vecs = read_word_vectors(relevant_embeddings_filename)
    manual_dict, auto_dict, coverage = compute_similarities_and_coverage(
        eval_data_filename, word_vecs)
    print 'size of manual/auto dicts: ', len(manual_dict), len(auto_dict)
    if coverage == 0 or min(len(manual_dict), len(auto_dict)) < 2:
        return (0.0, 0.0)
    ranked_manual_dict, ranked_auto_dict = assign_ranks(
        manual_dict), assign_ranks(auto_dict)
    score = spearmans_rho(ranked_manual_dict, ranked_auto_dict)
    os.remove(relevant_embeddings_filename)
    return (
        score,
        coverage,
    )
Beispiel #11
0
def word_sim(word_vec_file, word_sim_file):

    word_vecs = read_word_vectors(word_vec_file)
    print '================================================================================='
    print "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho"
    print '================================================================================='

    manual_dict, auto_dict = ({}, {})
    not_found, total_size = (0, 0)
    for line in open(word_sim_file, 'r'):
        line = line.strip().lower()
        word1, word2, val = line.split()
        if word1 in word_vecs and word2 in word_vecs:
            manual_dict[(word1, word2)] = float(val)
            auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1],
                                                   word_vecs[word2])
        else:
            not_found += 1
        total_size += 1
    print "%15s" % str(total_size), "%15s" % str(not_found),
    print "%15.4f" % spearmans_rho(assign_ranks(manual_dict),
                                   assign_ranks(auto_dict))
Beispiel #12
0
import sys
import os

from read_write import read_word_vectors
from ranking import *

# Score an embedding file against every word-similarity dataset in a
# directory, with an optional cap on the vocabulary size read.
# NOTE(review): this snippet is cut off below -- the `else:` branch and the
# rest of the per-dataset loop are missing from the extracted source.
if __name__ == '__main__':
    word_vec_file = sys.argv[1]
    word_sim_dir = sys.argv[2]
    try:
        # Optional third argument caps the vocabulary; scientific notation
        # such as "1e5" is accepted.
        # NOTE(review): only IndexError is handled here, so a non-numeric
        # argv[3] (ValueError) would still crash -- confirm that is intended.
        top_vocab = int(float(sys.argv[3]))  #accepts answers in 1eX notation.
        if top_vocab < 0:
            top_vocab = 1e6
    except IndexError:
        top_vocab = 1e6
    word_vecs = read_word_vectors(word_vec_file, int(top_vocab))
    print '================================================================================='
    print "%6s" % "Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho"
    print '================================================================================='

    # One row per dataset file: gold similarity vs cosine similarity.
    for i, filename in enumerate(os.listdir(word_sim_dir)):
        manual_dict, auto_dict = ({}, {})
        not_found, total_size = (0, 0)
        for line in open(os.path.join(word_sim_dir, filename), 'r'):
            line = line.strip().lower()
            word1, word2, val = line.split()
            if word1 in word_vecs and word2 in word_vecs:
                manual_dict[(word1, word2)] = float(val)
                auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1],
                                                       word_vecs[word2])
            else:
Beispiel #13
0
# -*- coding:utf-8 -*-
import sys
import os
from read_write import read_word_vectors
from ranking import *

# Score an embedding file against every word-similarity dataset in a
# directory (Python 3 print style).
# NOTE(review): this snippet is cut off below -- the `else:` branch and the
# rest of the per-dataset loop are missing from the extracted source.
if __name__ == '__main__':
    word_vec_file = sys.argv[1]
    word_sim_dir = sys.argv[2]
    # NOTE(review): the second argument `False` is passed straight through
    # to read_word_vectors; its meaning (presumably "no vocabulary cap")
    # must be confirmed against that helper's definition.
    word_vecs = read_word_vectors(word_vec_file, False)

    print(
        '================================================================================='
    )
    print("%6s" % "Serial", "%20s" % "Dataset", "%15s" % "Num Pairs",
          "%15s" % "Not found", "%15s" % "Rho")
    print(
        '================================================================================='
    )

    # One row per dataset file: gold similarity vs cosine similarity.
    for i, filename in enumerate(os.listdir(word_sim_dir)):
        manual_dict, auto_dict = ({}, {})
        not_found, total_size = (0, 0)
        for line in open(os.path.join(word_sim_dir, filename), 'r'):
            line = line.strip().lower()
            word1, word2, val = line.split()
            if word1 in word_vecs and word2 in word_vecs:
                manual_dict[(word1, word2)] = float(val)
                auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1],
                                                       word_vecs[word2])
            else:
Beispiel #14
0
from read_write import read_word_vectors, read_word_vectors_orig
from ent_eval import *
import os
import sys
import time
import argparse

default_data = "../xling-entailment/data/monoling_entailment/baroni2012/data_lex_test.tsv"

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Short sample app')
    parser.add_argument('--m', action="store", dest="model", required=True)
    parser.add_argument('--d',
                        action="store",
                        dest="datapath",
                        default=default_data)
    opts = parser.parse_args(sys.argv[1:])

    scorer = ComplexScorer(datapath=opts.datapath)
    start = time.time()
    re_vecs, im_vecs = read_word_vectors(opts.model)
    # re_vecs,im_vecs=read_word_vectors_orig(opts.model+".real"),read_word_vectors_orig(opts.model+".imag")
    end = time.time()
    print "elapsed in loading", end - start
    missed, scores = scorer.compute_scores([re_vecs, im_vecs])
    print "missed", missed
    scorer.get_best_perf(scores)
Beispiel #15
0
    # NOTE(review): the enclosing `def` header of this helper was lost in
    # extraction; only its body survives below. It fills two cosine-similarity
    # dicts (one per embedding space) from the word-pair file `f`.
    for line in f:
        line = line.strip().lower()
        word1, word2, val = line.split()
        # Keep a pair only when both words exist in BOTH embedding spaces,
        # so the two similarity dicts cover exactly the same pairs.
        if word1 in word_vecs1 and word2 in word_vecs1 and word1 in word_vecs2 and word2 in word_vecs2:
            auto_dict1[(word1, word2)] = cosine_sim(word_vecs1[word1], word_vecs1[word2])
            auto_dict2[(word1, word2)] = cosine_sim(word_vecs2[word1], word_vecs2[word2])
        else:
            not_found += 1
        total_size += 1
    return auto_dict1,auto_dict2,not_found,total_size

if __name__=="__main__":
    word_vec1_file = sys.argv[1]
    word_vec2_file = sys.argv[2]
    word_sim_file = sys.argv[3]
    word_vecs1 = read_word_vectors(word_vec1_file)
    word_vecs2 = read_word_vectors(word_vec2_file)
    print '================================================================================='
    print "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho"
    print '================================================================================='
    
    manual_dict,auto1_dict,not_found,total_size = compute_vs_gold(open(word_sim_file,'r'),word_vecs1)
    A=spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto1_dict))
    print "%15s" % str(total_size), "%15s" % str(not_found),
    print "%15.4f" % A
    
    manual_dict,auto2_dict,not_found,total_size = compute_vs_gold(open(word_sim_file,'r'),word_vecs2)
    B=spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto2_dict))
    print "%15s" % str(total_size), "%15s" % str(not_found),
    print "%15.4f" % B
                format='png')


if __name__ == '__main__':
    # Measure distances between synonym pairs and between antonym pairs in
    # an embedding space, optionally under a caller-chosen metric.
    parser = argparse.ArgumentParser()
    parser.add_argument('--e', required=True, help="embedding file")
    parser.add_argument('--s', required=True, help="synonym lexicon file")
    parser.add_argument('--a', required=True, help="antonym lexicon file")
    parser.add_argument(
        '--d',
        required=False,
        help="Distance metric. Choose between euclidean and cosine")
    args = parser.parse_args(sys.argv[1:])

    name = args.e
    word_vectors = read_write.read_word_vectors(args.e)
    synonym_pairs = read_write.read_lexicon(args.s)
    antonym_pairs = read_write.read_lexicon(args.a)

    print("now calculating distances")
    if args.d is None:
        # No metric given: fall back to calculate_lex_distance's default.
        syn_mean_dist, syn_norm, syns = calculate_lex_distance(
            synonym_pairs, word_vectors)
        ant_mean_dist, ant_norm, ants = calculate_lex_distance(
            antonym_pairs, word_vectors)
    else:
        syn_mean_dist, syn_norm, syns = calculate_lex_distance(
            synonym_pairs, word_vectors, args.d)
        ant_mean_dist, ant_norm, ants = calculate_lex_distance(
            antonym_pairs, word_vectors, args.d)
    print("Calculated distances...")
Beispiel #17
0
import sys

from read_write import read_word_vectors
from findMatch import cosine_sim
from ranking import spearmans_rho
from ranking import assign_ranks

if __name__=='__main__':  
  wordVectorFile = sys.argv[1]
  wordVectors = read_word_vectors(wordVectorFile)
  print '================================================================================='
  print "%6s" %"Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho"
  print '================================================================================='
  DIR = '/usr1/corpora/usr0-corpora/word-sim/'
  FILES = ['EN-MC-30.txt', 'EN-MTurk-287.txt', 'EN-RG-65.txt', 'EN-RW-STANFORD.txt', 'EN-WS-353-ALL.txt', 'EN-WS-353-REL.txt', 'EN-WS-353-SIM.txt', 'EN-MEN-TR-3k.txt', 'EN-YP-130.txt', 'EN-MTurk-771.txt']

  for i, FILE in enumerate(FILES):
    manualDict, autoDict = ({}, {})
    notFound, totalSize = (0, 0)
    for line in open(DIR+FILE,'r'):
      line = line.strip().lower()
      word1, word2, val = line.split()
      if word1 in wordVectors and word2 in wordVectors:
        manualDict[(word1, word2)] = float(val)
        autoDict[(word1, word2)] = cosine_sim(wordVectors[word1], wordVectors[word2])
      else:
        notFound += 1
        totalSize += 1    
    print "%6s" % str(i+1), "%20s" % FILE, "%15s" % str(totalSize),
    print "%15s" % str(notFound),
    print "%15.4f" % spearmans_rho(assign_ranks(manualDict), assign_ranks(autoDict))
Beispiel #18
0
import sys
import os

from read_write import read_word_vectors
from ranking import *

# Score an embedding file against every word-similarity dataset in a
# directory, accumulating the total rho across datasets.
# NOTE(review): the snippet is cut off below -- the remaining print
# statements of the per-dataset row (and the total) are missing from the
# extracted source.
if __name__=='__main__':
  word_vec_file = sys.argv[1]
  word_sim_dir = sys.argv[2]

  word_vecs = read_word_vectors(word_vec_file)
  print '================================================================================='
  print "%6s" %"Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho"
  print '================================================================================='

  total_rho = 0
  for i, filename in enumerate(os.listdir(word_sim_dir)):
    manual_dict, auto_dict = ({}, {})
    not_found, total_size = (0, 0)
    for line in open(os.path.join(word_sim_dir, filename),'r'):
      line = line.strip().lower()
      word1, word2, val = line.split()
      if word1 in word_vecs and word2 in word_vecs:
        manual_dict[(word1, word2)] = float(val)
        auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2])
      else:
        not_found += 1
      total_size += 1
    rho = spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto_dict))
    total_rho += rho
    print "%6s" % str(i+1), "%20s" % filename, "%15s" % str(total_size),
    # NOTE(review): the `if __name__ ...` header of this snippet was lost in
    # extraction; the body below starts mid-script.
    word_vec_file = sys.argv[1]
    word_sim_dir = sys.argv[2]
    dimension = int(sys.argv[3])
    # NOTE(review): `oov` re-reads argv[3] (same as `dimension`); this looks
    # like it was meant to be sys.argv[4] -- confirm before relying on it.
    oov = int(sys.argv[3])

    print(sys.argv)

    # First pass: collect the full vocabulary used by the benchmark files so
    # only those vectors need to be loaded.
    v = []
    for i, filename in enumerate(os.listdir(word_sim_dir)):
        for line in open(os.path.join(word_sim_dir, filename), 'r'):
            line = line.strip().lower()
            word1, word2, val = line.split()
            v.append(word1)
            v.append(word2)

    word_vecs = read_word_vectors(word_vec_file, v, dimension, oov)
    print(
        '================================================================================='
    )
    print("%6s" % "Serial", "%20s" % "Dataset", "%15s" % "Num Pairs",
          "%15s" % "Not found", "%15s" % "Rho")
    print(
        '================================================================================='
    )

    # Second pass: score each benchmark file.
    for i, filename in enumerate(os.listdir(word_sim_dir)):
        manual_dict, auto_dict = ({}, {})
        not_found, total_size = (0, 0)
        for line in open(os.path.join(word_sim_dir, filename), 'r'):
            line = line.strip().lower()
            word1, word2, val = line.split()
            # NOTE(review): snippet is cut off here.
Beispiel #20
0
import sys
import os

from read_write import read_word_vectors
from ranking import *

# Score an embedding file against every word-similarity dataset in a
# directory.
# NOTE(review): the snippet is cut off below -- the closing arguments of the
# final spearmans_rho(...) call are missing from the extracted source.
if __name__ == '__main__':
    word_vec_file = sys.argv[1]
    word_sim_dir = sys.argv[2]

    word_vecs = read_word_vectors(word_vec_file)
    print '================================================================================='
    print "%6s" % "Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho"
    print '================================================================================='

    # One row per dataset file: gold similarity vs cosine similarity.
    for i, filename in enumerate(os.listdir(word_sim_dir)):
        manual_dict, auto_dict = ({}, {})
        not_found, total_size = (0, 0)
        for line in open(os.path.join(word_sim_dir, filename), 'r'):
            line = line.strip().lower()
            word1, word2, val = line.split()
            if word1 in word_vecs and word2 in word_vecs:
                manual_dict[(word1, word2)] = float(val)
                auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1],
                                                       word_vecs[word2])
            else:
                not_found += 1
            total_size += 1
        print "%6s" % str(i + 1), "%20s" % filename, "%15s" % str(total_size),
        print "%15s" % str(not_found),
        print "%15.4f" % spearmans_rho(assign_ranks(manual_dict),
    # NOTE(review): the script header of this snippet was lost in extraction;
    # `word_vec_dir`, `word_sim_dir` and `np` are defined above the cut, and
    # the snippet is also cut off at the `else:` near the end.
    word_vec_files = [f for f in os.listdir(word_vec_dir) if not f.startswith('.')] # Don't read .DS_Store!
    num_embeddings = len(word_vec_files)

    word_sim_files = [f for f in os.listdir(word_sim_dir) if not f.startswith('.')] # Don't read .DS_Store!
    num_benchmarks = len(word_sim_files)

    header = ["File #","Word_embedding"] + word_sim_files

    # One score per (embedding file, benchmark file) pair.
    scores = np.zeros((num_embeddings,num_benchmarks))


    for i,word_vec_file in enumerate(word_vec_files):
        root,ext = os.path.splitext(word_vec_file)

        word_vecs = read_word_vectors(os.path.join(word_vec_dir,word_vec_file))


        print "%6s" % str(i+1), "%30s" % root,

        for j, word_sim_file in enumerate(word_sim_files):
            manual_dict, auto_dict = ({}, {})
            not_found, total_size = (0, 0)
            not_found_words = []
            for line in open(os.path.join(word_sim_dir, word_sim_file),'r'):
                line = line.strip().lower()
                word1, word2, val = line.split()
                if word1 in word_vecs and word2 in word_vecs:
                    manual_dict[(word1, word2)] = float(val)
                    auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2])
                else:
                    # NOTE(review): snippet is cut off here.