from copy import deepcopy


def euclidean(wordVecs, ppDict, numIters):
    """Blend every word vector with the vectors of its lexicon neighbours.

    The work is done on a deep copy of ``wordVecs``; the mapping passed in is
    never modified.  Only words that are keys of both ``wordVecs`` and
    ``ppDict`` are updated, and only neighbours that themselves have a vector
    contribute.  A word whose usable neighbour set is empty keeps its vector.

    One update of a word with ``n`` usable neighbours is

        (n * original_vector + sum(current neighbour vectors)) / (2 * n)

    Vectors are overwritten as the sweep proceeds, so words visited later in
    a sweep already see the values written earlier in that same sweep.

    Vector values are assumed to support scalar multiplication, in-place
    addition and division (e.g. numpy arrays) -- this is not checked here.

    :param wordVecs: mapping word -> vector
    :param ppDict: mapping word -> iterable of neighbouring words
    :param numIters: number of sweeps over the shared vocabulary
    :return: the updated copy of ``wordVecs``
    """
    retrofitted = deepcopy(wordVecs)
    knownWords = set(retrofitted.keys())
    # Words missing from the lexicon simply keep their data estimate.
    sharedWords = knownWords & set(ppDict.keys())

    for _ in range(numIters):
        for word in sharedWords:
            neighbours = set(ppDict[word]) & knownWords
            if not neighbours:
                # Nothing to average with: keep the data estimate.
                continue
            count = len(neighbours)
            # NOTE: why such a high weight for data estimate?
            blended = count * wordVecs[word]
            # Every neighbour currently enters with weight 1.
            for neighbour in neighbours:
                blended += retrofitted[neighbour]
            retrofitted[word] = blended / (2 * count)

    return retrofitted


if __name__ == '__main__':
    # Usage: <word vector file> <ppdb file> <number of iterations> <output file>
    # NOTE(review): sys, read_word_vecs, read_ppdb and print_word_vecs are not
    # imported or defined in the visible source -- presumably provided
    # elsewhere; confirm.
    wordVecs = read_word_vecs(sys.argv[1])
    ppDict = read_ppdb(sys.argv[2], wordVecs)
    numIter = int(sys.argv[3])
    outFileName = sys.argv[4]

    # Enrich the word vectors using ppdb and print the enriched vectors.
    enrichedVecs = euclidean(wordVecs, ppDict, numIter)
    print_word_vecs(enrichedVecs, outFileName)
def euclidean(wordVecs, ppDict, numIters):
    """Return a smoothed copy of ``wordVecs`` using the neighbour lists in ``ppDict``.

    ``wordVecs`` maps words to vectors and ``ppDict`` maps words to iterables
    of related words.  The input mapping is deep-copied first and never
    changed.  For ``numIters`` sweeps, each word present in both mappings is
    replaced by

        (k * original_vector + sum(current vectors of its neighbours)) / (2 * k)

    where ``k`` counts the neighbours that have a vector.  Words with
    ``k == 0`` are left alone.  Because the copy is updated in place, a
    neighbour's vector may already have been updated earlier in the same
    sweep.

    Vectors are assumed to behave like numpy arrays (scalar ``*``, ``+=`` and
    ``/``); nothing here verifies that.
    """
    smoothed = deepcopy(wordVecs)
    vocab = set(smoothed.keys())
    # Restrict the loop to nodes that also occur in the ontology; everything
    # else just keeps its data estimate.
    inBoth = vocab & set(ppDict.keys())

    for _sweep in range(numIters):
        for word in inBoth:
            linked = set(ppDict[word]) & vocab
            degree = len(linked)
            if degree != 0:
                # NOTE: why such a high weight for data estimate?
                total = degree * wordVecs[word]
                # Neighbours are added with weight 1 each.
                for other in linked:
                    total += smoothed[other]
                smoothed[word] = total / (2 * degree)

    return smoothed


if __name__ == '__main__':
    # Arguments: word-vector file, ppdb file, iteration count, output file.
    # NOTE(review): sys, deepcopy, read_word_vecs, read_ppdb and
    # print_word_vecs are not imported or defined in the visible source --
    # presumably provided elsewhere; confirm.
    wordVecs = read_word_vecs(sys.argv[1])
    ppDict = read_ppdb(sys.argv[2], wordVecs)
    numIter = int(sys.argv[3])
    outFileName = sys.argv[4]

    # Enrich the word vectors using ppdb and print the enriched vectors.
    print_word_vecs(
        euclidean(wordVecs, ppDict, numIter),
        outFileName,
    )
from ranking import spearmans_rho
from ranking import assign_ranks

from numpy.linalg import norm
from random import shuffle
from operator import itemgetter


def cosine_sim(vec1, vec2):
    """Return the cosine similarity between two numpy arrays.

    Computed as ``vec1.dot(vec2) / (norm(vec1) * norm(vec2))``.  No guard is
    applied for zero-length vectors.
    """
    return vec1.dot(vec2) / (norm(vec1) * norm(vec2))


if __name__ == '__main__':
    # Imported here because no module-level ``import sys`` is visible in this
    # file; a duplicate import is harmless if one exists elsewhere.
    import sys

    # Usage: <word vector file> <directory prefix of the question files>
    wordVectorFile = sys.argv[1]
    DIR = sys.argv[2]

    # NOTE(review): read_word_vecs is not imported or defined in the visible
    # source -- presumably provided elsewhere; confirm.
    wordVectors = read_word_vecs(wordVectorFile)

    # Single-argument print calls produce the same output under Python 2 and
    # Python 3 (the original print statements only parse under Python 2).
    print('=================================================================================')
    print(" ".join([
        "%6s" % "Serial",
        "%20s" % "Dataset",
        "%15s" % "Num Quests",
        "%15s" % "Not found",
        "%15s" % "%",
    ]))
    print('=================================================================================')

    FILES = ['EN-ESL-50.txt', 'EN-RD-300.txt', 'EN-TOEFL-80.txt']

    for i, FILE in enumerate(FILES):
        targets = []
        mostSim = []
        candidates = []

        # DIR is used as a plain string prefix, so it must end with a path
        # separator.  The context manager closes the file after reading.
        with open(DIR + FILE, 'r') as questionFile:
            for line in questionFile:
                if not line.strip():
                    # A blank line has no second field; skip it instead of
                    # failing with IndexError below.
                    continue
                # Fields are '|'-separated: the first is stored as the
                # target, the second in mostSim (presumably the correct
                # answer -- confirm), and all fields after the first form
                # the candidate list.
                w = [c.strip() for c in line.strip().split('|')]
                targets.append(w[0])
                mostSim.append(w[1])
                # shuffle() works in place, so it must be given the list that
                # is actually stored.  Shuffling a temporary slice (as the
                # original did) left the candidates in file order, with w[1]
                # always first.
                options = w[1:]
                shuffle(options)
                candidates.append(options)
from io import read_word_vecs
# NOTE(review): the standard-library ``io`` module has no ``read_word_vecs``;
# this import presumably relies on a project-local ``io`` module shadowing it
# -- confirm that this still resolves on the interpreter in use.
from ranking import spearmans_rho
from ranking import assign_ranks

from numpy.linalg import norm
from random import shuffle
from operator import itemgetter


def cosine_sim(vec1, vec2):
    """Return the cosine similarity between two numpy arrays.

    Computed as ``vec1.dot(vec2) / (norm(vec1) * norm(vec2))``.  No guard is
    applied for zero-length vectors.
    """
    return vec1.dot(vec2) / (norm(vec1) * norm(vec2))


if __name__ == '__main__':
    # Imported here because no module-level ``import sys`` is visible in this
    # file; a duplicate import is harmless if one exists elsewhere.
    import sys

    # Usage: <word vector file> <directory prefix of the question files>
    wordVectorFile = sys.argv[1]
    DIR = sys.argv[2]

    wordVectors = read_word_vecs(wordVectorFile)

    # Single-argument print calls produce the same output under Python 2 and
    # Python 3 (the original print statements only parse under Python 2).
    print('=================================================================================')
    print(" ".join([
        "%6s" % "Serial",
        "%20s" % "Dataset",
        "%15s" % "Num Quests",
        "%15s" % "Not found",
        "%15s" % "%",
    ]))
    print('=================================================================================')

    FILES = ['EN-ESL-50.txt', 'EN-RD-300.txt', 'EN-TOEFL-80.txt']

    for i, FILE in enumerate(FILES):
        targets = []
        mostSim = []
        candidates = []

        # DIR is used as a plain string prefix, so it must end with a path
        # separator.  The context manager closes the file after reading.
        with open(DIR + FILE, 'r') as questionFile:
            for line in questionFile:
                if not line.strip():
                    # A blank line has no second field; skip it instead of
                    # failing with IndexError below.
                    continue
                # Fields are '|'-separated: the first is stored as the
                # target, the second in mostSim (presumably the correct
                # answer -- confirm), and all fields after the first form
                # the candidate list.
                w = [c.strip() for c in line.strip().split('|')]
                targets.append(w[0])
                mostSim.append(w[1])
                # shuffle() works in place, so it must be given the list that
                # is actually stored.  Shuffling a temporary slice (as the
                # original did) left the candidates in file order, with w[1]
                # always first.
                options = w[1:]
                shuffle(options)
                candidates.append(options)