def evaluateEmbeddingPairs(modelPaths, outputFilename, distance=True):
    """Score word couples against each pair of embedding models and persist the results.

    modelPaths is an iterable of (path1, path2) tuples; each pair is loaded,
    scored together, and the combined results are written to outputFilename
    with a "path1--path2" label per pair.
    """
    couples = generateWordCouples()
    allScores = []
    for firstPath, secondPath in modelPaths:
        firstModel = Model.loadKeyedVectors(firstPath)
        secondModel = Model.loadKeyedVectors(secondPath)
        print("Bless", firstModel, secondModel)
        pairScore = Utils.getTypeScoresForEmbeddings(
            couples, firstModel, secondModel, distance=distance)
        allScores.append(pairScore)
    pairLabels = ["{}--{}".format(a, b) for a, b in modelPaths]
    Utils.storeResultsToDisk(allScores, pairLabels, outputFilename, pairs=True)
def createVocabCSVFile(fileName="VisualGloveVocab.csv"):
    """Dump the GloVe-50 vocabulary as a single fully-quoted CSV row."""
    from NormalGlove import Model
    gloveModel = Model.loadGlove50()
    words = list(gloveModel.wv.vocab)
    with open(fileName, 'w') as csvFile:
        csv.writer(csvFile, quoting=csv.QUOTE_ALL).writerow(words)
def combineGloveFiles(file1, file2, outputFile):
    """Merge two GloVe embedding files word-by-word into one output file.

    For every word of file1's vocabulary that also exists in file2, a line
    "word vec1 vec2" is written to outputFile; words missing from file2 are
    collected and reported at the end. Progress is printed every 100k
    written words.
    """
    vecsA = Model.loadKeyedVectors(file1)
    vecsB = Model.loadKeyedVectors(file2)
    print("Vocab1:", len(vecsA.vocab), " Vocab2:", len(vecsB.vocab))
    missingWords = []
    with open(outputFile, 'w', encoding='utf-8') as combined:
        for index, word in enumerate(list(vecsA.vocab)):
            if word not in vecsB:
                missingWords.append(word)
                continue
            left = _embeddingsToString(vecsA[word])
            right = _embeddingsToString(vecsB[word])
            combined.write("{} {} {}\n".format(word, left, right))
            # Progress marker is only emitted on written (found) words,
            # matching the original control flow.
            if index % 100000 == 0:
                print(index)
    print("Missing words:", len(missingWords))
    print(missingWords)
def evaluateEmbeddings(modelPaths, outputFilename, distance=True):
    """Evaluate word-couple type scores for each embedding file and store them.

    Each path in modelPaths is loaded as keyed vectors, scored against the
    generated word couples, and the per-model results are written to disk
    labeled by their paths.
    """
    couples = generateWordCouples()
    collected = []
    for path in modelPaths:
        print(path)
        vectors = Model.loadKeyedVectors(path)
        collected.append(
            Utils.getTypeScoresForEmbeddings(couples, vectors, distance=distance))
    Utils.storeResultsToDisk(collected, modelPaths, outputFilename)
def evaluateModels(modelPaths, outputFilename, datasetPath=None, distance=False):
    """Score each model on word couples from an optional dataset and save results.

    Unlike evaluateEmbeddings, the couples can come from a caller-supplied
    datasetPath and distance defaults to False.
    """
    couples = generateWordCouples(datasetPath)
    scores = []
    for modelPath in modelPaths:
        print("***Synonyms***\n", modelPath)
        loaded = Model.loadKeyedVectors(modelPath)
        scores.append(
            Utils.getTypeScoresForEmbeddings(couples, loaded, distance=distance))
    Utils.storeResultsToDisk(scores, modelPaths, outputFilename)
def performPCA(mainFilename, gloveOutputFolder, dimensions, includeSize, skipSize=0, version=1):
    """Fit PCA transformers on the full visual embeddings and emit reduced files.

    Builds one transformer per target dimension (each shifted by skipSize),
    fitting on the first includeSize vectors, then writes the reduced
    embedding files for the whole vocabulary.
    """
    targetDims = [dim + skipSize for dim in dimensions]
    # NOTE(review): no "/" separator here, unlike the "/Keyed-..." path below —
    # presumably the folder path already ends with a slash; confirm.
    pcaFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
        version) + "PCA-{}".format(mainFilename)
    if not os.path.isdir(pcaFolder):
        os.mkdir(pcaFolder)
    keyedPath = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
        version) + "/Keyed-VisualGlove-Full"
    model = Model.loadKeyedVectors(keyedPath)
    rawVectors = [model.wv[key] for key in model.vocab]
    PCAReduction.createPCATransformers(
        rawVectors[:includeSize], targetDims, pcaFolder)
    PCAReduction.createPCAEmbeddingFiles(
        model, rawVectors, pcaFolder, gloveOutputFolder,
        mainFilename, targetDims, skipSize)
for w in fullVocab: if (w not in wordsFoundInFolders): wordsNotFound.append(w) return wordsNotFound if (__name__ == '__main__'): from NormalGlove import Model # removeListCharsFromFile(DatasetManager.getVisualEmbeddingsFullSizeFolderPath() + "/VisualGlove-Full.txt", "NewFile") file = DatasetManager.getVisualEmbeddingsFullSizeFolderPath( ) + "/ProperFormat.txt" # currentVocab = DatasetManager._getWordsFromEmbeddingFile(file) # concatenateEmbeddingsFiles(DatasetManager.getVisualEmbeddingsFullSizeFolderPath(), "ProperFormat.txt") temp = Model.loadGloveVectors(file) ''' fullVocab = Vocab.readVocabFromCSVFile() print("Full Vocab loaded") print("Current vocab loaded") currentVocabLookup = {} for w in currentVocab: currentVocabLookup[w] = 1 del currentVocab fullVocabSize = len(fullVocab) missingVocab = [] for i, w in enumerate(fullVocab): if(w not in currentVocabLookup): missingVocab.append(w)
def createKeyedVectorsFromGloveFile(gloveFile, outputFileName):
    """Load a plain-text GloVe file and save it in keyed-vectors format."""
    print("Creating Keyed Vectors file from:", gloveFile)
    vectors = Model.loadGloveVectors(gloveFile)
    vectors.save(outputFileName)