Exemple #1
0
def evaluateEmbeddingPairs(modelPaths, outputFilename, distance=True):
    """Score pairs of embedding models on the generated word couples.

    Args:
        modelPaths: Iterable of ``(path1, path2)`` pairs. Each path is loaded
            with ``Model.loadKeyedVectors``.
        outputFilename: Forwarded to ``Utils.storeResultsToDisk``.
        distance: Forwarded to ``Utils.getTypeScoresForEmbeddings``.

    The stored results are labelled ``"<path1>--<path2>"``, one label per
    pair, in the same order as the results.
    """
    couples = generateWordCouples()
    # The pairs are needed twice (scoring, then labelling). Snapshot them so
    # a one-shot iterable (e.g. a generator) still yields matching labels.
    pathPairs = list(modelPaths)

    results = []
    for p1, p2 in pathPairs:
        m1, m2 = Model.loadKeyedVectors(p1), Model.loadKeyedVectors(p2)
        print("Bless", m1, m2)
        results.append(
            Utils.getTypeScoresForEmbeddings(couples,
                                             m1,
                                             m2,
                                             distance=distance))

    labels = ["{}--{}".format(p1, p2) for p1, p2 in pathPairs]
    Utils.storeResultsToDisk(results, labels, outputFilename, pairs=True)
Exemple #2
0
def createVocabCSVFile(fileName="VisualGloveVocab.csv"):
    """Write the vocabulary of the ``Model.loadGlove50()`` model to a CSV file.

    The whole vocabulary is written as a single row with every field quoted.

    Args:
        fileName: Path of the CSV file to create or overwrite.
    """
    from NormalGlove import Model
    model = Model.loadGlove50()
    vocab = list(model.wv.vocab)
    # The csv module does its own line-ending handling and expects the file
    # to be opened with newline=''; otherwise extra "\r" characters can be
    # written on platforms that translate "\n".
    with open(fileName, 'w', newline='') as csvFile:
        writer = csv.writer(csvFile, quoting=csv.QUOTE_ALL)
        writer.writerow(vocab)
def combineGloveFiles(file1, file2, outputFile):
    """Merge two embedding files word by word into one text file.

    For every word in the vocabulary of ``file1`` that is also present in
    ``file2``, one line ``"<word> <emb1> <emb2>"`` is written to
    ``outputFile`` (UTF-8), where each embedding is rendered by
    ``_embeddingsToString``. Words missing from ``file2`` are not written;
    their count and the full list are printed at the end.

    Args:
        file1: Path loaded with ``Model.loadKeyedVectors``; its vocabulary
            drives the iteration.
        file2: Path loaded with ``Model.loadKeyedVectors``; supplies the
            second embedding of each line.
        outputFile: Path of the combined file to create or overwrite.
    """
    embeddings = Model.loadKeyedVectors(file1)
    embeddings2 = Model.loadKeyedVectors(file2)
    print("Vocab1:", len(embeddings.vocab), "  Vocab2:",
          len(embeddings2.vocab))
    vocab = list(embeddings.vocab)
    missingWords = []
    with open(outputFile, 'w', encoding='utf-8') as newFile:
        for i, w in enumerate(vocab):
            if w not in embeddings2:
                missingWords.append(w)
                continue
            # Serialise only once the word is known to be kept, so no work is
            # spent on vectors that would be thrown away.
            emb1 = _embeddingsToString(embeddings[w])
            emb2 = _embeddingsToString(embeddings2[w])
            newFile.write("{} {} {}\n".format(w, emb1, emb2))
            # Progress is reported only for indices whose word was written.
            if i % 100000 == 0:
                print(i)
    # The output file is closed at this point; report what was skipped.
    print("Missing words:", len(missingWords))
    print(missingWords)
Exemple #4
0
def evaluateEmbeddings(modelPaths, outputFilename, distance=True):
    """Score each embedding model on the generated word couples and store it.

    Args:
        modelPaths: Paths loaded one by one with ``Model.loadKeyedVectors``;
            also passed on to ``Utils.storeResultsToDisk``.
        outputFilename: Forwarded to ``Utils.storeResultsToDisk``.
        distance: Forwarded to ``Utils.getTypeScoresForEmbeddings``.
    """
    wordCouples = generateWordCouples()

    def scorePath(path):
        # Print the path first so the model being processed is visible.
        print(path)
        keyedVectors = Model.loadKeyedVectors(path)
        return Utils.getTypeScoresForEmbeddings(wordCouples,
                                                keyedVectors,
                                                distance=distance)

    scores = [scorePath(path) for path in modelPaths]
    Utils.storeResultsToDisk(scores, modelPaths, outputFilename)
def evaluateModels(modelPaths,
                   outputFilename,
                   datasetPath=None,
                   distance=False):
    """Score every model on word couples generated from ``datasetPath``.

    Args:
        modelPaths: Paths loaded one by one with ``Model.loadKeyedVectors``;
            also passed on to ``Utils.storeResultsToDisk``.
        outputFilename: Forwarded to ``Utils.storeResultsToDisk``.
        datasetPath: Forwarded to ``generateWordCouples``.
        distance: Forwarded to ``Utils.getTypeScoresForEmbeddings``.
    """
    wordCouples = generateWordCouples(datasetPath)

    scoresPerModel = []
    for path in modelPaths:
        print("***Synonyms***\n", path)
        keyedVectors = Model.loadKeyedVectors(path)
        modelScores = Utils.getTypeScoresForEmbeddings(wordCouples,
                                                       keyedVectors,
                                                       distance=distance)
        scoresPerModel.append(modelScores)

    Utils.storeResultsToDisk(scoresPerModel, modelPaths, outputFilename)
def performPCA(mainFilename,
               gloveOutputFolder,
               dimensions,
               includeSize,
               skipSize=0,
               version=1):
    """Create PCA transformers and PCA embedding files for the visual vectors.

    Loads the "Keyed-VisualGlove-Full" vectors from the full-size visual
    embeddings folder, hands the first ``includeSize`` vectors to
    ``PCAReduction.createPCATransformers`` and then all vectors to
    ``PCAReduction.createPCAEmbeddingFiles``.

    Args:
        mainFilename: Used in the PCA output folder name ("PCA-<name>") and
            forwarded to ``PCAReduction.createPCAEmbeddingFiles``.
        gloveOutputFolder: Forwarded to
            ``PCAReduction.createPCAEmbeddingFiles``.
        dimensions: Iterable of numbers; each is increased by ``skipSize``
            before being passed to ``PCAReduction``.
        includeSize: Number of leading embeddings used to create the
            transformers.
        skipSize: Offset added to every entry of ``dimensions``; also
            forwarded to ``PCAReduction.createPCAEmbeddingFiles``.
        version: Forwarded to
            ``DatasetManager.getVisualEmbeddingsFullSizeFolderPath``.
    """
    skipDimensions = [d + skipSize for d in dimensions]

    fullSizeFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
        version)

    # NOTE(review): unlike the "/Keyed-VisualGlove-Full" path below, no path
    # separator is inserted before "PCA-". Kept as-is so existing output
    # locations do not move -- confirm this is intended.
    PCAOutputFolder = fullSizeFolder + "PCA-{}".format(mainFilename)
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(PCAOutputFolder, exist_ok=True)

    embeddingFilePath = fullSizeFolder + "/Keyed-VisualGlove-Full"
    model = Model.loadKeyedVectors(embeddingFilePath)
    pureEmbeddings = [model.wv[k] for k in model.vocab]

    PCAReduction.createPCATransformers(pureEmbeddings[:includeSize],
                                       skipDimensions, PCAOutputFolder)
    PCAReduction.createPCAEmbeddingFiles(model, pureEmbeddings,
                                         PCAOutputFolder, gloveOutputFolder,
                                         mainFilename, skipDimensions,
                                         skipSize)
    for w in fullVocab:
        if (w not in wordsFoundInFolders):
            wordsNotFound.append(w)

    return wordsNotFound


# Script entry point: loads the GloVe vectors stored in "ProperFormat.txt"
# inside the full-size visual embeddings folder.
if (__name__ == '__main__'):
    from NormalGlove import Model

    # removeListCharsFromFile(DatasetManager.getVisualEmbeddingsFullSizeFolderPath() + "/VisualGlove-Full.txt", "NewFile")
    # Path of the embeddings file: "<full-size folder>/ProperFormat.txt".
    file = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
    ) + "/ProperFormat.txt"
    # currentVocab = DatasetManager._getWordsFromEmbeddingFile(file)
    # concatenateEmbeddingsFiles(DatasetManager.getVisualEmbeddingsFullSizeFolderPath(), "ProperFormat.txt")
    # Load the vectors from that file. The result is only bound to `temp`;
    # no later use is visible in this part of the script.
    temp = Model.loadGloveVectors(file)
    '''
    fullVocab = Vocab.readVocabFromCSVFile()
    print("Full Vocab loaded")
    print("Current vocab loaded")

    currentVocabLookup = {}
    for w in currentVocab:
        currentVocabLookup[w] = 1
    del currentVocab

    fullVocabSize = len(fullVocab)
    missingVocab = []
    for i, w in enumerate(fullVocab):
        if(w not in currentVocabLookup):
            missingVocab.append(w)
def createKeyedVectorsFromGloveFile(gloveFile, outputFileName):
    """Load vectors from a GloVe file and save them under ``outputFileName``.

    Args:
        gloveFile: Path passed to ``Model.loadGloveVectors``.
        outputFileName: Destination passed to the loaded object's ``save``.
    """
    print("Creating Keyed Vectors file from:", gloveFile)
    keyedVectors = Model.loadGloveVectors(gloveFile)
    keyedVectors.save(outputFileName)