예제 #1
0
def fixLeftovers():
    """Generate embeddings for vocabulary words that currently lack one.

    Reads the vocabulary with ``Vocab.readVocabFromCSVFile``, asks
    ``Leftovers.findWordsThatLackEmbedings`` which of those words are missing
    under the full-size visual-embeddings folder, prints how many were found
    and hands them to ``ImageEmbedder.createAverageEmbeddingsForVocab``.

    NOTE(review): the literal arguments ``3, 10, [224, 224], ..., 3`` are
    passed positionally; their meaning is defined by ``ImageEmbedder`` and is
    not visible here ([224, 224] is presumably an image size -- confirm).
    """
    from NormalGlove import Vocab

    embeddingsFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath()
    allWords = Vocab.readVocabFromCSVFile()
    missingWords = Leftovers.findWordsThatLackEmbedings(allWords,
                                                        embeddingsFolder)
    print("Leftovers found", len(missingWords))

    ImageEmbedder.createAverageEmbeddingsForVocab(
        missingWords, 3, 10, [224, 224], embeddingsFolder, 3)
예제 #2
0
def performPCA(mainFilename,
               gloveOutputFolder,
               dimensions,
               includeSize,
               skipSize=0,
               version=1):
    """Fit PCA transformers on the visual embeddings and write reduced files.

    Args:
        mainFilename: Used to name the ``PCA-<mainFilename>`` output folder
            and forwarded to ``PCAReduction.createPCAEmbeddingFiles``.
        gloveOutputFolder: Forwarded unchanged to
            ``PCAReduction.createPCAEmbeddingFiles``.
        dimensions: Iterable of numbers; each one is increased by
            ``skipSize`` before being passed to ``PCAReduction``.
        includeSize: Upper slice bound; only the first ``includeSize``
            embeddings are given to ``createPCATransformers``.
        skipSize: Offset added to every entry of ``dimensions`` and also
            forwarded to ``createPCAEmbeddingFiles``.
        version: Passed to
            ``DatasetManager.getVisualEmbeddingsFullSizeFolderPath``.
    """
    shiftedDimensions = []
    for dim in dimensions:
        shiftedDimensions.append(dim + skipSize)

    # The folder getter is called separately for each path, exactly as
    # before; it is not visible here whether the call has side effects.
    pcaFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(version)
    pcaFolder = pcaFolder + "PCA-{}".format(mainFilename)
    if not os.path.isdir(pcaFolder):
        os.mkdir(pcaFolder)

    keyedVectorsFile = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
        version)
    keyedVectorsFile = keyedVectorsFile + "/Keyed-VisualGlove-Full"
    model = Model.loadKeyedVectors(keyedVectorsFile)

    # Collect one vector per vocabulary entry, in the vocab's iteration order.
    vectors = []
    for key in model.vocab:
        vectors.append(model.wv[key])

    # Transformers are fitted on the leading slice only; the embedding files
    # are produced from the complete list of vectors.
    trainingVectors = vectors[:includeSize]
    PCAReduction.createPCATransformers(trainingVectors, shiftedDimensions,
                                       pcaFolder)
    PCAReduction.createPCAEmbeddingFiles(model, vectors, pcaFolder,
                                         gloveOutputFolder, mainFilename,
                                         shiftedDimensions, skipSize)
def main():
    """Load the full visual GloVe embeddings, reduce them and plot the result.

    Steps, in order:
      1. Read word -> embedding data from ``VisualGlove-Full.txt`` in the
         full-size visual-embeddings folder.
      2. Save the loaded data under the name ``tempfullEmbeddings``.
      3. Reduce the vectors with ``getTsneEmbeddings``.
      4. Save the reduced vectors under the name ``tempEmbeddings``.
      5. Pass the reduced vectors and their word labels to
         ``visualizeEmbeddings``.
    """
    from Datasets import DatasetManager

    sourceFile = DatasetManager.getVisualEmbeddingsFullSizeFolderPath() + "VisualGlove-Full.txt"
    wordToVector = DatasetManager.getWordsAndEmbeddingsFromFile(sourceFile)
    FileProcessing.saveToFile(wordToVector, 'tempfullEmbeddings')

    # Pull the vectors out in key order so they line up with the labels below.
    vectors = []
    for word in wordToVector.keys():
        vectors.append(wordToVector[word])

    print("Getting TSNE embeddings")
    projected = getTsneEmbeddings(vectors)

    print("Saving to file...")
    FileProcessing.saveToFile(projected, 'tempEmbeddings')

    wordLabels = list(wordToVector.keys())
    visualizeEmbeddings(projected, wordLabels)
예제 #4
0
                wordsFoundInFolders[w] = 1
        counter += 1

    wordsNotFound = []
    for w in fullVocab:
        if (w not in wordsFoundInFolders):
            wordsNotFound.append(w)

    return wordsNotFound


if (__name__ == '__main__'):
    from NormalGlove import Model

    # removeListCharsFromFile(DatasetManager.getVisualEmbeddingsFullSizeFolderPath() + "/VisualGlove-Full.txt", "NewFile")
    file = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
    ) + "/ProperFormat.txt"
    # currentVocab = DatasetManager._getWordsFromEmbeddingFile(file)
    # concatenateEmbeddingsFiles(DatasetManager.getVisualEmbeddingsFullSizeFolderPath(), "ProperFormat.txt")
    temp = Model.loadGloveVectors(file)
    '''
    fullVocab = Vocab.readVocabFromCSVFile()
    print("Full Vocab loaded")
    print("Current vocab loaded")

    currentVocabLookup = {}
    for w in currentVocab:
        currentVocabLookup[w] = 1
    del currentVocab

    fullVocabSize = len(fullVocab)
    missingVocab = []
예제 #5
0
def concatEmbeddingFiles():
    """Concatenate the embedding files of the full-size visual folder.

    Looks up the folder with
    ``DatasetManager.getVisualEmbeddingsFullSizeFolderPath`` and passes it,
    together with the name ``VisualGlove-Full.txt``, to
    ``GloveFormatter.concatenateEmbeddingsFiles``.
    """
    embeddingsFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath()
    GloveFormatter.concatenateEmbeddingsFiles(embeddingsFolder,
                                              "VisualGlove-Full.txt")