예제 #1
0
def _getCombinedGloveFolder(name, gloveSize, visualSize, version=1):
    """Build the folder path for one combined Glove/visual embedding set.

    Version 1 folders sit directly under the datasets root; any other
    version is nested under an additional "/V<version>" directory.

    Returns:
        The folder path as a string, ending with a trailing "/".
    """
    if version == 1:
        template = "/CombinedGlove-{}/Glove{}-Visual{}/"
        values = (name, gloveSize, visualSize)
    else:
        template = "/V{}/CombinedGlove-{}/Glove{}-Visual{}/"
        values = (version, name, gloveSize, visualSize)
    return DatasetManager._getDatasetsFolderPath() + template.format(*values)
예제 #2
0
def getTopAndSkipCombined(top, skip, version=1):
    """List the keyed-vector paths of the combined Top/Skip embedding sets.

    One path is produced per (glove size, visual size) pair in a fixed list.
    Version 1 sets live directly under the datasets root, other versions
    under an extra "/V<version>" directory.
    """
    sizePairs = [(50, 50), (50, 150), (100, 100), (200, 100), (100, 200),
                 (50, 300), (100, 300), (300, 50), (300, 150), (300, 300)]

    root = DatasetManager._getDatasetsFolderPath()
    if version == 1:
        subFolder = "/CombinedGlove-Top{}K-Skip{}/".format(top, skip)
    else:
        subFolder = "/V{}/CombinedGlove-Top{}K-Skip{}/".format(version, top, skip)
    basePath = root + subFolder

    paths = []
    for gloveSize, visualSize in sizePairs:
        paths.append(basePath + "Glove{}-Visual{}/Keyed-Glove{}-Visual{}".format(
            gloveSize, visualSize, gloveSize, visualSize))
    return paths
예제 #3
0
def concatToStandardGlove(visualGloveBaseFolder,
                          name,
                          sizeCombinations=None,
                          version=1):
    """Combine visual Glove files with standard Glove files, size pair by size pair.

    For every (gloveSize, visualGloveSize) pair the combined Glove text file
    is written first, then a keyed-vectors file is created from it. The
    output folder for each pair is created when missing.

    Args:
        visualGloveBaseFolder: Path prefix of the visual embedding folders;
            "-<visualGloveSize>/Keyed-VisualGlove-<visualGloveSize>" is
            appended to it.
        name: Dataset name used to build the output folder and file names.
        sizeCombinations: Iterable of (gloveSize, visualGloveSize) pairs.
            Falls back to STANDARD_COMBINATIONS when None.
        version: Dataset version forwarded to the path helpers.
    """
    # Identity test: "== None" misbehaves on array-like arguments.
    if sizeCombinations is None:
        sizeCombinations = STANDARD_COMBINATIONS

    for gloveSize, visualGloveSize in sizeCombinations:
        gloveFile = DatasetManager._getDatasetsFolderPath(
        ) + "/StandardGlove/Keyed-Glove{}-Visual0".format(gloveSize)
        visualGloveFile = visualGloveBaseFolder + "-{}/Keyed-VisualGlove-{}".format(
            visualGloveSize, visualGloveSize)

        saveDir = _getCombinedGloveFolder(name, gloveSize, visualGloveSize,
                                          version)
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(saveDir, exist_ok=True)

        newFileName = _getCombinedGlovedFilename(name, gloveSize,
                                                 visualGloveSize, version)
        GloveFormatter.combineGloveFiles(visualGloveFile, gloveFile,
                                         newFileName)

        keyedFilename = _getCombinedKeyedFilename(name, gloveSize,
                                                  visualGloveSize, version)
        GloveFormatter.createKeyedVectorsFromGloveFile(newFileName,
                                                       keyedFilename)
예제 #4
0
def generateWordCouples():
    """Read the BLESS dataset and return (word, word2, relation) tuples.

    Each line is lower-cased, stripped and split on single spaces into
    exactly four fields; the second field is ignored.
    """
    with open(DatasetManager.getBLESSDataset(), 'r') as blessFile:
        rows = [line.lower().strip().split(' ') for line in blessFile.readlines()]

    couples = []
    for first, _, relation, second in rows:
        couples.append((first, second, relation))
    return couples
def main():
    """Reduce the full visual embeddings with t-SNE and visualize them.

    The loaded embeddings and the reduced embeddings are both saved through
    FileProcessing.saveToFile before the visualization is shown.
    """
    from Datasets import DatasetManager

    fullEmbeddingFile = DatasetManager.getVisualEmbeddingsFullSizeFolderPath() + "VisualGlove-Full.txt"
    embeddings = DatasetManager.getWordsAndEmbeddingsFromFile(fullEmbeddingFile)
    FileProcessing.saveToFile(embeddings, 'tempfullEmbeddings')

    vectors = [embeddings[word] for word in embeddings.keys()]
    print("Getting TSNE embeddings")
    reducedEmbeddings = getTsneEmbeddings(vectors)

    print("Saving to file...")
    FileProcessing.saveToFile(reducedEmbeddings, 'tempEmbeddings')

    # Labels are the words, in the same key order used for the vectors.
    visualizeEmbeddings(reducedEmbeddings, list(embeddings.keys()))
예제 #6
0
def fixLeftovers():
    """Create embeddings for vocabulary words that have none yet.

    Words of the CSV vocabulary that are missing from the full-size visual
    embeddings folder are collected and passed to
    ImageEmbedder.createAverageEmbeddingsForVocab.
    """
    from NormalGlove import Vocab

    embeddingsFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath()
    missingWords = Leftovers.findWordsThatLackEmbedings(
        Vocab.readVocabFromCSVFile(), embeddingsFolder)
    print("Leftovers found", len(missingWords))

    # NOTE(review): the meaning of the positional 3, 10 and trailing 3 is not
    # visible here; [224, 224] is presumably the image size -- confirm.
    ImageEmbedder.createAverageEmbeddingsForVocab(
        missingWords, 3, 10, [224, 224], embeddingsFolder, 3)
예제 #7
0
def performPCA(mainFilename,
               gloveOutputFolder,
               dimensions,
               includeSize,
               skipSize=0,
               version=1):
    """Fit PCA transformers on the full visual embeddings and write reduced files.

    Args:
        mainFilename: Name used for the "PCA-<mainFilename>" output folder and
            forwarded to PCAReduction.createPCAEmbeddingFiles.
        gloveOutputFolder: Output location forwarded to
            PCAReduction.createPCAEmbeddingFiles.
        dimensions: Target dimensions; skipSize is added to each of them
            before they are handed to PCAReduction.
        includeSize: Number of leading embeddings used to create the
            transformers.
        skipSize: Offset added to every entry of dimensions and forwarded to
            PCAReduction.createPCAEmbeddingFiles.
        version: Dataset version used to locate the full-size embeddings.
    """
    skipDimensions = [d + skipSize for d in dimensions]

    # NOTE(review): this path is built without a "/" separator while the one
    # below adds "/"; verify which form getVisualEmbeddingsFullSizeFolderPath
    # returns.
    PCAOutputFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
        version) + "PCA-{}".format(mainFilename)
    # exist_ok avoids the check-then-create race of isdir() + mkdir() and
    # matches the makedirs-based folder creation used by the concat helpers.
    os.makedirs(PCAOutputFolder, exist_ok=True)

    embeddingFilePath = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
        version) + "/Keyed-VisualGlove-Full"
    model = Model.loadKeyedVectors(embeddingFilePath)
    pureEmbeddings = [model.wv[k] for k in model.vocab]

    # Transformers are created from the first includeSize embeddings only;
    # the embedding files are created from all of them.
    PCAReduction.createPCATransformers(pureEmbeddings[:includeSize],
                                       skipDimensions, PCAOutputFolder)
    PCAReduction.createPCAEmbeddingFiles(model, pureEmbeddings,
                                         PCAOutputFolder, gloveOutputFolder,
                                         mainFilename, skipDimensions,
                                         skipSize)
예제 #8
0
def findWordsThatLackEmbedings(fullVocab, folderPathToEmbeddingsFiles):
    """Return the words of fullVocab not found in any embedding file.

    Every entry of folderPathToEmbeddingsFiles is read through
    DatasetManager._getWordsFromEmbeddingFile. The result keeps the order
    (and any duplicates) of fullVocab.
    """
    seenWords = set()
    for fileIndex, fileName in enumerate(os.listdir(folderPathToEmbeddingsFiles)):
        print("File", fileIndex)
        seenWords.update(DatasetManager._getWordsFromEmbeddingFile(
            folderPathToEmbeddingsFiles + "/" + fileName))

    return [word for word in fullVocab if word not in seenWords]
예제 #9
0
def concatenateEmbeddingsFiles(folderPath, newFileName):
    """Merge all embedding files of a folder into one file.

    Sub-directories of folderPath are skipped. A word is written only the
    first time it is seen, as "<word> <embedding>" on its own line.
    """
    writtenWords = set()
    with open(newFileName, 'w', encoding='utf-8') as outFile:
        # The folder is listed after the output file has been opened, as in
        # the original statement order.
        fileNames = [
            entry for entry in os.listdir(folderPath)
            if not os.path.isdir(folderPath + "/" + entry)
        ]
        for fileName in fileNames:
            print(fileName)
            localEmbeddings = DatasetManager.getWordsAndEmbeddingsFromFile(
                folderPath + "/" + fileName, asStr=True)
            for word in localEmbeddings:
                if word in writtenWords:
                    continue
                writtenWords.add(word)
                embeddingText = _embeddingsToString(localEmbeddings[word],
                                                    strEmbeddings=True)
                outFile.write("{} {}\n".format(word, embeddingText))
            print("Processed lines:", len(writtenWords))
예제 #10
0
def _resizeWorker(id, vocab, imgSize):
    print("Starting worker", id)
    failed = 0
    for i, w in enumerate(vocab):
        try:
            if ((i + 1) % 10 == 0):
                print("{}: {}/{}  Failed: {}".format(id, i, len(vocab),
                                                     failed))

            imgs = DatasetManager.getSameSizeGloveImages(w,
                                                         imgSize,
                                                         asNumpy=True)
            if (len(imgs) == 0):
                failed += 1
                continue

            FileProcessing.saveToFile(imgs, _getResizedWordPath(w))
        except Exception as e:
            failed += 1
            pass
예제 #11
0
def _getDataset(filepath=None):
    if (filepath == None):
        filepath = DatasetManager.getSynonymsDataset()

    with open(filepath, 'r') as file:
        sections = file.read().split('=')

    parsedSections = []
    for s in sections:
        subSections = s.strip().replace('\n',
                                        ' ').replace('.', ' ').split(':'), "\n"

        sortedSection = {}
        currentSection = ""
        for sub in subSections:
            for w in ' '.join(sub).split(' '):
                w = w.strip()
                if (w == "KEY"):
                    currentSection = "KEY"
                    sortedSection[w] = []
                elif (w == "SYN"):
                    currentSection = "SYN"
                    sortedSection[w] = []
                elif (w == "ANT"):
                    currentSection = "ANT"
                    sortedSection[w] = []

                elif (currentSection != "" and len(w) > 0
                      and str.isspace(w) == False):
                    sortedSection[currentSection].append(w.lower())

        if ('KEY' not in sortedSection or len(sortedSection['KEY']) == 0):
            continue
        if (('SYN' in sortedSection and len(sortedSection['SYN']) > 0)
                or ('ANT' in sortedSection and len(sortedSection['ANT']) > 0)):
            parsedSections.append(sortedSection)

    return parsedSections
예제 #12
0
def main():
    """Combine standard Glove files with Top-100K visual Glove files.

    For each (glove size, visual size) pair the combined text file and a
    keyed-vectors file are written to a per-pair folder under a fixed
    absolute path.
    """
    datasetPath = DatasetManager._getDatasetsFolderPath()
    print(datasetPath)

    sizePairs = [(50, 50), (50, 150), (100, 100), (100, 200), (200, 100)]
    for gloveSize, visualGloveSize in sizePairs:
        gloveFile = datasetPath + "/StandardGlove/glove.6B.{}d.txt".format(gloveSize)
        visualGloveFile = datasetPath + "/VisualEmbeddings/Top-100K-{}/VisualGlove-{}.txt".format(
            visualGloveSize, visualGloveSize)

        # NOTE(review): the output location is a hard-coded absolute path
        # rather than derived from datasetPath.
        saveDir = "/home/ubuntu/VisualGlove/Datasets/CombinedGlove-Top100K/Glove{}-Visual{}/".format(
            gloveSize, visualGloveSize)
        if not os.path.isdir(saveDir):
            os.makedirs(saveDir)

        combinedFile = saveDir + "CombinedGlove-{}-{}.txt".format(gloveSize, visualGloveSize)
        GloveFormatter.combineGloveFiles(visualGloveFile, gloveFile, combinedFile)

        keyedFile = saveDir + "Keyed-Glove{}-Visual{}".format(gloveSize, visualGloveSize)
        GloveFormatter.createKeyedVectorsFromGloveFile(combinedFile, keyedFile)
def main():
    """Concatenate version-2 visual embeddings with standard Glove vectors.

    The PCA and keyed-vector conversion steps are kept as a disabled
    triple-quoted block; only the concatenation step runs.
    """
    version = 2
    skipSize = 0
    includeSize = 400000
    dimensions = [25, 50, 100, 150, 200, 300]

    # includeSize is expressed in thousands in the dataset name.
    name = "Top{}K-Skip{}".format(round(includeSize / 1000), skipSize)
    gloveOutputFolder = DatasetManager.getVisualEmbeddingsFolderPath(version) + name

    '''
    PerformPCA.performPCA(name, gloveOutputFolder, dimensions, includeSize, skipSize, version)

    # Convert into KeyedVectors
    for d in dimensions:
        folderPath = gloveOutputFolder + "-{}/".format(d)
        glovePath = folderPath + "VisualGlove-{}-{}.txt".format(name, d)
        keyPath = folderPath + "Keyed-VisualGlove-{}".format(d)
        GloveFormatter.createKeyedVectorsFromGloveFile(glovePath, keyPath)

    '''
    # Concatenate with standard Glove for the selected size pairs only.
    sizeCombinations = [(100, 300), (50, 300)]
    ConcatKeyedVectors.concatToStandardGlove(
        gloveOutputFolder,
        name,
        sizeCombinations=sizeCombinations,
        version=version)
예제 #14
0
def main():
    """Create keyed vectors from the version-2 full visual Glove text file."""
    sourceFile = DatasetManager.getVisualEmbeddingsFolderPath(2) + "/VisualGlove-2.0 Full.txt"
    GloveFormatter.createKeyedVectorsFromGloveFile(sourceFile,
                                                   "Keyed-VisualGlove-2.0-Full")
    '''
예제 #15
0
def readVocabFromCSVFile():
    """Return the first row of the Glove vocabulary CSV file as a list."""
    vocabPath = DatasetManager.getGloveVocabCSVPath()
    with open(vocabPath, 'r', encoding='utf-8') as csvFile:
        rows = list(csv.reader(csvFile))
    return rows[0]
예제 #16
0
def getTopAndSkipKVisualOnly(top, skip, version=1):
    """List the visual-only keyed-vector paths for sizes 50, 100, 200 and 300."""
    basePath = DatasetManager.getVisualEmbeddingsFolderPath(version)
    paths = []
    for size in (50, 100, 200, 300):
        paths.append(basePath + "Top{}K-Skip{}-{}/Keyed-VisualGlove-{}".format(
            top, skip, size, size))
    return paths
예제 #17
0
def getTop100KCombinedPaths():
    """List the keyed-vector paths of the combined Top-100K embedding sets.

    One path is returned per (glove size, visual size) pair in a fixed list.
    """
    sizes = [(50, 50), (50, 150), (100, 100), (200, 100), (100, 200)]
    folderPath = DatasetManager._getDatasetsFolderPath() + "/CombinedGlove-Top100K/"
    # folderPath already ends with "/", so the per-size part must not start
    # with another one (the result previously contained "//").
    return [folderPath + "Glove{}-Visual{}/Keyed-Glove{}-Visual{}".format(g, v, g, v) for g, v in sizes]
예제 #18
0
                wordsFoundInFolders[w] = 1
        counter += 1

    wordsNotFound = []
    for w in fullVocab:
        if (w not in wordsFoundInFolders):
            wordsNotFound.append(w)

    return wordsNotFound


if (__name__ == '__main__'):
    from NormalGlove import Model

    # removeListCharsFromFile(DatasetManager.getVisualEmbeddingsFullSizeFolderPath() + "/VisualGlove-Full.txt", "NewFile")
    file = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
    ) + "/ProperFormat.txt"
    # currentVocab = DatasetManager._getWordsFromEmbeddingFile(file)
    # concatenateEmbeddingsFiles(DatasetManager.getVisualEmbeddingsFullSizeFolderPath(), "ProperFormat.txt")
    temp = Model.loadGloveVectors(file)
    '''
    fullVocab = Vocab.readVocabFromCSVFile()
    print("Full Vocab loaded")
    print("Current vocab loaded")

    currentVocabLookup = {}
    for w in currentVocab:
        currentVocabLookup[w] = 1
    del currentVocab

    fullVocabSize = len(fullVocab)
    missingVocab = []
예제 #19
0
def _loadVocab():
    """Return the first row of the Glove vocabulary CSV file as a list."""
    vocabPath = DatasetManager.getGloveVocabCSVPath()
    with open(vocabPath, 'r') as csvFile:
        return list(csv.reader(csvFile))[0]
예제 #20
0
def getNormalGlovePaths():
    """List the standard Glove keyed-vector paths for sizes 50, 100, 200 and 300."""
    paths = []
    for size in (50, 100, 200, 300):
        # The folder path is looked up once per size, as in the original.
        paths.append(DatasetManager.getNormalGloveFolderPath() + "/Keyed-Glove{}-Visual0".format(size))
    return paths
예제 #21
0
def getTop100KVisualOnly():
    """List the Top-100K visual-only keyed-vector paths for sizes 50, 100, 200 and 300."""
    basePath = DatasetManager.getVisualEmbeddingsFolderPath()
    paths = []
    for size in (50, 100, 200, 300):
        paths.append(basePath + "Top-100K-{}/Keyed-VisualGlove-{}".format(size, size))
    return paths
예제 #22
0
def concatEmbeddingFiles():
    """Merge the full-size visual embedding files into "VisualGlove-Full.txt"."""
    fullSizeFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath()
    GloveFormatter.concatenateEmbeddingsFiles(fullSizeFolder,
                                              "VisualGlove-Full.txt")