コード例 #1
0
ファイル: parameters.py プロジェクト: yuriyfilonov/testbench
def dumpEmbeddings(embeddings, embeddingsFilePath):
    """Write an embeddings matrix to ``embeddingsFilePath`` via the ``binary`` helpers.

    Layout, in write order: the row count and the row size (both through
    ``binary.writei``), then every row in turn (through ``binary.writef``).
    An existing file at the destination is removed first.

    Args:
        embeddings: A 2-D ``numpy.ndarray``, or anything ``numpy.asarray``
            turns into one (e.g. a list of equally sized vectors).
        embeddingsFilePath: Destination file path.

    Raises:
        ValueError: If the converted array is not two-dimensional, because
            its shape cannot be unpacked into (count, size). Note that the
            old file has already been removed at that point.
    """
    if os.path.exists(embeddingsFilePath):
        os.remove(embeddingsFilePath)

    if not isinstance(embeddings, numpy.ndarray):
        embeddings = numpy.asarray(embeddings)

    embeddingsCount, embeddingSize = embeddings.shape

    # Binary mode: the payload is produced by the ``binary`` helpers, and text
    # mode would translate newline bytes on Windows (and reject bytes on
    # Python 3). This matches the 'wb+' mode processData uses for the same helpers.
    with open(embeddingsFilePath, 'wb') as embeddingsFile:
        binary.writei(embeddingsFile, embeddingsCount)
        binary.writei(embeddingsFile, embeddingSize)

        # Iterating a 2-D array yields its rows; start counting at 1 so the
        # progress value reaches embeddingsCount on the last row.
        for rowNumber, embedding in enumerate(embeddings, 1):
            binary.writef(embeddingsFile, embedding)

            log.progress('Dumping embeddings: {0:.3f}%.', rowNumber, embeddingsCount)

        log.lineBreak()
コード例 #2
0
ファイル: parameters.py プロジェクト: yuriyfilonov/testbench
def dumpWordMap(indexMap, indexMapFilePath):
    """Write a key -> index mapping to ``indexMapFilePath`` via the ``binary`` helpers.

    Layout, in write order: the number of entries (``binary.writei``), then
    for every entry the key length (``binary.writei``), the key itself
    (``binary.writes``) and its index (``binary.writei``). Entries are
    written in the mapping's own iteration order. An existing file at the
    destination is removed first.

    Args:
        indexMap: Mapping from string keys to integer indices.
        indexMapFilePath: Destination file path.
    """
    if os.path.exists(indexMapFilePath):
        os.remove(indexMapFilePath)

    # Binary mode: the payload is produced by the ``binary`` helpers, and text
    # mode would translate newline bytes on Windows (and reject bytes on
    # Python 3). This matches the 'wb+' mode processData uses for the same helpers.
    with open(indexMapFilePath, 'wb') as indexMapFile:
        indexMapSize = len(indexMap)

        binary.writei(indexMapFile, indexMapSize)

        for itemNumber, (key, index) in enumerate(indexMap.items(), 1):
            # NOTE(review): len(key) counts characters; presumably keys are
            # byte strings so this equals the bytes binary.writes emits --
            # confirm against the reader.
            binary.writei(indexMapFile, len(key))
            binary.writes(indexMapFile, key)
            binary.writei(indexMapFile, index)

            log.progress('Dumping map: {0:.3f}%.', itemNumber, indexMapSize)

        indexMapFile.flush()

        log.lineBreak()
コード例 #3
0
def processData(inputDirectoryPath, w2vEmbeddingsFilePath, fileIndexMapFilePath,
                wordIndexMapFilePath, wordEmbeddingsFilePath, contextsPath, windowSize, negative, strict):
    """Build the contexts file, index maps and embeddings from a directory of ``*.txt`` files.

    Pass 1 walks every text file (sorted by path), asks ``WordContextProvider``
    for word contexts of ``windowSize - 1`` words, skips any context that
    contains a word missing from the loaded word2vec vocabulary, assigns new
    words consecutive indices, and writes ``[fileIndex, wordIndex, ...]`` for
    each kept context. The output starts with a three-integer header:
    contexts count (patched in after the pass), ``windowSize`` and the
    negative-sample count (0 in this pass).

    Pass 2 runs only when ``negative > 0``. Pass 1 then writes to
    ``contextsPath + '.temp'``; this pass re-reads it in batches, appends the
    output of ``generateNegativeSamples`` to each context, writes the result
    to ``contextsPath`` with ``negative`` in the header, and removes the
    temporary file.

    Finally the file map, the word map and the selected embeddings are dumped
    through the ``parameters`` helpers.

    Args:
        inputDirectoryPath: Directory that is searched for ``*.txt`` files.
        w2vEmbeddingsFilePath: Passed to ``parameters.loadW2VParameters``.
        fileIndexMapFilePath: Output path for the text-file-path -> index map.
        wordIndexMapFilePath: Output path for the word -> index map.
        wordEmbeddingsFilePath: Output path for the embeddings of used words.
        contextsPath: Output path for the final contexts file.
        windowSize: Context width including the leading file index.
        negative: Negative-sample count; 0 disables pass 2.
        strict: Forwarded unchanged to ``generateNegativeSamples``.
    """
    if os.path.exists(contextsPath):
        os.remove(contextsPath)

    # One slot of every context is taken by the file index.
    fileContextSize = 1
    wordContextSize = windowSize - fileContextSize

    fileIndexMap = {}
    wordIndexMap = collections.OrderedDict()
    wordEmbeddings = []

    # With negative sampling, pass 1 goes to a temporary file that pass 2 consumes.
    noNegativeSamplingPath = contextsPath
    if negative > 0:
        noNegativeSamplingPath += '.temp'

    if os.path.exists(noNegativeSamplingPath):
        os.remove(noNegativeSamplingPath)

    pathName = inputDirectoryPath + '/*.txt'
    textFilePaths = sorted(glob.glob(pathName))
    textFileCount = len(textFilePaths)

    w2vWordIndexMap, w2vEmbeddings = parameters.loadW2VParameters(w2vEmbeddingsFilePath)

    contextsCount = 0
    with open(noNegativeSamplingPath, 'wb+') as noNegativeSamplingFile:
        binary.writei(noNegativeSamplingFile, 0) # this is a placeholder for contexts count
        binary.writei(noNegativeSamplingFile, windowSize)
        binary.writei(noNegativeSamplingFile, 0)

        startTime = time.time()

        for textFileIndex, textFilePath in enumerate(textFilePaths):
            fileIndexMap[textFilePath] = textFileIndex

            contextProvider = WordContextProvider(textFilePath=textFilePath)
            for wordContext in contextProvider.iterate(wordContextSize):
                # Keep only contexts whose every word has a word2vec embedding.
                if not all(word in w2vWordIndexMap for word in wordContext):
                    continue

                for word in wordContext:
                    if word not in wordIndexMap:
                        # New words get the next consecutive index, and their
                        # embedding is stored at that same position.
                        wordIndexMap[word] = len(wordIndexMap)
                        wordEmbeddings.append(w2vEmbeddings[w2vWordIndexMap[word]])

                # A list comprehension (not map) so the concatenation also
                # works where map() returns an iterator.
                indexContext = [textFileIndex] + [wordIndexMap[word] for word in wordContext]

                binary.writei(noNegativeSamplingFile, indexContext)
                contextsCount += 1

            elapsed = time.time() - startTime
            secondsPerFile = elapsed / (textFileIndex + 1)

            log.progress('Reading contexts: {0:.3f}%. Elapsed: {1} ({2:.3f} sec/file). Words: {3}. Contexts: {4}.',
                         textFileIndex + 1,
                         textFileCount,
                         log.delta(elapsed),
                         secondsPerFile,
                         len(wordIndexMap),
                         contextsCount)

        log.lineBreak()

        # Go back and overwrite the placeholder with the real contexts count.
        noNegativeSamplingFile.seek(0, io.SEEK_SET)
        binary.writei(noNegativeSamplingFile, contextsCount)
        noNegativeSamplingFile.flush()

    if negative > 0:
        with open(contextsPath, 'wb+') as contextsFile:
            startTime = time.time()

            contextProvider = parameters.IndexContextProvider(noNegativeSamplingPath)

            binary.writei(contextsFile, contextsCount)
            binary.writei(contextsFile, windowSize)
            binary.writei(contextsFile, negative)

            batchSize = 10000
            # Ceiling division: exactly enough batches to cover every context.
            # The previous "count / size + 1" needed Python 2 integer division
            # and scheduled one extra, empty batch whenever the count was an
            # exact multiple of the batch size.
            batchesCount = (contextsCount + batchSize - 1) // batchSize

            wordIndices = numpy.asarray(list(wordIndexMap.values()))
            # NOTE: raises ValueError when no context was collected (empty map).
            maxWordIndex = max(wordIndices)

            for batchIndex in range(batchesCount):
                batchStart = batchIndex * batchSize
                contexts = contextProvider[batchStart : batchStart + batchSize]
                negativeSamples = generateNegativeSamples(negative, contexts, wordIndices, maxWordIndex, strict)
                # Append the negative samples to each context row, then
                # flatten so the whole batch is written in one call.
                contexts = numpy.concatenate([contexts, negativeSamples], axis=1)
                contexts = numpy.ravel(contexts)

                binary.writei(contextsFile, contexts)

                elapsed = time.time() - startTime

                log.progress('Negative sampling: {0:.3f}%. Elapsed: {1}.',
                             batchIndex + 1,
                             batchesCount,
                             log.delta(elapsed))

            log.lineBreak()
            contextsFile.flush()

            # The intermediate pass-1 file is no longer needed.
            os.remove(noNegativeSamplingPath)

    parameters.dumpWordMap(fileIndexMap, fileIndexMapFilePath)
    parameters.dumpWordMap(wordIndexMap, wordIndexMapFilePath)
    parameters.dumpEmbeddings(wordEmbeddings, wordEmbeddingsFilePath)