import collections
import glob
import io
import os
import shutil
import struct
import time

# Project-local helpers (log, validation, parameters, whitelist, similarity,
# pruneWordVocabulary, unpackDump, savePage, WordContextProvider) are assumed
# to be imported elsewhere in this module.


def trainModel(fileVocabulary, wordVocabulary, contextProvider, model,
               superBatchSize, miniBatchSize, parametersPath, embeddingsPath,
               learningRate, l1Coefficient, l2Coefficient, epochs, metricsPath):
    if os.path.exists(metricsPath):
        os.remove(metricsPath)

    # Ceiling division: the last super batch may be smaller than superBatchSize.
    superBatchesCount = (contextProvider.contextsCount + superBatchSize - 1) / superBatchSize
    startTime = time.time()
    previousTotal = 0

    for epoch in xrange(0, epochs):
        for superBatchIndex in xrange(0, superBatchesCount):
            contextSuperBatch = contextProvider[superBatchIndex * superBatchSize:(superBatchIndex + 1) * superBatchSize]

            # Column 0 is the file index, the middle columns are the word
            # context, and the last column is the target word.
            fileIndices, wordIndices, targetWordIndices = \
                contextSuperBatch[:, 0], contextSuperBatch[:, 1:-1], contextSuperBatch[:, -1]

            model.train(wordIndices, targetWordIndices, miniBatchSize, learningRate, l1Coefficient, l2Coefficient)

            metrics = validation.validate(wordVocabulary, model)
            customMetrics = {
                'simGemJewel': similarity('gem', 'jewel', wordVocabulary, model)
            }
            validation.dump(metricsPath, epoch, superBatchIndex, *metrics, **customMetrics)

            # Persist the model only when the combined validation score improves.
            if previousTotal < sum(metrics):
                previousTotal = sum(metrics)
                model.dump(parametersPath, embeddingsPath)

            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerEpoch = elapsed / (epoch + 1)

            rg, sim353, simLex999, syntRel, sat = metrics
            log.progress('Training model: {0:.3f}%. Elapsed: {1}. Epoch: {2}. ({3:.3f} sec/epoch), RG: {4}. Sim353: {5}. SimLex999: {6}. SyntRel: {7}. SAT: {8}. Gem/Jewel: {9:.3f}.',
                         epoch + 1, epochs,
                         log.delta(elapsed),
                         epoch,
                         secondsPerEpoch,
                         rg, sim353, simLex999, syntRel, sat,
                         customMetrics['simGemJewel'])

    log.lineBreak()

    return model
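# A minimal usage sketch for trainModel. All paths and hyperparameter values
# below are hypothetical, and IndexContextProvider / Model stand in for
# whichever context provider and model classes the surrounding project wires in.
#
#     contextProvider = IndexContextProvider('data/contexts.bin')  # hypothetical
#     model = Model(len(fileVocabulary), len(wordVocabulary))      # hypothetical
#     trainModel(fileVocabulary, wordVocabulary, contextProvider, model,
#                superBatchSize=10000, miniBatchSize=50,
#                parametersPath='data/parameters.bin',
#                embeddingsPath='data/embeddings.bin',
#                learningRate=0.01, l1Coefficient=0.0, l2Coefficient=0.001,
#                epochs=10, metricsPath='data/metrics.csv')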
def prepareWikipediaDumps(inputDirectoryPath, outputDirectoryPath, cleanText=True):
    if os.path.exists(outputDirectoryPath):
        shutil.rmtree(outputDirectoryPath, ignore_errors=True)
        log.info('Output directory {0} has been removed.', outputDirectoryPath)

    os.mkdir(outputDirectoryPath)
    os.chown(outputDirectoryPath, 1000, 1000)
    log.info('Output directory {0} has been created.', outputDirectoryPath)

    pathName = inputDirectoryPath + '/*wiki*.txt.gz'
    dumpPaths = glob.glob(pathName)[:10]  # cap: only the first 10 dumps are processed
    dumpsCount = len(dumpPaths)
    log.info('Found {0} Wikipedia dumps.', dumpsCount)

    startTime = time.time()

    for dumpIndex, dumpPath in enumerate(dumpPaths):
        dumpName, pages = unpackDump(dumpPath, cleanText)

        if len(pages) > 0:
            dumpDirectoryPath = os.path.join(outputDirectoryPath, dumpName)
            os.mkdir(dumpDirectoryPath)
            os.chown(dumpDirectoryPath, 1000, 1000)

            for pageName, pageText in pages:
                savePage(dumpDirectoryPath, pageName, pageText)

        currentTime = time.time()
        elapsed = currentTime - startTime
        secondsPerFile = elapsed / (dumpIndex + 1)

        log.progress('Unpacking Wikipedia dumps: {0:.3f}%. Last dump: {1} ({2} pages). Elapsed: {3}. ({4:.3f} sec/dump)',
                     dumpIndex + 1, dumpsCount,
                     dumpName, len(pages),
                     log.delta(elapsed),
                     secondsPerFile)

    log.lineBreak()
    log.info('Processing complete.')
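# Usage sketch for prepareWikipediaDumps: it expects gzipped '*wiki*.txt.gz'
# dumps in the input directory and writes one subdirectory of page files per
# dump. The directory names below are hypothetical.
#
#     prepareWikipediaDumps('data/wikipedia/raw', 'data/wikipedia/prepared')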
def processData(inputDirectoryPath, fileVocabularyPath, wordVocabularyPath, contextsPath, contextSize, maxVocabularySize):
    if os.path.exists(contextsPath):
        os.remove(contextsPath)

    fileContextSize = 1
    wordContextSize = contextSize - fileContextSize

    fileVocabulary = collections.OrderedDict()
    wordVocabulary = collections.OrderedDict()

    unprunedContextsPath = contextsPath + '.unpruned'
    if os.path.exists(unprunedContextsPath):
        os.remove(unprunedContextsPath)

    with open(unprunedContextsPath, 'wb+') as unprunedContextsFile:
        unprunedContextsFile.write(struct.pack('i', 0))  # placeholder for contexts count
        unprunedContextsFile.write(struct.pack('i', contextSize))

        pathName = inputDirectoryPath + '/*/*.txt'
        textFilePaths = glob.glob(pathName)[:200]  # cap: only the first 200 text files are processed
        textFilePaths = sorted(textFilePaths)
        textFileCount = len(textFilePaths)

        startTime = time.time()
        contextFormat = '{0}i'.format(contextSize)
        contextsCount = 0

        for textFileIndex, textFilePath in enumerate(textFilePaths):
            fileVocabulary[textFilePath] = textFileIndex

            contextProvider = WordContextProvider(textFilePath)
            for wordContext in contextProvider.next(wordContextSize):
                for word in wordContext:
                    if word not in wordVocabulary:
                        wordVocabulary[word] = (len(wordVocabulary), 1)
                    else:
                        wordIndex, frequency = wordVocabulary[word]
                        wordVocabulary[word] = (wordIndex, frequency + 1)

                indexContext = map(lambda w: wordVocabulary[w][0], wordContext)
                indexContext = [textFileIndex] + indexContext

                unprunedContextsFile.write(struct.pack(contextFormat, *indexContext))
                contextsCount += 1

            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerFile = elapsed / (textFileIndex + 1)

            log.progress('Reading contexts: {0:.3f}%. Elapsed: {1} ({2:.3f} sec/file). Vocabulary: {3}.',
                         textFileIndex + 1, textFileCount,
                         log.delta(elapsed),
                         secondsPerFile,
                         len(wordVocabulary))

        log.lineBreak()

        # Fill in the real contexts count over the placeholder written above.
        unprunedContextsFile.seek(0, io.SEEK_SET)
        unprunedContextsFile.write(struct.pack('i', contextsCount))
        unprunedContextsFile.flush()

    whiteList = whitelist.load()
    originalVocabularyLength = len(wordVocabulary)
    prunedWordVocabulary, wordIndexMap = pruneWordVocabulary(wordVocabulary, maxVocabularySize, whiteList)

    log.info('Vocabulary has been pruned. {0} items left out of {1}.', len(prunedWordVocabulary), originalVocabularyLength)

    with open(unprunedContextsPath, 'rb') as unprunedContextsFile:
        contextsCount = unprunedContextsFile.read(4)
        contextSize = unprunedContextsFile.read(4)

        contextsCount = struct.unpack('i', contextsCount)[0]
        contextSize = struct.unpack('i', contextSize)[0]

        contextFormat = '{0}i'.format(contextSize)  # contextSize already includes the file index slot
        bufferSize = contextSize * 4

        prunedContextsCount = 0
        with open(contextsPath, 'wb+') as uncompressedPrunedContexts:
            uncompressedPrunedContexts.write(struct.pack('i', 0))  # placeholder for contexts count
            uncompressedPrunedContexts.write(struct.pack('i', contextSize))

            contextIndex = 0
            while contextIndex < contextsCount:
                contextBuffer = unprunedContextsFile.read(bufferSize)
                context = struct.unpack(contextFormat, contextBuffer)

                fileIndex = context[0]
                indexContext = context[1:]

                # Keep a context only if every word in it survived pruning.
                if all(index in wordIndexMap for index in indexContext):
                    prunedContextsCount += 1
                    indexContext = map(lambda wordIndex: wordIndexMap[wordIndex], indexContext)
                    context = [fileIndex] + indexContext
                    contextBuffer = struct.pack(contextFormat, *context)
                    uncompressedPrunedContexts.write(contextBuffer)

                contextIndex += 1
                contextsPruned = contextIndex - prunedContextsCount

                log.progress('Pruning contexts: {0:.3f}%. {1} contexts ({2:.3f}%) pruned out of {3}.',
                             contextIndex, contextsCount,
                             contextsPruned,
                             float(contextsPruned) * 100 / contextsCount,
                             contextsCount)

            log.lineBreak()

            # Fill in the real pruned contexts count over the placeholder.
            uncompressedPrunedContexts.seek(0, io.SEEK_SET)
            uncompressedPrunedContexts.write(struct.pack('i', prunedContextsCount))
            uncompressedPrunedContexts.flush()

    os.remove(unprunedContextsPath)

    parameters.dumpFileVocabulary(fileVocabulary, fileVocabularyPath)
    parameters.dumpWordVocabulary(prunedWordVocabulary, wordVocabularyPath)
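# The context files written above share a simple binary layout: an int32
# contexts count, an int32 context size, then one record per context of
# contextSize int32 values, where slot 0 is the file index and the remaining
# slots are word indices. A minimal reader sketch for that layout (readContexts
# is illustrative and not part of this module):
def readContexts(contextsFilePath):
    with open(contextsFilePath, 'rb') as contextsFile:
        contextsCount = struct.unpack('i', contextsFile.read(4))[0]
        contextSize = struct.unpack('i', contextsFile.read(4))[0]

        contextFormat = '{0}i'.format(contextSize)
        for _ in xrange(contextsCount):
            record = struct.unpack(contextFormat, contextsFile.read(contextSize * 4))
            yield record[0], record[1:]  # (fileIndex, wordIndices)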