def loadEmbeddings(embeddingsFilePath):
    """Load an embedding matrix from a binary embeddings file.

    The file is consumed as two header values read with ``binary.readi``
    (the number of embeddings, then the size of each embedding), followed by
    one ``binary.readf`` record of ``embeddingSize`` values per embedding.
    Progress is reported through ``log.progress`` after every row.

    Args:
        embeddingsFilePath: Path of the file to read; opened in binary mode.

    Returns:
        A ``float32`` numpy array of shape ``(embeddingsCount, embeddingSize)``
        whose row ``i`` holds the ``i``-th record read from the file.
    """
    with open(embeddingsFilePath, 'rb') as embeddingsFile:
        embeddingsCount = binary.readi(embeddingsFile)
        embeddingSize = binary.readi(embeddingsFile)

        # Allocate as float32 up front. Creating the default float64 array and
        # converting it with astype() makes a throw-away copy twice as large
        # as the final matrix.
        embeddings = numpy.empty((embeddingsCount, embeddingSize), dtype='float32')

        for embeddingIndex in range(embeddingsCount):
            embeddings[embeddingIndex] = binary.readf(embeddingsFile, embeddingSize)
            log.progress('Loading embeddings: {0:.3f}%.', embeddingIndex + 1, embeddingsCount)

    log.info('Loading embeddings complete. {0} embeddings loaded.', embeddingsCount)

    return embeddings
def loadW2VParameters(filePath, loadEmbeddings=True):
    """Read the vocabulary, and optionally the vectors, of a W2V binary file.

    Layout as consumed by this function: a first line holding two
    whitespace-separated integers (number of embeddings, embedding size),
    then repeated records made of a word terminated by a single space and
    followed by the embedding payload. Reading stops at end of file.

    Args:
        filePath: Path of the file to read; opened in binary mode.
        loadEmbeddings: When true, every payload is read with
            ``binary.readf`` and stored; when false, payloads are skipped
            and only the vocabulary is built.

    Returns:
        ``(wordIndexMap, embeddings)`` when ``loadEmbeddings`` is true,
        otherwise just ``wordIndexMap``. ``wordIndexMap`` is an
        ``OrderedDict`` mapping each word to its row index, and
        ``embeddings`` is a numpy array of shape
        ``(embeddingsCount, embeddingSize)``; rows never read stay zero.

    Raises:
        ValueError: If the first line does not consist of two integers.
        UnicodeDecodeError: On Python 3, if a word is not valid UTF-8.
    """
    with open(filePath, 'rb') as w2vFile:
        # The file is binary, so everything read from it is a byte string.
        # split() without a separator works on bytes and ignores the newline.
        embeddingsCount, embeddingSize = w2vFile.readline().split()
        embeddingsCount, embeddingSize = int(embeddingsCount), int(embeddingSize)

        wordIndexMap = collections.OrderedDict()

        # Only pay for the (potentially very large) matrix when it is wanted.
        embeddings = None
        if loadEmbeddings:
            embeddings = numpy.zeros((embeddingsCount, embeddingSize))

        embeddingIndex = 0
        while True:
            # Accumulate bytes up to the space that terminates the word.
            word = b''
            while True:
                char = w2vFile.read(1)

                if not char:
                    # End of file: finish the progress line and return.
                    log.lineBreak()

                    if loadEmbeddings:
                        return wordIndexMap, embeddings
                    return wordIndexMap

                if char == b' ':
                    break

                word += char

            # Drops surrounding whitespace, e.g. a newline left over after the
            # previous record's payload.
            word = word.strip()

            # On Python 3 the token is ``bytes``; turn it into text so the
            # keys are ordinary strings. On Python 2 ``bytes`` is ``str`` and
            # the key is left untouched.
            if not isinstance(word, str):
                word = word.decode('utf-8')

            wordIndexMap[word] = len(wordIndexMap)

            if loadEmbeddings:
                embedding = binary.readf(w2vFile, embeddingSize)
                embeddings[wordIndexMap[word]] = embedding
            else:
                # Skip the payload; assumes 4 bytes per feature -- TODO confirm
                # this matches what binary.readf consumes.
                w2vFile.seek(embeddingSize * 4, io.SEEK_CUR)

            embeddingIndex += 1
            log.progress('Loading W2V embeddings: {0:.3f}%. {1} embeddings {2} features each.',
                         embeddingIndex, embeddingsCount, embeddingIndex, embeddingSize)