Example #1
# NOTE: `directory`, `numberOfClusters`, and a `vocab` object exposing
# getTokenString() are assumed to be defined by the surrounding script.
import os

import numpy
from sklearn.cluster import KMeans

# load previously saved embeddings, inputs, and labels
embeddings = numpy.load(os.path.join(directory, 'features.npy'))
inputs = numpy.load(os.path.join(directory, 'inputs.npy'))
labels = numpy.load(os.path.join(directory, 'labels.npy'))

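# embeddings has shape (chunkCount, chunkLength, embeddingSize);
# labels has shape (chunkCount, chunkLength) and holds vocab token ids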
chunkCount = embeddings.shape[0]
chunkLength = embeddings.shape[1]

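# flatten every word embedding into one (chunkCount * chunkLength, embeddingSize)
# matrix, cluster with k-means, then restore the per-chunk layout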
clusters = numpy.reshape(
    KMeans(n_clusters=numberOfClusters).fit_predict(
        numpy.reshape(embeddings, (-1, embeddings.shape[-1]))),
    (chunkCount, chunkLength))

clusterMap = {i: [] for i in range(numberOfClusters)}

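# group each word, together with the chunk it appeared in, under its cluster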
for chunk in range(chunkCount):
    chunkString = [
        vocab.getTokenString(labels[chunk, word])
        for word in range(chunkLength)
    ]

    for word in range(chunkLength):
        clusterId = clusters[chunk, word]
        wordString = vocab.getTokenString(labels[chunk, word])

        clusterMap[clusterId].append((wordString, chunkString))

for clusterId, words in clusterMap.items():
    print("Cluster", clusterId)
    for word, chunk in words:
        print(" ", "'" + word + "'", chunk)
Example #2
    def groupDataIntoClusters(self):
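        # Cluster word embeddings from the validation dataset with mini-batch
        # k-means (optionally reducing dimensionality with incremental PCA first),
        # then write a per-cluster word histogram and per-document cluster
        # summaries to the output directory.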

        kmeans = MiniBatchKMeans(n_clusters=self.numberOfClusters)
        featurizer = Featurizer(self.config, self.validationDataset)
        vocab = Vocab(self.config)

        if self.usePCA():
            pca = IncrementalPCA(n_components=32)

        logger.info("Reducing dimensionality...")

        # fit the pca model
        if self.usePCA():
            for iteration in range(self.getIterations()):
                if iteration % 10 == 0:
                    logger.info(" " + str(iteration) + " / " +
                                str(self.getIterations()))
                inputs, labels, embeddings = featurizer.featurizeOneBatch()

                pca.partial_fit(
                    numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

            self.validationDataset.reset()

        logger.info("Fitting model...")

        # fit the kmeans model
        for iteration in range(self.getIterations()):
            if iteration % 10 == 0:
                inputs, labels, embeddings, dataTime, modelTime = featurizer.featurizeOneBatch(
                    reportTime=True)
                logger.info(" " + str(iteration) + " / " +
                            str(self.getIterations()) + " data load time: " +
                            str(dataTime) + " model eval time: " +
                            str(modelTime))
            else:
                inputs, labels, embeddings = featurizer.featurizeOneBatch()

            if self.usePCA():
                embeddings = pca.transform(
                    numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

            kmeans.partial_fit(
                numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

        self.validationDataset.reset()

        # group into clusters
        # create a histogram of word frequencies per cluster
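        # clusterWins counts the total number of word assignments per cluster;
        # documentMap records the sequence of cluster ids seen in each document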
        clusterHistogram = {i: {} for i in range(self.numberOfClusters)}
        clusterWins = {i: 0 for i in range(self.numberOfClusters)}
        documentMap = {}

        logger.info("Clustering data...")

        for iteration in range(self.getIterations()):
            if iteration % 10 == 0:
                inputs, labels, embeddings, dataTime, modelTime = featurizer.featurizeOneBatch(
                    reportTime=True)
                logger.info(" " + str(iteration) + " / " +
                            str(self.getIterations()) + " data load time: " +
                            str(dataTime) + " model eval time: " +
                            str(modelTime))
            else:
                inputs, labels, embeddings = featurizer.featurizeOneBatch()

            chunkLength = embeddings.shape[1]
            batchSize = embeddings.shape[0]

            if self.usePCA():
                embeddings = pca.transform(
                    numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

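            # assign each word embedding to its nearest centroid, then restore
            # the (batchSize, chunkLength) layout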
            clusters = numpy.reshape(
                kmeans.predict(
                    numpy.reshape(embeddings, (-1, embeddings.shape[-1]))),
                (batchSize, chunkLength))

            for batch in range(batchSize):
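                # the first label in each chunk identifies its source document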
                documentId = labels[batch, 0]

                if documentId not in documentMap:
                    documentMap[documentId] = []

                clusterIds = []

                for wordIndex in range(1, chunkLength):

                    word = vocab.getTokenString(labels[batch, wordIndex])
                    cluster = clusters[batch, wordIndex]

                    clusterIds.append(cluster)

                    if labels[batch, wordIndex] not in clusterHistogram[cluster]:
                        clusterHistogram[cluster][labels[batch, wordIndex]] = 0

                    clusterHistogram[cluster][labels[batch, wordIndex]] += 1
                    clusterWins[cluster] += 1

                documentMap[documentId].extend(clusterIds)

        if not os.path.exists(self.outputDirectory):
            os.makedirs(self.outputDirectory)

        # write histograms
        with open(self.getOutputHistogramFileName(), "w") as log:
            for clusterId, clusterCount in sorted(clusterWins.items(),
                                                  key=lambda x: x[1],
                                                  reverse=True):
                words = clusterHistogram[clusterId]
                log.write("Cluster, " + str(clusterId) + " (" +
                          str(clusterCount) + ")\n")
                for wordIndex, count in sorted(words.items(),
                                               key=lambda x: x[1],
                                               reverse=True):
                    log.write("    '" + vocab.getTokenString(wordIndex) +
                              "' " + str(count) + "\n")

        # write document clusters
        for documentId, clusters in documentMap.items():

            histogram = {}

            for cluster in clusters:
                if cluster not in histogram:
                    histogram[cluster] = 0

                histogram[cluster] += 1

            with open(self.getOutputDocumentClusterFileName(documentId),
                      "w") as log:

                for cluster, count in sorted(histogram.items(),
                                             key=lambda x: x[1],
                                             reverse=True):

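                    # label the cluster by its most frequent word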
                    words = clusterHistogram[cluster]
                    topWord = vocab.getTokenString(
                        sorted(words.items(), key=lambda x: x[1],
                               reverse=True)[0][0])
                    log.write("Cluster, " + str(cluster) + ", " + topWord +
                              ", " + str(count) + "\n")

class FallbackTokenEvaluator:
    def __init__(self, config):
        self.config = config
        self.vocab = Vocab(config)

    def initialize(self):
        self.perplexityStates = self.createPerplexityStates(
            self.getBatchSize())

    def evaluate(self, inputs, labels, predictions):
        inputIndices, predictions, vocabProbabilities = self.rewriteSplitTokens(
            inputs, labels, predictions)

        self.recordPredictions(predictions, vocabProbabilities, inputIndices,
                               inputs)

    def getRequestedPredictions(self, inputs, labels):
        return numpy.expand_dims(labels, axis=2)

    def finalize(self):
        return self.getPerplexity()

    def getBatchSize(self):
        if "adaptor" not in self.config:
            return 1

        if "batching" not in self.config["adaptor"]:
            return 1

        if "size" not in self.config["adaptor"]["batching"]:
            return 1

        return int(self.config["adaptor"]["batching"]["size"])

    def createPerplexityStates(self, count):
        return [PerplexityState(self.vocab) for i in range(count)]

    def getPerplexity(self):
        byteCount = sum(
            [state.getByteCount() for state in self.perplexityStates])
        totalEntropy = sum(
            [state.getEntropy() for state in self.perplexityStates])

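        # 2 ** (total entropy / total bytes) is the per-byte perplexity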
        return 2.0**(totalEntropy / byteCount)

    def recordPredictions(self, predictions, vocabProbabilities, inputIndices,
                          inputs):
        # predictions is Tensor(batch-size, sequence-length, vocab-size)
        # inputs is Tensor(batch-size, sequence-length)
        batchSize = predictions.shape[0]
        sequenceLength = predictions.shape[1]

        # TODO: replace with something like batch gather
        for batch in range(batchSize):
            for element in range(sequenceLength):
                labelPrediction = predictions[batch, element]
                self.perplexityStates[batch].addPrediction(
                    inputs[batch, :], inputIndices[batch, element],
                    labelPrediction, vocabProbabilities[batch, element, :])

    def rewriteSplitTokens(self, inputs, labels, predictions):
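        # Tokens that the tokenizer split into several sub-tokens are merged back
        # here: the label string is re-tokenized with an unlimited-vocab tokenizer,
        # each run of sub-tokens is collapsed into one [start, end) span, and the
        # sub-token probabilities are multiplied into a single token probability.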
        from functools import reduce

        newInputs = []
        newPredictions = []
        newVocabProbabilities = []

        batchSize = predictions.shape[0]
        sequenceLength = predictions.shape[1]

        # collapse expanded tokens
        for batch in range(batchSize):

            inputString = "".join([
                self.vocab.getTokenString(token) for token in labels[batch, :]
                if not Vocab.isReservedToken(token)
            ])
            reservedIndices = set([
                index for index, token in enumerate(labels[batch, :])
                if Vocab.isReservedToken(token)
            ])

            tokenizer = UnlimitedVocabTokenizerAdaptor(
                StringDataSource(inputString))

            completeTokens = [
                tokenizer.next() for i in range(tokenizer.size())
            ]

            logger.debug("Reformed input string: '" + str([
                self.vocab.getTokenString(token) for token in labels[batch, :]
                if not Vocab.isReservedToken(token)
            ]))
            logger.debug("' tokenized to: " + str(completeTokens))
            logger.debug(
                " tokens: " +
                str([self.vocab.getToken(token) for token in completeTokens]))

            index = 0
            completeTokenIndex = 0

            newBatchInputs = []
            newBatchPredictions = []
            newBatchVocabProbabilities = []

            while index < sequenceLength:
                token = labels[batch, index]
                completeToken = completeTokens[completeTokenIndex]

                # find where this token ends: if the label does not match the current
                # complete token, it was split, so scan forward until the start of the
                # next complete token
                tokenEndIndex = index + 1
                if (self.vocab.getToken(completeToken) != token
                        and index not in reservedIndices):
                    while tokenEndIndex < sequenceLength:
                        possibleToken = labels[batch, tokenEndIndex]
                        if (completeTokenIndex + 1) < len(completeTokens):
                            if self.vocab.getToken(
                                    completeTokens[completeTokenIndex +
                                                   1]) == possibleToken:
                                break
                        tokenEndIndex += 1

                # add token
                newBatchInputs.append([index, tokenEndIndex])
                newBatchVocabProbabilities.append(
                    list(predictions[batch, index, :]))
                newBatchVocabProbabilities[-1][0] = 0.0

                # compute new probabilities for the merged token
                predictionValues = predictions[batch, index:tokenEndIndex, 0]
                newBatchPredictions.append(
                    reduce(lambda x, y: x * y, predictionValues))

                if tokenEndIndex > (index + 1):
                    logger.debug("Reformed split tokens: " + str([
                        self.vocab.getTokenString(token)
                        for token in labels[batch, index:tokenEndIndex]
                    ]) + (" with prob: %.4f" % newBatchPredictions[-1]))

                if index not in reservedIndices:
                    completeTokenIndex += 1

                index = tokenEndIndex

            newInputs.append(newBatchInputs)
            newPredictions.append(newBatchPredictions)
            newVocabProbabilities.append(newBatchVocabProbabilities)

        # pad every batch entry to the longest collapsed sequence so the results
        # can be stacked into rectangular arrays
        maxLength = max([len(tokens) for tokens in newInputs])

        newInputs = [
            spans + [self.getPadToken() for i in range(maxLength - len(spans))]
            for spans in newInputs
        ]
        newPredictions = [
            preds + [0.0 for i in range(maxLength - len(preds))]
            for preds in newPredictions
        ]
        # pad the per-token probability rows with zero vectors of vocab size so
        # the result forms a rectangular (batch, length, vocab) array
        newVocabProbabilities = [
            rows + [[0.0] * predictions.shape[-1]
                    for i in range(maxLength - len(rows))]
            for rows in newVocabProbabilities
        ]

        return numpy.array(newInputs), numpy.array(
            newPredictions), numpy.array(newVocabProbabilities)
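
A minimal usage sketch for this evaluator, assuming a config dict and an iterable named batches that yields (inputs, labels, predictions) arrays; only the method names come from the class above, the driver loop itself is hypothetical:

# hypothetical driver loop; `config` and `batches` are assumed to exist
evaluator = FallbackTokenEvaluator(config)
evaluator.initialize()

# score each featurized batch; evaluate() collapses split tokens internally
for inputs, labels, predictions in batches:
    evaluator.evaluate(inputs, labels, predictions)

# finalize() returns the accumulated per-byte perplexity
perplexity = evaluator.finalize()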