Esempio n. 1
0
    def _runEpoch(self, id, Xtrain, Ytrain, Xvalid, Yvalid):
        """Run one training epoch, then score it on the validation set.

        Shuffles the training data, trains the underlying model for one
        pass, records training accuracy/time into ``self.trainAccuracies``
        and ``self.trainTimes`` at index ``id``, then evaluates on the
        validation split and records ``self.validAccuracies[id]``.

        Args:
            id: zero-based epoch index (used for progress display and as
                the slot into the per-epoch bookkeeping arrays).
            Xtrain, Ytrain: training features/labels.
            Xvalid, Yvalid: validation features/labels.

        Returns:
            The dict produced by ``self._test`` for the validation split
            (contains at least "accuracy", "time" and "extra_output").
        """
        Xtrain, Ytrain = self._shuffleDataset(Xtrain, Ytrain)

        print("Train epoch {}/{}: ".format(id + 1, self.EPOCHS),
              end='',
              flush=True)

        timer = Timing()
        timer.start()

        predictions = self._model.train(Xtrain, Ytrain)
        epochAccuracy = 100 * self.evaluateScore(Ytrain, predictions)

        # Bookkeeping for this epoch slot.
        self.trainAccuracies[id] = epochAccuracy
        self.trainTimes[id] = timer.get_elapsed_secs()

        print("{:.1f}% in {}.\t".format(epochAccuracy,
                                        timer.get_elapsed_time()),
              end='',
              flush=True)

        # Validation pass on the held-out split.
        validationResult = self._test(Xvalid, Yvalid)
        self.validAccuracies[id] = validationResult["accuracy"]
        print('Validation: {:.1f}% ({}).\t'.format(
            validationResult["accuracy"],
            Timing.secondsToString(validationResult["time"])),
              end='',
              flush=True)

        # Extra diagnostic output is only shown once, on the first epoch.
        if id == 0 and validationResult["extra_output"] is not None:
            print(validationResult["extra_output"], end=' ', flush=True)

        return validationResult
Esempio n. 2
0
    def test(self, sparseMat, X, Y):
        """Measure single-label classification accuracy on (X, Y).

        For each sample, scores are obtained by projecting the sparse
        feature row through ``sparseMat``, the single best code is decoded,
        and the decoded label is compared against the ground truth.

        Args:
            sparseMat: sparse projection/weight matrix applied to each row.
            X: iterable of sparse feature rows.
            Y: 1-D array of ground-truth labels (``Y.shape[0]`` samples).

        Returns:
            Dict with "accuracy" (percentage in [0, 100]) and "time"
            (formatted elapsed time string).
        """
        # NOTE(review): Timing() is used without an explicit .start() here,
        # matching other call sites — presumably it starts on construction.
        timer = Timing()
        total = Y.shape[0]
        hits = 0

        for sample, label in zip(X, Y):
            # Dense score vector for this sample.
            scores = sample.dot(sparseMat).todense().A.ravel()
            bestCode = self.decoder.findKBestCodes(scores, 1)[0]
            if label == self.codeManager.codeToLabel(bestCode):
                hits += 1

        return {
            "accuracy": hits * 100 / total,
            "time": timer.get_elapsed_time()
        }
Esempio n. 3
0
def read(path, dataset, printSummary=True):
    """Load a dataset's train/heldout/test splits from disk.

    Builds file paths of the form ``{path}/{name}/{name}.{split}``
    (with a ``_sorted`` suffix for the "bibtex" dataset), loads the three
    splits via ``load_dataset``, and derives the label and feature counts.

    Args:
        path: root directory that contains one sub-directory per dataset.
        dataset: descriptor object exposing ``name``, ``n_features`` and
            ``multilabel``.
        printSummary: when True, print load time and basic statistics.

    Returns:
        Tuple ``(Xtrain, Ytrain, Xvalid, Yvalid, Xtest, Ytest, LABELS,
        DIMS)`` where the label arrays are cast to integer dtype.
    """
    t = Timing()

    specificPath = "{0}/{1}/{1}".format(path, dataset.name)

    # The bibtex files on disk carry a "_sorted" suffix.
    sorted_extension = "_sorted" if dataset.name == "bibtex" else ""

    Xtrain, Ytrain, Xvalid, Yvalid, Xtest, Ytest = \
        load_dataset(
            "{}.train{}".format(specificPath, sorted_extension),
            "{}.heldout{}".format(specificPath, sorted_extension),
            "{}.test{}".format(specificPath, sorted_extension),
            dataset.n_features,
            multilabel=dataset.multilabel)

    if dataset.multilabel:
        # Multi-label targets are sparse matrices: count distinct column
        # indices across all three splits.
        LABELS = len(
            set(
                list(Ytrain.indices) + list(Yvalid.indices) +
                list(Ytest.indices)))
    else:
        LABELS = len(set(list(Ytrain) + list(Yvalid) + list(Ytest)))

    DIMS = Xtrain.shape[1]

    # Print summary
    if printSummary:
        effectiveDim = Xtrain.nnz / Xtrain.shape[0]
        print(("{} dataset '{}':\n" + "\tLoaded in:\t{:}\n" +
               "\tLabels:\t\tK={:,}\n" +
               "\tFeatures:\td={:,} ({:.1f} non-zero features on average)"
               ).format("Multi-label" if dataset.multilabel else "Multi-class",
                        dataset.name, t.get_elapsed_time(), LABELS, DIMS,
                        effectiveDim))

    # Fix: np.int was deprecated in NumPy 1.20 and removed in 1.24; it was
    # always just an alias for the builtin int, so astype(int) is identical.
    return Xtrain, Ytrain.astype(int), Xvalid, Yvalid.astype(
        int), Xtest, Ytest.astype(int), LABELS, DIMS
Esempio n. 4
0
    def run(self,
            Xtrain,
            Ytrain,
            Xvalid,
            Yvalid,
            Xtest,
            Ytest,
            modelLogPath=None,
            returnBestValidatedModel=False):
        """Train for ``self.EPOCHS`` epochs, then evaluate on the test set.

        Each epoch is delegated to ``self._runEpoch``; whenever validation
        accuracy improves, the model is (optionally) checkpointed to
        ``modelLogPath`` via ``np.savez``. After training, the test split is
        scored and the average bitwise decoding loss on the training data is
        printed.

        Args:
            Xtrain, Ytrain: training features/labels.
            Xvalid, Yvalid: validation features/labels (drive checkpointing).
            Xtest, Ytest: test features/labels (scored once, at the end).
            modelLogPath: path prefix for model checkpoints; None disables
                saving.
            returnBestValidatedModel: when True, reload the best checkpoint
                from ``modelLogPath + ".npz"`` before testing.
                NOTE(review): if this is True while modelLogPath is None,
                the reload line raises a TypeError — confirm callers always
                pass a path in that case.
        """
        # Best validation accuracy seen so far. NOTE(review): starting at 0
        # means an epoch with exactly 0% validation accuracy never triggers
        # a checkpoint — presumably acceptable in practice.
        bestRes = 0

        if modelLogPath is not None:
            # If we need to save the model, we increase the recursion limit
            # so we can save objects like the tree in the code manager etc.
            # The code itself is not recursive!
            import sys
            sys.setrecursionlimit(10**5)

        # Run training epochs
        for epoch in range(0, self.EPOCHS):
            validRes = self._runEpoch(epoch, Xtrain, Ytrain, Xvalid, Yvalid)

            # If our validation score is the highest so far
            if validRes["accuracy"] > bestRes:
                bestRes = validRes["accuracy"]

                # We save the model to file whenever we reach a better validation performance,
                # so that if the simulation will be terminated for some reason
                # (usually only happen when we didn't allocate enough space for the process)
                # we will have a backup.
                if modelLogPath is not None:
                    tSave = Timing()
                    tSave.start()

                    # Important:
                    # numpy.savez is originally meant for saving arrays, not objects,
                    # We use it here for simplicity but it sometimes causes very high additional memory requirements
                    # (it processes the data before saving).
                    #
                    # A better way would be to save the data of the binary learners (e.g. means of AROW),
                    # and the coding matrix and allocation, one by one, and then load them with a special method.
                    np.savez(modelLogPath, ltls=self._model)
                    print("Saved model ({}).".format(tSave.get_elapsed_time()),
                          end='',
                          flush=True)
                    del tSave

            # Terminate the single-line epoch progress output.
            print("")

        # If we need to return the best validated model, load it from file before continuing
        if returnBestValidatedModel:
            del self._model

            # The model was stored as a 0-d object array; [()] unwraps it
            # back into the original Python object.
            self._model = np.load(modelLogPath + ".npz")["ltls"][()]

        # Test
        testRes = self._test(Xtest, Ytest)
        print('Test accuracy: {:.1f}% ({})'.format(
            testRes["accuracy"], Timing.secondsToString(testRes["time"])))

        # Calculate average binary loss
        decodingLoss = self._calcBitwiseLoss(Xtrain, Ytrain)
        print("Average binary loss: {:.2f}".format(decodingLoss))