Esempio n. 1
0
    def macau(self, side_info, direct, expected):
        """Train with optional Macau side info and check RMSE and run time.

        Args:
            side_info: Indexable with one entry for each of the two modes.
                An entry is either ``None`` (no side info for that mode) or
                a key into ``TestExCAPE_py.data`` naming the side-info data.
            direct: Forwarded unchanged as ``direct=`` to ``addSideInfo``.
            expected: Indexable of three numbers: ``expected[0]`` is a strict
                upper bound and ``expected[1]`` a strict lower bound on the
                average RMSE; ``expected[2]`` is a strict upper bound on the
                elapsed time of the sampling loop, as measured by ``time()``.
        """
        args = self.get_default_opts()

        # Identity comparison on purpose: ``!= None`` dispatches to the
        # entry's own ``__ne__``, which need not return a plain bool.
        macau_modes = [d for d in range(2) if side_info[d] is not None]
        for d in macau_modes:
            args["priors"][d] = 'macau'

        session = smurff.TrainSession(**args)
        Ytrain = TestExCAPE_py.data["train.sdm"]
        Ytest = TestExCAPE_py.data["test.sdm"]
        session.addTrainAndTest(Ytrain, Ytest, self.get_train_noise())

        for d in macau_modes:
            session.addSideInfo(d,
                                TestExCAPE_py.data[side_info[d]],
                                self.get_side_noise(),
                                direct=direct)

        session.init()

        # Only the sampling loop and the RMSE query are timed; session
        # construction and init() are deliberately outside the window.
        start = time()
        while session.step():
            pass
        rmse = session.getRmseAvg()
        elapsed = time() - start

        self.assertLess(rmse, expected[0])
        self.assertGreater(rmse, expected[1])
        self.assertLess(elapsed, expected[2])
Esempio n. 2
0
    def test_macau_dense_probit(self):
        """Probit factorization with dense row side info must beat AUC 0.55.

        A 25x3 binary matrix is generated as the sign of ``A @ B.T`` for
        random 2-column factors ``A`` and ``B``; ``A`` is then supplied as
        dense side info for mode 0 under a 'macau' prior.
        """
        A = np.random.randn(25, 2)
        B = np.random.randn(3, 2)

        # Every (row, column) pair of the full matrix, in row-major order.
        idx = list(
            itertools.product(np.arange(A.shape[0]), np.arange(B.shape[0])))
        df = pd.DataFrame(np.asarray(idx), columns=["A", "B"])
        # Label is 1.0 where the inner product of the two factor rows is
        # positive, 0.0 otherwise.
        df["value"] = (np.array([np.sum(A[i[0], :] * B[i[1], :])
                                 for i in idx]) > 0.0).astype(np.float64)
        Ytrain, Ytest = smurff.make_train_test_df(df, 0.2)

        threshold = 0.5  # labels are 0.0 / 1.0, so 0.5 separates the classes

        trainSession = smurff.TrainSession(priors=['macau', 'normal'],
                                           num_latent=4,
                                           threshold=threshold,
                                           burnin=20,
                                           nsamples=20,
                                           verbose=False)

        trainSession.addTrainAndTest(Ytrain, Ytest,
                                     smurff.ProbitNoise(threshold))
        trainSession.addSideInfo(0, A, direct=True)

        predictions = trainSession.run()

        # The check is about classification quality, so measure AUC.
        # Asserting that the RMSE is *above* 0.55 would let worse models
        # pass and contradicts the failure message below.
        auc = smurff.calc_auc(predictions, threshold)
        self.assertGreater(
            auc,
            0.55,
            msg=
            "Probit factorization (with dense side) gave AUC below 0.55 (%f)."
            % auc)
Esempio n. 3
0
    def run_session(self, noise_model):
        """Run a short training session and return its test predictions.

        Train/test data come from ``self.train_test()`` and optional side
        info from ``self.side_info()``. When side info is present, mode 0
        uses the 'macau' prior. ``noise_model`` is attached to the train
        data, except when side info is present and the model is not a
        ``ProbitNoise``: then it is attached to the side info instead.

        Asserts one prediction per test entry and an average RMSE below 10.
        """
        Ytrain, Ytest = self.train_test()
        side = self.side_info()
        has_side = side is not None

        prior_names = ['normal'] * len(Ytrain.shape)
        if has_side:
            prior_names[0] = 'macau'

        options = dict(priors=prior_names,
                       num_latent=10,
                       burnin=10,
                       nsamples=15,
                       verbose=verbose)
        session = smurff.TrainSession(**options)

        if not has_side:
            session.addTrainAndTest(Ytrain, Ytest, noise_model)
        else:
            # Side info is always registered before the train/test data.
            if isinstance(noise_model, smurff.ProbitNoise):
                session.addSideInfo(0, side)
                session.addTrainAndTest(Ytrain, Ytest, noise_model)
            else:
                session.addSideInfo(0, side, noise_model)
                session.addTrainAndTest(Ytrain, Ytest)

        session.init()
        # Keep stepping until step() reports a falsy value.
        more = session.step()
        while more:
            more = session.step()

        predictions = session.getTestPredictions()
        self.assertEqual(Ytest.nnz, len(predictions))
        self.assertLess(session.getRmseAvg(), 10.)
        return predictions
Esempio n. 4
0
def train_session(root, train, test, sideinfo=None):
    """Train in a freshly created ``root`` directory and return the RMSE.

    Any existing ``root`` tree is removed first (removal errors are
    ignored) and the directory is recreated. ``root`` is passed to the
    session as ``save_prefix`` with ``save_freq=1``. When ``sideinfo`` is
    given, it is attached to mode 0 with ``FixedNoise(10.)`` and
    ``direct=True``.

    Returns ``smurff.calc_rmse`` of the predictions produced by ``run()``.
    """
    import shutil

    # Start from an empty output directory.
    shutil.rmtree(root, ignore_errors=True)
    os.makedirs(root)
    print("save prefix = ", root)

    session_opts = dict(
        num_latent=4,
        burnin=800,
        nsamples=100,
        verbose=global_verbose,
        save_freq=1,
        save_prefix=root,
    )
    session = smurff.TrainSession(**session_opts)

    session.addTrainAndTest(train, test, smurff.FixedNoise(1.0))
    has_side = sideinfo is not None
    if has_side:
        session.addSideInfo(0, sideinfo, smurff.FixedNoise(10.), direct=True)

    return smurff.calc_rmse(session.run())
Esempio n. 5
0
def test_pybind():
    """Smoke test: run a two-mode session on a 2x2 matrix without error.

    The same values are used as dense train data and, wrapped in a CSR
    matrix, as test data. Nothing is asserted about the result of run().
    """
    values = np.array([[1., 2.], [3., 4.]])

    session = smurff.TrainSession(priors=["normal", "normal"], verbose=2)
    session.setTrain(values)
    session.setTest(sp.csr_matrix(values))
    session.run()
Esempio n. 6
0
def test_noise_model(density, nmodes, side_info, noise_model):
    """Run a short session with the given noise model and sanity-check it.

    ``noise_model`` is called with no arguments to build the noise object
    attached to the train data. When ``train_test`` returns side info,
    mode 0 uses the 'macau' prior and receives that side info with
    ``SampledNoise(1.)`` and ``direct=True``.

    Checks one prediction per test entry, then either that the average AUC
    lies in [0, 1] (for ``ProbitNoise``) or that the average RMSE is below
    10. Returns the test predictions.
    """
    Ytrain, Ytest, si = train_test(density, nmodes, side_info)
    nm = noise_model()
    with_side = si is not None

    prior_names = ['normal'] * nmodes
    if with_side:
        prior_names[0] = 'macau'

    options = dict(priors=prior_names,
                   num_latent=8,
                   burnin=20,
                   nsamples=20,
                   threshold=.0,
                   seed=seed,
                   verbose=verbose)
    session = smurff.TrainSession(**options)

    session.addTrainAndTest(Ytrain, Ytest, nm)
    if with_side:
        session.addSideInfo(0, si, smurff.SampledNoise(1.), direct=True)

    session.init()
    # Keep stepping until step() reports a falsy value.
    more = session.step()
    while more:
        more = session.step()

    predictions = session.getTestPredictions()
    assert Ytest.nnz == len(predictions)
    if not isinstance(nm, smurff.ProbitNoise):
        assert session.getRmseAvg() < 10.
    else:
        assert session.getStatus().auc_avg <= 1.
        assert session.getStatus().auc_avg >= 0.
    return predictions
Esempio n. 7
0
    def run_train_session(self, nmodes, sparse):
        """Train on a random tensor with ``nmodes`` modes and return the session.

        The tensor has dimensions 2, 3, ..., ``nmodes + 1``. When ``sparse``
        is truthy, the tensor is first replaced by the second output of
        ``make_train_test(Y, 0.5)`` (presumably a sparse tensor; verify
        against ``smurff.make_train_test``). The final train/test split is
        stored on ``self.Ytrain`` / ``self.Ytest``.
        """
        dims = range(2, nmodes + 2)
        data = np.random.rand(*dims)
        if sparse:
            _discarded, data = smurff.make_train_test(data, 0.5)
        self.Ytrain, self.Ytest = smurff.make_train_test(data, 0.1)

        options = dict(priors=['normal'] * nmodes,
                       num_latent=4,
                       burnin=10,
                       nsamples=15,
                       verbose=verbose,
                       save_freq=1,
                       save_name=smurff.helper.temp_savename())
        session = smurff.TrainSession(**options)
        session.addTrainAndTest(self.Ytrain, self.Ytest)

        session.init()
        # Keep stepping until step() reports a falsy value.
        more = session.step()
        while more:
            more = session.step()

        return session
Esempio n. 8
0
    def run_train_session(self):
        """Train on a random sparse 15x10 matrix and return the session.

        The matrix is generated with density 0.2 and split with
        ``make_train_test(Y, 0.5)``; the split is stored on ``self.Ytrain``
        and ``self.Ytest``. Every mode uses the 'normal' prior.
        """
        data = scipy.sparse.rand(15, 10, 0.2)
        self.Ytrain, self.Ytest = smurff.make_train_test(data, 0.5)

        options = dict(priors=['normal'] * len(self.Ytrain.shape),
                       num_latent=4,
                       burnin=10,
                       nsamples=15,
                       verbose=verbose,
                       save_freq=1)
        trainer = smurff.TrainSession(**options)
        trainer.addTrainAndTest(self.Ytrain, self.Ytest)

        trainer.init()
        # Keep stepping until step() reports a falsy value.
        more = trainer.step()
        while more:
            more = trainer.step()

        return trainer
Esempio n. 9
0
    def run_train_session(self):
        """Train on 40 random entries of a 10x20 matrix, with side info.

        A dense 10x20 normal matrix is drawn; 40 of its entries, chosen by a
        random permutation, form the sparse data that is split into
        ``self.Ytrain`` / ``self.Ytest``. The full dense matrix is stored as
        ``self.side_info`` and attached to mode 0. Returns the session after
        ``run()``.
        """
        dims = (10, 20)
        n_rows, n_cols = dims

        Ydense = np.random.normal(size=dims).reshape(dims)
        # Indices of the 40 entries that are kept.
        picked = np.random.permutation(n_rows * n_cols)[:40]
        full = scipy.sparse.coo_matrix(Ydense)
        kept_coords = (full.row[picked], full.col[picked])
        sampled = scipy.sparse.coo_matrix((full.data[picked], kept_coords),
                                          shape=full.shape)

        self.Ytrain, self.Ytest = smurff.make_train_test(sampled, 0.5)
        self.side_info = Ydense

        options = dict(priors=['normal'] * len(self.Ytrain.shape),
                       num_latent=32,
                       burnin=10,
                       nsamples=15,
                       verbose=verbose,
                       save_freq=1)
        trainer = smurff.TrainSession(**options)

        trainer.addTrainAndTest(self.Ytrain, self.Ytest)
        trainer.addSideInfo(0, self.side_info)
        trainer.run()
        return trainer
Esempio n. 10
0
    def macau(self, dirname, expected):
        """Train on the matrices in ``dirname`` and check RMSE and run time.

        Reads ``train.sdm``, ``test.sdm`` and ``rows.ddm`` from ``dirname``;
        the latter is attached as side info for mode 0 with ``direct=True``.

        ``expected[0]`` / ``expected[1]`` are strict upper / lower bounds on
        the average RMSE; ``expected[2]`` is a strict upper bound on the
        elapsed time of the sampling loop, as measured by ``time()``.
        """
        session = smurff.TrainSession(**self.get_default_opts())

        train_matrix = mio.read_matrix(join(dirname, "train.sdm"))
        test_matrix = mio.read_matrix(join(dirname, "test.sdm"))
        session.addTrainAndTest(train_matrix, test_matrix,
                                self.get_train_noise())

        row_features = mio.read_matrix(join(dirname, "rows.ddm"))
        session.addSideInfo(0,
                            row_features,
                            self.get_side_noise(),
                            direct=True)
        session.init()

        # Only the sampling loop and the RMSE query are timed.
        started = time()
        more = session.step()
        while more:
            more = session.step()
        rmse = session.getRmseAvg()
        finished = time()
        elapsed = finished - started

        upper_rmse, lower_rmse, max_elapsed = (expected[0], expected[1],
                                               expected[2])
        self.assertLess(rmse, upper_rmse)
        self.assertGreater(rmse, lower_rmse)
        self.assertLess(elapsed, max_elapsed)
Esempio n. 11
0
def read_ini(fname):
    """Build a ``smurff.TrainSession`` from the settings in an .ini file.

    The ``[global]`` section supplies the session options; the seed is used
    only when ``random_seed_set`` is true and the threshold only when
    ``classify`` is true. Train and test data are loaded via ``read_data``
    and, for every mode that has a ``side_info_<mode>`` section, side info
    is attached as well.

    Returns the configured (not yet run) session.
    """
    from configparser import ConfigParser

    cfg = ConfigParser()
    cfg.read(fname)

    priors = read_list(cfg["global"], "prior_")

    seed = None
    if cfg.getboolean("global", "random_seed_set"):
        seed = cfg.getint("global", "random_seed")

    threshold = None
    if cfg.getboolean("global", "classify"):
        threshold = cfg.getfloat("global", "threshold")

    # TrainSession is called positionally, so the order below must match
    # its parameter order.
    num_latent = cfg.getint("global", "num_latent")
    num_threads = cfg.getint("global", "num_threads", fallback=None)
    burnin = cfg.getint("global", "burnin")
    nsamples = cfg.getint("global", "nsamples")
    verbosity = cfg.getint("global", "verbose")
    save_name = cfg.get("global", "save_name", fallback=smurff.temp_savename())
    save_freq = cfg.getint("global", "save_freq", fallback=None)
    checkpoint_freq = cfg.getint("global", "checkpoint_freq", fallback=None)

    session = smurff.TrainSession(
        priors,
        num_latent,
        num_threads,
        burnin,
        nsamples,
        seed,
        threshold,
        verbosity,
        save_name,
        save_freq,
        checkpoint_freq,
    )

    train_data, train_kind, train_noise, *_ = read_data(cfg, "train")
    session.setTrain(train_data, train_noise, train_kind == "scarce")

    test_data, *_ = read_data(cfg, "test")
    session.setTest(test_data)

    for mode, _prior in enumerate(priors):
        section = "side_info_%d" % mode
        if section not in cfg:
            continue
        # read_data must yield exactly six fields for a side-info section.
        side_data, _kind, side_noise, _pos, direct, _tol = read_data(
            cfg, section)
        session.addSideInfo(mode, side_data, side_noise, direct)

    return session
Esempio n. 12
0
    def run_train_session(self, nmodes, density):
        """Train on a generated tensor with per-mode side info.

        ``smurff.generate.gen_tensor`` is called with dimensions
        5, 6, ..., ``nmodes + 4``, a second argument of 3 and the given
        ``density``. It returns the tensor ``Y`` and an iterable ``X`` whose
        i-th item is attached as side info for mode i. The train/test split
        is stored on ``self.Ytrain`` / ``self.Ytest``.

        Returns:
            Tuple ``(session, Y, X)``.
        """
        dims = range(5, nmodes + 5)
        Y, X = smurff.generate.gen_tensor(dims, 3, density)
        self.Ytrain, self.Ytest = smurff.make_train_test(Y, 0.1)

        options = dict(priors=['normal'] * nmodes,
                       num_latent=4,
                       burnin=10,
                       nsamples=nsamples,
                       verbose=verbose,
                       save_freq=1,
                       save_name=smurff.helper.temp_savename())
        session = smurff.TrainSession(**options)

        session.addTrainAndTest(self.Ytrain, self.Ytest)
        for mode, features in enumerate(X):
            session.addSideInfo(mode, features)

        session.init()
        # Keep stepping until step() reports a falsy value.
        more = session.step()
        while more:
            more = session.step()

        return session, Y, X
Esempio n. 13
0
#!/usr/bin/env python

import smurff
import pickle

# Load the ratings matrix and hold out part of it for testing.
Y = smurff.matrix_io.read_matrix("ratings_1k_random.sdm")
Ytrain, Ytest = smurff.prepare.make_train_test(Y, 0.2)

# Features attached as side info to mode 0 below.
sideinfo = smurff.matrix_io.read_matrix("features_1k_random.sdm")

# Every sample is saved (save_freq=1) into movielens.hdf5.
session_options = dict(
    num_latent=8,
    burnin=200,
    nsamples=200,
    verbose=1,
    save_name="movielens.hdf5",
    save_freq=1,
)
trainSession = smurff.TrainSession(**session_options)

trainSession.addTrainAndTest(Ytrain, Ytest)
trainSession.addSideInfo(0, sideinfo, smurff.FixedNoise(10.))

trainSession.run()
Esempio n. 14
0
import numpy as np
from time import time

#load data
# Pre-split train/test activity matrices.
ic50_train = mio.read_matrix("chembl-IC50-346targets-100compounds-train.sdm")
ic50_test = mio.read_matrix("chembl-IC50-346targets-100compounds-test.sdm")

# Sparse compound features, used as side info for mode 0.
# NOTE(review): a dense variant, "chembl-IC50-100compounds-feat-dense.ddm",
# was previously loaded here instead - presumably the same features; verify.
feat = mio.read_matrix("chembl-IC50-100compounds-feat.sdm")

ic50_threshold = 6.

session_options = dict(
    verbose=1,
    priors=['macau', 'normal'],
    num_latent=32,
    num_threads=1,
    seed=1234,
    burnin=400,
    nsamples=200,
    # Using threshold of 6. to calculate AUC on test data
    threshold=ic50_threshold,
)
trainSession = smurff.TrainSession(**session_options)

# No noise model is passed for the train data here; the threshold is only
# given to the session itself.
trainSession.addTrainAndTest(ic50_train, ic50_test)
trainSession.addSideInfo(0, feat, noise=smurff.SampledNoise(), direct=True)

# Time the full run, then report the test RMSE.
start = time()
predictions = trainSession.run()
stop = time()

elapsed = stop - start
print("time = %.2f" % elapsed)
print("RMSE = %.2f" % smurff.calc_rmse(predictions))
Esempio n. 15
0
import logging
import numpy as np
import scipy.sparse as sp

import smurff

# logging.getLogger().setLevel(logging.INFO)

# Minimal end-to-end run: a 2x2 matrix is used as dense train data and,
# wrapped in a CSR matrix, as test data.
Y = np.array([[1., 2.], [3., 4.]])

trainSession = smurff.TrainSession(priors=["normal", "normal"])
trainSession.setTrain(Y)
trainSession.setTest(sp.csr_matrix(Y))

# The return value of run() is kept so it can be inspected afterwards,
# e.g. by iterating over it and printing each item.
results = trainSession.run()
Esempio n. 16
0
def _build_arg_parser():
    """Create the argparse parser for the pySMURFF command line."""
    parser = argparse.ArgumentParser(
        description=
        'pySMURFF - command line utility to the SMURFF Python module')

    parser.add_argument("command",
                        help="Do full 'run' or only 'save' to .h5",
                        choices=['run', 'save'])

    group = parser.add_argument_group("General parameters")
    group.add_argument("--version",
                       action="store_true",
                       help="print version info (and exit)")
    group.add_argument("--verbose",
                       metavar="NUM",
                       type=int,
                       default=1,
                       help="verbose output (default = 1)")
    group.add_argument("--ini",
                       metavar="FILE",
                       type=str,
                       help="read options from this .ini file")
    group.add_argument("--num-threads",
                       metavar="NUM",
                       type=int,
                       help="number of threads (0 = default by OpenMP)")
    group.add_argument("--seed",
                       metavar="NUM",
                       type=int,
                       help="random number generator seed")

    group = parser.add_argument_group("Used during training")
    group.add_argument("--train",
                       metavar="FILE",
                       type=str,
                       help="train data file")
    group.add_argument("--test", metavar="FILE", type=str, help="test data")
    group.add_argument("--row-features",
                       metavar="FILE",
                       type=str,
                       help="sparse/dense row features")
    group.add_argument("--col-features",
                       metavar="FILE",
                       type=str,
                       help="sparse/dense column features")
    group.add_argument(
        "--prior",
        metavar="NAME",
        nargs=2,
        type=str,
        help=
        "provide a prior-type for each dimension of train; prior-types:  <normal|normalone|spikeandslab|macau|macauone>"
    )
    group.add_argument("--burnin",
                       metavar="NUM",
                       type=int,
                       help="number of samples to discard")
    group.add_argument("--nsamples",
                       metavar="NUM",
                       type=int,
                       help="number of samples to collect")
    group.add_argument("--num-latent",
                       metavar="NUM",
                       type=int,
                       help="number of latent dimensions")
    group.add_argument(
        "--threshold",
        metavar="NUM",
        type=float,
        help="threshold for binary classification and AUC calculation")

    group = parser.add_argument_group("Storing models and predictions")
    group.add_argument("--restore-from",
                       metavar="FILE",
                       type=str,
                       help="restore trainSession from a saved .h5 file")
    group.add_argument("--save-name",
                       metavar="FILE",
                       type=str,
                       help="save model and/or predictions to this .h5 file")
    group.add_argument(
        "--save-freq",
        metavar="NUM",
        type=int,
        help="save every n iterations (0 == never, -1 == final model)")
    group.add_argument(
        "--checkpoint-freq",
        metavar="NUM",
        type=int,
        help="save state every n seconds, only one checkpointing state is kept"
    )

    return parser


def main():
    """Entry point of the pySMURFF command line utility.

    Parses the command line, builds a ``smurff.TrainSession`` (from the
    ``--ini`` file when given, otherwise with defaults), applies the
    remaining command line options on top of it and then either runs the
    session (``run``) or only initializes it (``save``).

    With ``--version`` the version is printed and the function returns
    without creating a session.
    """
    args = _build_arg_parser().parse_args()
    print(args)

    if args.version:
        print("SMURFF %s" % smurff.version)
        # A bare ``exit`` expression is a no-op; return so that nothing is
        # trained after printing the version.
        return

    # Build the session exactly once instead of discarding a default one
    # when an .ini file is supplied.
    if args.ini is not None:
        session = read_ini(args.ini)
    else:
        session = smurff.TrainSession()

    options = vars(args)

    # Options naming a matrix file: the file is read and handed to the setter.
    file_options = {
        "train": session.setTrain,
        "test": session.setTest,
        "row_features": lambda x: session.addSideInfo(0, x),
        "col_features": lambda x: session.addSideInfo(1, x),
    }

    for opt, func in file_options.items():
        fname = options.get(opt)
        if fname is not None:
            func(mio.read_matrix(fname))

    # Keys are argparse destination names, i.e. the option name with dashes
    # turned into underscores; a dashed key would never match.
    # NOTE: --verbose defaults to 1, so it is always applied here, even
    # when the session came from an .ini file.
    other_options = {
        "verbose": session.setVerbose,
        "num_threads": session.setNumThreads,
        "seed": session.setRandomSeed,
        "prior": session.setPriorTypes,
        "burnin": session.setBurnin,
        "nsamples": session.setNSamples,
        "num_latent": session.setNumLatent,
        "threshold": session.setThreshold,
        "restore_from": session.setRestoreName,
        "save_name": session.setSaveName,
        "save_freq": session.setSaveFreq,
        "checkpoint_freq": session.setCheckpointFreq,
    }

    print(options)
    for opt, func in other_options.items():
        value = options.get(opt)
        if value is not None:
            print("processing opt:", opt, "with value", value)
            func(value)

    if args.command == "run":
        session.run()
    else:
        session.init()  # init will validate and save
Esempio n. 17
0
#!/usr/bin/env python

import smurff
import matrix_io as mio

#load data
# Load the activity matrix and hold out part of it for testing.
ic50 = mio.read_matrix("chembl-IC50-346targets.mm")
ic50_train, ic50_test = smurff.make_train_test(ic50, 0.2)
ic50_threshold = 6.

session_options = dict(
    priors=['normal', 'normal'],
    num_latent=32,
    burnin=10,
    nsamples=10,
    # Using threshold of 6. to calculate AUC on test data
    threshold=ic50_threshold,
)
session = smurff.TrainSession(**session_options)

# The probit noise model receives the same threshold as the session
# (the activity threshold pIC50 > 6. used to binarize the train data).
session.addTrainAndTest(ic50_train, ic50_test,
                        smurff.ProbitNoise(ic50_threshold))

predictions = session.run()

rmse = smurff.calc_rmse(predictions)
print("RMSE = %.2f" % rmse)
auc = smurff.calc_auc(predictions, ic50_threshold)
print("AUC = %.2f" % auc)