Code example #1
    def copy(self):
        """Copy FFN
        """
        from crpm.ffn_bodyplan import copy_ffn

        #init new model using current model's bodyplan
        newmodel = FFN(self.bodyplan,
                       std=self.weightstd,
                       pre=self.pre,
                       post=self.post)

        #copy bodies
        newmodel.body = copy_ffn(self.body)

        #return newmodel
        return newmodel
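
A minimal sketch of the deep copy that copy_ffn is assumed to perform on the body: each layer dictionary is duplicated and its numpy arrays are copied by value, so the new body shares no state with the original. The helper name deep_copy_body is hypothetical; the actual implementation lives in crpm.ffn_bodyplan.

import copy

import numpy as np


def deep_copy_body(body):
    """Hypothetical stand-in for crpm.ffn_bodyplan.copy_ffn."""
    newbody = []
    for layer in body:
        newlayer = {}
        for key, value in layer.items():
            #copy numpy arrays (weights, biases, momenta) by value
            if isinstance(value, np.ndarray):
                newlayer[key] = np.copy(value)
            else:
                #ints and strings (layer index, "n", activation name)
                newlayer[key] = copy.deepcopy(value)
        newbody.append(newlayer)
    return newbody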
Code example #2
File: abbc_model.py  Project: dmontemayor/CRPM
    def update(self, state, action, reward, new_state, validation=None):

        # Train our model with new data
        self.train(state, action, reward, new_state, validation)

        # Periodically, shift the prediction network into the target network queue
        if self.iteration % self.target_every == 0:
            tempnet = copy_ffn(self.prednet)
            self.prednet = copy_ffn(self.targetnet1)
            self.targetnet1 = copy_ffn(self.targetnet2)
            self.targetnet2 = copy_ffn(self.targetnet3)
            self.targetnet3 = copy_ffn(self.targetnet4)
            self.targetnet4 = copy_ffn(tempnet)

        # Finally shift our exploration_rate toward zero (less gambling)
        self.exploration_rate *= self.exploration_rate_decay

        #increment iteration counter
        self.iteration += 1
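
The six copy_ffn calls above rotate a fixed-length queue: the prediction network moves to the back and the oldest target network becomes the new prediction network. A minimal sketch of the same bookkeeping with collections.deque, using strings as stand-ins for the networks (the deque is only an illustration, not the class's actual storage):

from collections import deque

#stand-ins for self.prednet and self.targetnet1..4
nets = deque(["prednet", "target1", "target2", "target3", "target4"])

#one shift: prednet goes to the back, target1 becomes the new prednet
nets.rotate(-1)

assert list(nets) == ["target1", "target2", "target3", "target4", "prednet"]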
Code example #3
File: test_gan.py  Project: dmontemayor/CRPM
def r_test_spectra2():
    """test spectra2 can be encoded and generated
    """

    import numpy as np
    from crpm.setup_spectra2 import setup_spectra2
    from crpm.dynamics import computecost
    from crpm.analyzebinaryclassifier import analyzebinaryclassifier
    #from crpm.lossfunctions import loss
    #from crpm.analyzebinaryclassifier import plotroc
    from crpm.gradientdecent import gradientdecent
    from crpm.contrastivedivergence import contrastivedivergence
    #from crpm.ffn import FFN
    from crpm.ffn_bodyplan import stack_new_layer
    from crpm.ffn_bodyplan import copy_ffn
    from crpm.fwdprop import fwdprop
    from crpm.backprop import backprop
    #from crpm.dynamics import computeforces
    #from crpm.dynamics import maxforce

    from crpm.gan import gan
    #import matplotlib
    #matplotlib.use('TkAgg')
    #import matplotlib.pyplot as plt

    #init numpy seed
    np.random.seed(40017)

    #setup model
    prototype, data = setup_spectra2()

    #get prototype depth
    nlayer = len(prototype)

    #get data dimensions
    nfeat = data.shape[0]
    nobv = data.shape[1]

    #zscore data
    tdata = np.divide(data - np.mean(data, axis=1, keepdims=True),
                      np.std(data, axis=1, keepdims=True))

    #transform features into boltzmann like probs
    #tdata = np.exp(-data)
    #partfunc = np.sum(tdata, axis=1, keepdims = True)
    #tdata = np.divide(tdata,partfunc) #normalize
    #tdata = np.divide(tdata, np.max(tdata, axis=1, keepdims=True))#scale features by maxintensity

    #plt.plot(data[:,0])
    #plt.show()
    #plt.plot(tdata[:,0])
    #plt.show()
    #data = tdata

    #partition data (labels on first row)
    ntrain = 2 * nobv // 3
    target = data[0, :ntrain]
    train = data[1:, :ntrain]
    vtarget = data[0, ntrain:]
    valid = data[1:, ntrain:]

    #return untrained autoencoder
    _, autoencoder = contrastivedivergence(prototype, train, maxepoch=0)

    #calculate initial reconstruction error
    pred, ireconerr = computecost(autoencoder, valid, valid, "mse")
    print("init recon error = " + str(ireconerr))

    ##train prototype
    #_, autoencoder = contrastivedivergence(prototype, train,
    #                                       ncd=2,
    #                                       batchsize=50,
    #                                       nadj=10,
    #                                       maxepoch=100,
    #                                       momentum=0.1)
    #train prototype
    _, autoencoder = contrastivedivergence(prototype,
                                           train,
                                           validata=valid,
                                           ncd=1,
                                           batchsize=50,
                                           nadj=10,
                                           maxepoch=100,
                                           momentum=0.0)

    #calculate final reconstruction error
    pred, reconerr = computecost(autoencoder, valid, valid, "mse")
    print("pretrained recon error = " + str(reconerr))

    #assert learning is taking place by reduced recon error.
    assert ireconerr > reconerr

    # ----- Discriminator -----
    #create discriminator
    discriminator = copy_ffn(autoencoder[0:len(prototype)])
    discriminator = stack_new_layer(discriminator, n=1, activation="logistic")
    #analyze trained binary classifier
    pred, icost = computecost(discriminator, valid, vtarget, "bce")
    roc, ireport = analyzebinaryclassifier(pred, vtarget)
    if ireport["AreaUnderCurve"] < .5:
        #flip labels
        pred, icost = computecost(discriminator, valid, 1 - vtarget, "bce")
        roc, ireport = analyzebinaryclassifier(pred, 1 - vtarget)
    print(ireport)
    #plotroc(roc)

    #train discriminator
    pred, cost, _ = gradientdecent(discriminator,
                                   train,
                                   target,
                                   "bce",
                                   valid,
                                   vtarget,
                                   earlystop=True,
                                   finetune=6)

    #analyze trained binary classifier
    pred, cost = computecost(discriminator, valid, vtarget, "bce")
    roc, report = analyzebinaryclassifier(pred, vtarget)
    if report["AreaUnderCurve"] < .5:
        #flip labels
        pred, cost = computecost(discriminator, valid, 1 - vtarget, "bce")
        roc, report = analyzebinaryclassifier(pred, 1 - vtarget)
    print(report)
    #plotroc(roc)

    #assert discriminator can be trained by binary cross entropy error
    assert icost > cost

    #assert discriminator has potential to identify the two classes
    assert report["AreaUnderCurve"] > ireport["AreaUnderCurve"]
    #assert report["AreaUnderCurve"] > .6

    # ----- generator -----

    #create generator from decoder
    generator = copy_ffn(autoencoder[len(prototype):len(autoencoder)])

    #adjust regularization
    for layer in generator:
        layer["regval"] = 0  #.00001

    #correct layer indices
    for idx, layer in enumerate(generator):
        layer["layer"] = idx

    #generate fake samples
    nfake = 600
    ncode = generator[0]["n"]
    fake, _ = fwdprop(np.random.rand(ncode, nfake), generator)

    #calculate initial reconstruction error
    pred, fkreconerr = computecost(autoencoder, fake, fake, "mse")
    print("init fake recon error = " + str(fkreconerr))

    #assert fake data recon error is better than untrained recon error
    assert fkreconerr < ireconerr

    #-- Start GAN training---

    ganerr = gan(generator,
                 discriminator,
                 train,
                 maxepoch=20000,
                 batchsize=50,
                 finetune=6.3)

    #assert generator fools discriminator at least some of the time bce<80%.
    assert ganerr[-1, 1] < .8

    #def moving_average(a, n=3) :
    #    ret = np.cumsum(a, dtype=float)
    #    ret[n:] = ret[n:] - ret[:-n]
    #    return ret[n - 1:] / n

    #fig = plt.figure()
    #plt.plot(ganerr[:, 0], ganerr[:, 1])
    #plt.plot(moving_average(ganerr[:, 0], n=20), moving_average(ganerr[:, 1], n=20))
    #plt.plot(ganerr[0, 0], ganerr[0, 1], marker="D", color="green", markersize=10)
    #plt.plot(ganerr[-1, 0], ganerr[-1, 1], marker="8", color="red", markersize=10)
    #plt.xlabel("discriminator error")
    #plt.ylabel("generator error")
    #plt.show()

    #print("final report")
    #print(report)
    #plotroc(roc)

    assert False
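
The score-then-flip pattern around analyzebinaryclassifier is used twice above and again in the next example. A small hypothetical helper, written against the computecost and analyzebinaryclassifier signatures these tests use, captures that pattern in one place:

def analyze_with_flip(model, data, labels):
    """Hypothetical helper: score a binary classifier, flipping labels if AUC < 0.5."""
    from crpm.dynamics import computecost
    from crpm.analyzebinaryclassifier import analyzebinaryclassifier

    pred, cost = computecost(model, data, labels, "bce")
    roc, report = analyzebinaryclassifier(pred, labels)
    if report["AreaUnderCurve"] < .5:
        #flip labels and rescore
        pred, cost = computecost(model, data, 1 - labels, "bce")
        roc, report = analyzebinaryclassifier(pred, 1 - labels)
    return pred, cost, roc, report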
Code example #4
File: test_gan.py  Project: dmontemayor/CRPM
def test_afnetwork():
    """test AF network patients can be encoded and generated
    """
    #import matplotlib
    #matplotlib.use('TkAgg')
    #import matplotlib.pyplot as plt
    #import matplotlib.patches as mpatches

    import numpy as np
    from crpm.setup_afmodel import setup_afmodel

    from crpm.dynamics import computecost
    from crpm.analyzebinaryclassifier import analyzebinaryclassifier
    #from crpm.lossfunctions import loss
    from crpm.analyzebinaryclassifier import plotroc
    from crpm.gradientdecent import gradientdecent
    from crpm.contrastivedivergence import contrastivedivergence
    #from crpm.ffn import FFN
    from crpm.ffn_bodyplan import stack_new_layer
    from crpm.ffn_bodyplan import copy_ffn
    from crpm.fwdprop import fwdprop
    #from crpm.backprop import backprop
    #from crpm.dynamics import computeforces
    #from crpm.dynamics import maxforce

    from crpm.gan import gan

    #init numpy seed
    np.random.seed(40017)

    #setup model
    prototype, train, target, valid, vtarget = setup_afmodel()

    #trim data
    #maxobv = 150
    #train = train[:,:maxobv]
    #valid = valid[:,:maxobv]
    #target = target[:maxobv]
    #vtarget = vtarget[:maxobv]

    #get prototype depth
    nlayer = len(prototype)

    #get data dimensions
    nfeat = train.shape[0]
    nobv = train.shape[1]

    #return untrained autoencoder
    _, autoencoder = contrastivedivergence(prototype, train, maxepoch=0)

    # ----- Discriminator -----

    #create discriminator
    discriminator = copy_ffn(autoencoder[0:len(prototype)])
    discriminator = stack_new_layer(discriminator, n=1, activation="logistic")

    print("analyze untrained discriminator to iden subtype")
    pred, icost = computecost(discriminator, valid, vtarget, "bce")
    roc, ireport = analyzebinaryclassifier(pred, vtarget)
    if ireport["AreaUnderCurve"] < .5:
        #flip labels
        pred, icost = computecost(discriminator, valid, 1 - vtarget, "bce")
        roc, ireport = analyzebinaryclassifier(pred, 1 - vtarget)
    print(ireport)
    #plotroc(roc)

    #train discriminator
    pred, cost, _ = gradientdecent(discriminator,
                                   train,
                                   target,
                                   "bce",
                                   valid,
                                   vtarget,
                                   earlystop=True,
                                   finetune=7)

    print("analyze trained discriminator to iden subtype")
    pred, cost = computecost(discriminator, valid, vtarget, "bce")
    roc, report = analyzebinaryclassifier(pred, vtarget)
    if report["AreaUnderCurve"] < .5:
        #flip labels
        pred, cost = computecost(discriminator, valid, 1 - vtarget, "bce")
        roc, report = analyzebinaryclassifier(pred, 1 - vtarget)
    print(report)
    #plotroc(roc)

    #assert discriminator can be trained by binary cross entropy error
    #assert icost > cost

    #assert discriminator has potential to iden two classes
    #assert report["AreaUnderCurve"] > ireport["AreaUnderCurve"]
    #assert report["AreaUnderCurve"] > .55

    # ----- GENERATOR -----

    #create generator from decoder
    generator = copy_ffn(autoencoder[len(prototype) - 1:len(autoencoder)])

    #correct layer indices
    for idx, layer in enumerate(generator):
        layer["layer"] = idx

    #assert False
    #-- Main GAN training---
    #ganerr = gan(generator, discriminator, train,
    #                   maxepoch=100000, batchsize=1, finetune=6)
    ganerr = gan(generator,
                 discriminator,
                 train,
                 maxepoch=100000,
                 batchsize=1,
                 finetune=6)

    #def moving_average(a, n=3) :
    #    ret = np.cumsum(a, dtype=float)
    #    ret[n:] = ret[n:] - ret[:-n]
    #    return ret[n - 1:] / n

    #ganerr[:,2] = np.log(ganerr[:,2]) #plot density error on logscale
    #discerrbar = moving_average(ganerr[:, 0], n=20)
    #generrbar = moving_average(ganerr[:, 1], n=20)
    #autoerrbar = moving_average(ganerr[:, 2], n=20)

    #assert generator fools discriminator at least some of the time bce<65%.
    print(ganerr[-1, 1])
    assert ganerr[-1, 1] < .65

    #fig = plt.figure()
    #plt.plot(ganerr[:, 0], ganerr[:, 1])
    #plt.plot(discerrbar, generrbar)
    #plt.plot(discerrbar[0], generrbar[0], marker="D", color="green", markersize=10)
    #plt.plot(discerrbar[-1], generrbar[-1], marker="8", color="red", markersize=10)
    #plt.xlabel("discriminator error")
    #plt.ylabel("generator error")
    #plt.show()

    #fig = plt.figure()
    #plt.plot(ganerr[:, 0], ganerr[:, 2])
    #plt.plot(discerrbar, autoerrbar)
    #plt.plot(discerrbar[0], autoerrbar[0], marker="D", color="green", markersize=10)
    #plt.plot(discerrbar[-1], autoerrbar[-1], marker="8", color="red", markersize=10)
    #plt.xlabel("discriminator error")
    #plt.ylabel("encoder error")
    #plt.show()

    #generate fake data for every training sample
    nsample = train.shape[1]
    fake, _ = fwdprop(np.random.rand(generator[0]["n"], nsample), generator)
    #merge training and fake data
    gandata = np.hstack((train, fake))
    ganlabels = np.hstack((np.repeat(1, nsample), np.repeat(0, nsample)))

    print("analyze trained discriminator on fake vs training set")
    pred, cost = computecost(discriminator, gandata, ganlabels, "bce")
    roc, report = analyzebinaryclassifier(pred, ganlabels)
    if report["AreaUnderCurve"] < .5:
        #flip labels
        pred, cost = computecost(discriminator, gandata, 1 - ganlabels, "bce")
        roc, report = analyzebinaryclassifier(pred, 1 - ganlabels)
    print(report)
    #plotroc(roc)

    #gen fake data for every validation sample
    nsample = valid.shape[1]
    fake, _ = fwdprop(np.random.rand(generator[0]["n"], nsample), generator)
    #merge validation and fake data
    gandata = np.hstack((valid, fake))
    ganlabels = np.hstack((np.repeat(1, nsample), np.repeat(0, nsample)))

    print("analyze trained discriminator on fake vs vaidation set")
    pred, costv = computecost(discriminator, gandata, ganlabels, "bce")
    roc, reportv = analyzebinaryclassifier(pred, ganlabels)
    if reportv["AreaUnderCurve"] < .5:
        #flip labels
        pred, costv = computecost(discriminator, gandata, 1 - ganlabels, "bce")
        roc, reportv = analyzebinaryclassifier(pred, 1 - ganlabels)
    print(reportv)
    #plotroc(roc)

    #assert discriminator has poor potential to iden fake data
    assert reportv["AreaUnderCurve"] < .55

    #get fake data the discriminator thinks is real
    pred, _ = fwdprop(fake, discriminator)
    spoof = fake[:, pred[0, :] > report["OptimalThreshold"]]
Code example #5
def langevindynamics(model,
                     data,
                     targets,
                     lossname,
                     validata=None,
                     valitargets=None,
                     maxepoch=int(1E6),
                     maxbuffer=int(1E3),
                     finetune=6):
    """train fnn model by langevin dynamics

        Args:
            model:
            data:
            targets:
            lossname:
            validata: data used to calculate out-sample error
            valitargets: targets used to calculate out-sample error
            maxiteration: hard limit of learning iterations default is 10000
        Returns: final predictions and cost. Training will modify model.
    """

    import numpy as np
    import copy
    from crpm.dynamics import setupdynamics
    #from crpm.dynamics import normalizelearningrate
    from crpm.dynamics import computecost
    from crpm.dynamics import computeforces
    from crpm.dynamics import maxforce
    from crpm.ffn_bodyplan import copy_ffn
    from crpm.pvalue import righttailpvalue

    #convergence test constants
    #alpha_norm = 5E-5 #scales learning rate by max force relative to weight
    alpha_norm = 10**(-finetune)
    nbuffer = 500
    #maxslope = -1E-6 #max learning slope should be negative but close to zero

    #buffer time grid
    #tgrid = np.array(range(nbuffer))
    #tsum = np.sum(tgrid)
    #tvar = nbuffer*np.sum(np.multiply(tgrid, tgrid))-tsum*tsum

    #langevin hyper parameters
    #eta = 5E-1 #ideal fraction of unexplained variance in costbuffer
    #downgamma = 0.95 #fraction by which friction is decreased
    #upgamma = 1.05 #fraction by which friction is increased
    downtemp = 0.95  #fraction by which temperature is decreased
    uptemp = 1.05  #fraction by which temperature is increased

    #init Langevin parameters
    gamma = 5E-2  #viscosity or friction
    invbeta = 1E-6  #temperature ~ 1/beta

    #setup dynamics
    forces = setupdynamics(model, data, targets, lossname)

    #check if using validation set
    is_validating = not ((validata is None) or (valitargets is None))

    #define out-sample error calculator
    def out_sample_error():
        if is_validating:
            pred, cost = computecost(model, validata, valitargets, lossname)
        else:
            pred, cost = computecost(model, data, targets, lossname)
        return pred, cost

    #calculate out-sample error
    _, cost = out_sample_error()

    #init best error and model
    best_cost = copy.copy(cost)
    best_model = copy_ffn(model)

    #init cost history
    costhistory = np.full(maxbuffer, cost)

    #iterate training until:
    # 1) cost diverges - defined true when cost > 1E16
    # or
    # 2) too many iterations - hardcoded to ensure loop exit
    epoch = 0
    window = 0
    continuelearning = True
    while continuelearning:

        ##clear cost buffer
        #costbuffer = []

        #save cost at beginning of buffer
        init_cost = copy.copy(cost)

        #normalize learning rate alpha based on current forces
        #alpha = normalizelearningrate(model, forces, alpha_norm)
        alpha = alpha_norm * maxforce(model, forces)

        #calculate langevin dynamics factors
        timestep = np.sqrt(2 * alpha)
        halftimestep = timestep / 2
        littled = np.exp(-gamma * timestep)
        littleq = (1 - littled) / gamma
        sigma = np.sqrt(invbeta * (1 - gamma * gamma))

        #loop for training steps in buffer
        #for i in tgrid:
        for i in range(nbuffer):
            #update current learning step
            epoch += 1

            #update model positions by half step
            for layer in forces:
                index = layer["layer"]
                model[index]["weight"] = (
                    model[index]["weight"] +
                    halftimestep * model[index]["weightdot"])
                model[index]["bias"] = (model[index]["bias"] +
                                        halftimestep * model[index]["biasdot"])

            #compute forces
            forces = computeforces(model, data, targets, lossname)

            #update model momenta by whole step
            for layer in forces:
                index = layer["layer"]
                ncurr = model[index]["n"]
                nprev = model[index - 1]["n"]
                model[index]["weightdot"] = (
                    littled * model[index]["weightdot"] +
                    littleq * layer["fweight"] +
                    sigma * np.random.randn(ncurr, nprev))
                model[index]["biasdot"] = (littled * model[index]["biasdot"] +
                                           littleq * layer["fbias"] +
                                           sigma * np.random.randn(ncurr, 1))

            #update model positions by second half-step
            for layer in forces:
                index = layer["layer"]
                model[index]["weight"] = (
                    model[index]["weight"] +
                    halftimestep * model[index]["weightdot"])
                model[index]["bias"] = (model[index]["bias"] +
                                        halftimestep * model[index]["biasdot"])

            ##record cost at full step
            #costbuffer.append(computecost(model, data, targets, lossname))

        #calculate out-sample error
        _, cost = out_sample_error()

        #increment window counter and save out sample error in cost history
        window += 1
        costhistory[window % maxbuffer] = copy.copy(cost)

        #Record best error and save model
        if cost <= best_cost:
            best_cost = copy.copy(cost)
            best_model = copy_ffn(model)

        #linear regression and goodness of fit measures in buffer
        #ysum = np.sum(costbuffer) # sum of costbuffer
        #in-sample error slope
        #slope = (nbuffer*np.sum(np.multiply(tgrid, costbuffer))-tsum*ysum)/tvar
        #intercept = (ysum-slope*tsum)/nbuffer #in-sample error y-intercept
        #residuals = np.subtract(costbuffer,(slope*tgrid+intercept)) #fit error
        ##explained error sum of squares times nbuffer
        #sserr = nbuffer*np.sum(np.multiply(residuals,residuals))
        ##total error sum of squares times nbuffer
        #sstot = nbuffer*np.sum(np.multiply(costbuffer, costbuffer))-ysum*ysum
        #fvu = sserr/sstot #fraction of variance unexplained
        out_slope = (cost - init_cost)  #/nbuffer #out-sample(validation) slope

        #Thermostat
        #if out_slope is negative
        #then decrease temperature
        #else increase temperature with probability p_out
        #where p_out is the proportion of out sample error historical values
        #that are greater than the current out sample error
        #in other words p_out is the right-tailed p_value of the out sample error.
        if out_slope < 0:
            invbeta *= downtemp
            #print(" ")
            #print("- temp "+str(invbeta))
        else:
            pvalue = righttailpvalue(np.array([cost]), costhistory)
            #print(" ")
            #print("pvalue = "+str(pvalue))
            if np.random.random() <= pvalue:
                #print("+ temp "+str(invbeta))
                invbeta *= uptemp

        #Viscostat
        #if fraction of unexplained variance is < eta
        #then decrease friction
        #else increase friction
        #where hyperparameter eta should be close to 0
        #if fvu < eta:
        #    gamma *= .95
        #else:
        #    gamma *= 1.05

        #if window%10==0:
        #    keng = 0
        #    for layer in model[1:]:
        #        keng += np.sum(np.multiply(layer["weightdot"],layer["weightdot"]))
        #    print("temp = "+str(invbeta)+"    KE = "+str(keng)+"    <cost> = "
        #          +str(np.mean(costhistory))+"    cost = "+str(cost)+
        #          "    best cost = "+str(best_cost))

        # - EXIT CONDITIONS -
        #exit if learning is taking too long
        if epoch > maxepoch:
            print(
                "Warning langevindynamics.py: Training is taking a long time!"
                + " - Try increaseing maxepoch - Training will end")
            continuelearning = False
        #exit if cost has diverged
        if cost > 1E16:
            print(
                "Warning langevindynamics.py: diverging cost function " +
                "- try lowering learning rate or inc regularization constant" +
                " - training will end.")
            continuelearning = False
            #model = copy_ffn(best_model)

    #return best model
    model = copy_ffn(best_model)

    #return predictions and cost
    return out_sample_error()
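
The inner loop above is a leapfrog-style Langevin step: positions advance a half step, momenta take a full step with friction (littled), force scaling (littleq), and Gaussian noise (sigma), then positions advance the remaining half step. A minimal standalone sketch of the same update on a single scalar weight with a quadratic loss, assuming (as the gradient-descent examples imply) that the force points down the gradient:

import numpy as np

np.random.seed(0)

#toy quadratic loss: cost = 0.5 * (weight - 3)**2, so force = -(weight - 3)
weight, weightdot = 0.0, 0.0
alpha, gamma, invbeta = 1e-3, 5e-2, 1e-6

timestep = np.sqrt(2 * alpha)
halftimestep = timestep / 2
littled = np.exp(-gamma * timestep)
littleq = (1 - littled) / gamma
sigma = np.sqrt(invbeta * (1 - gamma * gamma))

for step in range(5000):
    #update position by half step
    weight += halftimestep * weightdot
    #update momentum by whole step with friction, force, and thermal noise
    force = -(weight - 3.0)
    weightdot = (littled * weightdot + littleq * force +
                 sigma * np.random.randn())
    #update position by second half step
    weight += halftimestep * weightdot

print(weight)  #settles near the minimum at 3.0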
Code example #6
File: abbc_model.py  Project: dmontemayor/CRPM
    def pretrain(self, state, validation=None):
        """ will pretrain deep network model by contrastive divergence """

        #make sure inputs all have the same number of observations
        nobv = state.shape[1]
        failcheck = False
        if validation is not None and validation.shape[0] != nobv:
            failcheck = True
        if failcheck:
            print(
                "runtime error in pretrain: inconsistent number of observations!"
            )
            return

        #get network input size
        nfeat = state.shape[0]  #network input size

        if validation is None:
            #manually set validation data to False
            validation = np.full(state.shape[1], False)  #one flag per observation

        #partition out validation patients from dataset
        intrain = ~validation
        nobv = np.sum(intrain)
        #exit if too few participated
        if nobv < 1:
            print("too few participants found for training")
            return
        #otherwise proceed with training
        data = state[:, intrain].reshape((nfeat, nobv))

        #Left off here - need to pop off last layer in model and add random weight to target and prediction nets

        #return untrained autoencoder
        _, autoencoder = contrastivedivergence(self.prednet, data, maxepoch=0)
        print(autoencoder)

        #calculate initial mean squared error
        pred, _ = fwdprop(data, autoencoder)
        icost, _ = loss("mse", pred, data)
        print(icost)

        #train model
        _, autoencoder = contrastivedivergence(self.prednet,
                                               data,
                                               maxepoch=100)

        #calculate final mean squared error
        pred, _ = fwdprop(data, autoencoder)
        cost, _ = loss("mse", pred, data)

        #print(autoencoder)
        print(icost)
        print(cost)

        #reinit the target network(s)
        #with the prediction network
        #self.targetnet = copy_ffn(self.prednet)
        self.targetnet1 = copy_ffn(self.prednet)
        self.targetnet2 = copy_ffn(self.prednet)
        self.targetnet3 = copy_ffn(self.prednet)
        self.targetnet4 = copy_ffn(self.prednet)
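
The partition step above keeps only the columns (observations) whose validation flag is False. A tiny sketch of that masking, with a made-up 3-feature, 5-observation state array:

import numpy as np

state = np.arange(15).reshape(3, 5)  #3 features, 5 observations
validation = np.array([False, True, False, False, True])

intrain = ~validation
nobv = np.sum(intrain)  #3 training observations
data = state[:, intrain].reshape((state.shape[0], nobv))

assert data.shape == (3, 3)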
Code example #7
def gradientdecent(model,
                   data,
                   targets,
                   lossname,
                   validata=None,
                   valitargets=None,
                   maxepoch=1E6,
                   earlystop=False,
                   healforces=True,
                   finetune=6):
    """train fnn model by gradient decent

        Args:
            model: FFN object or as the body in FFN class
            data: training data with features in columns and observation in rows
            targets: labels with targets in columns and observation in rows
            lossname: loss function string defined in crmp.lossfunctions
            validata: data used to calculate out-sample error
            valitargets: targets used to calculate out-sample error
            maxiteration: hard limit of learning iterations default is 10000
        Returns: final predictions and cost along with exit condition.
            Exit conditions are 0) learning converged, 1) learning not
            converged, 2) learning was stopped early, and -1) learning diverged.
            Training will modify model.
    """

    import numpy as np
    from crpm.dynamics import setupdynamics
    #from crpm.dynamics import normalizelearningrate
    from crpm.dynamics import computecost
    from crpm.dynamics import computeforces
    from crpm.dynamics import maxforce
    from crpm.ffn_bodyplan import copy_ffn
    from crpm.ffn import FFN

    #convergence test constants
    #alpha norm scales learning rate by max force relative to weight
    alpha_norm = 10**(-finetune)
    #alpha_norm = 1E-8#7#5E-6
    #alpha_norm = 1E-7#5 #scales learning rate by max force relative to weight
    nbuffer = 500
    maxslope = -1E-6  #max learning slope should be negative but close to zero
    tgrid = np.array(range(nbuffer))
    tsum = np.sum(tgrid)
    tvar = nbuffer * np.sum(np.multiply(tgrid, tgrid)) - tsum * tsum

    #setup dynamics if requested (allows for reinit to heal bad forces)
    if healforces:
        forces = setupdynamics(model, data, targets, lossname)
    else:
        forces = computeforces(model, data, targets, lossname)

    #check if using validation set
    is_validating = not ((validata is None) or (valitargets is None))

    #define out-sample error calculator
    def out_sample_error():
        if is_validating:
            pred, cost = computecost(model, validata, valitargets, lossname)
        else:
            pred, cost = computecost(model, data, targets, lossname)
        return pred, cost

    #calculate out-sample error
    _, cost = out_sample_error()

    #init best error and model
    best_cost = np.copy(cost)
    if isinstance(model, FFN):
        best_model = model.copy()
    else:
        best_model = copy_ffn(model)

    #iterate training until:
    # 1) cost converges - defined as when slope of costbuffer is greater than -1e-6
    # or
    # 2) out-sample error increases
    # or
    # 3) cost diverges - defined true when cost > 1E16
    # or
    # 4) too many iterations - hardcoded to ensure loop exit
    continuelearning = True
    #Do not do any learning if maxepoch is not a positive integer
    if maxepoch < 1:
        continuelearning = False
    count = 0
    exitcond = 0
    while continuelearning:

        #clear cost buffer
        costbuffer = []

        #normalize learning rate alpha based on current forces
        alpha = alpha_norm * maxforce(model, forces)
        #alpha = normalizelearningrate(model, forces, alpha_norm)

        #loop for training steps in buffer
        for i in tgrid:

            #update current learning step
            count += 1

            #update body weights and biases
            body = model
            if isinstance(model, FFN):
                body = model.body

            #loop over layer
            for layer in forces:
                index = layer["layer"]
                body[index]["weight"] = body[index][
                    "weight"] + alpha * layer["fweight"]
                body[index][
                    "bias"] = body[index]["bias"] + alpha * layer["fbias"]

            #compute forces
            forces = computeforces(model, data, targets, lossname)

            #record cost
            _, cost = computecost(model, data, targets, lossname)
            costbuffer.append(cost)

        #calculate cost slope to check for convergence
        slope = nbuffer * np.sum(np.multiply(
            tgrid, costbuffer)) - tsum * np.sum(costbuffer)
        slope = slope / tvar

        #calculate out-sample error
        _, cost = out_sample_error()

        #Record best error and save model
        if cost <= best_cost:
            best_cost = np.copy(cost)
            if isinstance(model, FFN):
                best_model = model.copy()
            else:
                best_model = copy_ffn(model)

        # - EXIT CONDITIONS -
        #exit if learning is taking too long
        if count > int(maxepoch):
            print(
                "Warning gradientdecent.py: Training is taking a long time!" +
                " - Try increaseing maxepoch - Training will end")
            exitcond = 1
            continuelearning = False
        #exit if learning has plateaued
        if slope > maxslope:
            exitcond = 0
            continuelearning = False
        #exit if early stopping and error has risen
        if earlystop and cost > best_cost:
            print("early stopping")
            exitcond = 2
            continuelearning = False
        #exit if cost has diverged
        if cost > 1E16:
            print(
                "Warning gradientdecent.py: diverging cost function " +
                "- try lowering learning rate or inc regularization constant "
                + "- training will end.")
            exitcond = -1
            continuelearning = False

    #restore the best model before computing final predictions
    if isinstance(model, FFN):
        model = best_model
    else:
        model = copy_ffn(best_model)

    #return predictions and cost
    return (*out_sample_error(), exitcond)
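
The convergence test in the loop fits a straight line to the nbuffer in-sample costs and declares a plateau when the slope exceeds maxslope. The closed-form expression used above is the ordinary least-squares slope; a short check against numpy's polyfit on synthetic data, assuming nothing beyond the formula already shown:

import numpy as np

nbuffer = 500
tgrid = np.array(range(nbuffer))
tsum = np.sum(tgrid)
tvar = nbuffer * np.sum(np.multiply(tgrid, tgrid)) - tsum * tsum

#synthetic cost buffer: slowly decaying cost with a little noise
rng = np.random.default_rng(0)
costbuffer = np.exp(-tgrid / 2000) + 0.001 * rng.standard_normal(nbuffer)

#closed-form least-squares slope, exactly as in gradientdecent
slope = (nbuffer * np.sum(np.multiply(tgrid, costbuffer)) -
         tsum * np.sum(costbuffer)) / tvar

#same quantity from numpy's polynomial fit
ref_slope, _ = np.polyfit(tgrid, costbuffer, 1)

assert np.isclose(slope, ref_slope)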