def test_MultDPRegression():
    """End-to-end fit test: run MultDPRegression on known trajectory data
    and verify that both ground-truth trajectories are recovered exactly.
    """
    # Read data from resources dir
    data_file_name = os.path.split(os.path.realpath(__file__))[0] + \
        '/../resources/data/trajectory_data_1.csv'
    df = pd.read_csv(data_file_name)

    prior_file_name = os.path.split(os.path.realpath(__file__))[0] + \
        '/../resources/priors/trajectory_prior_1.p'

    # Read prior from resources dir
    with open(prior_file_name, 'rb') as f:
        prior_file_info = pickle.load(f)

    preds = get_pred_names_from_prior_info(prior_file_info)
    targets = get_target_names_from_prior_info(prior_file_info)

    D = len(targets)
    M = len(preds)
    K = 20

    # Repackage the per-target/per-predictor prior dicts into the dense
    # arrays the MultDPRegression constructor expects.
    prior_data = {}
    prior_data['w_mu0'] = np.zeros([M, D])
    prior_data['w_var0'] = np.ones([M, D])
    prior_data['lambda_a0'] = np.ones([D])
    prior_data['lambda_b0'] = np.ones([D])
    prior_data['alpha'] = prior_file_info['alpha']
    for (d, target) in enumerate(targets):
        prior_data['lambda_a0'][d] = prior_file_info['lambda_a0'][target]
        prior_data['lambda_b0'][d] = prior_file_info['lambda_b0'][target]

        for (m, pred) in enumerate(preds):
            prior_data['w_mu0'][m, d] = prior_file_info['w_mu0'][target][pred]
            prior_data['w_var0'][m, d] = prior_file_info['w_var0'][target][pred]

    mm = MultDPRegression(prior_data['w_mu0'], prior_data['w_var0'],
                          prior_data['lambda_a0'], prior_data['lambda_b0'], 1,
                          prior_data['alpha'], K=K)

    mm.fit(target_names=targets, predictor_names=preds, df=df, groupby='id',
           iters=20, verbose=True)

    df_traj = mm.to_df()

    # Count fitted trajectories whose assignments line up perfectly with a
    # ground-truth trajectory (250 instances each). The previous expression,
    # np.sum(np.where(... == 250)), summed the *indices* returned by
    # np.where rather than counting the matching cells.
    num_trajs_found = np.sum(pd.crosstab(df_traj.traj.values,
                                         df.traj.values).values == 250)

    assert num_trajs_found == 2, "Trajectory assignment error"
# Esempio n. 2 (example separator retained from source extraction)
def main():
    """Command-line entry point: read an older pickled MultDPRegression
    model, rebuild it with the current implementation, and optionally save
    the updated model together with provenance information.
    """
    desc = """Updates older models to ensure compatibility with latest 
    implementation"""

    parser = ArgumentParser(description=desc)
    parser.add_argument('--in_model',
                        help='Filename of input model to update',
                        metavar='<string>',
                        required=True)
    parser.add_argument('--out_model',
                        help='Filename of updated output model',
                        metavar='<string>')

    op = parser.parse_args()

    print("Reading model...")
    with open(op.in_model, 'rb') as f:
        mm_in = pickle.load(f)['MultDPRegression']

    # Passing an existing model instance triggers the compatibility-update
    # constructor path.
    print("Updating...")
    mm_out = MultDPRegression(mm_in)

    if op.out_model is not None:
        print("Saving updated model...")
        # Use a context manager so the output handle is always closed
        # (the previous code leaked the file object passed to pickle.dump).
        with open(op.out_model, 'wb') as f:
            pickle.dump({'MultDPRegression': mm_out}, f)

        print("Saving model provenance info...")
        provenance_desc = """ """
        write_provenance_data(op.out_model,
                              generator_args=op,
                              desc=provenance_desc,
                              module_name='bayes_traj')

    print("DONE.")
# Esempio n. 3 (example separator retained from source extraction)
def main():
    """Command-line driver for Bayesian trajectory analysis.

    Reads prior settings from a pickle file and data from a csv file, fits
    a MultDPRegression model (optionally repeating the fit several times),
    and writes the best model (by WAIC2) and/or per-instance trajectory
    assignment information to file.
    """
    np.set_printoptions(precision=1,
                        suppress=True,
                        threshold=1e6,
                        linewidth=300)

    desc = """Runs Bayesian trajectory analysis on the specified data file \
    with the specified predictors and target variables"""

    parser = ArgumentParser(description=desc)
    parser.add_argument('--in_csv',
                        help='Input csv file containing data on \
        which to run Bayesian trajectory analysis',
                        metavar='<string>',
                        required=True)
    parser.add_argument('--targets',
                        help='Comma-separated list of target \
        names. Must appear as column names of the input data file.',
                        dest='targets',
                        metavar='<string>',
                        required=True)
    parser.add_argument('--groupby',
                        help='Column name in input data file \
        indicating those data instances that must be in the same trajectory. \
        This is typically a subject identifier (e.g. in the case of a \
        longitudinal data set).',
                        dest='groupby',
                        metavar='<string>',
                        default=None)
    parser.add_argument('--out_csv',
                        help='If specified, an output csv file \
        will be generated that contains the contents of the input csv file, \
        but with additional columns indicating trajectory assignment \
        information for each data instance. There will be a column called traj \
        with an integer value indicating the most probable trajectory \
        assignment. There will also be columns prefixed with traj_ and then a \
        trajectory-identifying integer. The values of these columns indicate \
        the probability that the data instance belongs to each of the \
        corresponding trajectories.',
                        dest='out_csv',
                        metavar='<string>',
                        type=str,
                        default=None)
    parser.add_argument('--prior',
                        help='Input pickle file containing prior \
        settings',
                        metavar='<string>',
                        required=True)
    parser.add_argument('--prec_prior_weight',
                        help='A floating point value \
        indicating how much weight to put on the prior over the residual \
        precisions. Higher values mean that more weight will be given to the \
        prior',
                        metavar='<float>',
                        type=float,
                        default=0.25)
    parser.add_argument('--alpha',
                        help='If specified, over-rides the value in \
        the prior file',
                        dest='alpha',
                        metavar='<float>',
                        default=None)
    parser.add_argument('--out_model',
                        help='Pickle file name. If specified, \
        the model object will be written to this file.',
                        dest='out_model',
                        metavar='<string>',
                        default=None,
                        required=False)
    parser.add_argument('--iters',
                        help='Number of inference iterations',
                        dest='iters',
                        metavar='<int>',
                        type=int,
                        default=100)
    parser.add_argument('--repeats',
                        help='Number of repeats to attempt. If a \
        value greater than 1 is specified, the WAIC2 fit criterion will be \
        computed at the end of each repeat. If, for a given repeat, the WAIC2 \
        score is lower than the lowest score seen at that point, the model \
        will be saved to file.',
                        type=int,
                        metavar='<int>',
                        default=1)
    parser.add_argument('-k',
                        help='Number of columns in the truncated \
        assignment matrix',
                        metavar='<int>',
                        type=int,
                        default=30)
    parser.add_argument('--prob_thresh',
                        help='If during data fitting the \
        probability of a data instance belonging to a given trajectory drops \
        below this threshold, then the probabality of that data instance \
        belonging to the trajectory will be set to 0',
                        metavar='<float>',
                        type=float,
                        default=0.001)
    parser.add_argument("--verbose",
                        help="Display per-trajectory counts \
        during optimization",
                        action="store_true")
    parser.add_argument('--probs_weight',
                        help='Value between 0 and 1 that \
        controls how much weight to assign to traj_probs, the marginal \
        probability of observing each trajectory. This value is only meaningful \
        if traj_probs has been set in the input prior file. Otherwise, it has no \
        effect. Higher values place more weight on the model-derived probabilities \
        and reflect a stronger belief in those assignment probabilities.',
                        dest='probs_weight',
                        metavar='<float>',
                        type=float,
                        default=None)
    parser.add_argument('--weights_only',
                        help='Setting this flag will force \
        the fitting routine to only optimize the trajectory weights. The \
        assumption is that the specified prior file contains previously \
        modeled trajectory information, and that those trajectories should be \
        used for the current fit. This option can be useful if a model \
        learned from one cohort is applied to another cohort, where it is \
        possible that the relative proportions of different trajectory \
        subgroups differs. By using this flag, the proportions of previously \
        determined trajectory subgroups will be determined for the current \
        data set.',
                        action='store_true')

    op = parser.parse_args()
    iters = int(op.iters)
    repeats = int(op.repeats)
    targets = op.targets.split(',')
    in_csv = op.in_csv
    prior = op.prior
    out_model = op.out_model
    # NOTE(review): the local override below deliberately disables the
    # validation of op.probs_weight (fit() still receives op.probs_weight
    # directly further down) — confirm whether this is intentional.
    probs_weight = None  #op.probs_weight

    if probs_weight is not None:
        assert probs_weight >= 0 and probs_weight <= 1, \
            "Invalid probs_weight value"

    #---------------------------------------------------------------------------
    # Get priors from file
    #---------------------------------------------------------------------------
    print("Reading prior...")
    with open(prior, 'rb') as f:
        prior_file_info = pickle.load(f)

        preds = get_pred_names_from_prior_info(prior_file_info)

        D = len(targets)
        M = len(preds)
        K = int(op.k)

        # Initialize every prior slot to None, then fill in whatever the
        # prior file actually provides.
        prior_data = {}
        for i in [
                'v_a', 'v_b', 'w_mu', 'w_var', 'lambda_a', 'lambda_b',
                'traj_probs', 'probs_weight', 'w_mu0', 'w_var0', 'lambda_a0',
                'lambda_b0', 'alpha'
        ]:
            prior_data[i] = None

        prior_data['probs_weight'] = None
        prior_data['w_mu0'] = np.zeros([M, D])
        prior_data['w_var0'] = np.ones([M, D])
        prior_data['lambda_a0'] = np.ones([D])
        prior_data['lambda_b0'] = np.ones([D])
        prior_data['v_a'] = None
        prior_data['v_b'] = None
        prior_data['w_mu'] = None
        prior_data['w_var'] = None
        prior_data['lambda_a'] = None
        prior_data['lambda_b'] = None
        prior_data['traj_probs'] = None

        if 'v_a' in prior_file_info.keys():
            prior_data['v_a'] = prior_file_info['v_a']
            if prior_file_info['v_a'] is not None:
                # A posterior prior fixes K to its trajectory count.
                K = prior_file_info['v_a'].shape[0]
                print("Using K={} (from prior)".format(K))
        if 'v_b' in prior_file_info.keys():
            prior_data['v_b'] = prior_file_info['v_b']

        if 'w_mu' in prior_file_info.keys():
            if prior_file_info['w_mu'] is not None:
                prior_data['w_mu'] = np.zeros([M, D, K])
        if 'w_var' in prior_file_info.keys():
            if prior_file_info['w_var'] is not None:
                prior_data['w_var'] = np.ones([M, D, K])
        if 'lambda_a' in prior_file_info.keys():
            if prior_file_info['lambda_a'] is not None:
                prior_data['lambda_a'] = np.ones([D, K])
        if 'lambda_b' in prior_file_info.keys():
            if prior_file_info['lambda_b'] is not None:
                prior_data['lambda_b'] = np.ones([D, K])
        if 'traj_probs' in prior_file_info.keys():
            prior_data['traj_probs'] = prior_file_info['traj_probs']

        prior_data['alpha'] = prior_file_info['alpha']
        for (d, target) in enumerate(op.targets.split(',')):
            prior_data['lambda_a0'][d] = prior_file_info['lambda_a0'][target]
            prior_data['lambda_b0'][d] = prior_file_info['lambda_b0'][target]

            if prior_data['lambda_a'] is not None:
                prior_data['lambda_a'][d, :] = \
                    prior_file_info['lambda_a'][target]
            if prior_data['lambda_b'] is not None:
                prior_data['lambda_b'][d, :] = \
                    prior_file_info['lambda_b'][target]

            for (m, pred) in enumerate(preds):
                prior_data['w_mu0'][m, d] = \
                    prior_file_info['w_mu0'][target][pred]
                prior_data['w_var0'][m, d] = \
                    prior_file_info['w_var0'][target][pred]
                # NOTE(review): per-trajectory coefficients are indexed
                # [pred][target], the reverse of the w_mu0/w_var0 layout —
                # this mirrors the prior file format.
                if prior_data['w_mu'] is not None:
                    prior_data['w_mu'][m, d, :] = \
                        prior_file_info['w_mu'][pred][target]
                if prior_data['w_var'] is not None:
                    prior_data['w_var'][m, d, :] = \
                        prior_file_info['w_var'][pred][target]

    if op.alpha is not None:
        prior_data['alpha'] = float(op.alpha)

    print("Reading data...")
    df = pd.read_csv(in_csv)

    # Drop rows with NaN predictors; downstream fitting cannot use them.
    if np.sum(np.isnan(np.sum(df[preds].values, 1))) > 0:
        print("Warning: identified NaNs in predictor set. \
        Proceeding with non-NaN data")
        df = df.dropna(subset=preds).reset_index()

    #---------------------------------------------------------------------------
    # Set up and run the traj alg
    #---------------------------------------------------------------------------
    waics_tracker = []
    bics_tracker = []
    num_tracker = []
    best_mm = None
    best_waic2 = sys.float_info.max
    bic_thresh = -sys.float_info.max
    best_bics = (bic_thresh, bic_thresh)

    print("Fitting...")
    for r in np.arange(repeats):
        if r > 0:
            print("---------- Repeat {}, Best WAIC2: {} ----------".\
                  format(r, best_waic2))

        mm = MultDPRegression(prior_data['w_mu0'],
                              prior_data['w_var0'],
                              prior_data['lambda_a0'],
                              prior_data['lambda_b0'],
                              op.prec_prior_weight,
                              prior_data['alpha'],
                              K=K,
                              prob_thresh=op.prob_thresh)

        mm.fit(target_names=targets,
               predictor_names=preds,
               df=df,
               groupby=op.groupby,
               iters=iters,
               verbose=op.verbose,
               traj_probs=prior_data['traj_probs'],
               traj_probs_weight=op.probs_weight,
               v_a=prior_data['v_a'],
               v_b=prior_data['v_b'],
               w_mu=prior_data['w_mu'],
               w_var=prior_data['w_var'],
               lambda_a=prior_data['lambda_a'],
               lambda_b=prior_data['lambda_b'],
               weights_only=op.weights_only)

        if r == 0:
            # First repeat: always save, and record its WAIC2 as baseline.
            if op.out_model is not None:
                print("Saving model...")
                pickle.dump({'MultDPRegression': mm}, open(op.out_model, 'wb'))

                print("Saving model provenance info...")
                provenance_desc = """ """
                write_provenance_data(op.out_model,
                                      generator_args=op,
                                      desc=provenance_desc,
                                      module_name='bayes_traj')

            if op.out_csv is not None:
                print("Saving data file with trajectory info...")
                mm.to_df().to_csv(op.out_csv, index=False)

                print("Saving data file provenance info...")
                provenance_desc = """ """
                write_provenance_data(op.out_csv,
                                      generator_args=op,
                                      desc=provenance_desc,
                                      module_name='bayes_traj')

            if repeats > 1:
                best_waic2 = compute_waic2(mm)
        else:
            # Subsequent repeats: only save when WAIC2 improves.
            waic2 = compute_waic2(mm)
            if waic2 < best_waic2:
                best_waic2 = waic2

                if op.out_model is not None:
                    print("Saving model...")
                    pickle.dump({'MultDPRegression': mm},
                                open(op.out_model, 'wb'))

                    print("Saving model provenance info...")
                    provenance_desc = """ """
                    write_provenance_data(op.out_model,
                                          generator_args=op,
                                          desc=provenance_desc,
                                          module_name='bayes_traj')

                if op.out_csv is not None:
                    print("Saving data file with trajectory info...")
                    mm.to_df().to_csv(op.out_csv, index=False)

                    print("Saving data file provenance info...")
                    provenance_desc = """ """
                    write_provenance_data(op.out_csv,
                                          generator_args=op,
                                          desc=provenance_desc,
                                          module_name='bayes_traj')

    print("DONE.")
# Esempio n. 4 (example separator retained from source extraction)
def test_get_group_likelihood_samples_1():
    """Single observation, single target: the sampled group likelihood at a
    near-deterministic posterior should match the Gaussian density at the
    mode, and degrade when responsibilities move to a worse trajectory."""
    obs_df = pd.DataFrame({
        'sid': ['a'],
        'intercept': np.array([1]),
        'x': np.array([0]),
        'y': np.array([10])
    })
    num_preds = 2
    num_targets = 1
    num_obs = obs_df.shape[0]
    num_trajs = 20

    # Extremely small variance -> effectively deterministic precision.
    prec_mu = 1
    prec_var = 1e-10
    prec_prior_weight = 1

    mm = MultDPRegression(np.zeros([num_preds, num_targets]),
                          np.zeros([num_preds, num_targets]),
                          np.ones(num_targets),
                          np.ones(num_targets),
                          prec_prior_weight,
                          1,
                          K=num_trajs)

    # All responsibility on trajectory 0.
    mm.R_ = np.zeros([num_obs, num_trajs])
    mm.R_[0, 0] = 1

    # Trajectory 0 intercept (10) matches y exactly at x=0; trajectory 1
    # is offset by one unit.
    mm.w_mu_ = np.zeros([num_preds, num_targets, num_trajs])
    mm.w_var_ = np.ones([num_preds, num_targets, num_trajs])
    mm.w_mu_[:, 0, 0] = np.array([10, -1])
    mm.w_var_[:, 0, 0] = 1e-10 * np.array([1, 1])
    mm.w_mu_[:, 0, 1] = np.array([11, -1])
    mm.w_var_[:, 0, 1] = 1e-10 * np.array([1, 1])

    mm.lambda_a_ = np.ones([num_targets, num_trajs])
    mm.lambda_b_ = np.ones([num_targets, num_trajs])
    mm.lambda_a_[0, 0] = (prec_mu**2) / prec_var
    mm.lambda_b_[0, 0] = prec_mu / prec_var

    mm.gb_ = obs_df.groupby('sid')
    mm.X_ = obs_df[['intercept', 'x']].values
    mm.Y_ = np.atleast_2d(obs_df.y.values)
    mm.N_ = num_obs

    samples_good = get_group_likelihood_samples(mm, num_samples=1000)
    assert np.isclose(np.mean(samples_good), 1/np.sqrt(2*np.pi)), \
        "Likelihood not as expected"

    # Shift responsibility to the poorer trajectory; likelihood must drop.
    mm.R_[0, [0, 1]] = np.array([0, 1])
    samples_poor = get_group_likelihood_samples(mm, num_samples=1000)

    assert np.mean(samples_good) > np.mean(samples_poor), \
        "Unexpected likelihood comparison"
# Esempio n. 5 (example separator retained from source extraction)
def test_get_group_likelihood_samples_2():
    """Three observations, two Gaussian targets: the group likelihood at a
    perfect fit equals the six-fold product of the unit-Gaussian peak, and
    introducing a NaN target (imputed internally) lowers the likelihood."""
    obs_df = pd.DataFrame({
        'sid': ['a', 'a', 'a'],
        'intercept': np.array([1., 1., 1.]),
        'x': np.array([0., 5., 10.]),
        'y1': np.array([10., 5., 0.]),
        'y2': np.array([0., 5., 10.])
    })

    num_preds = 2
    num_targets = 2
    num_obs = obs_df.shape[0]
    num_trajs = 20

    prec_mu = 1.
    prec_var = 1e-10
    prec_prior_weight = 1

    mm = MultDPRegression(np.zeros([num_preds, num_targets]),
                          np.zeros([num_preds, num_targets]),
                          np.ones(num_targets),
                          np.ones(num_targets),
                          prec_prior_weight,
                          1.,
                          K=num_trajs)

    # Everything assigned to trajectory 0.
    mm.R_ = np.zeros([num_obs, num_trajs])
    mm.R_[:, 0] = 1.

    mm.target_type_ = {0: 'gaussian', 1: 'gaussian'}

    # Trajectory-0 coefficients reproduce both targets exactly:
    # y1 = 10 - x, y2 = x.
    mm.w_mu_ = np.zeros([num_preds, num_targets, num_trajs])
    mm.w_var_ = 1e-10 * np.ones([num_preds, num_targets, num_trajs])
    mm.w_mu_[:, 0, 0] = np.array([10, -1])
    mm.w_mu_[:, 1, 0] = np.array([0, 1])

    mm.lambda_a_ = np.ones([num_targets, num_trajs])
    mm.lambda_b_ = np.ones([num_targets, num_trajs])
    for d in range(num_targets):
        mm.lambda_a_[d, 0] = (prec_mu**2) / prec_var
        mm.lambda_b_[d, 0] = prec_mu / prec_var

    mm.gb_ = obs_df.groupby('sid')
    mm.X_ = obs_df[['intercept', 'x']].values
    mm.Y_ = obs_df[['y1', 'y2']].values
    mm.N_ = num_obs

    samples_full = get_group_likelihood_samples(mm, num_samples=1000)
    assert np.isclose(np.mean(samples_full), (1/np.sqrt(2*np.pi))**6), \
        "Likelihood not as expected"

    # Internally, the missing target values will be imputed using the
    # posterior. This will not be the optimal value in this toy
    # data set (which should be 10). As such, we expect the mean from the
    # full-data samples to be greater than the mean with a NaN present.
    mm.Y_[0, 0] = np.nan
    samples_nan = get_group_likelihood_samples(mm, num_samples=1000)

    assert np.mean(samples_full) > np.mean(samples_nan), \
        "Unexpected likelihood comparison"
# Esempio n. 6 (example separator retained from source extraction)
def test_compute_waic2_1():
    """WAIC2 should be lower (better) for a model whose coefficients match
    the data-generating process than for one with a perturbed intercept.

    The two models are identical except for the first target's trajectory-0
    intercept, so the setup is factored into a local helper rather than
    duplicated (the original block copy-pasted ~30 lines per model).
    """
    df = pd.DataFrame({
        'sid': ['a', 'a', 'a'],
        'intercept': np.array([1., 1., 1.]),
        'x': np.array([0., 5., 10.]),
        'y1': np.array([10., 5., 0.]),
        'y2': np.array([0., 5., 10.])
    })

    M = 2
    D = 2
    N = df.shape[0]
    K = 20

    prec_mu = 1.
    prec_var = 1e-10
    prec_prior_weight = 1

    def _make_model(y1_intercept):
        # Build a fully-specified two-target Gaussian model whose first
        # target has the given intercept coefficient in trajectory 0.
        mm = MultDPRegression(np.zeros([M, D]),
                              np.zeros([M, D]),
                              np.ones(D),
                              np.ones(D),
                              prec_prior_weight,
                              1.,
                              K=K)
        mm.R_ = np.zeros([N, K])
        mm.R_[:, 0] = 1.

        mm.target_type_ = {0: 'gaussian', 1: 'gaussian'}

        mm.w_mu_ = np.zeros([M, D, K])
        mm.w_var_ = 1e-10 * np.ones([M, D, K])
        mm.w_mu_[:, 0, 0] = np.array([y1_intercept, -1])
        mm.w_mu_[:, 1, 0] = np.array([0, 1])

        mm.lambda_a_ = np.ones([D, K])
        mm.lambda_b_ = np.ones([D, K])
        for d in range(D):
            mm.lambda_a_[d, 0] = (prec_mu**2) / prec_var
            mm.lambda_b_[d, 0] = prec_mu / prec_var

        mm.gb_ = df.groupby('sid')
        mm.X_ = df[['intercept', 'x']].values
        mm.Y_ = df[['y1', 'y2']].values
        mm.N_ = N
        return mm

    waic2_1 = compute_waic2(_make_model(10))   # matches y1 = 10 - x
    waic2_2 = compute_waic2(_make_model(11))   # poorer intercept value

    assert waic2_1 < waic2_2, "Unexpected WAIC2 comparison"
def test_update_w_logistic():
    """update_w_logistic should recover the logistic-regression intercept
    and slope of synthetic binary data, with and without a NaN target."""
    data_path = os.path.split(os.path.realpath(__file__))[0] + \
        '/../resources/data/binary_data_1.csv'
    df = pd.read_csv(data_path)

    num_preds = 2
    num_targets = 1
    num_trajs = 1

    # Weak (broad) prior over the coefficients; alpha = 1.
    w_mu0 = np.zeros([num_preds, num_targets])
    w_var0 = 100 * np.ones([num_preds, num_targets])
    lambda_a0 = np.ones([num_targets])
    lambda_b0 = np.ones([num_targets])

    mm = MultDPRegression(w_mu0, w_var0, lambda_a0, lambda_b0,
                          1/df.shape[0], 1, K=num_trajs)

    mm.N_ = df.shape[0]
    mm.target_type_[0] = 'binary'
    mm.num_binary_targets_ = 1
    mm.w_var_ = None
    mm.w_covmat_ = np.nan * np.ones(
        [num_preds, num_preds, num_targets, num_trajs])
    mm.lambda_a_ = None
    mm.lambda_b_ = None
    mm.X_ = df[['intercept', 'pred']].values
    mm.Y_ = np.atleast_2d(df.target.values).T
    mm.gb_ = None

    mm.init_traj_params()
    mm.R_ = np.ones([mm.N_, num_trajs])
    mm.update_w_logistic(25)

    assert np.isclose(mm.w_mu_[0, 0, 0], 2.206, atol=0, rtol=.01), \
        "Intercept not as expected"
    assert np.isclose(mm.w_mu_[1, 0, 0], -2.3492, atol=0, rtol=.01), \
        "Slope not as expected"

    # Check that the function can handle nans
    mm.Y_[0, 0] = np.nan
    mm.init_traj_params()
    mm.update_w_logistic(25)

    # The intercept and slope that were used to create this synthetic data were
    # 2.5 and -2.5, respectively. When running standard logistic regression on
    # this data, the intercept and slope are found to be 2.2060 and -2.3492
    assert np.isclose(mm.w_mu_[0, 0, 0], 2.206, atol=0, rtol=.01), \
        "Intercept not as expected"
    assert np.isclose(mm.w_mu_[1, 0, 0], -2.3492, atol=0, rtol=.01), \
        "Slope not as expected"
def test_update_w_logistic_2():
    """update_w_logistic should recover two distinct sets of logistic
    regression coefficients when soft responsibilities assign each half of
    the data to its own trajectory.
    """
    data_file_name = os.path.split(os.path.realpath(__file__))[0] + \
        '/../resources/data/binary_data_2.csv'
    df = pd.read_csv(data_file_name)

    # Intercept, slope for group 1: 2.5, -2.5
    # Intercept, slope for group 2: -4, 4

    M = 2
    D = 1
    K = 2
    prior_data = {}
    prior_data['w_mu0'] = np.zeros([M, D])
    prior_data['w_var0'] = 100*np.ones([M, D])
    prior_data['lambda_a0'] = np.ones([D])
    prior_data['lambda_b0'] = np.ones([D])
    prior_data['alpha'] = 1

    mm = MultDPRegression(prior_data['w_mu0'], prior_data['w_var0'],
                          prior_data['lambda_a0'], prior_data['lambda_b0'],
                          1/df.shape[0],
                          prior_data['alpha'], K=K)

    mm.N_ = df.shape[0]
    mm.target_type_[0] = 'binary'
    mm.num_binary_targets_ = 1
    mm.w_var_ = None
    mm.w_covmat_ = np.nan*np.ones([M, M, D, K])
    mm.lambda_a_ = None
    mm.lambda_b_ = None
    mm.X_ = df[['intercept', 'pred']].values
    mm.Y_ = np.atleast_2d(df.target.values).T
    mm.gb_ = None

    mm.init_traj_params()

    # Soft responsibilities: first half of the data (almost) fully assigned
    # to trajectory 0, second half to trajectory 1. (A previous hard 0/1
    # initialization was dead code — immediately overwritten — and its
    # second-half slice stopped at :-1, leaving the last row unassigned.)
    mm.R_ = np.zeros([mm.N_, K]) + 1e-4
    mm.R_[0:int(mm.N_/2), 0] = 1-1e-4
    mm.R_[int(mm.N_/2)::, 1] = 1-1e-4

    mm.update_w_logistic(25)

    # The intercept and slope that were used to create this synthetic data were
    # 2.5 and -2.5 for the first group and -4 and 4 for the second group. When
    # running standard logistic regression on this data, the intercept and slope
    # are found to be 2.3702 and -2.1732 for the first group and -3.8925 and
    # 4.1095 for the second group
    assert np.isclose(mm.w_mu_[0, 0, 0], 2.3702, atol=0, rtol=.01), \
        "Intercept not as expected"
    assert np.isclose(mm.w_mu_[1, 0, 0], -2.1732, atol=0, rtol=.01), \
        "Slope not as expected"

    assert np.isclose(mm.w_mu_[0, 0, 1], -3.8925, atol=0, rtol=.01), \
        "Intercept not as expected"
    assert np.isclose(mm.w_mu_[1, 0, 1], 4.1095, atol=0, rtol=.01), \
        "Slope not as expected"
def test_init_R_mat():
    """Check R-matrix initialization from a posterior prior.

    With traj_probs_weight=1 the responsibility matrix should only populate
    the trajectories present in the prior (2 of them); with weight 0 the
    initialization should spread mass over additional trajectories.
    """
    data_file_name = os.path.split(os.path.realpath(__file__))[0] + \
        '/../resources/data/trajectory_data_1.csv'
    df = pd.read_csv(data_file_name)

    prior_file_name = os.path.split(os.path.realpath(__file__))[0] + \
        '/../resources/priors/model_1_posterior.p'
    with open(prior_file_name, 'rb') as f:
        prior_info = pickle.load(f)

    preds = get_pred_names_from_prior_info(prior_info)
    targets = get_target_names_from_prior_info(prior_info)

    D = len(targets)
    M = len(preds)
    K = 20

    prec_prior_weight = 1

    # Repackage the prior-file dicts into dense arrays.
    prior_data = {}
    prior_data['w_mu0'] = np.zeros([M, D])
    prior_data['w_var0'] = np.ones([M, D])
    prior_data['lambda_a0'] = np.ones([D])
    prior_data['lambda_b0'] = np.ones([D])

    prior_data['w_mu'] = np.zeros([M, D, K])
    prior_data['w_var'] = np.ones([M, D, K])
    prior_data['lambda_a'] = np.ones([D, K])
    prior_data['lambda_b'] = np.ones([D, K])

    prior_data['alpha'] = prior_info['alpha']
    for (d, target) in enumerate(targets):
        prior_data['lambda_a0'][d] = prior_info['lambda_a0'][target]
        prior_data['lambda_b0'][d] = prior_info['lambda_b0'][target]
        prior_data['lambda_a'][d, :] = prior_info['lambda_a'][target]
        prior_data['lambda_b'][d, :] = prior_info['lambda_b'][target]
        for (m, pred) in enumerate(preds):
            prior_data['w_mu0'][m, d] = prior_info['w_mu0'][target][pred]
            prior_data['w_var0'][m, d] = prior_info['w_var0'][target][pred]
            prior_data['w_mu'][m, d, :] = prior_info['w_mu'][pred][target]
            prior_data['w_var'][m, d, :] = prior_info['w_var'][pred][target]

    traj_probs = prior_info['traj_probs']

    mm = MultDPRegression(prior_data['w_mu0'], prior_data['w_var0'],
                          prior_data['lambda_a0'], prior_data['lambda_b0'],
                          prec_prior_weight, prior_data['alpha'], K)

    # iters=0: only initialization runs, so R_ reflects the init behavior.
    mm.fit(target_names=targets, predictor_names=preds, df=df, groupby='id',
           iters=0, verbose=True, traj_probs=traj_probs,
           traj_probs_weight=1.,
           v_a=prior_info['v_a'], v_b=prior_info['v_b'],
           w_mu=prior_data['w_mu'], w_var=prior_data['w_var'],
           lambda_a=prior_data['lambda_a'], lambda_b=prior_data['lambda_b'])

    assert np.sum((traj_probs > 0) | (np.sum(mm.R_, 0) > 0)) == 2, \
        "R_mat not initialized properly"

    mm.fit(target_names=targets, predictor_names=preds, df=df, groupby='id',
           iters=0, verbose=True, traj_probs=traj_probs,
           traj_probs_weight=0,
           v_a=prior_info['v_a'], v_b=prior_info['v_b'],
           w_mu=prior_data['w_mu'], w_var=prior_data['w_var'],
           lambda_a=prior_data['lambda_a'], lambda_b=prior_data['lambda_b'])

    # It's possible, though highly unlikely, that the following sum is <=2.
    # With traj_probs_weight set to 0, a number of initialized trajectories
    # should have non-zero weight
    assert np.sum((traj_probs > 0) | (np.sum(mm.R_, 0) > 0)) > 2, \
        "R_mat may not be initialized properly"
def test_init_traj_parmas():
    """Check that fit() with iters=0 seeds the per-trajectory parameters
    (w_mu_, w_var_, lambda_a_, lambda_b_, v_a_, v_b_) from the values passed
    in, at least for the trajectories with nonzero prior probability.
    """
    base_dir = os.path.split(os.path.realpath(__file__))[0]
    df = pd.read_csv(base_dir + '/../resources/data/trajectory_data_1.csv')

    with open(base_dir + '/../resources/priors/model_1_posterior.p', 'rb') as f:
        prior_info = pickle.load(f)

    preds = get_pred_names_from_prior_info(prior_info)
    targets = get_target_names_from_prior_info(prior_info)

    D, M, K = len(targets), len(preds), 20
    prec_prior_weight = 1

    # Flatten the per-target / per-predictor dicts in prior_info into the
    # array layout MultDPRegression expects.
    prior_data = {
        'w_mu0': np.zeros([M, D]),
        'w_var0': np.ones([M, D]),
        'lambda_a0': np.ones([D]),
        'lambda_b0': np.ones([D]),
        'w_mu': np.zeros([M, D, K]),
        'w_var': np.ones([M, D, K]),
        'lambda_a': np.ones([D, K]),
        'lambda_b': np.ones([D, K]),
        'alpha': prior_info['alpha'],
    }
    for d, target in enumerate(targets):
        prior_data['lambda_a0'][d] = prior_info['lambda_a0'][target]
        prior_data['lambda_b0'][d] = prior_info['lambda_b0'][target]
        prior_data['lambda_a'][d, :] = prior_info['lambda_a'][target]
        prior_data['lambda_b'][d, :] = prior_info['lambda_b'][target]
        for m, pred in enumerate(preds):
            prior_data['w_mu0'][m, d] = prior_info['w_mu0'][target][pred]
            prior_data['w_var0'][m, d] = prior_info['w_var0'][target][pred]
            # NOTE: w_mu / w_var are keyed [pred][target], the reverse of
            # the *0 prior dicts above.
            prior_data['w_mu'][m, d, :] = prior_info['w_mu'][pred][target]
            prior_data['w_var'][m, d, :] = prior_info['w_var'][pred][target]

    traj_probs = prior_info['traj_probs']

    mm = MultDPRegression(prior_data['w_mu0'], prior_data['w_var0'],
                          prior_data['lambda_a0'], prior_data['lambda_b0'],
                          prec_prior_weight, prior_data['alpha'], K)

    # iters=0: no coordinate-ascent updates, so the fitted attributes should
    # be exactly the initialization values passed in.
    mm.fit(target_names=targets, predictor_names=preds, df=df, groupby='id',
           iters=0, verbose=True, traj_probs=traj_probs,
           traj_probs_weight=1.,
           v_a=prior_info['v_a'], v_b=prior_info['v_b'],
           w_mu=prior_data['w_mu'], w_var=prior_data['w_var'],
           lambda_a=prior_data['lambda_a'], lambda_b=prior_data['lambda_b'])

    assert np.sum(mm.w_mu_[:, :, traj_probs > 0] == \
                  prior_data['w_mu'][:, :, traj_probs > 0]) == 4, \
                  "Trajs params not initialized properly"

    assert np.sum(prior_data['w_var'] == mm.w_var_) == 40, \
        "Trajs params not initialized properly"

    assert np.sum(prior_data['lambda_a'] == mm.lambda_a_) == 20, \
        "Trajs params not initialized properly"

    assert np.sum(prior_data['lambda_b'] == mm.lambda_b_) == 20, \
        "Trajs params not initialized properly"

    assert np.sum(prior_info['v_a'] == mm.v_a_) == 20, \
        "Trajs params not initialized properly"

    assert np.sum(prior_info['v_b'] == mm.v_b_) == 20, \
        "Trajs params not initialized properly"
def test_predict_proba():
    """Exercise predict_proba_ on a hand-built two-trajectory model.

    Observations lying on one trajectory should receive essentially all of
    the posterior probability for that trajectory; an observation equidistant
    from both should split 50/50.
    """
    D, M, K = 1, 2, 2
    mm = MultDPRegression(np.zeros([M, D]), np.ones([M, D]), np.ones(D),
                          np.ones(D), 1, 5, K)

    mm.target_type_ = {0: 'gaussian'}

    # Two mirror-image trajectories: y = 2 + x and y = -2 - x
    coeffs = np.zeros([M, D, K])
    coeffs[:, 0, 0] = np.array([2, 1])
    coeffs[:, 0, 1] = np.array([-2, -1])
    mm.w_mu_ = coeffs

    mm.R_ = 0.5*np.ones([3, 2])
    mm.lambda_a_ = np.ones([D, K])
    mm.lambda_b_ = np.ones([D, K])
    mm.gb_ = None

    X = np.array([[1, 2], [1, 2], [1, 2]])
    Y = np.array([[3], [0], [-3]])

    # Every row is treated as the first observation of its own group
    mm.group_first_index_ = np.ones(X.shape[0], dtype=bool)

    R_ref = np.array([[1.00000000e+00, 3.77513454e-11],
                      [5.00000000e-01, 5.00000000e-01],
                      [3.77513454e-11, 1.00000000e+00]])
    R = mm.predict_proba_(X, Y)
    assert np.sum(np.isclose(R, R_ref)) == 6, "Unexpected R value"
def test_init_R_mat():
    """init_R_mat with traj_probs_weight=1 should produce rows of R_ that
    sum to one and assign each synthetic observation to the trajectory that
    generated it.
    """
    # Seed so the synthetic noise (and hence the test outcome) is
    # deterministic rather than dependent on global RNG state.
    np.random.seed(0)

    # Construct some synthetic data: three trajectories with three different
    # intercepts and slopes
    num_per_traj = 10
    x = np.linspace(0, 10, num_per_traj)
    m1 = 1; b1 = 29; std1 = 0.3
    m2 = 0; b2 = 26; std2 = 0.01
    m3 = -1; b3 = 23; std3 = 0.5
    y1 = m1*x + b1 + std1*np.random.randn(num_per_traj)
    y2 = m2*x + b2 + std2*np.random.randn(num_per_traj)
    y3 = m3*x + b3 + std3*np.random.randn(num_per_traj)

    # Design matrix: intercept column plus the time values, stacked for the
    # three trajectories
    X_mat = np.ones([num_per_traj*3, 2])
    X_mat[:, 1] = np.vstack([x, x, x]).reshape(-1)
    Y_mat = np.atleast_2d(np.vstack([y1, y2, y3]).reshape(-1)).T

    M = X_mat.shape[1]
    D = Y_mat.shape[1]
    N = Y_mat.shape[0]

    w_mu0 = np.zeros([M, D])
    w_var0 = np.ones([M, D])
    lambda_a0 = np.ones(D)
    lambda_b0 = np.ones(D)
    prec_prior_weight = 1
    alpha = 5
    K = 30
    mm = MultDPRegression(w_mu0, w_var0, lambda_a0, lambda_b0,
                          prec_prior_weight, alpha, K)

    mm.target_type_ = {}
    mm.target_type_[0] = 'gaussian'
    mm.target_type_[1] = 'gaussian'
    mm.gb_ = None

    # Per-trajectory parameters: the first three trajectories are set to the
    # generating intercepts/slopes, and their gamma parameters are chosen so
    # the expected precision a/b equals 1/std**2 for each trajectory.
    lambda_a = np.ones([D, K])
    lambda_b = np.ones([D, K])
    w_mu = np.zeros([M, D, K])
    w_mu[:, 0, 0] = np.array([b1, m1])
    w_mu[:, 0, 1] = np.array([b2, m2])
    w_mu[:, 0, 2] = np.array([b3, m3])
    w_var = np.ones([M, D, K])
    for d in range(D):
        lambda_a[d, :] = lambda_a0[d]
        lambda_b[d, :] = lambda_b0[d]
    lambda_a[0, 0] = 1
    lambda_b[0, 0] = .3**2
    lambda_a[0, 1] = 1
    lambda_b[0, 1] = .01**2
    lambda_a[0, 2] = 1
    lambda_b[0, 2] = .5**2

    mm.w_mu_ = w_mu
    # Bug fix: this was `mm.w_var = w_var` (missing trailing underscore),
    # which set a dead attribute and left the model's fitted attribute
    # w_var_ untouched. All sibling assignments use the underscore form.
    mm.w_var_ = w_var
    mm.lambda_a_ = lambda_a
    mm.lambda_b_ = lambda_b
    mm.N_ = N
    mm.X_ = X_mat
    mm.Y_ = Y_mat

    # Prior probability mass on the first four trajectories only
    traj_probs = np.zeros(K)
    traj_probs[0:4] = .25

    mm.init_R_mat(traj_probs, traj_probs_weight=1)

    # Each row of R_ is a probability distribution over the K trajectories
    assert np.sum(np.isclose(np.ones(K), np.sum(mm.R_, 1))) == K, \
        "Unexpected R_ sum"

    # Hard (argmax) assignments should recover the generating trajectory for
    # each block of 10 observations
    traj_assignments = np.argmax(mm.R_, 1)
    assert np.sum(traj_assignments == \
                  np.array([0]*10 + [1]*10 + [2]*10)) == 30, \
                  "Unexpected trajectory assignments"
def test_update_z_logistic():
    """Test update_z with a binary (logistic) target.

    Builds a two-trajectory model whose coefficients are pinned to the known
    generating values, scrambles the responsibility matrix R_, and checks
    that a single update_z call restores the correct assignments.
    """
    data_file_name = os.path.split(os.path.realpath(__file__))[0] + \
        '/../resources/data/binary_data_3.csv'
    df = pd.read_csv(data_file_name)

    # Intercept, slope for group 1: 0, 50
    # Intercept, slope for group 2: 0, -50
    
    M = 2  # predictors: intercept + one covariate
    D = 1  # single (binary) target
    K = 2  # two candidate trajectories
    prior_data = {}
    prior_data['w_mu0'] = np.zeros([M, D])
    
    # Broad prior over the coefficients
    prior_data['w_var0'] = 100*np.ones([M, D])
    prior_data['lambda_a0'] = np.ones([D])
    prior_data['lambda_b0'] = np.ones([D])
    prior_data['alpha'] = 1

    mm = MultDPRegression(prior_data['w_mu0'], prior_data['w_var0'],
                          prior_data['lambda_a0'], prior_data['lambda_b0'],
                          1/df.shape[0], prior_data['alpha'], K=K)

    mm.N_ = df.shape[0]
    mm.target_type_[0] = 'binary'
    mm.num_binary_targets_ = 1
    # Gaussian-target precision parameters are irrelevant for a binary target
    mm.w_var_ = None
    mm.lambda_a_ = None
    mm.lambda_b_ = None    
    mm.X_ = df[['intercept', 'pred']].values
    mm.Y_ = np.atleast_2d(df.target.values).T
    mm.gb_ = None
    # Every row is its own group (no longitudinal grouping)
    mm.group_first_index_ = np.ones(mm.N_, dtype=bool)
    mm.w_covmat_ = np.ones([M, M, D, K])
    
    mm.init_traj_params()
    mm.v_a_ = np.ones(mm.K_)
    mm.v_b_ = mm.alpha_*np.ones(mm.K_)

    # Set w_mu_ to be correct
    mm.w_mu_[0, 0, 0] = 0
    mm.w_mu_[1, 0, 0] = 50
    mm.w_mu_[0, 0, 1] = 0
    mm.w_mu_[1, 0, 1] = -50
    
    # Set w_covmat_ to be correct. The 1e-50 scale makes the coefficient
    # posteriors effectively point masses at w_mu_.
    # NOTE(review): np.diag([M, M]) is a 2x2 diagonal with entries M, not an
    # identity — presumably intentional scaling; confirm.
    mm.w_covmat_ = np.zeros([M, M, D, K])
    mm.w_covmat_[:, :, 0, 0] = 1e-50*np.diag([M, M])
    mm.w_covmat_[:, :, 0, 1] = 1e-50*np.diag([M, M])    
    
    # Set R to be correct: first half of the rows assigned to trajectory 0,
    # second half to trajectory 1
    mm.R_ = np.zeros([mm.N_, mm.K_])
    mm.R_[0:int(mm.N_/2), 0] = 1
    mm.R_[int(mm.N_/2)::, 1] = 1

    mm.update_v()

    # Scramble R
    mm.R_[:, 0] = np.random.uniform(0.001, .999, mm.N_)
    mm.R_[:, 1] = 1. - mm.R_[:, 0]
    
    # test update_z
    R_updated = mm.update_z(mm.X_, mm.Y_)

    # Each half should be re-assigned to its generating trajectory with
    # probability ~1 (to within 1% relative tolerance)
    assert np.isclose(np.mean(R_updated[int(mm.N_/2)::], 0)[1], .999,
                      atol=0, rtol=.01), "R not updated correctly"
    assert np.isclose(np.mean(R_updated[0:int(mm.N_/2)], 0)[0], .999,
                      atol=0, rtol=.01), "R not updated correctly"