Example #1
0
def main():
    """Command-line entry point: generate provenance info for a file.

    Parses ``-f`` (the target file) and ``--aux_file_deps`` (an optional
    comma-separated list of auxiliary dependency files) and delegates to
    ``write_provenance_data``.
    """
    desc = """Generate provenance information for the specified file. Provenance
    information for file 'f' will be stored in 'f.provenance_info.xml'"""

    parser = ArgumentParser(description=desc)
    parser.add_argument('-f',
        help='File for which to generate provenance information', dest='f',
        metavar='<string>', default=None)
    parser.add_argument(
        '--aux_file_deps',
        help='In some cases, a file might depend on other data files but may \
        itself be generated by third-party software that makes it difficult \
        to seamlessly track provenance information across the processing \
        pipeline. By specifying here a comma-separated list of file names \
        (including the complete paths) on which a third-party processing \
        operation depends, we can partially track information necessary to \
        recreate the data file. Note, however, that this is not optimal. \
        Ideally, one would also want to track all details of the processing \
        operation itself (parameter settings, code version, etc), but this \
        can be challenging in certain special circumstances.',
        dest='aux_file_deps',
        metavar='<string>',
        default=None)

    op = parser.parse_args()

    # write_generator_info=False: record only the file (and declared aux
    # deps), not information about the generating program itself.
    write_provenance_data(op.f,
                          write_generator_info=False,
                          aux_file_deps=op.aux_file_deps)
Example #2
0
def main():
    """Command-line entry point: update an older pickled trajectory model.

    Reads a pickled model file, re-instantiates it via ``MultDPRegression``
    (so it matches the current implementation), and optionally writes the
    updated model plus provenance info to file.
    """

    desc = """Updates older models to ensure compatibility with latest 
    implementation"""

    parser = ArgumentParser(description=desc)
    parser.add_argument('--in_model',
                        help='Filename of input model to update',
                        metavar='<string>',
                        required=True)
    parser.add_argument('--out_model',
                        help='Filename of updated output model',
                        metavar='<string>')

    op = parser.parse_args()

    print("Reading model...")
    with open(op.in_model, 'rb') as f:
        mm_in = pickle.load(f)['MultDPRegression']

    print("Updating...")
    mm_out = MultDPRegression(mm_in)

    if op.out_model is not None:
        print("Saving updated model...")
        # Context manager ensures the output handle is closed promptly
        # (the original left it to be garbage collected).
        with open(op.out_model, 'wb') as out_f:
            pickle.dump({'MultDPRegression': mm_out}, out_f)

        print("Saving model provenance info...")
        provenance_desc = """ """
        write_provenance_data(op.out_model,
                              generator_args=op,
                              desc=provenance_desc,
                              module_name='bayes_traj')

    print("DONE.")
Example #3
0
def main():
    """Command-line entry point: assign individuals to trajectory subgroups.

    Reads a csv data file and a pickled trajectory model, assigns each
    individual to its most probable trajectory, and optionally writes the
    augmented data (with provenance info) to an output csv file.
    """
    desc = """Assigns individuals to trajectory subgroups using their data 
    contained the input csv file and a trajectory model. The individuals can be 
    different from those used to train the model. However, it is assumed that 
    the predictor names and target names, as well as the groupby name, match."""

    # Bug fix: desc was previously passed positionally, which set the
    # parser's 'prog' attribute instead of its description.
    args = ArgumentParser(description=desc)
    args.add_argument('--in_csv', help='Input csv data file. Individuals in \
        this file will be assigned to the best trajectory', required=True,
        type=str)
    args.add_argument('--gb', help='Subject identifier column name in the \
        input data file to use for grouping. If none specified, an attempt \
        will be made to get this from the input model. However, there may be a \
        mismatch between the subject identifier stored in the model and the \
        appropriate column in the input data. If this is the case, this flag \
        should be used.', required=False, type=str)
    args.add_argument('--model', help='Pickled trajectory model to use for \
        assigning data instances to trajectories', type=str, required=True)
    args.add_argument('--out_csv', help='Output csv file with data instances \
        assigned to trajectories. The output csv file will be identical to the \
        input csv file, but it will additionally have a traj column indicating \
        the trajectory number with the highest assignment probability. It will \
        also contain columns with the traj_ prefix, followed by a number. These \
        columns contain the probability of assignment to the corresponding \
        trajectory.', type=str, default=None)
    args.add_argument('--traj_map', help='The default trajectory numbering \
        scheme is somewhat arbitrary. Use this flag to provide a mapping \
        between the default trajectory numbers and a desired numbering scheme. \
        Provide as a comma-separated list of hyphenated mappings. \
        E.g.: 3-1,18-2,7-3 would indicate a mapping from 3 to 1, from 18 to 2, \
        and from 7 to 3. Original trajectory values not used in the mapping \
        will be reassigned to NaNs ', type=str, default=None)

    op = args.parse_args()

    print("Reading data...")
    df = pd.read_csv(op.in_csv)

    print("Reading model...")
    with open(op.model, 'rb') as f:
        mm = pickle.load(f)['MultDPRegression']

    # Build the trajectory renumbering map: user-specified, or the identity
    # map over the model's significant trajectories.
    traj_map = {}
    if op.traj_map is not None:
        for mapping in op.traj_map.split(','):
            parts = mapping.split('-')
            traj_map[int(parts[0])] = int(parts[1])
    else:
        for tt in np.where(mm.sig_trajs_)[0]:
            traj_map[tt] = tt

    print("Assigning...")
    # Grouping column: explicit flag wins; otherwise fall back to the
    # grouping stored in the model (when present).
    groupby_col = None
    if op.gb is not None:
        groupby_col = op.gb
    elif mm.gb_ is not None:
        groupby_col = mm.gb_.count().index.name

    df_out = mm.augment_df_with_traj_info(mm.target_names_,
        mm.predictor_names_, df, groupby_col)
    df_out.replace({'traj': traj_map}, inplace=True)

    if op.out_csv is not None:
        print("Saving data with trajectory info...")
        df_out.to_csv(op.out_csv, index=False)

        print("Saving data file provenance info...")
        provenance_desc = """ """
        write_provenance_data(op.out_csv, generator_args=op,
                              desc=provenance_desc,
                              module_name='bayes_traj')

    print("DONE.")
Example #4
0
def _load_prior_data(prior_path, targets, K):
    """Read prior settings from the pickled prior file.

    Parameters
    ----------
    prior_path : str
        Path to the pickled prior file.
    targets : list of str
        Target variable names (order defines the D axis).
    K : int
        Requested truncation level; overridden by the prior when it
        contains 'v_a' (K is taken from its first dimension).

    Returns
    -------
    (prior_data, preds, K) : (dict, list of str, int)
        Prior arrays/values keyed by name, the predictor names read from
        the prior file, and the (possibly updated) truncation level.
    """
    with open(prior_path, 'rb') as f:
        prior_file_info = pickle.load(f)

    preds = get_pred_names_from_prior_info(prior_file_info)

    D = len(targets)
    M = len(preds)

    # Start with everything absent; fill in defaults for the base prior
    # arrays, then overwrite from the file where available.
    prior_data = {name: None for name in
                  ['v_a', 'v_b', 'w_mu', 'w_var', 'lambda_a', 'lambda_b',
                   'traj_probs', 'probs_weight', 'alpha']}
    prior_data['w_mu0'] = np.zeros([M, D])
    prior_data['w_var0'] = np.ones([M, D])
    prior_data['lambda_a0'] = np.ones([D])
    prior_data['lambda_b0'] = np.ones([D])

    if 'v_a' in prior_file_info.keys():
        prior_data['v_a'] = prior_file_info['v_a']
        if prior_file_info['v_a'] is not None:
            # The prior fixes the truncation level
            K = prior_file_info['v_a'].shape[0]
            print("Using K={} (from prior)".format(K))
    if 'v_b' in prior_file_info.keys():
        prior_data['v_b'] = prior_file_info['v_b']

    # Allocate per-trajectory arrays only when the prior actually carries
    # the corresponding information; they are filled in below.
    if prior_file_info.get('w_mu') is not None:
        prior_data['w_mu'] = np.zeros([M, D, K])
    if prior_file_info.get('w_var') is not None:
        prior_data['w_var'] = np.ones([M, D, K])
    if prior_file_info.get('lambda_a') is not None:
        prior_data['lambda_a'] = np.ones([D, K])
    if prior_file_info.get('lambda_b') is not None:
        prior_data['lambda_b'] = np.ones([D, K])
    if 'traj_probs' in prior_file_info.keys():
        prior_data['traj_probs'] = prior_file_info['traj_probs']

    prior_data['alpha'] = prior_file_info['alpha']
    for (d, target) in enumerate(targets):
        prior_data['lambda_a0'][d] = prior_file_info['lambda_a0'][target]
        prior_data['lambda_b0'][d] = prior_file_info['lambda_b0'][target]

        if prior_data['lambda_a'] is not None:
            prior_data['lambda_a'][d, :] = \
                prior_file_info['lambda_a'][target]
        if prior_data['lambda_b'] is not None:
            prior_data['lambda_b'][d, :] = \
                prior_file_info['lambda_b'][target]

        for (m, pred) in enumerate(preds):
            # NOTE: the file stores w_mu0/w_var0 keyed [target][pred], but
            # w_mu/w_var keyed [pred][target] — preserved from the original.
            prior_data['w_mu0'][m, d] = \
                prior_file_info['w_mu0'][target][pred]
            prior_data['w_var0'][m, d] = \
                prior_file_info['w_var0'][target][pred]
            if prior_data['w_mu'] is not None:
                prior_data['w_mu'][m, d, :] = \
                    prior_file_info['w_mu'][pred][target]
            if prior_data['w_var'] is not None:
                prior_data['w_var'][m, d, :] = \
                    prior_file_info['w_var'][pred][target]

    return prior_data, preds, K


def _save_model_and_data(mm, op):
    """Persist the fitted model and/or trajectory-augmented data.

    Writes the model pickle and/or output csv (as requested via op) and
    records provenance information for each file written.
    """
    if op.out_model is not None:
        print("Saving model...")
        with open(op.out_model, 'wb') as f:
            pickle.dump({'MultDPRegression': mm}, f)

        print("Saving model provenance info...")
        provenance_desc = """ """
        write_provenance_data(op.out_model,
                              generator_args=op,
                              desc=provenance_desc,
                              module_name='bayes_traj')

    if op.out_csv is not None:
        print("Saving data file with trajectory info...")
        mm.to_df().to_csv(op.out_csv, index=False)

        print("Saving data file provenance info...")
        provenance_desc = """ """
        write_provenance_data(op.out_csv,
                              generator_args=op,
                              desc=provenance_desc,
                              module_name='bayes_traj')


def main():
    """Command-line entry point: run Bayesian trajectory analysis.

    Reads prior settings and input data, fits a MultDPRegression model
    (optionally over multiple repeats, keeping the model with the best
    WAIC2 score), and writes the model and/or trajectory-augmented data
    to file along with provenance info.
    """
    np.set_printoptions(precision=1,
                        suppress=True,
                        threshold=1e6,
                        linewidth=300)

    desc = """Runs Bayesian trajectory analysis on the specified data file \
    with the specified predictors and target variables"""

    parser = ArgumentParser(description=desc)
    parser.add_argument('--in_csv',
                        help='Input csv file containing data on \
        which to run Bayesian trajectory analysis',
                        metavar='<string>',
                        required=True)
    parser.add_argument('--targets',
                        help='Comma-separated list of target \
        names. Must appear as column names of the input data file.',
                        dest='targets',
                        metavar='<string>',
                        required=True)
    parser.add_argument('--groupby',
                        help='Column name in input data file \
        indicating those data instances that must be in the same trajectory. \
        This is typically a subject identifier (e.g. in the case of a \
        longitudinal data set).',
                        dest='groupby',
                        metavar='<string>',
                        default=None)
    parser.add_argument('--out_csv',
                        help='If specified, an output csv file \
        will be generated that contains the contents of the input csv file, \
        but with additional columns indicating trajectory assignment \
        information for each data instance. There will be a column called traj \
        with an integer value indicating the most probable trajectory \
        assignment. There will also be columns prefixed with traj_ and then a \
        trajectory-identifying integer. The values of these columns indicate \
        the probability that the data instance belongs to each of the \
        corresponding trajectories.',
                        dest='out_csv',
                        metavar='<string>',
                        type=str,
                        default=None)
    parser.add_argument('--prior',
                        help='Input pickle file containing prior \
        settings',
                        metavar='<string>',
                        required=True)
    parser.add_argument('--prec_prior_weight',
                        help='A floating point value \
        indicating how much weight to put on the prior over the residual \
        precisions. Higher values mean that more weight will be given to the \
        prior',
                        metavar='<float>',
                        type=float,
                        default=0.25)
    parser.add_argument('--alpha',
                        help='If specified, over-rides the value in \
        the prior file',
                        dest='alpha',
                        metavar='<float>',
                        default=None)
    parser.add_argument('--out_model',
                        help='Pickle file name. If specified, \
        the model object will be written to this file.',
                        dest='out_model',
                        metavar='<string>',
                        default=None,
                        required=False)
    parser.add_argument('--iters',
                        help='Number of inference iterations',
                        dest='iters',
                        metavar='<int>',
                        default=100)
    parser.add_argument('--repeats',
                        help='Number of repeats to attempt. If a \
        value greater than 1 is specified, the WAIC2 fit criterion will be \
        computed at the end of each repeat. If, for a given repeat, the WAIC2 \
        score is lower than the lowest score seen at that point, the model \
        will be saved to file.',
                        type=int,
                        metavar='<int>',
                        default=1)
    parser.add_argument('-k',
                        help='Number of columns in the truncated \
        assignment matrix',
                        metavar='<int>',
                        default=30)
    parser.add_argument('--prob_thresh',
                        help='If during data fitting the \
        probability of a data instance belonging to a given trajectory drops \
        below this threshold, then the probabality of that data instance \
        belonging to the trajectory will be set to 0',
                        metavar='<float>',
                        type=float,
                        default=0.001)
    parser.add_argument("--verbose",
                        help="Display per-trajectory counts \
        during optimization",
                        action="store_true")
    parser.add_argument('--probs_weight',
                        help='Value between 0 and 1 that \
        controls how much weight to assign to traj_probs, the marginal \
        probability of observing each trajectory. This value is only meaningful \
        if traj_probs has been set in the input prior file. Otherwise, it has no \
        effect. Higher values place more weight on the model-derived probabilities \
        and reflect a stronger belief in those assignment probabilities.',
                        dest='probs_weight',
                        metavar='<float>',
                        type=float,
                        default=None)
    parser.add_argument('--weights_only',
                        help='Setting this flag will force \
        the fitting routine to only optimize the trajectory weights. The \
        assumption is that the specified prior file contains previously \
        modeled trajectory information, and that those trajectories should be \
        used for the current fit. This option can be useful if a model \
        learned from one cohort is applied to another cohort, where it is \
        possible that the relative proportions of different trajectory \
        subgroups differs. By using this flag, the proportions of previously \
        determined trajectory subgroups will be determined for the current \
        data set.',
                        action='store_true')

    op = parser.parse_args()
    iters = int(op.iters)
    repeats = int(op.repeats)
    targets = op.targets.split(',')

    # NOTE(review): probs_weight is deliberately forced to None here (the
    # original commented out op.probs_weight), so this validation is
    # currently inert; op.probs_weight is still forwarded to fit() below.
    probs_weight = None
    if probs_weight is not None:
        assert probs_weight >= 0 and probs_weight <= 1, \
            "Invalid probs_weight value"

    #---------------------------------------------------------------------------
    # Get priors from file
    #---------------------------------------------------------------------------
    print("Reading prior...")
    prior_data, preds, K = _load_prior_data(op.prior, targets, int(op.k))

    # A command-line alpha overrides the value from the prior file.
    if op.alpha is not None:
        prior_data['alpha'] = float(op.alpha)

    print("Reading data...")
    df = pd.read_csv(op.in_csv)

    # Rows with NaN predictor values cannot be fit; drop them with a warning.
    if np.sum(np.isnan(np.sum(df[preds].values, 1))) > 0:
        print("Warning: identified NaNs in predictor set. \
        Proceeding with non-NaN data")
        df = df.dropna(subset=preds).reset_index()

    #---------------------------------------------------------------------------
    # Set up and run the traj alg
    #---------------------------------------------------------------------------
    best_waic2 = sys.float_info.max

    print("Fitting...")
    for r in np.arange(repeats):
        if r > 0:
            print("---------- Repeat {}, Best WAIC2: {} ----------".\
                  format(r, best_waic2))

        mm = MultDPRegression(prior_data['w_mu0'],
                              prior_data['w_var0'],
                              prior_data['lambda_a0'],
                              prior_data['lambda_b0'],
                              op.prec_prior_weight,
                              prior_data['alpha'],
                              K=K,
                              prob_thresh=op.prob_thresh)

        mm.fit(target_names=targets,
               predictor_names=preds,
               df=df,
               groupby=op.groupby,
               iters=iters,
               verbose=op.verbose,
               traj_probs=prior_data['traj_probs'],
               traj_probs_weight=op.probs_weight,
               v_a=prior_data['v_a'],
               v_b=prior_data['v_b'],
               w_mu=prior_data['w_mu'],
               w_var=prior_data['w_var'],
               lambda_a=prior_data['lambda_a'],
               lambda_b=prior_data['lambda_b'],
               weights_only=op.weights_only)

        if r == 0:
            # Always save the first fit; start tracking WAIC2 only when
            # multiple repeats were requested.
            _save_model_and_data(mm, op)
            if repeats > 1:
                best_waic2 = compute_waic2(mm)
        else:
            # Save only when this repeat beats the best score so far.
            waic2 = compute_waic2(mm)
            if waic2 < best_waic2:
                best_waic2 = waic2
                _save_model_and_data(mm, op)

    print("DONE.")
Example #5
0
def main():
    """Command-line entry point: visualize trajectories in a fitted model.

    Plots the specified target variable against the specified predictor for
    the trajectories in a pickled MultDPRegression model, either
    interactively or to a figure file (with provenance info).
    """
    desc = """"""

    parser = ArgumentParser(description=desc)
    parser.add_argument('--model',
                        help='Model containing trajectories to visualize',
                        type=str,
                        required=True)
    parser.add_argument('--y_axis',
                        help='Name of the target variable that will \
        be plotted on the y-axis',
                        type=str,
                        required=True)
    parser.add_argument('--y_label',
                        help='Label to display on y-axis. If none \
        given, the variable name specified with the y_axis flag will be used.',
                        type=str,
                        default=None)
    parser.add_argument('--x_axis',
                        help='Name of the predictor variable that will \
        be plotted on the x-axis',
                        type=str,
                        required=True)
    parser.add_argument('--x_label',
                        help='Label to display on x-axis. If none \
        given, the variable name specified with the x_axis flag will be used.',
                        type=str,
                        default=None)
    parser.add_argument('--trajs',
                        help='Comma-separated list of trajectories to \
        plot. If none specified, all trajectories will be plotted.',
                        type=str,
                        default=None)
    parser.add_argument('--min_traj_prob',
                        help='The probability of a given \
        trajectory must be at least this value in order to be rendered. Value \
        should be between 0 and 1 inclusive.',
                        type=float,
                        default=0)
    parser.add_argument('--max_traj_prob',
                        help='The probability of a given \
        trajectory can not be larger than this value in order to be rendered. \
        Value should be between 0 and 1 inclusive.',
                        type=float,
                        default=1.01)
    parser.add_argument('--fig_file',
                        help='If specified, will save the figure to \
        file.',
                        type=str,
                        default=None)
    parser.add_argument('--traj_map',
                        help='The default trajectory numbering \
        scheme is somewhat arbitrary. Use this flag to provide a mapping \
        between the default trajectory numbers and a desired numbering scheme. \
        Provide as a comma-separated list of hyphenated mappings. \
        E.g.: 3-1,18-2,7-3 would indicate a mapping from 3 to 1, from 18 to 2, \
        and from 7 to 3. Only the default trajectories in the mapping will be \
        plotted. If this flag is specified, it will override --trajs',
                        type=str,
                        default=None)

    op = parser.parse_args()

    # Optional renumbering of trajectories for display.
    traj_map = None
    if op.traj_map is not None:
        traj_map = {}
        for mapping in op.traj_map.split(','):
            parts = mapping.split('-')
            traj_map[int(parts[0])] = int(parts[1])

    with open(op.model, 'rb') as f:
        mm = pickle.load(f)['MultDPRegression']

    assert op.x_axis in mm.predictor_names_, \
        'x-axis variable not among model predictor variables'
    assert op.y_axis in mm.target_names_, \
        'y-axis variable not among model target variables'

    # Show interactively only when no output file was requested.
    show = op.fig_file is None

    # The two branches differ only in whether an explicit trajectory subset
    # is passed; shared keyword arguments are built once.
    plot_kwargs = {'show': show,
                   'min_traj_prob': op.min_traj_prob,
                   'max_traj_prob': op.max_traj_prob,
                   'traj_map': traj_map}
    if op.trajs is not None:
        ax = mm.plot(op.x_axis, op.y_axis, op.x_label, op.y_label,
                     np.array(op.trajs.split(','), dtype=int),
                     **plot_kwargs)
    else:
        ax = mm.plot(op.x_axis, op.y_axis, op.x_label, op.y_label,
                     **plot_kwargs)

    if op.fig_file is not None:
        print("Saving figure...")
        plt.savefig(op.fig_file)
        print("Writing provenance info...")
        write_provenance_data(op.fig_file,
                              generator_args=op,
                              desc=""" """,
                              module_name='bayes_traj')
        print("DONE.")
Example #6
0
def main():
    """Command-line entry point: generate a Bayesian trajectory prior file.

    Builds prior information from command-line settings, optionally informed
    by an input data file and/or a previously fit model, applies targeted
    user overrides, prints a summary, and pickles the result (with
    provenance info).
    """
    desc = """Generates a pickled file containing Bayesian trajectory prior 
    information"""

    parser = ArgumentParser(description=desc)
    parser.add_argument('--preds', help='Comma-separated list of predictor names',
        dest='preds', type=str, default=None)
    parser.add_argument('--targets', help='Comma-separated list of target names',
        dest='targets', type=str, default=None)
    parser.add_argument('--out_file', help='Output (pickle) file that will \
        contain the prior', dest='out_file', type=str, default=None)
    parser.add_argument('--tar_resid', help='Use this flag to specify the residual \
        precision mean and variance for the corresponding target value. Specify as \
        a comma-separated tuple: target_name,mean,var. Note that precision is the \
        inverse of the variance. Only applies to continuous targets', type=str,
        default=None, action='append', nargs='+')
    parser.add_argument('--coef', help='Coefficient prior for a specified \
        target and predictor. Specify as a comma-separated tuple: \
        target_name,predictor_name,mean,std', type=str,
        default=None, action='append', nargs='+')
    parser.add_argument('--coef_std', help='Coefficient prior standard deviation \
        for a specified target and predictor. Specify as a comma-separated tuple: \
        target_name,predictor_name,std', type=str, default=None,
        action='append', nargs='+')
    parser.add_argument('--in_data', help='If a data file is specified, it will be \
        read in and used to set reasonable prior values using regression. It \
        is assumed that the file contains data columns with names corresponding \
        to the predictor and target names specified on the command line.',
        type=str, default=None)
    parser.add_argument('--num_trajs', help='Rough estimate of the number of \
        trajectories expected in the data set. Can be specified as a single \
        value or as a dash-separated range, such as 4-6. If a single value is \
        specified, a range will be assumed as -1 to +1 the specified value.',
        type=str, default='3')
    parser.add_argument('--model', help='Pickled bayes_traj model that \
        has been fit to data and from which information will be extracted to \
        produce an updated prior file', type=str, default=None)
    parser.add_argument('--groupby', help='Column name in input data file \
        indicating those data instances that must be in the same trajectory. This \
        is typically a subject identifier (e.g. in the case of a longitudinal data \
        set).', dest='groupby', metavar='<string>', default=None)

    op = parser.parse_args()

    preds = op.preds.split(',')
    targets = op.targets.split(',')

    pg = PriorGenerator(targets, preds)

    #---------------------------------------------------------------------------
    # Set the number of trajs
    #---------------------------------------------------------------------------
    # A range like '4-6' sets the bounds directly; a single value v is
    # interpreted as the range (v-1, v+1), floored at a small positive value.
    tmp = op.num_trajs.split('-')
    if len(tmp) > 1:
        pg.min_num_trajs_ = float(tmp[0])
        pg.max_num_trajs_ = float(tmp[1])
    else:
        pg.min_num_trajs_ = np.max([0.001, float(tmp[0]) - 1])
        pg.max_num_trajs_ = float(tmp[0]) + 1

    #---------------------------------------------------------------------------
    # Read in and process data and models as available
    #---------------------------------------------------------------------------
    if op.model is not None:
        with open(op.model, 'rb') as f:
            print("Reading model...")
            mm = pickle.load(f)['MultDPRegression']
            pg.set_model(mm)

    if op.in_data is not None:
        print("Reading data...")
        pg.set_data(pd.read_csv(op.in_data), op.groupby)

    pg.compute_prior_info()
    prior_info = copy.deepcopy(pg.prior_info_)

    #---------------------------------------------------------------------------
    # Override prior settings with user-specified preferences
    #---------------------------------------------------------------------------
    if op.tar_resid is not None:
        for entry in op.tar_resid:
            parts = entry[0].split(',')
            tt = parts[0]
            mean_tmp = float(parts[1])
            var_tmp = float(parts[2])
            assert tt in targets, "{} not among specified targets".format(tt)

            # Gamma parameterization: precision mean = a/b, variance = a/b^2
            # => a = mean^2/var, b = mean/var.
            prior_info['lambda_b0'][tt] = mean_tmp/var_tmp
            prior_info['lambda_a0'][tt] = (mean_tmp**2)/var_tmp

    if op.coef is not None:
        for entry in op.coef:
            parts = entry[0].split(',')
            tt = parts[0]
            pp = parts[1]
            m = float(parts[2])
            s = float(parts[3])

            assert tt in targets, "{} not among specified targets".format(tt)
            assert pp in preds, "{} not among specified predictors".format(pp)

            prior_info['w_mu0'][tt][pp] = m
            prior_info['w_var0'][tt][pp] = s**2

    if op.coef_std is not None:
        for entry in op.coef_std:
            parts = entry[0].split(',')
            tt = parts[0]
            pp = parts[1]
            s = float(parts[2])

            assert tt in targets, "{} not among specified targets".format(tt)
            assert pp in preds, "{} not among specified predictors".format(pp)

            prior_info['w_var0'][tt][pp] = s**2

    #---------------------------------------------------------------------------
    # Summarize prior info and save to file
    #---------------------------------------------------------------------------
    print('---------- Prior Info ----------')
    print('alpha: {:.2e}'.format(prior_info['alpha']))
    for tt in targets:
        print(" ")
        if prior_info['lambda_a0'][tt] is not None:
            prec_mean = prior_info['lambda_a0'][tt]/\
                prior_info['lambda_b0'][tt]
            prec_var = prior_info['lambda_a0'][tt]/\
                (prior_info['lambda_b0'][tt]**2)
            print("{} residual (precision mean, precision variance): \
            ({:.2e}, {:.2e})".format(tt, prec_mean, prec_var))
        for pp in preds:
            tmp_mean = prior_info['w_mu0'][tt][pp]
            tmp_std = np.sqrt(prior_info['w_var0'][tt][pp])
            print("{} {} (mean, std): ({:.2e}, {:.2e})".\
                  format(tt, pp, tmp_mean, tmp_std))

    if op.out_file is not None:
        # Context manager ensures the output handle is closed promptly
        # (the original left it to be garbage collected).
        with open(op.out_file, 'wb') as f:
            pickle.dump(prior_info, f)
        provenance_desc = """ """
        write_provenance_data(op.out_file, generator_args=op,
                              desc=provenance_desc,
                              module_name='bayes_traj')
def main():
    """CLI entry point: scatter-plot a data file and overlay prior draws.

    Reads the input data file, infers whether the plotted target is Gaussian
    or binary (binary iff every observed, non-NaN value is 0 or 1), draws
    trajectory coefficients from the specified prior, and plots the resulting
    curves on top of the raw data. Optionally saves the figure (with
    provenance info) instead of showing it interactively.

    Note: a prior file (--prior) is effectively required — `targets`, `preds`,
    `M`, and `prior_data` are only defined when it is given.
    """
    desc = """Produces a scatter plot of the data contained in the input data 
    file as well as plots of random draws from the prior. This is useful to 
    inspect whether the prior appropriately captures prior belief."""

    parser = ArgumentParser(description=desc)
    parser.add_argument('--data_file', help='Input data file', type=str,
        default=None)
    parser.add_argument('--prior', help='Input prior file', type=str,
        default=None)
    parser.add_argument('--num_draws', help='Number of random draws to take \
        from prior', type=int, default=10)
    parser.add_argument('--y_axis', help='Name of the target variable that \
        will be plotted on the y-axis', type=str, default=None)
    parser.add_argument('--y_label', help='Label to display on y-axis. If none \
        given, the variable name specified with the y_axis flag will be used.',
        type=str, default=None)
    parser.add_argument('--x_axis', help='Name of the predictor variable that \
        will be plotted on the x-axis', type=str, default=None)
    parser.add_argument('--x_label', help='Label to display on x-axis. If none \
        given, the variable name specified with the x_axis flag will be used.',
        type=str, default=None)
    parser.add_argument('--ylim', help='Comma-separated tuple to set the \
        limits of display for the y-axis', type=str, default=None)
    parser.add_argument('--hide_resid', help='If set, shaded regions \
        corresponding to residual spread will not be displayed. This can be \
        useful to reduce visual clutter. Only relevant for continuous target \
        variables.', action='store_true')
    parser.add_argument('--fig_file', help='File name where figure will be \
        saved', type=str, default=None)

    op = parser.parse_args()

    df = pd.read_csv(op.data_file)

    # The target is treated as binary iff all observed (non-NaN) values
    # are in {0, 1}; otherwise it is modeled as Gaussian.
    nonnan_ids = ~np.isnan(df[op.y_axis].values)
    target_type = 'gaussian'
    if set(df[op.y_axis].values[nonnan_ids]).issubset({1.0, 0.0}):
        target_type = 'binary'

    if op.prior is not None:
        with open(op.prior, 'rb') as f:
            prior_file_info = pickle.load(f)

            targets = get_target_names_from_prior_info(prior_file_info)
            preds = get_pred_names_from_prior_info(prior_file_info)

            D = len(targets)  # Number of target dimensions
            M = len(preds)    # Number of predictors

            # Repackage the name-keyed prior dicts into dense arrays
            # ([predictor, target] for coefficients, [target] for the
            # residual-precision Gamma parameters) for use by the sampler.
            prior_data = {}
            prior_data['w_mu0'] = np.zeros([M, D])
            prior_data['w_var0'] = np.ones([M, D])
            prior_data['lambda_a0'] = np.ones([D])
            prior_data['lambda_b0'] = np.ones([D])

            for (d, target) in enumerate(targets):
                prior_data['lambda_a0'][d] = \
                    prior_file_info['lambda_a0'][target]
                prior_data['lambda_b0'][d] = \
                    prior_file_info['lambda_b0'][target]

                for (m, pred) in enumerate(preds):
                    prior_data['w_mu0'][m, d] = \
                        prior_file_info['w_mu0'][target][pred]
                    prior_data['w_var0'][m, d] = \
                        prior_file_info['w_var0'][target][pred]

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(df[op.x_axis].values, df[op.y_axis].values, facecolor='none',
                edgecolor='k', alpha=0.2)

    num_dom_locs = 100
    x_dom = np.linspace(np.min(df[op.x_axis].values), \
                        np.max(df[op.x_axis].values), num_dom_locs)

    # Loop-invariant: index of the plotted target within the prior's targets.
    target_index = np.where(np.array(targets) == op.y_axis)[0][0]

    for nn in range(op.num_draws):
        if target_type == 'gaussian':
            # Draw a residual precision from Gamma(shape, scale) and convert
            # it to a standard deviation for the shaded +/- 2*std band below.
            scale = 1./prior_data['lambda_b0'][target_index]
            shape = prior_data['lambda_a0'][target_index]
            std = np.sqrt(1./np.random.gamma(shape, scale, size=1))

        co = sample_cos(prior_data['w_mu0'],
                        prior_data['w_var0'])[:, target_index, 0]

        # Build the design matrix over the x-domain. Predictor names may
        # encode powers ('age^2') or interactions ('a*b'); predictors other
        # than the x-axis variable are held at their mean observed value.
        X_tmp = np.ones([num_dom_locs, M])
        for (inc, pp) in enumerate(preds):
            tmp_pow = pp.split('^')
            tmp_int = pp.split('*')

            if len(tmp_pow) > 1:
                if op.x_axis in tmp_pow:
                    X_tmp[:, inc] = x_dom**(int(tmp_pow[-1]))
                else:
                    X_tmp[:, inc] = np.mean(df[tmp_pow[0]].values)**\
                        (int(tmp_pow[-1]))
            elif len(tmp_int) > 1:
                if op.x_axis in tmp_int:
                    # NOTE(review): raising x_dom to the power of the other
                    # factor's mean looks suspect for an interaction term
                    # (x_dom * mean would be expected) — confirm intent.
                    X_tmp[:, inc] = \
                        x_dom**np.mean(df[tmp_int[np.where(np.array(tmp_int) \
                                    != op.x_axis)[0][0]]].values)
                else:
                    X_tmp[:, inc] = np.mean(df[tmp_int[0]])*\
                        np.mean(df[tmp_int[1]])
            elif pp == op.x_axis:
                X_tmp[:, inc] = x_dom
            else:
                X_tmp[:, inc] = np.mean(df[tmp_pow[0]].values)

        if target_type == 'gaussian':
            y_tmp = np.dot(co, X_tmp.T)
        elif target_type == 'binary':
            # Logistic link for binary targets.
            y_tmp = np.exp(np.dot(co, X_tmp.T))/\
                (1 + np.exp(np.dot(co, X_tmp.T)))

        ax.plot(x_dom, y_tmp)
        if target_type == 'gaussian' and not op.hide_resid:
            ax.fill_between(x_dom, y_tmp-2*std, y_tmp+2*std, alpha=0.3)

    x_label = op.x_label if op.x_label is not None else op.x_axis
    y_label = op.y_label if op.y_label is not None else op.y_axis
    ax.set_xlabel(x_label, fontsize=16)
    ax.set_ylabel(y_label, fontsize=16)
    if op.ylim is not None:
        # Parse the limits as a plain "lo,hi" pair. (The previous
        # implementation ran the string through .strip('--'), which stripped
        # leading minus signs and silently broke negative y-limits.)
        ylim_lo, ylim_hi = [float(v) for v in op.ylim.split(',')[:2]]
        ax.set_ylim(ylim_lo, ylim_hi)

    if op.fig_file is not None:
        print("Saving figure...")
        plt.savefig(op.fig_file)
        print("Writing provenance info...")
        write_provenance_data(op.fig_file, generator_args=op, desc=""" """,
                              module_name='bayes_traj')
        print("DONE.")
    else:
        plt.show()
def main():
    """CLI entry point: synthesize quadratic trajectory data for testing.

    For each --traj_params specification, generates the requested number of
    synthetic subjects with quadratic (intercept, age, age^2) trajectories,
    Gaussian residual noise (or Bernoulli draws if resid_std is 'NA'),
    concatenates them into one DataFrame, plots the result, and optionally
    writes the data (with provenance info) to --out_file.
    """
    desc = """Generates an arbitrary number of quadratic trajectories. Useful 
    for testing purposes. The x-axis is referred to as 'age' throughout. The 
    context of this script mimics a longitudinal study in which individuals are 
    recruited and then followed for a specified number of visits, spread apart 
    by a specified number of time."""

    parser = ArgumentParser(description=desc)
    parser.add_argument('--traj_params',
                        help='Tuple specifying the trajectory \
        shape, residual noise, and number of subjects. Can be used multiple \
        times. Specify as: <intercept,age,age^2,resid_std,num>. If resid_std \
        is specified as NA, then the target variable will be assumed binary.',
                        type=str,
                        default=None,
                        action='append',
                        nargs='+')
    parser.add_argument('--enrollment',
                        help='Comma-separated tuple: min age, max \
        age. This specifies the range of randomly generated ages correpsonding to \
        a synthetically generated individuals baseline age',
                        dest='enrollment',
                        default=None)
    parser.add_argument('--visit_span',
                        help='Num years between successive visits',
                        dest='visit_span',
                        type=float,
                        default=None)
    parser.add_argument('--max_age',
                        help='No subject age will be above this amount',
                        dest='max_age',
                        type=float,
                        default=95)
    parser.add_argument('--num_visits',
                        help='Number of longitudinal visits \
        per individual. Note that the actual number for an individual may be less \
        than this if the generated age for a visit is greater than max_age',
                        dest='num_visits',
                        type=int,
                        default=1)
    parser.add_argument('--out_file',
                        help='Output data file name. Columns \
        include: intercept, x, x^2, y, id, data_names, traj.',
                        default=None)

    op = parser.parse_args()

    # Baseline (enrollment) ages are drawn uniformly from this range.
    enrollment_min = int(op.enrollment.split(',')[0])
    enrollment_max = int(op.enrollment.split(',')[1])

    df_out = pd.DataFrame()

    subj_inc = 0  # Running subject ID across all trajectories
    traj_inc = 0  # Running trajectory ID

    plt.figure(figsize=(8, 8))
    print("Generating trajectories...")
    for tp in op.traj_params:
        traj_inc += 1

        # A single --traj_params value may describe several target
        # dimensions, separated by '><'.
        traj_dim_cos = tp[0].split('><')
        D = len(traj_dim_cos)
        # The subject count is the 5th comma-separated field (taken from the
        # first dimension's tuple).
        num_in_traj = int(traj_dim_cos[0].strip('>').split(',')[4])

        for s in range(num_in_traj):
            subj_inc += 1
            df_tmp = pd.DataFrame()

            enrollment_age = np.random.uniform(enrollment_min, enrollment_max)

            # Visits are evenly spaced from the enrollment age; any visit
            # that would exceed max_age is dropped.
            ages_tmp = np.linspace(enrollment_age, \
                                   enrollment_age + op.visit_span*(op.num_visits-1),
                                   op.num_visits)
            ids_tmp = ages_tmp <= op.max_age
            ages = ages_tmp[ids_tmp]

            df_tmp['intercept'] = np.ones(ages.shape[0])
            df_tmp['age'] = ages
            df_tmp['age^2'] = ages**2
            df_tmp['id'] = str(subj_inc)
            df_tmp['data_names'] = [str(subj_inc) + '_' + \
                                    str(j) for j in range(ages.shape[0])]
            df_tmp['traj_gt'] = traj_inc

            for i, dd in enumerate(traj_dim_cos):
                # First three fields: intercept, age, age^2 coefficients.
                cos = np.array(dd.strip('<').strip('>').split(',')[0:3],
                               dtype=float)
                # Fourth field: residual std, or 'NA' for a binary target.
                tmp = dd.strip('<').strip('>').split(',')[3]
                if tmp.lower() == 'na':
                    # Binary target: Bernoulli draws with logistic mean.
                    dot_tmp = np.dot(cos, \
                        df_tmp[['intercept', 'age', 'age^2']].values.T)
                    mu = np.exp(dot_tmp) / (1 + np.exp(dot_tmp))
                    y = np.random.binomial(1, mu, ages.shape[0])
                else:
                    traj_resid_std = float(tmp)

                    y = np.dot(cos, \
                        df_tmp[['intercept', 'age', 'age^2']].values.T) + \
                        traj_resid_std*np.random.randn(ages.shape[0])

                df_tmp['y{}'.format(i + 1)] = y

            df_out = pd.concat([df_out, df_tmp])

    # Choose a qualitative colormap large enough for the number of generated
    # trajectories. (A duplicate of this selection that previously ran before
    # the generation loop — keyed on len(op.traj_params) — was dead code:
    # cmap was always recomputed here before its first use.)
    if traj_inc <= 10:
        cmap = plt.cm.get_cmap('tab10')
    else:
        cmap = plt.cm.get_cmap('tab20')

    # One subplot per target dimension; note that axs is a bare Axes (not an
    # array) when D == 1, hence the branch below.
    fig, axs = plt.subplots(1, D, figsize=(6 * D, 6))
    for d in range(D):
        for tt in range(traj_inc):
            ids = df_out.traj_gt.values == tt + 1
            num_in_traj = df_out[ids].groupby('id').ngroups
            if D > 1:
                axs[d].scatter(df_out[ids].age.values,
                               df_out[ids]['y{}'.format(d + 1)].values,
                               edgecolor='k',
                               color=cmap(tt),
                               alpha=0.5,
                               label='Traj {} (N={})'.format(
                                   tt + 1, num_in_traj))
                axs[d].set_xlabel('Age')
                axs[d].set_ylabel('y{}'.format(d + 1))
                axs[d].legend()
            else:
                axs.scatter(df_out[ids].age.values,
                            df_out[ids]['y{}'.format(d + 1)].values,
                            edgecolor='k',
                            color=cmap(tt),
                            alpha=0.5,
                            label='Traj {} (N={})'.format(tt + 1, num_in_traj))
                axs.set_xlabel('Age')
                axs.set_ylabel('y{}'.format(d + 1))
                axs.legend()
    plt.show()

    if op.out_file is not None:
        print("Writing to file...")
        df_out.to_csv(op.out_file, index=False)
        write_provenance_data(op.out_file,
                              generator_args=op,
                              module_name='bayes_traj')

    print("DONE.")