def parse_args(xvar='laps', yvar='evidence'):
    ''' Returns dict of parsed arguments retrieved from command line
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('dataName', type=str, default='AsteriskK8')
    parser.add_argument('jpath', type=str, default='demo*')

    parser.add_argument('--xvar', type=str, default=xvar,
                        choices=list(LabelMap.keys()),
                        help="name of x axis variable to plot.")

    parser.add_argument('--yvar', type=str, default=yvar,
                        #choices=LabelMap.keys(),
                        help="name of y axis variable to plot.")

    helpMsg = "ids of trials/runs to plot from given job." + \
              " Example: '4' or '1,2,3' or '2-6'."
    parser.add_argument(
        '--taskids', type=str, default=None, help=helpMsg)
    parser.add_argument(
        '--savefilename', type=str, default=None,
        help="location where to save figure (absolute path directory)")

    args, unkList = parser.parse_known_args()

    argDict = BNPYArgParser.arglist_to_kwargs(unkList, doConvertFromStr=False)
    argDict.update(args.__dict__)
    argDict['jpathPattern'] = os.path.join(os.environ['BNPYOUTDIR'],
                                           args.dataName,
                                           args.jpath)
    del argDict['dataName']
    del argDict['jpath']
    return argDict
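The --taskids help string above describes a compact spec such as '4', '1,2,3', or '2-6'. The snippet below is a hypothetical, standalone illustration of how such a spec can be expanded into a list of task-id strings; it is not bnpy's actual BNPYArgParser.parse_task_ids, which also handles cases like None and '.best'.

def expand_taskid_spec(spec):
    ''' Expand a taskids spec like '4', '1,2,3', or '2-6' into a list of strings.

    Hypothetical helper for illustration only.
    '''
    taskids = []
    for piece in spec.split(','):
        if '-' in piece:
            start, stop = piece.split('-')
            taskids.extend(str(t) for t in range(int(start), int(stop) + 1))
        else:
            taskids.append(piece)
    return taskids

assert expand_taskid_spec('4') == ['4']
assert expand_taskid_spec('1,2,3') == ['1', '2', '3']
assert expand_taskid_spec('2-6') == ['2', '3', '4', '5', '6']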
Example #2
def parse_args():
    ''' Returns dict of parsed arguments retrieved from command line
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('dataName', type=str, default='AsteriskK8')
    parser.add_argument('jpath', type=str, default='demo*')

    helpMsg = "ids of trials/runs to plot from given job." + \
              " Example: '4' or '1,2,3' or '2-6'."
    parser.add_argument(
        '--taskids',
        type=str, default=None,
        help=helpMsg)
    parser.add_argument(
        '--savefilename', type=str, default=None,
        help="location where to save figure (absolute path directory)")
    parser.add_argument('--fileSuffix', type=str, default='PredLik.mat')
    args, unkList = parser.parse_known_args()

    argDict = BNPYArgParser.arglist_to_kwargs(unkList)
    argDict.update(args.__dict__)
    argDict['jpathPattern'] = os.path.join(os.environ['BNPYOUTDIR'],
                                           args.dataName,
                                           args.jpath)
    del argDict['dataName']
    del argDict['jpath']
    return argDict
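All of these parsers hand the unknown arguments returned by parse_known_args to BNPYArgParser.arglist_to_kwargs to get a plain dict. The sketch below shows the general idea with a hypothetical stand-in; the real helper also performs type conversion unless doConvertFromStr is disabled.

def arglist_to_kwargs_sketch(unkList):
    ''' Hypothetical stand-in: convert ['--Kvals', '5,10', '--nLap', '50']
        into {'Kvals': '5,10', 'nLap': '50'} (values kept as strings).
    '''
    kwargs = dict()
    for ii in range(0, len(unkList) - 1, 2):
        key = unkList[ii]
        if key.startswith('--'):
            kwargs[key[2:]] = unkList[ii + 1]
    return kwargs

assert arglist_to_kwargs_sketch(['--Kvals', '5,10', '--nLap', '50']) == \
    {'Kvals': '5,10', 'nLap': '50'}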
Example #3
def parse_args(**kwargs):
    ''' Returns dict of parsed arguments retrieved from command line
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('dataName', type=str, default='AsteriskK8')
    parser.add_argument('jpathPattern', type=str, default='demo*')
    parser.add_argument('--xvar', type=str, default=None,
                        help="name of x axis variable to plot.")
    parser.add_argument('--yvar', type=str, default='evidence',
                        choices=list(LabelMap.keys()),
                        help="name of y axis variable to plot.")
    parser.add_argument('--lvar', type=str, default=None,
                        help="quantity that varies across lines")
    parser.add_argument('--pvar', type=str, default=None,
                        help="quantity that varies across subplots")
    parser.add_argument('--taskids', type=str, default='all',
                        help="specify which task to plot (all, .best, .worst, etc)")
    parser.add_argument(
        '--savefilename', type=str, default=None,
        help="location where to save figure (absolute path directory)")
    args, unkList = parser.parse_known_args()
    argDict = BNPYArgParser.arglist_to_kwargs(unkList)
    argDict.update(args.__dict__)
    argDict.update(kwargs)
    argDict['jpathPattern'] = os.path.join(os.environ['BNPYOUTDIR'],
                                           args.dataName,
                                           args.jpathPattern)
    del argDict['dataName']
    for key in argDict:
        if key.endswith('vals'):
            if not isinstance(argDict[key], list):
                argDict[key] = argDict[key].split(',')
    return argDict
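The final loop above normalizes any keyword whose name ends in 'vals' into a list, so sweep-style arguments can be given as comma-separated strings. A tiny illustration of that assumed behavior:

argDict = {'Kvals': '5,10,20', 'yvar': 'evidence'}
for key in argDict:
    if key.endswith('vals') and not isinstance(argDict[key], list):
        argDict[key] = argDict[key].split(',')
assert argDict == {'Kvals': ['5', '10', '20'], 'yvar': 'evidence'}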
Example #4
def parse_args():
  ''' Parse cmd line arguments
  '''
  parser = argparse.ArgumentParser() 
   
  BNPYArgParser.addRequiredVizArgsToParser(parser)
  BNPYArgParser.addStandardVizArgsToParser(parser)
  parser.add_argument('--lap', default=None, type=float,
        help="Specific lap at which to plot parameters." \
             + " If exact lap not available, instead plots nearest lap.")
  parser.add_argument('--doPlotData', action='store_true', default=False,
        help="If present, also plot training data.")
  parser.add_argument('--doPlotTruth', action='store_true', default=False,
        help="If present, also plot true model params that generated data.")
  parser.add_argument('--doSort', action='store_true', default=False,
        help="If present, sort parameters by global appearance probabilities.")
  args = parser.parse_args()
  return args
Example #5
def parse_args():
  ''' Returns Namespace of parsed arguments retrieved from command line
  '''
  parser = argparse.ArgumentParser()
  BNPYArgParser.addRequiredVizArgsToParser(parser)
  BNPYArgParser.addStandardVizArgsToParser(parser)
  parser.add_argument('--xvar', type=str, default='laps',
        help="name of x axis variable to plot. one of {iters,laps,times}")

  parser.add_argument('--traceEvery', type=str, default=None,
        help="Specifies how often to plot data points. For example, traceEvery=10 only plots data points associated with laps divisible by 10.")
  parser.add_argument('--legendnames', type=str, default=None,
        help="optional names to show on legend in place of jobnames")
  args = parser.parse_args()
  args.algNames = args.algNames.split(',')
  args.jobnames = args.jobnames.split(',')
  if args.legendnames is not None:
    args.legendnames = args.legendnames.split(',')
    #assert len(args.legendnames) == len(args.jobnames) * len(args.algNames)
  return args
Example #6
def plot_all_tasks_for_job(jobpath, args, jobname=None, color=None):
  ''' Create line plot in current matplotlib figure
      for each task/run of the designated jobpath
  '''
  if not os.path.exists(jobpath):
    raise ValueError("No such path: %s" % (jobpath))
  
  taskids = BNPYArgParser.parse_task_ids(jobpath, args.taskids)
    
  xAll = list()
  yAll = list()
  xLocs = list()
  yLocs = list()
  for tt, taskid in enumerate(taskids):
    xs = np.loadtxt(os.path.join(jobpath, taskid, args.xvar+'.txt'))
    ys = np.loadtxt(os.path.join(jobpath, taskid, 'evidence.txt'))
    # Remove the first lap of moVB runs, since the ELBO is not yet accurate
    if jobpath.count('moVB') > 0 and args.xvar == 'laps':
      mask = xs >= 1.0
      xs = xs[mask]
      ys = ys[mask]
    if args.traceEvery is not None:
      mask = bnpy.util.isEvenlyDivisibleFloat(xs, args.traceEvery)
      xs = xs[mask]
      ys = ys[mask]


    plotargs = dict(markersize=10, linewidth=2, label=None,
                    color=color, markeredgecolor=color)
    if tt == 0:
      plotargs['label'] = jobname
    pylab.plot(xs, ys, '.-', **plotargs)
    if len(ys) > 0:
      xLocs.append(xs[-1])
      yLocs.append(ys[-1])
      yAll.extend(ys[1:])
      xAll.extend(xs[1:])
      
  # Zoom in to the useful part of the ELBO trace
  if len(yAll) > 0:
    global YMin, YMax
    ymin = np.percentile(yAll, 1)
    ymax = np.max(yAll)
    if YMin is None:
      YMin = ymin
      YMax = ymax
    else:
      YMin = np.minimum(ymin, YMin)
      YMax = np.maximum(YMax, ymax)
    blankmargin = 0.08*(YMax - YMin)
    pylab.ylim( [YMin, YMax + blankmargin])
  pylab.xlabel(XLabelMap[args.xvar])
  pylab.ylabel('log evidence')
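The traceEvery filter relies on bnpy.util.isEvenlyDivisibleFloat to keep only laps that are (approximately) multiples of the requested spacing. Below is a minimal sketch of such a check, assuming a simple absolute-tolerance test rather than bnpy's exact implementation.

import numpy as np

def is_evenly_divisible_float(xs, step, atol=1e-8):
    # True where xs is (within tolerance) an integer multiple of step.
    xs = np.asarray(xs, dtype=np.float64)
    ratio = xs / float(step)
    return np.abs(ratio - np.round(ratio)) < atol

laps = np.array([0.5, 1.0, 9.999999999, 10.0, 25.0])
print(is_evenly_divisible_float(laps, 10))   # [False False  True  True False]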
Example #7
def plot_all_tasks_for_job(jobpath, args, jobname=None, color=None):
  ''' Create line plot in current matplotlib figure
      for each task/run of the designated jobpath
  '''
  if not os.path.exists(jobpath):
    raise ValueError("No such path: %s" % (jobpath))
  
  taskids = BNPYArgParser.parse_task_ids(jobpath, args.taskids)
    
  xAll = list()
  yAll = list()
  xLocs = list()
  yLocs = list()
  for tt, taskid in enumerate(taskids):
    xs = np.loadtxt(os.path.join(jobpath, taskid, args.xvar+'.txt'))
    try:
      ys = np.loadtxt(os.path.join(jobpath, taskid, 'K.txt'))
    except IOError:
      MatDict = scipy.io.loadmat(os.path.join(jobpath,taskid, 'AllocPrior.mat'))
      Kfixed = int(MatDict['K'])
      ys = Kfixed* np.ones(len(xs))
    if args.traceEvery is not None:
      mask = bnpy.util.isEvenlyDivisibleFloat(xs, args.traceEvery)
      xs = xs[mask]
      ys = ys[mask]

    plotargs = dict(markersize=10, linewidth=2, label=None,
                    color=color, markeredgecolor=color)
    if tt == 0:
      plotargs['label'] = jobname
    pylab.plot(xs, ys, '.-', **plotargs)
    if len(ys) > 0:
      xLocs.append(xs[-1])
      yLocs.append(ys[-1])
      yAll.extend(ys[1:])
      xAll.extend(xs[1:])
      
  # Zoom in to the useful part of the K trace
  if len(yAll) > 0:
    global YMax
    ymax = np.max(yAll)
    if YMax is None:
      YMax = ymax
    else:
      YMax = np.maximum(YMax, ymax)
    blankmargin = 0.05*(YMax)
    pylab.ylim( [0, YMax + blankmargin])
  pylab.xlabel(XLabelMap[args.xvar])
  pylab.ylabel('K')
Example #8
def parse_jobpath_and_taskids(args):
  rootpath = os.path.join(os.environ['BNPYOUTDIR'], args.dataName, 
                              args.allocModelName, args.obsModelName)
  jobpath = os.path.join(rootpath, args.algNames, args.jobnames)
  if not os.path.exists(jobpath):
    raise ValueError("No such path: %s" % (jobpath))
  taskids = BNPYArgParser.parse_task_ids(jobpath, args.taskids)

  # Verify that the intended savefile will work as expected!
  if args.savefilename is not None:
    if args.savefilename.count('%') and len(taskids) > 1:
      try:
        args.savefilename % ('1')
      except TypeError:
        raise ValueError("Missing or bad format string in savefilename %s" %  
                        (args.savefilename)
                      )  
  return jobpath, taskids
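The savefilename check above requires a string-format placeholder whenever several tasks will be plotted, so each task writes to its own file. For example (the filename pattern here is only an assumed convention):

savefilename = 'ELBOTrace-task%s.png'
for taskid in ['1', '2', '3']:
    print(savefilename % (taskid))
# ELBOTrace-task1.png
# ELBOTrace-task2.png
# ELBOTrace-task3.png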
Example #9
def plotCompsForJob(jobpath='', taskids=[1], lap=None, **kwargs):
    ''' Show plot of learned clusters from run(s) saved results on disk
    '''

    # Verify given absolute path is valid.
    jobpath_originalarg = jobpath
    if not os.path.isdir(jobpath):
        # Fallback: try to prepend BNPYOUTDIR to handle "shortcut" names
        jobpath = os.path.join(os.environ['BNPYOUTDIR'], jobpath)
    if not os.path.isdir(jobpath):
        raise ValueError('Not valid path: ' + jobpath_originalarg)
    taskids = BNPYArgParser.parse_task_ids(jobpath, taskids)
    for tt, taskid in enumerate(taskids):
        if tt == 0 and isinstance(taskid, str):
            if taskid.startswith('.'):
                rankTasksForSingleJobOnDisk(jobpath)
        taskpath = os.path.join(jobpath, str(taskid))
        plotCompsForTask(taskpath, lap=lap, **kwargs)
    if 'block' in kwargs:
        pylab.show(block=kwargs['block'])
Example #10
            legNames = ['%s=%s' % (plotkey, x) for x in RangeMap[plotkey]]

        # Build list of final jpaths in order of decided legend
        keepListFinal = list()
        for x in RangeMap[plotkey]:
            for jID, jdict in enumerate(keepListD):
                if jdict[plotkey] == x:
                    keepListFinal.append(keepListP[jID])
    else:
        keepListFinal = keepListP[:1]
        legNames = [None]

    if verbose:
        print('\nLegend entries for selected jobs (auto-selected)')
        for name in legNames:
            print(name)

    return keepListFinal, legNames


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('dataName', default='AsteriskK8')
    parser.add_argument('jobName', default='bm')
    args, unkList = parser.parse_known_args()
    reqDict = BNPYArgParser.arglist_to_kwargs(unkList, doConvertFromStr=False)
    jpath = os.path.join(os.environ['BNPYOUTDIR'], args.dataName, args.jobName)

    keepJobs, legNames = filterJobs(jpath, verbose=1, **reqDict)
Example #11
def plot_all_tasks_for_job(jobpath, label, taskids=None,
                           lineType='.-',
                           spreadLineType='--',
                           color=None,
                           yvar='avgLikScore',
                           xvar='laps',
                           markersize=10,
                           linewidth=2,
                           minLap=0,
                           showFinalPt=0,
                           fileSuffix='PredLik.mat',
                           xjitter=None,
                           prefix='predlik',
                           colorID=0,
                           **kwargs):
    ''' Create line plot in current figure for each task/run of jobpath
    '''
    if not os.path.exists(jobpath):
        print('PATH NOT FOUND', jobpath)
        return None
    if not yvar.startswith('avg') and yvar.count('Kactive') == 0:
        yvar = 'avg' + yvar
    if not yvar.endswith('Score') and yvar.count('Kactive') == 0:
        yvar = yvar + 'Score'

    if color is None:
        color = Colors[colorID % len(Colors)]
    taskids = BNPYArgParser.parse_task_ids(jobpath, taskids)

    for tt, taskid in enumerate(taskids):
        taskoutpath = os.path.join(jobpath, taskid)
        hpaths = glob.glob(os.path.join(taskoutpath, '*' + fileSuffix))
        txtpaths = glob.glob(os.path.join(taskoutpath, 'predlik-*.txt'))
        ys_hi = None
        ys_lo = None
        if len(txtpaths) > 0:
            if fileSuffix.endswith('.txt'):
                suffix = '-' + fileSuffix
            else:
                suffix = '.txt'
            if xvar.count('lap'):
                xs = np.loadtxt(
                    os.path.join(taskoutpath, prefix + '-lapTrain.txt'))
            elif xvar.count('K'):
                xs = np.loadtxt(os.path.join(taskoutpath, prefix + '-K.txt'))
            elif xvar.count('time'):
                xs = np.loadtxt(os.path.join(
                    taskoutpath, prefix + '-timeTrain.txt'))
            else:
                raise ValueError("Unrecognized xvar: " + xvar)
            if yvar.count('Kactive') and not yvar.count('Percentile'):
                ys = np.loadtxt(os.path.join(taskoutpath, 
                        prefix + '-' + yvar + 'Percentile50.txt'))
                ys_lo = np.loadtxt(os.path.join(taskoutpath, 
                    prefix + '-' + yvar + 'Percentile10.txt'))
                ys_hi = np.loadtxt(os.path.join(taskoutpath, 
                    prefix + '-' + yvar + 'Percentile90.txt'))
            else:
                ys = np.loadtxt(
                    os.path.join(taskoutpath, prefix + '-' + yvar + suffix))

            if minLap > 0 and taskoutpath.count('fix'):
                # Load the lap trace explicitly so the minLap filter works
                # even when the x-axis variable is K or time.
                laps = np.loadtxt(
                    os.path.join(taskoutpath, prefix + '-lapTrain.txt'))
                mask = laps > minLap
                xs = xs[mask]
                ys = ys[mask]
        elif len(hpaths) > 0:
            hpaths.sort()
            basenames = [x.split(os.path.sep)[-1] for x in hpaths]
            xs = np.asarray([float(x[3:11]) for x in basenames])
            ys = np.zeros_like(xs)
            for ii, hpath in enumerate(hpaths):
                MatVars = scipy.io.loadmat(hpath)
                ys[ii] = float(MatVars['avgPredLL'])
        else:
            raise ValueError(
                'Pred Lik data unavailable for job\n' + taskoutpath)

        plotargs = dict(markersize=markersize, linewidth=linewidth, label=None,
                        color=color, markeredgecolor=color,
                        )
        plotargs.update(kwargs)

        if tt == 0:
            plotargs['label'] = label
        if xjitter is not None:
            xs = xs + xjitter
        pylab.plot(xs, ys, lineType, **plotargs)
        if ys_lo is not None:
            del plotargs['label']
            pylab.plot(xs, ys_lo, spreadLineType, **plotargs)
            pylab.plot(xs, ys_hi, spreadLineType, **plotargs)

        if showFinalPt:
            pylab.plot(xs[-1], ys[-1], '.', **plotargs)
    pylab.xlabel(XLabelMap[xvar])
    pylab.ylabel(YLabelMap[yvar])
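The first few lines of this function canonicalize yvar into the 'avg...Score' naming used by the on-disk heldout-likelihood files (e.g. predlik-avgLikScore.txt). A standalone restatement of that normalization (names in the asserts are illustrative):

def normalize_yvar(yvar):
    # Mirrors the normalization above: 'Lik' -> 'avgLikScore';
    # 'Kactive...' quantities are left untouched.
    if not yvar.startswith('avg') and yvar.count('Kactive') == 0:
        yvar = 'avg' + yvar
    if not yvar.endswith('Score') and yvar.count('Kactive') == 0:
        yvar = yvar + 'Score'
    return yvar

assert normalize_yvar('Lik') == 'avgLikScore'
assert normalize_yvar('avgLikScore') == 'avgLikScore'
assert normalize_yvar('KactiveOrig') == 'KactiveOrig'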
Example #12
def run(dataName=None, allocModelName=None, obsModelName=None, algName=None, \
                      doSaveToDisk=True, doWriteStdOut=True,
                      taskID=None, **kwargs):
    ''' Fit specified model to data with learning algorithm.
    
      Usage
      -------
      To fit a Gauss MixModel to a custom dataset defined in matrix X 
      >> Data = bnpy.data.XData(X)
      >> hmodel = run(Data, 'MixModel', 'Gauss', 'EM', K=3, nLap=10)

      To load a dataset specified in a specific script
      For example, 2D toy data in demodata/AsteriskK8.py
      >> hmodel = run('AsteriskK8', 'MixModel', 'Gauss', 'VB', K=3)
      
      To run 5 tasks (separate initializations) and get best of 5 runs:
      >> opts = dict(K=8, nLap=100, printEvery=0)
      >> hmodel = run('AsteriskK8','MixModel','Gauss','VB', nTask=5, **opts)

      Args
      -------
      dataName : either one of
                  * bnpy Data object,
                  * string filesystem path of Data module within BNPYDATADIR
      allocModelName : string name of allocation (latent structure) model
                        {MixModel, DPMixModel, AdmixModel, HMM, etc.}
      obsModelName : string name of observation (likelihood) model
                        {Gauss, ZMGauss, WordCount, etc.}
      **kwargs : keyword args defining properties of the model or alg
                  see Doc for details [TODO]
      Returns
      -------
      hmodel : best model fit to the dataset (across nTask runs)
      LP : local parameters of that best model on the dataset
      evBound : log evidence (ELBO) for the best model on the dataset
                  scalar, real value where larger value implies better model
  '''
    hasReqArgs = dataName is not None
    hasReqArgs &= allocModelName is not None
    hasReqArgs &= obsModelName is not None
    hasReqArgs &= algName is not None

    if hasReqArgs:
        ReqArgs = dict(dataName=dataName,
                       allocModelName=allocModelName,
                       obsModelName=obsModelName,
                       algName=algName)
    else:
        ReqArgs = BNPYArgParser.parseRequiredArgs()
        dataName = ReqArgs['dataName']
        allocModelName = ReqArgs['allocModelName']
        obsModelName = ReqArgs['obsModelName']
        algName = ReqArgs['algName']
    KwArgs, UnkArgs = BNPYArgParser.parseKeywordArgs(ReqArgs, **kwargs)

    jobname = KwArgs['OutputPrefs']['jobname']

    if taskID is None:
        starttaskid = KwArgs['OutputPrefs']['taskid']
    else:
        starttaskid = taskID
        KwArgs['OutputPrefs']['taskid'] = taskID
    nTask = KwArgs['OutputPrefs']['nTask']

    bestInfo = None
    bestEvBound = -np.inf
    for taskid in range(starttaskid, starttaskid + nTask):
        hmodel, LP, Info = _run_task_internal(jobname, taskid, nTask, ReqArgs,
                                              KwArgs, UnkArgs, dataName,
                                              allocModelName, obsModelName,
                                              algName, doSaveToDisk,
                                              doWriteStdOut)
        if (Info['evBound'] > bestEvBound):
            bestModel = hmodel
            bestLP = LP
            bestEvBound = Info['evBound']
            bestInfo = Info
    return bestModel, bestLP, bestInfo
Example #13
def plotSingleJob(dataName,
                  jobname,
                  taskids='1',
                  lap=None,
                  showELBOInTitle=True,
                  cmap='gray',
                  title='',
                  mixZs=False):
    ''' Visualize results of single run
    '''

    # Parse the jobpath, and create example task paths
    jobpath = os.path.join(os.path.expandvars('$BNPYOUTDIR'), dataName,
                           jobname)
    if isinstance(taskids, str):
        taskids = BNPYArgParser.parse_task_ids(jobpath, taskids)
    elif isinstance(taskids, int):
        taskids = [str(taskids)]
    taskpath = os.path.join(jobpath, taskids[0])

    # Load data, with same dataset size prefs as specified at inference time.
    dataKwargs = bnpy.ioutil.DataReader.loadDataKwargsFromDisk(taskpath)
    Data = bnpy.ioutil.DataReader.loadDataFromSavedTask(taskpath)
    AdjMat = np.squeeze(Data.toAdjacencyMatrix())
    if hasattr(Data, 'TrueParams'):
        if 'nodeZ' in Data.TrueParams:
            sortids = np.argsort(Data.TrueParams['nodeZ'])
            print('Sorting nodes by true labels...')
        elif 'pi' in Data.TrueParams:
            sortids = np.argsort(Data.TrueParams['pi'].argmax(axis=1))
    else:
        sortids = np.arange(AdjMat.shape[0])
    # Rearrange the rows/cols of AdjMat
    AdjMat = AdjMat[sortids, :]
    AdjMat = AdjMat[:, sortids]
    if hasattr(Data, 'nodeNames'):
        nodeNames = [Data.nodeNames[s] for s in sortids]
    else:
        nodeNames = None
    # Show the true adj mat and the estimated side-by-side
    # First, the true adjacency matrix
    ncols = len(taskids) + 1
    pylab.subplots(nrows=1, ncols=ncols, figsize=(3 * ncols, 3))
    pylab.subplot(1, ncols, 1)
    pylab.imshow(AdjMat, cmap='Greys', interpolation='nearest', vmin=0, vmax=1)

    if nodeNames is not None and len(nodeNames) < 25:
        pylab.gca().set_yticks(np.arange(len(nodeNames)))
        pylab.gca().set_yticklabels(nodeNames)

    for tt, taskid in enumerate(taskids):
        taskoutpath = os.path.join(jobpath, taskid) + os.path.sep
        # Load the model for the current task at specified lap
        hmodel, curLap = bnpy.ioutil.ModelReader.loadModelForLap(
            taskoutpath, lap)
        # Compute expected state-state edge prob matrix Ew
        Ew = hmodel.obsModel.Post.lam1 / \
            (hmodel.obsModel.Post.lam1 + hmodel.obsModel.Post.lam0)
        isAssortative = str(type(hmodel.allocModel)).count('Assort')
        if isAssortative:
            K = hmodel.allocModel.K
            Ew_tmp = hmodel.allocModel.epsilon * np.ones((K, K, Ew.shape[-1]))
            for k in range(K):
                Ew_tmp[k, k] = Ew[k]
            Ew = Ew_tmp
        taskAdjMat = np.zeros((Data.nNodes, Data.nNodes, Data.dim))
        useLP = 0
        if useLP:
            LP = hmodel.calc_local_params(Data)
            for eid, (s, t) in enumerate(Data.edges):
                resp_st = LP['resp'][eid]
                if isAssortative:
                    taskAdjMat[s, t] = np.sum(resp_st[:, np.newaxis] * Ew,
                                              axis=0)
                else:
                    assert np.allclose(resp_st.sum(), 1.0)
                    taskAdjMat[s, t] = np.sum(resp_st[:, :, np.newaxis] * Ew,
                                              axis=(0, 1))

        else:
            Epi = np.exp(hmodel.allocModel.E_logPi())
            for eid, (s, t) in enumerate(Data.edges):
                for d in range(Data.dim):
                    taskAdjMat[s, t, d] = np.inner(
                        Epi[s, :], np.dot(Ew[:, :, d], Epi[t, :]))
        assert taskAdjMat.min() >= 0
        assert taskAdjMat.max() <= 1.0
        taskAdjMat = np.squeeze(taskAdjMat)
        taskAdjMat = taskAdjMat[sortids, :]
        taskAdjMat = taskAdjMat[:, sortids]
        pylab.subplot(1, ncols, 2 + tt)
        pylab.imshow(taskAdjMat,
                     cmap='Greys',
                     interpolation='nearest',
                     vmin=0,
                     vmax=1)
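In the non-assortative branch the expected adjacency entry is the bilinear form Epi[s] · Ew[:, :, d] · Epi[t]. The sketch below (synthetic shapes, not bnpy output) computes the same quantity densely with einsum and checks that it agrees with the per-edge loop above.

import numpy as np

nNodes, K, dim = 4, 3, 2
Epi = np.random.dirichlet(np.ones(K), size=nNodes)   # (nNodes, K)
Ew = np.random.rand(K, K, dim)                        # (K, K, dim)

# dense[s, t, d] = sum_{k,l} Epi[s, k] * Ew[k, l, d] * Epi[t, l]
dense = np.einsum('sk,kld,tl->std', Epi, Ew, Epi)

s, t, d = 1, 2, 0
assert np.allclose(dense[s, t, d],
                   np.inner(Epi[s, :], np.dot(Ew[:, :, d], Epi[t, :])))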
Example #14
def plotSingleJob(
    dataset,
    jobname,
    taskids='1',
    lap='final',
    sequences=[1],
    showELBOInTitle=False,
    dispTrue=True,
    aspectFactor=4.0,
    specialStateIDs=None,
    seqNames=None,
    cmap='Set1',
    maxT=None,
    colorManyToOne=False,
):
    '''
    Plot estimated state sequences for the requested sequences and taskids.

    If dispTrue is True, the true labels are shown underneath the
    estimated labels.

    Returns
    -------
    axes, zHatBySeq
    '''
    # Make sequences zero-indexed
    if isinstance(sequences, str):
        sequences = np.asarray([int(x) for x in sequences.split(',')],
                               dtype=np.int32)
    sequences = np.asarray(sequences, dtype=np.int32)
    if np.min(sequences) < 1:
        raise ValueError('Sequences need to be one-indexed.\n' +
                         'Valid values are 1,2,...N.')
    sequences -= 1

    # Determine the jobpath and taskids
    jobpath = os.path.join(os.path.expandvars('$BNPYOUTDIR'), dataset, jobname)
    if isinstance(taskids, str):
        if taskids.startswith('.'):
            taskids = [taskids]
        else:
            taskids = BNPYArgParser.parse_task_ids(jobpath, taskids)
    elif isinstance(taskids, int):
        taskids = [str(taskids)]

    datasetPrefFile = os.path.join(jobpath, taskids[0],
                                   'args-DatasetPrefs.txt')
    datasetPrefs = dict()
    if os.path.exists(datasetPrefFile):
        with open(datasetPrefFile, 'r') as f:
            for line in f.readlines():
                fields = line.strip().split(' ')
                if len(fields) != 2:
                    continue
                datasetPrefs[fields[0]] = fields[1]

    # Load Data from its python module
    Datamod = imp.load_source(
        dataset, os.path.expandvars('$BNPYDATADIR/' + dataset + '.py'))
    if dataset == 'SpeakerDiar':
        if len(sequences) > 1:
            raise ValueError(
                'Joint modeling of several sequences makes no sense')
        Data = Datamod.get_data(meetingNum=sequences[0] + 1, **datasetPrefs)
        jobpath = jobpath.replace('SpeakerDiar',
                                  'SpeakerDiar' + str(sequences[0] + 1))
        sequences[0] = 0

    else:
        Data = Datamod.get_data(**datasetPrefs)

    # Determine the maximum length among any of the sequences to be plotted
    if maxT is None:
        Ts = Data.doc_range[sequences + 1] - Data.doc_range[sequences]
        maxT = np.max(Ts)

    # Define the number of pixels used by vertical space of figure
    NUM_STACK = int(np.ceil(maxT / float(aspectFactor)))
    if dispTrue:
        NUM_STACK //= 2  # keep this an integer; it is used as a tile count

    f, axes = plt.subplots(len(sequences),
                           len(taskids),
                           sharex='col',
                           sharey='row')

    # For singleton case, make sure that axes is index-able
    if len(sequences) == 1 and len(taskids) == 1:
        axes = [axes]

    for tt, taskidstr in enumerate(taskids):
        if tt == 0 and taskidstr.startswith('.'):
            rankTasksForSingleJobOnDisk(jobpath)

        path = os.path.join(jobpath, taskidstr) + os.path.sep

        # Figure out which lap to use
        if lap == 'final':
            lapsFile = open(path + 'laps-saved-params.txt')
            curLap = lapsFile.readlines()
            curLap = float(curLap[-1])
            lapsFile.close()
        else:
            curLap = int(lap)

        if showELBOInTitle:
            hdists = np.loadtxt(os.path.join(path, 'hamming-distance.txt'))
            hlaps = np.loadtxt(os.path.join(path, 'laps-saved-params.txt'))
            Keffvals = np.loadtxt(os.path.join(path, 'Keff-saved-params.txt'))
            # Determine scalar values to display
            loc = np.argmin(np.abs(hlaps - curLap))
            hdist = hdists[loc]
            Kefffinal = Keffvals[loc]

            try:
                Kvals = np.loadtxt(os.path.join(path, 'K.txt'))
                ELBOscores = np.loadtxt(os.path.join(path, 'evidence.txt'))
                laps = np.loadtxt(os.path.join(path, 'laps.txt'))

                loc = np.argmin(np.abs(laps - curLap))
                ELBO = ELBOscores[loc]
                Kfinal = Kvals[loc]
            except IOError:
                ELBO = 0.0
                Kfinal = Kefffinal

        # Load in the saved Data from $BNPYOUTDIR
        try:
            filename = 'Lap%08.3fMAPStateSeqsAligned.mat' % curLap
            zHatBySeq = scipy.io.loadmat(path + filename)
            key1 = 'zHatBySeqAligned'
            key2 = 'zHatBySeq'
            if key1 in zHatBySeq:
                zHatBySeq = convertStateSeq_MAT2list(zHatBySeq[key1])
            elif key2 in zHatBySeq:
                zHatBySeq = convertStateSeq_MAT2list(zHatBySeq[key2])
            else:
                raise IOError
        except IOError:
            filename = 'Lap%08.3fMAPStateSeqs.mat' % curLap
            zHatBySeq = scipy.io.loadmat(path + filename)
            zHatBySeq = convertStateSeq_MAT2list(zHatBySeq['zHatBySeq'])

        if specialStateIDs is not None:
            zHatBySeq = relabelAllSequences(zHatBySeq, specialStateIDs)

        # Find maximum number of states we need to display
        nSeq = len(zHatBySeq)
        Kmax = np.max([zHatBySeq[i].max() for i in range(nSeq)])
        hasGroundTruth = False

        vmin = 0
        Kignore = 0
        if hasattr(Data, 'TrueParams') and 'Z' in Data.TrueParams:
            hasGroundTruth = True
            Kmax = np.maximum(Data.TrueParams['Z'].max(), Kmax)
            uLabels = np.unique(Data.TrueParams['Z'])
            Kignore = np.sum(uLabels < 0)
            if Kignore > 0:
                for k in range(1, Kignore + 1):
                    print('ignoring state %d  Ttrue = %d' % (
                        -k, np.sum(Data.TrueParams['Z'] == -k)))

            if colorManyToOne:
                # For each state in zHat, find best true sequence
                Zflat = convertStateSeq_list2flat(zHatBySeq, Data)
                ZflatA = -1 * np.ones_like(Zflat)
                for uID in np.unique(Zflat):
                    overlap = np.zeros(uLabels.size)
                    for ii, trueID in enumerate(uLabels):
                        overlap[ii] = np.sum(
                            np.logical_and(Data.TrueParams['Z'] == trueID,
                                           Zflat == uID))
                    bestii = overlap.argmax()
                    ZflatA[Zflat == uID] = uLabels[bestii]
                zHatBySeq = convertStateSeq_flat2list(ZflatA, Data)

        # In case there's only one sequence, make sure it's index-able
        for ii, seqNum in enumerate(sequences):
            image = np.tile(zHatBySeq[seqNum], (NUM_STACK, 1))

            # Add the true labels to the image (if they exist)
            if hasGroundTruth and dispTrue:
                start = Data.doc_range[seqNum]
                stop = Data.doc_range[seqNum + 1]
                img_trueZ = np.tile(Data.TrueParams['Z'][start:stop],
                                    (NUM_STACK, 1))
                if dispTrue == 2:
                    image = img_trueZ  # Show only true labels
                else:
                    image = np.vstack((image, img_trueZ))

            image = image[:, :maxT]
            if len(sequences) == 1 or len(taskids) == 1:
                cur_ax = axes[ii + tt]
            else:
                cur_ax = axes[ii, tt]

            if hasattr(cmap, 'N'):
                vmax = cmap.N
            else:
                vmax = Kmax

            cur_ax.imshow(Kignore + image + .0001,
                          interpolation='nearest',
                          vmin=vmin,
                          vmax=vmax,
                          cmap=cmap)
            if tt == 0:
                if seqNames is not None:
                    h = cur_ax.set_ylabel('%s' % (seqNames[ii]), fontsize=13)
                    h.set_rotation(0)

                elif len(sequences) > 4:
                    cur_ax.set_ylabel('%d' % (seqNum + 1), fontsize=13)
                else:
                    cur_ax.set_ylabel('Seq. %d' % (seqNum + 1), fontsize=13)

            if ii == 0:
                if showELBOInTitle:
                    fmtSpec = "ELBO: %.3f  K=%d Keff=%d  "
                    if hdist > 0.01:
                        fmtSpec += "dist=%.2f"
                    elif hdist > 0.001:
                        fmtSpec += "dist=%.3f"
                    else:
                        fmtSpec += "dist=%.4f"
                    title = fmtSpec % (ELBO, Kfinal, Kefffinal, hdist)
                    cur_ax.set_title(title)

            cur_ax.set_xlim([0, maxT])
            cur_ax.set_ylim([0, image.shape[0]])
            cur_ax.set_yticks([])
            # ... end loop over sequences
    return axes, zHatBySeq
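The saved MAP state sequences are keyed by lap via a zero-padded filename pattern, so the lap value chosen above has to match it exactly. For reference:

for lap in [0.0, 5.0, 12.5, 100.0]:
    print('Lap%08.3fMAPStateSeqs.mat' % lap)
# Lap0000.000MAPStateSeqs.mat
# Lap0005.000MAPStateSeqs.mat
# Lap0012.500MAPStateSeqs.mat
# Lap0100.000MAPStateSeqs.mat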
Example #15
def plotSingleLineAcrossJobsByXVar(jpathPattern,
                                   label='',
                                   xvar=None,
                                   xvals=None,
                                   xlabel=None,
                                   yvar='evidence',
                                   lineStyle='.-',
                                   taskids='all',
                                   lineID=0,
                                   lvar='',
                                   **kwargs):
    ''' Create line plot in current figure for job matching the pattern

    Iterates over each xval in provided list of values.
    Each one corresponds to a single saved job.

    Post Condition
    --------------
    Current axes have one line added.
    '''
    prefixfilepath = os.path.sep.join(jpathPattern.split(os.path.sep)[:-1])
    PPListMap = makePPListMapFromJPattern(jpathPattern)
    if xvals is None:
        xvals = PPListMap[xvar]

    xs = np.zeros(len(xvals))
    ys = np.zeros(len(xvals))
    jpathList = makeListOfJPatternsWithSpecificVals(
        PPListMap,
        prefixfilepath=prefixfilepath,
        key=xvar,
        vals=xvals,
        **kwargs)

    plotargs = copy.deepcopy(DefaultLinePlotKwArgs)
    # Plot all tasks as faint points with no connections
    for i, jobpath in enumerate(jpathList):
        if not os.path.exists(jobpath):
            raise ValueError("PATH NOT FOUND: %s" % (jobpath))
        x = float(xvals[i])

        for key in plotargs:
            if key in kwargs:
                plotargs[key] = kwargs[key]
        plotargs['markeredgecolor'] = plotargs['color']

        alltaskids = BNPYArgParser.parse_task_ids(jobpath, taskids)
        for tid in alltaskids:
            y = loadYValFromDisk(jobpath, tid, yvar=yvar)
            pylab.plot(x, y, '.', **plotargs)

    # Plot top-ranked tasks as solid points connected by line
    for i, jobpath in enumerate(jpathList):
        rankTasksForSingleJobOnDisk(os.path.join(jobpath))
        x = float(xvals[i])
        y = loadYValFromDisk(jobpath, '.best', yvar=yvar)
        assert isinstance(x, float)
        assert isinstance(y, float)
        xs[i] = x
        ys[i] = y

    plotargs = copy.deepcopy(DefaultLinePlotKwArgs)
    for key in plotargs:
        if key in kwargs:
            plotargs[key] = kwargs[key]
    plotargs['markeredgecolor'] = plotargs['color']
    plotargs['label'] = label
    pylab.plot(xs, ys, lineStyle, **plotargs)

    if lineID == 0:
        if xlabel is None:
            xlabel = xvar
        pylab.xlabel(xlabel)
        pylab.ylabel(LabelMap[yvar])
Example #16
def run(dataName=None,
        allocModelName=None,
        obsModelName=None,
        algName=None,
        doSaveToDisk=True,
        doWriteStdOut=True,
        taskID=None,
        **kwargs):
    """ Fit specified model to data with learning algorithm.

        Args
        -------
        dataName : either one of
                    * bnpy Data object,
                    * string name of python file within BNPYDATADIR
        allocModelName : string name of allocation (latent structure) model
        obsModelName : string name of observation (likelihood) model
        **kwargs : keyword args defining properties of the model or alg

        Returns
        -------
        hmodel : best model fit to the dataset (across nTask runs)
        Info   : dict of information about this best model
    """
    hasReqArgs = dataName is not None
    hasReqArgs &= allocModelName is not None
    hasReqArgs &= obsModelName is not None
    hasReqArgs &= algName is not None

    if hasReqArgs:
        ReqArgs = dict(dataName=dataName,
                       allocModelName=allocModelName,
                       obsModelName=obsModelName,
                       algName=algName)
    else:
        ReqArgs = BNPYArgParser.parseRequiredArgs()
        dataName = ReqArgs['dataName']
        allocModelName = ReqArgs['allocModelName']
        obsModelName = ReqArgs['obsModelName']
        algName = ReqArgs['algName']
    KwArgs, UnkArgs = BNPYArgParser.parseKeywordArgs(ReqArgs, **kwargs)
    KwArgs['OutputPrefs']['doSaveToDisk'] = doSaveToDisk
    KwArgs['OutputPrefs']['doWriteStdOut'] = doWriteStdOut

    jobname = KwArgs['OutputPrefs']['jobname']
    # Update stored numerical options via keyword args
    bnpy.util.NumericUtil.UpdateConfig(**UnkArgs)

    if taskID is None:
        starttaskid = KwArgs['OutputPrefs']['taskid']
    else:
        starttaskid = taskID
        KwArgs['OutputPrefs']['taskid'] = taskID
    nTask = KwArgs['OutputPrefs']['nTask']

    best_info_dict = None
    best_loss = np.inf
    for taskid in range(starttaskid, starttaskid + nTask):
        hmodel, info_dict = _run_task_internal(jobname, taskid, nTask, ReqArgs,
                                               KwArgs, UnkArgs, dataName,
                                               allocModelName, obsModelName,
                                               algName, doSaveToDisk,
                                               doWriteStdOut)
        if (taskid == starttaskid or info_dict['loss'] < best_loss):
            bestModel = hmodel
            best_loss = info_dict['loss']
            best_info_dict = info_dict
    return bestModel, best_info_dict
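This newer run() keeps the best of nTask restarts by lowest loss (the older variant above keeps the highest ELBO). The toy sketch below isolates that selection pattern, with a hypothetical fit_once callable standing in for _run_task_internal.

import numpy as np

def best_of_restarts(fit_once, nTask=5, starttaskid=1):
    # Keep whichever restart achieves the lowest loss, as run() does.
    best_loss, best_result = np.inf, None
    for taskid in range(starttaskid, starttaskid + nTask):
        result = fit_once(taskid)
        if best_result is None or result['loss'] < best_loss:
            best_loss, best_result = result['loss'], result
    return best_result

# Toy usage: each "fit" just returns a random loss keyed by its task id.
rng = np.random.RandomState(0)
print(best_of_restarts(lambda tid: {'taskid': tid, 'loss': rng.rand()}))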
Example #17
def plot_all_tasks_for_job(jobpath, label, taskids=None,
                           color=None,
                           colorID=0,
                           density=2,
                           yvar='evidence',
                           markersize=10,
                           linewidth=2,
                           linestyle='-',
                           drawLineToXMax=None,
                           showOnlyAfterLap=0,
                           xvar='laps',
                           **kwargs):
    ''' Create line plot in current figure for each task/run of jobpath
    '''
    if not os.path.exists(jobpath):
        if not jobpath.startswith(os.path.sep):
            jobpath_tmp = os.path.join(os.environ['BNPYOUTDIR'], jobpath)
            if not os.path.exists(jobpath_tmp):
                raise ValueError("PATH NOT FOUND: %s" % (jobpath))
            jobpath = jobpath_tmp
    if color is None:
        color = Colors[colorID % len(Colors)]
    taskids = BNPYArgParser.parse_task_ids(jobpath, taskids)

    if yvar == 'hamming-distance':
        yspfile = os.path.join(jobpath, taskids[0], yvar + '-saved-params.txt')
        if xvar == 'laps' and os.path.isfile(yspfile):
            xvar = 'laps-saved-params'

    for tt, taskid in enumerate(taskids):
        xs = None
        ys = None
        laps = None

        try:
            var_ext = ''
            ytxtfile = os.path.join(jobpath, taskid, yvar + '.txt')
            if not os.path.isfile(ytxtfile):
                var_ext = '-saved-params'
                ytxtfile = os.path.join(
                    jobpath, taskid, yvar + var_ext + '.txt')
            ys = np.loadtxt(ytxtfile)

            if ytxtfile.count('saved-params'):
                laptxtfile = os.path.join(
                    jobpath, taskid, 'laps-saved-params.txt')
            else:
                laptxtfile = os.path.join(jobpath, taskid, 'laps.txt')
            # Load the x-axis trace from <xvar>.txt (e.g. laps.txt),
            # mirroring how ys is read above; otherwise xs stays None.
            xs = np.loadtxt(os.path.join(jobpath, taskid, xvar + '.txt'))
        except IOError as e:
            # TODO: when is this code needed?
            # xs, ys = loadXYFromTopicModelFiles(jobpath, taskid)
            try:
                if isinstance(xs, np.ndarray) and yvar.count('Keff'):
                    ys = loadKeffForTask(
                        os.path.join(jobpath, taskid), **kwargs)
                    assert xs.size == ys.size
                else:
                    # Heldout metrics
                    xs, ys = loadXYFromTopicModelSummaryFiles(
                        jobpath, taskid, xvar=xvar, yvar=yvar)
                    if showOnlyAfterLap and showOnlyAfterLap > 0:
                        laps, _ = loadXYFromTopicModelSummaryFiles(
                            jobpath, taskid, xvar='laps', yvar=yvar)
            except ValueError:
                try:
                    xs, ys = loadXYFromTopicModelSummaryFiles(jobpath, taskid)
                except ValueError:
                    raise e
        if yvar == 'hamming-distance' or yvar == 'Keff':
            if xvar == 'laps-saved-params':
                # fix off-by-one error, if we save an extra dist on final lap
                if xs.size == ys.size - 1:
                    ys = ys[:-1]
                elif ys.size == xs.size - 1:
                    xs = xs[:-1]  # fix off-by-one error, if we quit early
            elif xs.size != ys.size:
                # Try to subsample both time series at laps where they
                # intersect
                laps_x = np.loadtxt(os.path.join(jobpath, taskid, 'laps.txt'))
                laps_y = np.loadtxt(os.path.join(jobpath, taskid,
                                                 'laps-saved-params.txt'))
                assert xs.size == laps_x.size
                if ys.size == laps_y.size - 1:
                    laps_y = laps_y[:-1]
                xs = xs[np.in1d(laps_x, laps_y)]
                ys = ys[np.in1d(laps_y, laps_x)]

        if xs.size != ys.size:
            raise ValueError('Dimension mismatch. len(xs)=%d, len(ys)=%d'
                             % (xs.size, ys.size))

        # Cleanup laps data. Verify that it is sorted, with no collisions.
        if xvar == 'laps':
            diff = xs[1:] - xs[:-1]
            goodIDs = np.flatnonzero(diff >= 0)
            if len(goodIDs) < xs.size - 1:
                print('WARNING: looks like multiple runs writing to this file!')
                print(jobpath)
                print('Task: ', taskid)
                print(len(goodIDs), xs.size - 1)
                xs = np.hstack([xs[goodIDs], xs[-1]])
                ys = np.hstack([ys[goodIDs], ys[-1]])

        if xvar == 'laps' and yvar == 'evidence':
            mask = xs >= 1.0
            xs = xs[mask]
            ys = ys[mask]
        elif showOnlyAfterLap:
            # print "Filtering for data recorded at lap >= %s" % (
            #    showOnlyAfterLap)
            if laps is None:
                laps = np.loadtxt(laptxtfile)
            mask = laps >= showOnlyAfterLap
            xs = xs[mask]
            ys = ys[mask]

        # Force plot density (data points per lap) to desired specification
        # This avoids making plots that have huge file sizes,
        # due to too much content in the given display space
        if xvar == 'laps' and xs.size > 20 and np.sum(xs > 5) > 10:
            if (xs[-1] - xs[9]) != 0:
                curDensity = (xs.size - 10) / (xs[-1] - xs[9])
            else:
                curDensity = density
            while curDensity > density and xs.size > 11:
                # Thin xs and ys data by a factor of 2
                # while preserving the first 10 data points
                xs = np.hstack([xs[:10], xs[10::2]])
                ys = np.hstack([ys[:10], ys[10::2]])
                curDensity = (xs.size - 10) / (xs[-1] - xs[9])

        plotargs = dict(
            markersize=markersize,
            linewidth=linewidth,
            linestyle=linestyle,
            label=None,
            color=color, markeredgecolor=color)
        for key in kwargs:
            if key in plotargs:
                plotargs[key] = kwargs[key]
        if tt == 0:
            plotargs['label'] = label

        pylab.plot(xs, ys, **plotargs)
        if drawLineToXMax:
            xs_dashed = np.asarray([xs[-1], drawLineToXMax])
            ys_dashed = np.asarray([ys[-1], ys[-1]])
            plotargs['label'] = None
            pylab.plot(xs_dashed, ys_dashed, '--', **plotargs)


    pylab.xlabel(LabelMap[xvar])
    if yvar in LabelMap:
        yLabelStr = LabelMap[yvar]
        if yvar == 'Keff' and 'effCountThr' in kwargs:
            effCountThr = float(kwargs['effCountThr'])
            yLabelStr = yLabelStr + ' > %s' % (str(effCountThr))
        pylab.ylabel(yLabelStr)
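The density-control block above thins long lap traces so saved figures stay small: the first 10 points are always kept, and the remainder is halved until the points-per-lap density drops to the target. A standalone sketch of that thinning rule on synthetic data, using the same logic:

import numpy as np

def thin_trace(xs, ys, density=2):
    # Keep the first 10 points, then drop every other later point until
    # the average number of points per lap is at most `density`.
    if xs.size > 20 and np.sum(xs > 5) > 10 and (xs[-1] - xs[9]) != 0:
        curDensity = (xs.size - 10) / (xs[-1] - xs[9])
        while curDensity > density and xs.size > 11:
            xs = np.hstack([xs[:10], xs[10::2]])
            ys = np.hstack([ys[:10], ys[10::2]])
            curDensity = (xs.size - 10) / (xs[-1] - xs[9])
    return xs, ys

laps = np.linspace(0, 100, 801)            # about 8 recorded points per lap
xs, ys = thin_trace(laps, np.log(1 + laps))
print(xs.size)                             # far fewer points, density <= 2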