def parse_args(xvar='laps', yvar='evidence'):
    ''' Returns dict of parsed arguments retrieved from command line
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('dataName', type=str, default='AsteriskK8')
    parser.add_argument('jpath', type=str, default='demo*')
    parser.add_argument('--xvar', type=str, default=xvar,
                        choices=list(LabelMap.keys()),
                        help="name of x axis variable to plot.")
    parser.add_argument('--yvar', type=str, default=yvar,
                        # choices=list(LabelMap.keys()),
                        help="name of y axis variable to plot.")
    helpMsg = "ids of trials/runs to plot from given job." + \
              " Example: '4' or '1,2,3' or '2-6'."
    parser.add_argument('--taskids', type=str, default=None, help=helpMsg)
    parser.add_argument(
        '--savefilename', type=str, default=None,
        help="location where to save figure (absolute path directory)")
    args, unkList = parser.parse_known_args()
    argDict = BNPYArgParser.arglist_to_kwargs(unkList, doConvertFromStr=False)
    argDict.update(args.__dict__)
    argDict['jpathPattern'] = os.path.join(os.environ['BNPYOUTDIR'],
                                           args.dataName,
                                           args.jpath)
    del argDict['dataName']
    del argDict['jpath']
    return argDict
def parse_args():
    ''' Returns dict of parsed arguments retrieved from command line
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('dataName', type=str, default='AsteriskK8')
    parser.add_argument('jpath', type=str, default='demo*')
    helpMsg = "ids of trials/runs to plot from given job." + \
              " Example: '4' or '1,2,3' or '2-6'."
    parser.add_argument('--taskids', type=str, default=None, help=helpMsg)
    parser.add_argument(
        '--savefilename', type=str, default=None,
        help="location where to save figure (absolute path directory)")
    parser.add_argument('--fileSuffix', type=str, default='PredLik.mat')
    args, unkList = parser.parse_known_args()
    argDict = BNPYArgParser.arglist_to_kwargs(unkList)
    argDict.update(args.__dict__)
    argDict['jpathPattern'] = os.path.join(os.environ['BNPYOUTDIR'],
                                           args.dataName,
                                           args.jpath)
    del argDict['dataName']
    del argDict['jpath']
    return argDict
def parse_args(**kwargs):
    ''' Returns dict of parsed arguments retrieved from command line
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('dataName', type=str, default='AsteriskK8')
    parser.add_argument('jpathPattern', type=str, default='demo*')
    parser.add_argument('--xvar', type=str, default=None,
                        help="name of x axis variable to plot.")
    parser.add_argument('--yvar', type=str, default='evidence',
                        choices=list(LabelMap.keys()),
                        help="name of y axis variable to plot.")
    parser.add_argument('--lvar', type=str, default=None,
                        help="quantity that varies across lines")
    parser.add_argument('--pvar', type=str, default=None,
                        help="quantity that varies across subplots")
    parser.add_argument('--taskids', type=str, default='all',
                        help="specify which tasks to plot (all, .best, .worst, etc.)")
    parser.add_argument(
        '--savefilename', type=str, default=None,
        help="location where to save figure (absolute path directory)")
    args, unkList = parser.parse_known_args()
    argDict = BNPYArgParser.arglist_to_kwargs(unkList)
    argDict.update(args.__dict__)
    argDict.update(kwargs)
    argDict['jpathPattern'] = os.path.join(os.environ['BNPYOUTDIR'],
                                           args.dataName,
                                           args.jpathPattern)
    del argDict['dataName']
    for key in argDict:
        if key.endswith('vals'):
            if not isinstance(argDict[key], list):
                argDict[key] = argDict[key].split(',')
    return argDict
def parse_args():
    ''' Parse cmd line arguments
    '''
    parser = argparse.ArgumentParser()
    BNPYArgParser.addRequiredVizArgsToParser(parser)
    BNPYArgParser.addStandardVizArgsToParser(parser)
    parser.add_argument('--lap', default=None, type=float,
                        help="Specific lap at which to plot parameters."
                             + " If exact lap not available, instead plots nearest lap.")
    parser.add_argument('--doPlotData', action='store_true', default=False,
                        help="If present, also plot training data.")
    parser.add_argument('--doPlotTruth', action='store_true', default=False,
                        help="If present, also plot true model params that generated data.")
    parser.add_argument('--doSort', action='store_true', default=False,
                        help="If present, sort parameters by global appearance probabilities.")
    args = parser.parse_args()
    return args
def parse_args():
    ''' Returns Namespace of parsed arguments retrieved from command line
    '''
    parser = argparse.ArgumentParser()
    BNPYArgParser.addRequiredVizArgsToParser(parser)
    BNPYArgParser.addStandardVizArgsToParser(parser)
    parser.add_argument('--xvar', type=str, default='laps',
                        help="name of x axis variable to plot. one of {iters,laps,times}")
    parser.add_argument('--traceEvery', type=str, default=None,
                        help="Specifies how often to plot data points. For example, "
                             "traceEvery=10 only plots data points associated with "
                             "laps divisible by 10.")
    parser.add_argument('--legendnames', type=str, default=None,
                        help="optional names to show on legend in place of jobnames")
    args = parser.parse_args()
    args.algNames = args.algNames.split(',')
    args.jobnames = args.jobnames.split(',')
    if args.legendnames is not None:
        args.legendnames = args.legendnames.split(',')
        # assert len(args.legendnames) == len(args.jobnames) * len(args.algNames)
    return args
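
# The '--taskids' options above accept a compact spec string such as '4',
# '1,2,3', or '2-6' (see the helpMsg text). The actual expansion is handled by
# BNPYArgParser.parse_task_ids, which is not shown here; the function below is
# only a hypothetical sketch of that convention, for illustration.
def _expand_taskid_spec_sketch(spec):
    ''' Expand a taskids spec string into a list of task id strings.

    Assumed convention: '4' -> ['4'],
    '1,2,3' -> ['1', '2', '3'], '2-6' -> ['2', '3', '4', '5', '6'].
    '''
    ids = list()
    for part in spec.split(','):
        if '-' in part:
            lo, hi = part.split('-')
            ids.extend(str(i) for i in range(int(lo), int(hi) + 1))
        else:
            ids.append(part.strip())
    return ids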
def plot_all_tasks_for_job(jobpath, args, jobname=None, color=None):
    ''' Create line plot in current matplotlib figure
        for each task/run of the designated jobpath
    '''
    if not os.path.exists(jobpath):
        raise ValueError("No such path: %s" % (jobpath))
    taskids = BNPYArgParser.parse_task_ids(jobpath, args.taskids)
    xAll = list()
    yAll = list()
    xLocs = list()
    yLocs = list()
    for tt, taskid in enumerate(taskids):
        xs = np.loadtxt(os.path.join(jobpath, taskid, args.xvar + '.txt'))
        ys = np.loadtxt(os.path.join(jobpath, taskid, 'evidence.txt'))
        # remove first lap of moVB, since ELBO is not accurate
        if jobpath.count('moVB') > 0 and args.xvar == 'laps':
            mask = xs >= 1.0
            xs = xs[mask]
            ys = ys[mask]
        if args.traceEvery is not None:
            mask = bnpy.util.isEvenlyDivisibleFloat(xs, args.traceEvery)
            xs = xs[mask]
            ys = ys[mask]
        plotargs = dict(markersize=10, linewidth=2, label=None,
                        color=color, markeredgecolor=color)
        if tt == 0:
            plotargs['label'] = jobname
        pylab.plot(xs, ys, '.-', **plotargs)
        if len(ys) > 0:
            xLocs.append(xs[-1])
            yLocs.append(ys[-1])
            yAll.extend(ys[1:])
            xAll.extend(xs[1:])
    # Zoom in to the useful part of the ELBO trace
    if len(yAll) > 0:
        global YMin, YMax
        ymin = np.percentile(yAll, 1)
        ymax = np.max(yAll)
        if YMin is None:
            YMin = ymin
            YMax = ymax
        else:
            YMin = np.minimum(ymin, YMin)
            YMax = np.maximum(YMax, ymax)
        blankmargin = 0.08 * (YMax - YMin)
        pylab.ylim([YMin, YMax + blankmargin])
    pylab.xlabel(XLabelMap[args.xvar])
    pylab.ylabel('log evidence')
def plot_all_tasks_for_job(jobpath, args, jobname=None, color=None):
    ''' Create line plot in current matplotlib figure
        for each task/run of the designated jobpath
    '''
    if not os.path.exists(jobpath):
        raise ValueError("No such path: %s" % (jobpath))
    taskids = BNPYArgParser.parse_task_ids(jobpath, args.taskids)
    xAll = list()
    yAll = list()
    xLocs = list()
    yLocs = list()
    for tt, taskid in enumerate(taskids):
        xs = np.loadtxt(os.path.join(jobpath, taskid, args.xvar + '.txt'))
        try:
            ys = np.loadtxt(os.path.join(jobpath, taskid, 'K.txt'))
        except IOError:
            # Fixed-K runs do not write K.txt; read K from the saved prior instead
            MatDict = scipy.io.loadmat(
                os.path.join(jobpath, taskid, 'AllocPrior.mat'))
            Kfixed = int(MatDict['K'])
            ys = Kfixed * np.ones(len(xs))
        if args.traceEvery is not None:
            mask = bnpy.util.isEvenlyDivisibleFloat(xs, args.traceEvery)
            xs = xs[mask]
            ys = ys[mask]
        plotargs = dict(markersize=10, linewidth=2, label=None,
                        color=color, markeredgecolor=color)
        if tt == 0:
            plotargs['label'] = jobname
        pylab.plot(xs, ys, '.-', **plotargs)
        if len(ys) > 0:
            xLocs.append(xs[-1])
            yLocs.append(ys[-1])
            yAll.extend(ys[1:])
            xAll.extend(xs[1:])
    # Zoom in to the useful part of the K trace
    if len(yAll) > 0:
        global YMax
        ymax = np.max(yAll)
        if YMax is None:
            YMax = ymax
        else:
            YMax = np.maximum(YMax, ymax)
        blankmargin = 0.05 * YMax
        pylab.ylim([0, YMax + blankmargin])
    pylab.xlabel(XLabelMap[args.xvar])
    pylab.ylabel('K')
def parse_jobpath_and_taskids(args):
    rootpath = os.path.join(os.environ['BNPYOUTDIR'], args.dataName,
                            args.allocModelName, args.obsModelName)
    jobpath = os.path.join(rootpath, args.algNames, args.jobnames)
    if not os.path.exists(jobpath):
        raise ValueError("No such path: %s" % (jobpath))
    taskids = BNPYArgParser.parse_task_ids(jobpath, args.taskids)
    # Verify that the intended savefile will work as expected!
    if args.savefilename is not None:
        if args.savefilename.count('%') and len(taskids) > 1:
            try:
                args.savefilename % ('1')
            except TypeError:
                raise ValueError(
                    "Missing or bad format string in savefilename %s"
                    % (args.savefilename))
    return jobpath, taskids
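
# parse_jobpath_and_taskids above only checks that a '%' placeholder in
# --savefilename can be filled with a task id string. A minimal sketch of how
# such a per-task filename might then be produced; the helper name and exact
# behavior here are illustrative assumptions, not the library's own API.
def _savefilename_for_task_sketch(savefilename, taskid):
    ''' Fill the '%s' placeholder (if any) with the task id,
        e.g. 'fig-task%s.png' -> 'fig-task1.png'.
    '''
    if savefilename is None:
        return None
    if savefilename.count('%'):
        return savefilename % (str(taskid))
    return savefilename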
def plotCompsForJob(jobpath='', taskids=[1], lap=None, **kwargs):
    ''' Show plot of learned clusters from run(s) saved results on disk
    '''
    # Verify given absolute path is valid.
    jobpath_originalarg = jobpath
    if not os.path.isdir(jobpath):
        # Fallback: try to prepend BNPYOUTDIR to handle "shortcut" names
        jobpath = os.path.join(os.environ['BNPYOUTDIR'], jobpath)
    if not os.path.isdir(jobpath):
        raise ValueError('Not valid path: ' + jobpath_originalarg)
    taskids = BNPYArgParser.parse_task_ids(jobpath, taskids)
    for tt, taskid in enumerate(taskids):
        if tt == 0 and isinstance(taskid, str):
            if taskid.startswith('.'):
                # Special ids like '.best' require ranking the tasks first
                rankTasksForSingleJobOnDisk(jobpath)
        taskpath = os.path.join(jobpath, str(taskid))
        plotCompsForTask(taskpath, lap=lap, **kwargs)
    if 'block' in kwargs:
        pylab.show(block=kwargs['block'])
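
# Hypothetical usage sketch for plotCompsForJob. The jobpath string is an
# illustrative placeholder (relative names fall back to $BNPYOUTDIR via the
# function's own fallback logic), and '.best' relies on the ranking step
# triggered above for taskids that start with '.'.
def _demo_plotCompsForJob_sketch():
    plotCompsForJob(jobpath='AsteriskK8/demo-job',  # resolved under BNPYOUTDIR
                    taskids='.best',                # plot the top-ranked run
                    lap=None,                       # lap choice left to plotCompsForTask
                    block=True)                     # forwarded to pylab.show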
        legNames = ['%s=%s' % (plotkey, x) for x in RangeMap[plotkey]]
        # Build list of final jpaths in order of decided legend
        keepListFinal = list()
        for x in RangeMap[plotkey]:
            for jID, jdict in enumerate(keepListD):
                if jdict[plotkey] == x:
                    keepListFinal.append(keepListP[jID])
    else:
        keepListFinal = keepListP[:1]
        legNames = [None]
    if verbose:
        print('\nLegend entries for selected jobs (auto-selected)')
        for name in legNames:
            print(name)
    return keepListFinal, legNames


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('dataName', default='AsteriskK8')
    parser.add_argument('jobName', default='bm')
    args, unkList = parser.parse_known_args()
    reqDict = BNPYArgParser.arglist_to_kwargs(unkList, doConvertFromStr=False)
    jpath = os.path.join(os.environ['BNPYOUTDIR'], args.dataName, args.jobName)
    keepJobs, legNames = filterJobs(jpath, verbose=1, **reqDict)
def plot_all_tasks_for_job(jobpath, label, taskids=None,
                           lineType='.-',
                           spreadLineType='--',
                           color=None,
                           yvar='avgLikScore',
                           xvar='laps',
                           markersize=10,
                           linewidth=2,
                           minLap=0,
                           showFinalPt=0,
                           fileSuffix='PredLik.mat',
                           xjitter=None,
                           prefix='predlik',
                           colorID=0,
                           **kwargs):
    ''' Create line plot in current figure for each task/run of jobpath
    '''
    if not os.path.exists(jobpath):
        print('PATH NOT FOUND', jobpath)
        return None
    # Normalize yvar to the on-disk naming convention, e.g. 'Lik' -> 'avgLikScore'
    if not yvar.startswith('avg') and yvar.count('Kactive') == 0:
        yvar = 'avg' + yvar
    if not yvar.endswith('Score') and yvar.count('Kactive') == 0:
        yvar = yvar + 'Score'
    if color is None:
        color = Colors[colorID % len(Colors)]
    taskids = BNPYArgParser.parse_task_ids(jobpath, taskids)
    for tt, taskid in enumerate(taskids):
        taskoutpath = os.path.join(jobpath, taskid)
        hpaths = glob.glob(os.path.join(taskoutpath, '*' + fileSuffix))
        txtpaths = glob.glob(os.path.join(taskoutpath, 'predlik-*.txt'))
        ys_hi = None
        ys_lo = None
        if len(txtpaths) > 0:
            if fileSuffix.endswith('.txt'):
                suffix = '-' + fileSuffix
            else:
                suffix = '.txt'
            if xvar.count('lap'):
                xs = np.loadtxt(
                    os.path.join(taskoutpath, prefix + '-lapTrain.txt'))
            elif xvar.count('K'):
                xs = np.loadtxt(os.path.join(taskoutpath, prefix + '-K.txt'))
            elif xvar.count('time'):
                xs = np.loadtxt(
                    os.path.join(taskoutpath, prefix + '-timeTrain.txt'))
            else:
                raise ValueError("Unrecognized xvar: " + xvar)
            if yvar.count('Kactive') and not yvar.count('Percentile'):
                ys = np.loadtxt(os.path.join(
                    taskoutpath, prefix + '-' + yvar + 'Percentile50.txt'))
                ys_lo = np.loadtxt(os.path.join(
                    taskoutpath, prefix + '-' + yvar + 'Percentile10.txt'))
                ys_hi = np.loadtxt(os.path.join(
                    taskoutpath, prefix + '-' + yvar + 'Percentile90.txt'))
            else:
                ys = np.loadtxt(
                    os.path.join(taskoutpath, prefix + '-' + yvar + suffix))
            if minLap > 0 and taskoutpath.count('fix'):
                # Mask out early laps; lap values live in the '-lapTrain.txt' file
                laps = np.loadtxt(
                    os.path.join(taskoutpath, prefix + '-lapTrain.txt'))
                mask = laps > minLap
                xs = xs[mask]
                ys = ys[mask]
        elif len(hpaths) > 0:
            hpaths.sort()
            basenames = [x.split(os.path.sep)[-1] for x in hpaths]
            xs = np.asarray([float(x[3:11]) for x in basenames])
            ys = np.zeros_like(xs)
            for ii, hpath in enumerate(hpaths):
                MatVars = scipy.io.loadmat(hpath)
                ys[ii] = float(MatVars['avgPredLL'])
        else:
            raise ValueError(
                'Pred Lik data unavailable for job\n' + taskoutpath)
        plotargs = dict(markersize=markersize,
                        linewidth=linewidth,
                        label=None,
                        color=color,
                        markeredgecolor=color)
        plotargs.update(kwargs)
        if tt == 0:
            plotargs['label'] = label
        if xjitter is not None:
            xs = xs + xjitter
        pylab.plot(xs, ys, lineType, **plotargs)
        if ys_lo is not None:
            del plotargs['label']
            pylab.plot(xs, ys_lo, spreadLineType, **plotargs)
            pylab.plot(xs, ys_hi, spreadLineType, **plotargs)
        if showFinalPt:
            pylab.plot(xs[-1], ys[-1], '.', **plotargs)
    pylab.xlabel(XLabelMap[xvar])
    pylab.ylabel(YLabelMap[yvar])
def run(dataName=None, allocModelName=None, obsModelName=None, algName=None,
        doSaveToDisk=True, doWriteStdOut=True, taskID=None, **kwargs):
    ''' Fit specified model to data with learning algorithm.

    Usage
    -------
    To fit a Gauss MixModel to a custom dataset defined in matrix X
    >> Data = bnpy.data.XData(X)
    >> hmodel = run(Data, 'MixModel', 'Gauss', 'EM', K=3, nLap=10)

    To load a dataset specified in a specific script
    For example, 2D toy data in demodata/AsteriskK8.py
    >> hmodel = run('AsteriskK8', 'MixModel', 'Gauss', 'VB', K=3)

    To run 5 tasks (separate initializations) and get best of 5 runs:
    >> opts = dict(K=8, nLap=100, printEvery=0)
    >> hmodel = run('AsteriskK8', 'MixModel', 'Gauss', 'VB', nTask=5, **opts)

    Args
    -------
    dataName : either one of
        * bnpy Data object,
        * string filesystem path of Data module within BNPYDATADIR
    allocModelName : string name of allocation (latent structure) model
        {MixModel, DPMixModel, AdmixModel, HMM, etc.}
    obsModelName : string name of observation (likelihood) model
        {Gauss, ZMGauss, WordCount, etc.}
    **kwargs : keyword args defining properties of the model or alg
        see Doc for details [TODO]

    Returns
    -------
    hmodel : best model fit to the dataset (across nTask runs)
    LP : local parameters of that best model on the dataset
    evBound : log evidence (ELBO) for the best model on the dataset
        scalar, real value where larger value implies better model
    '''
    hasReqArgs = dataName is not None
    hasReqArgs &= allocModelName is not None
    hasReqArgs &= obsModelName is not None
    hasReqArgs &= algName is not None

    if hasReqArgs:
        ReqArgs = dict(dataName=dataName, allocModelName=allocModelName,
                       obsModelName=obsModelName, algName=algName)
    else:
        ReqArgs = BNPYArgParser.parseRequiredArgs()
        dataName = ReqArgs['dataName']
        allocModelName = ReqArgs['allocModelName']
        obsModelName = ReqArgs['obsModelName']
        algName = ReqArgs['algName']

    KwArgs, UnkArgs = BNPYArgParser.parseKeywordArgs(ReqArgs, **kwargs)
    jobname = KwArgs['OutputPrefs']['jobname']
    if taskID is None:
        starttaskid = KwArgs['OutputPrefs']['taskid']
    else:
        starttaskid = taskID
        KwArgs['OutputPrefs']['taskid'] = taskID
    nTask = KwArgs['OutputPrefs']['nTask']

    bestInfo = None
    bestEvBound = -np.inf
    for taskid in range(starttaskid, starttaskid + nTask):
        hmodel, LP, Info = _run_task_internal(
            jobname, taskid, nTask,
            ReqArgs, KwArgs, UnkArgs,
            dataName, allocModelName, obsModelName, algName,
            doSaveToDisk, doWriteStdOut)
        if (Info['evBound'] > bestEvBound):
            bestModel = hmodel
            bestLP = LP
            bestEvBound = Info['evBound']
            bestInfo = Info
    return bestModel, bestLP, bestInfo
def plotSingleJob(dataName, jobname, taskids='1', lap=None,
                  showELBOInTitle=True,
                  cmap='gray',
                  title='',
                  mixZs=False):
    ''' Visualize results of single run
    '''
    # Parse the jobpath, and create example task paths
    jobpath = os.path.join(os.path.expandvars('$BNPYOUTDIR'),
                           dataName, jobname)
    if isinstance(taskids, str):
        taskids = BNPYArgParser.parse_task_ids(jobpath, taskids)
    elif isinstance(taskids, int):
        taskids = [str(taskids)]
    taskpath = os.path.join(jobpath, taskids[0])

    # Load data, with same dataset size prefs as specified at inference time.
    dataKwargs = bnpy.ioutil.DataReader.loadDataKwargsFromDisk(taskpath)
    Data = bnpy.ioutil.DataReader.loadDataFromSavedTask(taskpath)
    AdjMat = np.squeeze(Data.toAdjacencyMatrix())
    if hasattr(Data, 'TrueParams'):
        if 'nodeZ' in Data.TrueParams:
            sortids = np.argsort(Data.TrueParams['nodeZ'])
            print('Sorting nodes by true labels...')
        elif 'pi' in Data.TrueParams:
            sortids = np.argsort(Data.TrueParams['pi'].argmax(axis=1))
        else:
            sortids = np.arange(AdjMat.shape[0])
    else:
        sortids = np.arange(AdjMat.shape[0])
    # Rearrange the rows/cols of AdjMat
    AdjMat = AdjMat[sortids, :]
    AdjMat = AdjMat[:, sortids]
    if hasattr(Data, 'nodeNames'):
        nodeNames = [Data.nodeNames[s] for s in sortids]
    else:
        nodeNames = None

    # Show the true adj mat and the estimated side-by-side
    # First, the true adjacency matrix
    ncols = len(taskids) + 1
    pylab.subplots(nrows=1, ncols=ncols, figsize=(3 * ncols, 3))
    pylab.subplot(1, ncols, 1)
    pylab.imshow(AdjMat, cmap='Greys', interpolation='nearest',
                 vmin=0, vmax=1)
    if nodeNames is not None and len(nodeNames) < 25:
        pylab.gca().set_yticks(np.arange(len(nodeNames)))
        pylab.gca().set_yticklabels(nodeNames)

    for tt, taskid in enumerate(taskids):
        taskoutpath = os.path.join(jobpath, taskid) + os.path.sep
        # Load the model for the current task at specified lap
        hmodel, curLap = bnpy.ioutil.ModelReader.loadModelForLap(
            taskoutpath, lap)
        # Compute expected state-state edge prob matrix Ew
        Ew = hmodel.obsModel.Post.lam1 / \
            (hmodel.obsModel.Post.lam1 + hmodel.obsModel.Post.lam0)
        isAssortative = str(type(hmodel.allocModel)).count('Assort')
        if isAssortative:
            K = hmodel.allocModel.K
            Ew_tmp = hmodel.allocModel.epsilon * np.ones((K, K, Ew.shape[-1]))
            for k in range(K):
                Ew_tmp[k, k] = Ew[k]
            Ew = Ew_tmp

        taskAdjMat = np.zeros((Data.nNodes, Data.nNodes, Data.dim))
        useLP = 0
        if useLP:
            LP = hmodel.calc_local_params(Data)
            for eid, (s, t) in enumerate(Data.edges):
                resp_st = LP['resp'][eid]
                if isAssortative:
                    taskAdjMat[s, t] = np.sum(
                        resp_st[:, np.newaxis] * Ew, axis=0)
                else:
                    assert np.allclose(resp_st.sum(), 1.0)
                    taskAdjMat[s, t] = np.sum(
                        resp_st[:, :, np.newaxis] * Ew, axis=(0, 1))
        else:
            Epi = np.exp(hmodel.allocModel.E_logPi())
            for eid, (s, t) in enumerate(Data.edges):
                for d in range(Data.dim):
                    taskAdjMat[s, t, d] = np.inner(
                        Epi[s, :], np.dot(Ew[:, :, d], Epi[t, :]))
        assert taskAdjMat.min() >= 0
        assert taskAdjMat.max() <= 1.0
        taskAdjMat = np.squeeze(taskAdjMat)
        taskAdjMat = taskAdjMat[sortids, :]
        taskAdjMat = taskAdjMat[:, sortids]
        pylab.subplot(1, ncols, 2 + tt)
        pylab.imshow(taskAdjMat, cmap='Greys', interpolation='nearest',
                     vmin=0, vmax=1)
def plotSingleJob(
        dataset, jobname, taskids='1', lap='final',
        sequences=[1],
        showELBOInTitle=False,
        dispTrue=True,
        aspectFactor=4.0,
        specialStateIDs=None,
        seqNames=None,
        cmap='Set1',
        maxT=None,
        colorManyToOne=False,
        ):
    ''' Show estimated state sequences for the specified run(s).

        If dispTrue = True, the true labels will be shown underneath the
        estimated labels.

        Returns (axes, zHatBySeq).
    '''
    # Make sequences zero-indexed
    if isinstance(sequences, str):
        sequences = np.asarray([int(x) for x in sequences.split(',')],
                               dtype=np.int32)
    sequences = np.asarray(sequences, dtype=np.int32)
    if np.min(sequences) < 1:
        raise ValueError('Sequences need to be one-indexed.\n' +
                         'Valid values are 1,2,...N.')
    sequences -= 1

    # Determine the jobpath and taskids
    jobpath = os.path.join(os.path.expandvars('$BNPYOUTDIR'),
                           dataset, jobname)
    if isinstance(taskids, str):
        if taskids.startswith('.'):
            taskids = [taskids]
        else:
            taskids = BNPYArgParser.parse_task_ids(jobpath, taskids)
    elif isinstance(taskids, int):
        taskids = [str(taskids)]

    datasetPrefFile = os.path.join(
        jobpath, taskids[0], 'args-DatasetPrefs.txt')
    datasetPrefs = dict()
    if os.path.exists(datasetPrefFile):
        with open(datasetPrefFile, 'r') as f:
            for line in f.readlines():
                fields = line.strip().split(' ')
                if len(fields) != 2:
                    continue
                datasetPrefs[fields[0]] = fields[1]

    # Load Data from its python module
    Datamod = imp.load_source(
        dataset, os.path.expandvars('$BNPYDATADIR/' + dataset + '.py'))
    if dataset == 'SpeakerDiar':
        if len(sequences) > 1:
            raise ValueError(
                'Joint modeling of several sequences makes no sense')
        Data = Datamod.get_data(meetingNum=sequences[0] + 1, **datasetPrefs)
        jobpath = jobpath.replace('SpeakerDiar',
                                  'SpeakerDiar' + str(sequences[0] + 1))
        sequences[0] = 0
    else:
        Data = Datamod.get_data(**datasetPrefs)

    # Determine the maximum length among any of the sequences to be plotted
    if maxT is None:
        Ts = Data.doc_range[sequences + 1] - Data.doc_range[sequences]
        maxT = np.max(Ts)

    # Define the number of pixels used by vertical space of figure
    NUM_STACK = int(np.ceil(maxT / float(aspectFactor)))
    if dispTrue:
        NUM_STACK = NUM_STACK // 2  # integer row count for the image tiles

    f, axes = plt.subplots(len(sequences), len(taskids),
                           sharex='col', sharey='row')
    # For singleton case, make sure that axes is index-able
    if len(sequences) == 1 and len(taskids) == 1:
        axes = [axes]

    for tt, taskidstr in enumerate(taskids):
        if tt == 0 and taskidstr.startswith('.'):
            rankTasksForSingleJobOnDisk(jobpath)
        path = os.path.join(jobpath, taskidstr) + os.path.sep

        # Figure out which lap to use
        if lap == 'final':
            lapsFile = open(path + 'laps-saved-params.txt')
            curLap = lapsFile.readlines()
            curLap = float(curLap[-1])
            lapsFile.close()
        else:
            curLap = int(lap)

        if showELBOInTitle:
            hdists = np.loadtxt(os.path.join(path, 'hamming-distance.txt'))
            hlaps = np.loadtxt(os.path.join(path, 'laps-saved-params.txt'))
            Keffvals = np.loadtxt(os.path.join(path, 'Keff-saved-params.txt'))
            # Determine scalar values to display
            loc = np.argmin(np.abs(hlaps - curLap))
            hdist = hdists[loc]
            Kefffinal = Keffvals[loc]
            try:
                Kvals = np.loadtxt(os.path.join(path, 'K.txt'))
                ELBOscores = np.loadtxt(os.path.join(path, 'evidence.txt'))
                laps = np.loadtxt(os.path.join(path, 'laps.txt'))
                loc = np.argmin(np.abs(laps - curLap))
                ELBO = ELBOscores[loc]
                Kfinal = Kvals[loc]
            except IOError:
                ELBO = 0.0
                Kfinal = Kefffinal

        # Load in the saved Data from $BNPYOUTDIR
        try:
            filename = 'Lap%08.3fMAPStateSeqsAligned.mat' % curLap
            zHatBySeq = scipy.io.loadmat(path + filename)
            key1 = 'zHatBySeqAligned'
            key2 = 'zHatBySeq'
            if key1 in zHatBySeq:
                zHatBySeq = convertStateSeq_MAT2list(zHatBySeq[key1])
            elif key2 in zHatBySeq:
                zHatBySeq = convertStateSeq_MAT2list(zHatBySeq[key2])
            else:
                raise IOError
        except IOError:
            filename = 'Lap%08.3fMAPStateSeqs.mat' % curLap
            zHatBySeq = scipy.io.loadmat(path + filename)
            zHatBySeq = convertStateSeq_MAT2list(zHatBySeq['zHatBySeq'])
        if specialStateIDs is not None:
            zHatBySeq = relabelAllSequences(zHatBySeq, specialStateIDs)

        # Find maximum number of states we need to display
        nSeq = len(zHatBySeq)
        Kmax = np.max([zHatBySeq[i].max() for i in range(nSeq)])
        hasGroundTruth = False
        vmin = 0
        Kignore = 0
        if hasattr(Data, 'TrueParams') and 'Z' in Data.TrueParams:
            hasGroundTruth = True
            Kmax = np.maximum(Data.TrueParams['Z'].max(), Kmax)
            uLabels = np.unique(Data.TrueParams['Z'])
            Kignore = np.sum(uLabels < 0)
            if Kignore > 0:
                for k in range(1, Kignore + 1):
                    print('ignoring state %d Ttrue = %d' % (
                        -k, np.sum(Data.TrueParams['Z'] == -k)))

        if colorManyToOne:
            # For each state in zHat, find best true sequence
            Zflat = convertStateSeq_list2flat(zHatBySeq, Data)
            ZflatA = -1 * np.ones_like(Zflat)
            for uID in np.unique(Zflat):
                overlap = np.zeros(uLabels.size)
                for ii, trueID in enumerate(uLabels):
                    overlap[ii] = np.sum(
                        np.logical_and(Data.TrueParams['Z'] == trueID,
                                       Zflat == uID))
                bestii = overlap.argmax()
                ZflatA[Zflat == uID] = uLabels[bestii]
            zHatBySeq = convertStateSeq_flat2list(ZflatA, Data)

        # In case there's only one sequence, make sure it's index-able
        for ii, seqNum in enumerate(sequences):
            image = np.tile(zHatBySeq[seqNum], (NUM_STACK, 1))
            # Add the true labels to the image (if they exist)
            if hasGroundTruth and dispTrue:
                start = Data.doc_range[seqNum]
                stop = Data.doc_range[seqNum + 1]
                img_trueZ = np.tile(Data.TrueParams['Z'][start:stop],
                                    (NUM_STACK, 1))
                if dispTrue == 2:
                    image = img_trueZ  # Show only true labels
                else:
                    image = np.vstack((image, img_trueZ))
            image = image[:, :maxT]
            if len(sequences) == 1 or len(taskids) == 1:
                cur_ax = axes[ii + tt]
            else:
                cur_ax = axes[ii, tt]
            if hasattr(cmap, 'N'):
                vmax = cmap.N
            else:
                vmax = Kmax
            cur_ax.imshow(Kignore + image + .0001,
                          interpolation='nearest',
                          vmin=vmin, vmax=vmax,
                          cmap=cmap)
            if tt == 0:
                if seqNames is not None:
                    h = cur_ax.set_ylabel('%s' % (seqNames[ii]), fontsize=13)
                    h.set_rotation(0)
                elif len(sequences) > 4:
                    cur_ax.set_ylabel('%d' % (seqNum + 1), fontsize=13)
                else:
                    cur_ax.set_ylabel('Seq. %d' % (seqNum + 1), fontsize=13)
            if ii == 0:
                if showELBOInTitle:
                    fmtSpec = "ELBO: %.3f K=%d Keff=%d "
                    if hdist > 0.01:
                        fmtSpec += "dist=%.2f"
                    elif hdist > 0.001:
                        fmtSpec += "dist=%.3f"
                    else:
                        fmtSpec += "dist=%.4f"
                    title = fmtSpec % (ELBO, Kfinal, Kefffinal, hdist)
                    cur_ax.set_title(title)
            cur_ax.set_xlim([0, maxT])
            cur_ax.set_ylim([0, image.shape[0]])
            cur_ax.set_yticks([])
        # ... end loop over sequences
    return axes, zHatBySeq
def plotSingleLineAcrossJobsByXVar(jpathPattern,
                                   label='',
                                   xvar=None,
                                   xvals=None,
                                   xlabel=None,
                                   yvar='evidence',
                                   lineStyle='.-',
                                   taskids='all',
                                   lineID=0,
                                   lvar='',
                                   **kwargs):
    ''' Create line plot in current figure for job matching the pattern

    Iterates over each xval in provided list of values.
    Each one corresponds to a single saved job.

    Post Condition
    --------------
    Current axes have one line added.
    '''
    prefixfilepath = os.path.sep.join(jpathPattern.split(os.path.sep)[:-1])
    PPListMap = makePPListMapFromJPattern(jpathPattern)

    if xvals is None:
        xvals = PPListMap[xvar]

    xs = np.zeros(len(xvals))
    ys = np.zeros(len(xvals))

    jpathList = makeListOfJPatternsWithSpecificVals(
        PPListMap,
        prefixfilepath=prefixfilepath,
        key=xvar,
        vals=xvals,
        **kwargs)

    plotargs = copy.deepcopy(DefaultLinePlotKwArgs)
    # Plot all tasks as faint points with no connections
    for i, jobpath in enumerate(jpathList):
        if not os.path.exists(jobpath):
            raise ValueError("PATH NOT FOUND: %s" % (jobpath))
        x = float(xvals[i])
        for key in plotargs:
            if key in kwargs:
                plotargs[key] = kwargs[key]
        plotargs['markeredgecolor'] = plotargs['color']
        alltaskids = BNPYArgParser.parse_task_ids(jobpath, taskids)
        for tid in alltaskids:
            y = loadYValFromDisk(jobpath, tid, yvar=yvar)
            pylab.plot(x, y, '.', **plotargs)

    # Plot top-ranked tasks as solid points connected by line
    for i, jobpath in enumerate(jpathList):
        rankTasksForSingleJobOnDisk(os.path.join(jobpath))
        x = float(xvals[i])
        y = loadYValFromDisk(jobpath, '.best', yvar=yvar)
        assert isinstance(x, float)
        assert isinstance(y, float)
        xs[i] = x
        ys[i] = y

    plotargs = copy.deepcopy(DefaultLinePlotKwArgs)
    for key in plotargs:
        if key in kwargs:
            plotargs[key] = kwargs[key]
    plotargs['markeredgecolor'] = plotargs['color']
    plotargs['label'] = label
    pylab.plot(xs, ys, lineStyle, **plotargs)

    if lineID == 0:
        if xlabel is None:
            xlabel = xvar
        pylab.xlabel(xlabel)
        pylab.ylabel(LabelMap[yvar])
def run(dataName=None, allocModelName=None, obsModelName=None, algName=None,
        doSaveToDisk=True, doWriteStdOut=True, taskID=None, **kwargs):
    """ Fit specified model to data with learning algorithm.

    Args
    -------
    dataName : either one of
        * bnpy Data object,
        * string name of python file within BNPYDATADIR
    allocModelName : string name of allocation (latent structure) model
    obsModelName : string name of observation (likelihood) model
    **kwargs : keyword args defining properties of the model or alg

    Returns
    -------
    hmodel : best model fit to the dataset (across nTask runs)
    Info : dict of information about this best model
    """
    hasReqArgs = dataName is not None
    hasReqArgs &= allocModelName is not None
    hasReqArgs &= obsModelName is not None
    hasReqArgs &= algName is not None

    if hasReqArgs:
        ReqArgs = dict(dataName=dataName, allocModelName=allocModelName,
                       obsModelName=obsModelName, algName=algName)
    else:
        ReqArgs = BNPYArgParser.parseRequiredArgs()
        dataName = ReqArgs['dataName']
        allocModelName = ReqArgs['allocModelName']
        obsModelName = ReqArgs['obsModelName']
        algName = ReqArgs['algName']

    KwArgs, UnkArgs = BNPYArgParser.parseKeywordArgs(ReqArgs, **kwargs)
    KwArgs['OutputPrefs']['doSaveToDisk'] = doSaveToDisk
    KwArgs['OutputPrefs']['doWriteStdOut'] = doWriteStdOut
    jobname = KwArgs['OutputPrefs']['jobname']

    # Update stored numerical options via keyword args
    bnpy.util.NumericUtil.UpdateConfig(**UnkArgs)

    if taskID is None:
        starttaskid = KwArgs['OutputPrefs']['taskid']
    else:
        starttaskid = taskID
        KwArgs['OutputPrefs']['taskid'] = taskID
    nTask = KwArgs['OutputPrefs']['nTask']

    best_info_dict = None
    best_loss = np.inf
    for taskid in range(starttaskid, starttaskid + nTask):
        hmodel, info_dict = _run_task_internal(
            jobname, taskid, nTask,
            ReqArgs, KwArgs, UnkArgs,
            dataName, allocModelName, obsModelName, algName,
            doSaveToDisk, doWriteStdOut)
        if (taskid == starttaskid or info_dict['loss'] < best_loss):
            bestModel = hmodel
            best_loss = info_dict['loss']
            best_info_dict = info_dict
    return bestModel, best_info_dict
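
# A short usage sketch for run(), following the examples already given in the
# docstring of the earlier run() variant above ('AsteriskK8', 'MixModel',
# 'Gauss', 'VB'). Exact model and algorithm names depend on the bnpy version,
# so treat these strings as illustrative.
def _demo_run_best_of_five_sketch():
    hmodel, info_dict = run('AsteriskK8', 'MixModel', 'Gauss', 'VB',
                            K=8, nLap=100, nTask=5)
    # run() keeps the task whose final loss is smallest across the 5 runs
    return hmodel, info_dict['loss']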
def plot_all_tasks_for_job(jobpath, label, taskids=None, color=None,
                           colorID=0,
                           density=2,
                           yvar='evidence',
                           markersize=10,
                           linewidth=2,
                           linestyle='-',
                           drawLineToXMax=None,
                           showOnlyAfterLap=0,
                           xvar='laps',
                           **kwargs):
    ''' Create line plot in current figure for each task/run of jobpath
    '''
    if not os.path.exists(jobpath):
        if not jobpath.startswith(os.path.sep):
            jobpath_tmp = os.path.join(os.environ['BNPYOUTDIR'], jobpath)
            if not os.path.exists(jobpath_tmp):
                raise ValueError("PATH NOT FOUND: %s" % (jobpath))
            jobpath = jobpath_tmp
    if color is None:
        color = Colors[colorID % len(Colors)]
    taskids = BNPYArgParser.parse_task_ids(jobpath, taskids)

    if yvar == 'hamming-distance':
        yspfile = os.path.join(jobpath, taskids[0],
                               yvar + '-saved-params.txt')
        if xvar == 'laps' and os.path.isfile(yspfile):
            xvar = 'laps-saved-params'

    for tt, taskid in enumerate(taskids):
        xs = None
        ys = None
        laps = None
        try:
            var_ext = ''
            ytxtfile = os.path.join(jobpath, taskid, yvar + '.txt')
            if not os.path.isfile(ytxtfile):
                var_ext = '-saved-params'
                ytxtfile = os.path.join(
                    jobpath, taskid, yvar + var_ext + '.txt')
            ys = np.loadtxt(ytxtfile)
            if ytxtfile.count('saved-params'):
                laptxtfile = os.path.join(jobpath, taskid,
                                          'laps-saved-params.txt')
            else:
                laptxtfile = os.path.join(jobpath, taskid, 'laps.txt')
            # Load x-axis values from the file named after xvar, mirroring the
            # yvar lookup above (xvar may already be 'laps-saved-params').
            xtxtfile = os.path.join(jobpath, taskid, xvar + '.txt')
            if not os.path.isfile(xtxtfile):
                xtxtfile = os.path.join(
                    jobpath, taskid, xvar + var_ext + '.txt')
            xs = np.loadtxt(xtxtfile)
        except IOError as e:
            # TODO: when is this code needed?
            # xs, ys = loadXYFromTopicModelFiles(jobpath, taskid)
            try:
                if isinstance(xs, np.ndarray) and yvar.count('Keff'):
                    ys = loadKeffForTask(
                        os.path.join(jobpath, taskid), **kwargs)
                    assert xs.size == ys.size
                else:
                    # Heldout metrics
                    xs, ys = loadXYFromTopicModelSummaryFiles(
                        jobpath, taskid, xvar=xvar, yvar=yvar)
                    if showOnlyAfterLap and showOnlyAfterLap > 0:
                        laps, _ = loadXYFromTopicModelSummaryFiles(
                            jobpath, taskid, xvar='laps', yvar=yvar)
            except ValueError:
                try:
                    xs, ys = loadXYFromTopicModelSummaryFiles(jobpath, taskid)
                except ValueError:
                    raise e

        if yvar == 'hamming-distance' or yvar == 'Keff':
            if xvar == 'laps-saved-params':
                # fix off-by-one error, if we save an extra dist on final lap
                if xs.size == ys.size - 1:
                    ys = ys[:-1]
                elif ys.size == xs.size - 1:
                    xs = xs[:-1]  # fix off-by-one error, if we quit early
                elif xs.size != ys.size:
                    # Try to subsample both time series at laps where they
                    # intersect
                    laps_x = np.loadtxt(
                        os.path.join(jobpath, taskid, 'laps.txt'))
                    laps_y = np.loadtxt(
                        os.path.join(jobpath, taskid,
                                     'laps-saved-params.txt'))
                    assert xs.size == laps_x.size
                    if ys.size == laps_y.size - 1:
                        laps_y = laps_y[:-1]
                    xs = xs[np.in1d(laps_x, laps_y)]
                    ys = ys[np.in1d(laps_y, laps_x)]

        if xs.size != ys.size:
            raise ValueError('Dimension mismatch. len(xs)=%d, len(ys)=%d'
                             % (xs.size, ys.size))

        # Cleanup laps data. Verify that it is sorted, with no collisions.
        if xvar == 'laps':
            diff = xs[1:] - xs[:-1]
            goodIDs = np.flatnonzero(diff >= 0)
            if len(goodIDs) < xs.size - 1:
                print('WARNING: looks like multiple runs writing to this file!')
                print(jobpath)
                print('Task: ', taskid)
                print(len(goodIDs), xs.size - 1)
                xs = np.hstack([xs[goodIDs], xs[-1]])
                ys = np.hstack([ys[goodIDs], ys[-1]])

        if xvar == 'laps' and yvar == 'evidence':
            mask = xs >= 1.0
            xs = xs[mask]
            ys = ys[mask]
        elif showOnlyAfterLap:
            # print "Filtering for data recorded at lap >= %s" % (
            #     showOnlyAfterLap)
            if laps is None:
                laps = np.loadtxt(laptxtfile)
            mask = laps >= showOnlyAfterLap
            xs = xs[mask]
            ys = ys[mask]

        # Force plot density (data points per lap) to desired specification
        # This avoids making plots that have huge file sizes,
        # due to too much content in the given display space
        if xvar == 'laps' and xs.size > 20 and np.sum(xs > 5) > 10:
            if (xs[-1] - xs[9]) != 0:
                curDensity = (xs.size - 10) / (xs[-1] - xs[9])
            else:
                curDensity = density
            while curDensity > density and xs.size > 11:
                # Thin xs and ys data by a factor of 2
                # while preserving the first 10 data points
                xs = np.hstack([xs[:10], xs[10::2]])
                ys = np.hstack([ys[:10], ys[10::2]])
                curDensity = (xs.size - 10) / (xs[-1] - xs[9])

        plotargs = dict(
            markersize=markersize,
            linewidth=linewidth,
            linestyle=linestyle,
            label=None,
            color=color,
            markeredgecolor=color)
        for key in kwargs:
            if key in plotargs:
                plotargs[key] = kwargs[key]
        if tt == 0:
            plotargs['label'] = label
        pylab.plot(xs, ys, **plotargs)
        if drawLineToXMax:
            xs_dashed = np.asarray([xs[-1], drawLineToXMax])
            ys_dashed = np.asarray([ys[-1], ys[-1]])
            plotargs['label'] = None
            pylab.plot(xs_dashed, ys_dashed, '--', **plotargs)

    pylab.xlabel(LabelMap[xvar])
    if yvar in LabelMap:
        yLabelStr = LabelMap[yvar]
        if yvar == 'Keff' and 'effCountThr' in kwargs:
            effCountThr = float(kwargs['effCountThr'])
            yLabelStr = yLabelStr + ' > %s' % (str(effCountThr))
        pylab.ylabel(yLabelStr)
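
# The density-control loop above repeatedly halves the trace (keeping the
# first 10 points intact) until the points-per-lap rate drops to the requested
# density. Below is a standalone sketch of that thinning step, under the
# simplifying assumption that xs is a sorted lap array with more than 10
# entries; it is an illustration, not part of the plotting code itself.
import numpy as np


def _thin_trace_sketch(xs, ys, density=2):
    ''' Thin (xs, ys) until there are at most `density` points per lap,
        always preserving the first 10 points.
    '''
    xs = np.asarray(xs)
    ys = np.asarray(ys)
    while xs.size > 11 and (xs[-1] - xs[9]) != 0:
        curDensity = (xs.size - 10) / (xs[-1] - xs[9])
        if curDensity <= density:
            break
        # Keep the first 10 points, thin the remainder by a factor of 2
        xs = np.hstack([xs[:10], xs[10::2]])
        ys = np.hstack([ys[:10], ys[10::2]])
    return xs, ys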