def calcLscoreFromHardSegmentation(origmatpath, doAppendTxtFile=True):
    ''' Compute variational objective from specific segmentation (as .mat file)

    The hard state sequences stored in the .mat file are converted to local
    parameters, pushed through one global update, one local step, and a
    second global update of a model loaded from the matching memoized run,
    and the evidence of the resulting summary statistics is computed.

    Args
    ----
    origmatpath : str
        Path to a .mat file with field 'zHatBySeq'.
        The dataset name is taken to be the first path component that
        follows the $BNPYOUTDIR prefix.
    doAppendTxtFile : bool
        If True, append the score to a text file next to origmatpath.

    Returns
    -------
    Lscore : scalar
        Value of model.calc_evidence on the final summary statistics.
    model : object
        The model returned by bnpy.ioutil.ModelReader.loadModelForLap,
        after the global parameter updates performed here.

    Post Condition
    --------------
    if doAppendTxtFile, will append one line holding Lscore to
    evidence-saved-params.txt in the directory that contains origmatpath.
    '''
    # Figure out dataName: strip the output root, then keep first component
    dataName = origmatpath.replace(os.environ['BNPYOUTDIR'], '')
    if dataName.startswith(os.path.sep):
        dataName = dataName[1:]
    dataName = dataName.split(os.path.sep)[0]

    # Load train data (second returned value is not needed here)
    Data, _ = loadDatasetByName(dataName)

    # Load the estimated zs and convert to flat 1D array
    SVars = scipy.io.loadmat(origmatpath)
    estZflat = convertStateSeq_list2flat(
        convertStateSeq_MAT2list(SVars['zHatBySeq']), Data)
    initLP = bnpy.init.FromTruth.convertLPFromHardToSoft(
        dict(Z=estZflat), Data)

    # Load a model with right hyperparameters
    # Requires similar run with memoized already started.
    memomatpath = origmatpath.replace('foxHDPHMMsampler', 'bnpyHDPHMMmemo')
    memomatpath = os.path.sep.join(memomatpath.split(os.path.sep)[:-1])
    print(memomatpath)
    model, lapQ = bnpy.ioutil.ModelReader.loadModelForLap(memomatpath, 0.0)

    # Run inference one step forward
    initLP = model.allocModel.initLPFromResp(Data, initLP)
    initSS = model.get_global_suff_stats(Data, initLP)
    model.update_global_params(initSS)
    LP = model.calc_local_params(Data)
    SS = model.get_global_suff_stats(Data, LP)
    model.update_global_params(SS)

    # Compute evidence
    Lscore = model.calc_evidence(SS=SS)
    # Single pre-formatted string prints identically under Python 2 and 3
    print('%s <<< Lscore' % (Lscore,))

    # Write to text files!
    if doAppendTxtFile:
        outpathFields = origmatpath.split(os.path.sep)
        taskoutpath = os.path.sep.join(outpathFields[:-1])
        outtxtfile = os.path.join(taskoutpath, 'evidence-saved-params.txt')
        with open(outtxtfile, 'a') as f:
            f.write('%.6f\n' % (Lscore))
    return Lscore, model
def makeAlignedStateSeqsFromMATFile(origmatpath):
    ''' Align estimated state sequences to true labels and log summaries.

    Args
    ----
    origmatpath : str
        Path to a .mat file with fields 'zHatBySeq' and 'K'.
        The dataset name is taken to be the first path component that
        follows the $BNPYOUTDIR prefix.

    Post Condition
    --------------
    * A .mat file with field 'zHatBySeq' (the aligned sequences) is saved at
      origmatpath with 'MAPStateSeqs' replaced by 'MAPStateSeqsAligned'.
    * One line is appended to each of these files in the directory that
      contains origmatpath:
        hamming-distance.txt, laps-saved-params.txt,
        Keff-saved-params.txt, K-saved-params.txt
    '''
    # Figure out dataName: strip the output root, then keep first component
    dataName = origmatpath.replace(os.environ['BNPYOUTDIR'], '')
    if dataName.startswith(os.path.sep):
        dataName = dataName[1:]
    dataName = dataName.split(os.path.sep)[0]
    # Single pre-formatted string prints identically under Python 2 and 3
    print('Aligning state seqs for dataset:  %s' % (dataName,))

    # Load the estimated zs and convert to list format
    SVars = scipy.io.loadmat(origmatpath)
    estZlist = convertStateSeq_MAT2list(SVars['zHatBySeq'])

    # Find relabeling that aligns to true labels, and get back into MAT format
    alignedZlist, hdist = getAlignedStateSeqsAndHammingDist(dataName, estZlist)
    zHatBySeq = convertStateSeq_list2MAT(alignedZlist)

    # Save to MAT file
    outmatpath = origmatpath.replace('MAPStateSeqs', 'MAPStateSeqsAligned')
    scipy.io.savemat(outmatpath, dict(zHatBySeq=zHatBySeq), oned_as='rows')

    # Write to text files!
    outpathFields = origmatpath.split(os.path.sep)
    taskoutpath = os.path.sep.join(outpathFields[:-1])

    outtxtfile = os.path.join(taskoutpath, 'hamming-distance.txt')
    with open(outtxtfile, 'a') as f:
        f.write('%.6f\n' % (hdist))

    # NOTE(review): the lap is read from characters 3..8 of the file name,
    # i.e. six characters after a 3-character prefix. Presumably file names
    # look like 'Lap<number>...'; confirm this width matches the writer's
    # naming scheme, since a longer number field would be truncated here.
    lap = float(outpathFields[-1][3:9])
    outtxtfile = os.path.join(taskoutpath, 'laps-saved-params.txt')
    with open(outtxtfile, 'a') as f:
        f.write('%d\n' % (lap))

    Keff = getKeff(alignedZlist)
    outtxtfile = os.path.join(taskoutpath, 'Keff-saved-params.txt')
    with open(outtxtfile, 'a') as f:
        f.write('%d\n' % (Keff))

    # loadmat hands back arrays rather than Python scalars, so pull out the
    # single stored value explicitly instead of relying on implicit
    # array-to-int conversion inside the '%d' format.
    Kval = int(np.asarray(SVars['K']).ravel()[0])
    outtxtfile = os.path.join(taskoutpath, 'K-saved-params.txt')
    with open(outtxtfile, 'a') as f:
        f.write('%d\n' % (Kval))
    print('Done. Appended data from lap %.3f to \n %s' % (
        lap, taskoutpath))
def makeBEDFileFromSegmentation(chrData, taskpath, lapFrac,
                                ColorMap=[[255, 0, 0]]):
    """ Print .bed format lines for display at genome.ucsc.edu

    Loads <taskpath>/Lap<lapFrac>MAPStateSeqs.mat (lapFrac formatted with
    '%08.3f'), and for every sequence and every distinct state label in it
    prints one BED line describing the blocks where that state is active.

    Args
    ----
    chrData : object
        Must provide attributes endLoc_bp, startLoc_bp and stepSize_bp.
    taskpath : str
        Directory that holds the saved state sequence .mat file.
    lapFrac : float
        Lap whose saved segmentation should be converted.
    ColorMap : 2D array-like, one RGB row per color
        Colors are assigned to states cyclically, by position of the state
        among the sorted unique labels of each sequence.
        The default list is only read, never modified.

    Post Condition
    --------------
    One line per (sequence, state) pair is printed to stdout.
    NOTE(review): no file is written here and no 'browser' / 'track' header
    lines are emitted; presumably the caller redirects stdout to the final
    .bed file and supplies any header -- confirm against how this is invoked.
    """
    import scipy.io
    from bnpy.util.StateSeqUtil import convertStateSeq_MAT2list

    ColorMap = np.asarray(ColorMap)
    # Template for the leading BED columns; '???' is replaced by state id
    slinePattern = "chr7 0 " + str(chrData.endLoc_bp) + " state??? 1000 . 0 0"

    segfpath = os.path.join(taskpath, "Lap%08.3fMAPStateSeqs.mat" % (lapFrac))
    SaveVars = scipy.io.loadmat(segfpath)
    zBySeq = convertStateSeq_MAT2list(SaveVars["zHatBySeq"])

    for zHat in zBySeq:
        uLabels = np.unique(zHat)
        for uLoc, uID in enumerate(uLabels):
            sline = slinePattern.replace("???", str(uID))

            # Convert block sizes/starts from steps to base pairs
            blockSizes, blockStarts = extractBlocksFromMask(zHat == uID)
            blockSizes *= chrData.stepSize_bp
            blockStarts *= chrData.stepSize_bp
            # Only blocks after the first are shifted by the start location
            blockStarts[1:] += chrData.startLoc_bp

            # Must add final singleton block to mark the endLoc
            blockSizes = np.hstack([blockSizes, 1])
            blockStarts = np.hstack([blockStarts, chrData.endLoc_bp - 1])

            blockSizeStr = ",".join("%d" % (x) for x in blockSizes)
            blockStartStr = ",".join("%d" % (x) for x in blockStarts)

            # Cycle through the available colors
            curColorRGB = ColorMap[uLoc % ColorMap.shape[0]]
            colorStr = ",".join("%d" % (x) for x in curColorRGB)

            sline = sline + " %s %d %s %s" % (
                colorStr, len(blockSizes), blockSizeStr, blockStartStr)
            print(sline)
def plotSingleJob(
        dataset, jobname, taskids='1', lap='final',
        sequences=[1],
        showELBOInTitle=False,
        dispTrue=True,
        aspectFactor=4.0,
        specialStateIDs=None,
        seqNames=None,
        cmap='Set1',
        maxT=None,
        colorManyToOne=False):
    ''' Plot estimated state sequences for one job as stacked color images.

    Creates a grid of subplots with one row per requested sequence and one
    column per task. If ground truth labels are available and dispTrue is
    truthy, the true labels are shown underneath the estimated labels.

    Args
    ----
    dataset : str
        Name of the dataset. Used both as sub-directory of $BNPYOUTDIR and
        as the module name <dataset>.py loaded from $BNPYDATADIR.
    jobname : str
        Name of the job directory inside $BNPYOUTDIR/<dataset>/
    taskids : str, int, or list of str
        A string starting with '.' is used verbatim as a single task
        directory; any other string is handed to
        BNPYArgParser.parse_task_ids; an int is converted to a string.
    lap : 'final' or value convertible with int()
        'final' uses the last line of laps-saved-params.txt for each task.
    sequences : list of ints or comma-separated str
        ONE-indexed ids of the sequences to show. Never modified in place.
    showELBOInTitle : bool
        If True, the top row of each column gets a title with ELBO, K, Keff
        and hamming distance read from the task's text files.
    dispTrue : bool or int
        Truthy: show true labels beneath the estimate (when available).
        Exactly 2: show only the true labels.
    aspectFactor : float
        Larger values produce vertically thinner images.
    specialStateIDs : optional
        If not None, passed to relabelAllSequences to relabel estimates.
    seqNames : optional list of str
        Row labels, indexed by position within `sequences`.
    cmap : str or colormap object
        Passed to imshow. If it has attribute N, that value is used as vmax.
    maxT : optional int
        Number of timesteps to display. Defaults to the longest requested
        sequence.
    colorManyToOne : bool
        If True (and ground truth exists), each estimated state is recolored
        with the true label it overlaps most.

    Returns
    -------
    axes : matplotlib axes created by plt.subplots
        Wrapped in a list when there is exactly one sequence and one task.
    zHatBySeq : list
        Estimated state sequences (after any relabeling) for the LAST task.

    Raises
    ------
    ValueError
        If any requested sequence id is less than 1, or if several
        sequences are requested for the 'SpeakerDiar' dataset.
    '''
    # Parse comma-separated string input from the argument itself
    if isinstance(sequences, str):
        sequences = [int(x) for x in sequences.split(',')]
    # np.array always copies, so the in-place edits below can never leak
    # back into an array owned by the caller (or into the default list)
    sequences = np.array(sequences, dtype=np.int32)
    if np.min(sequences) < 1:
        raise ValueError('Sequences need to be one-index.\n' +
                         'Valid values are 1,2,...N.')
    # Make sequences zero-indexed
    sequences -= 1

    # Determine the jobpath and taskids
    jobpath = os.path.join(os.path.expandvars('$BNPYOUTDIR'),
                           dataset, jobname)
    if isinstance(taskids, str):
        if taskids.startswith('.'):
            taskids = [taskids]
        else:
            taskids = BNPYArgParser.parse_task_ids(jobpath, taskids)
    elif isinstance(taskids, int):
        taskids = [str(taskids)]

    # Read "key value" pairs (exactly two space-separated fields per line)
    datasetPrefFile = os.path.join(jobpath, taskids[0],
                                   'args-DatasetPrefs.txt')
    datasetPrefs = dict()
    if os.path.exists(datasetPrefFile):
        with open(datasetPrefFile, 'r') as f:
            for line in f:
                fields = line.strip().split(' ')
                if len(fields) != 2:
                    continue
                datasetPrefs[fields[0]] = fields[1]

    # Load Data from its python module
    Datamod = imp.load_source(
        dataset,
        os.path.expandvars('$BNPYDATADIR/' + dataset + '.py'))
    if dataset == 'SpeakerDiar':
        if len(sequences) > 1:
            raise ValueError(
                'Joint modeling of several sequences makes no sense')
        Data = Datamod.get_data(meetingNum=sequences[0] + 1,
                                **datasetPrefs)
        jobpath = jobpath.replace('SpeakerDiar',
                                  'SpeakerDiar' + str(sequences[0] + 1))
        # The loaded Data holds only the requested meeting, at position 0
        sequences[0] = 0
    else:
        Data = Datamod.get_data(**datasetPrefs)

    # Determine the maximum length among any of the sequences to be plotted
    if maxT is None:
        Ts = Data.doc_range[sequences + 1] - Data.doc_range[sequences]
        maxT = np.max(Ts)

    # Define the number of pixels used by vertical space of figure
    NUM_STACK = int(np.ceil(maxT / float(aspectFactor)))
    if dispTrue:
        # Floor division keeps this an int (np.tile needs integer reps)
        NUM_STACK //= 2

    f, axes = plt.subplots(len(sequences), len(taskids),
                           sharex='col', sharey='row')

    # For singleton case, make sure that axes is index-able
    if len(sequences) == 1 and len(taskids) == 1:
        axes = [axes]

    for tt, taskidstr in enumerate(taskids):
        if tt == 0 and taskidstr.startswith('.'):
            rankTasksForSingleJobOnDisk(jobpath)

        path = os.path.join(jobpath, taskidstr) + os.path.sep

        # Figure out which lap to use
        if lap == 'final':
            with open(path + 'laps-saved-params.txt') as lapsFile:
                curLap = float(lapsFile.readlines()[-1])
        else:
            curLap = int(lap)

        if showELBOInTitle:
            # atleast_1d: loadtxt yields a 0-d array for single-line files,
            # which could not be indexed by position below
            hdists = np.atleast_1d(
                np.loadtxt(os.path.join(path, 'hamming-distance.txt')))
            hlaps = np.atleast_1d(
                np.loadtxt(os.path.join(path, 'laps-saved-params.txt')))
            Keffvals = np.atleast_1d(
                np.loadtxt(os.path.join(path, 'Keff-saved-params.txt')))

            # Determine scalar values to display
            loc = np.argmin(np.abs(hlaps - curLap))
            hdist = hdists[loc]
            Kefffinal = Keffvals[loc]

            try:
                Kvals = np.atleast_1d(
                    np.loadtxt(os.path.join(path, 'K.txt')))
                ELBOscores = np.atleast_1d(
                    np.loadtxt(os.path.join(path, 'evidence.txt')))
                laps = np.atleast_1d(
                    np.loadtxt(os.path.join(path, 'laps.txt')))

                loc = np.argmin(np.abs(laps - curLap))
                ELBO = ELBOscores[loc]
                Kfinal = Kvals[loc]
            except IOError:
                # Files missing for this task: fall back to placeholders
                ELBO = 0.0
                Kfinal = Kefffinal

        # Load in the saved Data from $BNPYOUTDIR
        # Prefer the aligned file; fall back to the unaligned one when the
        # aligned file is missing or has neither expected field.
        try:
            filename = 'Lap%08.3fMAPStateSeqsAligned.mat' % curLap
            MatVars = scipy.io.loadmat(path + filename)
            if 'zHatBySeqAligned' in MatVars:
                zHatBySeq = convertStateSeq_MAT2list(
                    MatVars['zHatBySeqAligned'])
            elif 'zHatBySeq' in MatVars:
                zHatBySeq = convertStateSeq_MAT2list(MatVars['zHatBySeq'])
            else:
                raise IOError
        except IOError:
            filename = 'Lap%08.3fMAPStateSeqs.mat' % curLap
            MatVars = scipy.io.loadmat(path + filename)
            zHatBySeq = convertStateSeq_MAT2list(MatVars['zHatBySeq'])

        if specialStateIDs is not None:
            zHatBySeq = relabelAllSequences(zHatBySeq, specialStateIDs)

        # Find maximum number of states we need to display
        Kmax = np.max([zHat.max() for zHat in zHatBySeq])
        hasGroundTruth = False
        vmin = 0
        Kignore = 0
        if hasattr(Data, 'TrueParams') and 'Z' in Data.TrueParams:
            hasGroundTruth = True
            trueZ = Data.TrueParams['Z']
            Kmax = np.maximum(trueZ.max(), Kmax)
            uLabels = np.unique(trueZ)
            # Negative true labels are counted and reported
            Kignore = np.sum(uLabels < 0)
            if Kignore > 0:
                for k in range(1, Kignore + 1):
                    print('ignoring state %d Ttrue = %d' % (
                        -k, np.sum(trueZ == -k)))

            # Relabeling needs the true labels, so it lives in this branch
            if colorManyToOne:
                # For each state in zHat, find best true sequence
                Zflat = convertStateSeq_list2flat(zHatBySeq, Data)
                ZflatA = -1 * np.ones_like(Zflat)
                for uID in np.unique(Zflat):
                    curMask = Zflat == uID
                    overlap = np.zeros(uLabels.size)
                    for jj, trueID in enumerate(uLabels):
                        overlap[jj] = np.sum(
                            np.logical_and(trueZ == trueID, curMask))
                    ZflatA[curMask] = uLabels[overlap.argmax()]
                zHatBySeq = convertStateSeq_flat2list(ZflatA, Data)

        for ii, seqNum in enumerate(sequences):
            image = np.tile(zHatBySeq[seqNum], (NUM_STACK, 1))

            # Add the true labels to the image (if they exist)
            if hasGroundTruth and dispTrue:
                start = Data.doc_range[seqNum]
                stop = Data.doc_range[seqNum + 1]
                img_trueZ = np.tile(Data.TrueParams['Z'][start:stop],
                                    (NUM_STACK, 1))
                if dispTrue == 2:
                    image = img_trueZ  # Show only true labels
                else:
                    image = np.vstack((image, img_trueZ))

            image = image[:, :maxT]
            # axes is 1D whenever there is a single row or a single column
            if len(sequences) == 1 or len(taskids) == 1:
                cur_ax = axes[ii + tt]
            else:
                cur_ax = axes[ii, tt]

            if hasattr(cmap, 'N'):
                vmax = cmap.N
            else:
                vmax = Kmax

            cur_ax.imshow(Kignore + image + .0001,
                          interpolation='nearest',
                          vmin=vmin, vmax=vmax,
                          cmap=cmap)

            # Row labels only on the first column
            if tt == 0:
                if seqNames is not None:
                    h = cur_ax.set_ylabel('%s' % (seqNames[ii]),
                                          fontsize=13)
                    h.set_rotation(0)
                elif len(sequences) > 4:
                    cur_ax.set_ylabel('%d' % (seqNum + 1), fontsize=13)
                else:
                    cur_ax.set_ylabel('Seq. %d' % (seqNum + 1),
                                      fontsize=13)

            # Column title only on the first row
            if ii == 0:
                if showELBOInTitle:
                    fmtSpec = "ELBO: %.3f K=%d Keff=%d "
                    # More digits for smaller distances
                    if hdist > 0.01:
                        fmtSpec += "dist=%.2f"
                    elif hdist > 0.001:
                        fmtSpec += "dist=%.3f"
                    else:
                        fmtSpec += "dist=%.4f"
                    title = fmtSpec % (ELBO, Kfinal, Kefffinal, hdist)
                    cur_ax.set_title(title)

            cur_ax.set_xlim([0, maxT])
            cur_ax.set_ylim([0, image.shape[0]])
            cur_ax.set_yticks([])
            # ... end loop over sequences
    return axes, zHatBySeq