def add_conservation_features(self):
    '''
    Scan the 'MamConserv' wiggle track with a 6-wide sliding window and
    append a 'MamConserv' feature to self.features for every merged region
    whose windows average above cfg.MAM_CONSERV_MIN_WINDOW_SCORE.

    Positions returned by self.get_exon_and_signal_pos() (exons and splice
    signals) are cut out of the candidate regions, and whatever remains is
    kept only when it is still at least 6 positions wide.  The 'evidence'
    qualifier of each feature is the mean track value over the region.
    '''
    track = self.wigs['MamConserv']

    # Collect every window whose mean score clears the configured threshold.
    window_hits = []
    windows = score.sliding_window(self.wigs['MamConserv'], 6, step=1)
    for offset, window in enumerate(windows):
        if sum(window) / 6 > cfg.MAM_CONSERV_MIN_WINDOW_SCORE:
            window_hits.append((offset, offset + 6))

    # Overlapping windows collapse into contiguous candidate regions.
    candidates = interval.union([interval(*window_hits)])

    # remove exons and splice signals:
    excluded = interval(self.get_exon_and_signal_pos())
    candidates = candidates & excluded.invert()

    for region in candidates:
        left = int(region[0])
        right = int(region[1])
        width = right - left
        if width >= 6:
            mean_score = sum(track[slice(left, right)]) / width
            qualifiers = {'label' : ['MamConserv', ],
                          'evidence' : str(mean_score)}
            self.features.append(
                make_seq_feature(left, right, 'MamConserv', qualifiers))
def get_raw_split_annotations(this_annotation_data, seconds):
    """Mark, per fixed-length time window, which annotation labels are active.

    The time axis from 0 up to the largest value in column ``3`` of
    ``this_annotation_data`` is cut into consecutive windows of ``seconds``
    length.  For every label in the module-level ``annotation_label`` the
    rows selected with ``xs(label, level=1)`` are turned into intervals
    (after dropping the value column at position 2), and a window is flagged
    True when it lies inside the union of those intervals.

    Parameters
    ----------
    this_annotation_data : pandas.DataFrame
        Annotation table indexed so that index level 1 holds the label name.
        NOTE(review): column ``3`` is presumably the annotation end time and
        the remaining two value columns start/end - confirm against caller.
    seconds : int
        Window length, used as the step of the window grid.

    Returns
    -------
    pandas.DataFrame
        One row per window (indexed by window start and end), one column per
        entry of ``annotation_label``.  Labels that could not be processed
        keep their initial empty (NaN) column.

    Rows where more than one label of the same tier is True are printed as
    a sanity check; they are not removed from the result.
    """
    start_time = 0
    # Use the column maximum directly.  The previous describe().max() took
    # the maximum over *all* summary statistics, including the row count, so
    # the time axis was silently extended whenever there were more rows than
    # the largest value in the column.
    end_time = this_annotation_data[3].max()
    if pd.isnull(end_time):
        # Empty column: same outcome as before (no windows at all).
        end_time = 0

    index = np.vstack([
        np.array(range(start_time, int(end_time), int(seconds))),
        np.array(range(start_time + seconds, int(end_time + seconds),
                       int(seconds)))
    ])
    split_annotations = pd.DataFrame(index=list(index),
                                     columns=annotation_label)
    split_intervals = [interval(x) for x in index.T]

    for label_name in annotation_label:
        try:
            annotation_start_end_times = np.delete(
                this_annotation_data.xs(label_name, level=1).values,
                2, axis=1)
            labels_intervals = interval.union(
                [interval(x) for x in annotation_start_end_times])
            split_annotations[label_name] = [
                each_interval in labels_intervals
                for each_interval in split_intervals
            ]
        except Exception:
            # Best effort: a label that is absent from this recording (or
            # otherwise cannot be processed) simply keeps its empty column.
            # Unlike a bare ``except`` this lets KeyboardInterrupt and
            # SystemExit propagate.
            pass

    # No labels from the same tier should be True at the same time.
    _report_tier_conflicts(split_annotations, ['off-tsak', 'on-task'],
                           'Issues with off-tsak - on-task')
    _report_tier_conflicts(split_annotations,
                           ['distarcted', 'focused', 'idle'],
                           'Issues with distarcted, focused, idle')
    _report_tier_conflicts(split_annotations,
                           ['Bored', 'Satisfied', 'Confused'],
                           'Issues with Bored, Satisfied, Confused')
    return split_annotations


def _report_tier_conflicts(split_annotations, tier_labels, message):
    """Print the rows in which more than one label of a tier is True."""
    # Count the active labels per row; a NaN column compares unequal to True
    # and therefore contributes nothing.
    active = sum(split_annotations[name].eq(True).astype(int)
                 for name in tier_labels)
    conflicts = split_annotations[active > 1]
    if len(conflicts) > 0:
        print(message)
        print(conflicts)
def get_intervals(self, row, col, reasons=None):
    """Return the union of the intervals stored for pixel (row, col).

    Parameters
    ----------
    row, col : int
        Indices into ``self.intervals`` and ``self.reasons``.
    reasons : sequence, optional
        Keys of ``self.reasonEnum`` naming the reasons to keep.  When
        omitted or empty, ``self.mask`` is used as the set of accepted
        reason codes.

    Returns
    -------
    interval
        ``interval.union`` of every interval of the pixel whose entry in
        ``self.reasons[row, col]`` is one of the accepted codes.
    """
    # ``None`` replaces the former mutable ``[]`` default; an explicitly
    # passed empty sequence still selects self.mask as before.
    if reasons is None or len(reasons) == 0:
        mask = self.mask
    else:
        mask = [self.reasonEnum[reason] for reason in reasons]

    selected = np.where(np.in1d(self.reasons[row, col], mask))[0]
    pixel_intervals = self.intervals[row, col]
    return interval.union([pixel_intervals[i] for i in selected])
def consolidate_motifs(self, flist, mt):
    '''
    Merge overlapping motif features into one "meta" feature per connected
    region.

    flist -- features to consolidate; their positions (via extract_pos on
             feat.location) must already be sorted, because members are
             assigned to regions in a single forward pass.
    mt    -- motif type object; mt.type is used as the type, label and
             seq_motif_type of every new feature.

    Returns a tuple (consolidated_features, consolidated_list):
      consolidated_features -- list of the new merged features
      consolidated_list     -- dict mapping each new feature to the list of
                               original features it replaces
    '''
    ilist = [interval[extract_pos(feat.location)] for feat in flist]
    union = interval.union(ilist)

    # Map each connected component to the indices of its constituent
    # intervals.  Since ilist is sorted, once an interval falls outside the
    # current component it must belong to a later one.
    cidict = {}
    civl = 0
    for comp in union.components:
        cidict[comp] = []
        while civl < len(ilist) and ilist[civl] in comp:
            cidict[comp].append(civl)
            civl += 1

    # Average length over *all* original intervals.  It does not depend on
    # the component, so compute it once; a real list (not map()) keeps
    # len() working on Python 3 as well.
    flens = [ivl[0][1] - ivl[0][0] for ivl in ilist]
    flen_avg = float(sum(flens)) / float(len(flens)) if flens else 0.0

    consolidated_features = []
    consolidated_list = {}
    for comp, members in cidict.items():
        flen = float(comp[0][1] - comp[0][0])
        fsum = 0
        for f_index in members:
            fsum += float(flist[f_index].qualifiers['evidence'])
        fcount = len(members)

        # feature mean: average score of the sub-features;
        # feature average: score per base pair within the new metafeature
        fmean = float(fsum) / float(fcount) if fcount > 0 else 0
        favg = (fsum * flen_avg) / flen
        note_str = "len= %d; sum=%f; count=%d; mean=%f; avg=%f;" % \
                   (flen, fsum, fcount, fmean, favg)

        # finally, make a new feature
        feat = make_seq_feature(
            comp[0][0], comp[0][1], mt.type,
            {'label' : [mt.type, ],
             'evidence' : str(favg),
             'note' : [note_str, ],
             'seq_motif_type': mt.type,
             'nmers' : [str(self.seq[slice(*flist[fi].extract_pos())])
                        for fi in members],
             'meta' : True})
        consolidated_features.append(feat)
        consolidated_list[feat] = [flist[f_index] for f_index in members]

    return (consolidated_features, consolidated_list)
def find_motifs(self, seq_motif_type):
    '''
    Score the upper-cased sequence with seq_motif_type and append a feature
    to self.features for every scored site that passes the motif's filter.

    seq_motif_type must provide score(), filter_score(), note_str(), and
    the attributes type, bounds and attribs.  score() returns a mapping
    with 'nmers', 'locations' and 'scores', plus optionally 'names'.

    If seq_motif_type.attribs has a 'context' entry, a site is kept only
    when its location lies
      'exon'            -- inside the union of the exons in self.exon_list
      'donor_intron'    -- between the end of the last exon and the end of
                           the sequence
      'acceptor_intron' -- between position 0 and the start of the first
                           exon
    '''
    # str.upper() replaces string.upper(), which exists only on Python 2.
    score_result_dict = seq_motif_type.score(str(self.seq).upper())
    nmers = score_result_dict['nmers']
    locations = score_result_dict['locations']
    scores = score_result_dict['scores']
    # names list is only given by some score types:
    names = score_result_dict['names'] if 'names' in score_result_dict \
        else []

    has_context = 'context' in seq_motif_type.attribs
    context = seq_motif_type.attribs['context'] if has_context else None
    # The exon union does not depend on the site, so it is built once, and
    # only when a site actually reaches the context check.
    exons = None

    features = []
    for i, location in enumerate(locations):
        # skip this site if it doesn't match the motif's filter criteria
        if not seq_motif_type.filter_score(scores[i]):
            continue

        # if there is a context attrib, skip if it's not in the right context
        if has_context:
            if exons is None:
                exons = interval.union(
                    [interval(*[extract_pos(exon)
                                for exon in self.exon_list])])
            if context == 'exon':
                allowed = exons
            elif context == 'donor_intron':
                allowed = interval([exons[-1][1], len(self.seq)])
            elif context == 'acceptor_intron':
                allowed = interval([0, exons[0][0]])
            else:
                # Unknown context values do not restrict anything.
                allowed = None
            if allowed is not None and interval(location) not in allowed:
                continue

        start = location[0]
        end = location[1]
        motif_bounds = seq_motif_type.bounds if \
            seq_motif_type.bounds else (0, len(nmers[i]))
        note_str = seq_motif_type.note_str(motif_bounds, nmers[i])
        feat = make_seq_feature(
            start, end, seq_motif_type.type,
            {'label' : [seq_motif_type.type, ],
             'evidence' : str(scores[i]),
             'note' : [note_str, ],
             'seq_motif_type': seq_motif_type.type})
        # add a motif name if this score type has them:
        if names:
            feat.qualifiers['instance_name'] = names[i]
        features.append(feat)

    self.features.extend(features)
def hotPixelsTest2(startTime=12.3, endTime=23.1, getRawCount=True):
    '''
    Check the bad-pixel time masking end to end.

    Two things are verified:
        - The number of photons missing from the masked image matches the
          photon counts found inside the masked time intervals.
        - With masking switched on, no returned photon timestamp falls
          inside the bad intervals listed for its pixel in the hot pixel
          file.

    A test hot pixel file is written to the current directory if it does
    not exist yet, and the checks are then run against it.

    INPUTS:
        startTime - time from the beginning of the obs file at which to
                    start the check.
        endTime - time from the beginning of the obs file at which to end
                    (both in seconds).
        getRawCount - passed through to the count routines; if True, use
                    raw, non-wavelength calibrated photon counts with no
                    wavelength cutoffs applied.
    '''
    obsRun = 'PAL2012'
    obsDate = '20121208'
    obsFileName = FileName(run=obsRun, date=obsDate,
                           tstamp='20121209-044636').obs()
    wvlCalFileName = FileName(run=obsRun, date=obsDate,
                              tstamp='20121209-060704').calSoln()
    flatCalFileName = FileName(run=obsRun, date='20121210').flatSoln()
    hotPixFileName = os.path.abspath('test-hotPix_20121209-044636.h5')
    paramFile = os.path.join(os.path.dirname(__file__),
                             '../../params/hotPixels.dict')

    # Force floats so that getPixelCountImage calls getPixelSpectrum and
    # applies the wavelength cutoffs consistently.
    startTime = float(startTime)
    endTime = float(endTime)
    intTime = endTime - startTime

    if not os.path.exists(hotPixFileName):
        print('Creating hot pixel file....')
        hp.findHotPixels(paramFile=paramFile, inputFileName=obsFileName,
                         outputFileName=hotPixFileName, timeStep=1,
                         startTime=0, endTime=-1, fwhm=3.0, boxSize=5,
                         nSigmaHot=2.5, nSigmaCold=2.5, display=True)
        print('Done creating hot pixel file.')
        print('')

    obsFile = of.ObsFile(obsFileName)
    obsFile.loadWvlCalFile(wvlCalFileName)
    obsFile.loadFlatCalFile(flatCalFileName)
    print('Loading hot pixel file into obsFile...')
    obsFile.loadHotPixCalFile(hotPixFileName)
    obsFile.setWvlCutoffs()

    imageKwargs = dict(weighted=False, getRawCount=getRawCount)
    print('Getting image with masking...')
    maskedImage = obsFile.getPixelCountImage(startTime, intTime,
                                             **imageKwargs)['image']
    print('Getting image without masking...')
    obsFile.switchOffHotPixTimeMask()
    plainImage = obsFile.getPixelCountImage(startTime, intTime,
                                            **imageKwargs)['image']
    # Per pixel: the total number of photons that the mask removed.
    diffim = plainImage - maskedImage

    print('Displaying images...')
    mpl.ion()
    mpl.matshow(maskedImage)
    mpl.title('Hot-pixel masked')
    mpl.colorbar()
    mpl.matshow(plainImage)
    mpl.title('Unmasked')
    mpl.colorbar()

    print('Loading local version of hot pixel file for direct inspection...')
    hotPix = hp.readHotPixels(hotPixFileName)

    print('Checking for consistency in number of photons removed....')
    for iRow in range(np.shape(diffim)[0]):
        for iCol in range(np.shape(diffim)[1]):
            maskedCount = 0
            for badSpan in hotPix['intervals'][iRow, iCol]:
                # Make sure there is only one component per interval
                assert len(badSpan) == 1
                spanStart = badSpan[0][0]
                spanEnd = badSpan[0][1]
                # Only intervals overlapping the requested time range
                # contribute; clip them to that range before counting.
                if spanStart < endTime and spanEnd > startTime:
                    firstSec = max(spanStart, startTime)
                    lastSec = min(spanEnd, endTime)
                    pixelCount = obsFile.getPixelCount(
                        iRow, iCol, firstSec=firstSec,
                        integrationTime=lastSec - firstSec,
                        weighted=False, fluxWeighted=False,
                        getRawCount=getRawCount)
                    maskedCount += pixelCount['counts']
            assert maskedCount == diffim[iRow, iCol]
    print('Okay.')

    print('Checking timestamps of remaining photons for consistency with exposure masks')
    obsFile.switchOnHotPixTimeMask()    # Switch on hot pixel masking
    for iRow in range(np.shape(diffim)[0]):
        for iCol in range(np.shape(diffim)[1]):
            packets = obsFile.getTimedPacketList(iRow, iCol,
                                                 firstSec=startTime,
                                                 integrationTime=intTime)
            timeStamps = packets['timestamps']
            badInterval = interval.union(hotPix['intervals'][iRow, iCol])
            # Comparing len(timeStamps) with the image count would be nice,
            # but getTimedPacketList doesn't do a wavelength cut like
            # getPixelCount does, so the numbers need not agree.
            for stamp in timeStamps:
                # None of the photons' timestamps may be in the masked range.
                assert stamp not in badInterval
    print('Okay.')
    print('')
    print('Done. All looks good.')
def hotPixelsTest(testFileName=FileName(run='PAL2012',date='20121208',tstamp='20121209-044636').obs(),
                  workingDir='/Users/vaneyken/Data/UCSB/ARCONS/Palomar2012/hotPixTest2/'):
    '''
    Runs some basic checks for consistency between intermediate output
    masks and the final 'bad time lists'.

    To run
        - from Python:
            hotPixelsTest('someObsFile.h5')
        - from command line:
            python hotPixelsTest.py someObsFile.h5

    (No parameter file need be supplied).

    INPUTS:
        testFileName - obs file to run the hot pixel search on.
        workingDir - directory in which the output file 'testoutput.h5' is
                     written. Defaults to the previously hard-coded
                     location.

    Raises AssertionError if the intermediate mask and the final bad-time
    intervals disagree, or if the number of intervals and reasons differ
    for any pixel.
    '''
    outputFile = os.path.join(workingDir, 'testoutput.h5')
    paramFile = os.path.join(os.path.dirname(__file__),
                             '../../params/hotPixels.dict')
    testStartTime = 2   # In seconds
    testEndTime = 4     # In seconds
    timeStep = 2        # In seconds (deliberately equal to end time - start time)
    fwhm = 3.0
    boxSize = 5
    nSigmaHot = 2.5
    nSigmaCold = 2.0

    hp.findHotPixels(paramFile=paramFile, inputFileName=testFileName,
                     outputFileName=outputFile, timeStep=timeStep,
                     startTime=testStartTime, endTime=testEndTime,
                     fwhm=fwhm, boxSize=boxSize, nSigmaHot=nSigmaHot,
                     nSigmaCold=nSigmaCold, display=True)

    intermediateOutput = hp.checkInterval(inputFileName=testFileName,
                                          display=True,
                                          firstSec=testStartTime,
                                          intTime=testEndTime - testStartTime,
                                          fwhm=fwhm, boxSize=boxSize,
                                          nSigmaHot=nSigmaHot,
                                          nSigmaCold=nSigmaCold)

    hpOutput = hp.readHotPixels(outputFile)

    # Make a Boolean mask - any code > 0 is bad for some reason.
    intMask = intermediateOutput['mask'] > 0
    intervals = hpOutput['intervals']
    reasons = hpOutput['reasons']

    # Find the number of entries for each pixel in both the 'intervals' and
    # the 'reasons' arrays.
    nIntervals = np.reshape([len(x) for x in intervals.flat],
                            np.shape(intervals))
    nReasons = np.reshape([len(x) for x in reasons.flat], np.shape(reasons))

    # Union the interval lists for each pixel to give an array of single
    # (multi-component) interval objects:
    uIntervals = np.reshape(
        np.array([interval.union(x) for x in intervals.flat], dtype='object'),
        np.shape(intervals))

    # The whole test time range as ONE component. interval(a, b) would
    # instead build the two degenerate points [a, a] and [b, b], so only the
    # end points of the range would be tested for membership.
    testRange = interval([testStartTime, testEndTime])

    # Create a boolean mask that should be True for all bad
    # (hot/cold/dead/other) pixels within the test time range
    finalMask = np.reshape([(testRange in x) for x in uIntervals.flat],
                           np.shape(uIntervals))

    assert np.all(np.equal(intMask, finalMask))
    assert np.all(np.equal(nIntervals, nReasons))

    print('')
    print("All seems okay. The two plots shown should look identical.")