Example #1
0
def add_conservation_features(self):
    '''
    Add 'MamConserv' features for highly scoring stretches of the
    'MamConserv' wiggle track.

    Every 6 bp window of self.wigs['MamConserv'] (moved 1 bp at a time)
    whose mean score is above cfg.MAM_CONSERV_MIN_WINDOW_SCORE is kept.
    The kept windows are merged, the positions reported by
    self.get_exon_and_signal_pos() are cut out, and every remaining region
    that is at least 6 bp wide is appended to self.features. The mean track
    score over the region is stored (as a string) in the 'evidence'
    qualifier.
    '''
    track = self.wigs['MamConserv']

    # window number == start position, because the window moves 1 bp a time
    windows = score.sliding_window(self.wigs['MamConserv'], 6, step=1)
    hits = [(start, start + 6)
            for start, window in enumerate(windows)
            if sum(window) / 6 > cfg.MAM_CONSERV_MIN_WINDOW_SCORE]

    merged = interval.union([interval(*hits)])

    # cut out exons and splice signals
    excluded = interval(self.get_exon_and_signal_pos())
    merged = merged & excluded.invert()

    for region in merged:
        sta, end = int(region[0]), int(region[1])
        width = end - sta
        if width >= 6:
            # cutting out exons/signals can leave fragments that are too
            # short; only regions of full window width become features
            mean_score = sum(track[slice(sta, end)]) / width
            qualifiers = {'label'    :  ['MamConserv', ],
                          'evidence' : str(mean_score)}
            self.features.append(
                make_seq_feature(sta, end, 'MamConserv', qualifiers))
Example #2
0
def get_raw_split_annotations(this_annotation_data, seconds):
    """Cut the annotated timeline into fixed windows and flag active labels.

    The timeline runs from 0 to the largest value found in column ``3`` of
    ``this_annotation_data`` and is cut into consecutive windows of
    ``seconds`` length. For every label in the module-level
    ``annotation_label`` a boolean column is filled in that is True for the
    windows lying inside the union of that label's annotated intervals.

    Args:
        this_annotation_data: pandas object whose index carries the label
            names at level 1 (it is queried with ``xs(label, level=1)``).
            For each label, value column 2 is dropped and each remaining
            row is turned into an interval (presumably start/end times --
            confirm against the caller).
        seconds: window length; converted with ``int()`` before use.

    Returns:
        pandas.DataFrame indexed by the window (start, end) pairs with one
        column per entry of ``annotation_label``. Columns of labels that
        could not be processed keep their initial missing values.
    """
    start_time = 0
    # Take the column maximum directly. ``describe().max()`` also takes the
    # row *count* into account and overshoots whenever there are more rows
    # than the largest value in the column.
    end_time = this_annotation_data[3].max()
    if pd.isnull(end_time):
        # no data at all -> empty timeline (describe().max() gave 0 here)
        end_time = 0
    # range() only accepts integers, so convert the step once up front
    step = int(seconds)

    index = np.vstack([
        np.array(range(start_time, int(end_time), step)),
        np.array(range(start_time + step, int(end_time + step), step)),
    ])
    split_annotations = pd.DataFrame(index=list(index),
                                     columns=annotation_label)
    split_intervals = [interval(x) for x in index.T]

    for label_name in annotation_label:
        try:
            label_rows = this_annotation_data.xs(label_name, level=1).values
            annotation_start_end_times = np.delete(label_rows, 2, axis=1)
            label_intervals = interval.union(
                [interval(x) for x in annotation_start_end_times])
            split_annotations[label_name] = [
                each_interval in label_intervals
                for each_interval in split_intervals
            ]
        except Exception:
            # Best effort: a label that is absent from the data (or cannot
            # be converted) simply keeps its empty column. Unlike a bare
            # ``except:`` this still lets KeyboardInterrupt/SystemExit
            # through.
            pass

    # Sanity check: no two labels from the same tier should be True for the
    # same window. Counting the True flags per row also catches the case
    # where only two labels of a three-label tier coincide.
    tiers = (
        (('off-tsak', 'on-task'),
         'Issues with off-tsak - on-task'),
        (('distarcted', 'focused', 'idle'),
         'Issues with  distarcted, focused, idle'),
        (('Bored', 'Satisfied', 'Confused'),
         'Issues with  Bored, Satisfied, Confused'),
    )
    for tier_labels, message in tiers:
        # ``== True`` is intentional: unfilled columns hold missing values
        active = sum((split_annotations[name] == True).astype(int)
                     for name in tier_labels)
        conflicts = split_annotations[active > 1]
        if len(conflicts) > 0:
            print(message)
            print(conflicts)

    return split_annotations
 def get_intervals(self,row,col,reasons=None):
     """Return the union of the intervals stored for pixel [row, col].

     Only intervals whose entry in self.reasons[row, col] is contained in
     the selection mask are included. When `reasons` is None or empty the
     mask is self.mask; otherwise each given reason is translated through
     self.reasonEnum.

     `reasons` defaults to None instead of a shared mutable list; passing
     an empty list still behaves as before.
     """
     if reasons is None or len(reasons)==0:
         mask = self.mask
     else:
         mask = [self.reasonEnum[reason] for reason in reasons]
     # positions of the entries whose reason code is part of the mask
     selected = np.where(np.in1d(self.reasons[row,col],mask))[0]
     intervalList = [self.intervals[row,col][i] for i in selected]
     return interval.union(intervalList)
Example #4
0
def consolidate_motifs(self, flist, mt):
    '''
    Merge overlapping motif features into consolidated "meta" features.

    flist -- features to consolidate. Each needs a .location accepted by
             extract_pos() and a numeric qualifiers['evidence']. The
             features are assigned to components by walking flist in
             order, so flist is assumed to be sorted by position (this is
             not checked here).
    mt    -- motif type object; only mt.type is read.

    Returns a tuple (consolidated_features, consolidated_list):
      consolidated_features -- one new feature per connected component of
                               the union of all input intervals, in
                               component order
      consolidated_list     -- dict mapping every new feature to the list
                               of original features it was built from
    '''
    # stores new features that consolidate old ones that overlap
    consolidated_features = []
    # stores a list that tells which original features belong in which new one
    consolidated_list = {}

    ilist = [interval[extract_pos(feat.location)] for feat in flist]
    components = list(interval.union(ilist).components)
    if not components:
        return (consolidated_features, consolidated_list)

    # Average length over *all* input intervals. It does not depend on the
    # component, so compute it once. A real list is built because map()
    # has no len() on Python 3.
    flens = [ivl[0][1] - ivl[0][0] for ivl in ilist]
    flen_avg = float(sum(flens)) / float(len(flens))

    # For each component collect the indices of the intervals inside it.
    # A single forward pointer suffices because ilist is in sorted order:
    # once an interval is not in the current component, move on to the
    # next component.
    members = []
    civl = 0
    for comp in components:
        indices = []
        while civl < len(ilist) and ilist[civl] in comp:
            indices.append(civl)
            civl += 1
        members.append((comp, indices))

    for comp, indices in members:
        comp_start = comp[0][0]
        comp_end = comp[0][1]
        flen = float(comp_end - comp_start)

        # sum up the scores of the component's sub-features
        fsum = sum(float(flist[i].qualifiers['evidence']) for i in indices)
        fcount = len(indices)

        # feature mean: the average score over all sub-features;
        # feature average: the summed score scaled by the ratio of the
        # average sub-feature length to the length of the new metafeature
        fmean = float(fsum) / float(fcount) if fcount > 0 else 0
        favg = (fsum * flen_avg) / flen

        note_str = "len= %d; sum=%f; count=%d; mean=%f; avg=%f;" % \
            (flen, fsum, fcount, fmean, favg)

        # NOTE(review): the n-mers are sliced using the feature's own
        # extract_pos() method, while the intervals above come from the
        # free function extract_pos(feat.location) -- confirm both exist
        # and agree.
        nmers = [str(self.seq[slice(*flist[fi].extract_pos())])
                 for fi in indices]

        # finally, make a new feature
        feat = make_seq_feature(comp_start, comp_end, mt.type,
            {'label'    :  [mt.type, ],
             'evidence' : str(favg),
             'note'     : [note_str, ],
             'seq_motif_type': mt.type,
             'nmers' : nmers,
             'meta'     : True})

        consolidated_features.append(feat)
        consolidated_list[feat] = [flist[f_index] for f_index in indices]

    return (consolidated_features, consolidated_list)
Example #5
0
def find_motifs(self, seq_motif_type):
    '''associate successive n-mers with a motif_type object and add
    any found features to the record's feature list

    The upper-cased sequence is scored with seq_motif_type.score(), which
    must return a dict with the parallel sequences 'nmers', 'locations'
    and 'scores' (and optionally 'names'). A site is skipped when
    seq_motif_type.filter_score() rejects its score or when the motif has
    a 'context' attribute ('exon', 'donor_intron' or 'acceptor_intron')
    and the site does not lie inside that region. Every remaining site
    becomes a feature that is appended to self.features.
    '''
    # str.upper() works on Python 2 and 3; string.upper() exists only on 2
    score_result_dict = seq_motif_type.score(str(self.seq).upper())

    nmers = score_result_dict['nmers']
    locations = score_result_dict['locations']
    scores = score_result_dict['scores']

    # names list is only given by some score types:
    if 'names' in score_result_dict:
        names = score_result_dict['names']
    else:
        names = []

    # union of all exon intervals; identical for every site, so it is
    # built once, and only if a context check is actually reached
    exons = None
    features = []

    for i, location in enumerate(locations):

        # skip this site if it doesn't match the motif's filter criteria
        if not seq_motif_type.filter_score(scores[i]):
            continue

        # if there is a context attrib, skip if it's not in the right context
        if 'context' in seq_motif_type.attribs:
            if exons is None:
                exons = interval.union(
                    [interval(*[extract_pos(exon)
                                for exon in self.exon_list])])

            context = seq_motif_type.attribs['context']
            if context == 'exon':
                region = exons
            elif context == 'donor_intron':
                # from the end of the last exon to the end of the sequence
                region = interval([exons[-1][1], len(self.seq)])
            elif context == 'acceptor_intron':
                # from the start of the sequence to the first exon
                region = interval([0, exons[0][0]])
            else:
                # unknown context values do not restrict the site
                region = None

            if region is not None and interval(location) not in region:
                continue

        start = location[0]
        end = location[1]

        motif_bounds = seq_motif_type.bounds if \
            seq_motif_type.bounds else (0, len(nmers[i]))

        note_str = seq_motif_type.note_str(motif_bounds, nmers[i])

        feat = make_seq_feature(start, end, seq_motif_type.type,
                {'label'    :  [seq_motif_type.type, ],
                 'evidence' : str(scores[i]),
                 'note'     : [note_str, ],
                 'seq_motif_type': seq_motif_type.type}
                 )

        # add a motif name if this score type has them:
        if names:
            feat.qualifiers['instance_name'] = names[i]

        features.append(feat)

    self.features.extend(features)
def hotPixelsTest2(startTime=12.3, endTime=23.1, getRawCount=True):
    '''
    Runs a check of the bad pixel time masking. Checks:
        - Number of photons removed from returned image is consistent
        with time masks.
        - That timestamps of all photons in the time-masked image are outside
        the time intervals defined for each pixel in the hot pixel file.
        Outputs a test hot pixel file in the current directory (unless it
        already exists there), and runs checks on it.

    INPUTS:
        startTime - time from beginning of obs file at which to start the check.
        endTime - time from beginning of obs file at which to end (both in seconds).
        getRawCount - if True, use raw, non-wavelength calibrated photon counts
                        with no wavelength cutoffs applied.

    Any failed check raises AssertionError.
    '''

    run = 'PAL2012'
    date = '20121208'
    obsFileName = FileName(run=run,date=date,tstamp='20121209-044636').obs()   #'obs_20121209-044636.h5'
    wvlCalFileName = FileName(run=run, date=date, tstamp='20121209-060704').calSoln()    #'calsol_20121209-060704.h5'
    flatCalFileName = FileName(run=run, date='20121210').flatSoln()    #'flatsol_20121210.h5'
    hotPixFileName = os.path.abspath('test-hotPix_20121209-044636.h5')
    paramFile = os.path.join(os.path.dirname(__file__),'../../params/hotPixels.dict')
    startTime = float(startTime)    #Force these values to floats to make sure
    endTime = float(endTime)        #that getPixelCountImage calls getPixelSpectrum
                                    #and applies wavelength cutoffs consistently.

    #Single-argument print(...) calls behave identically on Python 2 and 3;
    #print('') stands in for the bare Python 2 'print' (one empty line).
    if not os.path.exists(hotPixFileName):
        print('Creating hot pixel file....')
        hp.findHotPixels(paramFile=paramFile, inputFileName=obsFileName,
                         outputFileName=hotPixFileName, timeStep=1,
                         startTime=0, endTime=-1,
                         fwhm=3.0, boxSize=5, nSigmaHot=2.5,
                         nSigmaCold=2.5, display=True)
        print('Done creating hot pixel file.')
        print('')

    intTime = endTime - startTime

    obsFile = of.ObsFile(obsFileName)
    obsFile.loadWvlCalFile(wvlCalFileName)
    obsFile.loadFlatCalFile(flatCalFileName)
    print('Loading hot pixel file into obsFile...')
    obsFile.loadHotPixCalFile(hotPixFileName)
    obsFile.setWvlCutoffs()
    print('Getting image with masking...')
    imhp = obsFile.getPixelCountImage(startTime, intTime, weighted=False,
                                      getRawCount=getRawCount)['image']
    print('Getting image without masking...')
    obsFile.switchOffHotPixTimeMask()
    im = obsFile.getPixelCountImage(startTime, intTime, weighted=False,
                                    getRawCount=getRawCount)['image']

    diffim = im - imhp #Should end up containing the total number of photons masked from each pixel.
    nRows = np.shape(diffim)[0]
    nCols = np.shape(diffim)[1]

    print('Displaying images...')
    mpl.ion()
    mpl.matshow(imhp)
    mpl.title('Hot-pixel masked')
    mpl.colorbar()
    mpl.matshow(im)
    mpl.title('Unmasked')
    mpl.colorbar()

    print('Loading local version of hot pixel file for direct inspection...')
    hotPix = hp.readHotPixels(hotPixFileName)

    #Check 1: for every pixel, the photons counted (with masking still
    #switched off) inside the bad intervals must add up to the difference
    #between the unmasked and the masked image.
    print('Checking for consistency in number of photons removed....')
    for iRow in range(nRows):
        for iCol in range(nCols):
            nMaskedPhotons = 0
            for eachInter in hotPix['intervals'][iRow, iCol]:
                #Make sure there is only one component per interval
                assert len(eachInter) == 1
                badStart = eachInter[0][0]
                badEnd = eachInter[0][1]
                #If the interval overlaps with the time range in question then
                #add the number of photons in the interval to our running tally.
                if badStart < endTime and badEnd > startTime:
                    firstSec = max(badStart, startTime)
                    lastSec = min(badEnd, endTime)
                    nMaskedPhotons += obsFile.getPixelCount(iRow, iCol, firstSec=firstSec,
                                                            integrationTime=lastSec - firstSec,
                                                            weighted=False, fluxWeighted=False,
                                                            getRawCount=getRawCount)['counts']
            assert nMaskedPhotons == diffim[iRow, iCol]
    print('Okay.')

    #Check 2: with masking switched back on, no returned photon may carry a
    #timestamp that falls inside one of the pixel's bad intervals.
    print('Checking timestamps of remaining photons for consistency with exposure masks')
    obsFile.switchOnHotPixTimeMask()       #Switch on hot pixel masking
    for iRow in range(nRows):
        for iCol in range(nCols):
            timeStamps = obsFile.getTimedPacketList(iRow, iCol, firstSec=startTime, integrationTime=intTime)['timestamps']
            badInterval = interval.union(hotPix['intervals'][iRow, iCol])

            #Comparing len(timeStamps) with im[iRow,iCol] would be a nice extra
            #check, but doesn't work because getTimedPacketList doesn't do a
            #wavelength cut like getPixelCount does.

            for eachTimestamp in timeStamps:
                assert eachTimestamp not in badInterval   #Check that none of those photons' timestamps are in the masked time range

    print('Okay.')
    print('')
    print('Done. All looks good.')
def hotPixelsTest(testFileName=FileName(run='PAL2012',date='20121208',tstamp='20121209-044636').obs(),
                  workingDir='/Users/vaneyken/Data/UCSB/ARCONS/Palomar2012/hotPixTest2/'):
    '''
    Runs some basic checks for consistency between intermediate output
    masks and the final 'bad time lists'.

    To run
        - from Python:
            hotPixelsTest('someObsFile.h5')

        - from command line:
            python hotPixelsTest.py someObsFile.h5


    (No parameter file need be supplied).

    INPUTS:
        testFileName - obs file to run the hot pixel search on.
        workingDir - directory in which the test output file
                     'testoutput.h5' is written. Defaults to the path
                     that used to be hard-coded here.

    Any failed check raises AssertionError.
    '''

    #os.path.join gives the same result as plain concatenation for the
    #default (which ends in a slash) and also copes with directories
    #passed without a trailing separator.
    outputFile = os.path.join(workingDir, 'testoutput.h5')
    paramFile = os.path.join(os.path.dirname(__file__),'../../params/hotPixels.dict')  #/Users/vaneyken/UCSB/ARCONS/pipeline/github/ARCONS-pipeline/params/hotPixels.dict'
    testStartTime = 2   #In seconds
    testEndTime = 4     #In seconds
    timeStep = 2        #In seconds (deliberately equal to end time - start time)
    fwhm = 3.0
    boxSize = 5
    nSigmaHot = 2.5
    nSigmaCold = 2.0

    hp.findHotPixels(paramFile=paramFile, inputFileName=testFileName,
                     outputFileName=outputFile, timeStep=timeStep,
                     startTime=testStartTime, endTime=testEndTime,
                     fwhm=fwhm, boxSize=boxSize, nSigmaHot=nSigmaHot,
                     nSigmaCold=nSigmaCold, display=True)

    intermediateOutput = hp.checkInterval(inputFileName=testFileName, display=True,
                                          firstSec=testStartTime,
                                          intTime=testEndTime - testStartTime,
                                          fwhm=fwhm, boxSize=boxSize,
                                          nSigmaHot=nSigmaHot, nSigmaCold=nSigmaCold)

    hpOutput = hp.readHotPixels(outputFile)

    intMask = intermediateOutput['mask'] > 0    #Make a Boolean mask - any code > 0 is bad for some reason.
    intervals = hpOutput['intervals']
    reasons = hpOutput['reasons']

    #Find the number of entries for each pixel in both the 'intervals' and the
    #'reasons' arrays.
    nIntervals = np.reshape([len(x) for x in intervals.flat], np.shape(intervals))
    nReasons = np.reshape([len(x) for x in reasons.flat], np.shape(reasons))

    #Union the interval lists for each pixel to give an array of single (multi-component) interval objects:
    uIntervals = np.reshape(np.array([interval.union(x) for x in intervals.flat],
                                     dtype='object'), np.shape(intervals))

    #Create a boolean mask that should be True for all bad (hot/cold/dead/other) pixels within the test time range
    testRange = interval(testStartTime, testEndTime)
    finalMask = np.reshape([(testRange in x) for x in uIntervals.flat],
                           np.shape(uIntervals))

    assert np.all(np.equal(intMask, finalMask))
    assert np.all(np.equal(nIntervals, nReasons))

    #Single-argument print(...) calls behave identically on Python 2 and 3;
    #print('') stands in for the bare Python 2 'print' (one empty line).
    print('')
    print("All seems okay. The two plots shown should look identical.")