Ejemplo n.º 1
0
 def normalization_shift_rows_or_cols(self, axis, stacker, history):
     """Remove systematic per-row or per-column offsets from plate values.

     axis    -- 0 to correct along rows, 1 along columns of the stacked data.
     stacker -- callable that turns the (plate, repindex) keys (or arrays)
                into one stacked array; presumably resolves keys to plate
                arrays -- confirm against caller.
     history -- running record of applied offsets, updated in place
                (a single array when combine_replicates, else indexed
                by replicate).

     Returns a dict mapping (plate, repindex) -> offset-corrected values.
     NOTE: Python 2 only (dict.iteritems).
     """
     # endshape reshapes the 1-D offset vector so it broadcasts along `axis`.
     endshape = [-1, -1]
     endshape[axis] = 1
     if self.combine_replicates:
         # Pool every plate/replicate into one stack and blank all
         # non-control wells so only controls drive the median.
         all_plates = stacker(self.normalization_plate_values.keys())
         controls = (stacker([self.normalization_control_maps[pl] for pl, _ in self.normalization_plate_values.keys()]) != CONTROL_POPULATION)
         all_plates[controls] = np.nan
         # use conservative_nanmedian to avoid taking median of too few values
         offsets = fix_nans(conservative_nanmedian(all_plates, axis))
         if offsets is None:  # too many NaNs to use conservative_nanmedian, try again.
             offsets = fix_nans(nanmedian(all_plates, axis))
             assert offsets is not None, "Too many bad values to correct row/column"
         offsets = offsets.reshape(endshape)
         # shift offsets to zero-median to keep things identifiable
         offsets -= np.median(offsets)
         history += offsets
         return dict(((plate, repindex), values - offsets)
                     for ((plate, repindex), values) in self.normalization_plate_values.iteritems())
     else:
         # Correct each replicate independently with its own offset vector.
         offsets = {}
         for repindex in range(self.num_replicates):
             rep_plates = stacker([v for (_, rep), v in self.normalization_plate_values.iteritems() if repindex == rep])
             controls = (stacker([self.normalization_control_maps[pl] for pl, rep in self.normalization_plate_values.keys() if repindex == rep]) != CONTROL_POPULATION)
             rep_plates[controls] = np.nan
             # use conservative_nanmedian to avoid taking median of too few values
             offsets[repindex] = fix_nans(conservative_nanmedian(rep_plates, axis))
             if offsets[repindex] is None:  # too many NaNs to use conservative_nanmedian, try again.
                 offsets[repindex] = fix_nans(nanmedian(rep_plates, axis))
                 assert offsets[repindex] is not None, "Too many bad values to correct row/column"
             offsets[repindex] = offsets[repindex].reshape(endshape)
             # shift offsets to zero-median to keep things identifiable
             offsets[repindex] -= np.median(offsets[repindex])
             history[repindex] += offsets[repindex]
         return dict(((plate, repindex), values - offsets[repindex])
                     for ((plate, repindex), values) in self.normalization_plate_values.iteritems())
Ejemplo n.º 2
0
def clean_outliers(data, thresh):
    '''
    (by Alejandro N |uacute| |ntilde| ez)

    Cleans a data from outliers by replacing them with numpy nans. A point *x* is identified as an outlier if \| *x* - *med* \| / *MAD* > *thresh*, where *med* is the median of the data values and *MAD* is the median absolute deviation, defined as 1.482 * median(\| *x* - *med* \|).

    This function mimics IDL mc_findoutliers (by Mike Cushing), with output differences.

    *data*
      Array with data values.
    *thresh*
      The sigma threshold that defines data outliers.

    Returns a float copy of *data* with outliers set to NaN, or None if
    *data* is not indexable.
    '''
    # Check inputs: data must be indexable (list/array), not a scalar.
    try:
        data[0]
    except TypeError:
        print('Data invalid.')
        return

    # BUG FIX: force a float copy up front so NaN can be assigned even for
    # integer (or list) input; compute the statistics on this array too.
    dataClean = np.array(data, dtype=float)

    # Calculate median and median absolute deviation (NaN-aware).
    # np.nanmedian replaces scipy.stats.nanmedian (removed in SciPy 1.0).
    med = np.nanmedian(dataClean)
    mad = 1.482 * np.nanmedian(np.abs(dataClean - med))

    if mad == 0:
        print('MAD is equal to zero.')
    else:
        outlierIdx = np.where(np.abs((dataClean - med) / mad) > thresh)
        # BUG FIX: np.where returns a tuple of index arrays, whose len() is
        # the number of dimensions (always >= 1, so the old check was always
        # true); test the number of matched elements instead.
        if outlierIdx[0].size != 0:
            dataClean[outlierIdx] = np.nan

    return dataClean
Ejemplo n.º 3
0
def calc_norm_summary_tables(accuracy_tbl, time_tbl):
    """
    Calculate normalized performance/ranking summary, as numpy
    matrices as usual for convenience, and matrices of additional
    statistics (min, max, mean, median per minimizer).

    Here normalized means relative to the best which gets a 1, all
    others get the ratio resulting from dividing by the performance of
    the best.

    @param accuracy_tbl :: (problems x minimizers) chi-squared values
    @param time_tbl :: (problems x minimizers) runtimes

    @returns (norm_acc_rankings, norm_runtimes, summary_cells_acc,
    summary_cells_runtime)
    """
    # Min across all minimizers, i.e. for each fit problem what is the lowest chi-squared and the lowest time
    min_sum_err_sq = np.nanmin(accuracy_tbl, 1)
    min_runtime = np.nanmin(time_tbl, 1)

    # create normalised tables (broadcast the per-problem minima column-wise)
    norm_acc_rankings = accuracy_tbl / min_sum_err_sq[:, None]
    norm_runtimes = time_tbl / min_runtime[:, None]

    # np.nanmean/np.nanmedian replace scipy.stats.nanmean/nanmedian,
    # which were removed in SciPy 1.0.
    summary_cells_acc = np.array([np.nanmin(norm_acc_rankings, 0),
                                  np.nanmax(norm_acc_rankings, 0),
                                  np.nanmean(norm_acc_rankings, 0),
                                  np.nanmedian(norm_acc_rankings, 0)
                                  ])

    summary_cells_runtime = np.array([np.nanmin(norm_runtimes, 0),
                                      np.nanmax(norm_runtimes, 0),
                                      np.nanmean(norm_runtimes, 0),
                                      np.nanmedian(norm_runtimes, 0)
                                      ])

    return norm_acc_rankings, norm_runtimes, summary_cells_acc, summary_cells_runtime
Ejemplo n.º 4
0
def plotForce():
    """Plot force-field panels and the distribution of force-origin
    displacements, and save summary statistics to 'force.npy'.

    NOTE: Python 2 syntax (print statement); relies on module-level
    helpers/globals: figure, subplot, subplot_annotate, plt, np, stats.
    stats.nanmedian was removed in SciPy 1.0 -- NOTE(review): needs
    np.nanmedian on modern SciPy.
    """
    figure(size=3,aspect=0.5)
    subplot(1,2,1)
    from EvalTraj import plotFF
    plotFF(vp=351,t=28,f=900,cm=0.6,foffset=8)
    subplot_annotate()
    
    subplot(1,2,2)
    # one curve per dataset: median over the 3rd axis, first column dropped
    for i in [1,2,3,4]:
        R=np.squeeze(np.load('Rdpse%d.npy'%i))
        R=stats.nanmedian(R,axis=2)[:,1:,:]
        dps=np.linspace(-1,1,201)[1:]
        plt.plot(dps,R[:,:,2].mean(0));
    plt.legend([0,0.1,0.2,0.3],loc=3) 
    i=2
    R=np.squeeze(np.load('Rdpse%d.npy'%i))
    R=stats.nanmedian(R,axis=2)[:,1:,:]
    # displacement that minimizes net force, per trajectory
    mn=np.argmin(R,axis=1)
    # small random jitter in y so overlapping '+' markers stay visible
    y=np.random.randn(mn.shape[0])*0.00002+0.0438
    plt.plot(np.sort(dps[mn[:,2]]),y,'+',mew=1,ms=6,mec=[ 0.39  ,  0.76,  0.64])
    plt.xlabel('Displacement of Force Origin')
    plt.ylabel('Average Net Force Magnitude')
    hh=dps[mn[:,2]]
    # 95% and 50% t confidence intervals
    # NOTE(review): t.ppf is given df = n rather than n-1 -- confirm intended
    err=np.std(hh)/np.sqrt(hh.shape[0])*stats.t.ppf(0.975,hh.shape[0])
    err2=np.std(hh)/np.sqrt(hh.shape[0])*stats.t.ppf(0.75,hh.shape[0])
    m=np.mean(hh)
    print m, m-err,m+err
    np.save('force',[m, m-err,m+err,m-err2,m+err2])
    plt.xlim([-0.5,0.5])
    plt.ylim([0.0435,0.046])
    plt.grid(b=True,axis='x')
    subplot_annotate()
def removeoutliers(inarray, stdcut=3.0):
    """Replace outliers beyond ``stdcut`` sigma with the nearest good value.

    Works IN PLACE on ``inarray``: non-finite entries are zeroed first,
    then every point further than stdcut standard deviations from the
    median is overwritten with the value at the nearest non-outlier
    index. Returns the (same, mutated) array.
    """
    # first mark the bad numbers: zero NaN/inf so they don't poison the stats
    inarray[np.logical_not(np.isfinite(inarray))] = 0.
    indexarray = np.arange(len(inarray))
    # Hoist the median/std computation out of the two comparisons.
    # np.nanmedian/np.nanstd replace the legacy scipy.stats versions.
    # NOTE(review): np.nanstd uses ddof=0; if the original nanstd was
    # scipy's (ddof=1) the cutoff shifts very slightly -- confirm.
    deviation = np.abs(inarray - np.nanmedian(inarray))
    cutoff = stdcut * np.nanstd(inarray)
    badi = indexarray[deviation > cutoff]
    goodi = indexarray[deviation <= cutoff]
    outarray = inarray  # intentional alias: correction happens in place
    for i in badi:
        # BUG FIX: read the value at the nearest good *index* goodi[...];
        # the original indexed inarray at the argmin position itself,
        # which could return another outlier.
        outarray[i] = inarray[goodi[np.abs(goodi - i).argmin()]]
    return outarray
Ejemplo n.º 6
0
 def median_f(self, x):
     """Compute the median over the time-varying axis (axis 1) of a
     front relative quantity, x, ignoring NaNs.

     np.nanmedian replaces scipy.stats.nanmedian (removed in SciPy 1.0);
     the result is identical for finite and NaN-containing input.
     """
     # TODO: the axis used in nanmean is different for U and Uf
     # calcs - change Uf dims to make consistent?
     return np.nanmedian(x, axis=1)
Ejemplo n.º 7
0
    def get_munged_clean_data(self):
        """Build cleaned train/test matrices from self.datamart.

        Steps: split off the 'Target' column as Y_train, append boolean
        NaN-indicator columns to both matrices, impute remaining NaNs
        with the *training* column medians, and drop zero-variance
        columns (using the training variance for both sets).

        Returns (X_train, Y_train, X_test).
        NOTE: Python 2 only (xrange); stats.nanmedian was removed in
        SciPy 1.0.
        """
        train_df = self.datamart['train']

        Y_train_df = train_df['Target']
        Y_train = np.array(Y_train_df)
        del train_df['Target']

        test_df = self.datamart['test']

        # train and test must share the same feature columns, same order
        assert np.all(train_df.columns == test_df.columns)
        X_train = np.array(train_df)
        del train_df
        X_test = np.array(test_df)
        del test_df
        # missingness itself may be informative: keep it as extra features
        X_train_nan = np.isnan(X_train)
        X_test_nan = np.isnan(X_test)
        X_train = np.hstack((X_train,X_train_nan))
        X_test = np.hstack((X_test,X_test_nan))
        # impute with medians learned on the training set only
        X_train_median = stats.nanmedian(X_train,axis=0)
        for i in xrange(X_train.shape[1]):
            X_train[np.isnan(X_train[:,i]),i] = X_train_median[i]
        for i in xrange(X_test.shape[1]):
            X_test[np.isnan(X_test[:,i]),i] = X_train_median[i]
        # drop constant ("boring") columns, judged on the training data
        keep_not_boring = X_train.std(axis=0) > 0.0
        X_train = X_train[:,keep_not_boring]
        X_test = X_test[:,keep_not_boring]
        return X_train, Y_train, X_test
Ejemplo n.º 8
0
def calc_stats_old(a, maskzero=False):
    """Calculate the statistics of an array.

    Returns a dict with npix (finite pixel count), stdev, mean, median,
    max, min, centmax (index of the maximum), madfm (median absolute
    deviation, via the project's MAD helper) and a 'success' flag.
    With fewer than two finite pixels all statistics are zeroed and
    success is False. If maskzero is True, exact zeros are treated as
    missing data first.
    """
    statsDict = {}
    a = np.array(a)
    if maskzero:
        # treat exact zeros as missing data
        a = np.where(np.equal(a, 0.0), np.nan, a)

    # Check that array is not all NaNs
    statsDict['npix'] = int(np.sum(np.where(np.isnan(a), 0.0, 1.0)))
    if statsDict['npix'] >= 2:
        # np.nan* replace the removed scipy.stats.nan* functions;
        # ddof=1 preserves scipy.stats.nanstd's unbiased default.
        statsDict['stdev'] = float(np.nanstd(a.flatten(), ddof=1))
        statsDict['mean'] = float(np.nanmean(a.flatten()))
        statsDict['median'] = float(np.nanmedian(a.flatten()))
        statsDict['max'] = float(np.nanmax(a))
        statsDict['min'] = float(np.nanmin(a))
        statsDict['centmax'] = list(np.unravel_index(np.nanargmax(a),
                                                     a.shape))
        statsDict['madfm'] = float(MAD(a.flatten()))
        statsDict['success'] = True

    else:
        # BUG FIX: this branch used '==' (a no-op comparison) instead
        # of assignment for npix.
        statsDict['npix']    = 0
        statsDict['stdev']   = 0.0
        statsDict['mean']    = 0.0
        statsDict['median']  = 0.0
        statsDict['max']     = 0.0
        statsDict['min']     = 0.0
        statsDict['centmax'] = (0.0, 0.0)
        statsDict['madfm']   = 0.0
        statsDict['success'] = False

    return statsDict
Ejemplo n.º 9
0
def calc_clipped_stats_old(data, clip=3.0, nIter=10):
    """Calculate the mean and stdev of an array given a sigma clip.

    Iteratively blanks values further than clip*MAD from the mean and
    re-measures, for at most nIter iterations or until mean and MAD
    stop changing. Returns a dict with mean/median/stdev/madfm/npix/
    max/min and a 'success' flag (False when no finite pixels remain).
    MAD is the project's median-absolute-deviation helper.
    """
    data = np.array(data).flatten()

    # np.nan* replace the removed scipy.stats.nan* functions;
    # ddof=1 preserves scipy.stats.nanstd's unbiased default.
    mean = float(np.nanmean(data))
    std = float(np.nanstd(data, ddof=1))
    mad = float(MAD(data))
    # BUG FIX: median and npix were undefined when clip <= 0, raising
    # NameError when assembling the result dictionary below.
    median = float(np.nanmedian(data))
    npix = np.sum(np.where(np.isnan(data), 0.0, 1.0))

    if clip > 0.0:
        convergeFlg = 0
        itCnt = 0
        while convergeFlg == 0 and itCnt < nIter:
            meanOld, stdOld, madOld = mean, std, mad
            minVal = mean - (clip * mad)
            maxVal = mean + (clip * mad)

            # Blank values outside the clip-sigma range
            dataMsk = np.where(np.greater(data, maxVal), np.nan, data)
            dataMsk = np.where(np.less(data, minVal), np.nan, dataMsk)

            # Measure the statistics
            mean = np.nanmean(dataMsk)
            median = np.nanmedian(dataMsk)
            std = np.nanstd(dataMsk, ddof=1)
            mad = MAD(dataMsk)
            npix = np.sum(np.where(np.isnan(dataMsk), 0.0, 1.0))
            dataMsk = []

            # BUG FIX: was 'convergFlg = 1' (typo), so convergence was
            # never detected and the loop always ran all nIter passes.
            if mean == meanOld and mad == madOld:
                convergeFlg = 1
            itCnt += 1

    # Assemble the measurements into a dictionary
    m = {}
    m['mean'] = float(mean)
    m['median'] = float(median)
    m['stdev'] = float(std)
    m['madfm'] = float(mad)
    m['npix'] = int(npix)
    m['max'] = float(np.nanmax(data))
    m['min'] = float(np.nanmin(data))
    del data

    # If all nans
    if m['npix'] == 0:
        m['stdev'] = 0.0
        m['mean'] = 0.0
        m['median'] = 0.0
        m['max'] = 0.0
        m['min'] = 0.0
        m['centmax'] = (0.0, 0.0)
        m['madfm'] = 0.0
        m['success'] = False
    else:
        m['success'] = True

    return m
Ejemplo n.º 10
0
    def summary(self):
        """
        Return a human-readable string of summary statistics for the
        dataset: RT min/max/mean/median, GO-trial error rates, GO-miss
        rates (overall and per condition), and the SSD distribution.

        GO trials are identified by SSD being NaN (no stop signal).
        NOTE: stats.nanmean/nanmedian/itemfreq were removed from SciPy.
        NOTE(review): the 'errors' percentages are raw fractions but are
        printed with a '%' label (no *100, unlike the miss rates), and
        the per-condition miss rate divides by self.ntrials rather than
        the per-condition trial count -- confirm both are intended.
        """
        r="RT (all): Min=%.2f, Max=%.2f, Mean=%.2f, Median=%.2f\n"%(np.nanmin(self.RT),
                                                                    np.nanmax(self.RT),
                                                                    stats.nanmean(self.RT),
                                                                    stats.nanmedian(self.RT))
        for cond in range(self.design.nconditions()):
            r+="RT ({cond}): Min={minrt}, Max={maxrt}, Mean={meanrt}, Median={medrt}\n".format(
                cond=":".join(self.design.condidx(cond)),
                minrt=np.nanmin(self.RT[self.condition==cond]),
                maxrt=np.nanmax(self.RT[self.condition==cond]),
                meanrt=stats.nanmean(self.RT[self.condition==cond]),
                medrt=stats.nanmedian(self.RT[self.condition==cond]))

        # error rates on GO trials (SSD is NaN)
        r+='errors (all GO): {nerr}/{ntrials} ({errperc:.2f} %)\n'.format(
            nerr=np.sum(np.logical_not(self.correct[np.isnan(self.SSD)])),
            ntrials=len(self.correct[np.isnan(self.SSD)]),
            errperc=np.sum(np.logical_not(self.correct[np.isnan(self.SSD)]))/float(len(self.correct[np.isnan(self.SSD)])))
        for cond in range(self.design.nconditions()):
            r+='errors ({cond}): {nerr}/{ntrials} ({errperc:.2f} %)\n'.format(
                cond=":".join(self.design.condidx(cond)),
                nerr=np.sum(np.logical_not(self.correct[(self.condition==cond) & np.isnan(self.SSD)])),
                ntrials=len(self.correct[(self.condition==cond) & np.isnan(self.SSD)]),
                errperc=np.sum(np.logical_not(self.correct[(self.condition==cond) & np.isnan(self.SSD)]))
                               /float(len(self.correct[(self.condition==cond) & np.isnan(self.SSD)])))


        # misses: GO trials with no recorded RT
        r+='miss GO (all): {nmiss}/{ntrials} ({missperc:.2f} %)\n'.format(
            nmiss=np.sum(np.isnan(self.RT[np.isnan(self.SSD)])),
            ntrials=self.ntrials,
            missperc=100.*np.sum(np.isnan(self.RT[np.isnan(self.SSD)]))/float(self.ntrials)
            )
        for cond in range(self.design.nconditions()):
            r+="miss GO ({cond}): {nmiss}/{ntrials} ({missperc:.2f} %)\n".format(
                cond=":".join(self.design.condidx(cond)),
                ntrials=len(self.RT[self.condition==cond]),
                missperc=100.*np.sum(np.isnan(self.RT[(self.condition==cond) & np.isnan(self.SSD)]))/float(self.ntrials),
                nmiss=np.sum(np.isnan(self.RT[(self.condition==cond) & (np.isnan(self.SSD))])))

        # frequency table of the stop-signal delays actually used
        r+="SSD-distribution\n"
        a=stats.itemfreq(self.SSD[np.isfinite(self.SSD)])#.astype(np.int)
        r+= " NUM | "+" ".join(["%7i"%int(i) for i in (a[:,1])]) + "\n"
        r+= " SSD | "+" ".join(["%7.2f"%(i) for i in (a[:,0])]) +"\n"            
        return r
Ejemplo n.º 11
0
def fivenum(v):
    """Return the five-number summary (min, Q1, median, Q3, max) of *v*,
    ignoring NaNs. Returns None if *v* is not numeric.

    np.percentile / np.nanmedian replace SciPy's scoreatpercentile and
    nanmedian (removed in SciPy 1.0); np.percentile's default linear
    interpolation matches scoreatpercentile's behaviour.
    """
    v = np.array(v)
    try:
        np.sum(v)
    except TypeError:
        print('Error: you must provide a list or array of only numbers')
        # BUG FIX: the original fell through and crashed below on
        # non-numeric input; bail out instead.
        return
    # quartiles are computed on the finite values only
    vv = v[~np.isnan(v)]
    q1 = np.percentile(vv, 25)
    q3 = np.percentile(vv, 75)
    md = np.nanmedian(v)
    return np.nanmin(v), q1, md, q3, np.nanmax(v),
Ejemplo n.º 12
0
    def normalization_align_plates(self):
        """Shift each plate/replicate by a per-plate offset so plates align.

        The offset is the median of the alignment wells (control wells or
        all wells, depending on self.alignment_method). Offsets are then
        re-centered to zero median -- globally when combine_replicates,
        otherwise per replicate -- to keep the overall level identifiable.
        Shifts are accumulated in self.normalization_total_plate_shifts.

        Returns a dict mapping (plate, repindex) -> shifted values.
        NOTE: Python 2 only (iteritems; nanmedian(offsets.values())
        relies on values() returning a list, not a view).
        """
        offsets = {}
        for (plate, repindex), values in self.normalization_plate_values.iteritems():
            control_map = self.normalization_control_maps[plate]
            if self.alignment_method == ALIGN_POPULATION:
                align_values = values[control_map == CONTROL_POPULATION]
            elif self.alignment_method == ALIGN_EVERYTHING:
                align_values = values
            else:
                assert False, "Unknown normalization method: %s"%(self.alignment_method)

            # XXX - should not shift plates that are more than half filled by controls
            # compute an offset per-plate and per-replicate
            if len(align_values) > 0:
                offsets[plate, repindex] = np.median(align_values)
            else:
                # no alignment wells: mark as unknown, resolved to 0 below
                offsets[plate, repindex] = np.nan
        # shift offsets to zero-median to keep things identifiable
        if self.combine_replicates:
            # keep overall shift at 0
            global_shift = nanmedian(offsets.values())
            for plate, repindex in offsets:
                if np.isnan(offsets[plate, repindex]):
                    offsets[plate, repindex] = 0.0
                else:
                    offsets[plate, repindex] -= global_shift
                self.normalization_total_plate_shifts[plate, repindex] += offsets[plate, repindex]
            return dict(((plate, repindex), values - offsets[plate, repindex])
                        for ((plate, repindex), values) in self.normalization_plate_values.iteritems())
        else:
            # re-center offsets separately within each replicate
            replicate_indices = np.array([repindex for _, repindex in offsets])
            offset_vals = np.array(offsets.values())
            per_replicate_shifts = dict((repindex, nanmedian(offset_vals[replicate_indices == repindex])) for repindex in range(self.num_replicates))
            for plate, repindex in offsets:
                if np.isnan(offsets[plate, repindex]):
                    offsets[plate, repindex] = 0.0
                else:
                    offsets[plate, repindex] -= per_replicate_shifts[repindex]
                self.normalization_total_plate_shifts[plate, repindex] += offsets[plate, repindex]
            return dict(((plate, repindex), values - offsets[plate, repindex])
                        for ((plate, repindex), values) in self.normalization_plate_values.iteritems())
Ejemplo n.º 13
0
def calc_summary_table(minimizers, group_results):
    """
    Calculates a summary from problem-individual results. At the moment the only summary
    statistic calculated is the median. The output is produced as numpy matrices.

    @param minimizers :: list of minimizers used (their names)

    @param group_results :: results from running fitting tests on different problems (list
    of lists, where the first level is the group, and the second level is the individual test).
    Each individual result must expose .sum_err_sq and .runtime attributes.

    @returns two numpy matrices (where rows are the groups, and columns are the minimizers)
    with summary statistic (median of the per-problem normalized values) from the
    problem-individual results.
    """

    num_groups = len(group_results)
    num_minimizers = len(minimizers)

    groups_norm_acc = np.zeros((num_groups, num_minimizers))
    groups_norm_runtime = np.zeros((num_groups, num_minimizers))
    for group_idx, results_per_test in enumerate(group_results):
        num_tests = len(results_per_test)
        accuracy_tbl = np.zeros((num_tests, num_minimizers))
        time_tbl = np.zeros((num_tests, num_minimizers))

        # gather raw chi-squared and runtime per (test, minimizer)
        for test_idx in range(0, num_tests):
            for minimiz_idx in range(0, num_minimizers):
                accuracy_tbl[test_idx, minimiz_idx] = results_per_test[test_idx][minimiz_idx].sum_err_sq
                time_tbl[test_idx, minimiz_idx] = results_per_test[test_idx][minimiz_idx].runtime

        # Min across all alternative runs/minimizers
        min_sum_err_sq = np.nanmin(accuracy_tbl, 1)
        min_runtime = np.nanmin(time_tbl, 1)

        # normalize each test's row by its best minimizer (best == 1.0)
        norm_acc_rankings = accuracy_tbl / min_sum_err_sq[:, None]
        norm_runtime_rankings = time_tbl / min_runtime[:, None]

        # np.nanmedian replaces scipy.stats.nanmedian (removed in SciPy 1.0)
        groups_norm_acc[group_idx, :] = np.nanmedian(norm_acc_rankings, 0)
        groups_norm_runtime[group_idx, :] = np.nanmedian(norm_runtime_rankings, 0)

    return groups_norm_acc, groups_norm_runtime
Ejemplo n.º 14
0
def read_data(filename):
    """Load a comma-separated data file, impute NaNs with column medians,
    and split it into libsvm-style features and targets.

    Columns 1-5 are the five target variables, columns 6+ are features.
    Returns (svm_x, svm_y_ary): sparse {1-based feature id: value} dicts
    per row, and a list of per-target value lists -- presumably for a
    libsvm-style API; confirm with caller.

    NOTE: Python 2 only (map returns a list here). warn_bad_lines /
    error_bad_lines were removed in pandas 2.0, and stats.nanmedian was
    removed in SciPy 1.0.
    """
    data = pd.read_table(filename, sep=',', warn_bad_lines=True, error_bad_lines=True)
    data = np.asarray(data.values, dtype = float)
    # per-column medians for NaN imputation (the name 'col_mean' is historical)
    col_mean = stats.nanmedian(data,axis = 0)
    inds = np.where(np.isnan(data))
    data[inds] = np.take(col_mean,inds[1])
    #data=[np.concatenate((np.array([data[:,1]]).T,data[:,6:]),axis=1)]
    X_train = data[:,6: ]
    Y_train = data[:,1:6]
    svm_x = map(lambda xr: { i+1: xr[i] for i in range(xr.shape[0]) if not np.isnan(xr[i]) } , X_train )
    svm_y_ary = map( lambda i : [ y for y in Y_train[:,i]], range(Y_train.shape[1]) )
    return svm_x, svm_y_ary
Ejemplo n.º 15
0
 def get_no_nan_median(self):
     """Return the per-row median of self.C, treating the value 127 as
     missing; rows whose median is NaN keep the previous median.

     Returns a copy of self.prev_med (which is updated in place).
     NOTE(review): assumes self.C has a float dtype so NaN can be
     assigned, and that 127 is a missing-data sentinel -- confirm.
     """
     # 1*self.C makes a copy so the sentinel replacement below does not
     # modify self.C itself.
     tmp = 1*self.C
     tmp[tmp == 127] = np.nan
     cur_med = stats.nanmedian(tmp,axis=1)
     
     if np.sum(np.isnan(cur_med)) > 0:
         # only overwrite the previous medians where we got a valid one
         cur_med_is_not_nan_idx = np.logical_not(np.isnan(cur_med))
         self.prev_med[cur_med_is_not_nan_idx] = cur_med[cur_med_is_not_nan_idx]
     else:
         self.prev_med = cur_med
     
     # return a copy so callers cannot mutate the cached medians
     return 1*self.prev_med
Ejemplo n.º 16
0
def fivenum(v):
    """Returns Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum) for the input vector, a list or array of numbers based on 1.5 times the interquartile distance.

    Returns None if the input is not numeric.
    numpy.percentile / numpy.nanmedian replace SciPy's scoreatpercentile
    and nanmedian (removed in SciPy 1.0) with identical interpolation.
    """
    try:
        numpy.sum(v)
    except TypeError:
        print('Error: you must provide a list or array of only numbers')
        # BUG FIX: the original fell through and crashed below on
        # non-numeric input; bail out instead.
        return
    q1 = numpy.percentile(v, 25)
    q3 = numpy.percentile(v, 75)
    iqd = q3 - q1
    md = numpy.nanmedian(v)
    # hinges are median +/- 1.5 * IQR (Tukey's whiskers)
    whisker = 1.5 * iqd
    return numpy.nanmin(v), md - whisker, md, md + whisker, numpy.nanmax(v),
Ejemplo n.º 17
0
def binMean(X, Y, numBins=8, xmin=None, xmax=None):
    """Bin Y by the values of X into numBins equal-width bins on [xmin, xmax].

    Bin membership is (lo, hi], so a point exactly at xmin falls into no
    bin. Empty bins yield NaN (with a RuntimeWarning from numpy).

    Returns (bin centers, per-bin nan-mean of Y, per-bin nan-median of Y,
    per-bin std of Y). np.nanmean/np.nanmedian replace the SciPy versions
    removed in SciPy 1.0.
    """
    if xmin is None:
        xmin = X.min()
    if xmax is None:
        xmax = X.max()
    bins = np.linspace(xmin, xmax, numBins + 1)

    YY = np.array([np.nanmean(Y[(X > bins[b]) & (X <= bins[b + 1])]) for b in range(numBins)])
    YYmedian = np.array([np.nanmedian(Y[(X > bins[b]) & (X <= bins[b + 1])]) for b in range(numBins)])
    YYstd = np.array([np.std(Y[(X > bins[b]) & (X <= bins[b + 1])]) for b in range(numBins)])
    # centers = left edges shifted by half a bin width
    return bins[:-1] + (bins[1] - bins[0]) * 0.5, YY, YYmedian, YYstd
Ejemplo n.º 18
0
def main():
    """Prepare the loan-default train/test matrices and save them as .npy.

    Pipeline per set: read CSV, add the two 'golden' difference features,
    impute NaNs with per-column medians (train and test each use their own
    medians), standardize with a scaler fitted on the training data, and
    save arrays plus labels/ids/loss to data/.

    NOTE: Python 2 only (print statements). stats.nanmedian was removed
    in SciPy 1.0, and preprocessing.Scaler was renamed StandardScaler in
    scikit-learn -- NOTE(review): needs updating on modern libraries.
    """
    dat=pd.read_table('data/train_v2.csv',sep=',')
    print "reading done, train"
    loss=np.asarray(dat.loss)
    dat=dat.drop(['loss','id'],1)
    dat['new1']=dat['f528']-dat['f527'] #golden feature 1
    dat['new2']=dat['f528']-dat['f274'] #golden feature 2
    dat=np.asarray(dat.values, dtype=float)
    col_med = stats.nanmedian(dat,axis=0)
    print "calculated medians, train"
    # replace every NaN with its own column's median
    inds = np.where(np.isnan(dat))
    dat[inds]=np.take(col_med,inds[1])
    print "median imputation done, train"
    scaler=preprocessing.Scaler().fit(dat)
    dat=scaler.transform(dat)
    print "scaling done, train"
    # binary classification target: any loss at all
    labels=(loss>0).astype(int)
    np.save('data/x_train.npy',dat)
    np.save('data/y_train.npy',labels)
    np.save('data/loss.npy',loss)
    print "trainset done"
    
    dat=pd.read_table('data/test_v2.csv',sep=',')
    print "reading done, test"
    ids=np.asarray(dat.id)
    dat=dat.drop(['id'],1)
    dat['new1']=dat['f528']-dat['f527'] #golden feature 1
    dat['new2']=dat['f528']-dat['f274'] #golden feature 2
    dat=np.asarray(dat.values,dtype=float)
    # NOTE(review): test imputation uses test-set medians, not the train
    # medians -- confirm this is intended
    col_med=stats.nanmedian(dat,axis=0)
    print "calculated medians, test"
    inds=np.where(np.isnan(dat))
    dat[inds]=np.take(col_med,inds[1])
    print "imputation done, test"
    # scaler was fitted on the training data above
    dat=scaler.transform(dat)
    print "scaling done, test"
    np.save('data/x_test.npy',dat)
    np.save('data/ids.npy',ids)
    print "testset done"
Ejemplo n.º 19
0
def na_median(X):
    ''' returns a copy of X with NAs
    replaced by the median of the non NAs
    for each column
    '''
    # np.nanmedian replaces the legacy scipy nanmedian (removed in SciPy 1.0)
    col_median = np.nanmedian(X, axis=0)
    a = np.copy(X)
    inds = np.where(np.isnan(a))
    # (removed leftover debug writes to stderr)
    if inds[0].shape[0] > 0:
        # BUG FIX: assign each NaN the median of ITS OWN column; the old
        # 'a[inds] = col_median' assigned the whole median vector, which
        # only worked when the NaN count happened to equal the column count.
        a[inds] = np.take(col_median, inds[1])
    return a
 def getAnnulusCounts(self, im, annulusInner, annulusOuter, center):
     """Estimate background counts in an annulus around `center`.

     The annulus is the outer aperture minus the inner aperture, both
     built by the project's `aperture` helper. The returned counts are
     the median of the finite annulus pixels times the pixel count
     (a robust stand-in for the annulus sum).

     Returns [annulusCounts, nAnnPix].
     NOTE: Python 2 print statements; `nanmedian` is the legacy SciPy
     function (removed in SciPy 1.0).
     """
     startpx = int(np.round(center[0]))
     startpy = int(np.round(center[1]))
     innerMask = aperture(startpx, startpy, annulusInner)
     outerMask = aperture(startpx, startpy, annulusOuter)
     # annulus = outer disc minus inner disc
     annulusMask = outerMask-innerMask
     nanMask = np.isnan(im)
     # coordinates of annulus pixels holding finite values
     annulusPixels =  np.array(np.where(np.logical_and(annulusMask==1, nanMask==False)))
     nAnnPix = annulusPixels.shape[1]
     # robust total: median pixel value scaled by the annulus area
     annulusCounts = nanmedian(im[annulusPixels[0],annulusPixels[1]])*nAnnPix
     if self.verbose:
         print "Annulus Counts = ", annulusCounts
         print "Annulus pixels = ", nAnnPix
     return [annulusCounts, nAnnPix]
Ejemplo n.º 21
0
def sdize_vector(vec, ignore_zeroes=True, use_median=True):  ## note this is inplace! If don't want, pass vec.copy() !!
    """Standardize ``vec`` IN PLACE: subtract a location estimate and
    divide by a scale estimate (+0.001 to avoid division by zero).

    ignore_zeroes -- estimate location/scale from the nonzero entries only
                     (the whole vector is still transformed).
    use_median    -- robust estimates: nan-median and the project's `mad`
                     helper; otherwise nan-mean and nan-std.

    Returns the same (mutated) array. np.nanmedian replaces
    'from scipy.stats import nanmedian' (removed in SciPy 1.0).
    """
    v = vec
    if ignore_zeroes:
        v = vec[vec != 0]
    if use_median:
        mn = np.nanmedian(v)
        sd = mad(v)
    else:
        mn = np.nanmean(v)
        sd = np.nanstd(v)
    vec -= mn
    vec /= (sd + 0.001)  ## try to minimize copies?
    return vec
Ejemplo n.º 22
0
def plotAgdist():
    ''' Plot the average (median) distance of the pursued agents over
    time, aligned to trial start (dat) and to trial end (datrev),
    split by the number of pursued agents (1, 2, >2) and by the
    nearest/farthest agent; saves the figure to figpath+'trackAgdist'.

    NOTE: Python 2 only (map returns lists); relies on module globals
    HZ, figpath, computeTrackInfo, initVP, and the legacy nanmedian.
    '''
    dist,discard,the,rest=computeTrackInfo()
    del discard,the,rest
    plt.figure(0,figsize=(10,8))
    for vp in range(1,5):
        xlim=500
        ys=dist[vp-1]
        # time x (nearest, farthest) matrices, NaN-padded for short trials
        dat=np.zeros((len(ys),int(HZ*xlim/1000.0),2))*np.nan
        datrev=np.zeros((len(ys),int(HZ*500/1000.0),2))*np.nan
        #datN=np.zeros((len(ys),xlim/20))
        for i in range(len(ys)):
            # order agents by their median distance: ao[0] nearest, ao[-1] farthest
            ao=np.argsort(map(np.median,ys[i]))
            if len(ys[i])==0:continue
            N=ys[i][ao[0]].size
            if N==0:continue
            dat[i,:min(dat.shape[1],N),0]=ys[i][ao[0]][:min(dat.shape[1],N)]
            datrev[i,-min(datrev.shape[1],N):,0]=ys[i][ao[0]][-min(datrev.shape[1],N):]
            N=ys[i][ao[-1]].size
            dat[i,:min(dat.shape[1],N),1]=ys[i][ao[-1]][:min(dat.shape[1],N)]
            datrev[i,-min(datrev.shape[1],N):,1]=ys[i][ao[-1]][-min(datrev.shape[1],N):]
        # number of pursued agents per trial
        nrags=np.array(map(len,ys))
        ylims=[[[1,2.5]]*3,[[],[3,4],[3,5]]]
        for a in range(3)[::-1]:
            if a==2: sel=nrags>=(a+1)
            else: sel = nrags==(a+1)
            for i in range(2):
                if a==0 and i==1:continue
                plt.subplot(4,4,i*8+vp);plt.grid(b=False);#plt.ylim(ylims[i][a])
                plt.plot(np.linspace(0,xlim/1000.,dat.shape[1]),nanmedian(dat[sel,:,i],0));
                plt.subplot(4,4,i*8+vp+4);plt.grid(b=False);#plt.ylim(ylims[i][a])
                ss=datrev.shape[1]/HZ
                plt.plot(np.linspace(-ss,0,datrev.shape[1]),nanmedian(datrev[sel,:,i],0));
    plt.subplot(441)
    plt.legend(['> 2','2','1'],loc=4)
    initVP(1,1)
    plt.savefig(figpath+'trackAgdist')
Ejemplo n.º 23
0
    def medianres(self, res, wrap=2*np.pi):
        """Estimate per-slice unwrapping-error residuals.

        For every slice, reconstruct it from each cycle's pair of
        interferograms, round the reconstruction error to whole `wrap`
        cycles, and take the median residual across all cycles touching
        the slice. Writes the median residuals into res.data and records
        per-slice cycle counts (self.cyccount) and the number of pixels
        with non-zero median residual (self.check).

        res  -- output stack; res.data[k] receives slice k's residuals.
        wrap -- the wrap interval, 2*pi by default.
        """
        ncyc = self.cycs.shape[0]
        nwid = self.data.shape[2]
        nlen = self.data.shape[1]
        logger.info('Analyzing %d cycles for unwrapping errors'% (ncyc))
        # plain int replaces np.int, which was removed in NumPy 1.24
        numcheck = np.zeros(self.nslice, dtype=int)
        numcycper = np.zeros(self.nslice, dtype=int)

        progb = tsio.ProgressBar(maxValue=self.nslice)
        for kkk in range(self.nslice):
            # cycles whose first entry references this slice (1-based ids)
            cycind = np.flatnonzero(self.cycs[:, 0] == (kkk + 1))
            numcycper[kkk] = len(cycind)
            orig = self.data[kkk, :, :]
            resarr = np.zeros((numcycper[kkk], nlen, nwid), dtype=int)

            for img in range(numcycper[kkk]):
                ind = cycind[img]
                # signed 1-based interferogram references: sign gives the
                # orientation, magnitude-1 gives the 0-based index
                sgn1 = np.sign(self.cycs[ind, 1])
                ifg1 = np.abs(self.cycs[ind, 1]) - 1

                sgn2 = np.sign(self.cycs[ind, 2])
                # BUG FIX: was np.sign(self.cycs[ind, 2]) - 1, which yields
                # -2/-1/0 and is not a valid index; mirror the ifg1 formula.
                ifg2 = np.abs(self.cycs[ind, 2]) - 1

                p11 = self.data[ifg1, :, :]
                p22 = self.data[ifg2, :, :]

                recons = sgn1 * p11 + sgn2 * p22
                derr = orig - recons

                # residual in whole wrap cycles
                resarr[img, :, :] = (np.round(derr / wrap)).astype(int)

            # median residual across cycles (np.nanmedian replaces the
            # removed scipy.stats.nanmedian; resarr is integer, so NaN-free)
            medres = np.nanmedian(resarr, axis=0)

            res.data[kkk, :, :] = medres
            numcheck[kkk] = np.nansum(np.abs(medres) > 0)

            progb.update(kkk, every=3)

        progb.close()

        self.cyccount = numcycper
        self.check = numcheck
Ejemplo n.º 24
0
    def median(self, files=[], bands=[1], doReproject=True, maskName='mask',
               **kwargs):
        '''Calculate median of input bands

        Memory and CPU greedy method. Generates 3D cube from bands of
        all input images and calculates median. Adds median bands to self

        Parameters
        -----------
        files : list
            list of input files
        bands : list
            list of names/band_numbers to be processed
        doReproject : boolean, [True]
            reproject input files?
        maskName : str, ['mask']
            name of the mask in input files
        nClass : child of Nansat, [Nansat]
            This class is used to read input files
        eResampleAlg : int, [0]
            agorithm for reprojection, see Nansat.reproject()
        period : [datetime0, datetime1]
            Start and stop datetime objects from pyhon datetime.

        '''
        # NOTE(review): mutable default arguments (files=[], bands=[1]) are
        # shared across calls; they are only read here, but None-sentinels
        # would be safer.
        # check inputs
        if len(files) == 0:
            self.logger.error('No input files given!')
            return

        # modify default values
        self.bandIDs = bands
        self.doReproject = doReproject
        self.maskName = maskName
        self._set_defaults(kwargs)

        # metadata is taken from the last input image
        lastN = self._get_layer_image(files[-1])
        # add medians of all bands
        for band in bands:
            bandCube, mask = self._get_cube(files, band)
            # st.nanmedian is the legacy SciPy function (removed in 1.0)
            bandMedian = st.nanmedian(bandCube, axis=0)

            # get metadata of this band from the last image
            parameters = lastN.get_metadata(bandID=band)
            # add band and std with metadata
            self.add_band(array=bandMedian, parameters=parameters)

        # NOTE(review): 'mask' is the one from the LAST band iteration and
        # is undefined (NameError) if 'bands' is empty -- confirm bands is
        # always non-empty.
        self.add_band(array=mask, parameters={'name': 'mask'})
Ejemplo n.º 25
0
    def median(self, files=[], bands=[1], doReproject=True, maskName='mask',
                opener=Nansat, eResampleAlg=0, period=(None, None),
                vmin=-np.inf, vmax=np.inf):
        '''Calculate median of input bands

        Memory and CPU greedy method. Generates 3D cube from bands of
        all input images and calculates median. Adds median bands to self

        Parameters
        -----------
        files : list
            list of input files
        bands : list
            list of names/band_numbers to be processed
        doReproject : boolean, [True]
            reproject input files?
        maskName : str, ['mask']
            name of the mask in input files
        nClass : child of Nansat, [Nansat]
            This class is used to read input files
        eResampleAlg : int, [0]
            agorithm for reprojection, see Nansat.reproject()
        period : [datetime0, datetime1]
            Start and stop datetime objects from pyhon datetime.

        '''
        # NOTE(review): mutable default arguments (files=[], bands=[1]) are
        # shared across calls; they are only read here, but None-sentinels
        # would be safer.
        # check inputs
        if len(files) == 0:
            self.logger.error('No input files given!')
            return

        # add medians of all bands
        for band in bands:
            cube, mask, metadata = self._get_cube(files, band,
                                                    doReproject,
                                                    maskName,
                                                    opener,
                                                    eResampleAlg,
                                                    period, vmin, vmax)
            # st.nanmedian is the legacy SciPy function (removed in 1.0)
            median = st.nanmedian(cube, axis=0)

            # add band and std with metadata
            self.add_band(array=median, parameters=metadata)

        # NOTE(review): 'mask' is the one from the LAST band iteration and
        # is undefined (NameError) if 'bands' is empty -- confirm bands is
        # always non-empty.
        self.add_band(array=mask, parameters={'name': 'mask'})
Ejemplo n.º 26
0
    def _clean_nans(self, data):
        """
		Substitute NaNs with the median value of the related features

        Parameters
        ----------
        data : array, shape=[n_samples, n_features]
               Data array
        """
        r, c = np.isnan(data).nonzero()

        my = dict()
        for ic in np.unique(c):
            my[ic] = nanmedian(data[:, ic])

        for i in range(len(r)):
            data[r[i], c[i]] = my[c[i]]

        return data
Ejemplo n.º 27
0
def clean_spec_NaNs(flux):
    """
    Repair NaNs at the edges of a 1-D flux array, in place.

    Leading NaNs are replaced by a linear ramp from the overall median to
    the first valid sample; trailing NaNs by a ramp from the last valid
    sample back to the median.  Interior NaNs are NOT repaired -- a
    warning is printed if any remain.

    Parameters
    ----------
    flux : ndarray
        1-D spectrum; modified in place.

    Returns
    -------
    flux : ndarray
        The same array, returned for convenience.
    """
    nanMap = np.isnan(flux)
    if not nanMap.any():
        return flux
    print('Found NaNs in flux array')

    if nanMap.all():
        # No valid sample to anchor the ramps on (the original code raised
        # a NameError here because the edge indices were never bound).
        print('NaNs remain in flux array')
        return flux

    # Index of the first valid sample, and one past the last valid sample.
    leftEdgeIdx = int(np.argmax(~nanMap))
    rightEdgeIdx = len(nanMap) - int(np.argmax(~nanMap[::-1]))

    fluxMedian = np.nanmedian(flux)
    if leftEdgeIdx > 0:
        # Ramp from the median to the first valid sample.  The original
        # used flux[leftEdgeIdx + 1] as the endpoint, which can itself be
        # NaN (off by one) -- anchor on the first valid value instead.
        flux[:leftEdgeIdx] = np.linspace(fluxMedian, flux[leftEdgeIdx],
                                         leftEdgeIdx)
    if rightEdgeIdx < len(flux):
        flux[rightEdgeIdx:] = np.linspace(flux[rightEdgeIdx - 1], fluxMedian,
                                          len(flux) - rightEdgeIdx)

    # Interior gaps (if any) are left untouched; warn so the caller knows.
    if np.isnan(flux).any():
        print('NaNs remain in flux array')

    return flux
Ejemplo n.º 28
0
    def append_clean_nans(self):
        """
        Median-impute NaNs in the train/test data and build NaN indicators.

        NaNs in ``self.training_x`` are replaced by the per-column median
        of the training data; NaNs in ``self.testing_x`` are replaced with
        the *training* medians as well (no leakage from the test set).
        For every training column that contained at least one NaN, a 0/1
        indicator column named ``nan_<col>`` is stored in
        ``self.train_dummy_nan`` and ``self.test_dummy_nan``.
        """
        train_nan = np.isnan(self.training_x)
        # Column-wise medians ignoring NaNs.  np.nanmedian needs axis=0
        # explicitly (scipy's removed nanmedian defaulted to axis 0).
        train_median = np.nanmedian(self.training_x, axis=0)
        ms, ns = np.where(train_nan)
        for m, n in zip(ms, ns):
            # Positional indexing: DataFrame.ix was removed in pandas 1.0.
            self.training_x.iloc[m, n] = train_median[n]

        # Columns that had at least one NaN in training get an indicator.
        cols_to_keep = train_nan.sum(axis=0) != 0
        index_cols_to_keep = cols_to_keep[cols_to_keep].index
        self.train_dummy_nan = train_nan[index_cols_to_keep].astype(float)
        self.train_dummy_nan.columns = [
            "nan_" + c for c in self.train_dummy_nan.columns.tolist()]
        #self.training_x += self.train_dummy_nan

        test_nan = np.isnan(self.testing_x)
        ms, ns = np.where(test_nan)
        for m, n in zip(ms, ns):
            # Fill test NaNs with the TRAINING medians computed above.
            self.testing_x.iloc[m, n] = train_median[n]
        self.test_dummy_nan = test_nan[index_cols_to_keep].astype(float)
        self.test_dummy_nan.columns = self.train_dummy_nan.columns
Ejemplo n.º 29
0
    def make_hists(self, num_bins=None, use_prettyplotlib=True):
        """
        Plot a histogram for every column in ``self.columns``.

        Parameters
        ----------
        num_bins : int or None
            Number of histogram bins.  When None, sqrt(len(data)) of the
            first processed column is used (and then reused for all later
            columns, since num_bins is only set once).
        use_prettyplotlib : bool
            Try to plot with prettyplotlib; silently falls back to plain
            matplotlib when it is not installed.

        Notes
        -----
        When ``self.subplot`` is set, all histograms share one figure on a
        grid; otherwise each column gets its own figure, shown when
        ``self.verbose`` is set or saved to
        ``self.save_name + "_" + column + "." + self.save_type``.
        Mean, std and median of each column are collected in
        ``data_stats``.

        NOTE(review): ``p`` (used below for show/close/tight_layout) is
        not defined in this method -- presumably a module-level pyplot
        alias; confirm it matches the locally imported ``plt``.
        """

        if use_prettyplotlib:
            try:
                import prettyplotlib as plt
            except ImportError:
                # Graceful fallback when prettyplotlib is unavailable.
                import matplotlib.pyplot as plt
                use_prettyplotlib = False
                print "prettyplotlib not installed. Using matplotlib..."
        else:
            import matplotlib.pyplot as plt

        # Setup subplots if plotting together
        if self.subplot:
          num = len(self.columns)
          # Grid layout: 1 column for up to 3 plots, 2 for up to 8,
          # otherwise 3 (12 is the stated maximum).
          # NOTE(review): num / 2 relies on Python 2 floor division (the
          # print statements confirm this is Python 2 code).
          if num <= 3:
            ncols = 1
            nrows = num
          elif num <= 8:
            ncols = 2
            nrows = num / 2
          else:  # Max columns right now is 12
            ncols = 3
            nrows = num / 3
          # Check if we need an extra row.
          if num % ncols != 0:
            nrows += 1

          # Make the objects
          fig, axes = plt.subplots(nrows=nrows, ncols=ncols)

          # This is a bit awkward to get the indices, but my matplotlib version
          # doesn't use the same object type as prettyplotlib creates.
          posns = np.indices(axes.shape)
          x, y = posns[0].ravel(), posns[1].ravel()

        # Keep the mean, median, std.
        data_stats = {}
        for i, column in enumerate(self.columns):
          data = self.dataframe[column]
          # Drop NaN/inf values before binning.
          data = data[np.isfinite(data)]
          if num_bins is None:
            # Default bin count: sqrt of the sample size.
            num_bins = np.sqrt(len(data))

          # Stats are stored in the order [mean, std, median].
          data_stats[column] = [nanmean(data), nanstd(data), nanmedian(data)]

          if self.subplot:
            if use_prettyplotlib:
              # prettyplotlib's hist takes the axis as first argument.
              plt.hist(axes[x[i], y[i]],data, num_bins, grid="y")
            else:
              axes[x[i], y[i]].hist(data, num_bins)
            axes[x[i], y[i]].set_xlabel(column)  # ADD UNITS!
          else:
            # One stand-alone figure per column.
            fig, axes = plt.subplots(1)
            axes.hist(data, num_bins)
            axes.set_xlabel(column)  # ADD UNITS!

          if self.verbose and not self.subplot:
            print column+" Stats: %s" % (data_stats[column])
            p.show()

          elif not self.subplot:
            fig.savefig(self.save_name+"_"+column+"."+self.save_type)
            p.close()

        if self.subplot:
          p.tight_layout()
          if self.verbose:
            for column in self.columns:
              print column+" Stats: %s" % (data_stats[column])
            p.show()
          else:
            fig.savefig(self.save_name+"_hists."+self.save_type)
Ejemplo n.º 30
0
 def rebin(self, field, shape):
     """Rebin field to a coarser matrix.

     The output has ``shape`` cells; each cell is the median of the
     within-block row medians of the corresponding block of ``field``
     (a median of medians, from the two nested nanmedian calls).
     """
     target_rows, target_cols = shape[0], shape[1]
     row_factor = field.shape[0] // target_rows
     col_factor = field.shape[1] // target_cols
     # View the field as a (target_rows, row_factor, target_cols, col_factor)
     # grid of blocks, then collapse the within-block axes one at a time.
     blocks = field.reshape(target_rows, row_factor, target_cols, col_factor)
     per_row = nanmedian(blocks, axis=-1)
     return nanmedian(per_row, axis=1)