import numpy
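
# nantonone is defined elsewhere in this codebase and is not shown in this
# section; a minimal sketch, assuming it replaces float NaN values with None
# in a nested list of rows (the name and behaviour of the real helper may
# differ):
def nantonone(rows):
    # tolist() yields plain Python floats, so numpy.isnan is safe here
    return [[None if isinstance(value, float) and numpy.isnan(value) else value
             for value in row]
            for row in rows]
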
def meanbin(dataset, stepsize=None, numbins=None, dim=None,
            mastercolumn=0, mastermax=None, mastermin=None,
            nanthreshold=0.5, original_if_supersample=False):
    """ Do a mean binnning of the dataset  
    
    dataset:        a dataset
    stepsize:       the size of the steps in the new rebinning
                        (this mode centers the endpoints to get 
                        a fixed number bins)
    numbins:        the number of bins (overrides stepsize)
    dim:            the dimension to do the binning along (None means largest)
    mastercolumn:   the column to use as master (default 0)
    mastermax:      the maximum value needed
    mastermin:      the minimum value needed
    nanthreshold:   the amount of nans in a bin to make it a nan
    
    Stepsize or numbins must be provided.
    """
    
    data = numpy.array(dataset["data"], dtype=numpy.dtype('float'))
    
    if data.ndim != 2:
        raise TypeError("Data must be 2-d")

    # if autodim, find largest dimension
    if dim is None:
        dim = 0 if numpy.size(data, 0) > numpy.size(data, 1) else 1
                
    if dim == 1:
        data = data.transpose()
        
    if stepsize is None and numbins is None:
        raise TypeError("Must provide either stepsize or numbins")
        
    if stepsize is not None and numbins is not None:
        raise TypeError("Must provide either stepsize or numbins, but not both")
    
    # Sort data on mastercolumn
    sortorder = data.argsort(axis=0)[:, mastercolumn]
    data = data[sortorder,:]
    
    # data is sorted on mastercolumn, so the first and last rows hold the
    # minimum and maximum master values
    if mastermin is None:
        mastermin = data[0, mastercolumn]

    if mastermax is None:
        mastermax = data[-1, mastercolumn]
    
    # FIXME: the extract method seems to be too slow for large datasets;
    # maybe consider removing the min/max extractors

    master = data[:, mastercolumn]
    extract = numpy.all([(master > mastermin), (master <= mastermax)], 0)
    data = data[extract, :]
    # data has already been transposed if dim == 1, so rows are on axis 0
    datalen = numpy.size(data, 0)

    # Using a stepsize
    if stepsize is not None:
        if not isinstance(stepsize, (int, float)):
            raise TypeError("Stepsize must be a number")
        numbins = ((mastermax - mastermin)/stepsize) + 1
        extra = ((numbins - int(numbins))*stepsize)/2
        mastermin += extra
        mastermax -= extra
        numbins = int(numbins)
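        # Worked example (illustrative, not from the source): mastermin=0,
        # mastermax=10, stepsize=3 gives numbins=4.33..., extra=0.5; trimming
        # both ends yields linspace(0.5, 9.5, 4) = [0.5, 3.5, 6.5, 9.5],
        # whose spacing is exactly stepsize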
    
    # If trying to supersample
    # If trying to supersample
    if numbins > datalen:

        if original_if_supersample:

            dataset["derived"] = True

            if "warnings" not in dataset:
                dataset["warnings"] = []
            dataset["warnings"].append("Data was not meanbinned, as the sampling parameters would result in supersampled data")

            # limit to the requested master range
            master = data[:, mastercolumn]
            extract = numpy.all([(master > mastermin), (master <= mastermax)], 0)
            data = data[extract, :]

            if dim == 1:
                data = data.transpose()

            dataset["data"] = nantonone(data.tolist())
            dataset["rows"] = len(dataset["data"])

            return dataset

        else:

            raise TypeError("Specified parameters would result in supersampling")
    
    (bins, binwidth) = numpy.linspace(mastermin, mastermax, num=numbins, retstep=True)
    
    binhalf = binwidth/2.0
    
    master = data[:, mastercolumn]
    
    outdata = numpy.empty([numbins,numpy.size(data, 1)])
    

    localend = 0
    
    for idx, binval in enumerate(bins):  # avoid shadowing the builtin bin()
        (binmin, binmax) = (binval - binhalf, binval + binhalf)


        # Non-optimized:       
        #extract = numpy.all([(master > binmin),(master <= binmax)], 0)
        #local = data[extract,:]

        # Optimized extract: data is sorted on mastercolumn, so each bin's
        # rows form a contiguous slice and a single sweeping index suffices
        localstart = localend
        while localend < datalen and data[localend, mastercolumn] <= binmax:
            localend += 1
        local = data[localstart:localend, :]
                   
        
        sums = numpy.nansum(local, 0)
        
        nans = numpy.isnan(local)
        nancount = nans.sum(0)
        nonnancount = (~nans).sum(0)

        row = sums/nonnancount

        # numpy.float was removed in NumPy 1.24; the builtin float is equivalent
        nancover = nancount.astype(float)/(nonnancount + nancount)
        nanindex = (nancover > nanthreshold)
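        # e.g. with nanthreshold=0.5, a column where 3 of 4 samples in this
        # bin are NaN (nancover 0.75) gets masked below (illustrative example)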
        
        # if the nancover is more than the threshold,
        # convert that point into a NaN
        row[nanindex] = numpy.nan
        
        # set the master element to the bin center instead of the mean master
        # value; for large bins on evenly spaced data these should be almost
        # the same, except at the endpoints
        row[mastercolumn] = binval
        
        outdata[idx,:] = row
    
    if dim == 1:
        outdata = outdata.transpose()

    dataset["data"] = nantonone(outdata.tolist())
    dataset["rows"] = len(dataset["data"])
    dataset["derived"] = True

    return dataset
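

# A minimal usage sketch for meanbin (illustrative only), assuming the
# dataset dict convention used above: a "data" key holding a list of rows,
# with "rows", "derived" and "warnings" filled in by meanbin itself.
if __name__ == "__main__":
    example = {
        "data": [[0.1, 10.0], [0.9, 12.0], [2.1, 11.0],
                 [2.9, 13.0], [4.2, 15.0], [4.8, 14.0]],
    }
    rebinned = meanbin(example, numbins=3)
    for row in rebinned["data"]:
        print(row)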


def spans_to_time(depth, time):
    """
        Put discrete data on timescale

        depth:              sequence of depth values
        time:               sequence of time values
        nan_threshold:      the fraction of samples needed to return a non-nan 
                            value from the interval
        nan_ignore:         ignore nans

        Based on Sune Olander Rasmussen 23 March 2006 data_on_timescale.m
        which in turn is based on resampling code from Bo M. Vinther

    """
    
    ## Check consistency

    if depth["columns"] != 2:
        raise TypeError("depth must be specified with two columns of data")

    if time["columns"] != 2:
        raise TypeError("time must be specified with two columns of data")

    if depth["sequence"]["index_marker_type"] != "span":
        raise TypeError("depth must be a spanning sequence")

    if time["sequence"]["index_marker_type"] != "point":
        raise TypeError("time must be a point sequence")


    depthdata = numpy.asarray(depth["data"])
    timedata = numpy.asarray(time["data"])

    M = depth["rows"]
    N = time["rows"]-1

    output = numpy.zeros((N, 3))
    output[:,0] = timedata[1:, 0]     
    output[:,1] = timedata[1:, 1]  

    minj = 0
    maxj = 0

    for i in range(N):

        # find first data sample in time interval
        for j in range(minj, M):
            if timedata[i, 0] < depthdata[j, 0]:
                minj = j
                break

        for j in range(minj, M):
            if timedata[i+1, 0] <= depthdata[j, 0]:
                maxj = j
                break

        mm = maxj-minj+1  # number of samples in the time interval

        # FIXME: handle time starting before the data, or the reverse
        # if minj == maxj and minj == 0:
        #     output[i, 2] = numpy.nan
        #     continue

        dz = numpy.zeros(mm)

        # FIXME: ignore nan stuff

        dz[0] = depthdata[minj, 0] - timedata[i, 0]
        for j in range(1, mm-1):
            dz[j] = depthdata[minj+j, 0] - depthdata[minj+j-1, 0]

        dz[mm-1] = timedata[i+1, 0] - depthdata[maxj-1, 0]

        DZ = numpy.sum(dz)  # FIXME: NaN handling

        # mean of the samples, weighted by their overlap with the interval
        for j in range(mm):
            val = (dz[j] * depthdata[minj+j, 1]) / DZ
            output[i, 2] += val

    lst = nantonone(output.tolist())
    time["data"] = lst
    time["current_parameters"].append(depth["current_parameters"][0])
    return time
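

# A minimal usage sketch for spans_to_time (illustrative only), assuming the
# dataset dict convention implied above: "columns", "rows", "data", a
# "sequence" dict carrying "index_marker_type", and a "current_parameters"
# list. All field names and values here are hypothetical.
if __name__ == "__main__":
    depth = {
        "columns": 2, "rows": 4,
        "sequence": {"index_marker_type": "span"},
        "current_parameters": ["d18O"],
        # span end depth, measured value
        "data": [[1.0, 5.0], [2.0, 6.0], [3.0, 7.0], [4.0, 8.0]],
    }
    time = {
        "columns": 2, "rows": 3,
        "sequence": {"index_marker_type": "point"},
        "current_parameters": ["age"],
        # depth of time marker, age
        "data": [[0.5, 100.0], [2.5, 200.0], [3.5, 300.0]],
    }
    result = spans_to_time(depth, time)
    print(result["data"])  # age intervals with depth-weighted mean values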