Beispiel #1
0
def get_diffs(mat):
    """
    Compute pairwise differences separately for every column of "mat".

    Each column is converted to a float64 array, NaN entries are dropped, and "pairdiff" is called on every unordered pair of the remaining values (pairs are produced in itertools.combinations order).

    Returns a dict that maps each column key to the list of pairdiff results. Each list holds (number of non-NaN values) choose 2 entries, so a column with fewer than two usable values maps to an empty list.
    """
    per_column = {}
    for name in mat:
        values = np.array(mat[name], dtype = 'float64') #non-numeric columns will fail this conversion
        present = values[np.logical_not(np.isnan(values))]
        results = []
        for first, second in combinations(present, 2):
            results.append(pairdiff(first, second))
        per_column[name] = results
    return per_column
Beispiel #2
0
    def getPairDiff(
        self,
        filepath,
        querypath,
        numberrange,
        dayrange,
        firstcol,
        lastcol,
        excel
        ):
        """
        Load a data file, filter and reduce its rows, and build the pairwise-difference summary table.

        Parameters:
            filepath: path to a .csv, .xlsx or .xls file; its first column is used as the index.
            querypath: passed to createQuery to decide which rows are included.
            numberrange: passed to handle_range (range handling, i.e. "[number] to [number]").
            dayrange: passed to handle_days (per the original notes: 0 is average, 1 is first, 2 is second, -1 is keep all rows).
            firstcol, lastcol: positional bounds (both inclusive) of the measurement columns.
            excel: sheet selector handed to pd.read_excel; only used for Excel files.

        Side effects:
            Stores the intermediate frames on self as data, datarange, dataquery, datadays and datamat, stores the result as self.pairdiff, and calls self.debugger(filepath).

        Returns:
            A DataFrame indexed by the selected column names with the columns "diff", "mean", "sd", "count_nan" and "sample_size".

        Raises:
            ValueError: if the file extension is not csv, xlsx or xls.
        """
        #import data
        #determine type of import from the final extension only, so names with extra dots ("my.data.csv") or without any extension are handled
        filetype = os.path.splitext(filepath)[1].lstrip(".").lower()
        if filetype == "csv":
            #pd.DataFrame.from_csv no longer exists in current pandas; this call is its documented equivalent
            data = pd.read_csv(filepath, index_col = 0, parse_dates = True)
        elif filetype in ("xlsx", "xls"):
            #the sheet is passed positionally because its keyword was renamed from "sheetname" to "sheet_name" between pandas versions
            data = pd.read_excel(filepath, excel, index_col = 0) #reads the sheet indexed by "excel" - it's python indexed.
        else:
            #fail here with a clear message instead of hitting an unbound "data" further down
            raise ValueError(
                "Unsupported file type {0!r} for {1!r}: expected csv, xlsx or xls".format(filetype, filepath)
            )

        self.data = copy(data) #original data

        #the return value is not used, so handle_range is presumably expected to modify data in place
        handle_range(data, numberrange, firstcol, lastcol) #process range issue, i.e. [number] to [number]
        self.datarange = copy(data) #save the range dataset

        #process subset selection from query.xml     take data, give datadrop
        inclusion = createQuery(querypath, data.shape[0], data)
        #positions of the rows to keep; .iloc is used because .ix is gone from pandas and would read these integers as labels on an integer index
        keep_rows = [position for position, keep in enumerate(inclusion) if keep]
        datadrop = data.iloc[keep_rows]
        self.dataquery = datadrop #save the query inclusion

        #process data with multiple days. 0 is average, 1 is first, 2 is second, -1 is keep all rows    #take datadrop, give dataproc
        dataproc = handle_days(datadrop, dayrange, firstcol, lastcol)
        self.datadays = dataproc

        #create difference     take dataproc
        datamat = dataproc.iloc[:, list(range(firstcol, lastcol + 1))]
        self.datamat = datamat
        diff = pairdiff(datamat)
        diffmeas = pd.DataFrame(data = diff,
                                index = dataproc.columns[firstcol:lastcol+1],
                                columns = ["diff"])

        #add summary information to output
        outmean, outsd, outnan, outsamplesize = summaries(dataproc, firstcol, lastcol)
        diffmeas['mean'] = pd.Series(outmean, index = diffmeas.index)
        diffmeas['sd'] = pd.Series(outsd, index = diffmeas.index)
        diffmeas['count_nan'] = pd.Series(outnan, index = diffmeas.index)
        diffmeas['sample_size'] = pd.Series(outsamplesize, index = diffmeas.index)

        self.pairdiff = diffmeas
        self.debugger(filepath)

        return diffmeas
Beispiel #3
0
def get_diffs(mat):
    """
    Build, for each column of "mat", the list of "pairdiff" results over all unique value pairs.

    A column is first cast to float64 and stripped of NaN entries; "pairdiff" then receives every 2-element combination of what is left. The return value is a dict keyed by column, and each list has (count of non-NaN values) choose 2 elements.
    """

    def column_diffs(raw):
        # dtype is fixed to float64 so that np.isnan can be applied
        as_float = np.array(raw, dtype='float64')
        kept = as_float[~np.isnan(as_float)]
        return [pairdiff(left, right) for left, right in combinations(kept, 2)]

    return {key: column_diffs(mat[key]) for key in mat}
Beispiel #4
0
    def getPairDiff(self, filepath, querypath, numberrange, dayrange, firstcol,
                    lastcol, excel):
        """
        Load a data file, filter and reduce its rows, and build the pairwise-difference summary table.

        Parameters:
            filepath: path to a .csv, .xlsx or .xls file; its first column is used as the index.
            querypath: passed to createQuery to decide which rows are included.
            numberrange: passed to handle_range (range handling, i.e. "[number] to [number]").
            dayrange: passed to handle_days (per the original notes: 0 is average, 1 is first, 2 is second, -1 is keep all rows).
            firstcol, lastcol: positional bounds (both inclusive) of the measurement columns.
            excel: sheet selector handed to pd.read_excel; only used for Excel files.

        Side effects:
            Stores the intermediate frames on self as data, datarange, dataquery, datadays and datamat, stores the result as self.pairdiff, and calls self.debugger(filepath).

        Returns:
            A DataFrame indexed by the selected column names with the columns "diff", "mean", "sd", "count_nan" and "sample_size".

        Raises:
            ValueError: if the file extension is not csv, xlsx or xls.
        """

        #import data
        #determine type of import from the final extension only, so names with
        #extra dots ("my.data.csv") or without any extension are handled
        filetype = os.path.splitext(filepath)[1].lstrip(".").lower()
        if filetype == "csv":
            #pd.DataFrame.from_csv no longer exists in current pandas; this
            #call is its documented equivalent
            data = pd.read_csv(filepath, index_col=0, parse_dates=True)
        elif filetype in ("xlsx", "xls"):
            #the sheet is passed positionally because its keyword was renamed
            #from "sheetname" to "sheet_name" between pandas versions
            data = pd.read_excel(
                filepath, excel, index_col=0
            )  #reads the sheet indexed by "excel" - it's python indexed.
        else:
            #fail here with a clear message instead of hitting an unbound
            #"data" further down
            raise ValueError(
                "Unsupported file type {0!r} for {1!r}: expected csv, xlsx or xls"
                .format(filetype, filepath))

        self.data = copy(data)  #original data

        #the return value is not used, so handle_range is presumably expected
        #to modify data in place
        handle_range(data, numberrange, firstcol,
                     lastcol)  #process range issue, i.e. [number] to [number]
        self.datarange = copy(data)  #save the range dataset

        #process subset selection from query.xml     take data, give datadrop
        inclusion = createQuery(querypath, data.shape[0], data)
        #positions of the rows to keep; .iloc is used because .ix is gone from
        #pandas and would read these integers as labels on an integer index
        keep_rows = [
            position for position, keep in enumerate(inclusion) if keep
        ]
        datadrop = data.iloc[keep_rows]
        self.dataquery = datadrop  #save the query inclusion

        #process data with multiple days. 0 is average, 1 is first, 2 is second, -1 is keep all rows    #take datadrop, give dataproc
        dataproc = handle_days(datadrop, dayrange, firstcol, lastcol)
        self.datadays = dataproc

        #create difference     take dataproc
        datamat = dataproc.iloc[:, list(range(firstcol, lastcol + 1))]
        self.datamat = datamat
        diff = pairdiff(datamat)
        diffmeas = pd.DataFrame(data=diff,
                                index=dataproc.columns[firstcol:lastcol + 1],
                                columns=["diff"])

        #add summary information to output
        outmean, outsd, outnan, outsamplesize = summaries(
            dataproc, firstcol, lastcol)
        diffmeas['mean'] = pd.Series(outmean, index=diffmeas.index)
        diffmeas['sd'] = pd.Series(outsd, index=diffmeas.index)
        diffmeas['count_nan'] = pd.Series(outnan, index=diffmeas.index)
        diffmeas['sample_size'] = pd.Series(outsamplesize,
                                            index=diffmeas.index)

        self.pairdiff = diffmeas
        self.debugger(filepath)

        return diffmeas