Beispiel #1
0
    def getPairDiff(
        self,
        filepath,
        querypath,
        numberrange,
        dayrange,
        firstcol,
        lastcol,
        excel
        ):
        """Load a data table, filter it, and build the pair-difference summary.

        Pipeline: read the file, run ``handle_range``, select rows with
        ``createQuery``, run ``handle_days``, compute ``pairdiff`` over the
        measurement columns, then attach the values from ``summaries``.

        Parameters:
            filepath: path to a ``.csv``, ``.xlsx`` or ``.xls`` file; the first
                column is used as the index.
            querypath: path of the query file handed to ``createQuery``.
            numberrange: option forwarded to ``handle_range``.
            dayrange: option forwarded to ``handle_days`` (per the original
                note: 0 is average, 1 is first, 2 is second, -1 keeps all rows).
            firstcol: position of the first measurement column (inclusive).
            lastcol: position of the last measurement column (inclusive).
            excel: sheet selector passed to ``pd.read_excel``; only used for
                excel files.

        Returns:
            A DataFrame indexed by the labels of columns ``firstcol..lastcol``
            with the columns ``diff``, ``mean``, ``sd``, ``count_nan`` and
            ``sample_size``.

        Raises:
            ValueError: if the file extension is not csv, xlsx or xls.

        Side effects:
            Stores the intermediate stages on ``self`` (``data``, ``datarange``,
            ``dataquery``, ``datadays``, ``datamat``, ``pairdiff``) and calls
            ``self.debugger(filepath)``.
        """
        # Determine the type of import (csv or excel) from the LAST extension.
        # rpartition copes with dotted names ("run.v2.csv") and with names that
        # have no extension at all, both of which broke split(".")[1].
        _, dot, extension = os.path.basename(filepath).rpartition(".")
        filetype = extension.lower() if dot else ""

        if filetype == "csv":
            # Documented replacement for the removed DataFrame.from_csv.
            data = pd.read_csv(filepath, index_col=0, parse_dates=True)
        elif filetype in ("xlsx", "xls"):
            # The sheet is passed positionally because the keyword was renamed
            # from `sheetname` to `sheet_name` between pandas releases.
            data = pd.read_excel(filepath, excel, index_col=0)
        else:
            # Fail here instead of hitting an unbound `data` further down.
            raise ValueError(
                "Unsupported file type %r for %r; expected csv, xlsx or xls."
                % (filetype, filepath))

        self.data = copy(data)  # original data

        # process range issue, i.e. [number] to [number]
        # NOTE(review): the return value is ignored, so handle_range presumably
        # edits `data` in place - confirm against its definition.
        handle_range(data, numberrange, firstcol, lastcol)
        self.datarange = copy(data)  # save the range dataset

        # process subset selection from the query file: take data, give datadrop
        inclusion = createQuery(querypath, data.shape[0], data)
        # `inclusion` is walked by position, so select rows by position too;
        # the removed `.ix` would have treated these numbers as labels whenever
        # the index happened to be integer-valued.
        keep_rows = [pos for pos, keep in enumerate(inclusion) if keep]
        datadrop = data.iloc[keep_rows]
        self.dataquery = datadrop  # save the query inclusion

        # process data with multiple days: take datadrop, give dataproc
        dataproc = handle_days(datadrop, dayrange, firstcol, lastcol)
        self.datadays = dataproc

        # create difference: take dataproc
        datamat = dataproc.iloc[:, list(range(firstcol, lastcol + 1))]
        self.datamat = datamat
        diff = pairdiff(datamat)
        diffmeas = pd.DataFrame(data=diff,
                                index=dataproc.columns[firstcol:lastcol + 1],
                                columns=["diff"])

        # add summary information to output
        outmean, outsd, outnan, outsamplesize = summaries(dataproc, firstcol, lastcol)
        diffmeas['mean'] = pd.Series(outmean, index=diffmeas.index)
        diffmeas['sd'] = pd.Series(outsd, index=diffmeas.index)
        diffmeas['count_nan'] = pd.Series(outnan, index=diffmeas.index)
        diffmeas['sample_size'] = pd.Series(outsamplesize, index=diffmeas.index)

        self.pairdiff = diffmeas
        self.debugger(filepath)

        return diffmeas
Beispiel #2
0
    def getPairDiff(self, filepath, querypath, numberrange, dayrange, firstcol,
                    lastcol, excel):
        """Load a data table, filter it, and build the pair-difference summary.

        Pipeline: read the file, run ``handle_range``, select rows with
        ``createQuery``, run ``handle_days``, compute ``pairdiff`` over the
        measurement columns, then attach the values from ``summaries``.

        Parameters:
            filepath: path to a ``.csv``, ``.xlsx`` or ``.xls`` file; the first
                column is used as the index.
            querypath: path of the query file handed to ``createQuery``.
            numberrange: option forwarded to ``handle_range``.
            dayrange: option forwarded to ``handle_days`` (per the original
                note: 0 is average, 1 is first, 2 is second, -1 keeps all rows).
            firstcol: position of the first measurement column (inclusive).
            lastcol: position of the last measurement column (inclusive).
            excel: sheet selector passed to ``pd.read_excel``; only used for
                excel files.

        Returns:
            A DataFrame indexed by the labels of columns ``firstcol..lastcol``
            with the columns ``diff``, ``mean``, ``sd``, ``count_nan`` and
            ``sample_size``.

        Raises:
            ValueError: if the file extension is not csv, xlsx or xls.

        Side effects:
            Stores the intermediate stages on ``self`` (``data``, ``datarange``,
            ``dataquery``, ``datadays``, ``datamat``, ``pairdiff``) and calls
            ``self.debugger(filepath)``.
        """
        # Determine the type of import (csv or excel) from the LAST extension.
        # rpartition copes with dotted names ("run.v2.csv") and with names that
        # have no extension at all, both of which broke split(".")[1].
        _, dot, extension = os.path.basename(filepath).rpartition(".")
        filetype = extension.lower() if dot else ""

        if filetype == "csv":
            # Documented replacement for the removed DataFrame.from_csv.
            data = pd.read_csv(filepath, index_col=0, parse_dates=True)
        elif filetype in ("xlsx", "xls"):
            # The sheet is passed positionally because the keyword was renamed
            # from `sheetname` to `sheet_name` between pandas releases.
            data = pd.read_excel(filepath, excel, index_col=0)
        else:
            # Fail here instead of hitting an unbound `data` further down.
            raise ValueError(
                "Unsupported file type %r for %r; expected csv, xlsx or xls."
                % (filetype, filepath))

        self.data = copy(data)  # original data

        # process range issue, i.e. [number] to [number]
        # NOTE(review): the return value is ignored, so handle_range presumably
        # edits `data` in place - confirm against its definition.
        handle_range(data, numberrange, firstcol, lastcol)
        self.datarange = copy(data)  # save the range dataset

        # process subset selection from the query file: take data, give datadrop
        inclusion = createQuery(querypath, data.shape[0], data)
        # `inclusion` is walked by position, so select rows by position too;
        # the removed `.ix` would have treated these numbers as labels whenever
        # the index happened to be integer-valued.
        keep_rows = [pos for pos, keep in enumerate(inclusion) if keep]
        datadrop = data.iloc[keep_rows]
        self.dataquery = datadrop  # save the query inclusion

        # process data with multiple days: take datadrop, give dataproc
        dataproc = handle_days(datadrop, dayrange, firstcol, lastcol)
        self.datadays = dataproc

        # create difference: take dataproc
        datamat = dataproc.iloc[:, list(range(firstcol, lastcol + 1))]
        self.datamat = datamat
        diff = pairdiff(datamat)
        diffmeas = pd.DataFrame(data=diff,
                                index=dataproc.columns[firstcol:lastcol + 1],
                                columns=["diff"])

        # add summary information to output
        outmean, outsd, outnan, outsamplesize = summaries(
            dataproc, firstcol, lastcol)
        diffmeas['mean'] = pd.Series(outmean, index=diffmeas.index)
        diffmeas['sd'] = pd.Series(outsd, index=diffmeas.index)
        diffmeas['count_nan'] = pd.Series(outnan, index=diffmeas.index)
        diffmeas['sample_size'] = pd.Series(outsamplesize,
                                            index=diffmeas.index)

        self.pairdiff = diffmeas
        self.debugger(filepath)

        return diffmeas
Beispiel #3
0
        print "No filters specified in xml query."

    return(query.include)

if __name__ == "__main__":
    ########################
    #remove this later
    lastcol = 10
    firstcol = 1

    import pandas as pd
    from process import handle_range, handle_days
    from pairdiff import pairdiff
    data = pd.DataFrame.from_csv("../DmelClockTimeSeriesSearch-2015-03-26--DataTable3.csv")
    handle_range(data,1) #process range issue
    dataproc = handle_days(data,0) #processed data with dates

    m, n = dataproc.shape
    ########################

    
    #xml processing
    xmldoc = minidom.parse('query.xml') #only user input is here
    rawexclusion = xmldoc.getElementsByTagName('rowMod')
    rawType = str(rawexclusion[0].attributes['type'].value)
    rawValue = [int(x) for x in (rawexclusion[0].firstChild.nodeValue).split(',')]
    filters = xmldoc.getElementsByTagName('filter')

    #build query object
    query = filterobj(m,dataproc)
    if rawType == 'include':