Beispiel #1
0
 def execute(self, df: data.Frame) -> data.Frame:
     """ Apply min-max scaling to the configured columns, preserving column order. """
     rawFrame = df.getRawFrame()
     originalOrder = rawFrame.columns.to_list()
     frame = rawFrame.copy(True)
     ranges = set(self.__attributes.values())
     if len(ranges) == 1:
         # Every column shares the same feature range: scale them in one call
         selection = frame.iloc[:, list(self.__attributes.keys())]
         scaledBlock = minmax_scale(selection,
                                    feature_range=ranges.pop(),
                                    axis=0,
                                    copy=True)
         processed = pd.DataFrame(scaledBlock).set_index(frame.index)
         processed.columns = selection.columns
     else:
         # Per-column feature ranges: scale each column independently
         scaledColumns = dict()
         for colIndex, featureRange in self.__attributes.items():
             scaledColumns[originalOrder[colIndex]] = minmax_scale(
                 frame.iloc[:, colIndex],
                 feature_range=featureRange,
                 axis=0,
                 copy=True)
         processed = pd.DataFrame(scaledColumns).set_index(frame.index)
     # Drop the original columns and re-attach the scaled ones, restoring order
     remainder = frame.drop(columns=processed.columns)
     result = pd.concat([remainder, processed], ignore_index=False,
                        axis=1)[originalOrder]
     return data.Frame(result)
Beispiel #2
0
 def execute(self, df: data.Frame) -> data.Frame:
     """ Copy the selected columns into new columns with the configured names. """
     frame = df.getRawFrame().copy(True)
     # Dict preserves insertion order, so keys and values stay aligned
     sourceIndexes: List[int] = list(self.__attributes.keys())
     targetNames: List[str] = list(self.__attributes.values())
     frame[targetNames] = frame.iloc[:, sourceIndexes]
     return data.Frame(frame)
Beispiel #3
0
 def execute(self, df: data.Frame) -> data.Frame:
     """ Standardize the selected columns (zero mean, unit variance), keeping order. """
     rawFrame = df.getRawFrame()
     originalOrder = rawFrame.columns.to_list()
     frame = rawFrame.copy(True)
     selection = frame.iloc[:, self.__attributes]
     standardized = scale(selection,
                          with_mean=True,
                          with_std=True,
                          copy=True)
     processed = pd.DataFrame(standardized).set_index(frame.index)
     processed.columns = selection.columns
     # Drop the originals, append the standardized values, then restore order
     remainder = frame.drop(columns=selection.columns)
     result = pd.concat([remainder, processed], ignore_index=False,
                        axis=1)[originalOrder]
     return data.Frame(result)
Beispiel #4
0
 def execute(self, dfl: data.Frame, dfr: data.Frame) -> data.Frame:
     """ Join two frames, either on their indexes or on one column per side. """
     left = dfl.getRawFrame()
     right = dfr.getRawFrame()
     if self.__onIndex:
         # Index-based join
         joined = left.join(right, how=self.__type.value,
                            lsuffix=self.__lSuffix,
                            rsuffix=self.__rSuffix)
     else:
         # Column-based merge; __leftOn and __rightOn are expected to be set here
         joined = left.merge(right, how=self.__type.value,
                             left_on=dfl.shape.colNames[self.__leftOn],
                             right_on=dfr.shape.colNames[self.__rightOn],
                             suffixes=(self.__lSuffix, self.__rSuffix))
     return data.Frame(joined)
Beispiel #5
0
 def execute(self, df: data.Frame) -> data.Frame:
     """ Rename the columns at the configured positional indexes. """
     updatedNames: List[str] = df.colnames
     for position, newName in self.__names.items():
         updatedNames[position] = newName
     # Shallow copy is enough: only column labels change, not the data
     renamed = df.getRawFrame().copy(deep=False)
     renamed.columns = updatedNames
     return data.Frame(renamed)
Beispiel #6
0
 def execute(self, df: data.Frame) -> data.Frame:
     """ Discretize columns into labelled bins, optionally into suffixed copies. """
     frame = df.getRawFrame().copy(True)
     allNames = frame.columns.to_list()
     for colIndex, options in self.__attributes.items():
         # options[0] = bin edges, options[1] = bin labels
         binned = pd.cut(frame.iloc[:, colIndex], bins=options[0],
                         labels=options[1], duplicates='drop')
         baseName: str = allNames[colIndex]
         # Without a suffix the original column is overwritten in place
         targetName: str = baseName + self.__attributeSuffix if self.__attributeSuffix else baseName
         frame.loc[:, targetName] = binned
     return data.Frame(frame)
Beispiel #7
0
 def execute(self, df: data.Frame) -> data.Frame:
     """ Convert the selected columns to string dtype, preserving nan values.

     astype(str) would turn nan cells into the literal string 'nan', so the
     null mask is saved before the cast and re-applied afterwards. """
     # Deep copy
     raw_df = df.getRawFrame().copy(deep=True)
     # Remember which cells were null before stringifying
     isNan = raw_df.iloc[:, self.__attributes].isnull()
     processed = raw_df.iloc[:, self.__attributes].astype(dtype=str, errors='raise')
     # Set to nan where values where nan
     processed = processed.mask(isNan, np.nan)
     raw_df.iloc[:, self.__attributes] = processed
     return data.Frame(raw_df)
Beispiel #8
0
 def execute(self, df: data.Frame) -> data.Frame:
     """ Drop columns whose nan count exceeds the configured threshold.

     Exactly one threshold should be set: a percentage of nan values per
     column, or an absolute number of nan values. Raises InvalidOptions if
     both are set. """
     if self.__thresholdPercentage is not None and self.__thresholdNumber is not None:
         raise exp.InvalidOptions('Can\'t have both threshold set')
     pf = df.getRawFrame().copy()
     # Explicit None check: a threshold of 0 (or 0.0) is a legitimate value
     # and must not fall through to the other branch
     if self.__thresholdPercentage is not None:
         # By percentage: keep columns whose nan fraction is within threshold
         pf = pf.loc[:, pf.isnull().mean() <= self.__thresholdPercentage]
     else:
         # By nan number: keep columns whose nan count is within threshold
         pf = pf.loc[:, pf.isnull().sum() <= self.__thresholdNumber]
     return data.Frame(pf)
Beispiel #9
0
 def execute(self, df: data.Frame) -> data.Frame:
     """ One-hot encode the selected columns and append the dummy columns. """
     frame = df.getRawFrame().copy(deep=True)
     prefixes = itemgetter(*self.__attributes)(self.shapes[0].colNames)
     dummies = pd.get_dummies(frame.iloc[:, self.__attributes],
                              prefix=prefixes,
                              dummy_na=self.__includeNan,
                              dtype=int)
     dummies = dummies.astype('category', copy=False)
     # Drop columns that would be duplicated by the new dummy columns
     frame = frame.drop(columns=dummies.columns, errors='ignore')
     # Original columns are kept; dummy columns are appended at the end
     frame = pd.concat([frame, dummies], axis=1)
     return data.Frame(frame)
Beispiel #10
0
    def execute(self, df: data.Frame) -> data.Frame:
        """ Remove duplicated columns among the selected ones, keeping order. """
        frame = df.getRawFrame()
        originalOrder: List[str] = frame.columns.to_list()
        selected = frame.iloc[:, self.__selected]
        toDrop = find_duplicates(selected)
        if not toDrop:
            # Nothing duplicated: return the frame untouched
            return data.Frame(frame)
        frame = frame.copy(True).drop(toDrop, axis=1)
        # Preserve the original relative order of the surviving columns
        keptOrder = [name for name in originalOrder if name not in toDrop]
        return data.Frame(frame[keptOrder])
Beispiel #11
0
    def execute(self, df: data.Frame) -> data.Frame:
        """ Convert the selected columns to a numeric (float) dtype, keeping order. """
        frame = df.getRawFrame().copy(deep=True)
        originalOrder: List[str] = frame.columns.to_list()

        numericColumns: Dict[str, np.ndarray] = {}
        for colIndex in self.__attributes:
            name = originalOrder[colIndex]
            numericColumns[name] = pd.to_numeric(frame.iloc[:, colIndex].values,
                                                 errors=self.__errorMode,
                                                 downcast='float')
        frame = frame.drop(columns=list(numericColumns.keys()))
        # Rebuild the converted block with the frame's index, then restore order
        converted = pd.DataFrame(numericColumns).set_index(frame.index)
        result = pd.concat([converted, frame], ignore_index=False, axis=1)[originalOrder]
        return data.Frame(result)
Beispiel #12
0
    def execute(self, df: data.Frame) -> data.Frame:
        """ Bin timestamp columns into labelled intervals.

        For every configured column an IntervalIndex is built from the
        user-supplied timestamp edges, values are cut into those bins and the
        resulting categories are renamed with the configured labels. Results
        either replace the original columns or are appended with a suffix. """
        columns = df.colnames
        df = df.getRawFrame().copy(True)

        # Notice that this timestamps are already set to a proper format (with default time/date) by
        # the editor
        # opts[0] holds the ordered edges; consecutive pairs become right-closed intervals
        intervals: Dict[int, pd.IntervalIndex] = \
            {i: pd.IntervalIndex([pd.Interval(a, b, closed='right') for a, b in zip(opts[0],
                                                                                    opts[0][1:])])
             for i, opts in self.__attributes.items()}

        processedDict = dict()
        for i, opts in self.__attributes.items():
            # opts = (edges, labels, byDate flag, byTime flag)
            _, labels, byDate, byTime = opts
            applyCol = df.iloc[:, i]
            if byTime and not byDate:
                # Replace the date part with the default date in a way that every ts has the
                # same date, but retains its original time. Nan values are propagated
                applyCol = applyCol \
                    .map(lambda ts:
                         pd.Timestamp(QDateTime(_IntervalWidget.DEFAULT_DATE,
                                                toQtDateTime(ts.to_pydatetime()).time()).toPython()),
                         na_action='ignore')
            name = columns[i]
            if self.__attributesSuffix:
                name += self.__attributesSuffix
            # Map every interval bin to its user-defined label
            categoriesMap = dict(zip(intervals[i], labels))
            processedDict[name] = pd.cut(
                applyCol,
                bins=intervals[i]).cat.rename_categories(categoriesMap)

        if self.__attributesSuffix:
            # With a suffix, only names clashing with existing columns must be dropped
            duplicateColumns: Set[str] = set(
                processedDict.keys()) & set(columns)
        else:
            # Without a suffix every processed column replaces the original one
            duplicateColumns: List[str] = list(processedDict.keys())
        if duplicateColumns:
            df = df.drop(columns=duplicateColumns)
        processed = pd.DataFrame(processedDict).set_index(df.index)

        df = pd.concat([df, processed], ignore_index=False, axis=1)
        if not self.__attributesSuffix:
            # Reorder columns
            df = df[columns]
        return data.Frame(df)
Beispiel #13
0
 def execute(self, df: data.Frame) -> data.Frame:
     """ Replace values in the selected columns according to the options.

     With inverted replacement, every value NOT in the given list is
     replaced instead. """
     frame = df.getRawFrame().copy(True)
     for colIndex, options in self.__attributes.items():
         for valueList, replacement in zip(*options):
             if self.__invertedReplace:
                 # Replace everything except the listed values
                 targets: List = frame.iloc[:, colIndex].unique().tolist()
                 for keep in valueList:
                     try:
                         targets.remove(keep)
                     except ValueError:
                         pass  # Value not in list (ignore)
             else:
                 targets = valueList
             frame.iloc[:, colIndex] = frame.iloc[:, colIndex].replace(
                 to_replace=targets,
                 value=replacement,
                 inplace=False)
     return data.Frame(frame)
Beispiel #14
0
def test_shape_index():
    """ A frame with a string column set as index exposes the expected shape. """
    raw = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x']
    }
    frame = Frame(raw)
    frame = Frame(frame.getRawFrame().set_index('col3'))

    # Build the shape we expect to observe
    expected = Shape()
    expected.index = ['col3']
    expected.indexTypes = [IndexType(Types.String)]
    expected.colNames = ['col1', 'col2']
    expected.colTypes = [Types.Numeric, Types.Numeric]

    assert frame.shape == expected
    assert frame.nRows == 5
Beispiel #15
0
 def execute(self, df: data.Frame) -> data.Frame:
     """ Convert the selected columns to pandas categorical dtype.

     Values are first stringified while keeping nan cells, then cast to a
     CategoricalDtype built from each column's configured categories and
     ordered flag. """
     # Deep copy
     raw_df = df.getRawFrame().copy(deep=True)
     # To string
     columnIndexes = list(self.__attributes.keys())
     # Remember nulls: astype(str) would otherwise stringify them to 'nan'
     isNan = raw_df.iloc[:, columnIndexes].isnull()
     raw_df.iloc[:, columnIndexes] = raw_df.iloc[:, columnIndexes] \
         .astype(dtype=str, errors='raise')
     # Set to nan where values where nan
     raw_df.iloc[:, columnIndexes] = raw_df.iloc[:, columnIndexes].mask(isNan, np.nan)
     colNames = df.shape.colNames
     # To category: build the target dtype for every selected column
     conversions: Dict[str, CategoricalDtype] = {
         colNames[index]: CategoricalDtype(categories=info[0],  # can be None
                                           ordered=info[1])
         for index, info in self.__attributes.items()
     }
     raw_df = raw_df.astype(dtype=conversions, copy=True, errors='raise')
     return data.Frame(raw_df)
Beispiel #16
0
 def execute(self, df: data.Frame) -> data.Frame:
     """ Fill nan values in the selected columns by value, mean or fill method. """
     originalOrder = df.colnames
     frame = df.getRawFrame().copy(True)
     selection = frame.iloc[:, list(self.__selection.keys())]
     if self.__byValue:
         # Fill each column with its configured replacement value
         fillValues = {originalOrder[k]: v for k, v in self.__selection.items()}
         filled = selection.fillna(fillValues, axis=0)
     elif self.__method == 'mean':
         # For some reason pandas can't compute mean of a dataframe so I do it by hand
         fillValues = {name: selection[name].mean() for name in selection}
         # This is the only case where we need an execution log
         self.__logExecution(fillValues)
         filled = selection.fillna(fillValues, axis=0)
     else:
         filled = selection.fillna(method=self.__method, axis=0)
     # Merge the filled columns back, preserving the original column order
     merged = pd.concat(
         [filled, frame.drop(filled.columns.values, axis=1)],
         axis=1,
         ignore_index=False)[originalOrder]
     return data.Frame(merged)
Beispiel #17
0
def test_fromShape_datetime():
    """ Frame.fromShape preserves a multi-index with string and datetime levels. """
    raw = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'cold': pd.Series(['05-09-1988', '22-12-1994', '21-11-1995',
                           '22-06-1994', '12-12-2012'],
                          dtype='datetime64[ns]')
    }
    original = Frame(raw)
    original = Frame(original.getRawFrame().set_index(['col3', 'cold']))

    rebuilt = Frame.fromShape(original.shape)

    expected = Shape()
    # fromShape does preserve index
    expected.colNames = ['col1', 'col2']
    expected.colTypes = [Types.Numeric, Types.Numeric]
    expected.index = ['col3', 'cold']
    expected.indexTypes = [IndexType(Types.String), IndexType(Types.Datetime)]
    assert rebuilt.shape == expected == original.shape
Beispiel #18
0
 def execute(self, df: data.Frame) -> data.Frame:
     """ Parse the selected columns as datetime using each column's format. """
     frame = df.getRawFrame().copy(True)
     for colIndex, dateFormat in self.__attributes.items():
         parsed = pd.to_datetime(frame.iloc[:, colIndex], errors=self.__errorMode,
                                 infer_datetime_format=True, format=dateFormat)
         frame.iloc[:, colIndex] = parsed
     return data.Frame(frame)