def execute(self, df: data.Frame) -> data.Frame:
    """ Scale the selected attributes to their configured feature range (min-max scaling) """
    columns = df.getRawFrame().columns.to_list()
    # Execute
    pdf = df.getRawFrame().copy(True)
    featureRanges = set(self.__attributes.values())
    if len(featureRanges) == 1:
        # All ranges are the same, shortcut: scale every selected column at once
        toProcess = pdf.iloc[:, list(self.__attributes.keys())]
        processedColNames = toProcess.columns
        scaled = minmax_scale(toProcess, feature_range=featureRanges.pop(), axis=0, copy=True)
        processed = pd.DataFrame(scaled).set_index(pdf.index)
        processed.columns = processedColNames
    else:
        # Ranges differ, scale each column with its own range
        processed = dict()
        for k, fr in self.__attributes.items():
            processed[columns[k]] = minmax_scale(pdf.iloc[:, k], feature_range=fr, axis=0,
                                                 copy=True)
        processed = pd.DataFrame(processed).set_index(pdf.index)
    # Merge result with other columns preserving order
    pdf = pdf.drop(columns=processed.columns)
    result = pd.concat([pdf, processed], ignore_index=False, axis=1)[columns]
    return data.Frame(result)
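# A minimal standalone sketch of the same scale-and-merge-back pattern used above, assuming
# plain pandas/scikit-learn only. The function and column names are made up for illustration;
# this is not part of the operation class.
import pandas as pd
from sklearn.preprocessing import minmax_scale

def minmax_scale_columns(frame: pd.DataFrame, columns: list, feature_range=(0, 1)) -> pd.DataFrame:
    # Scale only the requested columns, then re-insert them preserving the original column order
    original_order = frame.columns.to_list()
    scaled = pd.DataFrame(minmax_scale(frame[columns], feature_range=feature_range),
                          index=frame.index, columns=columns)
    rest = frame.drop(columns=columns)
    return pd.concat([rest, scaled], axis=1)[original_order]

# Example: scale 'a' and 'b' to [0, 1], leave 'c' untouched
# demo = pd.DataFrame({'a': [1, 2, 3], 'b': [10.0, 20.0, 30.0], 'c': ['x', 'y', 'z']})
# print(minmax_scale_columns(demo, ['a', 'b']))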
def execute(self, df: data.Frame) -> data.Frame:
    """ Duplicate the selected columns under the configured new names """
    f = df.getRawFrame().copy(True)
    pairs: List[Tuple[int, str]] = list(self.__attributes.items())
    names = [v[1] for v in pairs]
    indexes = [v[0] for v in pairs]
    f[names] = f.iloc[:, indexes]
    return data.Frame(f)
def execute(self, df: data.Frame) -> data.Frame:
    """ Standardise the selected attributes (zero mean, unit variance) """
    columns = df.getRawFrame().columns.to_list()
    # Execute
    pdf = df.getRawFrame().copy(True)
    processedColNames = pdf.iloc[:, self.__attributes].columns
    scaled = scale(pdf.iloc[:, self.__attributes], with_mean=True, with_std=True, copy=True)
    processed = pd.DataFrame(scaled).set_index(pdf.index)
    processed.columns = processedColNames
    # Merge result with other columns preserving order
    pdf = pdf.drop(columns=processedColNames)
    result = pd.concat([pdf, processed], ignore_index=False, axis=1)[columns]
    return data.Frame(result)
def execute(self, dfl: data.Frame, dfr: data.Frame) -> data.Frame:
    """ Join two frames, either on their indexes or on the configured columns """
    if self.__onIndex:
        # Join on indexes
        return data.Frame(dfl.getRawFrame().join(dfr.getRawFrame(), how=self.__type.value,
                                                 lsuffix=self.__lSuffix,
                                                 rsuffix=self.__rSuffix))
    else:
        # Join (merge) on columns: onleft and onright must be set
        suffixes = (self.__lSuffix, self.__rSuffix)
        l_col = dfl.shape.colNames[self.__leftOn]
        r_col = dfr.shape.colNames[self.__rightOn]
        return data.Frame(dfl.getRawFrame().merge(dfr.getRawFrame(), how=self.__type.value,
                                                  left_on=l_col, right_on=r_col,
                                                  suffixes=suffixes))
def execute(self, df: data.Frame) -> data.Frame: """ Set new names for columns """ names: List[str] = df.colnames for k, v in self.__names.items(): names[k] = v new_df = df.getRawFrame().copy(deep=False) new_df.columns = names return data.Frame(new_df)
def execute(self, df: data.Frame) -> data.Frame:
    """ Discretise the selected attributes into the configured bins """
    f = df.getRawFrame().copy(True)
    columns = f.columns.to_list()
    for c, o in self.__attributes.items():
        result = pd.cut(f.iloc[:, c], bins=o[0], labels=o[1], duplicates='drop')
        colName: str = columns[c]
        newColName: str = colName if not self.__attributeSuffix else colName + self.__attributeSuffix
        f.loc[:, newColName] = result
    return data.Frame(f)
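# Standalone sketch (illustrative only, not part of the operation above): how pd.cut assigns
# values to labelled bins, with duplicate bin edges dropped. Data, bin edges and labels are
# made up for the example.
import pandas as pd

ages = pd.Series([3, 17, 25, 64, 80])
binned = pd.cut(ages, bins=[0, 18, 65, 100], labels=['minor', 'adult', 'senior'],
                duplicates='drop')
# binned is a categorical Series: ['minor', 'minor', 'adult', 'adult', 'senior']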
def execute(self, df: data.Frame) -> data.Frame:
    """ Convert the selected attributes to string """
    # Deep copy
    raw_df = df.getRawFrame().copy(deep=True)
    # To string
    isNan = raw_df.iloc[:, self.__attributes].isnull()
    processed = raw_df.iloc[:, self.__attributes].astype(dtype=str, errors='raise')
    # Set back to nan where values were nan
    processed = processed.mask(isNan, np.nan)
    raw_df.iloc[:, self.__attributes] = processed
    return data.Frame(raw_df)
def execute(self, df: data.Frame) -> data.Frame:
    """ Drop columns whose number of nan values exceeds the configured threshold """
    # Assume everything needed is set
    if self.__thresholdPercentage is not None and self.__thresholdNumber is not None:
        raise exp.InvalidOptions('Can\'t have both thresholds set')
    pf = df.getRawFrame().copy()
    if self.__thresholdPercentage:
        # By percentage
        pf = pf.loc[:, pf.isnull().mean() <= self.__thresholdPercentage]
    else:
        # By nan number
        pf = pf.loc[:, pf.isnull().sum() <= self.__thresholdNumber]
    return data.Frame(pf)
def execute(self, df: data.Frame) -> data.Frame:
    """ One-hot encode the selected attributes """
    pdf = df.getRawFrame().copy(deep=True)
    prefixes = itemgetter(*self.__attributes)(self.shapes[0].colNames)
    npdf = pd.get_dummies(pdf.iloc[:, self.__attributes], prefix=prefixes,
                          dummy_na=self.__includeNan, dtype=int)
    npdf = npdf.astype('category', copy=False)
    # Replace any duplicate columns
    pdf = pdf.drop(columns=npdf.columns, errors='ignore')
    # Avoid dropping original columns (just append)
    # pdf = pdf.drop(columns[self.__attributes], axis=1, inplace=False)
    pdf = pd.concat([pdf, npdf], axis=1)
    return data.Frame(pdf)
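# Standalone sketch (illustrative only): pd.get_dummies expands a column into one indicator
# column per distinct value, here with an explicit prefix and a column for nan, mirroring the
# call above. Data and names are made up.
import numpy as np
import pandas as pd

colours = pd.DataFrame({'colour': ['red', 'blue', np.nan, 'red']})
dummies = pd.get_dummies(colours, prefix='colour', dummy_na=True, dtype=int)
# Columns: colour_blue, colour_red, colour_nan, each holding 0/1 indicators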
def execute(self, df: data.Frame) -> data.Frame:
    """ Remove duplicated columns among the selected ones """
    df = df.getRawFrame()
    colOrder: List[str] = df.columns.to_list()
    subDf = df.iloc[:, self.__selected]
    duplicates = find_duplicates(subDf)
    if duplicates:
        df = df.copy(True)
        df = df.drop(duplicates, axis=1)
        # Keep original order
        order = [c for c in colOrder if c not in duplicates]
        df = df[order]
    return data.Frame(df)
def execute(self, df: data.Frame) -> data.Frame:
    """ Convert the selected attributes to a numeric type """
    # Deep copy
    raw_df = df.getRawFrame().copy(deep=True)
    allCols: List[str] = raw_df.columns.to_list()
    converted: Dict[str, np.ndarray] = dict()
    processedCols = list()
    for a in self.__attributes:
        view = raw_df.iloc[:, a]
        colName = allCols[a]
        converted[colName] = pd.to_numeric(view.values, errors=self.__errorMode,
                                           downcast='float')
        processedCols.append(colName)
    raw_df = raw_df.drop(columns=processedCols)
    # Get processed frame and set back its index
    processed = pd.DataFrame(converted).set_index(raw_df.index)
    raw_df = pd.concat([processed, raw_df], ignore_index=False, axis=1)[allCols]
    return data.Frame(raw_df)
def execute(self, df: data.Frame) -> data.Frame:
    """ Discretise the selected datetime attributes into labelled intervals """
    columns = df.colnames
    df = df.getRawFrame().copy(True)
    # Notice that these timestamps are already set to a proper format (with default time/date)
    # by the editor
    intervals: Dict[int, pd.IntervalIndex] = \
        {i: pd.IntervalIndex([pd.Interval(a, b, closed='right')
                              for a, b in zip(opts[0], opts[0][1:])])
         for i, opts in self.__attributes.items()}
    processedDict = dict()
    for i, opts in self.__attributes.items():
        _, labels, byDate, byTime = opts
        applyCol = df.iloc[:, i]
        if byTime and not byDate:
            # Replace the date part with the default date in a way that every ts has the
            # same date, but retains its original time. Nan values are propagated
            applyCol = applyCol \
                .map(lambda ts: pd.Timestamp(
                    QDateTime(_IntervalWidget.DEFAULT_DATE,
                              toQtDateTime(ts.to_pydatetime()).time()).toPython()),
                     na_action='ignore')
        name = columns[i]
        if self.__attributesSuffix:
            name += self.__attributesSuffix
        categoriesMap = dict(zip(intervals[i], labels))
        processedDict[name] = pd.cut(applyCol, bins=intervals[i]) \
            .cat.rename_categories(categoriesMap)
    if self.__attributesSuffix:
        duplicateColumns: Set[str] = set(processedDict.keys()) & set(columns)
    else:
        duplicateColumns: List[str] = list(processedDict.keys())
    if duplicateColumns:
        df = df.drop(columns=duplicateColumns)
    processed = pd.DataFrame(processedDict).set_index(df.index)
    df = pd.concat([df, processed], ignore_index=False, axis=1)
    if not self.__attributesSuffix:
        # Reorder columns
        df = df[columns]
    return data.Frame(df)
def execute(self, df: data.Frame) -> data.Frame:
    """ Replace the configured values in the selected attributes """
    pd_df = df.getRawFrame().copy(True)
    for c, colOptions in self.__attributes.items():
        for valueList, replaceVal in zip(*colOptions):
            if self.__invertedReplace:
                # Replace every value except the ones listed
                valuesToReplace: List = pd_df.iloc[:, c].unique().tolist()
                for a in valueList:
                    try:
                        valuesToReplace.remove(a)
                    except ValueError:
                        pass  # Value not in list (ignore)
            else:
                valuesToReplace = valueList
            pd_df.iloc[:, c] = pd_df.iloc[:, c].replace(to_replace=valuesToReplace,
                                                        value=replaceVal, inplace=False)
    return data.Frame(pd_df)
def test_shape_index():
    d = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x']
    }
    f = Frame(d)
    f = Frame(f.getRawFrame().set_index('col3'))
    # Desired shape obj
    s = Shape()
    s.index = ['col3']
    s.indexTypes = [IndexType(Types.String)]
    s.colNames = ['col1', 'col2']
    s.colTypes = [Types.Numeric, Types.Numeric]
    assert f.shape == s
    assert f.nRows == 5
def execute(self, df: data.Frame) -> data.Frame:
    """ Convert the selected attributes to categorical """
    # Deep copy
    raw_df = df.getRawFrame().copy(deep=True)
    # To string
    columnIndexes = list(self.__attributes.keys())
    isNan = raw_df.iloc[:, columnIndexes].isnull()
    raw_df.iloc[:, columnIndexes] = raw_df.iloc[:, columnIndexes] \
        .astype(dtype=str, errors='raise')
    # Set back to nan where values were nan
    raw_df.iloc[:, columnIndexes] = raw_df.iloc[:, columnIndexes].mask(isNan, np.nan)
    colNames = df.shape.colNames
    # To category
    conversions: Dict[str, CategoricalDtype] = {
        colNames[index]: CategoricalDtype(categories=info[0],  # can be None
                                          ordered=info[1])
        for index, info in self.__attributes.items()
    }
    raw_df = raw_df.astype(dtype=conversions, copy=True, errors='raise')
    return data.Frame(raw_df)
def execute(self, df: data.Frame) -> data.Frame:
    """ Fill nan values in the selected attributes """
    columns = df.colnames
    rdf = df.getRawFrame().copy(True)
    processDf = rdf.iloc[:, list(self.__selection.keys())]
    if self.__byValue:
        valueDict = {columns[k]: v for k, v in self.__selection.items()}
        processed = processDf.fillna(valueDict, axis=0)
    elif self.__method == 'mean':
        # Compute the per-column mean by hand, since fillna does not accept it directly here
        valueDict = {k: processDf[k].mean() for k in processDf}
        # This is the only case where we need an execution log
        self.__logExecution(valueDict)
        processed = processDf.fillna(valueDict, axis=0)
    else:
        processed = processDf.fillna(method=self.__method, axis=0)
    # Merge result with previous frame, keeping column order
    processed = pd.concat([processed, rdf.drop(processed.columns.values, axis=1)],
                          axis=1, ignore_index=False)[columns]
    return data.Frame(processed)
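# Standalone sketch (illustrative only): filling nan values per column, either with fixed
# values or with a column mean, which is the same idea as the dictionary built above.
# Column names and data are made up.
import numpy as np
import pandas as pd

frame = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, 'x', 'y']})
by_value = frame.fillna({'a': 0.0, 'b': 'missing'})   # fixed replacement per column
by_mean = frame.fillna({'a': frame['a'].mean()})      # mean of 'a' is 2.0; 'b' left untouched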
def test_fromShape_datetime():
    d = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'cold': pd.Series([
            '05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'
        ], dtype='datetime64[ns]')
    }
    f = Frame(d)
    f = Frame(f.getRawFrame().set_index(['col3', 'cold']))
    g = Frame.fromShape(f.shape)
    s = Shape()
    # fromShape does preserve index
    s.colNames = ['col1', 'col2']
    s.colTypes = [Types.Numeric, Types.Numeric]
    s.index = ['col3', 'cold']
    s.indexTypes = [IndexType(Types.String), IndexType(Types.Datetime)]
    assert g.shape == s == f.shape
def execute(self, df: data.Frame) -> data.Frame:
    """ Convert the selected attributes to datetime """
    pdf = df.getRawFrame().copy(True)
    for attr, dateFormat in self.__attributes.items():
        pdf.iloc[:, attr] = pd.to_datetime(pdf.iloc[:, attr], errors=self.__errorMode,
                                           infer_datetime_format=True, format=dateFormat)
    return data.Frame(pdf)
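# Standalone sketch (illustrative only): pd.to_datetime with an explicit format string and
# 'coerce' error handling, the same parsing idea used above; unparsable entries become NaT.
# The sample strings are made up.
import pandas as pd

raw = pd.Series(['05-09-1988', '22-12-1994', 'not a date'])
parsed = pd.to_datetime(raw, format='%d-%m-%Y', errors='coerce')
# parsed: [1988-09-05, 1994-12-22, NaT] with dtype datetime64[ns]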