def test_fromShape_categories():
    """Frame.fromShape must rebuild the shape of a frame indexed by categorical columns."""
    raw = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical([3, 4, 5, 6, 0]),
        'col3': pd.Categorical(['q', '2', 'c', '4', 'x'], ordered=True),
        'cold': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
    }
    indexed = Frame(raw).setIndex(['col2', 'col3', 'col1'])
    rebuilt = Frame.fromShape(indexed.shape)

    # Expected shape: fromShape preserves the index
    expected = Shape()
    expected.colNames = ['cold']
    expected.colTypes = [Types.Datetime]
    expected.index = ['col3', 'col1', 'col2']
    expected.indexTypes = [
        IndexType(Types.Ordinal),
        IndexType(Types.Numeric),
        IndexType(Types.Nominal),
    ]

    assert rebuilt.shape == expected
    assert expected == indexed.shape
def test_SetInput():
    """SetInput must expose the selected workbench frame and return a copy of it."""
    raw = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    }
    frame = Frame(raw)

    # Register the frame in the (mock) workbench
    workbench = WorkbenchModelMock()
    workbench.setDataframeByName('var', frame)

    op = SetInput(workbench)
    assert op.getOutputShape() is None
    assert op.getOptions() == {'inputF': None}

    op.setOptions(inputF='var')
    # Adding an input shape is expected to have no effect on this operation
    op.addInputShape(Shape(), pos=0)
    assert op.getOptions() == {'inputF': 'var'}
    assert op.getOutputShape() == frame.shape

    result = op.execute()
    assert result == frame

    # The result should be a copy: renaming the source must not affect it
    frame = frame.rename({'col1': 'ewew'})
    assert result != frame
def execute(self, df: data.Frame) -> data.Frame:
    """
    Rescale the selected columns with ``minmax_scale`` and return the result as a new frame.

    ``self.__attributes`` maps a column position to the ``feature_range`` passed to
    ``minmax_scale``. The work is done on a deep copy of the raw frame and the original
    column order is restored before returning.

    :param df: the frame to process
    :return: a new frame where the selected columns are rescaled
    """
    rawFrame = df.getRawFrame()
    originalOrder = rawFrame.columns.to_list()
    working = rawFrame.copy(True)

    distinctRanges = set(self.__attributes.values())
    if len(distinctRanges) != 1:
        # Ranges differ: every column is scaled on its own
        scaledByName = dict()
        for position, featureRange in self.__attributes.items():
            scaledByName[originalOrder[position]] = minmax_scale(
                working.iloc[:, position], feature_range=featureRange, axis=0, copy=True)
        processed = pd.DataFrame(scaledByName).set_index(working.index)
    else:
        # A single range is shared by all columns: scale them with one call
        selected = working.iloc[:, list(self.__attributes.keys())]
        selectedNames = selected.columns
        scaledValues = minmax_scale(selected, feature_range=distinctRanges.pop(),
                                    axis=0, copy=True)
        processed = pd.DataFrame(scaledValues).set_index(working.index)
        processed.columns = selectedNames

    # Put the scaled columns back next to the untouched ones, in the original order
    untouched = working.drop(columns=processed.columns)
    merged = pd.concat([untouched, processed], ignore_index=False, axis=1)
    return data.Frame(merged[originalOrder])
def test_rename_bis():
    """Renaming with a full list of names returns a new frame and leaves the source intact."""
    raw = {
        'col1': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    }
    source = Frame(raw)
    renamed = source.rename(['cola', '21eeds', 'ij_'])

    assert renamed.colnames == ['cola', '21eeds', 'ij_']
    assert source.colnames == ['col1', 'col2', 'col3']
def test_rename():
    """Renaming with a mapping changes only the listed column and leaves the source intact."""
    raw = {
        'col1': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    }
    source = Frame(raw)
    renamed = source.rename({'col2': 'new'})

    assert renamed.colnames == ['col1', 'new', 'col3']
    assert source.colnames == ['col1', 'col2', 'col3']
def test_rename_excep():
    """Renaming with a list longer than the number of columns must raise ValueError."""
    raw = {
        'col1': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    }
    frame = Frame(raw)

    # One name more than the frame has columns
    tooManyNames = frame.colnames
    tooManyNames.append('1')

    with pytest.raises(ValueError):
        frame.rename(tooManyNames)
def execute(self, df: data.Frame) -> data.Frame:
    """
    Drop the columns reported by ``find_duplicates`` among the selected ones.

    Only the columns at the positions in ``self.__selected`` are inspected. When nothing is
    reported the raw frame is wrapped as is, otherwise a deep copy without the reported
    columns is returned, with the remaining columns in their original order.

    :param df: the frame to process
    :return: a frame without the reported columns
    """
    rawFrame = df.getRawFrame()
    originalColumns: List[str] = rawFrame.columns.to_list()

    redundant = find_duplicates(rawFrame.iloc[:, self.__selected])
    if not redundant:
        # Nothing to remove
        return data.Frame(rawFrame)

    pruned = rawFrame.copy(True)
    pruned = pruned.drop(redundant, axis=1)
    # Keep original order
    remaining = [name for name in originalColumns if name not in redundant]
    return data.Frame(pruned[remaining])
def test_typing():
    """The type of every object used here is itself an instance of ``type``."""
    a = 4
    b = pd.DataFrame()
    d: int64 = 12
    c = Frame()

    assert isinstance(type(a), type)
    assert isinstance(type(b), type)
    assert issubclass(type(c), Frame)
    assert isinstance(type(c), type)
    assert isinstance(type(d), type)
def test_str_to_Timestamp_validation():
    """ToTimestamp must reject invalid options and accept a valid configuration."""
    raw = {
        'col1': ['3', '0', '5', '6', '0'],
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': ['05091988', '22121994', '21111995', '22061994', '12122012'],
    }
    frame = Frame(raw)

    op = ToTimestamp()
    op.addInputShape(frame.shape, 0)
    assert op.getOutputShape() is None

    # No attribute selected
    with pytest.raises(exp.OptionValidationError):
        op.setOptions(attributes={}, errors='raise')
    assert not op.hasOptions()
    assert op.getOutputShape() is None

    # Empty 'errors' option
    with pytest.raises(exp.OptionValidationError):
        op.setOptions(attributes={0: {'format': '%s'}}, errors='')
    assert not op.hasOptions()
    assert op.getOutputShape() is None

    # Accepted options
    op.setOptions(attributes={0: {'format': '%d'}}, errors='coerce')
    assert op.hasOptions()
    assert op.getOutputShape().colTypes == [
        Types.Datetime,
        Types.Numeric,
        Types.String,
        Types.String,
    ]
def test_toString():
    """ToString must predict an all-string shape and keep a deep copy of its options."""
    raw = {
        'col1': ['3', '0', '5', '6', '0'],
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': ['05091988', '22121994', '21111995', '22061994', '12122012'],
    }
    frame = Frame(raw)

    op = ToString()
    assert op.getOutputShape() is None
    op.addInputShape(frame.shape, 0)
    assert op.getOutputShape() is None

    opts = {'attributes': {1: None}}
    assert op.getOptions() == {'attributes': dict()}

    op.setOptions(**opts)
    assert op.getOutputShape().colTypes == [
        Types.String,
        Types.String,
        Types.String,
        Types.String,
    ]
    assert op.getOptions() == opts
    assert isDictDeepCopy(op.getOptions(), opts)

    # The executed frame must match the predicted shape
    result = op.execute(frame)
    assert op.getOutputShape() == result.shape
def test_nominal_to_ordinal_cat():
    """ToCategorical must turn an unordered categorical column into an ordered one."""
    raw = {
        'col1': pd.Categorical(["5", "0", "5", "U", "0 ww"], ordered=False),
        'col2': [3, 4, 5.1, 6, 0],
    }
    frame = Frame(raw)

    op = ToCategorical()
    op.addInputShape(frame.shape, pos=0)
    op.setOptions(attributes={0: {'cat': 'U 0 1 5', 'ordered': True}})

    # Predicted output shape
    expectedColumns = frame.shape.columnsDict
    expectedColumns['col1'] = Types.Ordinal
    assert op.getOutputShape().columnsDict == expectedColumns

    result = op.execute(frame)
    expectedData = {
        'col1': ['5', '0', '5', 'U', None],
        'col2': [3.0, 4.0, 5.1, 6.0, 0.0],
    }
    assert nan_to_None(result.to_dict()) == expectedData
    assert result.shape.columnsDict == expectedColumns

    # Check the resulting categorical dtype
    assert list(result.getRawFrame()['col1'].dtype.categories) == ['U', '0', '1', '5']
    assert result.getRawFrame()['col1'].dtype.ordered is True
def test_rename():
    """RenameColumns must rename the columns selected by position."""
    raw = {
        'col1': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    }
    frame = Frame(raw)

    op = RenameColumns()
    op.addInputShape(frame.shape, pos=0)
    assert op.getOutputShape() is None

    op.setOptions(names={0: 'col4', 2: 'col1'})
    assert op.getOptions() == [{0: 'col4', 2: 'col1'}]

    # Predicted output shape
    expectedShape = frame.shape.clone()
    expectedShape.colNames = ['col4', 'col2', 'col1']
    assert op.getOutputShape() == expectedShape

    result = op.execute(frame)
    expectedData = {
        'col4': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col1': ['q', '2', 'c', '4', 'x'],
    }
    assert result.to_dict() == expectedData
def execute(self, df: data.Frame) -> data.Frame:
    """
    Copy the selected columns under new names.

    ``self.__attributes`` maps the position of a column to the name its copy is stored
    under. The operation works on a deep copy of the raw frame.

    :param df: the frame to process
    :return: a new frame that also contains the copied columns
    """
    copied = df.getRawFrame().copy(True)
    positions: List[int] = list(self.__attributes.keys())
    newNames: List[str] = list(self.__attributes.values())
    copied[newNames] = copied.iloc[:, positions]
    return data.Frame(copied)
def test_unsetOptions_toNumeric():
    """Options and input shapes of ToNumeric must be settable and removable independently."""
    raw = {
        'col1': pd.Categorical([1, 2, 3, 4, 10]),
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    }
    frame = Frame(raw)

    op = ToNumeric()
    op.addInputShape(frame.shape, pos=0)
    assert op.getOptions() == {'attributes': {}, 'errors': 'raise'}
    assert not op.hasOptions()

    op.setOptions(attributes={0: dict()}, errors='raise')
    assert op.getOptions() == {'attributes': {0: None}, 'errors': 'raise'}
    assert op._shapes[0] == frame.shape

    # Unsetting the options keeps the input shape
    op.unsetOptions()
    assert op.getOptions() == {'attributes': {}, 'errors': 'raise'}
    assert op._shapes[0] == frame.shape

    # Removing the input shape keeps the options
    op.removeInputShape(0)
    assert op.getOptions() == {'attributes': {}, 'errors': 'raise'}
    assert op._shapes == [None]

    # Options can be set while no shape is set
    op.setOptions(attributes={1: dict()}, errors='coerce')
    assert op.getOptions() == {'attributes': {1: None}, 'errors': 'coerce'}
    assert op._shapes == [None]

    # Adding the shape back keeps the options
    op.addInputShape(frame.shape, pos=0)
    assert op.getOptions() == {'attributes': {1: None}, 'errors': 'coerce'}
    assert op._shapes[0] == frame.shape
def execute(self, df: data.Frame) -> data.Frame:
    """
    Standardize the selected columns with ``scale`` (mean and std both enabled).

    The columns at the positions in ``self.__attributes`` are processed on a deep copy of
    the raw frame; the original column order is restored before returning.

    :param df: the frame to process
    :return: a new frame where the selected columns are standardized
    """
    rawFrame = df.getRawFrame()
    originalOrder = rawFrame.columns.to_list()
    working = rawFrame.copy(True)

    selected = working.iloc[:, self.__attributes]
    selectedNames = selected.columns
    scaledValues = scale(selected, with_mean=True, with_std=True, copy=True)
    standardized = pd.DataFrame(scaledValues).set_index(working.index)
    standardized.columns = selectedNames

    # Put the scaled columns back next to the untouched ones, in the original order
    untouched = working.drop(columns=selectedNames)
    merged = pd.concat([untouched, standardized], ignore_index=False, axis=1)
    return data.Frame(merged[originalOrder])
def execute(self, dfl: data.Frame, dfr: data.Frame) -> data.Frame:
    """
    Join two frames, either on their indexes or on one column per side.

    :param dfl: the left frame
    :param dfr: the right frame
    :return: the frame resulting from the join
    """
    left = dfl.getRawFrame()
    right = dfr.getRawFrame()
    how = self.__type.value

    if self.__onIndex:
        # Join on indexes
        joined = left.join(right, how=how, lsuffix=self.__lSuffix, rsuffix=self.__rSuffix)
        return data.Frame(joined)

    # Join (merge) on columns: the left and right column positions must be set
    leftKey = dfl.shape.colNames[self.__leftOn]
    rightKey = dfr.shape.colNames[self.__rightOn]
    merged = left.merge(right, how=how, left_on=leftKey, right_on=rightKey,
                        suffixes=(self.__lSuffix, self.__rSuffix))
    return data.Frame(merged)
def computeDiff(self) -> None:
    """
    Show in the table widget the part of the left frame that differs from the right one.

    The two raw frames are compared cell by cell; only the rows and the columns containing
    at least one differing cell are kept, taking the values from the left frame.
    """
    frame1 = self.columnsL.model().frameModel().frame.getRawFrame()
    frame2 = self.columnsR.model().frameModel().frame.getRawFrame()
    # NaN != NaN evaluates to True, so cells missing in both frames are excluded explicitly
    bothMissing = frame1.isna() & frame2.isna()
    changedMask = (frame1 != frame2) & ~bothMissing
    # 'axis' is passed by keyword: pandas >= 2.0 no longer accepts it positionally
    diffRows = changedMask.any(axis=1)
    diffColumns = changedMask.any(axis=0)
    frame = frame1.loc[diffRows, diffColumns]
    self.tableWidget.model().sourceModel().setFrame(Frame(frame))
def execute(self, df: data.Frame) -> data.Frame:
    """
    Set new names for columns.

    ``self.__names`` maps the position of a column to its new name. The names are set on a
    shallow copy of the raw frame.

    :param df: the frame to process
    :return: a new frame with the renamed columns
    """
    renamed = df.getRawFrame().copy(deep=False)
    columnNames: List[str] = df.colnames
    for position, newName in self.__names.items():
        columnNames[position] = newName
    renamed.columns = columnNames
    return data.Frame(renamed)
def test_addInputShape_exc():
    """Adding an input shape at a negative position must raise ValueError."""
    raw = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    }
    frame = Frame(raw)
    op = DummyOp()

    with pytest.raises(ValueError):
        op.addInputShape(frame.shape, pos=-1)
def execute(self, df: data.Frame) -> data.Frame:
    """
    Discretize the selected timestamp columns into labelled intervals.

    Every entry of ``self.__attributes`` maps a column position to a 4-tuple whose first
    item holds the interval edges and whose remaining items are the labels and the
    ``byDate`` / ``byTime`` flags. Consecutive edges define right-closed intervals.
    When a suffix is set the result is stored under ``<name><suffix>``, otherwise it
    replaces the source column in place.

    :param df: the frame to process
    :return: a new frame with the discretized columns
    """
    originalNames = df.colnames
    frame = df.getRawFrame().copy(True)
    suffix = self.__attributesSuffix

    def withDefaultDate(ts):
        # Keep the time of 'ts' but replace its date with the default one
        originalTime = toQtDateTime(ts.to_pydatetime()).time()
        return pd.Timestamp(QDateTime(_IntervalWidget.DEFAULT_DATE, originalTime).toPython())

    # Notice that these timestamps are already set to a proper format (with default
    # time/date) by the editor
    binsByColumn: Dict[int, pd.IntervalIndex] = dict()
    for position, options in self.__attributes.items():
        edges = options[0]
        binsByColumn[position] = pd.IntervalIndex(
            [pd.Interval(low, high, closed='right') for low, high in zip(edges, edges[1:])])

    discretized = dict()
    for position, options in self.__attributes.items():
        _, labels, byDate, byTime = options
        series = frame.iloc[:, position]
        if byTime and not byDate:
            # Give every timestamp the same date, so that only the time matters.
            # Nan values are propagated
            series = series.map(withDefaultDate, na_action='ignore')
        targetName = originalNames[position]
        if suffix:
            targetName += suffix
        bins = binsByColumn[position]
        labelByInterval = dict(zip(bins, labels))
        discretized[targetName] = pd.cut(series, bins=bins).cat.rename_categories(labelByInterval)

    # Columns of the frame that are going to be replaced by a processed one
    if suffix:
        toReplace = set(discretized.keys()) & set(originalNames)
    else:
        toReplace = list(discretized.keys())
    if toReplace:
        frame = frame.drop(columns=toReplace)

    processed = pd.DataFrame(discretized).set_index(frame.index)
    frame = pd.concat([frame, processed], ignore_index=False, axis=1)
    if not suffix:
        # Reorder columns
        frame = frame[originalNames]
    return data.Frame(frame)
def execute(self, df: data.Frame) -> data.Frame:
    """
    Discretize the selected columns into bins with ``pd.cut``.

    Every entry of ``self.__attributes`` maps a column position to a sequence whose first
    item is passed as ``bins`` and whose second item is passed as ``labels``. When a suffix
    is set the result is stored under ``<name><suffix>``, otherwise it replaces the source
    column. The operation works on a deep copy of the raw frame.

    :param df: the frame to process
    :return: a new frame with the discretized columns
    """
    f = df.getRawFrame().copy(True)
    columns = f.columns.to_list()
    for c, o in self.__attributes.items():
        result = pd.cut(f.iloc[:, c], bins=o[0], labels=o[1], duplicates='drop')
        colName: str = columns[c]
        newColName: str = colName if not self.__attributeSuffix else colName + self.__attributeSuffix
        # Plain column assignment always replaces (or adds) the whole column, so the
        # categorical dtype produced by pd.cut is kept. 'f.loc[:, name] = result' instead
        # tries to write into the existing column in place on recent pandas versions.
        f[newColName] = result
    return data.Frame(f)
def test_shape_index():
    """The shape of a frame with a named index must report that index separately."""
    raw = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    }
    plain = Frame(raw)
    indexed = Frame(plain.getRawFrame().set_index('col3'))

    # Desired shape object
    expected = Shape()
    expected.index = ['col3']
    expected.indexTypes = [IndexType(Types.String)]
    expected.colNames = ['col1', 'col2']
    expected.colTypes = [Types.Numeric, Types.Numeric]

    assert indexed.shape == expected
    assert indexed.nRows == 5
def test_unsetOptions_toCategory():
    """Options and input shapes of ToCategorical must be settable and removable independently."""
    raw = {
        'col1': pd.Categorical([1, 2, 3, 4, 10]),
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    }
    frame = Frame(raw)

    op = ToCategorical()
    op.addInputShape(frame.shape, pos=0)
    op.setOptions(attributes={0: {'cat': ' " 2 e + " 1 ', 'ordered': True}})
    assert op.getOptions() == {
        'attributes': {0: {'cat': '"2 e +" 1', 'ordered': True}}
    }
    assert op._ToCategorical__attributes == {0: (['2 e +', '1'], True)}
    assert op._shapes == [frame.shape]

    # Unsetting the options keeps the input shape
    op.unsetOptions()
    assert op.getOptions() == {'attributes': dict()}
    assert op._ToCategorical__attributes == dict()
    assert op._shapes == [frame.shape]

    # Removing the input shape keeps the (empty) options
    op.removeInputShape(0)
    assert op.getOptions() == {'attributes': dict()}
    assert op._ToCategorical__attributes == dict()
    assert op._shapes == [None]

    # Options can be set while no shape is set
    op.setOptions(attributes={1: dict()})
    assert op.getOptions() == {
        'attributes': {1: {'cat': '', 'ordered': False}}
    }
    assert op._ToCategorical__attributes == {1: (None, None)}
    assert op._shapes == [None]
    assert op.getOutputShape() is None

    # Adding the shape back keeps the options
    op.addInputShape(frame.shape, pos=0)
    assert op.getOptions() == {
        'attributes': {1: {'cat': '', 'ordered': False}}
    }
    assert op._ToCategorical__attributes == {1: (None, None)}
    assert op._shapes == [frame.shape]
def execute(self, df: data.Frame) -> data.Frame:
    """
    Convert the selected columns to strings, keeping missing values missing.

    The columns at the positions in ``self.__attributes`` are converted with
    ``astype(str)`` on a deep copy of the raw frame; cells that were null before the
    conversion are set back to NaN afterwards.

    :param df: the frame to process
    :return: a new frame where the selected columns hold strings
    """
    # Deep copy
    raw_df = df.getRawFrame().copy(deep=True)
    selected = raw_df.iloc[:, self.__attributes]
    # To string
    isNan = selected.isnull()
    processed = selected.astype(dtype=str, errors='raise')
    # Set to nan where values were nan
    processed = processed.mask(isNan, np.nan)
    # Replace every column as a whole. Writing through 'iloc[:, cols] = ...' sets the values
    # into the existing columns in place on recent pandas versions, which can keep the old
    # dtype (e.g. a categorical column stays categorical).
    # NOTE(review): assumes column names are unique -- confirm against Frame invariants
    for name in processed.columns:
        raw_df[name] = processed[name]
    return data.Frame(raw_df)
def execute(self, df: data.Frame) -> data.Frame:
    """
    Drop the columns with too many missing values.

    Exactly one threshold is expected to be set. With the percentage threshold a column is
    kept when its fraction of null values is not greater than the threshold; with the
    number threshold a column is kept when its count of null values is not greater than
    the threshold.

    :param df: the frame to process
    :return: a new frame holding only the columns within the threshold
    :raise exp.InvalidOptions: if both thresholds are set
    """
    # Assume everything to go is set
    if self.__thresholdPercentage is not None and self.__thresholdNumber is not None:
        raise exp.InvalidOptions('Can\'t have both threshold set')
    pf = df.getRawFrame().copy()
    # Compare with None instead of relying on truthiness: a percentage threshold of 0 is
    # a valid value and must not fall through to the branch that uses the number threshold
    if self.__thresholdPercentage is not None:
        # By percentage
        pf = pf.loc[:, pf.isnull().mean() <= self.__thresholdPercentage]
    else:
        # By nan number
        pf = pf.loc[:, pf.isnull().sum() <= self.__thresholdNumber]
    return data.Frame(pf)
def __init__(self, parent: QWidget = None, frame: Union[Frame, Shape] = Frame()):
    """
    Initialize the model from a frame or from a bare shape.

    NOTE(review): the default ``Frame()`` is created once, when the function is defined,
    and is therefore shared by every instance built without a ``frame`` argument.

    :param parent: the parent object forwarded to the base class initializer
    :param frame: a Frame (its shape is read from it), or a Shape (an empty Frame is then
        used). Any other value gives an empty Frame and an empty Shape
    """
    super().__init__(parent)
    if isinstance(frame, Frame):
        initialFrame = frame
        initialShape = frame.shape
    elif isinstance(frame, Shape):
        # Only the shape is known
        initialFrame = Frame()
        initialShape = frame
    else:
        initialFrame = Frame()
        initialShape = Shape()
    self.__frame: Frame = initialFrame
    self.__shape: Shape = initialShape
    # Dictionary { attributeIndex: value }
    self._statistics: Dict[int, Dict[str, object]] = dict()
    self._histogram: Dict[int, Dict[Any, int]] = dict()
    # Dataframe name
    self.name: str = ''
    # Set of alive workers by identifier (attribute number, type, operation)
    self._runningWorkers: Set[Tuple] = set()
    self._dataAccessMutex = QMutex()
def execute(self, df: data.Frame) -> data.Frame:
    """
    One-hot encode the selected columns and append the result to the frame.

    The columns at the positions in ``self.__attributes`` are encoded with
    ``pd.get_dummies``, using their names (read from the first input shape) as prefixes.
    The new columns are converted to the 'category' dtype and appended to a deep copy of
    the raw frame; the source columns are kept.

    :param df: the frame to process
    :return: a new frame that also contains the encoded columns
    """
    working = df.getRawFrame().copy(deep=True)
    inputNames = self.shapes[0].colNames
    prefixes = itemgetter(*self.__attributes)(inputNames)

    selected = working.iloc[:, self.__attributes]
    encoded = pd.get_dummies(selected, prefix=prefixes,
                             dummy_na=self.__includeNan, dtype=int)
    encoded = encoded.astype('category', copy=False)

    # Existing columns named like a new one are replaced; the source columns are not
    # dropped, the encoded ones are just appended
    working = working.drop(columns=encoded.columns, errors='ignore')
    combined = pd.concat([working, encoded], axis=1)
    return data.Frame(combined)
def test_cat_toNumeric():
    """ToNumeric must convert a categorical column of numeric strings to numbers."""
    raw = {
        'col1': pd.Categorical(['3', '0', '5', '6', '0']),
        'col2': [3, 4, 5, 6, 0],
        'col3': ['123', '2', '0.43', '4', '90'],
    }
    frame = Frame(raw)

    op = ToNumeric()
    op.addInputShape(frame.shape, pos=0)
    assert op.getOutputShape() is None
    assert op.getOptions() == {'attributes': {}, 'errors': 'raise'}

    op.setOptions(attributes={0: None}, errors='coerce')
    assert op.getOptions() == {'attributes': {0: None}, 'errors': 'coerce'}

    # Predicted output shape
    expectedShape = frame.shape.clone()
    expectedShape.colTypes[0] = Types.Numeric
    assert op.getOutputShape() == expectedShape

    # Removing options/input_shape causes None to be returned
    op.removeInputShape(0)
    assert op.getOutputShape() is None
    op.addInputShape(frame.shape, pos=0)
    op.unsetOptions()
    assert op.getOutputShape() is None

    # Re-adding everything
    op.setOptions(attributes={0: dict()}, errors='raise')
    assert op.getOutputShape() == expectedShape

    result = op.execute(frame)
    expectedData = {
        'col1': [3.0, 0.0, 5.0, 6.0, 0.0],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['123', '2', '0.43', '4', '90'],
    }
    assert result.to_dict() == expectedData
    assert result.shape == expectedShape

    # Coerce gives the same result
    op.setOptions(attributes={0: dict()}, errors='coerce')
    assert op.getOutputShape() == expectedShape
    result = op.execute(frame)
    assert result.to_dict() == expectedData
    assert result.shape == expectedShape
def test_fromShape_single_index():
    """Frame.fromShape must rebuild the shape of a frame indexed by a single column."""
    raw = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'cold': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
    }
    indexed = Frame(raw).setIndex('col1')
    rebuilt = Frame.fromShape(indexed.shape)

    # Expected shape: fromShape preserves the index
    expected = Shape()
    expected.colNames = ['cold', 'col2', 'col3']
    expected.colTypes = [Types.Datetime, Types.Numeric, Types.String]
    expected.index = ['col1']
    expected.indexTypes = [IndexType(Types.Numeric)]

    assert rebuilt.shape == expected
    assert expected == indexed.shape
def test_shape():
    """The shape of a frame built from a dict must list its columns and a default index."""
    raw = {
        'col1': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    }
    frame = Frame(raw)

    # Desired shape object
    expected = Shape()
    expected.index = ['Unnamed']
    expected.indexTypes = [IndexType(Types.Numeric)]
    expected.colNames = ['col1', 'col2', 'col3']
    expected.colTypes = [Types.Numeric, Types.Numeric, Types.String]

    assert frame.shape == expected
    assert frame.nRows == 5